clang: lib/Lex/Lexer.cpp Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

29#include "llvm/ADT/STLExtras.h"

30#include "llvm/ADT/StringExtras.h"

31#include "llvm/ADT/StringRef.h"

32#include "llvm/ADT/StringSwitch.h"

33#include "llvm/Support/Compiler.h"

34#include "llvm/Support/ConvertUTF.h"

35#include "llvm/Support/MemoryBufferRef.h"

36#include "llvm/Support/NativeFormatting.h"

37#include "llvm/Support/Unicode.h"

38#include "llvm/Support/UnicodeCharRanges.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <optional>
#include <string>
#include <tuple>
#include <utility>

48

49#ifdef __SSE4_2__

50#include <nmmintrin.h>

51#endif

52

53using namespace clang;

54

55

56

57

58

59

62 return false;

64 return II->getObjCKeywordID() == objcKey;

65 return false;

66}

67

68

71 return tok::objc_not_keyword;

73 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;

74}

75

76

79 case tok::annot_typename:

80 case tok::annot_decltype:

81 case tok::annot_pack_indexing_type:

82 return true;

83

84 case tok::kw_short:

85 case tok::kw_long:

86 case tok::kw___int64:

87 case tok::kw___int128:

88 case tok::kw_signed:

89 case tok::kw_unsigned:

90 case tok::kw_void:

91 case tok::kw_char:

92 case tok::kw_int:

93 case tok::kw_half:

94 case tok::kw_float:

95 case tok::kw_double:

96 case tok::kw___bf16:

97 case tok::kw__Float16:

98 case tok::kw___float128:

99 case tok::kw___ibm128:

100 case tok::kw_wchar_t:

101 case tok::kw_bool:

102 case tok::kw__Bool:

103 case tok::kw__Accum:

104 case tok::kw__Fract:

105 case tok::kw__Sat:

106#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:

107#include "clang/Basic/TransformTypeTraits.def"

108 case tok::kw___auto_type:

109 case tok::kw_char16_t:

110 case tok::kw_char32_t:

111 case tok::kw_typeof:

112 case tok::kw_decltype:

113 case tok::kw_char8_t:

115

116 default:

117 return false;

118 }

119}

120

121

122

123

124

125void Lexer::anchor() {}

126

127void Lexer::InitLexer(const char *BufStart, const char *BufPtr,

128 const char *BufEnd) {

129 BufferStart = BufStart;

130 BufferPtr = BufPtr;

131 BufferEnd = BufEnd;

132

133 assert(BufEnd[0] == 0 &&

134 "We assume that the input buffer has a null character at the end"

135 " to simplify lexing!");

136

137

138

139

140 if (BufferStart == BufferPtr) {

141

142 StringRef Buf(BufferStart, BufferEnd - BufferStart);

143 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)

144 .StartsWith("\xEF\xBB\xBF", 3)

145 .Default(0);

146

147

148 BufferPtr += BOMLength;

149 }

150

151 Is_PragmaLexer = false;

152 CurrentConflictMarkerState = CMK_None;

153

154

155 IsAtStartOfLine = true;

156 IsAtPhysicalStartOfLine = true;

157

158 HasLeadingSpace = false;

159 HasLeadingEmptyMacro = false;

160

161

163

164

166

167

168

169

170

172

173

174 ExtendedTokenMode = 0;

175

176 NewLinePtr = nullptr;

177}

178

179

180

181

182

186 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),

187 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),

188 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {

189 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),

190 InputFile.getBufferEnd());

191

193}

194

195

196

197

199 const char *BufStart, const char *BufPtr, const char *BufEnd,

200 bool IsFirstIncludeOfFile)

201 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),

202 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {

203 InitLexer(BufStart, BufPtr, BufEnd);

204

205

207}

208

209

210

211

214 bool IsFirstIncludeOfFile)

215 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),

216 FromFile.getBufferStart(), FromFile.getBufferEnd(),

217 IsFirstIncludeOfFile) {}

218

220 assert(PP && "Cannot reset token mode without a preprocessor");

221 if (LangOpts.TraditionalCPP)

223 else

225}

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

247

248

249 FileID SpellingFID = SM.getFileID(SpellingLoc);

250 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);

251 Lexer *L = new Lexer(SpellingFID, InputFile, PP);

252

253

254

255

256 const char *StrData = SM.getCharacterData(SpellingLoc);

257

258 L->BufferPtr = StrData;

259 L->BufferEnd = StrData+TokLen;

260 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

261

262

263

264 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),

265 ExpansionLocStart,

266 ExpansionLocEnd, TokLen);

267

268

269

271

272

273 L->Is_PragmaLexer = true;

274 return L;

275}

276

277void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {

278 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;

279 this->IsAtStartOfLine = IsAtStartOfLine;

280 assert((BufferStart + Offset) <= BufferEnd);

281 BufferPtr = BufferStart + Offset;

282}

283

/// Escape \p Str in place so that it can be placed between \p Quote
/// characters.
///
/// - Every backslash and every occurrence of \p Quote gets a backslash
///   inserted in front of it.
/// - Every line break is replaced by the two characters `\` and `n`. A
///   two-character break made of different characters ("\r\n" or "\n\r") is
///   treated as a single line break; two identical characters ("\n\n") are
///   two line breaks.
///
/// \tparam T   a mutable character sequence providing size_type, size(),
///             begin(), operator[] and insert(iterator, char).
/// \param Str   the sequence to rewrite.
/// \param Quote the quote character that must be escaped.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      // Insert the escaping backslash, then step over it and the character
      // it protects.
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // `e - 1` cannot wrap here: this branch is only reached when i < e.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        // Mixed pair ("\r\n" / "\n\r"): overwrite both characters, so the
        // length is unchanged.
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Lone '\n' or '\r': one character becomes two.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}

308

310 std::string Result = std::string(Str);

311 char Quote = Charify ? '\'' : '"';

314}

315

317

318

319

320

321

322

323

325 const LangOptions &LangOpts, char *Spelling) {

326 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

327

328 size_t Length = 0;

329 const char *BufEnd = BufPtr + Tok.getLength();

330

332

333 while (BufPtr < BufEnd) {

335 Spelling[Length++] = CharAndSize.Char;

336 BufPtr += CharAndSize.Size;

337

338 if (Spelling[Length - 1] == '"')

339 break;

340 }

341

342

343

344

345 if (Length >= 2 &&

346 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {

347

348

349 const char *RawEnd = BufEnd;

350 do --RawEnd; while (*RawEnd != '"');

351 size_t RawLength = RawEnd - BufPtr + 1;

352

353

354 memcpy(Spelling + Length, BufPtr, RawLength);

355 Length += RawLength;

356 BufPtr += RawLength;

357

358

359 }

360 }

361

362 while (BufPtr < BufEnd) {

364 Spelling[Length++] = CharAndSize.Char;

365 BufPtr += CharAndSize.Size;

366 }

367

368 assert(Length < Tok.getLength() &&

369 "NeedsCleaning flag set on token that didn't need cleaning!");

370 return Length;

371}

372

373

374

375

376

377

382 bool *invalid) {

383

385

386

387 bool invalidTemp = false;

388 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);

389 if (invalidTemp) {

390 if (invalid) *invalid = true;

391 return {};

392 }

393

394 const char *tokenBegin = file.data() + locInfo.second;

395

396

397 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,

398 file.begin(), tokenBegin, file.end());

401

403

404

406 return StringRef(tokenBegin, length);

407

408

409 buffer.resize(length);

411 return StringRef(buffer.data(), buffer.size());

412}

413

414

415

416

417

418

421 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

422

423 bool CharDataInvalid = false;

424 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),

425 &CharDataInvalid);

427 *Invalid = CharDataInvalid;

428 if (CharDataInvalid)

429 return {};

430

431

432 if (Tok.needsCleaning())

433 return std::string(TokStart, TokStart + Tok.getLength());

434

439}

440

441

442

443

444

445

446

447

448

449

450

454 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

455

456 const char *TokStart = nullptr;

457

458 if (Tok.is(tok::raw_identifier))

459 TokStart = Tok.getRawIdentifier().data();

460 else if (Tok.hasUCN()) {

462

463 Buffer = II->getNameStart();

464 return II->getLength();

465 }

466 }

467

468

469 if (Tok.isLiteral())

470 TokStart = Tok.getLiteralData();

471

472 if (!TokStart) {

473

474 bool CharDataInvalid = false;

475 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);

477 *Invalid = CharDataInvalid;

478 if (CharDataInvalid) {

479 Buffer = "";

480 return 0;

481 }

482 }

483

484

485 if (Tok.needsCleaning()) {

486 Buffer = TokStart;

487 return Tok.getLength();

488 }

489

490

491 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));

492}

493

494

495

496

497

506

507

508

512 bool IgnoreWhiteSpace) {

513

514

515

516

517

518

519

520

521 Loc = SM.getExpansionLoc(Loc);

524 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);

526 return true;

527

528 const char *StrData = Buffer.data()+LocInfo.second;

529

530 if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))

531 return true;

532

533

534 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,

535 Buffer.begin(), StrData, Buffer.end());

538 return false;

539}

540

541

542

544 const char *BufStart = Buffer.data();

545 if (Offset >= Buffer.size())

546 return nullptr;

547

548 const char *LexStart = BufStart + Offset;

549 for (; LexStart != BufStart; --LexStart) {

552

553 ++LexStart;

554 break;

555 }

556 }

557 return LexStart;

558}

559

565 if (LocInfo.first.isInvalid())

566 return Loc;

567

569 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);

571 return Loc;

572

573

574

575 const char *StrData = Buffer.data() + LocInfo.second;

577 if (!LexStart || LexStart == StrData)

578 return Loc;

579

580

582 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,

583 Buffer.end());

585

586

588 do {

590

592

593

594

597

598

599

600 break;

601 }

602 } while (TheTok.getKind() != tok::eof);

603

604

605 return Loc;

606}

607

613

614 if (SM.isMacroArgExpansion(Loc))

615 return Loc;

616

620 FileIDAndOffset BeginFileLocInfo = SM.getDecomposedLoc(BeginFileLoc);

621 assert(FileLocInfo.first == BeginFileLocInfo.first &&

622 FileLocInfo.second >= BeginFileLocInfo.second);

623 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);

624}

625

namespace {

/// Classification of a preprocessor directive name, as produced by the
/// directive-name lookup that follows in this file.
enum PreambleDirectiveKind {
  /// Assigned to every directive name the lookup recognises; the scanning
  /// loop `continue`s past such a directive.
  PDK_Skipped,
  /// Default for any other name; the scanning loop does not skip it.
  PDK_Unknown
};

} // namespace

634

637 unsigned MaxLines) {

638

639

640

643 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),

644 Buffer.end());

646

647 bool InPreprocessorDirective = false;

650

651 unsigned MaxLineOffset = 0;

652 if (MaxLines) {

653 const char *CurPtr = Buffer.begin();

654 unsigned CurLine = 0;

655 while (CurPtr != Buffer.end()) {

656 char ch = *CurPtr++;

657 if (ch == '\n') {

658 ++CurLine;

659 if (CurLine == MaxLines)

660 break;

661 }

662 }

663 if (CurPtr != Buffer.end())

664 MaxLineOffset = CurPtr - Buffer.begin();

665 }

666

667 do {

669

670 if (InPreprocessorDirective) {

671

672 if (TheTok.getKind() == tok::eof) {

673 break;

674 }

675

676

677

679 continue;

680

681

682

683 InPreprocessorDirective = false;

684 }

685

686

689

690

691

692 if (MaxLineOffset && TokOffset >= MaxLineOffset)

693 break;

694 }

695

696

697 if (TheTok.getKind() == tok::comment) {

698 if (ActiveCommentLoc.isInvalid())

700 continue;

701 }

702

704

705 Token HashTok = TheTok;

706 InPreprocessorDirective = true;

708

709

710

711

715 PreambleDirectiveKind PDK

716 = llvm::StringSwitch(Keyword)

717 .Case("include", PDK_Skipped)

718 .Case("__include_macros", PDK_Skipped)

719 .Case("define", PDK_Skipped)

720 .Case("undef", PDK_Skipped)

721 .Case("line", PDK_Skipped)

722 .Case("error", PDK_Skipped)

723 .Case("pragma", PDK_Skipped)

724 .Case("import", PDK_Skipped)

725 .Case("include_next", PDK_Skipped)

726 .Case("warning", PDK_Skipped)

727 .Case("ident", PDK_Skipped)

728 .Case("sccs", PDK_Skipped)

729 .Case("assert", PDK_Skipped)

730 .Case("unassert", PDK_Skipped)

731 .Case("if", PDK_Skipped)

732 .Case("ifdef", PDK_Skipped)

733 .Case("ifndef", PDK_Skipped)

734 .Case("elif", PDK_Skipped)

735 .Case("elifdef", PDK_Skipped)

736 .Case("elifndef", PDK_Skipped)

737 .Case("else", PDK_Skipped)

738 .Case("endif", PDK_Skipped)

739 .Default(PDK_Unknown);

740

741 switch (PDK) {

742 case PDK_Skipped:

743 continue;

744

745 case PDK_Unknown:

746

747 break;

748 }

749 }

750

751

752

753

754 TheTok = HashTok;

756 TheTok.getKind() == tok::raw_identifier &&

758 LangOpts.CPlusPlusModules) {

759

760

761 Token ModuleTok = TheTok;

762 do {

764 } while (TheTok.getKind() == tok::comment);

765 if (TheTok.getKind() != tok::semi) {

766

767 TheTok = ModuleTok;

768 break;

769 }

770 continue;

771 }

772

773

774

775

776 break;

777 } while (true);

778

780 if (ActiveCommentLoc.isValid())

781 End = ActiveCommentLoc;

782 else

784

787}

788

792

793

794

796 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

797

798

799 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))

800 return 0;

801

802 unsigned PhysOffset = 0;

803

804

805

806

807 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {

808 if (CharNo == 0)

809 return PhysOffset;

810 ++TokPtr;

811 --CharNo;

812 ++PhysOffset;

813 }

814

815

816

817 for (; CharNo; --CharNo) {

819 TokPtr += CharAndSize.Size;

820 PhysOffset += CharAndSize.Size;

821 }

822

823

824

825

826

827 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))

828 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

829

830 return PhysOffset;

831}

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

852 return {};

853

856 return {};

857 }

858

860 if (Len > Offset)

861 Len = Len - Offset;

862 else

863 return Loc;

864

866}

867

868

869

874 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

875

877 if (SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))

878 return false;

879

880 if (expansionLoc.isFileID()) {

881

882 if (MacroBegin)

883 *MacroBegin = expansionLoc;

884 return true;

885 }

886

888}

889

890

891

896 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

897

900 if (tokLen == 0)

901 return false;

902

905 if (SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))

906 return false;

907

908 if (expansionLoc.isFileID()) {

909

910 if (MacroEnd)

911 *MacroEnd = expansionLoc;

912 return true;

913 }

914

916}

917

924 if (Range.isTokenRange()) {

927 return {};

928 }

929

930

931 auto [FID, BeginOffs] = SM.getDecomposedLoc(Begin);

932 if (FID.isInvalid())

933 return {};

934

935 unsigned EndOffs;

936 if (SM.isInFileID(End, FID, &EndOffs) ||

937 BeginOffs > EndOffs)

938 return {};

939

941}

942

943

946 return SM.getSLocEntry(SM.getFileID(Loc))

947 .getExpansion()

948 .isExpansionTokenRange();

949}

950

957 return {};

958

961

964 return {};

965 Range.setBegin(Begin);

967 }

968

970 if (Range.isTokenRange()) {

972 return {};

973

976 return {};

977 Range.setEnd(End);

979 }

980

985 &MacroEnd)) ||

987 &MacroEnd)))) {

988 Range.setBegin(MacroBegin);

989 Range.setEnd(MacroEnd);

990

991 if (Range.isTokenRange())

994 }

995

1000 return {};

1001

1006 return {};

1007

1011 Range.setBegin(SM.getImmediateSpellingLoc(Begin));

1012 Range.setEnd(SM.getImmediateSpellingLoc(End));

1014 }

1015 }

1016

1017 return {};

1018}

1019

1025 if (Range.isInvalid()) {

1027 return {};

1028 }

1029

1030

1031 FileIDAndOffset beginInfo = SM.getDecomposedLoc(Range.getBegin());

1032 if (beginInfo.first.isInvalid()) {

1034 return {};

1035 }

1036

1037 unsigned EndOffs;

1038 if (SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||

1039 beginInfo.second > EndOffs) {

1041 return {};

1042 }

1043

1044

1045 bool invalidTemp = false;

1046 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);

1047 if (invalidTemp) {

1049 return {};

1050 }

1051

1053 return file.substr(beginInfo.second, EndOffs - beginInfo.second);

1054}

1055

1059 assert(Loc.isMacroID() && "Only reasonable to call this on macros");

1060

1061

1062 while (true) {

1068 break;

1069

1070

1071

1072

1073

1074

1075 Loc = SM.getImmediateExpansionRange(Loc).getBegin();

1078 break;

1079

1080

1081

1082 FileID MacroFID = SM.getFileID(Loc);

1083 if (SM.isInFileID(SpellLoc, MacroFID))

1084 break;

1085

1086

1087 Loc = SpellLoc;

1088 }

1089

1090

1091

1092

1093 Loc = SM.getSpellingLoc(Loc);

1094

1095

1096

1099 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);

1100 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);

1101}

1102

1105 assert(Loc.isMacroID() && "Only reasonable to call this on macros");

1106

1107 while (SM.isMacroArgExpansion(Loc))

1108 Loc = SM.getImmediateExpansionRange(Loc).getBegin();

1109

1110

1111

1112

1114 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))

1115 return {};

1116

1117

1118

1119

1120 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

1121

1122

1123

1126 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);

1127 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);

1128}

1129

1133

1136 if (Str - 1 < BufferStart)

1137 return false;

1138

1139 if ((Str[0] == '\n' && Str[-1] == '\r') ||

1140 (Str[0] == '\r' && Str[-1] == '\n')) {

1141 if (Str - 2 < BufferStart)

1142 return false;

1143 --Str;

1144 }

1145 --Str;

1146

1147

1149 --Str;

1150

1151 return *Str == '\\';

1152}

1153

1157 return {};

1159 if (LocInfo.first.isInvalid())

1160 return {};

1162 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);

1164 return {};

1167 return {};

1168 StringRef Rest = Buffer.substr(Line - Buffer.data());

1169 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");

1170 return NumWhitespaceChars == StringRef::npos

1171 ? ""

1172 : Rest.take_front(NumWhitespaceChars);

1173}

1174

1175

1176

1177

1178

1179

1180

1181

1182

1187 unsigned CharNo, unsigned TokLen) {

1188 assert(FileLoc.isMacroID() && "Must be a macro expansion");

1189

1190

1191

1192

1194

1195

1196

1199

1200

1201

1203

1204 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);

1205}

1206

1207

1208

1210 unsigned TokLen) const {

1211 assert(Loc >= BufferStart && Loc <= BufferEnd &&

1212 "Location out of range for this buffer!");

1213

1214

1215

1216 unsigned CharNo = Loc-BufferStart;

1217 if (FileLoc.isFileID())

1218 return FileLoc.getLocWithOffset(CharNo);

1219

1220

1221

1222 assert(PP && "This doesn't work on raw lexers");

1224}

1225

1226

1227

1231

1232

1233

1234

1235

1236

1237

1239 switch (Letter) {

1240 default: return 0;

1241 case '=': return '#';

1242 case ')': return ']';

1243 case '(': return '[';

1244 case '!': return '|';

1245 case '\'': return '^';

1246 case '>': return '}';

1247 case '/': return '\\';

1248 case '<': return '{';

1249 case '-': return '~';

1250 }

1251}

1252

1253

1254

1255

1256

1259 if (!Res)

1260 return Res;

1261

1262 if (!Trigraphs) {

1264 L->Diag(CP-2, diag::trigraph_ignored);

1265 return 0;

1266 }

1267

1269 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);

1270 return Res;

1271}

1272

1273

1274

1275

1277 unsigned Size = 0;

1279 ++Size;

1280

1281 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')

1282 continue;

1283

1284

1285 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&

1286 Ptr[Size-1] != Ptr[Size])

1287 ++Size;

1288

1289 return Size;

1290 }

1291

1292

1293 return 0;

1294}

1295

1296

1297

1298

1299const char *Lexer::SkipEscapedNewLines(const char *P) {

1300 while (true) {

1301 const char *AfterEscape;

1302 if (*P == '\\') {

1303 AfterEscape = P+1;

1304 } else if (*P == '?') {

1305

1306 if (P[1] != '?' || P[2] != '/')

1307 return P;

1308

1309

1310 AfterEscape = P+3;

1311 } else {

1312 return P;

1313 }

1314

1316 if (NewLineSize == 0) return P;

1317 P = AfterEscape+NewLineSize;

1318 }

1319}

1320

1324 bool IncludeComments) {

1327 return std::nullopt;

1328 }

1330

1331

1333

1334

1335 bool InvalidTemp = false;

1336 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);

1337 if (InvalidTemp)

1338 return std::nullopt;

1339

1340 const char *TokenBegin = File.data() + LocInfo.second;

1341

1342

1343 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),

1344 TokenBegin, File.end());

1346

1349 return Tok;

1350}

1351

1355 bool IncludeComments) {

1356 const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Loc));

1357 while (Loc != StartOfFile) {

1360 return std::nullopt;

1361

1365 continue;

1366 if (Tok.is(tok::comment) || IncludeComments) {

1367 return Tok;

1368 }

1369 }

1370 return std::nullopt;

1371}

1372

1373

1374

1375

1376

1379 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {

1381 if (Tok || Tok->isNot(TKind))

1382 return {};

1384

1385

1386 unsigned NumWhitespaceChars = 0;

1387 if (SkipTrailingWhitespaceAndNewLine) {

1388 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();

1389 unsigned char C = *TokenEnd;

1391 C = *(++TokenEnd);

1392 NumWhitespaceChars++;

1393 }

1394

1395

1396 if (C == '\n' || C == '\r') {

1397 char PrevC = C;

1398 C = *(++TokenEnd);

1399 NumWhitespaceChars++;

1400 if ((C == '\n' || C == '\r') && C != PrevC)

1401 NumWhitespaceChars++;

1402 }

1403 }

1404

1405 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);

1406}

1407

1408

1409

1410

1411

1412

1413

1414

1415

1416

1417

1418

1419

1420

1421

1422

1424 unsigned Size = 0;

1425

1426 if (Ptr[0] == '\\') {

1427 ++Size;

1428 ++Ptr;

1429Slash:

1430

1432 return {'\\', Size};

1433

1434

1435

1437

1439

1440

1442 Diag(Ptr, diag::backslash_newline_space);

1443

1444

1445 Size += EscapedNewLineSize;

1446 Ptr += EscapedNewLineSize;

1447

1448

1449 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);

1450 CharAndSize.Size += Size;

1451 return CharAndSize;

1452 }

1453

1454

1455 return {'\\', Size};

1456 }

1457

1458

1459 if (Ptr[0] == '?' && Ptr[1] == '?') {

1460

1461

1463 LangOpts.Trigraphs)) {

1464

1466

1467 Ptr += 3;

1469 if (C == '\\') goto Slash;

1470 return {C, Size};

1471 }

1472 }

1473

1474

1475 return {*Ptr, Size + 1u};

1476}

1477

1478

1479

1480

1481

1482

1483

1484Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,

1486

1487 unsigned Size = 0;

1488

1489 if (Ptr[0] == '\\') {

1491 ++Ptr;

1492Slash:

1493

1495 return {'\\', Size};

1496

1497

1499

1500 Size += EscapedNewLineSize;

1501 Ptr += EscapedNewLineSize;

1502

1503

1504 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);

1505 CharAndSize.Size += Size;

1506 return CharAndSize;

1507 }

1508

1509

1510 return {'\\', Size};

1511 }

1512

1513

1514 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {

1515

1516

1518 Ptr += 3;

1520 if (C == '\\') goto Slash;

1521 return {C, Size};

1522 }

1523 }

1524

1525

1526 return {*Ptr, Size + 1u};

1527}

1528

1529

1530

1531

1532

1533

1534void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {

1535 BufferPtr = BufferStart + Offset;

1536 if (BufferPtr > BufferEnd)

1537 BufferPtr = BufferEnd;

1538

1539

1540

1541 IsAtStartOfLine = StartOfLine;

1542 IsAtPhysicalStartOfLine = StartOfLine;

1543}

1544

1546 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(

1548 return UnicodeWhitespaceChars.contains(Codepoint);

1549}

1550

1553 llvm::raw_svector_ostream CharOS(CharBuf);

1554 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);

1555 return CharBuf;

1556}

1557

1558

1559

1560

1561

1562

1563

1565 bool IsStart, bool &IsExtension) {

1566 static const llvm::sys::UnicodeCharSet MathStartChars(

1568 static const llvm::sys::UnicodeCharSet MathContinueChars(

1570 if (MathStartChars.contains(C) ||

1571 (!IsStart && MathContinueChars.contains(C))) {

1572 IsExtension = true;

1573 return true;

1574 }

1575 return false;

1576}

1577

1579 bool &IsExtension) {

1580 if (LangOpts.AsmPreprocessor) {

1581 return false;

1582 } else if (LangOpts.DollarIdents && '$' == C) {

1583 return true;

1584 } else if (LangOpts.CPlusPlus || LangOpts.C23) {

1585

1586

1587

1588

1589 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);

1590 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);

1591 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))

1592 return true;

1594 IsExtension);

1595 } else if (LangOpts.C11) {

1596 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(

1598 return C11AllowedIDChars.contains(C);

1599 } else {

1600 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(

1602 return C99AllowedIDChars.contains(C);

1603 }

1604}

1605

1607 bool &IsExtension) {

1608 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");

1609 IsExtension = false;

1610 if (LangOpts.AsmPreprocessor) {

1611 return false;

1612 }

1613 if (LangOpts.CPlusPlus || LangOpts.C23) {

1614 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);

1615 if (XIDStartChars.contains(C))

1616 return true;

1618 IsExtension);

1619 }

1621 return false;

1622 if (LangOpts.C11) {

1623 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(

1625 return !C11DisallowedInitialIDChars.contains(C);

1626 }

1627 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(

1629 return !C99DisallowedInitialIDChars.contains(C);

1630}

1631

1634

1635 static const llvm::sys::UnicodeCharSet MathStartChars(

1637 static const llvm::sys::UnicodeCharSet MathContinueChars(

1639

1640 (void)MathStartChars;

1641 (void)MathContinueChars;

1642 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&

1643 "Unexpected mathematical notation codepoint");

1644 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)

1646}

1647

1649 const char *End) {

1652}

1653

1656

1657 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {

1658 enum {

1659 CannotAppearInIdentifier = 0,

1660 CannotStartIdentifier

1661 };

1662

1663 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(

1665 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(

1667 if (!C99AllowedIDChars.contains(C)) {

1668 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)

1669 << Range

1670 << CannotAppearInIdentifier;

1671 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {

1672 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)

1673 << Range

1674 << CannotStartIdentifier;

1675 }

1676 }

1677}

1678

1679

1680

1681

1682

1685

1686 struct HomoglyphPair {

1687 uint32_t Character;

1688 char LooksLike;

1689 bool operator<(HomoglyphPair R) const { return Character < R.Character; }

1690 };

1691 static constexpr HomoglyphPair SortedHomoglyphs[] = {

1692 {U'\u00ad', 0},

1693 {U'\u01c3', '!'},

1694 {U'\u037e', ';'},

1695 {U'\u200b', 0},

1696 {U'\u200c', 0},

1697 {U'\u200d', 0},

1698 {U'\u2060', 0},

1699 {U'\u2061', 0},

1700 {U'\u2062', 0},

1701 {U'\u2063', 0},

1702 {U'\u2064', 0},

1703 {U'\u2212', '-'},

1704 {U'\u2215', '/'},

1705 {U'\u2216', '\\'},

1706 {U'\u2217', '*'},

1707 {U'\u2223', '|'},

1708 {U'\u2227', '^'},

1709 {U'\u2236', ':'},

1710 {U'\u223c', '~'},

1711 {U'\ua789', ':'},

1712 {U'\ufeff', 0},

1713 {U'\uff01', '!'},

1714 {U'\uff03', '#'},

1715 {U'\uff04', '$'},

1716 {U'\uff05', '%'},

1717 {U'\uff06', '&'},

1718 {U'\uff08', '('},

1719 {U'\uff09', ')'},

1720 {U'\uff0a', '*'},

1721 {U'\uff0b', '+'},

1722 {U'\uff0c', ','},

1723 {U'\uff0d', '-'},

1724 {U'\uff0e', '.'},

1725 {U'\uff0f', '/'},

1726 {U'\uff1a', ':'},

1727 {U'\uff1b', ';'},

1728 {U'\uff1c', '<'},

1729 {U'\uff1d', '='},

1730 {U'\uff1e', '>'},

1731 {U'\uff1f', '?'},

1732 {U'\uff20', '@'},

1733 {U'\uff3b', '['},

1734 {U'\uff3c', '\\'},

1735 {U'\uff3d', ']'},

1736 {U'\uff3e', '^'},

1737 {U'\uff5b', '{'},

1738 {U'\uff5c', '|'},

1739 {U'\uff5d', '}'},

1740 {U'\uff5e', '~'},

1741 {0, 0}

1742 };

1743 auto Homoglyph =

1744 std::lower_bound(std::begin(SortedHomoglyphs),

1745 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});

1746 if (Homoglyph->Character == C) {

1747 if (Homoglyph->LooksLike) {

1748 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};

1749 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)

1751 } else {

1752 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)

1754 }

1755 }

1756}

1757

1762 return;

1763

1764 bool IsExtension;

1766 bool IsIDContinue =

1767 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

1768

1769 if ((IsFirst && IsIDStart) || (IsFirst && IsIDContinue))

1770 return;

1771

1772 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

1773

1774 if (IsFirst || InvalidOnlyAtStart) {

1775 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)

1778 } else {

1779 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)

1782 }

1783}

1784

1785bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,

1787 const char *UCNPtr = CurPtr + Size;

1788 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, nullptr);

1789 if (CodePoint == 0) {

1790 return false;

1791 }

1792 bool IsExtension = false;

1793 if (isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {

1795 return false;

1797 PP->isPreprocessedOutput())

1799 PP->getDiagnostics(), LangOpts, CodePoint,

1801 false);

1802

1803

1804

1805

1807 if (IsExtension)

1810

1813 false);

1814 }

1815

1817 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||

1818 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))

1819 CurPtr = UCNPtr;

1820 else

1821 while (CurPtr != UCNPtr)

1822 (void)getAndAdvanceChar(CurPtr, Result);

1823 return true;

1824}

1825

1826bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {

1827 llvm::UTF32 CodePoint;

1828

1829

1830

1831

1832 unsigned FirstCodeUnitSize;

1833 getCharAndSize(CurPtr, FirstCodeUnitSize);

1834 const char *CharStart = CurPtr + FirstCodeUnitSize - 1;

1835 const char *UnicodePtr = CharStart;

1836

1837 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(

1838 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,

1839 &CodePoint, llvm::strictConversion);

1840 if (ConvResult != llvm::conversionOK)

1841 return false;

1842

1843 bool IsExtension = false;

1845 IsExtension)) {

1847 return false;

1848

1850 PP->isPreprocessedOutput())

1852 PP->getDiagnostics(), LangOpts, CodePoint,

1853 makeCharRange(*this, CharStart, UnicodePtr), false);

1854

1855

1856

1858 if (IsExtension)

1860 PP->getDiagnostics(), CodePoint,

1864 false);

1867 }

1868

1869

1870

1871

1872 ConsumeChar(CurPtr, FirstCodeUnitSize, Result);

1873 CurPtr = UnicodePtr;

1874 return true;

1875}

1876

1877bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,

1878 const char *CurPtr) {

1879 bool IsExtension = false;

1882 PP->isPreprocessedOutput()) {

1883 if (IsExtension)

1888 true);

1891 }

1892

1893 MIOpt.ReadToken();

1894 return LexIdentifierContinue(Result, CurPtr);

1895 }

1896

1898 PP->isPreprocessedOutput() && isASCII(*BufferPtr) &&

1900

1901

1902

1903

1904

1905

1906

1907

1908

1910 PP->getDiagnostics(), LangOpts, C,

1911 makeCharRange(*this, BufferPtr, CurPtr), true);

1912 BufferPtr = CurPtr;

1913 return false;

1914 }

1915

1916

1917

1918 MIOpt.ReadToken();

1919 FormTokenWithChars(Result, CurPtr, tok::unknown);

1920 return true;

1921}

1922

1923static const char *

1925 [[maybe_unused]] const char *BufferEnd) {

1926#ifdef __SSE4_2__

1927 alignas(16) static constexpr char AsciiIdentifierRange[16] = {

1928 '_', '_', 'A', 'Z', 'a', 'z', '0', '9',

1929 };

1930 constexpr ssize_t BytesPerRegister = 16;

1931

1932 __m128i AsciiIdentifierRangeV =

1933 _mm_load_si128((const __m128i *)AsciiIdentifierRange);

1934

1935 while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {

1936 __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));

1937

1938 int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,

1941 CurPtr += Consumed;

1942 if (Consumed == BytesPerRegister)

1943 continue;

1944 return CurPtr;

1945 }

1946#endif

1947

1948 unsigned char C = *CurPtr;

1950 C = *++CurPtr;

1951 return CurPtr;

1952}

1953

// Continues lexing an identifier starting at CurPtr and forms a
// tok::raw_identifier token in Result that spans from BufferPtr to the first
// character that cannot continue the identifier.
// NOTE(review): listing lines 1959, 1964, 1973, 1992, 2018 and 2027 are missing
// from this copy, so several conditions guarding the statements below are not
// visible here.
1954bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {

1955

1956

1957 while (true) {

1958

1960

1961 unsigned Size;

1962

1963 unsigned char C = getCharAndSize(CurPtr, Size);

1965 CurPtr = ConsumeChar(CurPtr, Size, Result);

1966 continue;

1967 }

1968 if (C == '$') {

1969

// '$' only continues an identifier when the DollarIdents option is on.
1970 if (!LangOpts.DollarIdents)

1971 break;

1972

1974 Diag(CurPtr, diag::ext_dollar_in_identifier);

1975 CurPtr = ConsumeChar(CurPtr, Size, Result);

1976 continue;

1977 }

1978 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))

1979 continue;

// Only a byte outside the ASCII range can begin a multi-byte UTF-8 sequence,
// so that is the case that must be handed to the UTF-8 helper.
1980 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))

1981 continue;

1982

1983 break;

1984 }

1985

// Remember where the identifier started before FormTokenWithChars is called.
1986 const char *IdStart = BufferPtr;

1987 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);

1988 Result.setRawIdentifierData(IdStart);

1989

1990

1991

1993 return true;

1994

1995

1996

1997 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);

1998

1999

2000

2001

2002

2003

2004 if (isCodeCompletionPoint(CurPtr)) {

2005

2006 Result.setKind(tok::code_completion);

2007

2008

2009

2010

2011

// The completion marker is asserted to be a NUL byte; step over it.
2012 assert(*CurPtr == 0 && "Completion character must be 0");

2013 ++CurPtr;

2014

2015

2016

2017 if (CurPtr < BufferEnd) {

2019 ++CurPtr;

2020 }

2021 BufferPtr = CurPtr;

2022 return true;

2023 }

2024

2025

2026

2028 return PP->HandleIdentifier(Result);

2029

2030 return true;

2031}

2032

2033

2034

// Returns true when the two characters examined are '0' followed by 'x' or
// 'X', and false otherwise.
// NOTE(review): the statements that initialise CharAndSize1 and CharAndSize2
// (listing lines 2036 and 2042) are missing from this copy; presumably they
// read successive characters beginning at Start using LangOpts - confirm.
2035bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {

2037 char C1 = CharAndSize1.Char;

// Without a leading '0' this cannot be a hex prefix.
2038 if (C1 != '0')

2039 return false;

2040

2041 auto CharAndSize2 =

2043 char C2 = CharAndSize2.Char;

2044 return (C2 == 'x' || C2 == 'X');

2045}

2046

2047

2048

2049

// Lexes the rest of a numeric constant starting at CurPtr and forms a
// tok::numeric_constant token in Result that starts at BufferPtr. Exponent
// signs, digit separators, UCNs and UTF-8 characters are handled by consuming
// them and recursing to keep lexing the same token. Always returns true on the
// paths visible here.
// NOTE(review): listing lines 2054 and 2091-2093 are missing from this copy
// (the loop header and the declaration of NextSize among them).
2050bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {

2051 unsigned Size;

2052 char C = getCharAndSize(CurPtr, Size);

2053 char PrevCh = 0;

2055 CurPtr = ConsumeChar(CurPtr, Size, Result);

2056 PrevCh = C;

// In HLSL, a '.' followed by 'x' or 'r' is not kept in the number: back up
// over the '.' and stop scanning.
2057 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {

2058 CurPtr -= Size;

2059 break;

2060 }

2061 C = getCharAndSize(CurPtr, Size);

2062 }

2063

2064

// A sign directly after 'e'/'E' is consumed as part of the constant, unless
// MicrosoftExt is on and the token starts with a hex prefix.
2065 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {

2066

2067

2068 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))

2069 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

2070 }

2071

2072

// A sign directly after 'p'/'P' is consumed only when the token is accepted
// as a hex float.
2073 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {

2074

2075

2076

2077 bool IsHexFloat = true;

2078 if (!LangOpts.C99) {

2079 if (!isHexaLiteral(BufferPtr, LangOpts))

2080 IsHexFloat = false;

// Before C++17, an underscore anywhere in the token so far rules out the
// hex-float reading.
2081 else if (!LangOpts.CPlusPlus17 &&

2082 std::find(BufferPtr, CurPtr, '_') != CurPtr)

2083 IsHexFloat = false;

2084 }

2085 if (IsHexFloat)

2086 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

2087 }

2088

2089

// Digit separator: only considered in C++14 or C23 mode.
2090 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {

2094 Diag(CurPtr, LangOpts.CPlusPlus

2095 ? diag::warn_cxx11_compat_digit_separator

2096 : diag::warn_c23_compat_digit_separator);

2097 CurPtr = ConsumeChar(CurPtr, Size, Result);

2098 CurPtr = ConsumeChar(CurPtr, NextSize, Result);

2099 return LexNumericConstant(Result, CurPtr);

2100 }

2101 }

2102

2103

2104 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))

2105 return LexNumericConstant(Result, CurPtr);

// Only a byte outside the ASCII range can begin a multi-byte UTF-8 sequence,
// so that is the case that must be handed to the UTF-8 helper.
2106 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))

2107 return LexNumericConstant(Result, CurPtr);

2108

2109

// Remember where the literal started before FormTokenWithChars is called.
2110 const char *TokStart = BufferPtr;

2111 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);

2112 Result.setLiteralData(TokStart);

2113 return true;

2114}

2115

2116

2117

// Lexes an optional suffix that follows a literal and returns the position
// just past whatever was consumed. When no suffix is accepted, the incoming
// CurPtr is returned unchanged. IsStringLiteral selects the extra check for
// standard suffixes that is applied in C++14 mode. Only valid in C++ (asserted).
// NOTE(review): listing lines 2127, 2137, 2141, 2164, 2165, 2169, 2183, 2187,
// 2194 and 2197 are missing from this copy.
2118const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,

2119 bool IsStringLiteral) {

2120 assert(LangOpts.CPlusPlus);

2121

2122

2123 unsigned Size;

2124 char C = getCharAndSize(CurPtr, Size);

2125 bool Consumed = false;

2126

2128 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))

2129 Consumed = true;

// Only a byte outside the ASCII range can begin a multi-byte UTF-8 sequence,
// so that is the case that must be handed to the UTF-8 helper.
2130 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))

2131 Consumed = true;

2132 else

2133 return CurPtr;

2134 }

2135

// Before C++11 the suffix is only diagnosed, never consumed.
2136 if (!LangOpts.CPlusPlus11) {

2138 Diag(CurPtr,

2139 C == '_' ? diag::warn_cxx11_compat_user_defined_literal

2140 : diag::warn_cxx11_compat_reserved_user_defined_literal)

2142 return CurPtr;

2143 }

2144

2145

2146

2147

2148

2149

2150 if (!Consumed) {

2151 bool IsUDSuffix = false;

// A leading underscore is always accepted as the start of a suffix.
2152 if (C == '_')

2153 IsUDSuffix = true;

2154 else if (IsStringLiteral && LangOpts.CPlusPlus14) {

2155

2156

2157

// Collect at most MaxStandardSuffixLength characters into Buffer. Note that
// the local `Consumed` below shadows the outer bool of the same name.
2158 const unsigned MaxStandardSuffixLength = 3;

2159 char Buffer[MaxStandardSuffixLength] = { C };

2160 unsigned Consumed = Size;

2161 unsigned Chars = 1;

2162 while (true) {

2163 auto [Next, NextSize] =

2166

2167 const StringRef CompleteSuffix(Buffer, Chars);

2168 IsUDSuffix =

2170 break;

2171 }

2172

2173 if (Chars == MaxStandardSuffixLength)

2174

2175 break;

2176

2177 Buffer[Chars++] = Next;

2178 Consumed += NextSize;

2179 }

2180 }

2181

// Not accepted as a suffix: diagnose and leave the characters unconsumed.
2182 if (!IsUDSuffix) {

2184 Diag(CurPtr, LangOpts.MSVCCompat

2185 ? diag::ext_ms_reserved_user_defined_literal

2186 : diag::ext_reserved_user_defined_literal)

2188 return CurPtr;

2189 }

2190

2191 CurPtr = ConsumeChar(CurPtr, Size, Result);

2192 }

2193

// Consume the remaining characters of the suffix.
2195 while (true) {

2196 C = getCharAndSize(CurPtr, Size);

2198 CurPtr = ConsumeChar(CurPtr, Size, Result);

2199 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {

// Same reasoning as above: the UTF-8 helper is for non-ASCII lead bytes.
2200 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {

2201 } else

2202 break;

2203 }

2204

2205 return CurPtr;

2206}

2207

2208

2209

2210bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,

2212 const char *AfterQuote = CurPtr;

2213

2214 const char *NulCharacter = nullptr;

2215

2217 (Kind == tok::utf8_string_literal ||

2218 Kind == tok::utf16_string_literal ||

2219 Kind == tok::utf32_string_literal))

2220 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal

2221 : diag::warn_c99_compat_unicode_literal);

2222

2223 char C = getAndAdvanceChar(CurPtr, Result);

2224 while (C != '"') {

2225

2226

2227 if (C == '\\')

2228 C = getAndAdvanceChar(CurPtr, Result);

2229

2230 if (C == '\n' || C == '\r' ||

2231 (C == 0 && CurPtr-1 == BufferEnd)) {

2233 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;

2234 FormTokenWithChars(Result, CurPtr-1, tok::unknown);

2235 return true;

2236 }

2237

2238 if (C == 0) {

2239 if (isCodeCompletionPoint(CurPtr-1)) {

2241 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, false);

2242 else

2243 PP->CodeCompleteNaturalLanguage();

2244 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);

2245 cutOffLexing();

2246 return true;

2247 }

2248

2249 NulCharacter = CurPtr-1;

2250 }

2251 C = getAndAdvanceChar(CurPtr, Result);

2252 }

2253

2254

2255 if (LangOpts.CPlusPlus)

2256 CurPtr = LexUDSuffix(Result, CurPtr, true);

2257

2258

2260 Diag(NulCharacter, diag::null_in_char_or_string) << 1;

2261

2262

2263 const char *TokStart = BufferPtr;

2264 FormTokenWithChars(Result, CurPtr, Kind);

2265 Result.setLiteralData(TokStart);

2266 return true;

2267}

2268

2269

2270

2271bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,

2273

2274

2275

2276

2277

2279 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

2280

2281 unsigned PrefixLen = 0;

2282

2285 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {

2286 const char *Pos = &CurPtr[PrefixLen];

2287 Diag(Pos, LangOpts.CPlusPlus26

2288 ? diag::warn_cxx26_compat_raw_string_literal_character_set

2289 : diag::ext_cxx26_raw_string_literal_character_set)

2290 << StringRef(Pos, 1);

2291 }

2292 ++PrefixLen;

2293 }

2294

2295

2296 if (CurPtr[PrefixLen] != '(') {

2298 const char *PrefixEnd = &CurPtr[PrefixLen];

2299 if (PrefixLen == 16) {

2300 Diag(PrefixEnd, diag::err_raw_delim_too_long);

2301 } else if (*PrefixEnd == '\n') {

2302 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);

2303 } else {

2304 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)

2305 << StringRef(PrefixEnd, 1);

2306 }

2307 }

2308

2309

2310

2311

2312 while (true) {

2313 char C = *CurPtr++;

2314

2315 if (C == '"')

2316 break;

2317 if (C == 0 && CurPtr-1 == BufferEnd) {

2318 --CurPtr;

2319 break;

2320 }

2321 }

2322

2323 FormTokenWithChars(Result, CurPtr, tok::unknown);

2324 return true;

2325 }

2326

2327

2328 const char *Prefix = CurPtr;

2329 CurPtr += PrefixLen + 1;

2330

2331 while (true) {

2332 char C = *CurPtr++;

2333

2334 if (C == ')') {

2335

2336 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {

2337 CurPtr += PrefixLen + 1;

2338 break;

2339 }

2340 } else if (C == 0 && CurPtr-1 == BufferEnd) {

2342 Diag(BufferPtr, diag::err_unterminated_raw_string)

2343 << StringRef(Prefix, PrefixLen);

2344 FormTokenWithChars(Result, CurPtr-1, tok::unknown);

2345 return true;

2346 }

2347 }

2348

2349

2350 if (LangOpts.CPlusPlus)

2351 CurPtr = LexUDSuffix(Result, CurPtr, true);

2352

2353

2354 const char *TokStart = BufferPtr;

2355 FormTokenWithChars(Result, CurPtr, Kind);

2356 Result.setLiteralData(TokStart);

2357 return true;

2358}

2359

2360

2361

2362bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {

2363

2364 const char *NulCharacter = nullptr;

2365 const char *AfterLessPos = CurPtr;

2366 char C = getAndAdvanceChar(CurPtr, Result);

2367 while (C != '>') {

2368

2369

2370 if (C == '\\')

2371 C = getAndAdvanceChar(CurPtr, Result);

2372

2374 (C == 0 && (CurPtr - 1 == BufferEnd))) {

2375

2376

2377 FormTokenWithChars(Result, AfterLessPos, tok::less);

2378 return true;

2379 }

2380

2381 if (C == 0) {

2382 if (isCodeCompletionPoint(CurPtr - 1)) {

2383 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, true);

2384 cutOffLexing();

2385 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);

2386 return true;

2387 }

2388 NulCharacter = CurPtr-1;

2389 }

2390 C = getAndAdvanceChar(CurPtr, Result);

2391 }

2392

2393

2395 Diag(NulCharacter, diag::null_in_char_or_string) << 1;

2396

2397

2398 const char *TokStart = BufferPtr;

2399 FormTokenWithChars(Result, CurPtr, tok::header_name);

2400 Result.setLiteralData(TokStart);

2401 return true;

2402}

2403

2404void Lexer::codeCompleteIncludedFile(const char *PathStart,

2405 const char *CompletionPoint,

2406 bool IsAngled) {

2407

2408 StringRef PartialPath(PathStart, CompletionPoint - PathStart);

2409 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";

2410 auto Slash = PartialPath.find_last_of(SlashChars);

2411 StringRef Dir =

2412 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);

2413 const char *StartOfFilename =

2414 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;

2415

2416 PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(

2417 StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));

2418

2419

2420 while (CompletionPoint < BufferEnd) {

2421 char Next = *(CompletionPoint + 1);

2422 if (Next == 0 || Next == '\r' || Next == '\n')

2423 break;

2424 ++CompletionPoint;

2425 if (Next == (IsAngled ? '>' : '"'))

2426 break;

2427 if (SlashChars.contains(Next))

2428 break;

2429 }

2430

2431 PP->setCodeCompletionTokenRange(

2432 FileLoc.getLocWithOffset(StartOfFilename - BufferStart),

2433 FileLoc.getLocWithOffset(CompletionPoint - BufferStart));

2434 PP->CodeCompleteIncludedFile(Dir, IsAngled);

2435}

2436

2437

2438

2439bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,

2441

2442 const char *NulCharacter = nullptr;

2443

2445 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)

2446 Diag(BufferPtr, LangOpts.CPlusPlus

2447 ? diag::warn_cxx98_compat_unicode_literal

2448 : diag::warn_c99_compat_unicode_literal);

2449 else if (Kind == tok::utf8_char_constant)

2450 Diag(BufferPtr, LangOpts.CPlusPlus

2451 ? diag::warn_cxx14_compat_u8_character_literal

2452 : diag::warn_c17_compat_u8_character_literal);

2453 }

2454

2455 char C = getAndAdvanceChar(CurPtr, Result);

2456 if (C == '\'') {

2458 Diag(BufferPtr, diag::ext_empty_character);

2459 FormTokenWithChars(Result, CurPtr, tok::unknown);

2460 return true;

2461 }

2462

2463 while (C != '\'') {

2464

2465 if (C == '\\')

2466 C = getAndAdvanceChar(CurPtr, Result);

2467

2468 if (C == '\n' || C == '\r' ||

2469 (C == 0 && CurPtr-1 == BufferEnd)) {

2471 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;

2472 FormTokenWithChars(Result, CurPtr-1, tok::unknown);

2473 return true;

2474 }

2475

2476 if (C == 0) {

2477 if (isCodeCompletionPoint(CurPtr-1)) {

2478 PP->CodeCompleteNaturalLanguage();

2479 FormTokenWithChars(Result, CurPtr-1, tok::unknown);

2480 cutOffLexing();

2481 return true;

2482 }

2483

2484 NulCharacter = CurPtr-1;

2485 }

2486 C = getAndAdvanceChar(CurPtr, Result);

2487 }

2488

2489

2490 if (LangOpts.CPlusPlus)

2491 CurPtr = LexUDSuffix(Result, CurPtr, false);

2492

2493

2495 Diag(NulCharacter, diag::null_in_char_or_string) << 0;

2496

2497

2498 const char *TokStart = BufferPtr;

2499 FormTokenWithChars(Result, CurPtr, Kind);

2500 Result.setLiteralData(TokStart);

2501 return true;

2502}

2503

2504

2505

2506

2507

2508bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,

2509 bool &TokAtPhysicalStartOfLine) {

2510

2512

2513 unsigned char Char = *CurPtr;

2514

2515 const char *lastNewLine = nullptr;

2516 auto setLastNewLine = [&](const char *Ptr) {

2517 lastNewLine = Ptr;

2518 if (!NewLinePtr)

2519 NewLinePtr = Ptr;

2520 };

2521 if (SawNewline)

2522 setLastNewLine(CurPtr - 1);

2523

2524

2525 while (true) {

2526

2528 Char = *++CurPtr;

2529

2530

2532 break;

2533

2535

2536 BufferPtr = CurPtr;

2537 return false;

2538 }

2539

2540

2541 if (*CurPtr == '\n')

2542 setLastNewLine(CurPtr);

2543 SawNewline = true;

2544 Char = *++CurPtr;

2545 }

2546

2547

2549 FormTokenWithChars(Result, CurPtr, tok::unknown);

2550 if (SawNewline) {

2551 IsAtStartOfLine = true;

2552 IsAtPhysicalStartOfLine = true;

2553 }

2554

2555 return true;

2556 }

2557

2558

2559 char PrevChar = CurPtr[-1];

2561

2563 if (SawNewline) {

2565 TokAtPhysicalStartOfLine = true;

2566

2567 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {

2568 if (auto *Handler = PP->getEmptylineHandler())

2569 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),

2571 }

2572 }

2573

2574 BufferPtr = CurPtr;

2575 return false;

2576}

2577

2578

2579

2580

2581

2582

2583

2584bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,

2585 bool &TokAtPhysicalStartOfLine) {

2586

2587

2588 if (!LineComment) {

2589 if (isLexingRawMode())

2590 Diag(BufferPtr, diag::ext_line_comment);

2591

2592

2593

2594 LineComment = true;

2595 }

2596

2597

2598

2599

2600

2601

2602

2603

2604

2605

2606

2607

2608 bool UnicodeDecodingAlreadyDiagnosed = false;

2609

2610 char C;

2611 while (true) {

2612 C = *CurPtr;

2613

2614 while (isASCII(C) && C != 0 &&

2615 C != '\n' && C != '\r') {

2616 C = *++CurPtr;

2617 UnicodeDecodingAlreadyDiagnosed = false;

2618 }

2619

2621 unsigned Length = llvm::getUTF8SequenceSize(

2622 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);

2623 if (Length == 0) {

2624 if (!UnicodeDecodingAlreadyDiagnosed && isLexingRawMode())

2625 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);

2626 UnicodeDecodingAlreadyDiagnosed = true;

2627 ++CurPtr;

2628 } else {

2629 UnicodeDecodingAlreadyDiagnosed = false;

2630 CurPtr += Length;

2631 }

2632 continue;

2633 }

2634

2635 const char *NextLine = CurPtr;

2636 if (C != 0) {

2637

2638 const char *EscapePtr = CurPtr-1;

2639 bool HasSpace = false;

2641 --EscapePtr;

2642 HasSpace = true;

2643 }

2644

2645 if (*EscapePtr == '\\')

2646

2647 CurPtr = EscapePtr;

2648 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&

2649 EscapePtr[-2] == '?' && LangOpts.Trigraphs)

2650

2651 CurPtr = EscapePtr-2;

2652 else

2653 break;

2654

2655

2657 Diag(EscapePtr, diag::backslash_newline_space);

2658 }

2659

2660

2661

2662

2663

2664 const char *OldPtr = CurPtr;

2667 C = getAndAdvanceChar(CurPtr, Result);

2669

2670

2671

2672 if (C != 0 && CurPtr == OldPtr+1) {

2673 CurPtr = NextLine;

2674 break;

2675 }

2676

2677

2678

2679

2680 if (CurPtr != OldPtr + 1 && C != '/' &&

2681 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {

2682 for (; OldPtr != CurPtr; ++OldPtr)

2683 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {

2684

2685

2687 const char *ForwardPtr = CurPtr;

2688 while (isWhitespace(*ForwardPtr))

2689 ++ForwardPtr;

2690 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')

2691 break;

2692 }

2693

2695 Diag(OldPtr-1, diag::ext_multi_line_line_comment);

2696 break;

2697 }

2698 }

2699

2700 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {

2701 --CurPtr;

2702 break;

2703 }

2704

2705 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {

2706 PP->CodeCompleteNaturalLanguage();

2707 cutOffLexing();

2708 return false;

2709 }

2710 }

2711

2712

2713

2717 BufferPtr = CurPtr;

2718 return true;

2719 }

2720

2721

2723 return SaveLineComment(Result, CurPtr);

2724

2725

2726

2728 BufferPtr = CurPtr;

2729 return false;

2730 }

2731

2732

2733

2734

2735

2736

2737 NewLinePtr = CurPtr++;

2738

2739

2741 TokAtPhysicalStartOfLine = true;

2742

2744 BufferPtr = CurPtr;

2745 return false;

2746}

2747

2748

2749

2750bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {

2751

2752

2753 FormTokenWithChars(Result, CurPtr, tok::comment);

2754

2756 return true;

2757

2758

2759

2761 std::string Spelling = PP->getSpelling(Result, &Invalid);

2763 return true;

2764

2765 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");

2766 Spelling[1] = '*';

2767 Spelling += "*/";

2768

2769 Result.setKind(tok::comment);

2770 PP->CreateString(Spelling, Result,

2771 Result.getLocation(), Result.getLocation());

2772 return true;

2773}

2774

2775

2776

2777

2779 bool Trigraphs) {

2780 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

2781

2782

2783 const char *TrigraphPos = nullptr;

2784

2785 const char *SpacePos = nullptr;

2786

2787 while (true) {

2788

2789 --CurPtr;

2790

2791

2792 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {

2793

2794 if (CurPtr[0] == CurPtr[1])

2795 return false;

2796

2797 --CurPtr;

2798 }

2799

2800

2801

2803 SpacePos = CurPtr;

2804 --CurPtr;

2805 }

2806

2807

2808 if (*CurPtr == '\\') {

2809 --CurPtr;

2810 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {

2811

2812 TrigraphPos = CurPtr - 2;

2813 CurPtr -= 3;

2814 } else {

2815 return false;

2816 }

2817

2818

2819

2820 if (*CurPtr == '*')

2821 break;

2822

2823 if (*CurPtr != '\n' && *CurPtr != '\r')

2824 return false;

2825 }

2826

2827 if (TrigraphPos) {

2828

2829

2830 if (!Trigraphs) {

2832 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);

2833 return false;

2834 }

2836 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);

2837 }

2838

2839

2841 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

2842

2843

2845 L->Diag(SpacePos, diag::backslash_newline_space);

2846

2847 return true;

2848}

2849

2850#ifdef __SSE2__

2851#include <emmintrin.h>

2852#elif __ALTIVEC__

2854#undef bool

2855#endif

2856

2857

2858

2859

2860

2861

2862

2863

2864

2865

// Scans a block comment starting at CurPtr, looking for the '/' that follows
// a '*'. An unterminated comment is reported with
// err_unterminated_block_comment. Returns true on the paths that form a token
// in Result and false on the paths that only advance BufferPtr.
// NOTE(review): this copy is missing listing lines 2880, 2886, 2915, 2922,
// 2924, 2929, 2984, 3011, 3022, 3026, 3035, 3052-3054, 3060, 3069 and 3076,
// so several guarding conditions are not visible.
2866bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,

2867 bool &TokAtPhysicalStartOfLine) {

2868

2869

2870

2871

2872

2873

2874

2875

2876 unsigned CharSize;

2877 unsigned char C = getCharAndSize(CurPtr, CharSize);

2878 CurPtr += CharSize;

// A NUL read exactly at BufferEnd means the comment never got a body.
2879 if (C == 0 && CurPtr == BufferEnd+1) {

2881 Diag(BufferPtr, diag::err_unterminated_block_comment);

2882 --CurPtr;

2883

2884

2885

2887 FormTokenWithChars(Result, CurPtr, tok::unknown);

2888 return true;

2889 }

2890

2891 BufferPtr = CurPtr;

2892 return false;

2893 }

2894

2895

2896

2897 if (C == '/')

2898 C = *CurPtr++;

2899

2900

2901

2902

2903

2904 bool UnicodeDecodingAlreadyDiagnosed = false;

2905

2906 while (true) {

2907

2908

// Fast path: only taken when at least 24 bytes remain and this file is not
// the code-completion file.
2909 if (CurPtr + 24 < BufferEnd &&

2910

2911

2912 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {

2913

// Advance byte by byte until CurPtr is 16-byte aligned or a '/' is seen.
2914 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {

2916 goto MultiByteUTF8;

2917 C = *CurPtr++;

2918 }

2919 if (C == '/') goto FoundSlash;

2920

2921#ifdef __SSE2__

2923 while (CurPtr + 16 < BufferEnd) {

2925 if (LLVM_UNLIKELY(Mask != 0)) {

2926 goto MultiByteUTF8;

2927 }

2928

2930 Slashes));

2931 if (cmp != 0) {

2932

2933

2934

2935 CurPtr += llvm::countr_zero(cmp) + 1;

2936 goto FoundSlash;

2937 }

2938 CurPtr += 16;

2939 }

2940#elif __ALTIVEC__

2941 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

2942 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

2943 0x80, 0x80, 0x80, 0x80};

2944 __vector unsigned char Slashes = {

2945 '/', '/', '/', '/', '/', '/', '/', '/',

2946 '/', '/', '/', '/', '/', '/', '/', '/'

2947 };

2948 while (CurPtr + 16 < BufferEnd) {

2949 if (LLVM_UNLIKELY(

2950 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))

2951 goto MultiByteUTF8;

2952 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {

2953 break;

2954 }

2955 CurPtr += 16;

2956 }

2957

2958#else

// Portable fallback: examine 16 bytes per iteration.
2959 while (CurPtr + 16 < BufferEnd) {

2960 bool HasNonASCII = false;

2961 for (unsigned I = 0; I < 16; ++I)

// The flag must be set by bytes that are NOT ASCII, matching the AltiVec
// branch above, which tests for bytes >= 0x80.
2962 HasNonASCII |= !isASCII(CurPtr[I]);

2963

2964 if (LLVM_UNLIKELY(HasNonASCII))

2965 goto MultiByteUTF8;

2966

2967 bool HasSlash = false;

2968 for (unsigned I = 0; I < 16; ++I)

2969 HasSlash |= CurPtr[I] == '/';

2970 if (HasSlash)

2971 break;

2972 CurPtr += 16;

2973 }

2974#endif

2975

2976

2977 C = *CurPtr++;

2978 }

2979

2980

2981

2982

// Byte-at-a-time scan until a '/' or a NUL is reached.
2983 while (C != '/' && C != '\0') {

2985 UnicodeDecodingAlreadyDiagnosed = false;

2986 C = *CurPtr++;

2987 continue;

2988 }

2989 MultiByteUTF8:

2990

2991

// CurPtr is already one past the byte being examined, hence the "- 1".
2992 unsigned Length = llvm::getUTF8SequenceSize(

2993 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);

// A length of 0 is treated as invalid UTF-8; the flag keeps a run of bad
// bytes from being reported more than once.
2994 if (Length == 0) {

2995 if (!UnicodeDecodingAlreadyDiagnosed && isLexingRawMode())

2996 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);

2997 UnicodeDecodingAlreadyDiagnosed = true;

2998 } else {

2999 UnicodeDecodingAlreadyDiagnosed = false;

3000 CurPtr += Length - 1;

3001 }

3002 C = *CurPtr++;

3003 }

3004

3005 if (C == '/') {

3006 FoundSlash:

// CurPtr[-1] is the '/', so CurPtr[-2] == '*' means the comment ends here.
3007 if (CurPtr[-2] == '*')

3008 break;

3009

3010 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {

3012 LangOpts.Trigraphs)) {

3013

3014

3015 break;

3016 }

3017 }

// A "/*" seen inside the comment that is not part of "/*/".
3018 if (CurPtr[0] == '*' && CurPtr[1] != '/') {

3019

3020

3021

3023 Diag(CurPtr-1, diag::warn_nested_block_comment);

3024 }

// A NUL read exactly at BufferEnd: the comment is unterminated.
3025 } else if (C == 0 && CurPtr == BufferEnd+1) {

3027 Diag(BufferPtr, diag::err_unterminated_block_comment);

3028

3029

3030

3031 --CurPtr;

3032

3033

3034

3036 FormTokenWithChars(Result, CurPtr, tok::unknown);

3037 return true;

3038 }

3039

3040 BufferPtr = CurPtr;

3041 return false;

3042 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {

3043 PP->CodeCompleteNaturalLanguage();

3044 cutOffLexing();

3045 return false;

3046 }

3047

3048 C = *CurPtr++;

3049 }

3050

3051

3055 BufferPtr = CurPtr;

3056 return true;

3057 }

3058

3059

3061 FormTokenWithChars(Result, CurPtr, tok::comment);

3062 return true;

3063 }

3064

3065

3066

3067

3068

3070 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);

3071 return false;

3072 }

3073

3074

3075 BufferPtr = CurPtr;

3077 return false;

3078}

3079

3080

3081

3082

3083

3084

3085

3088 "Must be in a preprocessing directive!");

3091

3092

3093 const char *CurPtr = BufferPtr;

3094 while (true) {

3095 char Char = getAndAdvanceChar(CurPtr, Tmp);

3096 switch (Char) {

3097 default:

3099 Result->push_back(Char);

3100 break;

3101 case 0:

3102

3103 if (CurPtr-1 != BufferEnd) {

3104 if (isCodeCompletionPoint(CurPtr-1)) {

3105 PP->CodeCompleteNaturalLanguage();

3106 cutOffLexing();

3107 return;

3108 }

3109

3110

3112 Result->push_back(Char);

3113 break;

3114 }

3115

3116 [[fallthrough]];

3117 case '\r':

3118 case '\n':

3119

3120 assert(CurPtr[-1] == Char && "Trigraphs for newline?");

3121 BufferPtr = CurPtr-1;

3122

3123

3124 Lex(Tmp);

3125 if (Tmp.is(tok::code_completion)) {

3126 if (PP)

3127 PP->CodeCompleteNaturalLanguage();

3128 Lex(Tmp);

3129 }

3130 assert(Tmp.is(tok::eod) && "Unexpected token!");

3131

3132

3133 return;

3134 }

3135 }

3136}

3137

3138

3139

3140

3141

3142bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {

3143

3144

3145

3147

3149

3150 FormTokenWithChars(Result, CurPtr, tok::eod);

3151

3152

3153 if (PP)

3155 return true;

3156 }

3157

3158

3159

3161 Result.startToken();

3162 BufferPtr = BufferEnd;

3163 FormTokenWithChars(Result, BufferEnd, tok::eof);

3164 return true;

3165 }

3166

3167 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {

3169

3170

3171

3173 MIOpt.ExitTopLevelConditional();

3175 }

3176

3177

3178

3179

3181 if (PP->getCodeCompletionFileLoc() != FileLoc)

3183 diag::err_pp_unterminated_conditional);

3185 }

3186

3187

3188

3189

3190 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))

3191 Diag(BufferEnd, diag::warn_no_newline_eof)

3193

3194 BufferPtr = CurPtr;

3195

3196

3198}

3199

3200

3201

3202

3203std::optional Lexer::peekNextPPToken() {

3204 assert(LexingRawMode && "How can we expand a macro from a skipping buffer?");

3205

3206 if (isDependencyDirectivesLexer()) {

3207 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())

3208 return std::nullopt;

3210 (void)convertDependencyDirectiveToken(

3211 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex], Result);

3213 }

3214

3215

3216

3217

3219

3220

3221 const char *TmpBufferPtr = BufferPtr;

3223 bool atStartOfLine = IsAtStartOfLine;

3224 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;

3225 bool leadingSpace = HasLeadingSpace;

3226

3227 Token Tok;

3229

3230

3231 BufferPtr = TmpBufferPtr;

3233 HasLeadingSpace = leadingSpace;

3234 IsAtStartOfLine = atStartOfLine;

3235 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;

3236

3238

3239 if (Tok.is(tok::eof))

3240 return std::nullopt;

3241 return Tok;

3242}

3243

3244

// Searches [CurPtr, BufferEnd) for the terminator of a conflict-marker region
// and returns a pointer to it, or nullptr when none is found. The terminator
// is "<<<<\n" when CMK is CMK_Perforce and ">>>>>>>" otherwise. A match only
// counts when the character immediately before it is '\r' or '\n'.
// NOTE(review): the line declaring the CMK parameter (listing line 3246) is
// missing from this copy.
3245static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,

3247 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";

3248 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;

// The first TermLen characters are excluded from the search (presumably the
// marker the caller is currently positioned on - confirm against callers).
3249 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);

3250 size_t Pos = RestOfBuffer.find(Terminator);

3251 while (Pos != StringRef::npos) {

3252

// Reject a hit at offset 0 or one not preceded by a line break, then resume
// the search just past the rejected hit.
3253 if (Pos == 0 ||

3254 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {

3255 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);

3256 Pos = RestOfBuffer.find(Terminator);

3257 continue;

3258 }

3259 return RestOfBuffer.data()+Pos;

3260 }

3261 return nullptr;

3262}

3263

3264

3265

3266

3267

3268bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {

3269

3270 if (CurPtr != BufferStart &&

3271 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')

3272 return false;

3273

3274

3275 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&

3276 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))

3277 return false;

3278

3279

3280

3282 return false;

3283

3285

3286

3287

3289

3290

3291 Diag(CurPtr, diag::err_conflict_marker);

3292 CurrentConflictMarkerState = Kind;

3293

3294

3295

3296 while (*CurPtr != '\r' && *CurPtr != '\n') {

3297 assert(CurPtr != BufferEnd && "Didn't find end of line");

3298 ++CurPtr;

3299 }

3300 BufferPtr = CurPtr;

3301 return true;

3302 }

3303

3304

3305 return false;

3306}

3307

3308

3309

3310

3311

// Tries to skip from a marker line at CurPtr to the end of the current
// conflict region. On success, BufferPtr is moved to the end of the line that
// holds the terminator, CurrentConflictMarkerState is reset to CMK_None and
// true is returned. Returns false when CurPtr is not at the start of a line,
// when the four characters at CurPtr are not identical, or when
// FindConflictEnd finds no terminator.
// NOTE(review): the condition guarding the second `return false` (listing
// line 3320) is missing from this copy.
3312bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {

3313

// CurPtr must be the very start of the buffer or directly follow a line break.
3314 if (CurPtr != BufferStart &&

3315 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')

3316 return false;

3317

3318

3319

3321 return false;

3322

3323

// The marker must begin with four copies of the same character.
3324 for (unsigned i = 1; i != 4; ++i)

3325 if (CurPtr[i] != CurPtr[0])

3326 return false;

3327

3328

3329

3330

3331 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,

3332 CurrentConflictMarkerState)) {

3333 CurPtr = End;

3334

3335

// Advance to the end of the terminator's line (or the end of the buffer).
3336 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')

3337 ++CurPtr;

3338

3339 BufferPtr = CurPtr;

3340

3341

3342 CurrentConflictMarkerState = CMK_None;

3343 return true;

3344 }

3345

3346 return false;

3347}

3348

3350 const char *BufferEnd) {

3351 if (CurPtr == BufferEnd)

3352 return nullptr;

3353 BufferEnd -= 1;

3354 for (; CurPtr != BufferEnd; ++CurPtr) {

3355 if (CurPtr[0] == '#' && CurPtr[1] == '>')

3356 return CurPtr + 2;

3357 }

3358 return nullptr;

3359}

3360

// Lexes an editor placeholder whose opening "<#" has CurPtr pointing at the
// '#' (asserted). On success, a tok::raw_identifier token starting at the '<'
// and ending at End is formed in Result, BufferPtr is moved to End and true
// is returned. Returns false when placeholders are not lexed in the current
// configuration or when End is null.
// NOTE(review): listing lines 3365 (which declares End, presumably via
// findPlaceholderEnd - confirm) and 3375 are missing from this copy.
3361bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {

3362 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");

// Bail out when there is no preprocessor, when the LexEditorPlaceholders
// option is off, or in raw mode. PP has to be tested for null before it is
// dereferenced here and in the LookUpIdentifierInfo call further down.
3363 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)

3364 return false;

3366 if (!End)

3367 return false;

// The token starts at the '<' that precedes CurPtr.
3368 const char *Start = CurPtr - 1;

3369 if (!LangOpts.AllowEditorPlaceholders)

3370 Diag(Start, diag::err_placeholder_in_source);

3371 Result.startToken();

3372 FormTokenWithChars(Result, End, tok::raw_identifier);

3373 Result.setRawIdentifierData(Start);

3374 PP->LookUpIdentifierInfo(Result);

3376 BufferPtr = End;

3377 return true;

3378}

3379

3380bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {

3381 if (PP && PP->isCodeCompletionEnabled()) {

3382 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);

3383 return Loc == PP->getCodeCompletionLoc();

3384 }

3385

3386 return false;

3387}

3388

3390 bool Named,

3393 unsigned DiagId;

3394 if (Opts.CPlusPlus23)

3395 DiagId = diag::warn_cxx23_delimited_escape_sequence;

3396 else if (Opts.C2y && !Named)

3397 DiagId = diag::warn_c2y_delimited_escape_sequence;

3398 else

3399 DiagId = diag::ext_delimited_escape_sequence;

3400

3401

3402

3403

3404 unsigned Ext;

3405 if (!Opts.CPlusPlus)

3406 Ext = Named ? 2 : 1 ;

3407 else

3408 Ext = 0;

3409

3410 Diags.Report(Loc, DiagId) << Named << Ext;

3411}

3412

/// Tries to read a numeric universal character name whose kind character
/// ('u' or 'U') is the next character at \p StartPtr.
///
/// Accepts either a fixed number of hex digits (4 after 'u', 8 after 'U') or
/// a brace-delimited run of hex digits.
///
/// \param StartPtr points at the kind character; advanced past the escape
/// only on success.
/// \param SlashLoc location used for most diagnostics.
/// \returns the decoded code point, or std::nullopt if the escape is
/// malformed or UCNs are not available in the current language mode.
3413std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,

3414 const char *SlashLoc,

// NOTE(review): the line that closes the parameter list is not present in
// this listing. The body dereferences a `Result` pointer, so the final
// parameter is presumably `Token *Result` — confirm against the declaration.
3416 unsigned CharSize;

3417 char Kind = getCharAndSize(StartPtr, CharSize);

3418 assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

3419

3420 unsigned NumHexDigits;

3421 if (Kind == 'u')

3422 NumHexDigits = 4;

3423 else if (Kind == 'U')

3424 NumHexDigits = 8;

3425

3426 bool Delimited = false;

3427 bool FoundEndDelimiter = false;

3428 unsigned Count = 0;

// NOTE(review): `Diagnose` is used below but the line declaring it is not
// present in this listing.
3430

// UCNs are rejected outright when neither C++ nor C99 is enabled.
3431 if (!LangOpts.CPlusPlus && !LangOpts.C99) {

3432 if (Diagnose)

3433 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);

3434 return std::nullopt;

3435 }

3436

3437 const char *CurPtr = StartPtr + CharSize;

// KindLoc addresses the byte immediately before CurPtr; it is used to quote
// the kind character in diagnostics.
3438 const char *KindLoc = &CurPtr[-1];

3439

3440 uint32_t CodePoint = 0;

// Keep reading until the fixed digit count is reached; in delimited mode the
// loop only ends at '}' or on an error.
3441 while (Count != NumHexDigits || Delimited) {

3442 char C = getCharAndSize(CurPtr, CharSize);

// A '{' is only recognised as the very first character after the kind.
3443 if (!Delimited && Count == 0 && C == '{') {

3444 Delimited = true;

3445 CurPtr += CharSize;

3446 continue;

3447 }

3448

3449 if (Delimited && C == '}') {

3450 CurPtr += CharSize;

3451 FoundEndDelimiter = true;

3452 break;

3453 }

3454

3455 unsigned Value = llvm::hexDigitValue(C);

// Compare against the largest `unsigned`, matching the type of `Value`; the
// template argument has to be spelled out for this expression to be valid.
3456 if (Value == std::numeric_limits<unsigned>::max()) {

// Non-hex character: in the fixed-width form just stop and let the digit-count
// checks below decide; in the delimited form the escape is incomplete.
3457 if (!Delimited)

3458 break;

3459 if (Diagnose)

3460 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)

3461 << StringRef(KindLoc, 1);

3462 return std::nullopt;

3463 }

3464

// If the top nibble is already occupied, shifting in another digit would
// overflow the 32-bit accumulator.
3465 if (CodePoint & 0xF000'0000) {

3466 if (Diagnose)

3467 Diag(KindLoc, diag::err_escape_too_large) << 0;

3468 return std::nullopt;

3469 }

3470

3471 CodePoint <<= 4;

3472 CodePoint |= Value;

3473 CurPtr += CharSize;

3474 Count++;

3475 }

3476

// No digits at all: "empty" if a closing brace was seen, otherwise
// "no digits".
3477 if (Count == 0) {

3478 if (Diagnose)

3479 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty

3480 : diag::warn_ucn_escape_no_digits)

3481 << StringRef(KindLoc, 1);

3482 return std::nullopt;

3483 }

3484

// The delimited form is rejected when the kind character is 'U'.
3485 if (Delimited && Kind == 'U') {

3486 if (Diagnose)

3487 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);

3488 return std::nullopt;

3489 }

3490

// Fixed-width form with too few digits.
3491 if (!Delimited && Count != NumHexDigits) {

3492 if (Diagnose) {

3493 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);

3494

// Exactly four digits after a kind that wants eight gets an extra note.
// NOTE(review): the continuation of the note statement below (whatever is
// streamed into it using `URange`) is not present in this listing.
3495 if (Count == 4 && NumHexDigits == 8) {

3496 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);

3497 Diag(KindLoc, diag::note_ucn_four_not_eight)

3499 }

3500 }

3501 return std::nullopt;

3502 }

3503

// NOTE(review): the first line of the call guarded by this `if` is not present
// in this listing; only its trailing arguments survive.
3504 if (Delimited && PP)

3506 PP->getLangOpts(),

3507 PP->getDiagnostics());

3508

// NOTE(review): the line(s) opening the branch that the `} else {` below
// belongs to are not present in this listing.
3511

3512

3513

// If the bytes consumed equal digits + kind character (+ two braces when
// delimited), jump StartPtr straight to CurPtr; otherwise walk the range with
// getAndAdvanceChar so that Result is updated for each character.
3514 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))

3515 StartPtr = CurPtr;

3516 else

3517 while (StartPtr != CurPtr)

3518 (void)getAndAdvanceChar(StartPtr, *Result);

3519 } else {

3520 StartPtr = CurPtr;

3521 }

3522 return CodePoint;

3523}

3524

/// Tries to read a named universal character escape of the form `N{name}`,
/// where the 'N' is the next character at \p StartPtr.
///
/// \param StartPtr points at the 'N'; always advanced past the consumed text
/// once the name has been read and looked up.
/// \param SlashLoc location used for the "incomplete"/"empty" diagnostics.
/// \returns the matched code point, or std::nullopt when the escape is
/// malformed or no code point was accepted for the name.
3525std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,

3526 const char *SlashLoc,

// NOTE(review): the line that closes the parameter list is not present in
// this listing. The body dereferences a `Result` pointer, so the final
// parameter is presumably `Token *Result` — confirm against the declaration.
3528 unsigned CharSize;

// NOTE(review): `Diagnose` is used below but the line declaring it is not
// present in this listing.
3530

3531 char C = getCharAndSize(StartPtr, CharSize);

3532 assert(C == 'N' && "expected \\N{...}");

3533

3534 const char *CurPtr = StartPtr + CharSize;

3535 const char *KindLoc = &CurPtr[-1];

3536

// The name must be introduced by '{'.
3537 C = getCharAndSize(CurPtr, CharSize);

3538 if (C != '{') {

3539 if (Diagnose)

3540 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);

3541 return std::nullopt;

3542 }

3543 CurPtr += CharSize;

3544 const char *StartName = CurPtr;

3545 bool FoundEndDelimiter = false;

3546 llvm::SmallVector<char, 30> Buffer;

// Collect name characters into Buffer until '}' or a NUL character.
3547 while (C) {

3548 C = getCharAndSize(CurPtr, CharSize);

3549 CurPtr += CharSize;

3550 if (C == '}') {

3551 FoundEndDelimiter = true;

3552 break;

3553 }

3554

// NOTE(review): the condition guarding this `break` is not present in this
// listing.
3556 break;

3557 Buffer.push_back(C);

3558 }

3559

// Missing '}' => incomplete; '}' with nothing before it => empty.
3560 if (!FoundEndDelimiter || Buffer.empty()) {

3561 if (Diagnose)

3562 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty

3563 : diag::warn_delimited_ucn_incomplete)

3564 << StringRef(KindLoc, 1);

3565 return std::nullopt;

3566 }

3567

3568 StringRef Name(Buffer.data(), Buffer.size());

3569 std::optional<char32_t> Match =

3570 llvm::sys::unicode::nameToCodepointStrict(Name);

// LooseMatch is tested with `if (LooseMatch)` and accessed through `->`, so it
// is an optional wrapping the loose-matching result type.
3571 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;

// NOTE(review): the line opening the block that contains the loose lookup and
// the diagnostics below is not present in this listing (presumably the
// "strict lookup failed" check — confirm).
3573 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);

3574 if (Diagnose) {

3575 Diag(StartName, diag::err_invalid_ucn_name)

3576 << StringRef(Buffer.data(), Buffer.size())

3577 << makeCharRange(*this, StartName, CurPtr - CharSize);

// NOTE(review): part of the note statement below (between the diagnostic id
// and the range/name arguments) is not present in this listing.
3578 if (LooseMatch) {

3579 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)

3581 makeCharRange(*this, StartName, CurPtr - CharSize),

3582 LooseMatch->Name);

3583 }

3584 }

3585

3586

3587

3588 }

3589

// NOTE(review): the first line of the call guarded by this `if` is not present
// in this listing; only its trailing arguments survive.
3590 if (Diagnose && Match)

3592 PP->getLangOpts(),

3593 PP->getDiagnostics());

3594

3595

3596

3597

3598

// A loose match is adopted as the result only when diagnostics are enabled.
3599 if (LooseMatch && Diagnose)

3600 Match = LooseMatch->CodePoint;

3601

// NOTE(review): the line(s) opening the branch that the `} else {` below
// belongs to are not present in this listing.
3604

3605

3606

// If the bytes consumed equal the name length plus three (the 'N' and the two
// braces), jump StartPtr straight to CurPtr; otherwise walk the range with
// getAndAdvanceChar so that Result is updated for each character.
3607 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))

3608 StartPtr = CurPtr;

3609 else

3610 while (StartPtr != CurPtr)

3611 (void)getAndAdvanceChar(StartPtr, *Result);

3612 } else {

3613 StartPtr = CurPtr;

3614 }

3615 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;

3616}

3617

/// Reads a universal character name whose kind character ('u', 'U' or 'N') is
/// the next character at \p StartPtr, dispatching to tryReadNumericUCN or
/// tryReadNamedUCN, and then validates the resulting code point.
///
/// \returns the code point, or 0 when no UCN could be read or the code point
/// is rejected (below 0xA0, or in the range 0xD800-0xDFFF). In
/// AsmPreprocessor mode the code point is returned without that validation.
3618uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,

// NOTE(review): the line that closes the parameter list is not present in
// this listing; `Result` is forwarded to the helpers below, so it is
// presumably the final parameter.
3620

3621 unsigned CharSize;

3622 std::optional<uint32_t> CodePointOpt;

3623 char Kind = getCharAndSize(StartPtr, CharSize);

// Any other kind character leaves CodePointOpt empty and yields 0 below.
3624 if (Kind == 'u' || Kind == 'U')

3625 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);

3626 else if (Kind == 'N')

3627 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);

3628

3629 if (!CodePointOpt)

3630 return 0;

3631

3632 uint32_t CodePoint = *CodePointOpt;

3633

3634

// Skip all validation when preprocessing assembler input.
3635 if (LangOpts.AsmPreprocessor)

3636 return CodePoint;

3637

3638

3639

3640

3641

3642

3643

3644

3645

3646

3647

3648

3649

3650

3651

3652

3653

// Code points below 0xA0 are always rejected (return 0).
3654 if (CodePoint < 0xA0) {

3655

3656

// NOTE(review): the line opening the block around these diagnostics is not
// present in this listing.
// Below 0x20 or at/above 0x7F: control-character diagnostic; otherwise the
// offending character itself is quoted in the diagnostic.
3658 if (CodePoint < 0x20 || CodePoint >= 0x7F)

3659 Diag(BufferPtr, diag::err_ucn_control_character);

3660 else {

3661 char C = static_cast<char>(CodePoint);

3662 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);

3663 }

3664 }

3665

3666 return 0;

// 0xD800-0xDFFF is likewise rejected (return 0).
3667 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {

3668

3669

3670

// NOTE(review): the line opening the block around these diagnostics is not
// present in this listing.
// C++ before C++11 gets a warning; every other mode gets an error.
3672 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)

3673 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);

3674 else

3675 Diag(BufferPtr, diag::err_ucn_escape_invalid);

3676 }

3677 return 0;

3678 }

3679

3680 return CodePoint;

3681}

3682

/// Checks the code point \p C for Unicode whitespace handling.
///
/// \returns true from inside the guarded block (after issuing
/// ext_unicode_whitespace at BufferPtr), false otherwise.
///
/// NOTE(review): the line(s) forming the guarding condition, the remainder of
/// the diagnostic statement, and at least one statement before `return true`
/// are not present in this listing, so the exact condition and side effects
/// cannot be stated here.
3683bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,

3684 const char *CurPtr) {

3687 Diag(BufferPtr, diag::ext_unicode_whitespace)

3689

3691 return true;

3692 }

3693 return false;

3694}

3695

3696void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {

3697 IsAtStartOfLine = Result.isAtStartOfLine();

3698 HasLeadingSpace = Result.hasLeadingSpace();

3699 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();

3700

3701}

3702

// NOTE(review): the header line of this definition is not present in this
// listing; the body operates on a `Result` token and returns bool.
//
// Resets `Result`, transfers the lexer's pending start-of-line /
// leading-space / leading-empty-macro state to it, and then delegates to
// LexTokenInternal. Returns whatever LexTokenInternal returns.
// Must not be called on a dependency-directives lexer.
3704 assert(!isDependencyDirectivesLexer());

3705

3706

3707 Result.startToken();

3708

3709

// Each pending flag is consumed (cleared) here.
// NOTE(review): the statement inside each of the three `if` bodies that
// records the flag on `Result` is not present in this listing.
3710 if (IsAtStartOfLine) {

3712 IsAtStartOfLine = false;

3713 }

3714

3715 if (HasLeadingSpace) {

3717 HasLeadingSpace = false;

3718 }

3719

3720 if (HasLeadingEmptyMacro) {

3722 HasLeadingEmptyMacro = false;

3723 }

3724

// Snapshot and clear the physical-start-of-line flag before lexing.
3725 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;

3726 IsAtPhysicalStartOfLine = false;

// NOTE(review): the declaration of `isRawLex` is not present in this listing.
// The cast to void keeps it "used" when the assert below is compiled out.
3728 (void) isRawLex;

3729 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);

3730

// In raw mode LexTokenInternal is required to have produced a token.
3731 assert((returnedToken || !isRawLex) && "Raw lex must succeed");

3732 return returnedToken;

3733}

3734

3735

3736

3737

3738

3739

/// Lexes the next token starting at BufferPtr into \p Result.
///
/// Dispatches on the first character. Literals and identifiers are handed to
/// the dedicated Lex* helpers; punctuators set a token kind and are formed at
/// the bottom of the function. Whitespace, skipped comments, conflict markers
/// and invalid UTF-8 restart the scan through the LexNextToken label.
///
/// \param Result token to fill in; must be freshly reset by the caller.
/// \param TokAtPhysicalStartOfLine whether the token begins a physical line;
/// used to decide whether '#' introduces a directive.
/// \returns true when \p Result holds a token; false after a preprocessor
/// directive was handled without a fatal module-loader failure.
///
/// NOTE(review): a number of lines of this function (among them the
/// declaration of `Kind`, several conditions and some continuation lines of
/// multi-line calls) are not present in this listing.
3740bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {

3741LexStart:

// Entry invariants: the token must be clean and must not carry pointer data.
// The assert messages name the failure condition, so the checks are negated.
3742 assert(!Result.needsCleaning() && "Result needs cleaning");

3743 assert(!Result.hasPtrData() && "Result has not been reset");

3744

3745

3746 const char *CurPtr = BufferPtr;

3747

3748

// NOTE(review): the condition opening this block, the `while` closing the
// do-loop and the condition before FormTokenWithChars are not present in this
// listing.
3750 do {

3751 ++CurPtr;

3753

3754

3755

3756

3758 FormTokenWithChars(Result, CurPtr, tok::unknown);

3759

3760 return true;

3761 }

3762

3763 BufferPtr = CurPtr;

3765 }

3766

3767 unsigned SizeTmp, SizeTmp2;

3768

3769

3770 char Char = getAndAdvanceChar(CurPtr, Result);

3772

3774 NewLinePtr = nullptr;

3775

3776 switch (Char) {

3777 case 0:

3778

// A NUL exactly at BufferEnd is the end of the file.
3779 if (CurPtr-1 == BufferEnd)

3780 return LexEndOfFile(Result, CurPtr-1);

3781

3782

3783 if (isCodeCompletionPoint(CurPtr-1)) {

3784

3785 Result.startToken();

3786 FormTokenWithChars(Result, CurPtr, tok::code_completion);

3787 return true;

3788 }

3789

// Any other embedded NUL is diagnosed and then skipped as whitespace.
3791 Diag(CurPtr-1, diag::null_in_file);

3793 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

3794 return true;

3795

3796

3797

3798 goto LexNextToken;

3799

3800 case 26:

3801

// Character 26 ends the file when Microsoft extensions are enabled.
3802 if (LangOpts.MicrosoftExt) {

3804 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);

3805 return LexEndOfFile(Result, CurPtr-1);

3806 }

3807

3808

3809 Kind = tok::unknown;

3810 break;

3811

3812 case '\r':

3813 if (CurPtr[0] == '\n')

3814 (void)getAndAdvanceChar(CurPtr, Result);

3815 [[fallthrough]];

3816 case '\n':

3817

3818

3820

3822

3823

// NOTE(review): the condition opening the block that ends at `break; }` below
// and the statement guarded by `if (PP)` are not present in this listing.
3824 if (PP)

3826

3827

3828 IsAtStartOfLine = true;

3829 IsAtPhysicalStartOfLine = true;

3830 NewLinePtr = CurPtr - 1;

3831

3832 Kind = tok::eod;

3833 break;

3834 }

3835

3836

3838

3839 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

3840 return true;

3841

3842

3843

3844 goto LexNextToken;

3845 case ' ':

3846 case '\t':

3847 case '\f':

3848 case '\v':

3849 SkipHorizontalWhitespace:

3851 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

3852 return true;

3853

3854 SkipIgnoredUnits:

3855 CurPtr = BufferPtr;

3856

3857

3858

// NOTE(review): both comment fast paths below fire only when
// inKeepCommentMode() is true; confirm whether the condition was meant to be
// negated. Left unchanged because the listing alone does not settle it.
3859 if (CurPtr[0] == '/' && CurPtr[1] == '/' && inKeepCommentMode() &&

3860 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {

3861 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))

3862 return true;

3863 goto SkipIgnoredUnits;

3864 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && inKeepCommentMode()) {

3865 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))

3866 return true;

3867 goto SkipIgnoredUnits;

3869 goto SkipHorizontalWhitespace;

3870 }

3871

3872

3873 goto LexNextToken;

3874

3875

3876

3877 case '0': case '1': case '2': case '3': case '4':

3878 case '5': case '6': case '7': case '8': case '9':

3879

3880 MIOpt.ReadToken();

3881 return LexNumericConstant(Result, CurPtr);

3882

3883

3884

3885

// 'u' may start u"...", u'...', uR"...", u8"...", u8'...' or u8R"..." when
// C++11 or C11 is enabled; otherwise it starts an identifier.
3886 case 'u':

3887

3888 MIOpt.ReadToken();

3889

3890 if (LangOpts.CPlusPlus11 || LangOpts.C11) {

3891 Char = getCharAndSize(CurPtr, SizeTmp);

3892

3893

3894 if (Char == '"')

3895 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3896 tok::utf16_string_literal);

3897

3898

3899 if (Char == '\'')

3900 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3901 tok::utf16_char_constant);

3902

3903

3904 if (Char == 'R' && LangOpts.RawStringLiterals &&

3905 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')

3906 return LexRawStringLiteral(Result,

3907 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3909 tok::utf16_string_literal);

3910

3911 if (Char == '8') {

3912 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

3913

3914

3915 if (Char2 == '"')

3916 return LexStringLiteral(Result,

3917 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3919 tok::utf8_string_literal);

3920 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))

3921 return LexCharConstant(

3922 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3924 tok::utf8_char_constant);

3925

3926 if (Char2 == 'R' && LangOpts.RawStringLiterals) {

3927 unsigned SizeTmp3;

3928 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);

3929

3930 if (Char3 == '"') {

3931 return LexRawStringLiteral(Result,

3932 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3935 tok::utf8_string_literal);

3936 }

3937 }

3938 }

3939 }

3940

3941

3942 return LexIdentifierContinue(Result, CurPtr);

3943

// 'U' may start U"...", U'...' or UR"..." when C++11 or C11 is enabled.
3944 case 'U':

3945

3946 MIOpt.ReadToken();

3947

3948 if (LangOpts.CPlusPlus11 || LangOpts.C11) {

3949 Char = getCharAndSize(CurPtr, SizeTmp);

3950

3951

3952 if (Char == '"')

3953 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3954 tok::utf32_string_literal);

3955

3956

3957 if (Char == '\'')

3958 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3959 tok::utf32_char_constant);

3960

3961

3962 if (Char == 'R' && LangOpts.RawStringLiterals &&

3963 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')

3964 return LexRawStringLiteral(Result,

3965 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3967 tok::utf32_string_literal);

3968 }

3969

3970

3971 return LexIdentifierContinue(Result, CurPtr);

3972

// 'R' may start a raw string literal R"..." when raw strings are enabled.
3973 case 'R':

3974

3975 MIOpt.ReadToken();

3976

3977 if (LangOpts.RawStringLiterals) {

3978 Char = getCharAndSize(CurPtr, SizeTmp);

3979

3980 if (Char == '"')

3981 return LexRawStringLiteral(Result,

3982 ConsumeChar(CurPtr, SizeTmp, Result),

3983 tok::string_literal);

3984 }

3985

3986

3987 return LexIdentifierContinue(Result, CurPtr);

3988

// 'L' may start L"...", LR"..." or L'...'; otherwise it falls through to the
// identifier case.
3989 case 'L':

3990

3991 MIOpt.ReadToken();

3992 Char = getCharAndSize(CurPtr, SizeTmp);

3993

3994

3995 if (Char == '"')

3996 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3997 tok::wide_string_literal);

3998

3999

4000 if (LangOpts.RawStringLiterals && Char == 'R' &&

4001 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')

4002 return LexRawStringLiteral(Result,

4003 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4005 tok::wide_string_literal);

4006

4007

4008 if (Char == '\'')

4009 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),

4010 tok::wide_char_constant);

4011

4012 [[fallthrough]];

4013

4014

// Remaining identifier-start letters (L, R, U and u are handled above).
4015 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':

4016 case 'H': case 'I': case 'J': case 'K': case 'M': case 'N':

4017 case 'O': case 'P': case 'Q': case 'S': case 'T':

4018 case 'V': case 'W': case 'X': case 'Y': case 'Z':

4019 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':

4020 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':

4021 case 'o': case 'p': case 'q': case 'r': case 's': case 't':

4022 case 'v': case 'w': case 'x': case 'y': case 'z':

4023 case '_':

4024

4025 MIOpt.ReadToken();

4026 return LexIdentifierContinue(Result, CurPtr);

4027

// '$' starts an identifier only when DollarIdents is enabled.
4028 case '$':

4029 if (LangOpts.DollarIdents) {

4031 Diag(CurPtr-1, diag::ext_dollar_in_identifier);

4032

4033 MIOpt.ReadToken();

4034 return LexIdentifierContinue(Result, CurPtr);

4035 }

4036

4037 Kind = tok::unknown;

4038 break;

4039

4040

4041 case '\'':

4042

4043 MIOpt.ReadToken();

4044 return LexCharConstant(Result, CurPtr, tok::char_constant);

4045

4046

4047 case '"':

4048

4049 MIOpt.ReadToken();

// NOTE(review): the first alternative of the conditional token kind passed
// here is not present in this listing.
4050 return LexStringLiteral(Result, CurPtr,

4052 : tok::string_literal);

4053

4054

4055 case '?':

4056 Kind = tok::question;

4057 break;

4058 case '[':

4059 Kind = tok::l_square;

4060 break;

4061 case ']':

4062 Kind = tok::r_square;

4063 break;

4064 case '(':

4065 Kind = tok::l_paren;

4066 break;

4067 case ')':

4068 Kind = tok::r_paren;

4069 break;

4070 case '{':

4071 Kind = tok::l_brace;

4072 break;

4073 case '}':

4074 Kind = tok::r_brace;

4075 break;

// '.' : a following digit starts a numeric constant; otherwise ".*" (C++),
// "..." or a plain period.
4076 case '.':

4077 Char = getCharAndSize(CurPtr, SizeTmp);

4078 if (Char >= '0' && Char <= '9') {

4079

4080 MIOpt.ReadToken();

4081

4082 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));

4083 } else if (LangOpts.CPlusPlus && Char == '*') {

4084 Kind = tok::periodstar;

4085 CurPtr += SizeTmp;

4086 } else if (Char == '.' &&

4087 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {

4088 Kind = tok::ellipsis;

4089 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4091 } else {

4092 Kind = tok::period;

4093 }

4094 break;

4095 case '&':

4096 Char = getCharAndSize(CurPtr, SizeTmp);

4097 if (Char == '&') {

4098 Kind = tok::ampamp;

4099 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4100 } else if (Char == '=') {

4101 Kind = tok::ampequal;

4102 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4103 } else {

4104 Kind = tok::amp;

4105 }

4106 break;

4107 case '*':

4108 if (getCharAndSize(CurPtr, SizeTmp) == '=') {

4109 Kind = tok::starequal;

4110 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4111 } else {

4112 Kind = tok::star;

4113 }

4114 break;

4115 case '+':

4116 Char = getCharAndSize(CurPtr, SizeTmp);

4117 if (Char == '+') {

4118 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4119 Kind = tok::plusplus;

4120 } else if (Char == '=') {

4121 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4122 Kind = tok::plusequal;

4123 } else {

4124 Kind = tok::plus;

4125 }

4126 break;

4127 case '-':

4128 Char = getCharAndSize(CurPtr, SizeTmp);

4129 if (Char == '-') {

4130 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4131 Kind = tok::minusminus;

4132 } else if (Char == '>' && LangOpts.CPlusPlus &&

4133 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {

4134 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4136 Kind = tok::arrowstar;

4137 } else if (Char == '>') {

4138 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4139 Kind = tok::arrow;

4140 } else if (Char == '=') {

4141 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4142 Kind = tok::minusequal;

4143 } else {

4144 Kind = tok::minus;

4145 }

4146 break;

4147 case '~':

4148 Kind = tok::tilde;

4149 break;

4150 case '!':

4151 if (getCharAndSize(CurPtr, SizeTmp) == '=') {

4152 Kind = tok::exclaimequal;

4153 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4154 } else {

4155 Kind = tok::exclaim;

4156 }

4157 break;

4158 case '/':

4159

4160 Char = getCharAndSize(CurPtr, SizeTmp);

4161 if (Char == '/') {

4162

4163

4164

4165

4166

4167

4168

4169

// "//" is a comment when line comments are enabled (and, in C, not in
// traditional-cpp mode). Otherwise, unless producing preprocessed output, it
// is still treated as a comment when the character after it is not '*'.
4170 bool TreatAsComment =

4171 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);

4172 if (!TreatAsComment)

4173 if (!(PP && PP->isPreprocessedOutput()))

4174 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

4175

4176 if (TreatAsComment) {

4177 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),

4178 TokAtPhysicalStartOfLine))

4179 return true;

4180

4181

4182

4183

4184 goto SkipIgnoredUnits;

4185 }

4186 }

4187

4188 if (Char == '*') {

4189 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),

4190 TokAtPhysicalStartOfLine))

4191 return true;

4192

4193

4194

4195 goto LexNextToken;

4196 }

4197

4198 if (Char == '=') {

4199 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4200 Kind = tok::slashequal;

4201 } else {

4202 Kind = tok::slash;

4203 }

4204 break;

// '%' : "%=", or with digraphs enabled "%>" ('}'), "%:" ('#'), "%:%:" ('##')
// and, with Microsoft extensions, "%:@" ('#@').
4205 case '%':

4206 Char = getCharAndSize(CurPtr, SizeTmp);

4207 if (Char == '=') {

4208 Kind = tok::percentequal;

4209 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4210 } else if (LangOpts.Digraphs && Char == '>') {

4211 Kind = tok::r_brace;

4212 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4213 } else if (LangOpts.Digraphs && Char == ':') {

4214 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4215 Char = getCharAndSize(CurPtr, SizeTmp);

4216 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {

4217 Kind = tok::hashhash;

4218 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4220 } else if (Char == '@' && LangOpts.MicrosoftExt) {

4221 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4223 Diag(BufferPtr, diag::ext_charize_microsoft);

4224 Kind = tok::hashat;

4225 } else {

4226

4227

4228

4229

// Directives are handled only when NOT lexing in raw mode: HandleDirective
// goes through PP and may return false, which the caller forbids for raw
// lexing ("Raw lex must succeed").
4230 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)

4231 goto HandleDirective;

4232

4233 Kind = tok::hash;

4234 }

4235 } else {

4236 Kind = tok::percent;

4237 }

4238 break;

4239 case '<':

4240 Char = getCharAndSize(CurPtr, SizeTmp);

// NOTE(review): the condition opening the branch that lexes an angled string
// literal is not present in this listing.
4242 return LexAngledStringLiteral(Result, CurPtr);

4243 } else if (Char == '<') {

4244 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);

4245 if (After == '=') {

4246 Kind = tok::lesslessequal;

4247 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4249 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {

4250

4251

4252 goto LexNextToken;

4253 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {

4254

4255

4256 goto LexNextToken;

4257 } else if (LangOpts.CUDA && After == '<') {

4258 Kind = tok::lesslessless;

4259 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4261 } else {

4262 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4263 Kind = tok::lessless;

4264 }

4265 } else if (Char == '=') {

4266 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);

4267 if (After == '>') {

// "<=>" is a single spaceship token only in C++20 mode.
// NOTE(review): several lines of this branch (diagnostic guards and
// continuation lines) are not present in this listing.
4268 if (LangOpts.CPlusPlus20) {

4270 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);

4271 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4273 Kind = tok::spaceship;

4274 break;

4275 }

4276

4277

4279 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)

4282 }

4283 }

4284 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4285 Kind = tok::lessequal;

4286 } else if (LangOpts.Digraphs && Char == ':') {

4287 if (LangOpts.CPlusPlus11 &&

4288 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {

4289

4290

4291

4292

4293

// In C++11, "<::" followed by neither ':' nor '>' is lexed as '<' alone
// rather than as the "<:" digraph.
4294 unsigned SizeTmp3;

4295 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);

4296 if (After != ':' && After != '>') {

4297 Kind = tok::less;

4299 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);

4300 break;

4301 }

4302 }

4303

4304 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4305 Kind = tok::l_square;

4306 } else if (LangOpts.Digraphs && Char == '%') {

4307 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4308 Kind = tok::l_brace;

4309 } else if (Char == '#' && SizeTmp == 1 &&

4310 lexEditorPlaceholder(Result, CurPtr)) {

4311 return true;

4312 } else {

4313 Kind = tok::less;

4314 }

4315 break;

4316 case '>':

4317 Char = getCharAndSize(CurPtr, SizeTmp);

4318 if (Char == '=') {

4319 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4320 Kind = tok::greaterequal;

4321 } else if (Char == '>') {

4322 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);

4323 if (After == '=') {

4324 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4326 Kind = tok::greatergreaterequal;

4327 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {

4328

4329

4330 goto LexNextToken;

4331 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {

4332

4333 goto LexNextToken;

4334 } else if (LangOpts.CUDA && After == '>') {

4335 Kind = tok::greatergreatergreater;

4336 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4338 } else {

4339 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4340 Kind = tok::greatergreater;

4341 }

4342 } else {

4343 Kind = tok::greater;

4344 }

4345 break;

4346 case '^':

4347 Char = getCharAndSize(CurPtr, SizeTmp);

4348 if (Char == '=') {

4349 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4350 Kind = tok::caretequal;

4351 } else {

// OpenCL diagnoses "^^" but still produces a single caret token.
4352 if (LangOpts.OpenCL && Char == '^')

4353 Diag(CurPtr, diag::err_opencl_logical_exclusive_or);

4354 Kind = tok::caret;

4355 }

4356 break;

4357 case '|':

4358 Char = getCharAndSize(CurPtr, SizeTmp);

4359 if (Char == '=') {

4360 Kind = tok::pipeequal;

4361 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4362 } else if (Char == '|') {

4363

// "|||" may be the end of a conflict marker.
4364 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))

4365 goto LexNextToken;

4366 Kind = tok::pipepipe;

4367 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4368 } else {

4369 Kind = tok::pipe;

4370 }

4371 break;

4372 case ':':

4373 Char = getCharAndSize(CurPtr, SizeTmp);

4374 if (LangOpts.Digraphs && Char == '>') {

4375 Kind = tok::r_square;

4376 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4377 } else if (Char == ':') {

4378 Kind = tok::coloncolon;

4379 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4380 } else {

4381 Kind = tok::colon;

4382 }

4383 break;

4384 case ';':

4385 Kind = tok::semi;

4386 break;

4387 case '=':

4388 Char = getCharAndSize(CurPtr, SizeTmp);

4389 if (Char == '=') {

4390

// "===" may be the end of a conflict marker.
4391 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))

4392 goto LexNextToken;

4393

4394 Kind = tok::equalequal;

4395 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4396 } else {

4397 Kind = tok::equal;

4398 }

4399 break;

4400 case ',':

4401 Kind = tok::comma;

4402 break;

4403 case '#':

4404 Char = getCharAndSize(CurPtr, SizeTmp);

4405 if (Char == '#') {

4406 Kind = tok::hashhash;

4407 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4408 } else if (Char == '@' && LangOpts.MicrosoftExt) {

4409 Kind = tok::hashat;

4411 Diag(BufferPtr, diag::ext_charize_microsoft);

4412 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4413 } else {

4414

4415

4416

4417

// Directives are handled only when NOT lexing in raw mode: HandleDirective
// goes through PP and may return false, which the caller forbids for raw
// lexing ("Raw lex must succeed").
4418 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)

4419 goto HandleDirective;

4420

4421 Kind = tok::hash;

4422 }

4423 break;

4424

4425 case '@':

4426

4427 if (CurPtr[-1] == '@' && LangOpts.ObjC)

4428 Kind = tok::at;

4429 else

4430 Kind = tok::unknown;

4431 break;

4432

4433

// A backslash may introduce a UCN (not tried in AsmPreprocessor mode). A
// non-zero code point is either Unicode whitespace or the start of an
// identifier; otherwise the backslash is an unknown token.
4434 case '\\':

4435 if (!LangOpts.AsmPreprocessor) {

4436 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {

4437 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {

4438 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

4439 return true;

4440

4441

4442

4443 goto LexNextToken;

4444 }

4445

4446 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);

4447 }

4448 }

4449

4450 Kind = tok::unknown;

4451 break;

4452

4453 default: {

// NOTE(review): the condition opening the block that ends at `break; }` below
// is not present in this listing.
4455 Kind = tok::unknown;

4456 break;

4457 }

4458

4459 llvm::UTF32 CodePoint;

4460

4461

4462

// Step back to the first byte and decode one strict UTF-8 sequence.
4463 --CurPtr;

4464 llvm::ConversionResult Status =

4465 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,

4466 (const llvm::UTF8 *)BufferEnd,

4467 &CodePoint,

4468 llvm::strictConversion);

4469 if (Status == llvm::conversionOK) {

4470 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {

4471 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

4472 return true;

4473

4474

4475

4476 goto LexNextToken;

4477 }

4478 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);

4479 }

4480

// NOTE(review): the first line of this condition is not present in this
// listing; only its final operand survives.
4482 PP->isPreprocessedOutput()) {

4483 ++CurPtr;

4484 Kind = tok::unknown;

4485 break;

4486 }

4487

4488

4489

4490

// Invalid UTF-8: diagnose, skip one byte and lex again.
4491 Diag(CurPtr, diag::err_invalid_utf8);

4492

4493 BufferPtr = CurPtr+1;

4494

4495

4496

4497 goto LexNextToken;

4498 }

4499 }

4500

4501

// Common exit for every case that only chose a token kind.
4502 MIOpt.ReadToken();

4503

4504

4505 FormTokenWithChars(Result, CurPtr, Kind);

4506 return true;

4507

4508HandleDirective:

4509

4510

// Form the '#' token and let the preprocessor handle the directive. Returns
// true only after a fatal module-loader failure, otherwise false.
4511 FormTokenWithChars(Result, CurPtr, tok::hash);

4512 PP->HandleDirective(Result);

4513

4514 if (PP->hadModuleLoaderFatalFailure())

4515

4516 return true;

4517

4518

4519 return false;

4520

4521LexNextToken:

// NOTE(review): a statement before this jump is not present in this listing.
4523 goto LexStart;

4524}

4525

/// Starts filling \p Result from the pre-scanned dependency-directive token
/// `DDTok` and advances BufferPtr to just past that token
/// (BufferStart + DDTok.Offset + DDTok.Length).
///
/// \returns a pointer to the first character of the token in the buffer.
///
/// NOTE(review): the parameter list and the statements between startToken()
/// and the BufferPtr update are not present in this listing, so the fields
/// they set on \p Result cannot be described here.
4526const char *Lexer::convertDependencyDirectiveToken(

4528 const char *TokPtr = BufferStart + DDTok.Offset;

4529 Result.startToken();

4534 BufferPtr = TokPtr + DDTok.Length;

4535 return TokPtr;

4536}

4537

/// Produces the next token from the pre-scanned dependency directives
/// (DepDirectives) instead of lexing the buffer character by character.
///
/// \returns true when \p Result holds a token; false after a '#' at the start
/// of a line was handed to the preprocessor without a fatal module-loader
/// failure. For raw identifiers the result of PP->HandleIdentifier is
/// returned.
///
/// NOTE(review): several lines of this function are not present in this
/// listing; the gaps are marked below.
4538bool Lexer::LexDependencyDirectiveToken(Token &Result) {

4539 assert(isDependencyDirectivesLexer());

4540

4541 using namespace dependency_directives_scan;

4542

4543 if (BufferPtr == BufferEnd)

4544 return LexEndOfFile(Result, BufferPtr);

4545

// Advance to the next directive once every token of the current one has been
// consumed; pp_eof ends the file.
4546 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {

4547 if (DepDirectives.front().Kind == pp_eof)

4548 return LexEndOfFile(Result, BufferEnd);

4549 if (DepDirectives.front().Kind == tokens_present_before_eof)

4550 MIOpt.ReadToken();

4551 NextDepDirectiveTokenIndex = 0;

4552 DepDirectives = DepDirectives.drop_front();

4553 }

4554

4555 const dependency_directives_scan::Token &DDTok =

4556 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];

// Everything except a directive's leading '#' is reported to MIOpt.
4557 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {

4558

4559 MIOpt.ReadToken();

4560 }

4561

// NOTE(review): the condition opening this block is not present in this
// listing. Inside it, the token is lexed from the buffer with
// LexAngledStringLiteral; if that yields a header_name, the pre-scanned tokens
// lying before the new BufferPtr are skipped.
4563 BufferPtr = BufferStart + DDTok.Offset;

4564 LexAngledStringLiteral(Result, BufferPtr + 1);

4565 if (Result.isNot(tok::header_name))

4566 return true;

4567

4568 while (true) {

4569 const dependency_directives_scan::Token &NextTok =

4570 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];

4571 if (BufferStart + NextTok.Offset >= BufferPtr)

4572 break;

4573 ++NextDepDirectiveTokenIndex;

4574 }

4575 return true;

4576 }

4577

4578 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);

4579

// A '#' at the start of a line is a directive: hand it to the preprocessor.
4580 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {

4581 PP->HandleDirective(Result);

4582 if (PP->hadModuleLoaderFatalFailure())

4583

4584 return true;

4585 return false;

4586 }

4587 if (Result.is(tok::raw_identifier)) {

4588 Result.setRawIdentifierData(TokPtr);

// NOTE(review): the line opening the inner block (closed by the first `}`
// below) and the condition before the HandleIdentifier call are not present
// in this listing.
4590 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);

4592 return PP->HandleIdentifier(Result);

4593 }

4594 return true;

4595 }

4596 if (Result.isLiteral()) {

4597 Result.setLiteralData(TokPtr);

4598 return true;

4599 }

4600 if (Result.is(tok::colon)) {

4601

// Two adjacent ':' tokens are merged into a single '::' token.
4602 if (*BufferPtr == ':') {

4603 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(

4604 tok::colon));

4605 ++NextDepDirectiveTokenIndex;

4606 Result.setKind(tok::coloncolon);

4607 }

4608 return true;

4609 }

// NOTE(review): the statement guarded by this `if` is not present in this
// listing; as written the `if` guards the `return true;` below.
4610 if (Result.is(tok::eod))

4612

4613 return true;

4614}

4615

/// Advances through the pre-scanned dependency directives until one is found
/// at which skipping stops (with no enclosing nested conditional still open),
/// then converts that directive's leading '#' into \p Result.
///
/// \returns false once the stopping directive's '#' has been converted, or
/// the result of LexEndOfFile when the end-of-file case is reached.
///
/// NOTE(review): every `case` label of the switch below is missing from this
/// listing, so which directive kinds map to which action cannot be stated
/// here.
4616bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {

4617 assert(isDependencyDirectivesLexer());

4618

4619 using namespace dependency_directives_scan;

4620

4621 bool Stop = false;

// Depth of conditionals opened while skipping.
4622 unsigned NestedIfs = 0;

4623 do {

4624 DepDirectives = DepDirectives.drop_front();

4625 switch (DepDirectives.front().Kind) {

4627 llvm_unreachable("unexpected 'pp_none'");

// Group with no effect on skipping.
4646 break;

// Group that opens a nested conditional.
4650 ++NestedIfs;

4651 break;

// Group that stops skipping only at nesting depth zero.
4656 if (!NestedIfs) {

4657 Stop = true;

4658 }

4659 break;

// Group that stops at depth zero, otherwise closes one nested conditional.
4661 if (!NestedIfs) {

4662 Stop = true;

4663 } else {

4664 --NestedIfs;

4665 }

4666 break;

// End-of-file group: reset the token index and finish the file.
4668 NextDepDirectiveTokenIndex = 0;

4669 return LexEndOfFile(Result, BufferEnd);

4670 }

4671 } while (!Stop);

4672

// The stopping directive must begin with '#'; the next token to read after it
// is index 1.
4673 const dependency_directives_scan::Token &DDTok =

4674 DepDirectives.front().Tokens.front();

4675 assert(DDTok.is(tok::hash));

4676 NextDepDirectiveTokenIndex = 1;

4677

4678 convertDependencyDirectiveToken(DDTok, Result);

4679 return false;

4680}

Defines the Diagnostic-related interfaces.

unsigned IsFirst

Indicates that this is the first token of the file.

FormatToken * Next

The next token in the unwrapped line.

Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.

Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.

Defines the clang::LangOptions interface.

static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)

Definition Lexer.cpp:944

static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)

Definition Lexer.cpp:1564

static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)

Definition Lexer.cpp:1758

static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)

DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?

Definition Lexer.cpp:1257

static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)

Slow case of getSpelling.

Definition Lexer.cpp:324

static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)

Find the end of a version control conflict marker.

Definition Lexer.cpp:3245

static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)

After encountering UTF-8 character C and interpreting it as an identifier character,...

Definition Lexer.cpp:1683

static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

Definition Lexer.cpp:560

static void StringifyImpl(T &Str, char Quote)

Definition Lexer.cpp:284

static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)

GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...

Definition Lexer.cpp:1185

static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)

Definition Lexer.cpp:1578

static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)

Definition Lexer.cpp:1648

static bool isUnicodeWhitespace(uint32_t Codepoint)

Definition Lexer.cpp:1545

static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)

Definition Lexer.cpp:1632

static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)

Definition Lexer.cpp:3349

static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)

Definition Lexer.cpp:1551

static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)

Definition Lexer.cpp:918

static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)

isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline character (either \n or \r) ...

Definition Lexer.cpp:2778

static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)

Definition Lexer.cpp:1924

static char GetTrigraphCharForLetter(char Letter)

GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph letter it corresponds to, or '\0' if nothing.

Definition Lexer.cpp:1238

static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)

Definition Lexer.cpp:1606

static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)

Definition Lexer.cpp:1654

static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)

Returns the pointer that points to the beginning of line that contains the given offset,...

Definition Lexer.cpp:543

Defines the MultipleIncludeOpt interface.

Defines the clang::Preprocessor interface.

Defines the clang::SourceLocation class and associated facilities.

Defines the SourceManager interface.

Defines the clang::TokenKind enum and support functions.

static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]

static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]

static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]

static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]

static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]

static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]

static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]

static const llvm::sys::UnicodeCharRange XIDStartRanges[]

static const llvm::sys::UnicodeCharRange XIDContinueRanges[]

__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)

__device__ __2f16 float c

__PTRDIFF_TYPE__ ptrdiff_t

A signed integer type that is the result of subtracting two pointers.

static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)

static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)

Represents a character-granular source range.

static CharSourceRange getCharRange(SourceRange R)

SourceLocation getEnd() const

SourceLocation getBegin() const

A little helper class used to produce diagnostics.

Concrete class used by the front-end to report problems and issues.

DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)

Issue the message to the client.

bool isIgnored(unsigned DiagID, SourceLocation Loc) const

Determine whether the diagnostic is known to be ignored.

An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...

static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)

Create a code modification hint that replaces the given source range with the given code string.

static FixItHint CreateRemoval(CharSourceRange RemoveRange)

Create a code modification hint that removes the given source range.

static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)

Create a code modification hint that inserts the given code string at a specific location.

One of these records is kept for each identifier that is lexed.

bool isHandleIdentifierCase() const

Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.

bool isKeyword(const LangOptions &LangOpts) const

Return true if this token is a keyword in the specified language.

tok::ObjCKeywordKind getObjCKeywordID() const

Return the Objective-C keyword ID for this identifier.

Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...

Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.

static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)

Returns a string for the source that the range encompasses.

Definition Lexer.cpp:1020

friend class Preprocessor

void SetKeepWhitespaceMode(bool Val)

SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.

static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)

Checks that the given token is the first token that occurs after the given location (this excludes co...

Definition Lexer.cpp:1377

bool LexFromRawLexer(Token &Result)

LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...

static unsigned getEscapedNewLineSize(const char *P)

getEscapedNewLineSize - Return the size of the specified escaped newline, or 0 if it is not an escape...

Definition Lexer.cpp:1276

bool inKeepCommentMode() const

inKeepCommentMode - Return true if the lexer should return comments as tokens.

void SetCommentRetentionState(bool Mode)

SetCommentRetentionState - Change the comment retention mode of the lexer to the specified mode.

static std::optional< Token > findPreviousToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments)

Finds the token that comes before the given location.

Definition Lexer.cpp:1352

void seek(unsigned Offset, bool IsAtStartOfLine)

Set the lexer's buffer pointer to Offset.

Definition Lexer.cpp:277

static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

Retrieve the name of the immediate macro expansion.

Definition Lexer.cpp:1056

void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)

ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.

Definition Lexer.cpp:3086

static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)

Returns true if the given MacroID location points at the first token of the macro expansion.

Definition Lexer.cpp:870

DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const

Diag - Forwarding function for diagnostics.

Definition Lexer.cpp:1228

const char * getBufferLocation() const

Return the current location in the buffer.

bool Lex(Token &Result)

Lex - Return the next token in the file.

Definition Lexer.cpp:3703

bool isPragmaLexer() const

isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.

static void DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc, bool Named, const LangOptions &Opts, DiagnosticsEngine &Diags)

Diagnose use of a delimited or named escape sequence.

Definition Lexer.cpp:3389

static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)

Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...

Definition Lexer.cpp:789

Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)

Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...

Definition Lexer.cpp:183

static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)

Returns true if the given MacroID location points at the last token of the macro expansion.

Definition Lexer.cpp:892

SourceLocation getSourceLocation() override

getSourceLocation - Return a source location for the next character in the current file.

static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)

Accepts a range and returns a character range with file locations.

Definition Lexer.cpp:951

static bool isNewLineEscaped(const char *BufferStart, const char *Str)

Checks whether new line pointed by Str is preceded by escape sequence.

Definition Lexer.cpp:1134

SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const

getSourceLocation - Return a source location identifier for the specified offset in the current file.

Definition Lexer.cpp:1209

static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)

Returns the leading whitespace for line that corresponds to the given location Loc.

Definition Lexer.cpp:1154

static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)

getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...

Definition Lexer.cpp:451

bool isKeepWhitespaceMode() const

isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...

static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)

Returns true if the given character could appear in an identifier.

Definition Lexer.cpp:1130

static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments=false)

Finds the token that comes right after the given location.

Definition Lexer.cpp:1321

static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...

Definition Lexer.cpp:498

static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...

Definition Lexer.cpp:608

void resetExtendedTokenMode()

Sets the extended token mode back to its initial value, according to the language options and preproc...

Definition Lexer.cpp:219

static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

Retrieve the name of the immediate macro expansion.

Definition Lexer.cpp:1103

static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)

Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.

Definition Lexer.cpp:242

static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)

Compute the preamble of the given file.

Definition Lexer.cpp:635

static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)

Relex the token at the specified location.

Definition Lexer.cpp:509

static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)

Computes the source location just past the end of the token at this source location.

Definition Lexer.cpp:848

static std::string Stringify(StringRef Str, bool Charify=false)

Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...

Definition Lexer.cpp:309

static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)

getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.

bool LexingRawMode

True if in raw mode.

SmallVector< PPConditionalInfo, 4 > ConditionalStack

Information about the set of #if/#ifdef/#ifndef blocks we are currently in.

bool ParsingPreprocessorDirective

True when parsing #XXX; turns '\n' into a tok::eod token.

MultipleIncludeOpt MIOpt

A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.

bool ParsingFilename

True after #include; turns <xxx> or "xxx" into a tok::header_name token.

bool isLexingRawMode() const

Return true if this lexer is in raw mode or not.

const FileID FID

The SourceManager FileID corresponding to the file being lexed.

Engages in a tight little dance with the lexer to efficiently preprocess tokens.

SourceManager & getSourceManager() const

Encodes a location in the source.

static SourceLocation getFromRawEncoding(UIntTy Encoding)

Turn a raw encoding of a SourceLocation object into a real SourceLocation.

bool isValid() const

Return true if this is a valid SourceLocation object.

SourceLocation getLocWithOffset(IntTy Offset) const

Return a source location with the specified offset from this SourceLocation.

UIntTy getRawEncoding() const

When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.

This class handles loading and caching of source files into memory.

Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...

SourceLocation getExpansionLocStart() const

SourceLocation getSpellingLoc() const

bool isMacroArgExpansion() const

This is a discriminated union of FileInfo and ExpansionInfo.

const ExpansionInfo & getExpansion() const

static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)

Determine whether a suffix is a valid ud-suffix.

Token - This structure provides full information about a lexed token.

IdentifierInfo * getIdentifierInfo() const

SourceLocation getLocation() const

Return a source location identifier for the specified offset in the current file.

unsigned getLength() const

tok::ObjCKeywordKind getObjCKeywordID() const

Return the ObjC keyword kind.

Definition Lexer.cpp:69

bool is(tok::TokenKind K) const

is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....

tok::TokenKind getKind() const

bool isAtStartOfLine() const

isAtStartOfLine - Return true if this token is at the start of a line.

bool isAnnotation() const

Return true if this is any of tok::annot_* kind tokens.

bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const

Return true if we have an ObjC keyword identifier.

Definition Lexer.cpp:60

bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const

Determine whether the token kind starts a simple-type-specifier.

Definition Lexer.cpp:77

void startToken()

Reset all flags to cleared.

bool needsCleaning() const

Return true if this token has trigraphs or escaped newlines in it.

StringRef getRawIdentifier() const

getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...

void setFlag(TokenFlags Flag)

Set the specified flag.

static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_epi8(__m128i __a)

Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...

static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi8(__m128i __a, __m128i __b)

Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.

static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)

Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...

static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)

Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...

static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)

Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.

@ tokens_present_before_eof

Indicates that there are tokens present between the last scanned directive and eof.

@ pp_pragma_system_header

@ pp_pragma_include_alias

@ After

Like System, but searched after the system directories.

bool isStringLiteral(TokenKind K)

Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.

ObjCKeywordKind

Provides a namespace for Objective-C keywords which start with an '@'.

TokenKind

Provides a simple uniform namespace for tokens from all C languages.

The JSON file list parser is used to communicate input to InstallAPI.

LLVM_READNONE bool isASCII(char c)

Returns true if a byte is an ASCII character.

@ Match

This is not an overload because the signature exactly matches an existing declaration.

LLVM_READONLY bool isVerticalWhitespace(unsigned char c)

Returns true if this character is vertical ASCII whitespace: '\n', '\r'.

ConflictMarkerKind

ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.

@ CMK_Perforce

A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.

@ CMK_None

Not within a conflict marker.

@ CMK_Normal

A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...

LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)

std::pair< FileID, unsigned > FileIDAndOffset

bool operator<(DeclarationName LHS, DeclarationName RHS)

Ordering on two declaration names.

LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)

Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.

@ Result

The result type of a method or function.

const FunctionProtoType * T

LLVM_READONLY bool isRawStringDelimBody(unsigned char c)

Return true if this is the body character of a C++ raw string delimiter.

LLVM_READONLY bool isWhitespace(unsigned char c)

Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...

LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)

Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].

@ Keyword

The name has been typo-corrected to a keyword.

LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)

Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].

__INTPTR_TYPE__ intptr_t

A signed integer type with the property that any valid pointer to void can be converted to this type,...

float __ovld __cnfn length(float)

Return the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + ...)

#define _mm_cmpistri(A, B, M)

Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...

#define _SIDD_LEAST_SIGNIFICANT

#define _SIDD_NEGATIVE_POLARITY

Represents a char and the number of bytes parsed to produce it.

Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...

Token lexed as part of dependency directive scanning.

unsigned Offset

Offset into the original source input.

bool is(tok::TokenKind K) const