Lexer.cpp Source File (original) (raw)

29#include "llvm/ADT/STLExtras.h"

30#include "llvm/ADT/StringExtras.h"

31#include "llvm/ADT/StringRef.h"

32#include "llvm/ADT/StringSwitch.h"

33#include "llvm/Support/Compiler.h"

34#include "llvm/Support/ConvertUTF.h"

35#include "llvm/Support/MemoryBufferRef.h"

36#include "llvm/Support/NativeFormatting.h"

37#include "llvm/Support/Unicode.h"

38#include "llvm/Support/UnicodeCharRanges.h"

39#include

40#include

41#include

42#include

43#include

44#include

45#include

46#include

47#include

49#ifdef __SSE4_2__

50#include <nmmintrin.h>

51#endif

53using namespace clang;

62 return false;

64 return II->getObjCKeywordID() == objcKey;

65 return false;

66}

71 return tok::objc_not_keyword;

73 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;

74}

79 case tok::annot_typename:

80 case tok::annot_decltype:

81 case tok::annot_pack_indexing_type:

82 return true;

84 case tok::kw_short:

85 case tok::kw_long:

86 case tok::kw___int64:

87 case tok::kw___int128:

88 case tok::kw_signed:

89 case tok::kw_unsigned:

90 case tok::kw_void:

91 case tok::kw_char:

92 case tok::kw_int:

93 case tok::kw_half:

94 case tok::kw_float:

95 case tok::kw_double:

96 case tok::kw___bf16:

97 case tok::kw__Float16:

98 case tok::kw___float128:

99 case tok::kw___ibm128:

100 case tok::kw_wchar_t:

101 case tok::kw_bool:

102 case tok::kw__Bool:

103 case tok::kw__Accum:

104 case tok::kw__Fract:

105 case tok::kw__Sat:

106#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:

107#include "clang/Basic/TransformTypeTraits.def"

108 case tok::kw___auto_type:

109 case tok::kw_char16_t:

110 case tok::kw_char32_t:

111 case tok::kw_typeof:

112 case tok::kw_decltype:

113 case tok::kw_char8_t:

115

116 default:

117 return false;

118 }

119}

120

121

122

123

124

125void Lexer::anchor() {}

126

127void Lexer::InitLexer(const char *BufStart, const char *BufPtr,

128 const char *BufEnd) {

129 BufferStart = BufStart;

130 BufferPtr = BufPtr;

131 BufferEnd = BufEnd;

132

133 assert(BufEnd[0] == 0 &&

134 "We assume that the input buffer has a null character at the end"

135 " to simplify lexing!");

136

137

138

139

140 if (BufferStart == BufferPtr) {

141

142 StringRef Buf(BufferStart, BufferEnd - BufferStart);

143 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)

144 .StartsWith("\xEF\xBB\xBF", 3)

145 .Default(0);

146

147

148 BufferPtr += BOMLength;

149 }

150

151 Is_PragmaLexer = false;

152 CurrentConflictMarkerState = CMK_None;

153

154

155 IsAtStartOfLine = true;

156 IsAtPhysicalStartOfLine = true;

157

158 HasLeadingSpace = false;

159 HasLeadingEmptyMacro = false;

160

161

163

164

166

167

168

169

170

172

173

174 ExtendedTokenMode = 0;

175

176 NewLinePtr = nullptr;

177}

178

179

180

181

182

186 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),

188 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {

189 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),

190 InputFile.getBufferEnd());

191

193}

194

195

196

197

199 const char *BufStart, const char *BufPtr, const char *BufEnd,

200 bool IsFirstIncludeOfFile)

202 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {

203 InitLexer(BufStart, BufPtr, BufEnd);

204

205

207}

208

209

210

211

214 bool IsFirstIncludeOfFile)

215 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),

216 FromFile.getBufferStart(), FromFile.getBufferEnd(),

217 IsFirstIncludeOfFile) {}

218

220 assert(PP && "Cannot reset token mode without a preprocessor");

221 if (LangOpts.TraditionalCPP)

223 else

225}

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

247

248

249 FileID SpellingFID = SM.getFileID(SpellingLoc);

250 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);

251 Lexer *L = new Lexer(SpellingFID, InputFile, PP);

252

253

254

255

256 const char *StrData = SM.getCharacterData(SpellingLoc);

257

258 L->BufferPtr = StrData;

259 L->BufferEnd = StrData+TokLen;

260 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

261

262

263

264 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),

265 ExpansionLocStart,

266 ExpansionLocEnd, TokLen);

267

268

269

271

272

273 L->Is_PragmaLexer = true;

274 return L;

275}

276

277void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {

278 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;

279 this->IsAtStartOfLine = IsAtStartOfLine;

280 assert((BufferStart + Offset) <= BufferEnd);

281 BufferPtr = BufferStart + Offset;

282}

283

/// Escape \p Str in place so it can be placed between \p Quote characters.
///
/// - Every backslash and every occurrence of \p Quote gets a backslash
///   inserted in front of it.
/// - Every line break becomes the two characters `\` `n`. A two-character
///   break made of one '\n' and one '\r' (in either order) is treated as a
///   single line break; two identical break characters are two line breaks.
///
/// \tparam T     String-like container providing size(), begin(), operator[]
///               and insert(iterator, char).
/// \param  Str   String to rewrite in place.
/// \param  Quote Quote character that must be escaped.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      // Insert the escaping backslash, then step over both characters.
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      const bool IsTwoCharBreak =
          (i + 1 < e) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1];
      if (IsTwoCharBreak) {
        // "\r\n" or "\n\r": overwrite the pair, the length is unchanged.
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Lone '\n' or '\r': overwrite it and make room for the 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else {
      ++i;
    }
  }
}

308

310 std::string Result = std::string(Str);

311 char Quote = Charify ? '\'' : '"';

314}

315

317

318

319

320

321

322

323

325 const LangOptions &LangOpts, char *Spelling) {

326 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

327

328 size_t Length = 0;

329 const char *BufEnd = BufPtr + Tok.getLength();

330

332

333 while (BufPtr < BufEnd) {

335 Spelling[Length++] = CharAndSize.Char;

336 BufPtr += CharAndSize.Size;

337

338 if (Spelling[Length - 1] == '"')

339 break;

340 }

341

342

343

344

345 if (Length >= 2 &&

346 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {

347

348

349 const char *RawEnd = BufEnd;

350 do --RawEnd; while (*RawEnd != '"');

351 size_t RawLength = RawEnd - BufPtr + 1;

352

353

354 memcpy(Spelling + Length, BufPtr, RawLength);

355 Length += RawLength;

356 BufPtr += RawLength;

357

358

359 }

360 }

361

362 while (BufPtr < BufEnd) {

364 Spelling[Length++] = CharAndSize.Char;

365 BufPtr += CharAndSize.Size;

366 }

367

368 assert(Length < Tok.getLength() &&

369 "NeedsCleaning flag set on token that didn't need cleaning!");

370 return Length;

371}

372

373

374

375

376

377

382 bool *invalid) {

383

384 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

385

386

387 bool invalidTemp = false;

388 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);

389 if (invalidTemp) {

390 if (invalid) *invalid = true;

391 return {};

392 }

393

394 const char *tokenBegin = file.data() + locInfo.second;

395

396

397 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,

398 file.begin(), tokenBegin, file.end());

401

403

404

406 return StringRef(tokenBegin, length);

407

408

409 buffer.resize(length);

410 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));

411 return StringRef(buffer.data(), buffer.size());

412}

413

414

415

416

417

418

421 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

422

423 bool CharDataInvalid = false;

425 &CharDataInvalid);

427 *Invalid = CharDataInvalid;

428 if (CharDataInvalid)

429 return {};

430

431

433 return std::string(TokStart, TokStart + Tok.getLength());

434

439}

440

441

442

443

444

445

446

447

448

449

450

454 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

455

456 const char *TokStart = nullptr;

457

458 if (Tok.is(tok::raw_identifier))

460 else if (!Tok.hasUCN()) {

462

463 Buffer = II->getNameStart();

464 return II->getLength();

465 }

466 }

467

468

471

472 if (!TokStart) {

473

474 bool CharDataInvalid = false;

477 *Invalid = CharDataInvalid;

478 if (CharDataInvalid) {

479 Buffer = "";

480 return 0;

481 }

482 }

483

484

486 Buffer = TokStart;

488 }

489

490

491 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));

492}

493

494

495

496

497

503 return 0;

505}

506

507

508

512 bool IgnoreWhiteSpace) {

513

514

515

516

517

518

519

520

521 Loc = SM.getExpansionLoc(Loc);

522 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

524 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);

526 return true;

527

528 const char *StrData = Buffer.data()+LocInfo.second;

529

530 if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))

531 return true;

532

533

534 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,

535 Buffer.begin(), StrData, Buffer.end());

538 return false;

539}

540

541

542

544 const char *BufStart = Buffer.data();

545 if (Offset >= Buffer.size())

546 return nullptr;

547

548 const char *LexStart = BufStart + Offset;

549 for (; LexStart != BufStart; --LexStart) {

552

553 ++LexStart;

554 break;

555 }

556 }

557 return LexStart;

558}

559

564 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

565 if (LocInfo.first.isInvalid())

566 return Loc;

567

569 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);

571 return Loc;

572

573

574

575 const char *StrData = Buffer.data() + LocInfo.second;

577 if (!LexStart || LexStart == StrData)

578 return Loc;

579

580

582 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,

583 Buffer.end());

585

586

588 do {

590

592

593

594

597

598

599

600 break;

601 }

602 } while (TheTok.getKind() != tok::eof);

603

604

605 return Loc;

606}

607

613

614 if (.isMacroArgExpansion(Loc))

615 return Loc;

616

619 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);

620 std::pair<FileID, unsigned> BeginFileLocInfo =

621 SM.getDecomposedLoc(BeginFileLoc);

622 assert(FileLocInfo.first == BeginFileLocInfo.first &&

623 FileLocInfo.second >= BeginFileLocInfo.second);

624 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);

625}

626

namespace {

/// Classification of a preprocessor directive keyword. The keyword
/// StringSwitch in this file maps each listed directive name to PDK_Skipped
/// and every other name to PDK_Unknown.
enum PreambleDirectiveKind {
  PDK_Skipped, ///< A listed directive; the scanning loop continues past it.
  PDK_Unknown  ///< Any other keyword; the scanning loop stops on it.
};

} // namespace

635

638 unsigned MaxLines) {

639

640

641

644 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),

645 Buffer.end());

647

648 bool InPreprocessorDirective = false;

651

652 unsigned MaxLineOffset = 0;

653 if (MaxLines) {

654 const char *CurPtr = Buffer.begin();

655 unsigned CurLine = 0;

656 while (CurPtr != Buffer.end()) {

657 char ch = *CurPtr++;

658 if (ch == '\n') {

659 ++CurLine;

660 if (CurLine == MaxLines)

661 break;

662 }

663 }

664 if (CurPtr != Buffer.end())

665 MaxLineOffset = CurPtr - Buffer.begin();

666 }

667

668 do {

670

671 if (InPreprocessorDirective) {

672

673 if (TheTok.getKind() == tok::eof) {

674 break;

675 }

676

677

678

680 continue;

681

682

683

684 InPreprocessorDirective = false;

685 }

686

687

690

691

692

693 if (MaxLineOffset && TokOffset >= MaxLineOffset)

694 break;

695 }

696

697

698 if (TheTok.getKind() == tok::comment) {

699 if (ActiveCommentLoc.isInvalid())

701 continue;

702 }

703

705

706 Token HashTok = TheTok;

707 InPreprocessorDirective = true;

709

710

711

712

716 PreambleDirectiveKind PDK

717 = llvm::StringSwitch(Keyword)

718 .Case("include", PDK_Skipped)

719 .Case("__include_macros", PDK_Skipped)

720 .Case("define", PDK_Skipped)

721 .Case("undef", PDK_Skipped)

722 .Case("line", PDK_Skipped)

723 .Case("error", PDK_Skipped)

724 .Case("pragma", PDK_Skipped)

725 .Case("import", PDK_Skipped)

726 .Case("include_next", PDK_Skipped)

727 .Case("warning", PDK_Skipped)

728 .Case("ident", PDK_Skipped)

729 .Case("sccs", PDK_Skipped)

730 .Case("assert", PDK_Skipped)

731 .Case("unassert", PDK_Skipped)

732 .Case("if", PDK_Skipped)

733 .Case("ifdef", PDK_Skipped)

734 .Case("ifndef", PDK_Skipped)

735 .Case("elif", PDK_Skipped)

736 .Case("elifdef", PDK_Skipped)

737 .Case("elifndef", PDK_Skipped)

738 .Case("else", PDK_Skipped)

739 .Case("endif", PDK_Skipped)

740 .Default(PDK_Unknown);

741

742 switch (PDK) {

743 case PDK_Skipped:

744 continue;

745

746 case PDK_Unknown:

747

748 break;

749 }

750 }

751

752

753

754

755 TheTok = HashTok;

757 TheTok.getKind() == tok::raw_identifier &&

759 LangOpts.CPlusPlusModules) {

760

761

762 Token ModuleTok = TheTok;

763 do {

765 } while (TheTok.getKind() == tok::comment);

766 if (TheTok.getKind() != tok::semi) {

767

768 TheTok = ModuleTok;

769 break;

770 }

771 continue;

772 }

773

774

775

776

777 break;

778 } while (true);

779

781 if (ActiveCommentLoc.isValid())

782 End = ActiveCommentLoc;

783 else

785

788}

789

793

794

795

797 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

798

799

800 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))

801 return 0;

802

803 unsigned PhysOffset = 0;

804

805

806

807

808 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {

809 if (CharNo == 0)

810 return PhysOffset;

811 ++TokPtr;

812 --CharNo;

813 ++PhysOffset;

814 }

815

816

817

818 for (; CharNo; --CharNo) {

820 TokPtr += CharAndSize.Size;

821 PhysOffset += CharAndSize.Size;

822 }

823

824

825

826

827

828 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))

829 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

830

831 return PhysOffset;

832}

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

853 return {};

854

857 return {};

858 }

859

861 if (Len > Offset)

862 Len = Len - Offset;

863 else

864 return Loc;

865

867}

868

869

870

875 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

876

878 if (.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))

879 return false;

880

881 if (expansionLoc.isFileID()) {

882

883 if (MacroBegin)

884 *MacroBegin = expansionLoc;

885 return true;

886 }

887

889}

890

891

892

897 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

898

901 if (tokLen == 0)

902 return false;

903

906 if (.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))

907 return false;

908

909 if (expansionLoc.isFileID()) {

910

911 if (MacroEnd)

912 *MacroEnd = expansionLoc;

913 return true;

914 }

915

917}

918

925 if (Range.isTokenRange()) {

927 if (End.isInvalid())

928 return {};

929 }

930

931

933 unsigned BeginOffs;

934 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);

936 return {};

937

938 unsigned EndOffs;

939 if (.isInFileID(End, FID, &EndOffs) ||

940 BeginOffs > EndOffs)

941 return {};

942

944}

945

946

949 return SM.getSLocEntry(SM.getFileID(Loc))

950 .getExpansion()

951 .isExpansionTokenRange();

952}

953

960 return {};

961

964

967 return {};

970 }

971

973 if (Range.isTokenRange()) {

975 return {};

976

979 return {};

982 }

983

988 &MacroEnd)) ||

990 &MacroEnd)))) {

993

994 if (Range.isTokenRange())

997 }

998

1003 return {};

1004

1009 return {};

1010

1017 }

1018 }

1019

1020 return {};

1021}

1022

1030 return {};

1031 }

1032

1033

1034 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());

1035 if (beginInfo.first.isInvalid()) {

1037 return {};

1038 }

1039

1040 unsigned EndOffs;

1041 if (.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||

1042 beginInfo.second > EndOffs) {

1044 return {};

1045 }

1046

1047

1048 bool invalidTemp = false;

1049 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);

1050 if (invalidTemp) {

1052 return {};

1053 }

1054

1056 return file.substr(beginInfo.second, EndOffs - beginInfo.second);

1057}

1058

1062 assert(Loc.isMacroID() && "Only reasonable to call this on macros");

1063

1064

1065 while (true) {

1071 break;

1072

1073

1074

1075

1076

1077

1078 Loc = SM.getImmediateExpansionRange(Loc).getBegin();

1081 break;

1082

1083

1084

1086 if (SM.isInFileID(SpellLoc, MacroFID))

1087 break;

1088

1089

1090 Loc = SpellLoc;

1091 }

1092

1093

1094

1095

1096 Loc = SM.getSpellingLoc(Loc);

1097

1098

1099

1100 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);

1102 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);

1103 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);

1104}

1105

1108 assert(Loc.isMacroID() && "Only reasonable to call this on macros");

1109

1110 while (SM.isMacroArgExpansion(Loc))

1111 Loc = SM.getImmediateExpansionRange(Loc).getBegin();

1112

1113

1114

1115

1117 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))

1118 return {};

1119

1120

1121

1122

1123 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

1124

1125

1126

1127 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);

1129 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);

1130 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);

1131}

1132

1135}

1136

1139 if (Str - 1 < BufferStart)

1140 return false;

1141

1142 if ((Str[0] == '\n' && Str[-1] == '\r') ||

1143 (Str[0] == '\r' && Str[-1] == '\n')) {

1144 if (Str - 2 < BufferStart)

1145 return false;

1146 --Str;

1147 }

1148 --Str;

1149

1150

1152 --Str;

1153

1154 return *Str == '\\';

1155}

1156

1160 return {};

1161 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

1162 if (LocInfo.first.isInvalid())

1163 return {};

1165 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);

1167 return {};

1170 return {};

1171 StringRef Rest = Buffer.substr(Line - Buffer.data());

1172 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");

1173 return NumWhitespaceChars == StringRef::npos

1174 ? ""

1175 : Rest.take_front(NumWhitespaceChars);

1176}

1177

1178

1179

1180

1181

1182

1183

1184

1185

1190 unsigned CharNo, unsigned TokLen) {

1191 assert(FileLoc.isMacroID() && "Must be a macro expansion");

1192

1193

1194

1195

1197

1198

1199

1202

1203

1204

1206

1207 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);

1208}

1209

1210

1211

1213 unsigned TokLen) const {

1214 assert(Loc >= BufferStart && Loc <= BufferEnd &&

1215 "Location out of range for this buffer!");

1216

1217

1218

1219 unsigned CharNo = Loc-BufferStart;

1222

1223

1224

1225 assert(PP && "This doesn't work on raw lexers");

1227}

1228

1229

1230

1233}

1234

1235

1236

1237

1238

1239

1240

1242 switch (Letter) {

1243 default: return 0;

1244 case '=': return '#';

1245 case ')': return ']';

1246 case '(': return '[';

1247 case '!': return '|';

1248 case '\'': return '^';

1249 case '>': return '}';

1250 case '/': return '\\';

1251 case '<': return '{';

1252 case '-': return '~';

1253 }

1254}

1255

1256

1257

1258

1259

1262 if (!Res)

1263 return Res;

1264

1265 if (!Trigraphs) {

1267 L->Diag(CP-2, diag::trigraph_ignored);

1268 return 0;

1269 }

1270

1272 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);

1273 return Res;

1274}

1275

1276

1277

1278

1279unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {

1280 unsigned Size = 0;

1283

1284 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')

1285 continue;

1286

1287

1288 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&

1289 Ptr[Size-1] != Ptr[Size])

1291

1292 return Size;

1293 }

1294

1295

1296 return 0;

1297}

1298

1299

1300

1301

1302const char *Lexer::SkipEscapedNewLines(const char *P) {

1303 while (true) {

1304 const char *AfterEscape;

1305 if (*P == '\\') {

1306 AfterEscape = P+1;

1307 } else if (*P == '?') {

1308

1309 if (P[1] != '?' || P[2] != '/')

1310 return P;

1311

1312

1313 AfterEscape = P+3;

1314 } else {

1315 return P;

1316 }

1317

1318 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);

1319 if (NewLineSize == 0) return P;

1320 P = AfterEscape+NewLineSize;

1321 }

1322}

1323

1327 bool IncludeComments) {

1330 return std::nullopt;

1331 }

1333

1334

1335 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

1336

1337

1338 bool InvalidTemp = false;

1339 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);

1340 if (InvalidTemp)

1341 return std::nullopt;

1342

1343 const char *TokenBegin = File.data() + LocInfo.second;

1344

1345

1346 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),

1347 TokenBegin, File.end());

1349

1352 return Tok;

1353}

1354

1358 bool IncludeComments) {

1359 const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Loc));

1360 while (Loc != StartOfFile) {

1363 return std::nullopt;

1364

1368 continue;

1369 if (!Tok.is(tok::comment) || IncludeComments) {

1370 return Tok;

1371 }

1372 }

1373 return std::nullopt;

1374}

1375

1376

1377

1378

1379

1382 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {

1384 if (!Tok || Tok->isNot(TKind))

1385 return {};

1387

1388

1389 unsigned NumWhitespaceChars = 0;

1390 if (SkipTrailingWhitespaceAndNewLine) {

1391 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();

1392 unsigned char C = *TokenEnd;

1394 C = *(++TokenEnd);

1395 NumWhitespaceChars++;

1396 }

1397

1398

1399 if (C == '\n' || C == '\r') {

1400 char PrevC = C;

1401 C = *(++TokenEnd);

1402 NumWhitespaceChars++;

1403 if ((C == '\n' || C == '\r') && C != PrevC)

1404 NumWhitespaceChars++;

1405 }

1406 }

1407

1408 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);

1409}

1410

1411

1412

1413

1414

1415

1416

1417

1418

1419

1420

1421

1422

1423

1424

1425

1427 unsigned Size = 0;

1428

1429 if (Ptr[0] == '\\') {

1430 ++Size;

1431 ++Ptr;

1432Slash:

1433

1435 return {'\\', Size};

1436

1437

1438

1439 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {

1440

1442

1443

1444 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && isLexingRawMode ())

1445 Diag(Ptr, diag::backslash_newline_space);

1446

1447

1448 Size += EscapedNewLineSize;

1449 Ptr += EscapedNewLineSize;

1450

1451

1452 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);

1453 CharAndSize.Size += Size;

1454 return CharAndSize;

1455 }

1456

1457

1458 return {'\\', Size};

1459 }

1460

1461

1462 if (Ptr[0] == '?' && Ptr[1] == '?') {

1463

1464

1466 LangOpts.Trigraphs)) {

1467

1469

1470 Ptr += 3;

1472 if (C == '\\') goto Slash;

1473 return {C, Size};

1474 }

1475 }

1476

1477

1478 return {*Ptr, Size + 1u};

1479}

1480

1481

1482

1483

1484

1485

1486

1487Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,

1489

1490 unsigned Size = 0;

1491

1492 if (Ptr[0] == '\\') {

1494 ++Ptr;

1495Slash:

1496

1498 return {'\\', Size};

1499

1500

1501 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {

1502

1503 Size += EscapedNewLineSize;

1504 Ptr += EscapedNewLineSize;

1505

1506

1507 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);

1508 CharAndSize.Size += Size;

1509 return CharAndSize;

1510 }

1511

1512

1513 return {'\\', Size};

1514 }

1515

1516

1517 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {

1518

1519

1521 Ptr += 3;

1523 if (C == '\\') goto Slash;

1524 return {C, Size};

1525 }

1526 }

1527

1528

1529 return {*Ptr, Size + 1u};

1530}

1531

1532

1533

1534

1535

1536

1537void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {

1538 BufferPtr = BufferStart + Offset;

1539 if (BufferPtr > BufferEnd)

1540 BufferPtr = BufferEnd;

1541

1542

1543

1544 IsAtStartOfLine = StartOfLine;

1545 IsAtPhysicalStartOfLine = StartOfLine;

1546}

1547

1549 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(

1551 return UnicodeWhitespaceChars.contains(Codepoint);

1552}

1553

1556 llvm::raw_svector_ostream CharOS(CharBuf);

1557 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);

1558 return CharBuf;

1559}

1560

1561

1562

1563

1564

1565

1566

1568 bool IsStart, bool &IsExtension) {

1569 static const llvm::sys::UnicodeCharSet MathStartChars(

1571 static const llvm::sys::UnicodeCharSet MathContinueChars(

1573 if (MathStartChars.contains(C) ||

1574 (!IsStart && MathContinueChars.contains(C))) {

1575 IsExtension = true;

1576 return true;

1577 }

1578 return false;

1579}

1580

1582 bool &IsExtension) {

1583 if (LangOpts.AsmPreprocessor) {

1584 return false;

1585 } else if (LangOpts.DollarIdents && '$' == C) {

1586 return true;

1587 } else if (LangOpts.CPlusPlus || LangOpts.C23) {

1588

1589

1590

1591

1592 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);

1593 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);

1594 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))

1595 return true;

1597 IsExtension);

1598 } else if (LangOpts.C11) {

1599 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(

1601 return C11AllowedIDChars.contains(C);

1602 } else {

1603 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(

1605 return C99AllowedIDChars.contains(C);

1606 }

1607}

1608

1610 bool &IsExtension) {

1611 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");

1612 IsExtension = false;

1613 if (LangOpts.AsmPreprocessor) {

1614 return false;

1615 }

1616 if (LangOpts.CPlusPlus || LangOpts.C23) {

1617 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);

1618 if (XIDStartChars.contains(C))

1619 return true;

1621 IsExtension);

1622 }

1624 return false;

1625 if (LangOpts.C11) {

1626 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(

1628 return !C11DisallowedInitialIDChars.contains(C);

1629 }

1630 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(

1632 return !C99DisallowedInitialIDChars.contains(C);

1633}

1634

1637

1638 static const llvm::sys::UnicodeCharSet MathStartChars(

1640 static const llvm::sys::UnicodeCharSet MathContinueChars(

1642

1643 (void)MathStartChars;

1644 (void)MathContinueChars;

1645 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&

1646 "Unexpected mathematical notation codepoint");

1649}

1650

1652 const char *End) {

1655}

1656

1659

1661 enum {

1662 CannotAppearInIdentifier = 0,

1663 CannotStartIdentifier

1664 };

1665

1666 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(

1668 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(

1670 if (!C99AllowedIDChars.contains(C)) {

1673 << CannotAppearInIdentifier;

1674 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {

1677 << CannotStartIdentifier;

1678 }

1679 }

1680}

1681

1682

1683

1684

1685

1688

1689 struct HomoglyphPair {

1690 uint32_t Character;

1691 char LooksLike;

1692 bool operator<(HomoglyphPair R) const { return Character < R.Character; }

1693 };

1694 static constexpr HomoglyphPair SortedHomoglyphs[] = {

1695 {U'\u00ad', 0},

1696 {U'\u01c3', '!'},

1697 {U'\u037e', ';'},

1698 {U'\u200b', 0},

1699 {U'\u200c', 0},

1700 {U'\u200d', 0},

1701 {U'\u2060', 0},

1702 {U'\u2061', 0},

1703 {U'\u2062', 0},

1704 {U'\u2063', 0},

1705 {U'\u2064', 0},

1706 {U'\u2212', '-'},

1707 {U'\u2215', '/'},

1708 {U'\u2216', '\\'},

1709 {U'\u2217', '*'},

1710 {U'\u2223', '|'},

1711 {U'\u2227', '^'},

1712 {U'\u2236', ':'},

1713 {U'\u223c', '~'},

1714 {U'\ua789', ':'},

1715 {U'\ufeff', 0},

1716 {U'\uff01', '!'},

1717 {U'\uff03', '#'},

1718 {U'\uff04', '$'},

1719 {U'\uff05', '%'},

1720 {U'\uff06', '&'},

1721 {U'\uff08', '('},

1722 {U'\uff09', ')'},

1723 {U'\uff0a', '*'},

1724 {U'\uff0b', '+'},

1725 {U'\uff0c', ','},

1726 {U'\uff0d', '-'},

1727 {U'\uff0e', '.'},

1728 {U'\uff0f', '/'},

1729 {U'\uff1a', ':'},

1730 {U'\uff1b', ';'},

1731 {U'\uff1c', '<'},

1732 {U'\uff1d', '='},

1733 {U'\uff1e', '>'},

1734 {U'\uff1f', '?'},

1735 {U'\uff20', '@'},

1736 {U'\uff3b', '['},

1737 {U'\uff3c', '\\'},

1738 {U'\uff3d', ']'},

1739 {U'\uff3e', '^'},

1740 {U'\uff5b', '{'},

1741 {U'\uff5c', '|'},

1742 {U'\uff5d', '}'},

1743 {U'\uff5e', '~'},

1744 {0, 0}

1745 };

1746 auto Homoglyph =

1747 std::lower_bound(std::begin(SortedHomoglyphs),

1748 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});

1749 if (Homoglyph->Character == C) {

1750 if (Homoglyph->LooksLike) {

1751 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};

1754 } else {

1757 }

1758 }

1759}

1760

1765 return;

1766

1767 bool IsExtension;

1769 bool IsIDContinue =

1770 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

1771

1772 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))

1773 return;

1774

1775 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

1776

1777 if (!IsFirst || InvalidOnlyAtStart) {

1778 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)

1781 } else {

1785 }

1786}

1787

1788bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,

1790 const char *UCNPtr = CurPtr + Size;

1791 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, nullptr);

1792 if (CodePoint == 0) {

1793 return false;

1794 }

1795 bool IsExtension = false;

1796 if ( isAllowedIDChar (CodePoint, LangOpts, IsExtension)) {

1798 return false;

1804 false);

1805

1806

1807

1808

1810 if (IsExtension)

1813

1816 false);

1817 }

1818

1820 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||

1821 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))

1822 CurPtr = UCNPtr;

1823 else

1824 while (CurPtr != UCNPtr)

1825 (void)getAndAdvanceChar(CurPtr, Result);

1826 return true;

1827}

1828

1829bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {

1830 llvm::UTF32 CodePoint;

1831

1832

1833

1834

1835 unsigned FirstCodeUnitSize;

1836 getCharAndSize(CurPtr, FirstCodeUnitSize);

1837 const char *CharStart = CurPtr + FirstCodeUnitSize - 1;

1838 const char *UnicodePtr = CharStart;

1839

1840 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(

1841 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,

1842 &CodePoint, llvm::strictConversion);

1843 if (ConvResult != llvm::conversionOK)

1844 return false;

1845

1846 bool IsExtension = false;

1848 IsExtension)) {

1850 return false;

1851

1856 makeCharRange(*this, CharStart, UnicodePtr), false);

1857

1858

1859

1861 if (IsExtension)

1867 false);

1870 }

1871

1872

1873

1874

1875 ConsumeChar(CurPtr, FirstCodeUnitSize, Result);

1876 CurPtr = UnicodePtr;

1877 return true;

1878}

1879

1880bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,

1881 const char *CurPtr) {

1882 bool IsExtension = false;

1886 if (IsExtension)

1891 true);

1894 }

1895

1897 return LexIdentifierContinue(Result, CurPtr);

1898 }

1899

1903

1904

1905

1906

1907

1908

1909

1910

1911

1914 makeCharRange(*this, BufferPtr, CurPtr), true);

1915 BufferPtr = CurPtr;

1916 return false;

1917 }

1918

1919

1920

1922 FormTokenWithChars(Result, CurPtr, tok::unknown);

1923 return true;

1924}

1925

1926static const char *

1928 [[maybe_unused]] const char *BufferEnd) {

1929#ifdef __SSE4_2__

1930 alignas(16) static constexpr char AsciiIdentifierRange[16] = {

1931 '_', '_', 'A', 'Z', 'a', 'z', '0', '9',

1932 };

1933 constexpr ssize_t BytesPerRegister = 16;

1934

1935 __m128i AsciiIdentifierRangeV =

1936 _mm_load_si128((const __m128i *)AsciiIdentifierRange);

1937

1938 while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {

1939 __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));

1940

1941 int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,

1944 CurPtr += Consumed;

1945 if (Consumed == BytesPerRegister)

1946 continue;

1947 return CurPtr;

1948 }

1949#endif

1950

1951 unsigned char C = *CurPtr;

1953 C = *++CurPtr;

1954 return CurPtr;

1955}

1956

1957bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {

1958

1959

1960 while (true) {

1961

1963

1964 unsigned Size;

1965

1966 unsigned char C = getCharAndSize(CurPtr, Size);

1968 CurPtr = ConsumeChar(CurPtr, Size, Result);

1969 continue;

1970 }

1971 if (C == '$') {

1972

1973 if (!LangOpts.DollarIdents)

1974 break;

1975

1977 Diag(CurPtr, diag::ext_dollar_in_identifier);

1978 CurPtr = ConsumeChar(CurPtr, Size, Result);

1979 continue;

1980 }

1981 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))

1982 continue;

1983 if ( isASCII (C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))

1984 continue;

1985

1986 break;

1987 }

1988

1989 const char *IdStart = BufferPtr;

1990 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);

1991 Result.setRawIdentifierData(IdStart);

1992

1993

1994

1996 return true;

1997

1998

1999

2001

2002

2003

2004

2005

2006

2007 if (isCodeCompletionPoint(CurPtr)) {

2008

2009 Result.setKind(tok::code_completion);

2010

2011

2012

2013

2014

2015 assert(*CurPtr == 0 && "Completion character must be 0");

2016 ++CurPtr;

2017

2018

2019

2020 if (CurPtr < BufferEnd) {

2022 ++CurPtr;

2023 }

2024 BufferPtr = CurPtr;

2025 return true;

2026 }

2027

2028

2029

2032

2033 return true;

2034}

2035

2036

2037

2038bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {

2040 char C1 = CharAndSize1.Char;

2041 if (C1 != '0')

2042 return false;

2043

2044 auto CharAndSize2 =

2046 char C2 = CharAndSize2.Char;

2047 return (C2 == 'x' || C2 == 'X');

2048}

2049

2050

2051

2052

2053bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {

2054 unsigned Size;

2055 char C = getCharAndSize(CurPtr, Size);

2056 char PrevCh = 0;

2058 CurPtr = ConsumeChar(CurPtr, Size, Result);

2059 PrevCh = C;

2060 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {

2061 CurPtr -= Size;

2062 break;

2063 }

2064 C = getCharAndSize(CurPtr, Size);

2065 }

2066

2067

2068 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {

2069

2070

2071 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))

2072 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

2073 }

2074

2075

2076 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {

2077

2078

2079

2080 bool IsHexFloat = true;

2081 if (!LangOpts.C99) {

2082 if (!isHexaLiteral(BufferPtr, LangOpts))

2083 IsHexFloat = false;

2084 else if (!LangOpts.CPlusPlus17 &&

2085 std::find(BufferPtr, CurPtr, '_') != CurPtr)

2086 IsHexFloat = false;

2087 }

2088 if (IsHexFloat)

2089 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

2090 }

2091

2092

2093 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {

2097 Diag(CurPtr, LangOpts.CPlusPlus

2098 ? diag::warn_cxx11_compat_digit_separator

2099 : diag::warn_c23_compat_digit_separator);

2100 CurPtr = ConsumeChar(CurPtr, Size, Result);

2101 CurPtr = ConsumeChar(CurPtr, NextSize, Result);

2102 return LexNumericConstant(Result, CurPtr);

2103 }

2104 }

2105

2106

2107 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))

2108 return LexNumericConstant(Result, CurPtr);

2109 if ( isASCII (C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))

2110 return LexNumericConstant(Result, CurPtr);

2111

2112

2113 const char *TokStart = BufferPtr;

2114 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);

2115 Result.setLiteralData(TokStart);

2116 return true;

2117}

2118

2119

2120

2121const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,

2122 bool IsStringLiteral) {

2123 assert(LangOpts.CPlusPlus);

2124

2125

2126 unsigned Size;

2127 char C = getCharAndSize(CurPtr, Size);

2128 bool Consumed = false;

2129

2131 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))

2132 Consumed = true;

2133 else if ( isASCII (C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))

2134 Consumed = true;

2135 else

2136 return CurPtr;

2137 }

2138

2139 if (!LangOpts.CPlusPlus11) {

2141 Diag(CurPtr,

2142 C == '_' ? diag::warn_cxx11_compat_user_defined_literal

2143 : diag::warn_cxx11_compat_reserved_user_defined_literal)

2145 return CurPtr;

2146 }

2147

2148

2149

2150

2151

2152

2153 if (!Consumed) {

2154 bool IsUDSuffix = false;

2155 if (C == '_')

2156 IsUDSuffix = true;

2157 else if (IsStringLiteral && LangOpts.CPlusPlus14) {

2158

2159

2160

2161 const unsigned MaxStandardSuffixLength = 3;

2162 char Buffer[MaxStandardSuffixLength] = { C };

2163 unsigned Consumed = Size;

2164 unsigned Chars = 1;

2165 while (true) {

2166 auto [Next, NextSize] =

2169

2170 const StringRef CompleteSuffix(Buffer, Chars);

2171 IsUDSuffix =

2173 break;

2174 }

2175

2176 if (Chars == MaxStandardSuffixLength)

2177

2178 break;

2179

2180 Buffer[Chars++] = Next;

2181 Consumed += NextSize;

2182 }

2183 }

2184

2185 if (!IsUDSuffix) {

2187 Diag(CurPtr, LangOpts.MSVCCompat

2188 ? diag::ext_ms_reserved_user_defined_literal

2189 : diag::ext_reserved_user_defined_literal)

2191 return CurPtr;

2192 }

2193

2194 CurPtr = ConsumeChar(CurPtr, Size, Result);

2195 }

2196

2198 while (true) {

2199 C = getCharAndSize(CurPtr, Size);

2201 CurPtr = ConsumeChar(CurPtr, Size, Result);

2202 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {

2203 } else if ( isASCII (C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {

2204 } else

2205 break;

2206 }

2207

2208 return CurPtr;

2209}

2210

2211

2212

2213bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,

2215 const char *AfterQuote = CurPtr;

2216

2217 const char *NulCharacter = nullptr;

2218

2220 (Kind == tok::utf8_string_literal ||

2221 Kind == tok::utf16_string_literal ||

2222 Kind == tok::utf32_string_literal))

2223 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal

2224 : diag::warn_c99_compat_unicode_literal);

2225

2226 char C = getAndAdvanceChar(CurPtr, Result);

2227 while (C != '"') {

2228

2229

2230 if (C == '\\')

2231 C = getAndAdvanceChar(CurPtr, Result);

2232

2233 if (C == '\n' || C == '\r' ||

2234 (C == 0 && CurPtr-1 == BufferEnd)) {

2236 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;

2237 FormTokenWithChars(Result, CurPtr-1, tok::unknown);

2238 return true;

2239 }

2240

2241 if (C == 0) {

2242 if (isCodeCompletionPoint(CurPtr-1)) {

2244 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, false);

2245 else

2247 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);

2248 cutOffLexing();

2249 return true;

2250 }

2251

2252 NulCharacter = CurPtr-1;

2253 }

2254 C = getAndAdvanceChar(CurPtr, Result);

2255 }

2256

2257

2258 if (LangOpts.CPlusPlus)

2259 CurPtr = LexUDSuffix(Result, CurPtr, true);

2260

2261

2263 Diag(NulCharacter, diag::null_in_char_or_string) << 1;

2264

2265

2266 const char *TokStart = BufferPtr;

2267 FormTokenWithChars(Result, CurPtr, Kind);

2268 Result.setLiteralData(TokStart);

2269 return true;

2270}

2271

2272

2273

2274bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,

2276

2277

2278

2279

2280

2282 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

2283

2284 unsigned PrefixLen = 0;

2285

2288 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {

2289 const char *Pos = &CurPtr[PrefixLen];

2290 Diag(Pos, LangOpts.CPlusPlus26

2291 ? diag::warn_cxx26_compat_raw_string_literal_character_set

2292 : diag::ext_cxx26_raw_string_literal_character_set)

2293 << StringRef(Pos, 1);

2294 }

2295 ++PrefixLen;

2296 }

2297

2298

2299 if (CurPtr[PrefixLen] != '(') {

2301 const char *PrefixEnd = &CurPtr[PrefixLen];

2302 if (PrefixLen == 16) {

2303 Diag(PrefixEnd, diag::err_raw_delim_too_long);

2304 } else if (*PrefixEnd == '\n') {

2305 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);

2306 } else {

2307 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)

2308 << StringRef(PrefixEnd, 1);

2309 }

2310 }

2311

2312

2313

2314

2315 while (true) {

2316 char C = *CurPtr++;

2317

2318 if (C == '"')

2319 break;

2320 if (C == 0 && CurPtr-1 == BufferEnd) {

2321 --CurPtr;

2322 break;

2323 }

2324 }

2325

2326 FormTokenWithChars(Result, CurPtr, tok::unknown);

2327 return true;

2328 }

2329

2330

2331 const char *Prefix = CurPtr;

2332 CurPtr += PrefixLen + 1;

2333

2334 while (true) {

2335 char C = *CurPtr++;

2336

2337 if (C == ')') {

2338

2339 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {

2340 CurPtr += PrefixLen + 1;

2341 break;

2342 }

2343 } else if (C == 0 && CurPtr-1 == BufferEnd) {

2345 Diag(BufferPtr, diag::err_unterminated_raw_string)

2346 << StringRef(Prefix, PrefixLen);

2347 FormTokenWithChars(Result, CurPtr-1, tok::unknown);

2348 return true;

2349 }

2350 }

2351

2352

2353 if (LangOpts.CPlusPlus)

2354 CurPtr = LexUDSuffix(Result, CurPtr, true);

2355

2356

2357 const char *TokStart = BufferPtr;

2358 FormTokenWithChars(Result, CurPtr, Kind);

2359 Result.setLiteralData(TokStart);

2360 return true;

2361}

2362

2363

2364

2365bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {

2366

2367 const char *NulCharacter = nullptr;

2368 const char *AfterLessPos = CurPtr;

2369 char C = getAndAdvanceChar(CurPtr, Result);

2370 while (C != '>') {

2371

2372

2373 if (C == '\\')

2374 C = getAndAdvanceChar(CurPtr, Result);

2375

2377 (C == 0 && (CurPtr - 1 == BufferEnd))) {

2378

2379

2380 FormTokenWithChars(Result, AfterLessPos, tok::less);

2381 return true;

2382 }

2383

2384 if (C == 0) {

2385 if (isCodeCompletionPoint(CurPtr - 1)) {

2386 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, true);

2387 cutOffLexing();

2388 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);

2389 return true;

2390 }

2391 NulCharacter = CurPtr-1;

2392 }

2393 C = getAndAdvanceChar(CurPtr, Result);

2394 }

2395

2396

2398 Diag(NulCharacter, diag::null_in_char_or_string) << 1;

2399

2400

2401 const char *TokStart = BufferPtr;

2402 FormTokenWithChars(Result, CurPtr, tok::header_name);

2403 Result.setLiteralData(TokStart);

2404 return true;

2405}

2406

2407void Lexer::codeCompleteIncludedFile(const char *PathStart,

2408 const char *CompletionPoint,

2409 bool IsAngled) {

2410

2411 StringRef PartialPath(PathStart, CompletionPoint - PathStart);

2412 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";

2413 auto Slash = PartialPath.find_last_of(SlashChars);

2414 StringRef Dir =

2415 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);

2416 const char *StartOfFilename =

2417 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;

2418

2420 StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));

2421

2422

2423 while (CompletionPoint < BufferEnd) {

2424 char Next = *(CompletionPoint + 1);

2425 if (Next == 0 || Next == '\r' || Next == '\n')

2426 break;

2427 ++CompletionPoint;

2428 if (Next == (IsAngled ? '>' : '"'))

2429 break;

2430 if (SlashChars.contains(Next))

2431 break;

2432 }

2433

2438}

2439

2440

2441

2442bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,

2444

2445 const char *NulCharacter = nullptr;

2446

2448 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)

2449 Diag(BufferPtr, LangOpts.CPlusPlus

2450 ? diag::warn_cxx98_compat_unicode_literal

2451 : diag::warn_c99_compat_unicode_literal);

2452 else if (Kind == tok::utf8_char_constant)

2453 Diag(BufferPtr, LangOpts.CPlusPlus

2454 ? diag::warn_cxx14_compat_u8_character_literal

2455 : diag::warn_c17_compat_u8_character_literal);

2456 }

2457

2458 char C = getAndAdvanceChar(CurPtr, Result);

2459 if (C == '\'') {

2461 Diag(BufferPtr, diag::ext_empty_character);

2462 FormTokenWithChars(Result, CurPtr, tok::unknown);

2463 return true;

2464 }

2465

2466 while (C != '\'') {

2467

2468 if (C == '\\')

2469 C = getAndAdvanceChar(CurPtr, Result);

2470

2471 if (C == '\n' || C == '\r' ||

2472 (C == 0 && CurPtr-1 == BufferEnd)) {

2474 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;

2475 FormTokenWithChars(Result, CurPtr-1, tok::unknown);

2476 return true;

2477 }

2478

2479 if (C == 0) {

2480 if (isCodeCompletionPoint(CurPtr-1)) {

2482 FormTokenWithChars(Result, CurPtr-1, tok::unknown);

2483 cutOffLexing();

2484 return true;

2485 }

2486

2487 NulCharacter = CurPtr-1;

2488 }

2489 C = getAndAdvanceChar(CurPtr, Result);

2490 }

2491

2492

2493 if (LangOpts.CPlusPlus)

2494 CurPtr = LexUDSuffix(Result, CurPtr, false);

2495

2496

2498 Diag(NulCharacter, diag::null_in_char_or_string) << 0;

2499

2500

2501 const char *TokStart = BufferPtr;

2502 FormTokenWithChars(Result, CurPtr, Kind);

2503 Result.setLiteralData(TokStart);

2504 return true;

2505}

2506

2507

2508

2509

2510

2511bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,

2512 bool &TokAtPhysicalStartOfLine) {

2513

2515

2516 unsigned char Char = *CurPtr;

2517

2518 const char *lastNewLine = nullptr;

2519 auto setLastNewLine = [&](const char *Ptr) {

2520 lastNewLine = Ptr;

2521 if (!NewLinePtr)

2522 NewLinePtr = Ptr;

2523 };

2524 if (SawNewline)

2525 setLastNewLine(CurPtr - 1);

2526

2527

2528 while (true) {

2529

2531 Char = *++CurPtr;

2532

2533

2535 break;

2536

2538

2539 BufferPtr = CurPtr;

2540 return false;

2541 }

2542

2543

2544 if (*CurPtr == '\n')

2545 setLastNewLine(CurPtr);

2546 SawNewline = true;

2547 Char = *++CurPtr;

2548 }

2549

2550

2552 FormTokenWithChars(Result, CurPtr, tok::unknown);

2553 if (SawNewline) {

2554 IsAtStartOfLine = true;

2555 IsAtPhysicalStartOfLine = true;

2556 }

2557

2558 return true;

2559 }

2560

2561

2562 char PrevChar = CurPtr[-1];

2564

2566 if (SawNewline) {

2568 TokAtPhysicalStartOfLine = true;

2569

2570 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {

2574 }

2575 }

2576

2577 BufferPtr = CurPtr;

2578 return false;

2579}

2580

2581

2582

2583

2584

2585

2586

2587bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,

2588 bool &TokAtPhysicalStartOfLine) {

2589

2590

2591 if (!LineComment) {

2592 if ( isLexingRawMode ())

2593 Diag(BufferPtr, diag::ext_line_comment);

2594

2595

2596

2597 LineComment = true;

2598 }

2599

2600

2601

2602

2603

2604

2605

2606

2607

2608

2609

2610

2611 bool UnicodeDecodingAlreadyDiagnosed = false;

2612

2613 char C;

2614 while (true) {

2615 C = *CurPtr;

2616

2617 while (isASCII(C) && C != 0 &&

2618 C != '\n' && C != '\r') {

2619 C = *++CurPtr;

2620 UnicodeDecodingAlreadyDiagnosed = false;

2621 }

2622

2624 unsigned Length = llvm::getUTF8SequenceSize(

2625 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);

2626 if (Length == 0) {

2627 if (!UnicodeDecodingAlreadyDiagnosed && isLexingRawMode ())

2628 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);

2629 UnicodeDecodingAlreadyDiagnosed = true;

2630 ++CurPtr;

2631 } else {

2632 UnicodeDecodingAlreadyDiagnosed = false;

2633 CurPtr += Length;

2634 }

2635 continue;

2636 }

2637

2638 const char *NextLine = CurPtr;

2639 if (C != 0) {

2640

2641 const char *EscapePtr = CurPtr-1;

2642 bool HasSpace = false;

2644 --EscapePtr;

2645 HasSpace = true;

2646 }

2647

2648 if (*EscapePtr == '\\')

2649

2650 CurPtr = EscapePtr;

2651 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&

2652 EscapePtr[-2] == '?' && LangOpts.Trigraphs)

2653

2654 CurPtr = EscapePtr-2;

2655 else

2656 break;

2657

2658

2660 Diag(EscapePtr, diag::backslash_newline_space);

2661 }

2662

2663

2664

2665

2666

2667 const char *OldPtr = CurPtr;

2670 C = getAndAdvanceChar(CurPtr, Result);

2672

2673

2674

2675 if (C != 0 && CurPtr == OldPtr+1) {

2676 CurPtr = NextLine;

2677 break;

2678 }

2679

2680

2681

2682

2683 if (CurPtr != OldPtr + 1 && C != '/' &&

2684 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {

2685 for (; OldPtr != CurPtr; ++OldPtr)

2686 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {

2687

2688

2690 const char *ForwardPtr = CurPtr;

2691 while (isWhitespace(*ForwardPtr))

2692 ++ForwardPtr;

2693 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')

2694 break;

2695 }

2696

2698 Diag(OldPtr-1, diag::ext_multi_line_line_comment);

2699 break;

2700 }

2701 }

2702

2703 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {

2704 --CurPtr;

2705 break;

2706 }

2707

2708 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {

2710 cutOffLexing();

2711 return false;

2712 }

2713 }

2714

2715

2716

2720 BufferPtr = CurPtr;

2721 return true;

2722 }

2723

2724

2726 return SaveLineComment(Result, CurPtr);

2727

2728

2729

2731 BufferPtr = CurPtr;

2732 return false;

2733 }

2734

2735

2736

2737

2738

2739

2740 NewLinePtr = CurPtr++;

2741

2742

2744 TokAtPhysicalStartOfLine = true;

2745

2747 BufferPtr = CurPtr;

2748 return false;

2749}

2750

2751

2752

2753bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {

2754

2755

2756 FormTokenWithChars(Result, CurPtr, tok::comment);

2757

2759 return true;

2760

2761

2762

2766 return true;

2767

2768 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");

2769 Spelling[1] = '*';

2770 Spelling += "*/";

2771

2772 Result.setKind(tok::comment);

2774 Result.getLocation(), Result.getLocation());

2775 return true;

2776}

2777

2778

2779

2780

2782 bool Trigraphs) {

2783 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

2784

2785

2786 const char *TrigraphPos = nullptr;

2787

2788 const char *SpacePos = nullptr;

2789

2790 while (true) {

2791

2792 --CurPtr;

2793

2794

2795 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {

2796

2797 if (CurPtr[0] == CurPtr[1])

2798 return false;

2799

2800 --CurPtr;

2801 }

2802

2803

2804

2806 SpacePos = CurPtr;

2807 --CurPtr;

2808 }

2809

2810

2811 if (*CurPtr == '\\') {

2812 --CurPtr;

2813 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {

2814

2815 TrigraphPos = CurPtr - 2;

2816 CurPtr -= 3;

2817 } else {

2818 return false;

2819 }

2820

2821

2822

2823 if (*CurPtr == '*')

2824 break;

2825

2826 if (*CurPtr != '\n' && *CurPtr != '\r')

2827 return false;

2828 }

2829

2830 if (TrigraphPos) {

2831

2832

2833 if (!Trigraphs) {

2835 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);

2836 return false;

2837 }

2839 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);

2840 }

2841

2842

2844 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

2845

2846

2848 L->Diag(SpacePos, diag::backslash_newline_space);

2849

2850 return true;

2851}

2852

// SIMD headers for the fast scanning loop in SkipBlockComment.
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
// altivec.h may define 'bool' as a macro; undo that so the C++ keyword works.
#undef bool
#endif

2859

2860

2861

2862

2863

2864

2865

2866

2867

2868

2869bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,

2870 bool &TokAtPhysicalStartOfLine) {

2871

2872

2873

2874

2875

2876

2877

2878

2879 unsigned CharSize;

2880 unsigned char C = getCharAndSize(CurPtr, CharSize);

2881 CurPtr += CharSize;

2882 if (C == 0 && CurPtr == BufferEnd+1) {

2884 Diag(BufferPtr, diag::err_unterminated_block_comment);

2885 --CurPtr;

2886

2887

2888

2890 FormTokenWithChars(Result, CurPtr, tok::unknown);

2891 return true;

2892 }

2893

2894 BufferPtr = CurPtr;

2895 return false;

2896 }

2897

2898

2899

2900 if (C == '/')

2901 C = *CurPtr++;

2902

2903

2904

2905

2906

2907 bool UnicodeDecodingAlreadyDiagnosed = false;

2908

2909 while (true) {

2910

2911

2912 if (CurPtr + 24 < BufferEnd &&

2913

2914

2916

2917 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {

2919 goto MultiByteUTF8;

2920 C = *CurPtr++;

2921 }

2922 if (C == '/') goto FoundSlash;

2923

2924#ifdef __SSE2__

2926 while (CurPtr + 16 < BufferEnd) {

2928 if (LLVM_UNLIKELY(Mask != 0)) {

2929 goto MultiByteUTF8;

2930 }

2931

2933 Slashes));

2934 if (cmp != 0) {

2935

2936

2937

2938 CurPtr += llvm::countr_zero(cmp) + 1;

2939 goto FoundSlash;

2940 }

2941 CurPtr += 16;

2942 }

2943#elif __ALTIVEC__

2944 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

2945 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

2946 0x80, 0x80, 0x80, 0x80};

2947 __vector unsigned char Slashes = {

2948 '/', '/', '/', '/', '/', '/', '/', '/',

2949 '/', '/', '/', '/', '/', '/', '/', '/'

2950 };

2951 while (CurPtr + 16 < BufferEnd) {

2952 if (LLVM_UNLIKELY(

2953 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))

2954 goto MultiByteUTF8;

2955 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {

2956 break;

2957 }

2958 CurPtr += 16;

2959 }

2960

2961#else

2962 while (CurPtr + 16 < BufferEnd) {

2963 bool HasNonASCII = false;

2964 for (unsigned I = 0; I < 16; ++I)

2965 HasNonASCII |= isASCII (CurPtr[I]);

2966

2967 if (LLVM_UNLIKELY(HasNonASCII))

2968 goto MultiByteUTF8;

2969

2970 bool HasSlash = false;

2971 for (unsigned I = 0; I < 16; ++I)

2972 HasSlash |= CurPtr[I] == '/';

2973 if (HasSlash)

2974 break;

2975 CurPtr += 16;

2976 }

2977#endif

2978

2979

2980 C = *CurPtr++;

2981 }

2982

2983

2984

2985

2986 while (C != '/' && C != '\0') {

2988 UnicodeDecodingAlreadyDiagnosed = false;

2989 C = *CurPtr++;

2990 continue;

2991 }

2992 MultiByteUTF8:

2993

2994

2995 unsigned Length = llvm::getUTF8SequenceSize(

2996 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);

2997 if (Length == 0) {

2998 if (!UnicodeDecodingAlreadyDiagnosed && isLexingRawMode ())

2999 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);

3000 UnicodeDecodingAlreadyDiagnosed = true;

3001 } else {

3002 UnicodeDecodingAlreadyDiagnosed = false;

3003 CurPtr += Length - 1;

3004 }

3005 C = *CurPtr++;

3006 }

3007

3008 if (C == '/') {

3009 FoundSlash:

3010 if (CurPtr[-2] == '*')

3011 break;

3012

3013 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {

3015 LangOpts.Trigraphs)) {

3016

3017

3018 break;

3019 }

3020 }

3021 if (CurPtr[0] == '*' && CurPtr[1] != '/') {

3022

3023

3024

3026 Diag(CurPtr-1, diag::warn_nested_block_comment);

3027 }

3028 } else if (C == 0 && CurPtr == BufferEnd+1) {

3030 Diag(BufferPtr, diag::err_unterminated_block_comment);

3031

3032

3033

3034 --CurPtr;

3035

3036

3037

3039 FormTokenWithChars(Result, CurPtr, tok::unknown);

3040 return true;

3041 }

3042

3043 BufferPtr = CurPtr;

3044 return false;

3045 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {

3047 cutOffLexing();

3048 return false;

3049 }

3050

3051 C = *CurPtr++;

3052 }

3053

3054

3058 BufferPtr = CurPtr;

3059 return true;

3060 }

3061

3062

3064 FormTokenWithChars(Result, CurPtr, tok::comment);

3065 return true;

3066 }

3067

3068

3069

3070

3071

3073 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);

3074 return false;

3075 }

3076

3077

3078 BufferPtr = CurPtr;

3080 return false;

3081}

3082

3083

3084

3085

3086

3087

3088

3091 "Must be in a preprocessing directive!");

3094

3095

3096 const char *CurPtr = BufferPtr;

3097 while (true) {

3098 char Char = getAndAdvanceChar(CurPtr, Tmp);

3099 switch (Char) {

3100 default:

3102 Result->push_back(Char);

3103 break;

3104 case 0:

3105

3106 if (CurPtr-1 != BufferEnd) {

3107 if (isCodeCompletionPoint(CurPtr-1)) {

3109 cutOffLexing();

3110 return;

3111 }

3112

3113

3115 Result->push_back(Char);

3116 break;

3117 }

3118

3119 [[fallthrough]];

3120 case '\r':

3121 case '\n':

3122

3123 assert(CurPtr[-1] == Char && "Trigraphs for newline?");

3124 BufferPtr = CurPtr-1;

3125

3126

3127 Lex(Tmp);

3128 if (Tmp.is(tok::code_completion)) {

3129 if (PP)

3131 Lex(Tmp);

3132 }

3133 assert(Tmp.is(tok::eod) && "Unexpected token!");

3134

3135

3136 return;

3137 }

3138 }

3139}

3140

3141

3142

3143

3144

3145bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {

3146

3147

3148

3150

3152

3153 FormTokenWithChars(Result, CurPtr, tok::eod);

3154

3155

3156 if (PP)

3158 return true;

3159 }

3160

3161

3162

3164 Result.startToken();

3165 BufferPtr = BufferEnd;

3166 FormTokenWithChars(Result, BufferEnd, tok::eof);

3167 return true;

3168 }

3169

3172

3173

3174

3178 }

3179

3180

3181

3182

3186 diag::err_pp_unterminated_conditional);

3188 }

3189

3190

3191

3192 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {

3195 unsigned DiagID;

3196

3197 if (LangOpts.CPlusPlus11) {

3198

3199

3200

3201 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {

3202 DiagID = diag::warn_cxx98_compat_no_newline_eof;

3203 } else {

3204 DiagID = diag::warn_no_newline_eof;

3205 }

3206 } else {

3207 DiagID = diag::ext_no_newline_eof;

3208 }

3209

3210 Diag(BufferEnd, DiagID)

3212 }

3213

3214 BufferPtr = CurPtr;

3215

3216

3218}

3219

3220

3221

3222

3223

3224unsigned Lexer::isNextPPTokenLParen() {

3225 assert( LexingRawMode && "How can we expand a macro from a skipping buffer?");

3226

3227 if (isDependencyDirectivesLexer()) {

3228 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())

3229 return 2;

3230 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(

3231 tok::l_paren);

3232 }

3233

3234

3235

3236

3238

3239

3240 const char *TmpBufferPtr = BufferPtr;

3242 bool atStartOfLine = IsAtStartOfLine;

3243 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;

3244 bool leadingSpace = HasLeadingSpace;

3245

3247 Lex(Tok);

3248

3249

3250 BufferPtr = TmpBufferPtr;

3252 HasLeadingSpace = leadingSpace;

3253 IsAtStartOfLine = atStartOfLine;

3254 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;

3255

3256

3258

3259 if (Tok.is(tok::eof))

3260 return 2;

3261 return Tok.is(tok::l_paren);

3262}

3263

3264

3265static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,

3267 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";

3268 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;

3269 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);

3270 size_t Pos = RestOfBuffer.find(Terminator);

3271 while (Pos != StringRef::npos) {

3272

3273 if (Pos == 0 ||

3274 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {

3275 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);

3276 Pos = RestOfBuffer.find(Terminator);

3277 continue;

3278 }

3279 return RestOfBuffer.data()+Pos;

3280 }

3281 return nullptr;

3282}

3283

3284

3285

3286

3287

3288bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {

3289

3290 if (CurPtr != BufferStart &&

3291 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')

3292 return false;

3293

3294

3295 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&

3296 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))

3297 return false;

3298

3299

3300

3302 return false;

3303

3305

3306

3307

3309

3310

3311 Diag(CurPtr, diag::err_conflict_marker);

3312 CurrentConflictMarkerState = Kind;

3313

3314

3315

3316 while (*CurPtr != '\r' && *CurPtr != '\n') {

3317 assert(CurPtr != BufferEnd && "Didn't find end of line");

3318 ++CurPtr;

3319 }

3320 BufferPtr = CurPtr;

3321 return true;

3322 }

3323

3324

3325 return false;

3326}

3327

3328

3329

3330

3331

3332bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {

3333

3334 if (CurPtr != BufferStart &&

3335 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')

3336 return false;

3337

3338

3339

3341 return false;

3342

3343

3344 for (unsigned i = 1; i != 4; ++i)

3345 if (CurPtr[i] != CurPtr[0])

3346 return false;

3347

3348

3349

3350

3351 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,

3352 CurrentConflictMarkerState)) {

3353 CurPtr = End;

3354

3355

3356 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')

3357 ++CurPtr;

3358

3359 BufferPtr = CurPtr;

3360

3361

3362 CurrentConflictMarkerState = CMK_None;

3363 return true;

3364 }

3365

3366 return false;

3367}

3368

/// Scan [CurPtr, BufferEnd) for the two-character sequence "#>" that closes an
/// editor placeholder.
///
/// \returns a pointer just past the "#>" if found, otherwise nullptr (also
/// for an empty range).
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}

3380

3381bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {

3382 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");

3384 return false;

3386 if (!End)

3387 return false;

3388 const char *Start = CurPtr - 1;

3389 if (!LangOpts.AllowEditorPlaceholders)

3390 Diag(Start, diag::err_placeholder_in_source);

3391 Result.startToken();

3392 FormTokenWithChars(Result, End, tok::raw_identifier);

3393 Result.setRawIdentifierData(Start);

3396 BufferPtr = End;

3397 return true;

3398}

3399

3400bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {

3404 }

3405

3406 return false;

3407}

3408

3409std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,

3410 const char *SlashLoc,

3412 unsigned CharSize;

3413 char Kind = getCharAndSize(StartPtr, CharSize);

3414 assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

3415

3416 unsigned NumHexDigits;

3417 if (Kind == 'u')

3418 NumHexDigits = 4;

3419 else if (Kind == 'U')

3420 NumHexDigits = 8;

3421

3422 bool Delimited = false;

3423 bool FoundEndDelimiter = false;

3424 unsigned Count = 0;

3426

3427 if (!LangOpts.CPlusPlus && !LangOpts.C99) {

3428 if (Diagnose)

3429 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);

3430 return std::nullopt;

3431 }

3432

3433 const char *CurPtr = StartPtr + CharSize;

3434 const char *KindLoc = &CurPtr[-1];

3435

3437 while (Count != NumHexDigits || Delimited) {

3438 char C = getCharAndSize(CurPtr, CharSize);

3439 if (!Delimited && Count == 0 && C == '{') {

3440 Delimited = true;

3441 CurPtr += CharSize;

3442 continue;

3443 }

3444

3445 if (Delimited && C == '}') {

3446 CurPtr += CharSize;

3447 FoundEndDelimiter = true;

3448 break;

3449 }

3450

3451 unsigned Value = llvm::hexDigitValue(C);

3452 if (Value == -1U) {

3453 if (!Delimited)

3454 break;

3455 if (Diagnose)

3456 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)

3457 << StringRef(KindLoc, 1);

3458 return std::nullopt;

3459 }

3460

3461 if (CodePoint & 0xF000'0000) {

3462 if (Diagnose)

3463 Diag(KindLoc, diag::err_escape_too_large) << 0;

3464 return std::nullopt;

3465 }

3466

3467 CodePoint <<= 4;

3468 CodePoint |= Value;

3469 CurPtr += CharSize;

3470 Count++;

3471 }

3472

3473 if (Count == 0) {

3474 if (Diagnose)

3475 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty

3476 : diag::warn_ucn_escape_no_digits)

3477 << StringRef(KindLoc, 1);

3478 return std::nullopt;

3479 }

3480

3481 if (Delimited && Kind == 'U') {

3482 if (Diagnose)

3483 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);

3484 return std::nullopt;

3485 }

3486

3487 if (!Delimited && Count != NumHexDigits) {

3488 if (Diagnose) {

3489 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);

3490

3491 if (Count == 4 && NumHexDigits == 8) {

3493 Diag(KindLoc, diag::note_ucn_four_not_eight)

3495 }

3496 }

3497 return std::nullopt;

3498 }

3499

3500 if (Delimited && PP) {

3502 ? diag::warn_cxx23_delimited_escape_sequence

3503 : diag::ext_delimited_escape_sequence)

3504 << 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);

3505 }

3506

3509

3510

3511

3512 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))

3513 StartPtr = CurPtr;

3514 else

3515 while (StartPtr != CurPtr)

3516 (void)getAndAdvanceChar(StartPtr, *Result);

3517 } else {

3518 StartPtr = CurPtr;

3519 }

3520 return CodePoint;

3521}

3522

// Reads a named universal character escape whose 'N' is at StartPtr and
// returns the matching code point, or std::nullopt when the escape is
// malformed or the name is unknown. StartPtr is advanced past the escape only
// on the paths at the bottom of the function.
// NOTE(review): this listing dropped several original lines (3525, 3527,
// 3544, 3553, 3578, 3589, 3601-3602), so the last parameter and the
// declarations of Diagnose, Buffer and Result are not visible here.
3523std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,

3524 const char *SlashLoc,

3526 unsigned CharSize;

3528

3529 char C = getCharAndSize(StartPtr, CharSize);

3530 assert(C == 'N' && "expected \\N{...}");

3531

// KindLoc is the byte just before CurPtr; it is passed to the diagnostics
// below as a one-character StringRef.
3532 const char *CurPtr = StartPtr + CharSize;

3533 const char *KindLoc = &CurPtr[-1];

3534

// The character after 'N' must be '{'; otherwise report an incomplete escape.
3535 C = getCharAndSize(CurPtr, CharSize);

3536 if (C != '{') {

3537 if (Diagnose)

3538 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);

3539 return std::nullopt;

3540 }

3541 CurPtr += CharSize;

3542 const char *StartName = CurPtr;

3543 bool FoundEndDelimiter = false;

// Accumulate the name into Buffer until '}' is seen. The second break is
// guarded by a condition on a dropped line (3553) that is not visible here.
3545 while (C) {

3546 C = getCharAndSize(CurPtr, CharSize);

3547 CurPtr += CharSize;

3548 if (C == '}') {

3549 FoundEndDelimiter = true;

3550 break;

3551 }

3552

3554 break;

3555 Buffer.push_back(C);

3556 }

3557

// No closing '}' or an empty name: pick the matching warning and fail.
3558 if (!FoundEndDelimiter || Buffer.empty()) {

3559 if (Diagnose)

3560 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty

3561 : diag::warn_delimited_ucn_incomplete)

3562 << StringRef(KindLoc, 1);

3563 return std::nullopt;

3564 }

3565

// Look the name up with nameToCodepointStrict first; when that yields nothing,
// also query nameToCodepointLooseMatching.
3566 StringRef Name(Buffer.data(), Buffer.size());

3567 std::optional<char32_t> Match =

3568 llvm::sys::unicode::nameToCodepointStrict(Name);

// NOTE(review): the template angle brackets on the next line look lost in
// extraction; presumably std::optional<...::LooseMatchingResult>.
3569 std::optionalllvm::sys::unicode::LooseMatchingResult LooseMatch;

3570 if (!Match) {

3571 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);

3572 if (Diagnose) {

3573 Diag(StartName, diag::err_invalid_ucn_name)

3574 << StringRef(Buffer.data(), Buffer.size())

3575 << makeCharRange(*this, StartName, CurPtr - CharSize);

3576 if (LooseMatch) {

3577 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)

3579 makeCharRange(*this, StartName, CurPtr - CharSize),

3580 LooseMatch->Name);

3581 }

3582 }

3583

3584

3585

3586 }

3587

3588 if (Diagnose && Match)

3590 ? diag::warn_cxx23_delimited_escape_sequence

3591 : diag::ext_delimited_escape_sequence)

3592 << 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);

3593

3594

3595

3596

3597

// The loose match's code point is adopted only when Diagnose is set.
3598 if (LooseMatch && Diagnose)

3599 Match = LooseMatch->CodePoint;

3600

3603

3604

3605

// If the distance consumed equals Buffer.size() + 3 (presumably 'N', '{' and
// '}' plus the name, each one byte), StartPtr jumps straight to CurPtr;
// otherwise it is advanced character by character through getAndAdvanceChar.
3606 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))

3607 StartPtr = CurPtr;

3608 else

3609 while (StartPtr != CurPtr)

3610 (void)getAndAdvanceChar(StartPtr, *Result);

3611 } else {

3612 StartPtr = CurPtr;

3613 }

3614 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;

3615}

3616

// Reads a universal character name whose kind character is at StartPtr and
// returns its code point, or 0 when nothing usable was read.
// NOTE(review): this listing dropped original lines 3618, 3656 and 3670, so
// the last parameter and the conditions guarding the two diagnostic groups
// below are not visible here.
3617uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,

3619

3620 unsigned CharSize;

3621 std::optional<uint32_t> CodePointOpt;

// Dispatch on the kind character: 'u'/'U' go to the numeric reader, 'N' to the
// named reader; any other character leaves CodePointOpt empty.
3622 char Kind = getCharAndSize(StartPtr, CharSize);

3623 if (Kind == 'u' || Kind == 'U')

3624 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);

3625 else if (Kind == 'N')

3626 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);

3627

3628 if (!CodePointOpt)

3629 return 0;

3630

3631 uint32_t CodePoint = *CodePointOpt;

3632

3633

// With LangOpts.AsmPreprocessor set, the code point is returned without the
// range checks below.
3634 if (LangOpts.AsmPreprocessor)

3635 return CodePoint;

3636

3637

3638

3639

3640

3641

3642

3643

3644

3645

3646

3647

3648

3649

3650

3651

3652

// Code points below 0xA0 are rejected (return 0). The diagnostic distinguishes
// values below 0x20 or at/above 0x7F from the remaining ones, which are
// reported with the character itself.
3653 if (CodePoint < 0xA0) {

3654

3655

3657 if (CodePoint < 0x20 || CodePoint >= 0x7F)

3658 Diag(BufferPtr, diag::err_ucn_control_character);

3659 else {

3660 char C = static_cast<char>(CodePoint);

3661 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);

3662 }

3663 }

3664

3665 return 0;

// Code points in 0xD800..0xDFFF are rejected as well; C++ before C++11 gets
// warn_ucn_escape_surrogate, everything else err_ucn_escape_invalid.
3666 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {

3667

3668

3669

3671 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)

3672 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);

3673 else

3674 Diag(BufferPtr, diag::err_ucn_escape_invalid);

3675 }

3676 return 0;

3677 }

3678

3679 return CodePoint;

3680}

3681

// Returns true after emitting diag::ext_unicode_whitespace at BufferPtr, and
// false otherwise.
// NOTE(review): this listing dropped original lines 3684-3685, 3687 and 3689,
// so the condition that guards the diagnostic and the rest of the guarded
// body are not visible here.
3682bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,

3683 const char *CurPtr) {

3686 Diag(BufferPtr, diag::ext_unicode_whitespace)

3688

3690 return true;

3691 }

3692 return false;

3693}

3694

// Copies the start-of-line, leading-space and leading-empty-macro state of
// Result into the lexer's IsAtStartOfLine, HasLeadingSpace and
// HasLeadingEmptyMacro members.
3695void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {

3696 IsAtStartOfLine = Result.isAtStartOfLine();

3697 HasLeadingSpace = Result.hasLeadingSpace();

3698 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();

3699

3700}

3701

// NOTE(review): the signature of this function (original line 3702) was
// dropped from this listing; presumably this is the body of Lexer::Lex. The
// body resets Result, clears the pending per-line flags, and forwards to
// LexTokenInternal, returning its result. Lines 3710, 3715, 3720 and 3726 are
// also missing, so what happens to Result inside each `if` and the
// declaration of isRawLex are not visible here.
3703 assert(!isDependencyDirectivesLexer());

3704

3705

3706 Result.startToken();

3707

3708

// Each pending flag is cleared once it has been seen.
3709 if (IsAtStartOfLine) {

3711 IsAtStartOfLine = false;

3712 }

3713

3714 if (HasLeadingSpace) {

3716 HasLeadingSpace = false;

3717 }

3718

3719 if (HasLeadingEmptyMacro) {

3721 HasLeadingEmptyMacro = false;

3722 }

3723

// Snapshot the physical start-of-line state before resetting it; the snapshot
// is what LexTokenInternal receives.
3724 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;

3725 IsAtPhysicalStartOfLine = false;

// The cast to void keeps isRawLex "used" when the assert below is compiled out.
3727 (void) isRawLex;

3728 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);

3729

3730 assert((returnedToken || !isRawLex) && "Raw lex must succeed");

3731 return returnedToken;

3732}

3733

3734

3735

3736

3737

3738

// Lexes one token starting at BufferPtr into Result. The function is a single
// switch on the first character; paths that only skip input (whitespace,
// comments, conflict markers, dropped bytes) jump to LexNextToken, which
// restarts at LexStart. Most paths that form a token return true; the
// HandleDirective path can return false.
// NOTE(review): this listing dropped many original lines (every line that
// carried a hyperlink) and appears to have lost some '!' operators, e.g. in
// the two asserts below, before inKeepCommentMode and before LexingRawMode.
// The declaration of Kind is among the dropped lines. Read the conditions
// with that in mind.
3739bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {

3740LexStart:

3741 assert( Result .needsCleaning() && "Result needs cleaning");

3742 assert( Result .hasPtrData() && "Result has not been reset");

3743

3744

3745 const char *CurPtr = BufferPtr;

3746

3747

3749 do {

3750 ++CurPtr;

3752

3753

3754

3755

3757 FormTokenWithChars(Result, CurPtr, tok::unknown);

3758

3759 return true;

3760 }

3761

3762 BufferPtr = CurPtr;

3764 }

3765

3766 unsigned SizeTmp, SizeTmp2;

3767

3768

// Read the first character of the token; the switch below dispatches on it.
3769 char Char = getAndAdvanceChar(CurPtr, Result);

3771

3773 NewLinePtr = nullptr;

3774

3775 switch (Char) {

// NUL: at BufferEnd it is end of file; at the code-completion point it forms
// tok::code_completion; otherwise null_in_file is reported and the byte is
// skipped through SkipWhitespace.
3776 case 0:

3777

3778 if (CurPtr-1 == BufferEnd)

3779 return LexEndOfFile(Result, CurPtr-1);

3780

3781

3782 if (isCodeCompletionPoint(CurPtr-1)) {

3783

3784 Result.startToken();

3785 FormTokenWithChars(Result, CurPtr, tok::code_completion);

3786 return true;

3787 }

3788

3790 Diag(CurPtr-1, diag::null_in_file);

3792 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

3793 return true;

3794

3795

3796

3797 goto LexNextToken;

3798

// Character 26: with LangOpts.MicrosoftExt it ends the file (after
// ext_ctrl_z_eof_microsoft); otherwise it becomes tok::unknown.
3799 case 26:

3800

3801 if (LangOpts.MicrosoftExt) {

3803 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);

3804 return LexEndOfFile(Result, CurPtr-1);

3805 }

3806

3807

3808 Kind = tok::unknown;

3809 break;

3810

// A '\r' immediately followed by '\n' consumes the '\n' too, then shares the
// '\n' handling.
3811 case '\r':

3812 if (CurPtr[0] == '\n')

3813 (void)getAndAdvanceChar(CurPtr, Result);

3814 [[fallthrough]];

3815 case '\n':

3816

3817

3819

3821

3822

3823 if (PP)

3825

3826

// This branch (its condition is on a dropped line) forms tok::eod and marks
// the next token as being at the start of a line.
3827 IsAtStartOfLine = true;

3828 IsAtPhysicalStartOfLine = true;

3829 NewLinePtr = CurPtr - 1;

3830

3831 Kind = tok::eod;

3832 break;

3833 }

3834

3835

3837

3838 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

3839 return true;

3840

3841

3842

3843 goto LexNextToken;

// Horizontal whitespace.
3844 case ' ':

3845 case '\t':

3846 case '\f':

3847 case '\v':

3848 SkipHorizontalWhitespace:

3850 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

3851 return true;

3852

// After skipping, check whether a line or block comment follows directly and
// skip that as well, looping back here until neither does.
3853 SkipIgnoredUnits:

3854 CurPtr = BufferPtr;

3855

3856

3857

3858 if (CurPtr[0] == '/' && CurPtr[1] == '/' && inKeepCommentMode () &&

3859 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {

3860 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))

3861 return true;

3862 goto SkipIgnoredUnits;

3863 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && inKeepCommentMode ()) {

3864 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))

3865 return true;

3866 goto SkipIgnoredUnits;

3868 goto SkipHorizontalWhitespace;

3869 }

3870

3871

3872 goto LexNextToken;

3873

3874

3875

// A leading digit starts a numeric constant.
3876 case '0': case '1': case '2': case '3': case '4':

3877 case '5': case '6': case '7': case '8': case '9':

3878

3880 return LexNumericConstant(Result, CurPtr);

3881

3882

3883

3884

// 'u' may prefix a UTF-16 string/char literal, a raw string (uR"), or the
// u8 forms; this is only considered under CPlusPlus11 or C11. Anything else
// continues as an identifier.
3885 case 'u':

3886

3888

3889 if (LangOpts.CPlusPlus11 || LangOpts.C11) {

3890 Char = getCharAndSize(CurPtr, SizeTmp);

3891

3892

3893 if (Char == '"')

3894 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3895 tok::utf16_string_literal);

3896

3897

3898 if (Char == '\'')

3899 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3900 tok::utf16_char_constant);

3901

3902

3903 if (Char == 'R' && LangOpts.RawStringLiterals &&

3904 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')

3905 return LexRawStringLiteral(Result,

3906 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3908 tok::utf16_string_literal);

3909

3910 if (Char == '8') {

3911 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

3912

3913

3914 if (Char2 == '"')

3915 return LexStringLiteral(Result,

3916 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3918 tok::utf8_string_literal);

3919 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))

3920 return LexCharConstant(

3921 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3923 tok::utf8_char_constant);

3924

3925 if (Char2 == 'R' && LangOpts.RawStringLiterals) {

3926 unsigned SizeTmp3;

3927 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);

3928

3929 if (Char3 == '"') {

3930 return LexRawStringLiteral(Result,

3931 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3934 tok::utf8_string_literal);

3935 }

3936 }

3937 }

3938 }

3939

3940

3941 return LexIdentifierContinue(Result, CurPtr);

3942

// 'U' may prefix a UTF-32 string/char literal or raw string (UR"), again only
// under CPlusPlus11 or C11.
3943 case 'U':

3944

3946

3947 if (LangOpts.CPlusPlus11 || LangOpts.C11) {

3948 Char = getCharAndSize(CurPtr, SizeTmp);

3949

3950

3951 if (Char == '"')

3952 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3953 tok::utf32_string_literal);

3954

3955

3956 if (Char == '\'')

3957 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3958 tok::utf32_char_constant);

3959

3960

3961 if (Char == 'R' && LangOpts.RawStringLiterals &&

3962 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')

3963 return LexRawStringLiteral(Result,

3964 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

3966 tok::utf32_string_literal);

3967 }

3968

3969

3970 return LexIdentifierContinue(Result, CurPtr);

3971

// 'R' followed by '"' is a raw string literal when RawStringLiterals is on.
3972 case 'R':

3973

3975

3976 if (LangOpts.RawStringLiterals) {

3977 Char = getCharAndSize(CurPtr, SizeTmp);

3978

3979 if (Char == '"')

3980 return LexRawStringLiteral(Result,

3981 ConsumeChar(CurPtr, SizeTmp, Result),

3982 tok::string_literal);

3983 }

3984

3985

3986 return LexIdentifierContinue(Result, CurPtr);

3987

// 'L' may prefix a wide string, wide raw string (LR") or wide char constant;
// otherwise it falls through to the identifier case.
3988 case 'L':

3989

3991 Char = getCharAndSize(CurPtr, SizeTmp);

3992

3993

3994 if (Char == '"')

3995 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),

3996 tok::wide_string_literal);

3997

3998

3999 if (LangOpts.RawStringLiterals && Char == 'R' &&

4000 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')

4001 return LexRawStringLiteral(Result,

4002 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4004 tok::wide_string_literal);

4005

4006

4007 if (Char == '\'')

4008 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),

4009 tok::wide_char_constant);

4010

4011 [[fallthrough]];

4012

4013

// Remaining letters and '_' start an identifier ('L', 'R', 'U' and 'u' are
// handled by their own cases above).
4014 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':

4015 case 'H': case 'I': case 'J': case 'K': case 'M': case 'N':

4016 case 'O': case 'P': case 'Q': case 'S': case 'T':

4017 case 'V': case 'W': case 'X': case 'Y': case 'Z':

4018 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':

4019 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':

4020 case 'o': case 'p': case 'q': case 'r': case 's': case 't':

4021 case 'v': case 'w': case 'x': case 'y': case 'z':

4022 case '_':

4023

4025 return LexIdentifierContinue(Result, CurPtr);

4026

// '$' starts an identifier only when LangOpts.DollarIdents is set; otherwise
// it is tok::unknown.
4027 case '$':

4028 if (LangOpts.DollarIdents) {

4030 Diag(CurPtr-1, diag::ext_dollar_in_identifier);

4031

4033 return LexIdentifierContinue(Result, CurPtr);

4034 }

4035

4036 Kind = tok::unknown;

4037 break;

4038

4039

4040 case '\'':

4041

4043 return LexCharConstant(Result, CurPtr, tok::char_constant);

4044

4045

4046 case '"':

4047

4049 return LexStringLiteral(Result, CurPtr,

4051 : tok::string_literal);

4052

4053

// Single-character punctuators.
4054 case '?':

4055 Kind = tok::question;

4056 break;

4057 case '[':

4058 Kind = tok::l_square;

4059 break;

4060 case ']':

4061 Kind = tok::r_square;

4062 break;

4063 case '(':

4064 Kind = tok::l_paren;

4065 break;

4066 case ')':

4067 Kind = tok::r_paren;

4068 break;

4069 case '{':

4070 Kind = tok::l_brace;

4071 break;

4072 case '}':

4073 Kind = tok::r_brace;

4074 break;

// '.' followed by a digit is a numeric constant; ".*" (C++ only) and "..."
// are multi-character tokens; otherwise a plain period.
4075 case '.':

4076 Char = getCharAndSize(CurPtr, SizeTmp);

4077 if (Char >= '0' && Char <= '9') {

4078

4080

4081 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));

4082 } else if (LangOpts.CPlusPlus && Char == '*') {

4083 Kind = tok::periodstar;

4084 CurPtr += SizeTmp;

4085 } else if (Char == '.' &&

4086 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {

4087 Kind = tok::ellipsis;

4088 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4090 } else {

4091 Kind = tok::period;

4092 }

4093 break;

4094 case '&':

4095 Char = getCharAndSize(CurPtr, SizeTmp);

4096 if (Char == '&') {

4097 Kind = tok::ampamp;

4098 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4099 } else if (Char == '=') {

4100 Kind = tok::ampequal;

4101 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4102 } else {

4103 Kind = tok::amp;

4104 }

4105 break;

4106 case '*':

4107 if (getCharAndSize(CurPtr, SizeTmp) == '=') {

4108 Kind = tok::starequal;

4109 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4110 } else {

4111 Kind = tok::star;

4112 }

4113 break;

4114 case '+':

4115 Char = getCharAndSize(CurPtr, SizeTmp);

4116 if (Char == '+') {

4117 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4118 Kind = tok::plusplus;

4119 } else if (Char == '=') {

4120 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4121 Kind = tok::plusequal;

4122 } else {

4123 Kind = tok::plus;

4124 }

4125 break;

// '-': "--", "->*" (C++ only), "->", "-=" or plain minus. The "->*" test must
// precede the "->" test.
4126 case '-':

4127 Char = getCharAndSize(CurPtr, SizeTmp);

4128 if (Char == '-') {

4129 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4130 Kind = tok::minusminus;

4131 } else if (Char == '>' && LangOpts.CPlusPlus &&

4132 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {

4133 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4135 Kind = tok::arrowstar;

4136 } else if (Char == '>') {

4137 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4138 Kind = tok::arrow;

4139 } else if (Char == '=') {

4140 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4141 Kind = tok::minusequal;

4142 } else {

4143 Kind = tok::minus;

4144 }

4145 break;

4146 case '~':

4147 Kind = tok::tilde;

4148 break;

4149 case '!':

4150 if (getCharAndSize(CurPtr, SizeTmp) == '=') {

4151 Kind = tok::exclaimequal;

4152 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4153 } else {

4154 Kind = tok::exclaim;

4155 }

4156 break;

// '/': "//" is a line comment when TreatAsComment ends up true, "/*" is a
// block comment, "/=" is slashequal, anything else is a plain slash.
4157 case '/':

4158

4159 Char = getCharAndSize(CurPtr, SizeTmp);

4160 if (Char == '/') {

4161

4162

4163

4164

4165

4166

4167

4168

4169 bool TreatAsComment =

4170 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);

4171 if (!TreatAsComment)

4173 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

4174

4175 if (TreatAsComment) {

4176 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),

4177 TokAtPhysicalStartOfLine))

4178 return true;

4179

4180

4181

4182

4183 goto SkipIgnoredUnits;

4184 }

4185 }

4186

4187 if (Char == '*') {

4188 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),

4189 TokAtPhysicalStartOfLine))

4190 return true;

4191

4192

4193

4194 goto LexNextToken;

4195 }

4196

4197 if (Char == '=') {

4198 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4199 Kind = tok::slashequal;

4200 } else {

4201 Kind = tok::slash;

4202 }

4203 break;

// '%': "%=" plus, under LangOpts.Digraphs, "%>" (r_brace) and the "%:" family,
// which yields hashhash ("%:%:"), hashat ("%:@", MicrosoftExt) or hash.
4204 case '%':

4205 Char = getCharAndSize(CurPtr, SizeTmp);

4206 if (Char == '=') {

4207 Kind = tok::percentequal;

4208 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4209 } else if (LangOpts.Digraphs && Char == '>') {

4210 Kind = tok::r_brace;

4211 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4212 } else if (LangOpts.Digraphs && Char == ':') {

4213 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4214 Char = getCharAndSize(CurPtr, SizeTmp);

4215 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {

4216 Kind = tok::hashhash;

4217 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4219 } else if (Char == '@' && LangOpts.MicrosoftExt) {

4220 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4222 Diag(BufferPtr, diag::ext_charize_microsoft);

4223 Kind = tok::hashat;

4224 } else {

4225

4226

4227

4228

4229 if (TokAtPhysicalStartOfLine && LexingRawMode && !Is_PragmaLexer)

4230 goto HandleDirective;

4231

4232 Kind = tok::hash;

4233 }

4234 } else {

4235 Kind = tok::percent;

4236 }

4237 break;

// '<': angled string literal (guard on a dropped line), "<<", "<<=", "<<<"
// (CUDA), conflict markers, "<=", "<=>", the "<:" and "<%" digraphs, or an
// editor placeholder starting with "<#".
4238 case '<':

4239 Char = getCharAndSize(CurPtr, SizeTmp);

4241 return LexAngledStringLiteral(Result, CurPtr);

4242 } else if (Char == '<') {

4243 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);

4244 if (After == '=') {

4245 Kind = tok::lesslessequal;

4246 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4248 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {

4249

4250

4251 goto LexNextToken;

4252 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {

4253

4254

4255 goto LexNextToken;

4256 } else if (LangOpts.CUDA && After == '<') {

4257 Kind = tok::lesslessless;

4258 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4260 } else {

4261 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4262 Kind = tok::lessless;

4263 }

4264 } else if (Char == '=') {

4265 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);

4266 if (After == '>') {

// "<=>" is one spaceship token only under CPlusPlus20; otherwise lexing falls
// through to "<=".
4267 if (LangOpts.CPlusPlus20) {

4269 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);

4270 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4272 Kind = tok::spaceship;

4273 break;

4274 }

4275

4276

4278 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)

4281 }

4282 }

4283 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4284 Kind = tok::lessequal;

4285 } else if (LangOpts.Digraphs && Char == ':') {

// Under CPlusPlus11, "<::" not followed by ':' or '>' is lexed as '<' (leaving
// "::" for the next token) instead of the "<:" digraph.
4286 if (LangOpts.CPlusPlus11 &&

4287 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {

4288

4289

4290

4291

4292

4293 unsigned SizeTmp3;

4294 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);

4295 if (After != ':' && After != '>') {

4296 Kind = tok::less;

4298 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);

4299 break;

4300 }

4301 }

4302

4303 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4304 Kind = tok::l_square;

4305 } else if (LangOpts.Digraphs && Char == '%') {

4306 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4307 Kind = tok::l_brace;

4308 } else if (Char == '#' && SizeTmp == 1 &&

4309 lexEditorPlaceholder(Result, CurPtr)) {

4310 return true;

4311 } else {

4312 Kind = tok::less;

4313 }

4314 break;

// '>': ">=", ">>", ">>=", ">>>" (CUDA), conflict markers, or plain greater.
4315 case '>':

4316 Char = getCharAndSize(CurPtr, SizeTmp);

4317 if (Char == '=') {

4318 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4319 Kind = tok::greaterequal;

4320 } else if (Char == '>') {

4321 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);

4322 if (After == '=') {

4323 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4325 Kind = tok::greatergreaterequal;

4326 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {

4327

4328

4329 goto LexNextToken;

4330 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {

4331

4332 goto LexNextToken;

4333 } else if (LangOpts.CUDA && After == '>') {

4334 Kind = tok::greatergreatergreater;

4335 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),

4337 } else {

4338 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4339 Kind = tok::greatergreater;

4340 }

4341 } else {

4342 Kind = tok::greater;

4343 }

4344 break;

// '^': "^=" or caret; under OpenCL a following '^' is diagnosed but the token
// is still a single caret.
4345 case '^':

4346 Char = getCharAndSize(CurPtr, SizeTmp);

4347 if (Char == '=') {

4348 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4349 Kind = tok::caretequal;

4350 } else {

4351 if (LangOpts.OpenCL && Char == '^')

4352 Diag(CurPtr, diag::err_opencl_logical_exclusive_or);

4353 Kind = tok::caret;

4354 }

4355 break;

// '|': "|=", "||" or pipe; three pipes in a row are first offered to
// HandleEndOfConflictMarker.
4356 case '|':

4357 Char = getCharAndSize(CurPtr, SizeTmp);

4358 if (Char == '=') {

4359 Kind = tok::pipeequal;

4360 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4361 } else if (Char == '|') {

4362

4363 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))

4364 goto LexNextToken;

4365 Kind = tok::pipepipe;

4366 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4367 } else {

4368 Kind = tok::pipe;

4369 }

4370 break;

// ':': the ":>" digraph (r_square), "::" or colon.
4371 case ':':

4372 Char = getCharAndSize(CurPtr, SizeTmp);

4373 if (LangOpts.Digraphs && Char == '>') {

4374 Kind = tok::r_square;

4375 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4376 } else if (Char == ':') {

4377 Kind = tok::coloncolon;

4378 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4379 } else {

4380 Kind = tok::colon;

4381 }

4382 break;

4383 case ';':

4384 Kind = tok::semi;

4385 break;

// '=': "==" or equal; three '=' in a row are first offered to
// HandleEndOfConflictMarker.
4386 case '=':

4387 Char = getCharAndSize(CurPtr, SizeTmp);

4388 if (Char == '=') {

4389

4390 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))

4391 goto LexNextToken;

4392

4393 Kind = tok::equalequal;

4394 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4395 } else {

4396 Kind = tok::equal;

4397 }

4398 break;

4399 case ',':

4400 Kind = tok::comma;

4401 break;

// '#': "##", "#@" (MicrosoftExt), a directive start (HandleDirective), or a
// plain hash token.
4402 case '#':

4403 Char = getCharAndSize(CurPtr, SizeTmp);

4404 if (Char == '#') {

4405 Kind = tok::hashhash;

4406 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4407 } else if (Char == '@' && LangOpts.MicrosoftExt) {

4408 Kind = tok::hashat;

4410 Diag(BufferPtr, diag::ext_charize_microsoft);

4411 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);

4412 } else {

4413

4414

4415

4416

4417 if (TokAtPhysicalStartOfLine && LexingRawMode && !Is_PragmaLexer)

4418 goto HandleDirective;

4419

4420 Kind = tok::hash;

4421 }

4422 break;

4423

// '@' is tok::at only when LangOpts.ObjC is set and the raw byte before CurPtr
// is '@'; otherwise tok::unknown.
4424 case '@':

4425

4426 if (CurPtr[-1] == '@' && LangOpts.ObjC)

4427 Kind = tok::at;

4428 else

4429 Kind = tok::unknown;

4430 break;

4431

4432

// Backslash: outside AsmPreprocessor mode, try to read a UCN. A code point
// accepted by CheckUnicodeWhitespace is skipped as whitespace; any other
// non-zero code point starts an identifier. Otherwise the token is unknown.
4433 case '\\':

4434 if (!LangOpts.AsmPreprocessor) {

4435 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {

4436 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {

4437 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

4438 return true;

4439

4440

4441

4442 goto LexNextToken;

4443 }

4444

4445 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);

4446 }

4447 }

4448

4449 Kind = tok::unknown;

4450 break;

4451

// Everything else. The first early-out (its condition is on a dropped line)
// yields tok::unknown; otherwise the bytes are decoded as one UTF-8 sequence.
4452 default: {

4454 Kind = tok::unknown;

4455 break;

4456 }

4457

4458 llvm::UTF32 CodePoint;

4459

4460

4461

// Step back onto the first byte so convertUTF8Sequence sees the whole sequence.
4462 --CurPtr;

4463 llvm::ConversionResult Status =

4464 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,

4465 (const llvm::UTF8 *)BufferEnd,

4466 &CodePoint,

4467 llvm::strictConversion);

4468 if (Status == llvm::conversionOK) {

4469 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {

4470 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))

4471 return true;

4472

4473

4474

4475 goto LexNextToken;

4476 }

4477 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);

4478 }

4479

4482 ++CurPtr;

4483 Kind = tok::unknown;

4484 break;

4485 }

4486

4487

4488

4489

// Invalid UTF-8 on the remaining path: diagnose, drop the byte by moving
// BufferPtr past it, and lex again.
4490 Diag(CurPtr, diag::err_invalid_utf8);

4491

4492 BufferPtr = CurPtr+1;

4493

4494

4495

4496 goto LexNextToken;

4497 }

4498 }

4499

4500

4502

4503

// Common exit for every case that set Kind and broke out of the switch.
4504 FormTokenWithChars(Result, CurPtr, Kind);

4505 return true;

4506

// Reached from the '#' and "%:" cases. The hash token is formed first; the
// lines between here and the returns (4511, 4513) are dropped, so the call
// made on it and the condition selecting `return true` are not visible.
4507HandleDirective:

4508

4509

4510 FormTokenWithChars(Result, CurPtr, tok::hash);

4512

4514

4515 return true;

4516

4517

4518 return false;

4519

// Restart the whole function for the next token (line 4521 is dropped).
4520LexNextToken:

4522 goto LexStart;

4523}

4524

// Resets Result, moves BufferPtr to just past the token described by DDTok
// (BufferStart + DDTok.Offset + DDTok.Length), and returns a pointer to the
// token's first character.
// NOTE(review): this listing dropped the parameter line (4526) and lines
// 4529-4532, so how Result is populated between startToken() and the
// BufferPtr update is not visible here.
4525const char *Lexer::convertDependencyDirectiveToken(

4527 const char *TokPtr = BufferStart + DDTok.Offset;

4528 Result.startToken();

4533 BufferPtr = TokPtr + DDTok.Length;

4534 return TokPtr;

4535}

4536

// Produces the next token from the pre-scanned DepDirectives list instead of
// from raw characters. Returns false only on the start-of-line '#' path;
// every other path returns true or the result of LexEndOfFile.
// NOTE(review): this listing dropped several original lines (4546, 4551,
// 4555, 4558, 4565, 4577, 4582-4585, 4604), including the declarations of
// DDTok and NextTok and the bodies of a few `if` statements.
4537bool Lexer::LexDependencyDirectiveToken(Token &Result) {

4538 assert(isDependencyDirectivesLexer());

4539

4540 using namespace dependency_directives_scan;

4541

// Advance to the next directive while the current one has no tokens left; a
// pp_eof directive ends the file.
4542 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {

4543 if (DepDirectives.front().Kind == pp_eof)

4544 return LexEndOfFile(Result, BufferEnd);

4545 if (DepDirectives.front().Kind == tokens_present_before_eof)

4547 NextDepDirectiveTokenIndex = 0;

4548 DepDirectives = DepDirectives.drop_front();

4549 }

4550

4552 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];

// True for every token except a tok::hash that is the first token of its
// directive.
4553 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {

4554

4556 }

4557

// This branch (its condition is on a dropped line) lexes an angled string
// starting at the token's offset. When a header_name results, the token index
// is advanced past every scanned token that starts before the new BufferPtr.
4559 BufferPtr = BufferStart + DDTok.Offset;

4560 LexAngledStringLiteral(Result, BufferPtr + 1);

4561 if (Result.isNot(tok::header_name))

4562 return true;

4563

4564 while (true) {

4566 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];

4567 if (BufferStart + NextTok.Offset >= BufferPtr)

4568 break;

4569 ++NextDepDirectiveTokenIndex;

4570 }

4571 return true;

4572 }

4573

4574 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);

4575

4576 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {

4578 return false;

4579 }

4580 if (Result.is(tok::raw_identifier)) {

4581 Result.setRawIdentifierData(TokPtr);

4586 }

4587 return true;

4588 }

4589 if (Result.isLiteral()) {

4590 Result.setLiteralData(TokPtr);

4591 return true;

4592 }

// A colon whose next raw character is also ':' absorbs the following colon
// token and becomes tok::coloncolon.
4593 if (Result.is(tok::colon)) {

4594

4595 if (*BufferPtr == ':') {

4596 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(

4597 tok::colon));

4598 ++NextDepDirectiveTokenIndex;

4599 Result.setKind(tok::coloncolon);

4600 }

4601 return true;

4602 }

4603 if (Result.is(tok::eod))

4605

4606 return true;

4607}

4608

// Drops directives from the front of DepDirectives until one sets Stop, then
// converts that directive's leading '#' token into Result and returns false.
// One switch arm instead returns the result of LexEndOfFile.
// NOTE(review): every `case` label of the switch was dropped from this
// listing (original lines 4619, 4621-4638, 4640-4642, 4645-4648, 4653, 4660),
// as was the declaration of DDTok (4666), so which directive kinds map to
// which arm is not visible here.
4609bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {

4610 assert(isDependencyDirectivesLexer());

4611

4612 using namespace dependency_directives_scan;

4613

4614 bool Stop = false;

// Nesting depth tracked while skipping; arms below stop only at depth zero.
4615 unsigned NestedIfs = 0;

4616 do {

4617 DepDirectives = DepDirectives.drop_front();

4618 switch (DepDirectives.front().Kind) {

4620 llvm_unreachable("unexpected 'pp_none'");

4639 break;

// Arm that opens a nested level.
4643 ++NestedIfs;

4644 break;

// Arm that stops at depth zero and otherwise leaves the depth unchanged.
4649 if (!NestedIfs) {

4650 Stop = true;

4651 }

4652 break;

// Arm that stops at depth zero and otherwise closes one nested level.
4654 if (!NestedIfs) {

4655 Stop = true;

4656 } else {

4657 --NestedIfs;

4658 }

4659 break;

4661 NextDepDirectiveTokenIndex = 0;

4662 return LexEndOfFile(Result, BufferEnd);

4663 }

4664 } while (!Stop);

4665

4667 DepDirectives.front().Tokens.front();

4668 assert(DDTok.is(tok::hash));

// The '#' is consumed here, so the next token to read has index 1.
4669 NextDepDirectiveTokenIndex = 1;

4670

4671 convertDependencyDirectiveToken(DDTok, Result);

4672 return false;

4673}

Defines the Diagnostic-related interfaces.

Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.

Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.

Defines the clang::LangOptions interface.

static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)

static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)

static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)

static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)

DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...

static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)

Slow case of getSpelling.

static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)

Find the end of a version control conflict marker.

static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)

After encountering UTF-8 character C and interpreting it as an identifier character,...

static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

static void StringifyImpl(T &Str, char Quote)

static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)

GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...

static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)

static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)

static bool isUnicodeWhitespace(uint32_t Codepoint)

static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)

static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)

static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)

static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)

static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)

isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline character (either \n or \r) ...

static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)

static char GetTrigraphCharForLetter(char Letter)

GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...

static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)

static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)

static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)

Returns the pointer that points to the beginning of line that contains the given offset,...

Defines the MultipleIncludeOpt interface.

Defines the clang::Preprocessor interface.

Defines the clang::SourceLocation class and associated facilities.

Defines the SourceManager interface.

Defines the clang::TokenKind enum and support functions.

static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]

static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]

static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]

static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]

static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]

static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]

static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]

static const llvm::sys::UnicodeCharRange XIDStartRanges[]

static const llvm::sys::UnicodeCharRange XIDContinueRanges[]

__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)

__device__ __2f16 float c

__PTRDIFF_TYPE__ ptrdiff_t

static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)

static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)

Represents a character-granular source range.

static CharSourceRange getCharRange(SourceRange R)

SourceLocation getEnd() const

SourceLocation getBegin() const

A little helper class used to produce diagnostics.

Concrete class used by the front-end to report problems and issues.

DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)

Issue the message to the client.

bool isIgnored(unsigned DiagID, SourceLocation Loc) const

Determine whether the diagnostic is known to be ignored.

An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...

static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)

Create a code modification hint that replaces the given source range with the given code string.

static FixItHint CreateRemoval(CharSourceRange RemoveRange)

Create a code modification hint that removes the given source range.

static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)

Create a code modification hint that inserts the given code string at a specific location.

One of these records is kept for each identifier that is lexed.

bool isHandleIdentifierCase() const

Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.

bool isKeyword(const LangOptions &LangOpts) const

Return true if this token is a keyword in the specified language.

tok::ObjCKeywordKind getObjCKeywordID() const

Return the Objective-C keyword ID for this identifier.

IdentifierInfo & get(StringRef Name)

Return the identifier token info for the specified named identifier.

Keeps track of the various options that can be enabled, which control the dialect of C or C++ that i...

Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.

static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)

Returns a string for the source that the range encompasses.

void SetKeepWhitespaceMode(bool Val)

SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.

static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)

Checks that the given token is the first token that occurs after the given location (this excludes co...

bool LexFromRawLexer(Token &Result)

LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...

bool inKeepCommentMode() const

inKeepCommentMode - Return true if the lexer should return comments as tokens.

void SetCommentRetentionState(bool Mode)

SetCommentRetentionState - Change the comment retention mode of the lexer to the specified mode.

static std::optional< Token > findPreviousToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments)

Finds the token that comes before the given location.

void seek(unsigned Offset, bool IsAtStartOfLine)

Set the lexer's buffer pointer to Offset.

static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

Retrieve the name of the immediate macro expansion.

void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)

ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.

static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)

Returns true if the given MacroID location points at the first token of the macro expansion.

DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const

Diag - Forwarding function for diagnostics.

const char * getBufferLocation() const

Return the current location in the buffer.

bool Lex(Token &Result)

Lex - Return the next token in the file.

bool isPragmaLexer() const

isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.

static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)

Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...

Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)

Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...

static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)

Returns true if the given MacroID location points at the last token of the macro expansion.

SourceLocation getSourceLocation() override

getSourceLocation - Return a source location for the next character in the current file.

static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)

Accepts a range and returns a character range with file locations.

static bool isNewLineEscaped(const char *BufferStart, const char *Str)

Checks whether new line pointed by Str is preceded by escape sequence.

SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const

getSourceLocation - Return a source location identifier for the specified offset in the current file.

static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)

Returns the leading whitespace for line that corresponds to the given location Loc.

static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)

getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...

bool isKeepWhitespaceMode() const

isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...

static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)

Returns true if the given character could appear in an identifier.

static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments=false)

Finds the token that comes right after the given location.

static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...

static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...

void resetExtendedTokenMode()

Sets the extended token mode back to its initial value, according to the language options and preproc...

static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)

Retrieve the name of the immediate macro expansion.

static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)

Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.

static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)

Compute the preamble of the given file.

static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)

Relex the token at the specified location.

static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)

Computes the source location just past the end of the token at this source location.

static std::string Stringify(StringRef Str, bool Charify=false)

Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...

static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)

getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.

void ExitTopLevelConditional()

Called when the lexer exits the top-level conditional.

bool LexingRawMode

True if in raw mode.

SmallVector< PPConditionalInfo, 4 > ConditionalStack

Information about the set of #if/#ifdef/#ifndef blocks we are currently in.

bool ParsingPreprocessorDirective

True when parsing #XXX; turns '\n' into a tok::eod token.

MultipleIncludeOpt MIOpt

A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.

bool ParsingFilename

True after #include; turns <xx> or "xxx" into a tok::header_name token.

bool isLexingRawMode() const

Return true if this lexer is in raw mode or not.

const FileID FID

The SourceManager FileID corresponding to the file being lexed.

bool LexEditorPlaceholders

When enabled, the preprocessor will construct editor placeholder tokens.

Engages in a tight little dance with the lexer to efficiently preprocess tokens.

SourceLocation getCodeCompletionLoc() const

Returns the location of the code-completion point.

SourceLocation getCodeCompletionFileLoc() const

Returns the start location of the file of code-completion point.

void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)

Set the code completion token range for detecting replacement range later on.

bool isRecordingPreamble() const

void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)

bool isInPrimaryFile() const

Return true if we're in the top-level file, not in a #include.

void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())

Plop the specified string into a scratch buffer and set the specified token's location and length to ...

IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const

Given a tok::raw_identifier token, look up the identifier information for the token and install it in...

bool isPreprocessedOutput() const

Returns true if the preprocessor is responsible for generating output, false if it is producing token...

bool HandleIdentifier(Token &Identifier)

Callback invoked when the lexer reads an identifier and has filled in the token's IdentifierInfo membe...

SourceManager & getSourceManager() const

EmptylineHandler * getEmptylineHandler() const

bool getCommentRetentionState() const

bool hadModuleLoaderFatalFailure() const

PreprocessorOptions & getPreprocessorOpts() const

Retrieve the preprocessor options used to initialize this preprocessor.

StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const

Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...

bool HandleComment(Token &result, SourceRange Comment)

bool isCodeCompletionEnabled() const

Determine if we are performing code completion.

void HandleDirective(Token &Result)

Callback invoked when the lexer sees a # token at the start of a line.

IdentifierTable & getIdentifierTable()

const LangOptions & getLangOpts() const

void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)

Hook used by the lexer to invoke the "included file" code completion point.

void CodeCompleteNaturalLanguage()

Hook used by the lexer to invoke the "natural language" code completion point.

bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)

Callback invoked when the lexer hits the end of the current file.

DiagnosticsEngine & getDiagnostics() const

void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)

Set the code completion token for filtering purposes.

DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const

Forwarding function for diagnostics.

Encodes a location in the source.

static SourceLocation getFromRawEncoding(UIntTy Encoding)

Turn a raw encoding of a SourceLocation object into a real SourceLocation.

bool isValid() const

Return true if this is a valid SourceLocation object.

SourceLocation getLocWithOffset(IntTy Offset) const

Return a source location with the specified offset from this SourceLocation.

UIntTy getRawEncoding() const

When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.

This class handles loading and caching of source files into memory.

const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const

Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.

A trivial tuple used to represent a source range.

void setBegin(SourceLocation b)

SourceLocation getEnd() const

SourceLocation getBegin() const

void setEnd(SourceLocation e)

Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...

SourceLocation getExpansionLocStart() const

SourceLocation getSpellingLoc() const

bool isMacroArgExpansion() const

This is a discriminated union of FileInfo and ExpansionInfo.

const ExpansionInfo & getExpansion() const

static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)

Determine whether a suffix is a valid ud-suffix.

Token - This structure provides full information about a lexed token.

IdentifierInfo * getIdentifierInfo() const

bool hasUCN() const

Returns true if this token contains a universal character name.

bool isLiteral() const

Return true if this is a "literal", like a numeric constant, string, etc.

SourceLocation getLocation() const

Return a source location identifier for the specified offset in the current file.

unsigned getLength() const

tok::ObjCKeywordKind getObjCKeywordID() const

Return the ObjC keyword kind.

bool is(tok::TokenKind K) const

is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....

tok::TokenKind getKind() const

bool isAtStartOfLine() const

isAtStartOfLine - Return true if this token is at the start of a line.

bool isAnnotation() const

Return true if this is any of tok::annot_* kind tokens.

bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const

Return true if we have an ObjC keyword identifier.

bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const

Determine whether the token kind starts a simple-type-specifier.

void startToken()

Reset all flags to cleared.

bool needsCleaning() const

Return true if this token has trigraphs or escaped newlines in it.

StringRef getRawIdentifier() const

getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...

const char * getLiteralData() const

getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...

void setFlag(TokenFlags Flag)

Set the specified flag.

static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)

Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...

static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)

Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.

static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)

Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...

static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)

Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...

static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)

Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.

@ tokens_present_before_eof

Indicates that there are tokens present between the last scanned directive and eof.

@ pp_pragma_system_header

@ pp_pragma_include_alias

@ After

Like System, but searched after the system directories.

bool isStringLiteral(TokenKind K)

Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.

ObjCKeywordKind

Provides a namespace for Objective-C keywords which start with an '@'.

TokenKind

Provides a simple uniform namespace for tokens from all C languages.

The JSON file list parser is used to communicate input to InstallAPI.

LLVM_READNONE bool isASCII(char c)

Returns true if a byte is an ASCII character.

LLVM_READONLY bool isVerticalWhitespace(unsigned char c)

Returns true if this character is vertical ASCII whitespace: '\n', '\r'.

ConflictMarkerKind

ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.

@ CMK_Perforce

A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.

@ CMK_None

Not within a conflict marker.

@ CMK_Normal

A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...

LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)

bool operator<(DeclarationName LHS, DeclarationName RHS)

Ordering on two declaration names.

LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)

Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.

@ Result

The result type of a method or function.

LLVM_READONLY bool isRawStringDelimBody(unsigned char c)

Return true if this is the body character of a C++ raw string delimiter.

LLVM_READONLY bool isWhitespace(unsigned char c)

Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...

LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)

Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].

const FunctionProtoType * T

LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)

Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].

__INTPTR_TYPE__ intptr_t

A signed integer type with the property that any valid pointer to void can be converted to this type,...

float __ovld __cnfn length(float)

Return the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + ...)

#define _mm_cmpistri(A, B, M)

Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...

#define _SIDD_LEAST_SIGNIFICANT

#define _SIDD_NEGATIVE_POLARITY

Represents a char and the number of bytes parsed to produce it.

Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...

Token lexed as part of dependency directive scanning.

unsigned Offset

Offset into the original source input.

bool is(tok::TokenKind K) const