clang: lib/Lex/Lexer.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MemoryBufferRef.h"
36#include "llvm/Support/NativeFormatting.h"
37#include "llvm/Support/Unicode.h"
38#include "llvm/Support/UnicodeCharRanges.h"
39#include
40#include
41#include
42#include
43#include
44#include
45#include
46#include
47#include
48
49#ifdef __SSE4_2__
50#include <nmmintrin.h>
51#endif
52
53using namespace clang;
54
55
56
57
58
59
62 return false;
64 return II->getObjCKeywordID() == objcKey;
65 return false;
66}
67
68
71 return tok::objc_not_keyword;
73 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
74}
75
76
79 case tok::annot_typename:
80 case tok::annot_decltype:
81 case tok::annot_pack_indexing_type:
82 return true;
83
84 case tok::kw_short:
85 case tok::kw_long:
86 case tok::kw___int64:
87 case tok::kw___int128:
88 case tok::kw_signed:
89 case tok::kw_unsigned:
90 case tok::kw_void:
91 case tok::kw_char:
92 case tok::kw_int:
93 case tok::kw_half:
94 case tok::kw_float:
95 case tok::kw_double:
96 case tok::kw___bf16:
97 case tok::kw__Float16:
98 case tok::kw___float128:
99 case tok::kw___ibm128:
100 case tok::kw_wchar_t:
101 case tok::kw_bool:
102 case tok::kw__Bool:
103 case tok::kw__Accum:
104 case tok::kw__Fract:
105 case tok::kw__Sat:
106#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
107#include "clang/Basic/TransformTypeTraits.def"
108 case tok::kw___auto_type:
109 case tok::kw_char16_t:
110 case tok::kw_char32_t:
111 case tok::kw_typeof:
112 case tok::kw_decltype:
113 case tok::kw_char8_t:
115
116 default:
117 return false;
118 }
119}
120
121
122
123
124
125void Lexer::anchor() {}
126
127void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
128 const char *BufEnd) {
129 BufferStart = BufStart;
130 BufferPtr = BufPtr;
131 BufferEnd = BufEnd;
132
133 assert(BufEnd[0] == 0 &&
134 "We assume that the input buffer has a null character at the end"
135 " to simplify lexing!");
136
137
138
139
140 if (BufferStart == BufferPtr) {
141
142 StringRef Buf(BufferStart, BufferEnd - BufferStart);
143 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
144 .StartsWith("\xEF\xBB\xBF", 3)
145 .Default(0);
146
147
148 BufferPtr += BOMLength;
149 }
150
151 Is_PragmaLexer = false;
152 CurrentConflictMarkerState = CMK_None;
153
154
155 IsAtStartOfLine = true;
156 IsAtPhysicalStartOfLine = true;
157
158 HasLeadingSpace = false;
159 HasLeadingEmptyMacro = false;
160
161
163
164
166
167
168
169
170
172
173
174 ExtendedTokenMode = 0;
175
176 NewLinePtr = nullptr;
177}
178
179
180
181
182
186 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
188 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
189 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
190 InputFile.getBufferEnd());
191
193}
194
195
196
197
199 const char *BufStart, const char *BufPtr, const char *BufEnd,
200 bool IsFirstIncludeOfFile)
202 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
203 InitLexer(BufStart, BufPtr, BufEnd);
204
205
207}
208
209
210
211
214 bool IsFirstIncludeOfFile)
215 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
216 FromFile.getBufferStart(), FromFile.getBufferEnd(),
217 IsFirstIncludeOfFile) {}
218
220 assert(PP && "Cannot reset token mode without a preprocessor");
221 if (LangOpts.TraditionalCPP)
223 else
225}
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
247
248
249 FileID SpellingFID = SM.getFileID(SpellingLoc);
250 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
251 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
252
253
254
255
256 const char *StrData = SM.getCharacterData(SpellingLoc);
257
258 L->BufferPtr = StrData;
259 L->BufferEnd = StrData+TokLen;
260 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
261
262
263
264 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
265 ExpansionLocStart,
266 ExpansionLocEnd, TokLen);
267
268
269
271
272
273 L->Is_PragmaLexer = true;
274 return L;
275}
276
277void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
278 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
279 this->IsAtStartOfLine = IsAtStartOfLine;
280 assert((BufferStart + Offset) <= BufferEnd);
281 BufferPtr = BufferStart + Offset;
282}
283
/// Escape \p Str in place so that it can be embedded in a quoted literal.
///
/// Rewrites performed:
///  - every backslash and every occurrence of \p Quote gets a backslash
///    inserted in front of it;
///  - a two-character line ending ("\r\n" or "\n\r") is overwritten by the
///    two characters '\\' 'n';
///  - a lone '\n' or '\r' is replaced by the two characters '\\' 'n'.
///
/// \tparam T A string-like container providing size(), operator[], begin()
///         and insert(iterator, char) (for example std::string).
/// \param Str The string to rewrite; it may grow.
/// \param Quote The quote character that needs escaping (e.g. '"' or '\'').
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  // `e` tracks the current logical length and is bumped on every insertion
  // so the loop bound stays in sync with the growing string.
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      // Step over both the inserted backslash and the escaped character.
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' with '\\' followed by 'n'. The two
      // characters must differ, so "\n\n" is treated as two line endings.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace a lone '\n' or '\r' with '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}
308
310 std::string Result = std::string(Str);
311 char Quote = Charify ? '\'' : '"';
314}
315
317
318
319
320
321
322
323
325 const LangOptions &LangOpts, char *Spelling) {
326 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
327
328 size_t Length = 0;
329 const char *BufEnd = BufPtr + Tok.getLength();
330
332
333 while (BufPtr < BufEnd) {
335 Spelling[Length++] = CharAndSize.Char;
336 BufPtr += CharAndSize.Size;
337
338 if (Spelling[Length - 1] == '"')
339 break;
340 }
341
342
343
344
345 if (Length >= 2 &&
346 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
347
348
349 const char *RawEnd = BufEnd;
350 do --RawEnd; while (*RawEnd != '"');
351 size_t RawLength = RawEnd - BufPtr + 1;
352
353
354 memcpy(Spelling + Length, BufPtr, RawLength);
355 Length += RawLength;
356 BufPtr += RawLength;
357
358
359 }
360 }
361
362 while (BufPtr < BufEnd) {
364 Spelling[Length++] = CharAndSize.Char;
365 BufPtr += CharAndSize.Size;
366 }
367
368 assert(Length < Tok.getLength() &&
369 "NeedsCleaning flag set on token that didn't need cleaning!");
370 return Length;
371}
372
373
374
375
376
377
382 bool *invalid) {
383
384 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
385
386
387 bool invalidTemp = false;
388 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
389 if (invalidTemp) {
390 if (invalid) *invalid = true;
391 return {};
392 }
393
394 const char *tokenBegin = file.data() + locInfo.second;
395
396
397 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
398 file.begin(), tokenBegin, file.end());
401
403
404
406 return StringRef(tokenBegin, length);
407
408
409 buffer.resize(length);
410 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
411 return StringRef(buffer.data(), buffer.size());
412}
413
414
415
416
417
418
421 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
422
423 bool CharDataInvalid = false;
425 &CharDataInvalid);
427 *Invalid = CharDataInvalid;
428 if (CharDataInvalid)
429 return {};
430
431
433 return std::string(TokStart, TokStart + Tok.getLength());
434
439}
440
441
442
443
444
445
446
447
448
449
450
454 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
455
456 const char *TokStart = nullptr;
457
458 if (Tok.is(tok::raw_identifier))
460 else if (!Tok.hasUCN()) {
462
463 Buffer = II->getNameStart();
464 return II->getLength();
465 }
466 }
467
468
471
472 if (!TokStart) {
473
474 bool CharDataInvalid = false;
477 *Invalid = CharDataInvalid;
478 if (CharDataInvalid) {
479 Buffer = "";
480 return 0;
481 }
482 }
483
484
486 Buffer = TokStart;
488 }
489
490
491 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
492}
493
494
495
496
497
503 return 0;
505}
506
507
508
512 bool IgnoreWhiteSpace) {
513
514
515
516
517
518
519
520
521 Loc = SM.getExpansionLoc(Loc);
522 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
524 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
526 return true;
527
528 const char *StrData = Buffer.data()+LocInfo.second;
529
530 if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
531 return true;
532
533
534 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
535 Buffer.begin(), StrData, Buffer.end());
538 return false;
539}
540
541
542
544 const char *BufStart = Buffer.data();
545 if (Offset >= Buffer.size())
546 return nullptr;
547
548 const char *LexStart = BufStart + Offset;
549 for (; LexStart != BufStart; --LexStart) {
552
553 ++LexStart;
554 break;
555 }
556 }
557 return LexStart;
558}
559
564 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
565 if (LocInfo.first.isInvalid())
566 return Loc;
567
569 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
571 return Loc;
572
573
574
575 const char *StrData = Buffer.data() + LocInfo.second;
577 if (!LexStart || LexStart == StrData)
578 return Loc;
579
580
582 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
583 Buffer.end());
585
586
588 do {
590
592
593
594
597
598
599
600 break;
601 }
602 } while (TheTok.getKind() != tok::eof);
603
604
605 return Loc;
606}
607
613
614 if (.isMacroArgExpansion(Loc))
615 return Loc;
616
619 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
620 std::pair<FileID, unsigned> BeginFileLocInfo =
621 SM.getDecomposedLoc(BeginFileLoc);
622 assert(FileLocInfo.first == BeginFileLocInfo.first &&
623 FileLocInfo.second >= BeginFileLocInfo.second);
624 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
625}
626
namespace {

/// Classification of a preprocessor directive keyword seen while scanning
/// for the end of a preamble.
enum PreambleDirectiveKind {
  /// A recognised directive name (include, define, if, endif, ...); the
  /// scanning loop moves on to the next token.
  PDK_Skipped,
  /// Any other keyword; the directive is not treated as part of the
  /// preamble.
  PDK_Unknown
};

} // end anonymous namespace
635
638 unsigned MaxLines) {
639
640
641
644 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
645 Buffer.end());
647
648 bool InPreprocessorDirective = false;
651
652 unsigned MaxLineOffset = 0;
653 if (MaxLines) {
654 const char *CurPtr = Buffer.begin();
655 unsigned CurLine = 0;
656 while (CurPtr != Buffer.end()) {
657 char ch = *CurPtr++;
658 if (ch == '\n') {
659 ++CurLine;
660 if (CurLine == MaxLines)
661 break;
662 }
663 }
664 if (CurPtr != Buffer.end())
665 MaxLineOffset = CurPtr - Buffer.begin();
666 }
667
668 do {
670
671 if (InPreprocessorDirective) {
672
673 if (TheTok.getKind() == tok::eof) {
674 break;
675 }
676
677
678
680 continue;
681
682
683
684 InPreprocessorDirective = false;
685 }
686
687
690
691
692
693 if (MaxLineOffset && TokOffset >= MaxLineOffset)
694 break;
695 }
696
697
698 if (TheTok.getKind() == tok::comment) {
699 if (ActiveCommentLoc.isInvalid())
701 continue;
702 }
703
705
706 Token HashTok = TheTok;
707 InPreprocessorDirective = true;
709
710
711
712
716 PreambleDirectiveKind PDK
717 = llvm::StringSwitch(Keyword)
718 .Case("include", PDK_Skipped)
719 .Case("__include_macros", PDK_Skipped)
720 .Case("define", PDK_Skipped)
721 .Case("undef", PDK_Skipped)
722 .Case("line", PDK_Skipped)
723 .Case("error", PDK_Skipped)
724 .Case("pragma", PDK_Skipped)
725 .Case("import", PDK_Skipped)
726 .Case("include_next", PDK_Skipped)
727 .Case("warning", PDK_Skipped)
728 .Case("ident", PDK_Skipped)
729 .Case("sccs", PDK_Skipped)
730 .Case("assert", PDK_Skipped)
731 .Case("unassert", PDK_Skipped)
732 .Case("if", PDK_Skipped)
733 .Case("ifdef", PDK_Skipped)
734 .Case("ifndef", PDK_Skipped)
735 .Case("elif", PDK_Skipped)
736 .Case("elifdef", PDK_Skipped)
737 .Case("elifndef", PDK_Skipped)
738 .Case("else", PDK_Skipped)
739 .Case("endif", PDK_Skipped)
740 .Default(PDK_Unknown);
741
742 switch (PDK) {
743 case PDK_Skipped:
744 continue;
745
746 case PDK_Unknown:
747
748 break;
749 }
750 }
751
752
753
754
755 TheTok = HashTok;
757 TheTok.getKind() == tok::raw_identifier &&
759 LangOpts.CPlusPlusModules) {
760
761
762 Token ModuleTok = TheTok;
763 do {
765 } while (TheTok.getKind() == tok::comment);
766 if (TheTok.getKind() != tok::semi) {
767
768 TheTok = ModuleTok;
769 break;
770 }
771 continue;
772 }
773
774
775
776
777 break;
778 } while (true);
779
781 if (ActiveCommentLoc.isValid())
782 End = ActiveCommentLoc;
783 else
785
788}
789
793
794
795
797 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
798
799
800 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
801 return 0;
802
803 unsigned PhysOffset = 0;
804
805
806
807
808 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
809 if (CharNo == 0)
810 return PhysOffset;
811 ++TokPtr;
812 --CharNo;
813 ++PhysOffset;
814 }
815
816
817
818 for (; CharNo; --CharNo) {
820 TokPtr += CharAndSize.Size;
821 PhysOffset += CharAndSize.Size;
822 }
823
824
825
826
827
828 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
829 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
830
831 return PhysOffset;
832}
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
853 return {};
854
857 return {};
858 }
859
861 if (Len > Offset)
862 Len = Len - Offset;
863 else
864 return Loc;
865
867}
868
869
870
875 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
876
878 if (.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
879 return false;
880
881 if (expansionLoc.isFileID()) {
882
883 if (MacroBegin)
884 *MacroBegin = expansionLoc;
885 return true;
886 }
887
889}
890
891
892
897 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
898
901 if (tokLen == 0)
902 return false;
903
906 if (.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
907 return false;
908
909 if (expansionLoc.isFileID()) {
910
911 if (MacroEnd)
912 *MacroEnd = expansionLoc;
913 return true;
914 }
915
917}
918
925 if (Range.isTokenRange()) {
927 if (End.isInvalid())
928 return {};
929 }
930
931
933 unsigned BeginOffs;
934 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
936 return {};
937
938 unsigned EndOffs;
939 if (.isInFileID(End, FID, &EndOffs) ||
940 BeginOffs > EndOffs)
941 return {};
942
944}
945
946
949 return SM.getSLocEntry(SM.getFileID(Loc))
950 .getExpansion()
951 .isExpansionTokenRange();
952}
953
960 return {};
961
964
967 return {};
970 }
971
973 if (Range.isTokenRange()) {
975 return {};
976
979 return {};
982 }
983
988 &MacroEnd)) ||
990 &MacroEnd)))) {
993
994 if (Range.isTokenRange())
997 }
998
1003 return {};
1004
1009 return {};
1010
1017 }
1018 }
1019
1020 return {};
1021}
1022
1030 return {};
1031 }
1032
1033
1034 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
1035 if (beginInfo.first.isInvalid()) {
1037 return {};
1038 }
1039
1040 unsigned EndOffs;
1041 if (.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
1042 beginInfo.second > EndOffs) {
1044 return {};
1045 }
1046
1047
1048 bool invalidTemp = false;
1049 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1050 if (invalidTemp) {
1052 return {};
1053 }
1054
1056 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1057}
1058
1062 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1063
1064
1065 while (true) {
1071 break;
1072
1073
1074
1075
1076
1077
1078 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1081 break;
1082
1083
1084
1086 if (SM.isInFileID(SpellLoc, MacroFID))
1087 break;
1088
1089
1090 Loc = SpellLoc;
1091 }
1092
1093
1094
1095
1096 Loc = SM.getSpellingLoc(Loc);
1097
1098
1099
1100 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1102 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1103 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1104}
1105
1108 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1109
1110 while (SM.isMacroArgExpansion(Loc))
1111 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1112
1113
1114
1115
1117 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1118 return {};
1119
1120
1121
1122
1123 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1124
1125
1126
1127 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1129 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1130 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1131}
1132
1135}
1136
1139 if (Str - 1 < BufferStart)
1140 return false;
1141
1142 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1143 (Str[0] == '\r' && Str[-1] == '\n')) {
1144 if (Str - 2 < BufferStart)
1145 return false;
1146 --Str;
1147 }
1148 --Str;
1149
1150
1152 --Str;
1153
1154 return *Str == '\\';
1155}
1156
1160 return {};
1161 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1162 if (LocInfo.first.isInvalid())
1163 return {};
1165 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1167 return {};
1170 return {};
1171 StringRef Rest = Buffer.substr(Line - Buffer.data());
1172 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1173 return NumWhitespaceChars == StringRef::npos
1174 ? ""
1175 : Rest.take_front(NumWhitespaceChars);
1176}
1177
1178
1179
1180
1181
1182
1183
1184
1185
1190 unsigned CharNo, unsigned TokLen) {
1191 assert(FileLoc.isMacroID() && "Must be a macro expansion");
1192
1193
1194
1195
1197
1198
1199
1202
1203
1204
1206
1207 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1208}
1209
1210
1211
1213 unsigned TokLen) const {
1214 assert(Loc >= BufferStart && Loc <= BufferEnd &&
1215 "Location out of range for this buffer!");
1216
1217
1218
1219 unsigned CharNo = Loc-BufferStart;
1222
1223
1224
1225 assert(PP && "This doesn't work on raw lexers");
1227}
1228
1229
1230
1233}
1234
1235
1236
1237
1238
1239
1240
1242 switch (Letter) {
1243 default: return 0;
1244 case '=': return '#';
1245 case ')': return ']';
1246 case '(': return '[';
1247 case '!': return '|';
1248 case '\'': return '^';
1249 case '>': return '}';
1250 case '/': return '\\';
1251 case '<': return '{';
1252 case '-': return '~';
1253 }
1254}
1255
1256
1257
1258
1259
1262 if (!Res)
1263 return Res;
1264
1265 if (!Trigraphs) {
1267 L->Diag(CP-2, diag::trigraph_ignored);
1268 return 0;
1269 }
1270
1272 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1273 return Res;
1274}
1275
1276
1277
1278
1279unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1280 unsigned Size = 0;
1283
1284 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1285 continue;
1286
1287
1288 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1289 Ptr[Size-1] != Ptr[Size])
1291
1292 return Size;
1293 }
1294
1295
1296 return 0;
1297}
1298
1299
1300
1301
1302const char *Lexer::SkipEscapedNewLines(const char *P) {
1303 while (true) {
1304 const char *AfterEscape;
1305 if (*P == '\\') {
1306 AfterEscape = P+1;
1307 } else if (*P == '?') {
1308
1309 if (P[1] != '?' || P[2] != '/')
1310 return P;
1311
1312
1313 AfterEscape = P+3;
1314 } else {
1315 return P;
1316 }
1317
1318 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1319 if (NewLineSize == 0) return P;
1320 P = AfterEscape+NewLineSize;
1321 }
1322}
1323
1327 bool IncludeComments) {
1330 return std::nullopt;
1331 }
1333
1334
1335 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1336
1337
1338 bool InvalidTemp = false;
1339 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1340 if (InvalidTemp)
1341 return std::nullopt;
1342
1343 const char *TokenBegin = File.data() + LocInfo.second;
1344
1345
1346 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1347 TokenBegin, File.end());
1349
1352 return Tok;
1353}
1354
1358 bool IncludeComments) {
1359 const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Loc));
1360 while (Loc != StartOfFile) {
1363 return std::nullopt;
1364
1368 continue;
1369 if (!Tok.is(tok::comment) || IncludeComments) {
1370 return Tok;
1371 }
1372 }
1373 return std::nullopt;
1374}
1375
1376
1377
1378
1379
1382 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1384 if (!Tok || Tok->isNot(TKind))
1385 return {};
1387
1388
1389 unsigned NumWhitespaceChars = 0;
1390 if (SkipTrailingWhitespaceAndNewLine) {
1391 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1392 unsigned char C = *TokenEnd;
1394 C = *(++TokenEnd);
1395 NumWhitespaceChars++;
1396 }
1397
1398
1399 if (C == '\n' || C == '\r') {
1400 char PrevC = C;
1401 C = *(++TokenEnd);
1402 NumWhitespaceChars++;
1403 if ((C == '\n' || C == '\r') && C != PrevC)
1404 NumWhitespaceChars++;
1405 }
1406 }
1407
1408 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1409}
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1427 unsigned Size = 0;
1428
1429 if (Ptr[0] == '\\') {
1430 ++Size;
1431 ++Ptr;
1432Slash:
1433
1435 return {'\\', Size};
1436
1437
1438
1439 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1440
1442
1443
1444 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && ())
1445 Diag(Ptr, diag::backslash_newline_space);
1446
1447
1448 Size += EscapedNewLineSize;
1449 Ptr += EscapedNewLineSize;
1450
1451
1452 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1453 CharAndSize.Size += Size;
1454 return CharAndSize;
1455 }
1456
1457
1458 return {'\\', Size};
1459 }
1460
1461
1462 if (Ptr[0] == '?' && Ptr[1] == '?') {
1463
1464
1466 LangOpts.Trigraphs)) {
1467
1469
1470 Ptr += 3;
1472 if (C == '\\') goto Slash;
1474 }
1475 }
1476
1477
1478 return {*Ptr, Size + 1u};
1479}
1480
1481
1482
1483
1484
1485
1486
1487Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1489
1490 unsigned Size = 0;
1491
1492 if (Ptr[0] == '\\') {
1494 ++Ptr;
1495Slash:
1496
1498 return {'\\', Size};
1499
1500
1501 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1502
1503 Size += EscapedNewLineSize;
1504 Ptr += EscapedNewLineSize;
1505
1506
1507 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1508 CharAndSize.Size += Size;
1509 return CharAndSize;
1510 }
1511
1512
1513 return {'\\', Size};
1514 }
1515
1516
1517 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1518
1519
1521 Ptr += 3;
1523 if (C == '\\') goto Slash;
1525 }
1526 }
1527
1528
1529 return {*Ptr, Size + 1u};
1530}
1531
1532
1533
1534
1535
1536
1537void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1538 BufferPtr = BufferStart + Offset;
1539 if (BufferPtr > BufferEnd)
1540 BufferPtr = BufferEnd;
1541
1542
1543
1544 IsAtStartOfLine = StartOfLine;
1545 IsAtPhysicalStartOfLine = StartOfLine;
1546}
1547
1549 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1551 return UnicodeWhitespaceChars.contains(Codepoint);
1552}
1553
1556 llvm::raw_svector_ostream CharOS(CharBuf);
1557 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1558 return CharBuf;
1559}
1560
1561
1562
1563
1564
1565
1566
1568 bool IsStart, bool &IsExtension) {
1569 static const llvm::sys::UnicodeCharSet MathStartChars(
1571 static const llvm::sys::UnicodeCharSet MathContinueChars(
1573 if (MathStartChars.contains(C) ||
1574 (!IsStart && MathContinueChars.contains(C))) {
1575 IsExtension = true;
1576 return true;
1577 }
1578 return false;
1579}
1580
1582 bool &IsExtension) {
1583 if (LangOpts.AsmPreprocessor) {
1584 return false;
1585 } else if (LangOpts.DollarIdents && '$' == C) {
1586 return true;
1587 } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1588
1589
1590
1591
1592 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1593 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1594 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1595 return true;
1597 IsExtension);
1598 } else if (LangOpts.C11) {
1599 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1601 return C11AllowedIDChars.contains(C);
1602 } else {
1603 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1605 return C99AllowedIDChars.contains(C);
1606 }
1607}
1608
1610 bool &IsExtension) {
1611 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1612 IsExtension = false;
1613 if (LangOpts.AsmPreprocessor) {
1614 return false;
1615 }
1616 if (LangOpts.CPlusPlus || LangOpts.C23) {
1617 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1618 if (XIDStartChars.contains(C))
1619 return true;
1621 IsExtension);
1622 }
1624 return false;
1625 if (LangOpts.C11) {
1626 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1628 return !C11DisallowedInitialIDChars.contains(C);
1629 }
1630 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1632 return !C99DisallowedInitialIDChars.contains(C);
1633}
1634
1637
1638 static const llvm::sys::UnicodeCharSet MathStartChars(
1640 static const llvm::sys::UnicodeCharSet MathContinueChars(
1642
1643 (void)MathStartChars;
1644 (void)MathContinueChars;
1645 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1646 "Unexpected mathematical notation codepoint");
1649}
1650
1652 const char *End) {
1655}
1656
1659
1661 enum {
1662 CannotAppearInIdentifier = 0,
1663 CannotStartIdentifier
1664 };
1665
1666 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1668 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1670 if (!C99AllowedIDChars.contains(C)) {
1673 << CannotAppearInIdentifier;
1674 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1677 << CannotStartIdentifier;
1678 }
1679 }
1680}
1681
1682
1683
1684
1685
1688
1689 struct HomoglyphPair {
1690 uint32_t Character;
1691 char LooksLike;
1692 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1693 };
1694 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1695 {U'\u00ad', 0},
1696 {U'\u01c3', '!'},
1697 {U'\u037e', ';'},
1698 {U'\u200b', 0},
1699 {U'\u200c', 0},
1700 {U'\u200d', 0},
1701 {U'\u2060', 0},
1702 {U'\u2061', 0},
1703 {U'\u2062', 0},
1704 {U'\u2063', 0},
1705 {U'\u2064', 0},
1706 {U'\u2212', '-'},
1707 {U'\u2215', '/'},
1708 {U'\u2216', '\\'},
1709 {U'\u2217', '*'},
1710 {U'\u2223', '|'},
1711 {U'\u2227', '^'},
1712 {U'\u2236', ':'},
1713 {U'\u223c', '~'},
1714 {U'\ua789', ':'},
1715 {U'\ufeff', 0},
1716 {U'\uff01', '!'},
1717 {U'\uff03', '#'},
1718 {U'\uff04', '$'},
1719 {U'\uff05', '%'},
1720 {U'\uff06', '&'},
1721 {U'\uff08', '('},
1722 {U'\uff09', ')'},
1723 {U'\uff0a', '*'},
1724 {U'\uff0b', '+'},
1725 {U'\uff0c', ','},
1726 {U'\uff0d', '-'},
1727 {U'\uff0e', '.'},
1728 {U'\uff0f', '/'},
1729 {U'\uff1a', ':'},
1730 {U'\uff1b', ';'},
1731 {U'\uff1c', '<'},
1732 {U'\uff1d', '='},
1733 {U'\uff1e', '>'},
1734 {U'\uff1f', '?'},
1735 {U'\uff20', '@'},
1736 {U'\uff3b', '['},
1737 {U'\uff3c', '\\'},
1738 {U'\uff3d', ']'},
1739 {U'\uff3e', '^'},
1740 {U'\uff5b', '{'},
1741 {U'\uff5c', '|'},
1742 {U'\uff5d', '}'},
1743 {U'\uff5e', '~'},
1744 {0, 0}
1745 };
1746 auto Homoglyph =
1747 std::lower_bound(std::begin(SortedHomoglyphs),
1748 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1749 if (Homoglyph->Character == C) {
1750 if (Homoglyph->LooksLike) {
1751 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1754 } else {
1757 }
1758 }
1759}
1760
1765 return;
1766
1767 bool IsExtension;
1769 bool IsIDContinue =
1770 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1771
1772 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1773 return;
1774
1775 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1776
1777 if (!IsFirst || InvalidOnlyAtStart) {
1778 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1781 } else {
1785 }
1786}
1787
1788bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1790 const char *UCNPtr = CurPtr + Size;
1791 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, nullptr);
1792 if (CodePoint == 0) {
1793 return false;
1794 }
1795 bool IsExtension = false;
1796 if ((CodePoint, LangOpts, IsExtension)) {
1798 return false;
1804 false);
1805
1806
1807
1808
1810 if (IsExtension)
1813
1816 false);
1817 }
1818
1820 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1821 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1822 CurPtr = UCNPtr;
1823 else
1824 while (CurPtr != UCNPtr)
1825 (void)getAndAdvanceChar(CurPtr, Result);
1826 return true;
1827}
1828
1829bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1830 llvm::UTF32 CodePoint;
1831
1832
1833
1834
1835 unsigned FirstCodeUnitSize;
1836 getCharAndSize(CurPtr, FirstCodeUnitSize);
1837 const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
1838 const char *UnicodePtr = CharStart;
1839
1840 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1841 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1842 &CodePoint, llvm::strictConversion);
1843 if (ConvResult != llvm::conversionOK)
1844 return false;
1845
1846 bool IsExtension = false;
1848 IsExtension)) {
1850 return false;
1851
1856 makeCharRange(*this, CharStart, UnicodePtr), false);
1857
1858
1859
1861 if (IsExtension)
1867 false);
1870 }
1871
1872
1873
1874
1875 ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
1876 CurPtr = UnicodePtr;
1877 return true;
1878}
1879
1880bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1881 const char *CurPtr) {
1882 bool IsExtension = false;
1886 if (IsExtension)
1891 true);
1894 }
1895
1897 return LexIdentifierContinue(Result, CurPtr);
1898 }
1899
1903
1904
1905
1906
1907
1908
1909
1910
1911
1914 makeCharRange(*this, BufferPtr, CurPtr), true);
1915 BufferPtr = CurPtr;
1916 return false;
1917 }
1918
1919
1920
1922 FormTokenWithChars(Result, CurPtr, tok::unknown);
1923 return true;
1924}
1925
1926static const char *
1928 [[maybe_unused]] const char *BufferEnd) {
1929#ifdef __SSE4_2__
1930 alignas(16) static constexpr char AsciiIdentifierRange[16] = {
1931 '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
1932 };
1933 constexpr ssize_t BytesPerRegister = 16;
1934
1935 __m128i AsciiIdentifierRangeV =
1936 _mm_load_si128((const __m128i *)AsciiIdentifierRange);
1937
1938 while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1939 __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1940
1941 int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1944 CurPtr += Consumed;
1945 if (Consumed == BytesPerRegister)
1946 continue;
1947 return CurPtr;
1948 }
1949#endif
1950
1951 unsigned char C = *CurPtr;
1953 C = *++CurPtr;
1954 return CurPtr;
1955}
1956
1957bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1958
1959
1960 while (true) {
1961
1963
1964 unsigned Size;
1965
1966 unsigned char C = getCharAndSize(CurPtr, Size);
1968 CurPtr = ConsumeChar(CurPtr, Size, Result);
1969 continue;
1970 }
1971 if (C == '$') {
1972
1973 if (!LangOpts.DollarIdents)
1974 break;
1975
1977 Diag(CurPtr, diag::ext_dollar_in_identifier);
1978 CurPtr = ConsumeChar(CurPtr, Size, Result);
1979 continue;
1980 }
1981 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1982 continue;
1983 if ((C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1984 continue;
1985
1986 break;
1987 }
1988
1989 const char *IdStart = BufferPtr;
1990 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1991 Result.setRawIdentifierData(IdStart);
1992
1993
1994
1996 return true;
1997
1998
1999
2001
2002
2003
2004
2005
2006
2007 if (isCodeCompletionPoint(CurPtr)) {
2008
2009 Result.setKind(tok::code_completion);
2010
2011
2012
2013
2014
2015 assert(*CurPtr == 0 && "Completion character must be 0");
2016 ++CurPtr;
2017
2018
2019
2020 if (CurPtr < BufferEnd) {
2022 ++CurPtr;
2023 }
2024 BufferPtr = CurPtr;
2025 return true;
2026 }
2027
2028
2029
2032
2033 return true;
2034}
2035
2036
2037
2038bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
2040 char C1 = CharAndSize1.Char;
2041 if (C1 != '0')
2042 return false;
2043
2044 auto CharAndSize2 =
2046 char C2 = CharAndSize2.Char;
2047 return (C2 == 'x' || C2 == 'X');
2048}
2049
2050
2051
2052
2053bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2054 unsigned Size;
2055 char C = getCharAndSize(CurPtr, Size);
2056 char PrevCh = 0;
2058 CurPtr = ConsumeChar(CurPtr, Size, Result);
2059 PrevCh = C;
2060 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
2061 CurPtr -= Size;
2062 break;
2063 }
2064 C = getCharAndSize(CurPtr, Size);
2065 }
2066
2067
2068 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2069
2070
2071 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2072 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2073 }
2074
2075
2076 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2077
2078
2079
2080 bool IsHexFloat = true;
2081 if (!LangOpts.C99) {
2082 if (!isHexaLiteral(BufferPtr, LangOpts))
2083 IsHexFloat = false;
2084 else if (!LangOpts.CPlusPlus17 &&
2085 std::find(BufferPtr, CurPtr, '_') != CurPtr)
2086 IsHexFloat = false;
2087 }
2088 if (IsHexFloat)
2089 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2090 }
2091
2092
2093 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2097 Diag(CurPtr, LangOpts.CPlusPlus
2098 ? diag::warn_cxx11_compat_digit_separator
2099 : diag::warn_c23_compat_digit_separator);
2100 CurPtr = ConsumeChar(CurPtr, Size, Result);
2101 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2102 return LexNumericConstant(Result, CurPtr);
2103 }
2104 }
2105
2106
2107 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2108 return LexNumericConstant(Result, CurPtr);
2109 if ((C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2110 return LexNumericConstant(Result, CurPtr);
2111
2112
2113 const char *TokStart = BufferPtr;
2114 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2115 Result.setLiteralData(TokStart);
2116 return true;
2117}
2118
2119
2120
2121const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2122 bool IsStringLiteral) {
2123 assert(LangOpts.CPlusPlus);
2124
2125
2126 unsigned Size;
2127 char C = getCharAndSize(CurPtr, Size);
2128 bool Consumed = false;
2129
2131 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2132 Consumed = true;
2133 else if ((C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2134 Consumed = true;
2135 else
2136 return CurPtr;
2137 }
2138
2139 if (!LangOpts.CPlusPlus11) {
2141 Diag(CurPtr,
2142 C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2143 : diag::warn_cxx11_compat_reserved_user_defined_literal)
2145 return CurPtr;
2146 }
2147
2148
2149
2150
2151
2152
2153 if (!Consumed) {
2154 bool IsUDSuffix = false;
2155 if (C == '_')
2156 IsUDSuffix = true;
2157 else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2158
2159
2160
2161 const unsigned MaxStandardSuffixLength = 3;
2162 char Buffer[MaxStandardSuffixLength] = { C };
2163 unsigned Consumed = Size;
2164 unsigned Chars = 1;
2165 while (true) {
2166 auto [Next, NextSize] =
2169
2170 const StringRef CompleteSuffix(Buffer, Chars);
2171 IsUDSuffix =
2173 break;
2174 }
2175
2176 if (Chars == MaxStandardSuffixLength)
2177
2178 break;
2179
2180 Buffer[Chars++] = Next;
2181 Consumed += NextSize;
2182 }
2183 }
2184
2185 if (!IsUDSuffix) {
2187 Diag(CurPtr, LangOpts.MSVCCompat
2188 ? diag::ext_ms_reserved_user_defined_literal
2189 : diag::ext_reserved_user_defined_literal)
2191 return CurPtr;
2192 }
2193
2194 CurPtr = ConsumeChar(CurPtr, Size, Result);
2195 }
2196
2198 while (true) {
2199 C = getCharAndSize(CurPtr, Size);
2201 CurPtr = ConsumeChar(CurPtr, Size, Result);
2202 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2203 } else if ((C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2204 } else
2205 break;
2206 }
2207
2208 return CurPtr;
2209}
2210
2211
2212
2213bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2215 const char *AfterQuote = CurPtr;
2216
2217 const char *NulCharacter = nullptr;
2218
2220 (Kind == tok::utf8_string_literal ||
2221 Kind == tok::utf16_string_literal ||
2222 Kind == tok::utf32_string_literal))
2223 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2224 : diag::warn_c99_compat_unicode_literal);
2225
2226 char C = getAndAdvanceChar(CurPtr, Result);
2227 while (C != '"') {
2228
2229
2230 if (C == '\\')
2231 C = getAndAdvanceChar(CurPtr, Result);
2232
2233 if (C == '\n' || C == '\r' ||
2234 (C == 0 && CurPtr-1 == BufferEnd)) {
2236 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2237 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2238 return true;
2239 }
2240
2241 if (C == 0) {
2242 if (isCodeCompletionPoint(CurPtr-1)) {
2244 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, false);
2245 else
2247 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2248 cutOffLexing();
2249 return true;
2250 }
2251
2252 NulCharacter = CurPtr-1;
2253 }
2254 C = getAndAdvanceChar(CurPtr, Result);
2255 }
2256
2257
2258 if (LangOpts.CPlusPlus)
2259 CurPtr = LexUDSuffix(Result, CurPtr, true);
2260
2261
2263 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2264
2265
2266 const char *TokStart = BufferPtr;
2267 FormTokenWithChars(Result, CurPtr, Kind);
2268 Result.setLiteralData(TokStart);
2269 return true;
2270}
2271
2272
2273
2274bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2276
2277
2278
2279
2280
2282 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2283
2284 unsigned PrefixLen = 0;
2285
2288 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
2289 const char *Pos = &CurPtr[PrefixLen];
2290 Diag(Pos, LangOpts.CPlusPlus26
2291 ? diag::warn_cxx26_compat_raw_string_literal_character_set
2292 : diag::ext_cxx26_raw_string_literal_character_set)
2293 << StringRef(Pos, 1);
2294 }
2295 ++PrefixLen;
2296 }
2297
2298
2299 if (CurPtr[PrefixLen] != '(') {
2301 const char *PrefixEnd = &CurPtr[PrefixLen];
2302 if (PrefixLen == 16) {
2303 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2304 } else if (*PrefixEnd == '\n') {
2305 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
2306 } else {
2307 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2308 << StringRef(PrefixEnd, 1);
2309 }
2310 }
2311
2312
2313
2314
2315 while (true) {
2316 char C = *CurPtr++;
2317
2318 if (C == '"')
2319 break;
2320 if (C == 0 && CurPtr-1 == BufferEnd) {
2321 --CurPtr;
2322 break;
2323 }
2324 }
2325
2326 FormTokenWithChars(Result, CurPtr, tok::unknown);
2327 return true;
2328 }
2329
2330
2331 const char *Prefix = CurPtr;
2332 CurPtr += PrefixLen + 1;
2333
2334 while (true) {
2335 char C = *CurPtr++;
2336
2337 if (C == ')') {
2338
2339 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2340 CurPtr += PrefixLen + 1;
2341 break;
2342 }
2343 } else if (C == 0 && CurPtr-1 == BufferEnd) {
2345 Diag(BufferPtr, diag::err_unterminated_raw_string)
2346 << StringRef(Prefix, PrefixLen);
2347 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2348 return true;
2349 }
2350 }
2351
2352
2353 if (LangOpts.CPlusPlus)
2354 CurPtr = LexUDSuffix(Result, CurPtr, true);
2355
2356
2357 const char *TokStart = BufferPtr;
2358 FormTokenWithChars(Result, CurPtr, Kind);
2359 Result.setLiteralData(TokStart);
2360 return true;
2361}
2362
2363
2364
/// Lexes an angle-bracketed header name. \p CurPtr points just after the `<`.
///
/// Characters are consumed until a closing `>` is found. On success \p Result
/// becomes a tok::header_name token whose literal data points at BufferPtr.
/// Every visible exit returns true.
///
/// NOTE(review): this listing has dropped embedded lines 2376 (the first half
/// of the bail-out condition) and 2397 (apparently the guard in front of the
/// null-character diagnostic). Confirm both against the upstream file.
2365bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2366
 // Location of the last embedded NUL seen, if any.
2367 const char *NulCharacter = nullptr;
 // Remembered so the token can be cut back to a lone '<' on failure.
2368 const char *AfterLessPos = CurPtr;
2369 char C = getAndAdvanceChar(CurPtr, Result);
2370 while (C != '>') {
2371
2372
 // After a backslash, read the next character as well so that it is not
 // tested against '>' by the loop condition.
2373 if (C == '\\')
2374 C = getAndAdvanceChar(CurPtr, Result);
2375
 // NOTE(review): the opening `if (...` of this condition is missing from the
 // listing; only the end-of-buffer half is visible.
2377 (C == 0 && (CurPtr - 1 == BufferEnd))) {
2378
2379
 // No closing '>' was found: produce just the '<' as a tok::less token.
2380 FormTokenWithChars(Result, AfterLessPos, tok::less);
2381 return true;
2382 }
2383
 // A NUL that is not the end of the buffer.
2384 if (C == 0) {
2385 if (isCodeCompletionPoint(CurPtr - 1)) {
 // Completion inside the header name: report the partial path (angled
 // form), stop lexing, and hand back an unknown token.
2386 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, true);
2387 cutOffLexing();
2388 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2389 return true;
2390 }
2391 NulCharacter = CurPtr-1;
2392 }
2393 C = getAndAdvanceChar(CurPtr, Result);
2394 }
2395
2396
 // NOTE(review): the guard for this diagnostic (embedded line 2397) is
 // missing; presumably it only fires when NulCharacter is non-null.
2398 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2399
2400
 // Form the header-name token covering everything up to and including '>'.
2401 const char *TokStart = BufferPtr;
2402 FormTokenWithChars(Result, CurPtr, tok::header_name);
2403 Result.setLiteralData(TokStart);
2404 return true;
2405}
2406
/// Handles a code-completion point inside an include path.
///
/// \param PathStart       first character of the path, after the opening
///                        delimiter.
/// \param CompletionPoint position of the completion marker within the path.
/// \param IsAngled        true for `<...>` paths, false for `"..."` paths;
///                        selects which closing delimiter the scan below
///                        stops at.
///
/// NOTE(review): the statements that consume `Dir`, `StartOfFilename` and the
/// advanced `CompletionPoint` (embedded lines 2419 and 2434-2437) are missing
/// from this listing, so what is done with them cannot be documented here.
2407void Lexer::codeCompleteIncludedFile(const char *PathStart,
2408 const char *CompletionPoint,
2409 bool IsAngled) {
2410
 // Text typed so far, from the start of the path up to the completion point.
2411 StringRef PartialPath(PathStart, CompletionPoint - PathStart);
 // Backslash counts as a separator only in MSVC-compatibility mode.
2412 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2413 auto Slash = PartialPath.find_last_of(SlashChars);
 // Directory part: everything before the last separator, or empty if none.
2414 StringRef Dir =
2415 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
 // File-name part starts right after the last separator (or at PathStart).
2416 const char *StartOfFilename =
2417 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2418
 // NOTE(review): the beginning of this call expression is missing.
2420 StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2421
2422
 // Advance CompletionPoint over the text that follows it. A NUL or newline
 // stops the scan without being stepped onto; the closing delimiter or a
 // separator stops the scan after CompletionPoint has moved onto it.
2423 while (CompletionPoint < BufferEnd) {
2424 char Next = *(CompletionPoint + 1);
2425 if (Next == 0 || Next == '\r' || Next == '\n')
2426 break;
2427 ++CompletionPoint;
2428 if (Next == (IsAngled ? '>' : '"'))
2429 break;
2430 if (SlashChars.contains(Next))
2431 break;
2432 }
2433
2438}
2439
2440
2441
2442bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2444
2445 const char *NulCharacter = nullptr;
2446
2448 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2449 Diag(BufferPtr, LangOpts.CPlusPlus
2450 ? diag::warn_cxx98_compat_unicode_literal
2451 : diag::warn_c99_compat_unicode_literal);
2452 else if (Kind == tok::utf8_char_constant)
2453 Diag(BufferPtr, LangOpts.CPlusPlus
2454 ? diag::warn_cxx14_compat_u8_character_literal
2455 : diag::warn_c17_compat_u8_character_literal);
2456 }
2457
2458 char C = getAndAdvanceChar(CurPtr, Result);
2459 if (C == '\'') {
2461 Diag(BufferPtr, diag::ext_empty_character);
2462 FormTokenWithChars(Result, CurPtr, tok::unknown);
2463 return true;
2464 }
2465
2466 while (C != '\'') {
2467
2468 if (C == '\\')
2469 C = getAndAdvanceChar(CurPtr, Result);
2470
2471 if (C == '\n' || C == '\r' ||
2472 (C == 0 && CurPtr-1 == BufferEnd)) {
2474 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2475 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2476 return true;
2477 }
2478
2479 if (C == 0) {
2480 if (isCodeCompletionPoint(CurPtr-1)) {
2482 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2483 cutOffLexing();
2484 return true;
2485 }
2486
2487 NulCharacter = CurPtr-1;
2488 }
2489 C = getAndAdvanceChar(CurPtr, Result);
2490 }
2491
2492
2493 if (LangOpts.CPlusPlus)
2494 CurPtr = LexUDSuffix(Result, CurPtr, false);
2495
2496
2498 Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2499
2500
2501 const char *TokStart = BufferPtr;
2502 FormTokenWithChars(Result, CurPtr, Kind);
2503 Result.setLiteralData(TokStart);
2504 return true;
2505}
2506
2507
2508
2509
2510
2511bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2512 bool &TokAtPhysicalStartOfLine) {
2513
2515
2516 unsigned char Char = *CurPtr;
2517
2518 const char *lastNewLine = nullptr;
2519 auto setLastNewLine = [&](const char *Ptr) {
2520 lastNewLine = Ptr;
2521 if (!NewLinePtr)
2522 NewLinePtr = Ptr;
2523 };
2524 if (SawNewline)
2525 setLastNewLine(CurPtr - 1);
2526
2527
2528 while (true) {
2529
2531 Char = *++CurPtr;
2532
2533
2535 break;
2536
2538
2539 BufferPtr = CurPtr;
2540 return false;
2541 }
2542
2543
2544 if (*CurPtr == '\n')
2545 setLastNewLine(CurPtr);
2546 SawNewline = true;
2547 Char = *++CurPtr;
2548 }
2549
2550
2552 FormTokenWithChars(Result, CurPtr, tok::unknown);
2553 if (SawNewline) {
2554 IsAtStartOfLine = true;
2555 IsAtPhysicalStartOfLine = true;
2556 }
2557
2558 return true;
2559 }
2560
2561
2562 char PrevChar = CurPtr[-1];
2564
2566 if (SawNewline) {
2568 TokAtPhysicalStartOfLine = true;
2569
2570 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2574 }
2575 }
2576
2577 BufferPtr = CurPtr;
2578 return false;
2579}
2580
2581
2582
2583
2584
2585
2586
2587bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2588 bool &TokAtPhysicalStartOfLine) {
2589
2590
2591 if (!LineComment) {
2592 if (())
2593 Diag(BufferPtr, diag::ext_line_comment);
2594
2595
2596
2597 LineComment = true;
2598 }
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611 bool UnicodeDecodingAlreadyDiagnosed = false;
2612
2613 char C;
2614 while (true) {
2615 C = *CurPtr;
2616
2617 while (isASCII(C) && C != 0 &&
2618 C != '\n' && C != '\r') {
2619 C = *++CurPtr;
2620 UnicodeDecodingAlreadyDiagnosed = false;
2621 }
2622
2624 unsigned Length = llvm::getUTF8SequenceSize(
2625 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2626 if (Length == 0) {
2627 if (!UnicodeDecodingAlreadyDiagnosed && ())
2628 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2629 UnicodeDecodingAlreadyDiagnosed = true;
2630 ++CurPtr;
2631 } else {
2632 UnicodeDecodingAlreadyDiagnosed = false;
2633 CurPtr += Length;
2634 }
2635 continue;
2636 }
2637
2638 const char *NextLine = CurPtr;
2639 if (C != 0) {
2640
2641 const char *EscapePtr = CurPtr-1;
2642 bool HasSpace = false;
2644 --EscapePtr;
2645 HasSpace = true;
2646 }
2647
2648 if (*EscapePtr == '\\')
2649
2650 CurPtr = EscapePtr;
2651 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2652 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2653
2654 CurPtr = EscapePtr-2;
2655 else
2656 break;
2657
2658
2660 Diag(EscapePtr, diag::backslash_newline_space);
2661 }
2662
2663
2664
2665
2666
2667 const char *OldPtr = CurPtr;
2670 C = getAndAdvanceChar(CurPtr, Result);
2672
2673
2674
2675 if (C != 0 && CurPtr == OldPtr+1) {
2676 CurPtr = NextLine;
2677 break;
2678 }
2679
2680
2681
2682
2683 if (CurPtr != OldPtr + 1 && C != '/' &&
2684 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2685 for (; OldPtr != CurPtr; ++OldPtr)
2686 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2687
2688
2690 const char *ForwardPtr = CurPtr;
2691 while (isWhitespace(*ForwardPtr))
2692 ++ForwardPtr;
2693 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2694 break;
2695 }
2696
2698 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2699 break;
2700 }
2701 }
2702
2703 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2704 --CurPtr;
2705 break;
2706 }
2707
2708 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2710 cutOffLexing();
2711 return false;
2712 }
2713 }
2714
2715
2716
2720 BufferPtr = CurPtr;
2721 return true;
2722 }
2723
2724
2726 return SaveLineComment(Result, CurPtr);
2727
2728
2729
2731 BufferPtr = CurPtr;
2732 return false;
2733 }
2734
2735
2736
2737
2738
2739
2740 NewLinePtr = CurPtr++;
2741
2742
2744 TokAtPhysicalStartOfLine = true;
2745
2747 BufferPtr = CurPtr;
2748 return false;
2749}
2750
2751
2752
/// Turns a `//` line comment ending at \p CurPtr into a tok::comment token in
/// \p Result. Every visible exit returns true.
///
/// The visible tail of the function rewrites the comment's spelling from
/// `//...` to `/*...*/` before storing it in the token.
///
/// NOTE(review): this listing has dropped embedded lines 2758 and 2763-2765
/// (the guards for the two early returns and the declaration of `Spelling`)
/// and 2773 (the start of the final call). Confirm against the upstream file.
2753bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2754
2755
 // The token spans from BufferPtr to the end of the comment.
2756 FormTokenWithChars(Result, CurPtr, tok::comment);
2757
 // NOTE(review): guard missing; as listed this return looks unconditional.
2759 return true;
2760
2761
2762
 // NOTE(review): guard and `Spelling` declaration missing above this line.
2766 return true;
2767
2768 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
 // "//text" becomes "/*text*/".
2769 Spelling[1] = '*';
2770 Spelling += "*/";
2771
2772 Result.setKind(tok::comment);
 // NOTE(review): the callee of this argument list is on the missing line.
2774 Result.getLocation(), Result.getLocation());
2775 return true;
2776}
2777
2778
2779
2780
2782 bool Trigraphs) {
2783 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2784
2785
2786 const char *TrigraphPos = nullptr;
2787
2788 const char *SpacePos = nullptr;
2789
2790 while (true) {
2791
2792 --CurPtr;
2793
2794
2795 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2796
2797 if (CurPtr[0] == CurPtr[1])
2798 return false;
2799
2800 --CurPtr;
2801 }
2802
2803
2804
2806 SpacePos = CurPtr;
2807 --CurPtr;
2808 }
2809
2810
2811 if (*CurPtr == '\\') {
2812 --CurPtr;
2813 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2814
2815 TrigraphPos = CurPtr - 2;
2816 CurPtr -= 3;
2817 } else {
2818 return false;
2819 }
2820
2821
2822
2823 if (*CurPtr == '*')
2824 break;
2825
2826 if (*CurPtr != '\n' && *CurPtr != '\r')
2827 return false;
2828 }
2829
2830 if (TrigraphPos) {
2831
2832
2833 if (!Trigraphs) {
2835 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2836 return false;
2837 }
2839 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2840 }
2841
2842
2844 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2845
2846
2848 L->Diag(SpacePos, diag::backslash_newline_space);
2849
2850 return true;
2851}
2852
2853#ifdef __SSE2__
2854#include <emmintrin.h>
2855#elif __ALTIVEC__
2857#undef bool
2858#endif
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2870 bool &TokAtPhysicalStartOfLine) {
2871
2872
2873
2874
2875
2876
2877
2878
2879 unsigned CharSize;
2880 unsigned char C = getCharAndSize(CurPtr, CharSize);
2881 CurPtr += CharSize;
2882 if (C == 0 && CurPtr == BufferEnd+1) {
2884 Diag(BufferPtr, diag::err_unterminated_block_comment);
2885 --CurPtr;
2886
2887
2888
2890 FormTokenWithChars(Result, CurPtr, tok::unknown);
2891 return true;
2892 }
2893
2894 BufferPtr = CurPtr;
2895 return false;
2896 }
2897
2898
2899
2900 if (C == '/')
2901 C = *CurPtr++;
2902
2903
2904
2905
2906
2907 bool UnicodeDecodingAlreadyDiagnosed = false;
2908
2909 while (true) {
2910
2911
2912 if (CurPtr + 24 < BufferEnd &&
2913
2914
2916
2917 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2919 goto MultiByteUTF8;
2920 C = *CurPtr++;
2921 }
2922 if (C == '/') goto FoundSlash;
2923
2924#ifdef __SSE2__
2926 while (CurPtr + 16 < BufferEnd) {
2928 if (LLVM_UNLIKELY(Mask != 0)) {
2929 goto MultiByteUTF8;
2930 }
2931
2933 Slashes));
2934 if (cmp != 0) {
2935
2936
2937
2938 CurPtr += llvm::countr_zero(cmp) + 1;
2939 goto FoundSlash;
2940 }
2941 CurPtr += 16;
2942 }
2943#elif __ALTIVEC__
2944 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2945 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2946 0x80, 0x80, 0x80, 0x80};
2947 __vector unsigned char Slashes = {
2948 '/', '/', '/', '/', '/', '/', '/', '/',
2949 '/', '/', '/', '/', '/', '/', '/', '/'
2950 };
2951 while (CurPtr + 16 < BufferEnd) {
2952 if (LLVM_UNLIKELY(
2953 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2954 goto MultiByteUTF8;
2955 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2956 break;
2957 }
2958 CurPtr += 16;
2959 }
2960
2961#else
2962 while (CurPtr + 16 < BufferEnd) {
2963 bool HasNonASCII = false;
2964 for (unsigned I = 0; I < 16; ++I)
2965 HasNonASCII |= (CurPtr[I]);
2966
2967 if (LLVM_UNLIKELY(HasNonASCII))
2968 goto MultiByteUTF8;
2969
2970 bool HasSlash = false;
2971 for (unsigned I = 0; I < 16; ++I)
2972 HasSlash |= CurPtr[I] == '/';
2973 if (HasSlash)
2974 break;
2975 CurPtr += 16;
2976 }
2977#endif
2978
2979
2980 C = *CurPtr++;
2981 }
2982
2983
2984
2985
2986 while (C != '/' && C != '\0') {
2988 UnicodeDecodingAlreadyDiagnosed = false;
2989 C = *CurPtr++;
2990 continue;
2991 }
2992 MultiByteUTF8:
2993
2994
2995 unsigned Length = llvm::getUTF8SequenceSize(
2996 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2997 if (Length == 0) {
2998 if (!UnicodeDecodingAlreadyDiagnosed && ())
2999 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
3000 UnicodeDecodingAlreadyDiagnosed = true;
3001 } else {
3002 UnicodeDecodingAlreadyDiagnosed = false;
3003 CurPtr += Length - 1;
3004 }
3005 C = *CurPtr++;
3006 }
3007
3008 if (C == '/') {
3009 FoundSlash:
3010 if (CurPtr[-2] == '*')
3011 break;
3012
3013 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
3015 LangOpts.Trigraphs)) {
3016
3017
3018 break;
3019 }
3020 }
3021 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
3022
3023
3024
3026 Diag(CurPtr-1, diag::warn_nested_block_comment);
3027 }
3028 } else if (C == 0 && CurPtr == BufferEnd+1) {
3030 Diag(BufferPtr, diag::err_unterminated_block_comment);
3031
3032
3033
3034 --CurPtr;
3035
3036
3037
3039 FormTokenWithChars(Result, CurPtr, tok::unknown);
3040 return true;
3041 }
3042
3043 BufferPtr = CurPtr;
3044 return false;
3045 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
3047 cutOffLexing();
3048 return false;
3049 }
3050
3051 C = *CurPtr++;
3052 }
3053
3054
3058 BufferPtr = CurPtr;
3059 return true;
3060 }
3061
3062
3064 FormTokenWithChars(Result, CurPtr, tok::comment);
3065 return true;
3066 }
3067
3068
3069
3070
3071
3073 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
3074 return false;
3075 }
3076
3077
3078 BufferPtr = CurPtr;
3080 return false;
3081}
3082
3083
3084
3085
3086
3087
3088
3091 "Must be in a preprocessing directive!");
3094
3095
3096 const char *CurPtr = BufferPtr;
3097 while (true) {
3098 char Char = getAndAdvanceChar(CurPtr, Tmp);
3099 switch (Char) {
3100 default:
3102 Result->push_back(Char);
3103 break;
3104 case 0:
3105
3106 if (CurPtr-1 != BufferEnd) {
3107 if (isCodeCompletionPoint(CurPtr-1)) {
3109 cutOffLexing();
3110 return;
3111 }
3112
3113
3115 Result->push_back(Char);
3116 break;
3117 }
3118
3119 [[fallthrough]];
3120 case '\r':
3121 case '\n':
3122
3123 assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3124 BufferPtr = CurPtr-1;
3125
3126
3127 Lex(Tmp);
3128 if (Tmp.is(tok::code_completion)) {
3129 if (PP)
3131 Lex(Tmp);
3132 }
3133 assert(Tmp.is(tok::eod) && "Unexpected token!");
3134
3135
3136 return;
3137 }
3138 }
3139}
3140
3141
3142
3143
3144
3145bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3146
3147
3148
3150
3152
3153 FormTokenWithChars(Result, CurPtr, tok::eod);
3154
3155
3156 if (PP)
3158 return true;
3159 }
3160
3161
3162
3164 Result.startToken();
3165 BufferPtr = BufferEnd;
3166 FormTokenWithChars(Result, BufferEnd, tok::eof);
3167 return true;
3168 }
3169
3172
3173
3174
3178 }
3179
3180
3181
3182
3186 diag::err_pp_unterminated_conditional);
3188 }
3189
3190
3191
3192 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
3195 unsigned DiagID;
3196
3197 if (LangOpts.CPlusPlus11) {
3198
3199
3200
3201 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3202 DiagID = diag::warn_cxx98_compat_no_newline_eof;
3203 } else {
3204 DiagID = diag::warn_no_newline_eof;
3205 }
3206 } else {
3207 DiagID = diag::ext_no_newline_eof;
3208 }
3209
3210 Diag(BufferEnd, DiagID)
3212 }
3213
3214 BufferPtr = CurPtr;
3215
3216
3218}
3219
3220
3221
3222
3223
/// Peeks at the next token without consuming it and reports whether it is a
/// left parenthesis.
///
/// \returns 1 if the next token is tok::l_paren, 0 if it is some other token,
/// and 2 if there is no next token (end of file, or the dependency-directive
/// token list is exhausted).
///
/// NOTE(review): the assert condition on the first body line is garbled, and
/// embedded lines 3237, 3241, 3246, 3251 and 3257 are missing -- among them the
/// declaration of `Tok`. Presumably further lexer state is saved and restored
/// on the missing lines; confirm against the upstream file.
3224unsigned Lexer::isNextPPTokenLParen() {
3225 assert( && "How can we expand a macro from a skipping buffer?");
3226
 // Dependency-directives mode: look at the pre-lexed token list directly.
 // The index is only read here, not advanced.
3227 if (isDependencyDirectivesLexer()) {
3228 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3229 return 2;
3230 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3231 tok::l_paren);
3232 }
3233
3234
3235
3236
3238
3239
 // Save the lexer position and line/space flags so the lookahead below can
 // be undone.
3240 const char *TmpBufferPtr = BufferPtr;
3242 bool atStartOfLine = IsAtStartOfLine;
3243 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3244 bool leadingSpace = HasLeadingSpace;
3245
 // Lex one token ahead.
3247 Lex(Tok);
3248
3249
 // Restore everything saved above.
3250 BufferPtr = TmpBufferPtr;
3252 HasLeadingSpace = leadingSpace;
3253 IsAtStartOfLine = atStartOfLine;
3254 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3255
3256
3258
3259 if (Tok.is(tok::eof))
3260 return 2;
3261 return Tok.is(tok::l_paren);
3262}
3263
3264
/// Searches [CurPtr, BufferEnd) for the terminator of a version-control
/// conflict region.
///
/// The terminator is "<<<<\n" (5 characters) when `CMK` is CMK_Perforce and
/// ">>>>>>>" (7 characters) otherwise. Only an occurrence immediately preceded
/// by '\r' or '\n' is accepted.
///
/// \returns a pointer to the first character of the accepted terminator, or
/// nullptr if none is found.
///
/// NOTE(review): the second line of the signature (embedded line 3266), which
/// declares `CMK` and opens the body, is missing from this listing.
3265static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3267 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3268 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
 // The search starts TermLen characters after CurPtr, not at CurPtr itself.
3269 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3270 size_t Pos = RestOfBuffer.find(Terminator);
3271 while (Pos != StringRef::npos) {
3272
 // Reject a hit at offset 0 or one not preceded by a line break, then
 // continue searching after it.
3273 if (Pos == 0 ||
3274 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3275 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3276 Pos = RestOfBuffer.find(Terminator);
3277 continue;
3278 }
3279 return RestOfBuffer.data()+Pos;
3280 }
3281 return nullptr;
3282}
3283
3284
3285
3286
3287
/// Checks whether \p CurPtr is at the opening marker of a version-control
/// conflict region and, if so, diagnoses it and skips the marker line.
///
/// \returns true when a conflict marker was recognised (BufferPtr is then left
/// at the end of the marker line); false otherwise.
///
/// NOTE(review): this listing has dropped embedded lines 3301 (the guard for
/// the third `return false`), 3304 (the declaration of `Kind`) and 3308 (the
/// opening of the block closed near the end, presumably a search for the
/// matching end marker). Confirm against the upstream file.
3288bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3289
 // The marker must be the first thing on its line.
3290 if (CurPtr != BufferStart &&
3291 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3292 return false;
3293
3294
 // Two spellings are recognised: "<<<<<<<" and ">>>> " (with trailing space).
3295 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3296 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3297 return false;
3298
3299
3300
 // NOTE(review): guard missing; as listed this return looks unconditional.
3302 return false;
3303
3305
3306
3307
3309
3310
 // Report the marker and remember which kind is active.
3311 Diag(CurPtr, diag::err_conflict_marker);
3312 CurrentConflictMarkerState = Kind;
3313
3314
3315
 // Skip the rest of the marker line; lexing resumes at its line break.
3316 while (*CurPtr != '\r' && *CurPtr != '\n') {
3317 assert(CurPtr != BufferEnd && "Didn't find end of line");
3318 ++CurPtr;
3319 }
3320 BufferPtr = CurPtr;
3321 return true;
3322 }
3323
3324
3325 return false;
3326}
3327
3328
3329
3330
3331
/// Called at a possible middle marker of a conflict region; if one is found,
/// skips everything up to and including the line of the end marker.
///
/// \returns true when text was skipped (BufferPtr then sits at the line break
/// ending the end-marker line and CurrentConflictMarkerState is reset to
/// CMK_None); false otherwise.
///
/// NOTE(review): the guard for the second `return false` (embedded line 3340)
/// is missing from this listing, so that return appears unconditional here.
/// Confirm the condition against the upstream file.
3332bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3333
 // The marker must be the first thing on its line.
3334 if (CurPtr != BufferStart &&
3335 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3336 return false;
3337
3338
3339
3341 return false;
3342
3343
 // Require four identical characters in a row starting at CurPtr.
3344 for (unsigned i = 1; i != 4; ++i)
3345 if (CurPtr[i] != CurPtr[0])
3346 return false;
3347
3348
3349
3350
 // Look for the terminator that matches the active marker kind.
3351 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3352 CurrentConflictMarkerState)) {
3353 CurPtr = End;
3354
3355
 // Skip the remainder of the terminator line.
3356 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3357 ++CurPtr;
3358
3359 BufferPtr = CurPtr;
3360
3361
 // The conflict region is finished.
3362 CurrentConflictMarkerState = CMK_None;
3363 return true;
3364 }
3365
3366 return false;
3367}
3368
/// Scans [CurPtr, BufferEnd) for the two-character sequence `#>`, presumably
/// the closing delimiter of an editor placeholder (`<#...#>`).
///
/// \param CurPtr    first character to examine.
/// \param BufferEnd one past the last character that may be examined.
/// \returns a pointer just past the `>` of the first `#>` found, or nullptr
///          when the range is empty or contains no complete `#>`.
///
/// NOTE(review): the first line of this signature was missing from the
/// listing and has been restored; confirm the name and linkage against the
/// call site.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // Stop one character early so that reading CurPtr[1] stays inside the range.
  BufferEnd -= 1;
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}
3380
/// Lexes an editor placeholder whose `<` has already been consumed and whose
/// `#` is at \p CurPtr (both are asserted).
///
/// On success \p Result becomes a tok::raw_identifier token that starts at the
/// `<` and ends at `End`, BufferPtr is moved to `End`, and true is returned.
///
/// NOTE(review): this listing has dropped embedded lines 3383 (the guard for
/// the first `return false`), 3385 (the declaration of `End`) and 3394-3395.
/// Confirm against the upstream file.
3381bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3382 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
 // NOTE(review): guard missing; as listed this return looks unconditional.
3384 return false;
 // No end of placeholder was found.
3386 if (!End)
3387 return false;
 // The token starts at the '<' that precedes CurPtr.
3388 const char *Start = CurPtr - 1;
 // Placeholders are diagnosed when not allowed, but the token is still formed.
3389 if (!LangOpts.AllowEditorPlaceholders)
3390 Diag(Start, diag::err_placeholder_in_source);
3391 Result.startToken();
3392 FormTokenWithChars(Result, End, tok::raw_identifier);
3393 Result.setRawIdentifierData(Start);
3396 BufferPtr = End;
3397 return true;
3398}
3399
3400bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3404 }
3405
3406 return false;
3407}
3408
3409std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3410 const char *SlashLoc,
3412 unsigned CharSize;
3413 char Kind = getCharAndSize(StartPtr, CharSize);
3414 assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
3415
3416 unsigned NumHexDigits;
3417 if (Kind == 'u')
3418 NumHexDigits = 4;
3419 else if (Kind == 'U')
3420 NumHexDigits = 8;
3421
3422 bool Delimited = false;
3423 bool FoundEndDelimiter = false;
3424 unsigned Count = 0;
3426
3427 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3428 if (Diagnose)
3429 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3430 return std::nullopt;
3431 }
3432
3433 const char *CurPtr = StartPtr + CharSize;
3434 const char *KindLoc = &CurPtr[-1];
3435
3437 while (Count != NumHexDigits || Delimited) {
3438 char C = getCharAndSize(CurPtr, CharSize);
3439 if (!Delimited && Count == 0 && C == '{') {
3440 Delimited = true;
3441 CurPtr += CharSize;
3442 continue;
3443 }
3444
3445 if (Delimited && C == '}') {
3446 CurPtr += CharSize;
3447 FoundEndDelimiter = true;
3448 break;
3449 }
3450
3451 unsigned Value = llvm::hexDigitValue(C);
3452 if (Value == -1U) {
3453 if (!Delimited)
3454 break;
3455 if (Diagnose)
3456 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3457 << StringRef(KindLoc, 1);
3458 return std::nullopt;
3459 }
3460
3461 if (CodePoint & 0xF000'0000) {
3462 if (Diagnose)
3463 Diag(KindLoc, diag::err_escape_too_large) << 0;
3464 return std::nullopt;
3465 }
3466
3467 CodePoint <<= 4;
3468 CodePoint |= Value;
3469 CurPtr += CharSize;
3470 Count++;
3471 }
3472
3473 if (Count == 0) {
3474 if (Diagnose)
3475 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3476 : diag::warn_ucn_escape_no_digits)
3477 << StringRef(KindLoc, 1);
3478 return std::nullopt;
3479 }
3480
3481 if (Delimited && Kind == 'U') {
3482 if (Diagnose)
3483 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3484 return std::nullopt;
3485 }
3486
3487 if (!Delimited && Count != NumHexDigits) {
3488 if (Diagnose) {
3489 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3490
3491 if (Count == 4 && NumHexDigits == 8) {
3493 Diag(KindLoc, diag::note_ucn_four_not_eight)
3495 }
3496 }
3497 return std::nullopt;
3498 }
3499
3500 if (Delimited && PP) {
3502 ? diag::warn_cxx23_delimited_escape_sequence
3503 : diag::ext_delimited_escape_sequence)
3504 << 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3505 }
3506
3509
3510
3511
3512 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3513 StartPtr = CurPtr;
3514 else
3515 while (StartPtr != CurPtr)
3516 (void)getAndAdvanceChar(StartPtr, *Result);
3517 } else {
3518 StartPtr = CurPtr;
3519 }
3520 return CodePoint;
3521}
3522
3523std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3524 const char *SlashLoc,
3526 unsigned CharSize;
3528
3529 char C = getCharAndSize(StartPtr, CharSize);
3530 assert(C == 'N' && "expected \\N{...}");
3531
3532 const char *CurPtr = StartPtr + CharSize;
3533 const char *KindLoc = &CurPtr[-1];
3534
3535 C = getCharAndSize(CurPtr, CharSize);
3536 if (C != '{') {
3537 if (Diagnose)
3538 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3539 return std::nullopt;
3540 }
3541 CurPtr += CharSize;
3542 const char *StartName = CurPtr;
3543 bool FoundEndDelimiter = false;
3545 while (C) {
3546 C = getCharAndSize(CurPtr, CharSize);
3547 CurPtr += CharSize;
3548 if (C == '}') {
3549 FoundEndDelimiter = true;
3550 break;
3551 }
3552
3554 break;
3555 Buffer.push_back(C);
3556 }
3557
3558 if (!FoundEndDelimiter || Buffer.empty()) {
3559 if (Diagnose)
3560 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3561 : diag::warn_delimited_ucn_incomplete)
3562 << StringRef(KindLoc, 1);
3563 return std::nullopt;
3564 }
3565
3566 StringRef Name(Buffer.data(), Buffer.size());
3567 std::optional<char32_t> Match =
3568 llvm::sys::unicode::nameToCodepointStrict(Name);
3569 std::optionalllvm::sys::unicode::LooseMatchingResult LooseMatch;
3570 if (!Match) {
3571 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3572 if (Diagnose) {
3573 Diag(StartName, diag::err_invalid_ucn_name)
3574 << StringRef(Buffer.data(), Buffer.size())
3575 << makeCharRange(*this, StartName, CurPtr - CharSize);
3576 if (LooseMatch) {
3577 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3579 makeCharRange(*this, StartName, CurPtr - CharSize),
3580 LooseMatch->Name);
3581 }
3582 }
3583
3584
3585
3586 }
3587
3588 if (Diagnose && Match)
3590 ? diag::warn_cxx23_delimited_escape_sequence
3591 : diag::ext_delimited_escape_sequence)
3592 << 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3593
3594
3595
3596
3597
3598 if (LooseMatch && Diagnose)
3599 Match = LooseMatch->CodePoint;
3600
3603
3604
3605
3606 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3607 StartPtr = CurPtr;
3608 else
3609 while (StartPtr != CurPtr)
3610 (void)getAndAdvanceChar(StartPtr, *Result);
3611 } else {
3612 StartPtr = CurPtr;
3613 }
3614 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3615}
3616
/// Reads a universal character name whose kind letter (`u`, `U` or `N`) is at
/// \p StartPtr and validates the resulting code point.
///
/// `u`/`U` forms are delegated to tryReadNumericUCN and the `N` form to
/// tryReadNamedUCN.
///
/// \returns the code point, or 0 when nothing could be read, when the code
/// point is below 0xA0, or when it lies in the surrogate range 0xD800-0xDFFF.
/// When LangOpts.AsmPreprocessor is set the code point is returned without
/// these range checks.
///
/// NOTE(review): this listing has dropped embedded line 3618 (the end of the
/// signature, which declares `Result`) and lines 3656 and 3670 (the guards
/// that open the two diagnostic blocks). Confirm against the upstream file.
3617uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3619
3620 unsigned CharSize;
3621 std::optional<uint32_t> CodePointOpt;
 // Dispatch on the letter that follows the backslash.
3622 char Kind = getCharAndSize(StartPtr, CharSize);
3623 if (Kind == 'u' || Kind == 'U')
3624 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3625 else if (Kind == 'N')
3626 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3627
 // Any other letter, or a failed read, yields 0.
3628 if (!CodePointOpt)
3629 return 0;
3630
3631 uint32_t CodePoint = *CodePointOpt;
3632
3633
 // In assembler-preprocessor mode the range checks below are skipped.
3634 if (LangOpts.AsmPreprocessor)
3635 return CodePoint;
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
 // Code points below 0xA0 are rejected.
3653 if (CodePoint < 0xA0) {
3654
3655
 // NOTE(review): the guard that opens this diagnostic block is missing.
 // Control characters and printable characters get different diagnostics.
3657 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3658 Diag(BufferPtr, diag::err_ucn_control_character);
3659 else {
3660 char C = static_cast<char>(CodePoint);
3661 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3662 }
3663 }
3664
3665 return 0;
 // Surrogate code points are rejected as well.
3666 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3667
3668
3669
 // NOTE(review): the guard that opens this diagnostic block is missing.
 // C++ before C++11 gets warn_ucn_escape_surrogate; every other mode gets
 // err_ucn_escape_invalid.
3671 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3672 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3673 else
3674 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3675 }
3676 return 0;
3677 }
3678
3679 return CodePoint;
3680}
3681
/// Reports code point \p C with the ext_unicode_whitespace extension
/// diagnostic and returns true on the diagnosed path; returns false otherwise.
///
/// NOTE(review): the condition that selects the diagnosed path and the
/// operands streamed into the diagnostic are missing from this extract.
/// Presumably the test is "C is Unicode whitespace" -- confirm against the
/// full file.
3682bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3683 const char *CurPtr) {
3686 Diag(BufferPtr, diag::ext_unicode_whitespace)
3688
3690 return true;
3691 }
3692 return false;
3693}
3694
3695void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3696 IsAtStartOfLine = Result.isAtStartOfLine();
3697 HasLeadingSpace = Result.hasLeadingSpace();
3698 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3699
3700}
3701
// Body of the token entry point that wraps LexTokenInternal.
// NOTE(review): the signature line is missing from this extract; presumably
// this is `bool Lexer::Lex(Token &Result)` -- confirm against the full file.
//
// Resets Result, consumes the pending per-token lexer flags, then delegates
// to LexTokenInternal and returns its result. Must not be called on a
// dependency-directives lexer (asserted first).
3703 assert(!isDependencyDirectivesLexer());
3704
3705
3706 Result.startToken();
3707
3708
// Each pending flag is cleared once it has been observed.
// NOTE(review): the statements that transfer each flag onto Result are not
// visible in this extract.
3709 if (IsAtStartOfLine) {
3711 IsAtStartOfLine = false;
3712 }
3713
3714 if (HasLeadingSpace) {
3716 HasLeadingSpace = false;
3717 }
3718
3719 if (HasLeadingEmptyMacro) {
3721 HasLeadingEmptyMacro = false;
3722 }
3723
// Snapshot the physical-start-of-line state for this token and reset it
// before lexing, so the callee receives the value as a plain argument.
3724 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3725 IsAtPhysicalStartOfLine = false;
// isRawLex is only consumed by the assert below; the cast silences the
// unused-variable warning in builds where asserts compile away.
3727 (void) isRawLex;
3728 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3729
3730 assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3731 return returnedToken;
3732}
3733
3734
3735
3736
3737
3738
/// Core token dispatcher: reads one character starting at BufferPtr and
/// switches on it to decide which token to form in \p Result.
///
/// \param Result the token being filled in.
/// \param TokAtPhysicalStartOfLine forwarded to the whitespace/comment
///        skippers and consulted before jumping to HandleDirective for '#'
///        and the '%:' digraph.
/// \returns true when a token was formed here or by a helper that reported
///          one; otherwise whatever the delegated Lex*/Skip* helper returned.
///
/// Control flow is label-driven: `goto LexNextToken` jumps back to LexStart
/// to lex again after input that produced no token (skipped whitespace,
/// comments, conflict markers, invalid UTF-8); `SkipIgnoredUnits` loops over
/// adjacent whitespace and comments without re-entering the switch.
///
/// NOTE(review): this extract has lost tokens and whole lines (for example
/// the operands of the two asserts below, the declaration of `Kind`, and
/// parts of several conditions). The comments describe only what is visible.
3739bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3740LexStart:
3741 assert(.needsCleaning() && "Result needs cleaning");
3742 assert(.hasPtrData() && "Result has not been reset");
3743
3744
3745 const char *CurPtr = BufferPtr;
3746
3747
3749 do {
3750 ++CurPtr;
3752
3753
3754
3755
3757 FormTokenWithChars(Result, CurPtr, tok::unknown);
3758
3759 return true;
3760 }
3761
3762 BufferPtr = CurPtr;
3764 }
3765
3766 unsigned SizeTmp, SizeTmp2;
3767
3768
// Read the first character of the token; CurPtr now points just past it.
3769 char Char = getAndAdvanceChar(CurPtr, Result);
3771
3773 NewLinePtr = nullptr;
3774
3775 switch (Char) {
// NUL byte: the real end of the buffer, a code-completion point, or a stray
// null that is diagnosed and then skipped as whitespace.
3776 case 0:
3777
3778 if (CurPtr-1 == BufferEnd)
3779 return LexEndOfFile(Result, CurPtr-1);
3780
3781
3782 if (isCodeCompletionPoint(CurPtr-1)) {
3783
3784 Result.startToken();
3785 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3786 return true;
3787 }
3788
3790 Diag(CurPtr-1, diag::null_in_file);
3792 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3793 return true;
3794
3795
3796
3797 goto LexNextToken;
3798
// Character 26 (Ctrl-Z): end of file when Microsoft extensions are enabled,
// otherwise an unknown token.
3799 case 26:
3800
3801 if (LangOpts.MicrosoftExt) {
3803 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3804 return LexEndOfFile(Result, CurPtr-1);
3805 }
3806
3807
3808 Kind = tok::unknown;
3809 break;
3810
// A "\r\n" pair is consumed as a single line ending.
3811 case '\r':
3812 if (CurPtr[0] == '\n')
3813 (void)getAndAdvanceChar(CurPtr, Result);
3814 [[fallthrough]];
3815 case '\n':
3816
3817
3819
3821
3822
3823 if (PP)
3825
3826
// This path ends the line with a tok::eod token and marks the lexer as
// being at the start of the next line.
// NOTE(review): the condition opening this branch is not visible here.
3827 IsAtStartOfLine = true;
3828 IsAtPhysicalStartOfLine = true;
3829 NewLinePtr = CurPtr - 1;
3830
3831 Kind = tok::eod;
3832 break;
3833 }
3834
3835
3837
3838 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3839 return true;
3840
3841
3842
3843 goto LexNextToken;
3844 case ' ':
3845 case '\t':
3846 case '\f':
3847 case '\v':
3848 SkipHorizontalWhitespace:
3850 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3851 return true;
3852
// Fast path: after whitespace, skip directly over adjacent line and block
// comments (and further whitespace) without going back through the switch.
3853 SkipIgnoredUnits:
3854 CurPtr = BufferPtr;
3855
3856
3857
3858 if (CurPtr[0] == '/' && CurPtr[1] == '/' && () &&
3859 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3860 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3861 return true;
3862 goto SkipIgnoredUnits;
3863 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && ()) {
3864 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3865 return true;
3866 goto SkipIgnoredUnits;
3868 goto SkipHorizontalWhitespace;
3869 }
3870
3871
3872 goto LexNextToken;
3873
3874
3875
// A leading digit always starts a numeric constant.
3876 case '0': case '1': case '2': case '3': case '4':
3877 case '5': case '6': case '7': case '8': case '9':
3878
3880 return LexNumericConstant(Result, CurPtr);
3881
3882
3883
3884
// 'u' may introduce a UTF-16 string/char literal, a UTF-16 raw string, or
// (as "u8") a UTF-8 string/char/raw-string literal. These prefixes are only
// considered in C++11 or C11 mode; anything else lexes as an identifier.
3885 case 'u':
3886
3888
3889 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3890 Char = getCharAndSize(CurPtr, SizeTmp);
3891
3892
3893 if (Char == '"')
3894 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3895 tok::utf16_string_literal);
3896
3897
3898 if (Char == '\'')
3899 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3900 tok::utf16_char_constant);
3901
3902
3903 if (Char == 'R' && LangOpts.RawStringLiterals &&
3904 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3905 return LexRawStringLiteral(Result,
3906 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3908 tok::utf16_string_literal);
3909
3910 if (Char == '8') {
3911 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3912
3913
3914 if (Char2 == '"')
3915 return LexStringLiteral(Result,
3916 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3918 tok::utf8_string_literal);
// u8 character constants additionally require C++17 or C23.
3919 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3920 return LexCharConstant(
3921 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3923 tok::utf8_char_constant);
3924
3925 if (Char2 == 'R' && LangOpts.RawStringLiterals) {
3926 unsigned SizeTmp3;
3927 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3928
3929 if (Char3 == '"') {
3930 return LexRawStringLiteral(Result,
3931 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3934 tok::utf8_string_literal);
3935 }
3936 }
3937 }
3938 }
3939
3940
3941 return LexIdentifierContinue(Result, CurPtr);
3942
// 'U' may introduce a UTF-32 string, char or raw-string literal (C++11/C11).
3943 case 'U':
3944
3946
3947 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3948 Char = getCharAndSize(CurPtr, SizeTmp);
3949
3950
3951 if (Char == '"')
3952 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3953 tok::utf32_string_literal);
3954
3955
3956 if (Char == '\'')
3957 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3958 tok::utf32_char_constant);
3959
3960
3961 if (Char == 'R' && LangOpts.RawStringLiterals &&
3962 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3963 return LexRawStringLiteral(Result,
3964 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3966 tok::utf32_string_literal);
3967 }
3968
3969
3970 return LexIdentifierContinue(Result, CurPtr);
3971
// 'R' followed by '"' is an unprefixed raw string when raw strings are on.
3972 case 'R':
3973
3975
3976 if (LangOpts.RawStringLiterals) {
3977 Char = getCharAndSize(CurPtr, SizeTmp);
3978
3979 if (Char == '"')
3980 return LexRawStringLiteral(Result,
3981 ConsumeChar(CurPtr, SizeTmp, Result),
3982 tok::string_literal);
3983 }
3984
3985
3986 return LexIdentifierContinue(Result, CurPtr);
3987
// 'L' may introduce a wide string, wide raw string or wide char constant;
// unlike 'u'/'U' this is not gated on a language-standard option here.
3988 case 'L':
3989
3991 Char = getCharAndSize(CurPtr, SizeTmp);
3992
3993
3994 if (Char == '"')
3995 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3996 tok::wide_string_literal);
3997
3998
3999 if (LangOpts.RawStringLiterals && Char == 'R' &&
4000 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
4001 return LexRawStringLiteral(Result,
4002 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4004 tok::wide_string_literal);
4005
4006
4007 if (Char == '\'')
4008 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4009 tok::wide_char_constant);
4010
4011 [[fallthrough]];
4012
4013
// Remaining ASCII letters and '_' start an identifier. 'L', 'R', 'U' and 'u'
// are absent from this list because they have dedicated cases above.
4014 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
4015 case 'H': case 'I': case 'J': case 'K': case 'M': case 'N':
4016 case 'O': case 'P': case 'Q': case 'S': case 'T':
4017 case 'V': case 'W': case 'X': case 'Y': case 'Z':
4018 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
4019 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
4020 case 'o': case 'p': case 'q': case 'r': case 's': case 't':
4021 case 'v': case 'w': case 'x': case 'y': case 'z':
4022 case '_':
4023
4025 return LexIdentifierContinue(Result, CurPtr);
4026
// '$' starts an identifier only when DollarIdents is enabled.
4027 case '$':
4028 if (LangOpts.DollarIdents) {
4030 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
4031
4033 return LexIdentifierContinue(Result, CurPtr);
4034 }
4035
4036 Kind = tok::unknown;
4037 break;
4038
4039
4040 case '\'':
4041
4043 return LexCharConstant(Result, CurPtr, tok::char_constant);
4044
4045
4046 case '"':
4047
4049 return LexStringLiteral(Result, CurPtr,
4051 : tok::string_literal);
4052
4053
// Single-character punctuators.
4054 case '?':
4055 Kind = tok::question;
4056 break;
4057 case '[':
4058 Kind = tok::l_square;
4059 break;
4060 case ']':
4061 Kind = tok::r_square;
4062 break;
4063 case '(':
4064 Kind = tok::l_paren;
4065 break;
4066 case ')':
4067 Kind = tok::r_paren;
4068 break;
4069 case '{':
4070 Kind = tok::l_brace;
4071 break;
4072 case '}':
4073 Kind = tok::r_brace;
4074 break;
// '.' followed by a digit is a numeric constant; ".*" is recognized only in
// C++; "..." is the ellipsis.
4075 case '.':
4076 Char = getCharAndSize(CurPtr, SizeTmp);
4077 if (Char >= '0' && Char <= '9') {
4078
4080
4081 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
4082 } else if (LangOpts.CPlusPlus && Char == '*') {
4083 Kind = tok::periodstar;
4084 CurPtr += SizeTmp;
4085 } else if (Char == '.' &&
4086 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
4087 Kind = tok::ellipsis;
4088 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4090 } else {
4091 Kind = tok::period;
4092 }
4093 break;
4094 case '&':
4095 Char = getCharAndSize(CurPtr, SizeTmp);
4096 if (Char == '&') {
4097 Kind = tok::ampamp;
4098 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4099 } else if (Char == '=') {
4100 Kind = tok::ampequal;
4101 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4102 } else {
4103 Kind = tok::amp;
4104 }
4105 break;
4106 case '*':
4107 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4108 Kind = tok::starequal;
4109 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4110 } else {
4111 Kind = tok::star;
4112 }
4113 break;
4114 case '+':
4115 Char = getCharAndSize(CurPtr, SizeTmp);
4116 if (Char == '+') {
4117 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4118 Kind = tok::plusplus;
4119 } else if (Char == '=') {
4120 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4121 Kind = tok::plusequal;
4122 } else {
4123 Kind = tok::plus;
4124 }
4125 break;
// "->*" is checked before "->" and is recognized only in C++.
4126 case '-':
4127 Char = getCharAndSize(CurPtr, SizeTmp);
4128 if (Char == '-') {
4129 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4130 Kind = tok::minusminus;
4131 } else if (Char == '>' && LangOpts.CPlusPlus &&
4132 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {
4133 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4135 Kind = tok::arrowstar;
4136 } else if (Char == '>') {
4137 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4138 Kind = tok::arrow;
4139 } else if (Char == '=') {
4140 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4141 Kind = tok::minusequal;
4142 } else {
4143 Kind = tok::minus;
4144 }
4145 break;
4146 case '~':
4147 Kind = tok::tilde;
4148 break;
4149 case '!':
4150 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4151 Kind = tok::exclaimequal;
4152 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4153 } else {
4154 Kind = tok::exclaim;
4155 }
4156 break;
// '/' may start a line comment, a block comment, "/=" or be a plain slash.
4157 case '/':
4158
4159 Char = getCharAndSize(CurPtr, SizeTmp);
4160 if (Char == '/') {
4161
4162
4163
4164
4165
4166
4167
4168
// "//" is a comment when line comments are enabled (and, outside C++, not in
// traditional-cpp mode). If that test fails, it is still treated as a
// comment unless the character after "//" is '*'.
// NOTE(review): part of the second condition is missing from this extract.
4169 bool TreatAsComment =
4170 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4171 if (!TreatAsComment)
4173 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4174
4175 if (TreatAsComment) {
4176 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4177 TokAtPhysicalStartOfLine))
4178 return true;
4179
4180
4181
4182
4183 goto SkipIgnoredUnits;
4184 }
4185 }
4186
4187 if (Char == '*') {
4188 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4189 TokAtPhysicalStartOfLine))
4190 return true;
4191
4192
4193
4194 goto LexNextToken;
4195 }
4196
4197 if (Char == '=') {
4198 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4199 Kind = tok::slashequal;
4200 } else {
4201 Kind = tok::slash;
4202 }
4203 break;
// '%' handles "%=" plus the digraphs "%>" ('}'), "%:" ('#'), "%:%:" ('##')
// and, with Microsoft extensions, "%:@" ('#@').
4204 case '%':
4205 Char = getCharAndSize(CurPtr, SizeTmp);
4206 if (Char == '=') {
4207 Kind = tok::percentequal;
4208 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4209 } else if (LangOpts.Digraphs && Char == '>') {
4210 Kind = tok::r_brace;
4211 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4212 } else if (LangOpts.Digraphs && Char == ':') {
4213 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4214 Char = getCharAndSize(CurPtr, SizeTmp);
4215 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4216 Kind = tok::hashhash;
4217 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4219 } else if (Char == '@' && LangOpts.MicrosoftExt) {
4220 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4222 Diag(BufferPtr, diag::ext_charize_microsoft);
4223 Kind = tok::hashat;
4224 } else {
4225
4226
4227
4228
// A "%:" at the physical start of a line (outside a _Pragma lexer) is routed
// to HandleDirective.
// NOTE(review): one operand of this condition is missing from the extract.
4229 if (TokAtPhysicalStartOfLine && && !Is_PragmaLexer)
4230 goto HandleDirective;
4231
4232 Kind = tok::hash;
4233 }
4234 } else {
4235 Kind = tok::percent;
4236 }
4237 break;
// '<' handles header names, "<<", "<<=", "<=", "<=>", conflict markers, the
// CUDA "<<<" token, the digraphs "<:" ('[') and "<%" ('{'), and editor
// placeholders ("<#").
4238 case '<':
4239 Char = getCharAndSize(CurPtr, SizeTmp);
4241 return LexAngledStringLiteral(Result, CurPtr);
4242 } else if (Char == '<') {
4243 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4244 if (After == '=') {
4245 Kind = tok::lesslessequal;
4246 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4248 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4249
4250
4251 goto LexNextToken;
4252 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4253
4254
4255 goto LexNextToken;
4256 } else if (LangOpts.CUDA && After == '<') {
4257 Kind = tok::lesslessless;
4258 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4260 } else {
4261 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4262 Kind = tok::lessless;
4263 }
4264 } else if (Char == '=') {
4265 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4266 if (After == '>') {
// "<=>" becomes tok::spaceship only in C++20; otherwise a compatibility
// diagnostic is issued and lexing falls through to "<=".
4267 if (LangOpts.CPlusPlus20) {
4269 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4270 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4272 Kind = tok::spaceship;
4273 break;
4274 }
4275
4276
4278 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4281 }
4282 }
4283 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4284 Kind = tok::lessequal;
4285 } else if (LangOpts.Digraphs && Char == ':') {
// C++11 special case: "<::" not followed by ':' or '>' lexes as a plain '<'
// rather than the "<:" digraph.
4286 if (LangOpts.CPlusPlus11 &&
4287 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4288
4289
4290
4291
4292
4293 unsigned SizeTmp3;
4294 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4295 if (After != ':' && After != '>') {
4296 Kind = tok::less;
4298 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4299 break;
4300 }
4301 }
4302
4303 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4304 Kind = tok::l_square;
4305 } else if (LangOpts.Digraphs && Char == '%') {
4306 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4307 Kind = tok::l_brace;
4308 } else if (Char == '#' && SizeTmp == 1 &&
4309 lexEditorPlaceholder(Result, CurPtr)) {
4310 return true;
4311 } else {
4312 Kind = tok::less;
4313 }
4314 break;
// '>' mirrors '<': ">=", ">>", ">>=", conflict markers and CUDA ">>>".
4315 case '>':
4316 Char = getCharAndSize(CurPtr, SizeTmp);
4317 if (Char == '=') {
4318 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4319 Kind = tok::greaterequal;
4320 } else if (Char == '>') {
4321 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4322 if (After == '=') {
4323 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4325 Kind = tok::greatergreaterequal;
4326 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4327
4328
4329 goto LexNextToken;
4330 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4331
4332 goto LexNextToken;
4333 } else if (LangOpts.CUDA && After == '>') {
4334 Kind = tok::greatergreatergreater;
4335 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4337 } else {
4338 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4339 Kind = tok::greatergreater;
4340 }
4341 } else {
4342 Kind = tok::greater;
4343 }
4344 break;
4345 case '^':
4346 Char = getCharAndSize(CurPtr, SizeTmp);
4347 if (Char == '=') {
4348 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4349 Kind = tok::caretequal;
4350 } else {
// OpenCL diagnoses "^^" but the token formed is still a single caret.
4351 if (LangOpts.OpenCL && Char == '^')
4352 Diag(CurPtr, diag::err_opencl_logical_exclusive_or);
4353 Kind = tok::caret;
4354 }
4355 break;
4356 case '|':
4357 Char = getCharAndSize(CurPtr, SizeTmp);
4358 if (Char == '=') {
4359 Kind = tok::pipeequal;
4360 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4361 } else if (Char == '|') {
4362
// Three '|' in a row may be the end of a conflict marker.
4363 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4364 goto LexNextToken;
4365 Kind = tok::pipepipe;
4366 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4367 } else {
4368 Kind = tok::pipe;
4369 }
4370 break;
// ':' handles the ":>" digraph (']') and "::".
4371 case ':':
4372 Char = getCharAndSize(CurPtr, SizeTmp);
4373 if (LangOpts.Digraphs && Char == '>') {
4374 Kind = tok::r_square;
4375 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4376 } else if (Char == ':') {
4377 Kind = tok::coloncolon;
4378 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4379 } else {
4380 Kind = tok::colon;
4381 }
4382 break;
4383 case ';':
4384 Kind = tok::semi;
4385 break;
4386 case '=':
4387 Char = getCharAndSize(CurPtr, SizeTmp);
4388 if (Char == '=') {
4389
// Three '=' in a row may be the end of a conflict marker.
4390 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4391 goto LexNextToken;
4392
4393 Kind = tok::equalequal;
4394 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4395 } else {
4396 Kind = tok::equal;
4397 }
4398 break;
4399 case ',':
4400 Kind = tok::comma;
4401 break;
// '#' handles "##", the Microsoft "#@" charize operator, and a directive
// introducer at the physical start of a line.
4402 case '#':
4403 Char = getCharAndSize(CurPtr, SizeTmp);
4404 if (Char == '#') {
4405 Kind = tok::hashhash;
4406 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4407 } else if (Char == '@' && LangOpts.MicrosoftExt) {
4408 Kind = tok::hashat;
4410 Diag(BufferPtr, diag::ext_charize_microsoft);
4411 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4412 } else {
4413
4414
4415
4416
// NOTE(review): one operand of this condition is missing from the extract.
4417 if (TokAtPhysicalStartOfLine && && !Is_PragmaLexer)
4418 goto HandleDirective;
4419
4420 Kind = tok::hash;
4421 }
4422 break;
4423
4424 case '@':
4425
// '@' is a token (tok::at) only when ObjC is enabled and the raw byte just
// before CurPtr is '@'; otherwise it is unknown.
4426 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4427 Kind = tok::at;
4428 else
4429 Kind = tok::unknown;
4430 break;
4431
4432
// A backslash may begin a UCN (not in assembler-with-cpp mode). A UCN that
// passes CheckUnicodeWhitespace is skipped as whitespace; any other valid
// UCN is handed to LexUnicodeIdentifierStart.
4433 case '\\':
4434 if (!LangOpts.AsmPreprocessor) {
4435 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4436 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4437 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4438 return true;
4439
4440
4441
4442 goto LexNextToken;
4443 }
4444
4445 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4446 }
4447 }
4448
4449 Kind = tok::unknown;
4450 break;
4451
// Everything else: either an unknown token or the lead byte of a UTF-8
// sequence that is decoded below.
// NOTE(review): the conditions opening the two early "unknown" exits in this
// case are missing from the extract.
4452 default: {
4454 Kind = tok::unknown;
4455 break;
4456 }
4457
4458 llvm::UTF32 CodePoint;
4459
4460
4461
// Step back onto the lead byte so the decoder sees the whole sequence; on
// success the decoder advances CurPtr past it.
4462 --CurPtr;
4463 llvm::ConversionResult Status =
4464 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4465 (const llvm::UTF8 *)BufferEnd,
4466 &CodePoint,
4467 llvm::strictConversion);
4468 if (Status == llvm::conversionOK) {
4469 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4470 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4471 return true;
4472
4473
4474
4475 goto LexNextToken;
4476 }
4477 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4478 }
4479
4482 ++CurPtr;
4483 Kind = tok::unknown;
4484 break;
4485 }
4486
4487
4488
4489
// Invalid UTF-8: diagnose, drop the single offending byte and lex again.
4490 Diag(CurPtr, diag::err_invalid_utf8);
4491
4492 BufferPtr = CurPtr+1;
4493
4494
4495
4496 goto LexNextToken;
4497 }
4498 }
4499
4500
4502
4503
// Common exit for every case that ended in `break`: form the token from
// BufferPtr up to CurPtr with the kind chosen in the switch.
4504 FormTokenWithChars(Result, CurPtr, Kind);
4505 return true;
4506
4507HandleDirective:
4508
4509
// The introducer is formed as a tok::hash token.
// NOTE(review): the lines between here and the two returns below are
// missing, so the condition separating them is not visible.
4510 FormTokenWithChars(Result, CurPtr, tok::hash);
4512
4514
4515 return true;
4516
4517
4518 return false;
4519
// Restart lexing from the top after input that produced no token.
4520LexNextToken:
4522 goto LexStart;
4523}
4524
/// Starts a fresh token in Result for the pre-scanned token DDTok and moves
/// the lexer past it.
///
/// The token's spelling starts at BufferStart + DDTok.Offset; BufferPtr is
/// set to that start plus DDTok.Length.
///
/// \returns a pointer to the first character of the token in the buffer.
///
/// NOTE(review): the parameter list and the statements that populate Result
/// after startToken() are missing from this extract.
4525const char *Lexer::convertDependencyDirectiveToken(
4527 const char *TokPtr = BufferStart + DDTok.Offset;
4528 Result.startToken();
4533 BufferPtr = TokPtr + DDTok.Length;
4534 return TokPtr;
4535}
4536
/// Produces the next token from the pre-scanned dependency directives
/// (DepDirectives) instead of lexing characters from the buffer.
///
/// Only valid on a dependency-directives lexer (asserted). Returns false for
/// a '#' token at the start of a line, and otherwise true, except where the
/// result of LexEndOfFile is returned.
///
/// NOTE(review): several lines (the DDTok declaration, some conditions and
/// calls) are missing from this extract; comments cover only what is visible.
4537bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4538 assert(isDependencyDirectivesLexer());
4539
4540 using namespace dependency_directives_scan;
4541
// Move on to the next directive once every token of the current one has
// been handed out; a pp_eof directive ends the file.
4542 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4543 if (DepDirectives.front().Kind == pp_eof)
4544 return LexEndOfFile(Result, BufferEnd);
4545 if (DepDirectives.front().Kind == tokens_present_before_eof)
4547 NextDepDirectiveTokenIndex = 0;
4548 DepDirectives = DepDirectives.drop_front();
4549 }
4550
4552 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4553 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4554
4556 }
4557
// Header-name path: re-lex from the token's '<' as an angled string.
// NOTE(review): the condition opening this branch is not visible here.
4559 BufferPtr = BufferStart + DDTok.Offset;
4560 LexAngledStringLiteral(Result, BufferPtr + 1);
4561 if (Result.isNot(tok::header_name))
4562 return true;
4563
// Skip every pre-scanned token that starts before the new BufferPtr, i.e.
// those covered by the header name just lexed.
4564 while (true) {
4566 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4567 if (BufferStart + NextTok.Offset >= BufferPtr)
4568 break;
4569 ++NextDepDirectiveTokenIndex;
4570 }
4571 return true;
4572 }
4573
4574 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4575
// A '#' at the start of a line yields false rather than a token.
4576 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4578 return false;
4579 }
// Raw identifiers and literals keep a pointer to their spelling in the
// buffer.
4580 if (Result.is(tok::raw_identifier)) {
4581 Result.setRawIdentifierData(TokPtr);
4586 }
4587 return true;
4588 }
4589 if (Result.isLiteral()) {
4590 Result.setLiteralData(TokPtr);
4591 return true;
4592 }
4593 if (Result.is(tok::colon)) {
4594
// Two adjacent ':' tokens are merged into a single '::' token here.
4595 if (*BufferPtr == ':') {
4596 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4597 tok::colon));
4598 ++NextDepDirectiveTokenIndex;
4599 Result.setKind(tok::coloncolon);
4600 }
4601 return true;
4602 }
4603 if (Result.is(tok::eod))
4605
4606 return true;
4607}
4608
/// Advances through the pre-scanned dependency directives while skipping,
/// stopping at the first directive that sets Stop at nesting depth zero.
///
/// NestedIfs counts conditionals opened during the skip. Arms that find
/// NestedIfs == 0 set Stop; one arm decrements NestedIfs instead when it is
/// non-zero. When the loop stops, the directive's first token (asserted to
/// be '#') is converted into Result, NextDepDirectiveTokenIndex is set to 1,
/// and false is returned. One arm resets the token index and returns the
/// result of LexEndOfFile instead.
///
/// NOTE(review): every case label of the switch is missing from this
/// extract, so which directive kinds map to which arm cannot be confirmed.
4609bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4610 assert(isDependencyDirectivesLexer());
4611
4612 using namespace dependency_directives_scan;
4613
4614 bool Stop = false;
4615 unsigned NestedIfs = 0;
4616 do {
// The current directive is dropped before the next one is inspected.
4617 DepDirectives = DepDirectives.drop_front();
4618 switch (DepDirectives.front().Kind) {
4620 llvm_unreachable("unexpected 'pp_none'");
4639 break;
// Arm that opens a nested conditional.
4643 ++NestedIfs;
4644 break;
// Arm that stops only at the outermost level and leaves the depth unchanged.
4649 if (!NestedIfs) {
4650 Stop = true;
4651 }
4652 break;
// Arm that stops at the outermost level and otherwise closes one nesting
// level.
4654 if (!NestedIfs) {
4655 Stop = true;
4656 } else {
4657 --NestedIfs;
4658 }
4659 break;
// End-of-file arm.
4661 NextDepDirectiveTokenIndex = 0;
4662 return LexEndOfFile(Result, BufferEnd);
4663 }
4664 } while (!Stop);
4665
4667 DepDirectives.front().Tokens.front();
4668 assert(DDTok.is(tok::hash));
// Index 1 means the '#' has been consumed and the next token follows it.
4669 NextDepDirectiveTokenIndex = 1;
4670
4671 convertDependencyDirectiveToken(DDTok, Result);
4672 return false;
4673}
Defines the Diagnostic-related interfaces.
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
static void StringifyImpl(T &Str, char Quote)
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
static bool isUnicodeWhitespace(uint32_t Codepoint)
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Concrete class used by the front-end to report problems and issues.
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
bool isKeyword(const LangOptions &LangOpts) const
Return true if this token is a keyword in the specified language.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
void SetCommentRetentionState(bool Mode)
SetCommentRetentionState - Change the comment retention mode of the lexer to the specified mode.
static std::optional< Token > findPreviousToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments)
Finds the token that comes before the given location.
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
const char * getBufferLocation() const
Return the current location in the buffer.
bool Lex(Token &Result)
Lex - Return the next token in the file.
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments=false)
Finds the token that comes right after the given location.
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xxx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the tokens IdentifierInfo membe...
SourceManager & getSourceManager() const
EmptylineHandler * getEmptylineHandler() const
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
IdentifierTable & getIdentifierTable()
const LangOptions & getLangOpts() const
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
DiagnosticsEngine & getDiagnostics() const
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
void setBegin(SourceLocation b)
SourceLocation getEnd() const
SourceLocation getBegin() const
void setEnd(SourceLocation e)
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
bool isMacroArgExpansion() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
IdentifierInfo * getIdentifierInfo() const
bool hasUCN() const
Returns true if this token contains a universal character name.
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
unsigned getLength() const
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...}".
tok::TokenKind getKind() const
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const
Determine whether the token kind starts a simple-type-specifier.
void startToken()
Reset all flags to cleared.
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
void setFlag(TokenFlags Flag)
Set the specified flag.
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ pp_pragma_system_header
@ pp_pragma_include_alias
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
@ CMK_None
Not within a conflict marker.
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
@ Result
The result type of a method or function.
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].
const FunctionProtoType * T
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + ...)
#define _mm_cmpistri(A, B, M)
Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...
#define _SIDD_LEAST_SIGNIFICANT
#define _SIDD_NEGATIVE_POLARITY
Represents a char and the number of bytes parsed to produce it.
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.
bool is(tok::TokenKind K) const