clang: lib/Lex/Lexer.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MemoryBufferRef.h"
36#include "llvm/Support/NativeFormatting.h"
37#include "llvm/Support/Unicode.h"
38#include "llvm/Support/UnicodeCharRanges.h"
39#include
40#include
41#include
42#include
43#include
44#include
45#include
46#include
47#include
48
49#ifdef __SSE4_2__
50#include <nmmintrin.h>
51#endif
52
53using namespace clang;
54
55
56
57
58
59
62 return false;
64 return II->getObjCKeywordID() == objcKey;
65 return false;
66}
67
68
71 return tok::objc_not_keyword;
73 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
74}
75
76
79 case tok::annot_typename:
80 case tok::annot_decltype:
81 case tok::annot_pack_indexing_type:
82 return true;
83
84 case tok::kw_short:
85 case tok::kw_long:
86 case tok::kw___int64:
87 case tok::kw___int128:
88 case tok::kw_signed:
89 case tok::kw_unsigned:
90 case tok::kw_void:
91 case tok::kw_char:
92 case tok::kw_int:
93 case tok::kw_half:
94 case tok::kw_float:
95 case tok::kw_double:
96 case tok::kw___bf16:
97 case tok::kw__Float16:
98 case tok::kw___float128:
99 case tok::kw___ibm128:
100 case tok::kw_wchar_t:
101 case tok::kw_bool:
102 case tok::kw__Bool:
103 case tok::kw__Accum:
104 case tok::kw__Fract:
105 case tok::kw__Sat:
106#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
107#include "clang/Basic/TransformTypeTraits.def"
108 case tok::kw___auto_type:
109 case tok::kw_char16_t:
110 case tok::kw_char32_t:
111 case tok::kw_typeof:
112 case tok::kw_decltype:
113 case tok::kw_char8_t:
115
116 default:
117 return false;
118 }
119}
120
121
122
123
124
125void Lexer::anchor() {}
126
// Shared initialisation for a Lexer: records the buffer bounds and the
// starting position, skips a leading UTF-8 byte-order mark when lexing begins
// at the very start of the buffer, and resets the per-lexer state flags.
//
// BufStart - first byte of the buffer being lexed.
// BufPtr   - position at which lexing should begin.
// BufEnd   - one past the last byte of content; BufEnd[0] must be a NUL
//            character (asserted below).
//
// NOTE(review): listing numbers 162, 165 and 171 are absent from this text, so
// some initialisation statements of the original function may be missing here.
127void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
128 const char *BufEnd) {
129 BufferStart = BufStart;
130 BufferPtr = BufPtr;
131 BufferEnd = BufEnd;
132
133 assert(BufEnd[0] == 0 &&
134 "We assume that the input buffer has a null character at the end"
135 " to simplify lexing!");
136
137
138
139
// Only look for a byte-order mark when lexing starts at the first byte of the
// buffer.
140 if (BufferStart == BufferPtr) {
141
142 StringRef Buf(BufferStart, BufferEnd - BufferStart);
// "\xEF\xBB\xBF" is the three-byte UTF-8 BOM; any other prefix yields 0.
143 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
144 .StartsWith("\xEF\xBB\xBF", 3)
145 .Default(0);
146
147
// Step over the BOM (a no-op when BOMLength is 0).
148 BufferPtr += BOMLength;
149 }
150
151 Is_PragmaLexer = false;
152 CurrentConflictMarkerState = CMK_None;
153
154
// A freshly initialised lexer is positioned at the start of a line.
155 IsAtStartOfLine = true;
156 IsAtPhysicalStartOfLine = true;
157
158 HasLeadingSpace = false;
159 HasLeadingEmptyMacro = false;
160
161
163
164
166
167
168
169
170
172
173
174 ExtendedTokenMode = 0;
175
176 NewLinePtr = nullptr;
177}
178
179
180
181
182
186 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
187 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
188 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
189 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
190 InputFile.getBufferEnd());
191
193}
194
195
196
197
199 const char *BufStart, const char *BufPtr, const char *BufEnd,
200 bool IsFirstIncludeOfFile)
201 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
202 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
203 InitLexer(BufStart, BufPtr, BufEnd);
204
205
207}
208
209
210
211
214 bool IsFirstIncludeOfFile)
215 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
216 FromFile.getBufferStart(), FromFile.getBufferEnd(),
217 IsFirstIncludeOfFile) {}
218
220 assert(PP && "Cannot reset token mode without a preprocessor");
221 if (LangOpts.TraditionalCPP)
223 else
225}
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
247
248
249 FileID SpellingFID = SM.getFileID(SpellingLoc);
250 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
251 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
252
253
254
255
256 const char *StrData = SM.getCharacterData(SpellingLoc);
257
258 L->BufferPtr = StrData;
259 L->BufferEnd = StrData+TokLen;
260 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
261
262
263
264 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
265 ExpansionLocStart,
266 ExpansionLocEnd, TokLen);
267
268
269
271
272
273 L->Is_PragmaLexer = true;
274 return L;
275}
276
277void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
278 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
279 this->IsAtStartOfLine = IsAtStartOfLine;
280 assert((BufferStart + Offset) <= BufferEnd);
281 BufferPtr = BufferStart + Offset;
282}
283
/// Escapes \p Str in place so that it can be placed inside a quoted literal.
///
/// - Every backslash and every occurrence of \p Quote gets a backslash
///   inserted in front of it.
/// - Every line ending is replaced by the two characters `\` and `n`. A
///   two-character line ending made of '\n' and '\r' in either order counts as
///   a single line ending; two identical newline characters in a row are
///   treated as two separate line endings.
///
/// \tparam T    A character container providing size(), operator[], begin()
///              and insert(iterator, char), e.g. std::string.
/// \param Str   The text to escape; modified in place.
/// \param Quote The quote character that must be escaped.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      // Insert the escaping backslash, then step over both it and the
      // character it protects.
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // `i < e` holds here, so `e - 1` cannot wrap around.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        // Two-character line ending: overwrite both characters with "\n".
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Single-character line ending: overwrite it with '\' and insert 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}
308
310 std::string Result = std::string(Str);
311 char Quote = Charify ? '\'' : '"';
314}
315
317
318
319
320
321
322
323
325 const LangOptions &LangOpts, char *Spelling) {
326 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
327
328 size_t Length = 0;
329 const char *BufEnd = BufPtr + Tok.getLength();
330
332
333 while (BufPtr < BufEnd) {
335 Spelling[Length++] = CharAndSize.Char;
336 BufPtr += CharAndSize.Size;
337
338 if (Spelling[Length - 1] == '"')
339 break;
340 }
341
342
343
344
345 if (Length >= 2 &&
346 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
347
348
349 const char *RawEnd = BufEnd;
350 do --RawEnd; while (*RawEnd != '"');
351 size_t RawLength = RawEnd - BufPtr + 1;
352
353
354 memcpy(Spelling + Length, BufPtr, RawLength);
355 Length += RawLength;
356 BufPtr += RawLength;
357
358
359 }
360 }
361
362 while (BufPtr < BufEnd) {
364 Spelling[Length++] = CharAndSize.Char;
365 BufPtr += CharAndSize.Size;
366 }
367
368 assert(Length < Tok.getLength() &&
369 "NeedsCleaning flag set on token that didn't need cleaning!");
370 return Length;
371}
372
373
374
375
376
377
382 bool *invalid) {
383
385
386
387 bool invalidTemp = false;
388 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
389 if (invalidTemp) {
390 if (invalid) *invalid = true;
391 return {};
392 }
393
394 const char *tokenBegin = file.data() + locInfo.second;
395
396
397 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
398 file.begin(), tokenBegin, file.end());
401
403
404
406 return StringRef(tokenBegin, length);
407
408
409 buffer.resize(length);
411 return StringRef(buffer.data(), buffer.size());
412}
413
414
415
416
417
418
421 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
422
423 bool CharDataInvalid = false;
424 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
425 &CharDataInvalid);
427 *Invalid = CharDataInvalid;
428 if (CharDataInvalid)
429 return {};
430
431
432 if (.needsCleaning())
433 return std::string(TokStart, TokStart + Tok.getLength());
434
439}
440
441
442
443
444
445
446
447
448
449
450
454 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
455
456 const char *TokStart = nullptr;
457
458 if (Tok.is(tok::raw_identifier))
459 TokStart = Tok.getRawIdentifier().data();
460 else if (.hasUCN()) {
462
463 Buffer = II->getNameStart();
464 return II->getLength();
465 }
466 }
467
468
469 if (Tok.isLiteral())
470 TokStart = Tok.getLiteralData();
471
472 if (!TokStart) {
473
474 bool CharDataInvalid = false;
475 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
477 *Invalid = CharDataInvalid;
478 if (CharDataInvalid) {
479 Buffer = "";
480 return 0;
481 }
482 }
483
484
485 if (.needsCleaning()) {
486 Buffer = TokStart;
487 return Tok.getLength();
488 }
489
490
491 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
492}
493
494
495
496
497
506
507
508
512 bool IgnoreWhiteSpace) {
513
514
515
516
517
518
519
520
521 Loc = SM.getExpansionLoc(Loc);
524 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
526 return true;
527
528 const char *StrData = Buffer.data()+LocInfo.second;
529
530 if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
531 return true;
532
533
534 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
535 Buffer.begin(), StrData, Buffer.end());
538 return false;
539}
540
541
542
544 const char *BufStart = Buffer.data();
545 if (Offset >= Buffer.size())
546 return nullptr;
547
548 const char *LexStart = BufStart + Offset;
549 for (; LexStart != BufStart; --LexStart) {
552
553 ++LexStart;
554 break;
555 }
556 }
557 return LexStart;
558}
559
565 if (LocInfo.first.isInvalid())
566 return Loc;
567
569 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
571 return Loc;
572
573
574
575 const char *StrData = Buffer.data() + LocInfo.second;
577 if (!LexStart || LexStart == StrData)
578 return Loc;
579
580
582 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
583 Buffer.end());
585
586
588 do {
590
592
593
594
597
598
599
600 break;
601 }
602 } while (TheTok.getKind() != tok::eof);
603
604
605 return Loc;
606}
607
613
614 if (.isMacroArgExpansion(Loc))
615 return Loc;
616
620 FileIDAndOffset BeginFileLocInfo = SM.getDecomposedLoc(BeginFileLoc);
621 assert(FileLocInfo.first == BeginFileLocInfo.first &&
622 FileLocInfo.second >= BeginFileLocInfo.second);
623 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
624}
625
namespace {

/// Classification of a preprocessor directive keyword, used by the keyword
/// table further down in this file: every directive name listed there maps to
/// PDK_Skipped, and any other keyword falls back to PDK_Unknown.
enum PreambleDirectiveKind {
  /// The directive name is one of those listed in the keyword table.
  PDK_Skipped,
  /// The keyword is not in the table.
  PDK_Unknown
};

} // end anonymous namespace
634
637 unsigned MaxLines) {
638
639
640
643 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
644 Buffer.end());
646
647 bool InPreprocessorDirective = false;
650
651 unsigned MaxLineOffset = 0;
652 if (MaxLines) {
653 const char *CurPtr = Buffer.begin();
654 unsigned CurLine = 0;
655 while (CurPtr != Buffer.end()) {
656 char ch = *CurPtr++;
657 if (ch == '\n') {
658 ++CurLine;
659 if (CurLine == MaxLines)
660 break;
661 }
662 }
663 if (CurPtr != Buffer.end())
664 MaxLineOffset = CurPtr - Buffer.begin();
665 }
666
667 do {
669
670 if (InPreprocessorDirective) {
671
672 if (TheTok.getKind() == tok::eof) {
673 break;
674 }
675
676
677
679 continue;
680
681
682
683 InPreprocessorDirective = false;
684 }
685
686
689
690
691
692 if (MaxLineOffset && TokOffset >= MaxLineOffset)
693 break;
694 }
695
696
697 if (TheTok.getKind() == tok::comment) {
698 if (ActiveCommentLoc.isInvalid())
700 continue;
701 }
702
704
705 Token HashTok = TheTok;
706 InPreprocessorDirective = true;
708
709
710
711
715 PreambleDirectiveKind PDK
716 = llvm::StringSwitch(Keyword)
717 .Case("include", PDK_Skipped)
718 .Case("__include_macros", PDK_Skipped)
719 .Case("define", PDK_Skipped)
720 .Case("undef", PDK_Skipped)
721 .Case("line", PDK_Skipped)
722 .Case("error", PDK_Skipped)
723 .Case("pragma", PDK_Skipped)
724 .Case("import", PDK_Skipped)
725 .Case("include_next", PDK_Skipped)
726 .Case("warning", PDK_Skipped)
727 .Case("ident", PDK_Skipped)
728 .Case("sccs", PDK_Skipped)
729 .Case("assert", PDK_Skipped)
730 .Case("unassert", PDK_Skipped)
731 .Case("if", PDK_Skipped)
732 .Case("ifdef", PDK_Skipped)
733 .Case("ifndef", PDK_Skipped)
734 .Case("elif", PDK_Skipped)
735 .Case("elifdef", PDK_Skipped)
736 .Case("elifndef", PDK_Skipped)
737 .Case("else", PDK_Skipped)
738 .Case("endif", PDK_Skipped)
739 .Default(PDK_Unknown);
740
741 switch (PDK) {
742 case PDK_Skipped:
743 continue;
744
745 case PDK_Unknown:
746
747 break;
748 }
749 }
750
751
752
753
754 TheTok = HashTok;
756 TheTok.getKind() == tok::raw_identifier &&
758 LangOpts.CPlusPlusModules) {
759
760
761 Token ModuleTok = TheTok;
762 do {
764 } while (TheTok.getKind() == tok::comment);
765 if (TheTok.getKind() != tok::semi) {
766
767 TheTok = ModuleTok;
768 break;
769 }
770 continue;
771 }
772
773
774
775
776 break;
777 } while (true);
778
780 if (ActiveCommentLoc.isValid())
781 End = ActiveCommentLoc;
782 else
784
787}
788
792
793
794
796 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
797
798
799 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
800 return 0;
801
802 unsigned PhysOffset = 0;
803
804
805
806
807 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
808 if (CharNo == 0)
809 return PhysOffset;
810 ++TokPtr;
811 --CharNo;
812 ++PhysOffset;
813 }
814
815
816
817 for (; CharNo; --CharNo) {
819 TokPtr += CharAndSize.Size;
820 PhysOffset += CharAndSize.Size;
821 }
822
823
824
825
826
827 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
828 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
829
830 return PhysOffset;
831}
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
852 return {};
853
856 return {};
857 }
858
860 if (Len > Offset)
861 Len = Len - Offset;
862 else
863 return Loc;
864
866}
867
868
869
874 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
875
877 if (.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
878 return false;
879
880 if (expansionLoc.isFileID()) {
881
882 if (MacroBegin)
883 *MacroBegin = expansionLoc;
884 return true;
885 }
886
888}
889
890
891
896 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
897
900 if (tokLen == 0)
901 return false;
902
905 if (.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
906 return false;
907
908 if (expansionLoc.isFileID()) {
909
910 if (MacroEnd)
911 *MacroEnd = expansionLoc;
912 return true;
913 }
914
916}
917
924 if (Range.isTokenRange()) {
927 return {};
928 }
929
930
931 auto [FID, BeginOffs] = SM.getDecomposedLoc(Begin);
932 if (FID.isInvalid())
933 return {};
934
935 unsigned EndOffs;
936 if (.isInFileID(End, FID, &EndOffs) ||
937 BeginOffs > EndOffs)
938 return {};
939
941}
942
943
946 return SM.getSLocEntry(SM.getFileID(Loc))
947 .getExpansion()
948 .isExpansionTokenRange();
949}
950
957 return {};
958
961
964 return {};
965 Range.setBegin(Begin);
967 }
968
970 if (Range.isTokenRange()) {
972 return {};
973
976 return {};
977 Range.setEnd(End);
979 }
980
985 &MacroEnd)) ||
987 &MacroEnd)))) {
988 Range.setBegin(MacroBegin);
989 Range.setEnd(MacroEnd);
990
991 if (Range.isTokenRange())
994 }
995
1000 return {};
1001
1006 return {};
1007
1011 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
1012 Range.setEnd(SM.getImmediateSpellingLoc(End));
1014 }
1015 }
1016
1017 return {};
1018}
1019
1025 if (Range.isInvalid()) {
1027 return {};
1028 }
1029
1030
1031 FileIDAndOffset beginInfo = SM.getDecomposedLoc(Range.getBegin());
1032 if (beginInfo.first.isInvalid()) {
1034 return {};
1035 }
1036
1037 unsigned EndOffs;
1038 if (.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
1039 beginInfo.second > EndOffs) {
1041 return {};
1042 }
1043
1044
1045 bool invalidTemp = false;
1046 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1047 if (invalidTemp) {
1049 return {};
1050 }
1051
1053 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1054}
1055
1059 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1060
1061
1062 while (true) {
1068 break;
1069
1070
1071
1072
1073
1074
1075 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1078 break;
1079
1080
1081
1082 FileID MacroFID = SM.getFileID(Loc);
1083 if (SM.isInFileID(SpellLoc, MacroFID))
1084 break;
1085
1086
1087 Loc = SpellLoc;
1088 }
1089
1090
1091
1092
1093 Loc = SM.getSpellingLoc(Loc);
1094
1095
1096
1099 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1100 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1101}
1102
1105 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1106
1107 while (SM.isMacroArgExpansion(Loc))
1108 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1109
1110
1111
1112
1114 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1115 return {};
1116
1117
1118
1119
1120 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1121
1122
1123
1126 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1127 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1128}
1129
1133
1136 if (Str - 1 < BufferStart)
1137 return false;
1138
1139 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1140 (Str[0] == '\r' && Str[-1] == '\n')) {
1141 if (Str - 2 < BufferStart)
1142 return false;
1143 --Str;
1144 }
1145 --Str;
1146
1147
1149 --Str;
1150
1151 return *Str == '\\';
1152}
1153
1157 return {};
1159 if (LocInfo.first.isInvalid())
1160 return {};
1162 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1164 return {};
1167 return {};
1168 StringRef Rest = Buffer.substr(Line - Buffer.data());
1169 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1170 return NumWhitespaceChars == StringRef::npos
1171 ? ""
1172 : Rest.take_front(NumWhitespaceChars);
1173}
1174
1175
1176
1177
1178
1179
1180
1181
1182
1187 unsigned CharNo, unsigned TokLen) {
1188 assert(FileLoc.isMacroID() && "Must be a macro expansion");
1189
1190
1191
1192
1194
1195
1196
1199
1200
1201
1203
1204 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1205}
1206
1207
1208
1210 unsigned TokLen) const {
1211 assert(Loc >= BufferStart && Loc <= BufferEnd &&
1212 "Location out of range for this buffer!");
1213
1214
1215
1216 unsigned CharNo = Loc-BufferStart;
1217 if (FileLoc.isFileID())
1218 return FileLoc.getLocWithOffset(CharNo);
1219
1220
1221
1222 assert(PP && "This doesn't work on raw lexers");
1224}
1225
1226
1227
1231
1232
1233
1234
1235
1236
1237
1239 switch (Letter) {
1240 default: return 0;
1241 case '=': return '#';
1242 case ')': return ']';
1243 case '(': return '[';
1244 case '!': return '|';
1245 case '\'': return '^';
1246 case '>': return '}';
1247 case '/': return '\\';
1248 case '<': return '{';
1249 case '-': return '~';
1250 }
1251}
1252
1253
1254
1255
1256
1259 if (!Res)
1260 return Res;
1261
1262 if (!Trigraphs) {
1264 L->Diag(CP-2, diag::trigraph_ignored);
1265 return 0;
1266 }
1267
1269 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1270 return Res;
1271}
1272
1273
1274
1275
1277 unsigned Size = 0;
1279 ++Size;
1280
1281 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1282 continue;
1283
1284
1285 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1286 Ptr[Size-1] != Ptr[Size])
1287 ++Size;
1288
1289 return Size;
1290 }
1291
1292
1293 return 0;
1294}
1295
1296
1297
1298
// Advances P over consecutive escaped newlines and returns the first position
// that does not begin one. An escape is introduced either by a backslash or by
// the three characters "??/"; P is returned unchanged as soon as neither
// introducer is present or no newline follows the introducer.
//
// NOTE(review): the declaration of NewLineSize (listing line 1315) is missing
// from this text; presumably it holds the length of the newline sequence found
// at AfterEscape, with 0 meaning "no newline" -- confirm against the full file.
1299const char *Lexer::SkipEscapedNewLines(const char *P) {
1300 while (true) {
1301 const char *AfterEscape;
1302 if (*P == '\\') {
// Plain backslash: the candidate newline starts one character later.
1303 AfterEscape = P+1;
1304 } else if (*P == '?') {
1305
// Anything other than exactly "??/" is not an escape introducer.
1306 if (P[1] != '?' || P[2] != '/')
1307 return P;
1308
1309
// "??/" is three characters long, so the candidate newline starts at P+3.
1310 AfterEscape = P+3;
1311 } else {
1312 return P;
1313 }
1314
1316 if (NewLineSize == 0) return P;
// Consume the introducer plus the newline and look for another escape.
1317 P = AfterEscape+NewLineSize;
1318 }
1319}
1320
1324 bool IncludeComments) {
1327 return std::nullopt;
1328 }
1330
1331
1333
1334
1335 bool InvalidTemp = false;
1336 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1337 if (InvalidTemp)
1338 return std::nullopt;
1339
1340 const char *TokenBegin = File.data() + LocInfo.second;
1341
1342
1343 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1344 TokenBegin, File.end());
1346
1349 return Tok;
1350}
1351
1355 bool IncludeComments) {
1356 const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Loc));
1357 while (Loc != StartOfFile) {
1360 return std::nullopt;
1361
1365 continue;
1366 if (.is(tok::comment) || IncludeComments) {
1367 return Tok;
1368 }
1369 }
1370 return std::nullopt;
1371}
1372
1373
1374
1375
1376
1379 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1381 if ( || Tok->isNot(TKind))
1382 return {};
1384
1385
1386 unsigned NumWhitespaceChars = 0;
1387 if (SkipTrailingWhitespaceAndNewLine) {
1388 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1389 unsigned char C = *TokenEnd;
1391 C = *(++TokenEnd);
1392 NumWhitespaceChars++;
1393 }
1394
1395
1396 if (C == '\n' || C == '\r') {
1397 char PrevC = C;
1398 C = *(++TokenEnd);
1399 NumWhitespaceChars++;
1400 if ((C == '\n' || C == '\r') && C != PrevC)
1401 NumWhitespaceChars++;
1402 }
1403 }
1404
1405 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1406}
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1424 unsigned Size = 0;
1425
1426 if (Ptr[0] == '\\') {
1427 ++Size;
1428 ++Ptr;
1429Slash:
1430
1432 return {'\\', Size};
1433
1434
1435
1437
1439
1440
1442 Diag(Ptr, diag::backslash_newline_space);
1443
1444
1445 Size += EscapedNewLineSize;
1446 Ptr += EscapedNewLineSize;
1447
1448
1449 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1450 CharAndSize.Size += Size;
1451 return CharAndSize;
1452 }
1453
1454
1455 return {'\\', Size};
1456 }
1457
1458
1459 if (Ptr[0] == '?' && Ptr[1] == '?') {
1460
1461
1463 LangOpts.Trigraphs)) {
1464
1466
1467 Ptr += 3;
1469 if (C == '\\') goto Slash;
1471 }
1472 }
1473
1474
1475 return {*Ptr, Size + 1u};
1476}
1477
1478
1479
1480
1481
1482
1483
1484Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1486
1487 unsigned Size = 0;
1488
1489 if (Ptr[0] == '\\') {
1491 ++Ptr;
1492Slash:
1493
1495 return {'\\', Size};
1496
1497
1499
1500 Size += EscapedNewLineSize;
1501 Ptr += EscapedNewLineSize;
1502
1503
1504 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1505 CharAndSize.Size += Size;
1506 return CharAndSize;
1507 }
1508
1509
1510 return {'\\', Size};
1511 }
1512
1513
1514 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1515
1516
1518 Ptr += 3;
1520 if (C == '\\') goto Slash;
1522 }
1523 }
1524
1525
1526 return {*Ptr, Size + 1u};
1527}
1528
1529
1530
1531
1532
1533
1534void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1535 BufferPtr = BufferStart + Offset;
1536 if (BufferPtr > BufferEnd)
1537 BufferPtr = BufferEnd;
1538
1539
1540
1541 IsAtStartOfLine = StartOfLine;
1542 IsAtPhysicalStartOfLine = StartOfLine;
1543}
1544
1546 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1548 return UnicodeWhitespaceChars.contains(Codepoint);
1549}
1550
1553 llvm::raw_svector_ostream CharOS(CharBuf);
1554 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1555 return CharBuf;
1556}
1557
1558
1559
1560
1561
1562
1563
1565 bool IsStart, bool &IsExtension) {
1566 static const llvm::sys::UnicodeCharSet MathStartChars(
1568 static const llvm::sys::UnicodeCharSet MathContinueChars(
1570 if (MathStartChars.contains(C) ||
1571 (!IsStart && MathContinueChars.contains(C))) {
1572 IsExtension = true;
1573 return true;
1574 }
1575 return false;
1576}
1577
1579 bool &IsExtension) {
1580 if (LangOpts.AsmPreprocessor) {
1581 return false;
1582 } else if (LangOpts.DollarIdents && '$' == C) {
1583 return true;
1584 } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1585
1586
1587
1588
1589 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1590 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1591 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1592 return true;
1594 IsExtension);
1595 } else if (LangOpts.C11) {
1596 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1598 return C11AllowedIDChars.contains(C);
1599 } else {
1600 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1602 return C99AllowedIDChars.contains(C);
1603 }
1604}
1605
1607 bool &IsExtension) {
1608 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1609 IsExtension = false;
1610 if (LangOpts.AsmPreprocessor) {
1611 return false;
1612 }
1613 if (LangOpts.CPlusPlus || LangOpts.C23) {
1614 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1615 if (XIDStartChars.contains(C))
1616 return true;
1618 IsExtension);
1619 }
1621 return false;
1622 if (LangOpts.C11) {
1623 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1625 return !C11DisallowedInitialIDChars.contains(C);
1626 }
1627 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1629 return !C99DisallowedInitialIDChars.contains(C);
1630}
1631
1634
1635 static const llvm::sys::UnicodeCharSet MathStartChars(
1637 static const llvm::sys::UnicodeCharSet MathContinueChars(
1639
1640 (void)MathStartChars;
1641 (void)MathContinueChars;
1642 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1643 "Unexpected mathematical notation codepoint");
1644 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1646}
1647
1649 const char *End) {
1652}
1653
1656
1657 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1658 enum {
1659 CannotAppearInIdentifier = 0,
1660 CannotStartIdentifier
1661 };
1662
1663 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1665 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1667 if (!C99AllowedIDChars.contains(C)) {
1668 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1669 << Range
1670 << CannotAppearInIdentifier;
1671 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1672 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1673 << Range
1674 << CannotStartIdentifier;
1675 }
1676 }
1677}
1678
1679
1680
1681
1682
1685
1686 struct HomoglyphPair {
1687 uint32_t Character;
1688 char LooksLike;
1689 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1690 };
1691 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1692 {U'\u00ad', 0},
1693 {U'\u01c3', '!'},
1694 {U'\u037e', ';'},
1695 {U'\u200b', 0},
1696 {U'\u200c', 0},
1697 {U'\u200d', 0},
1698 {U'\u2060', 0},
1699 {U'\u2061', 0},
1700 {U'\u2062', 0},
1701 {U'\u2063', 0},
1702 {U'\u2064', 0},
1703 {U'\u2212', '-'},
1704 {U'\u2215', '/'},
1705 {U'\u2216', '\\'},
1706 {U'\u2217', '*'},
1707 {U'\u2223', '|'},
1708 {U'\u2227', '^'},
1709 {U'\u2236', ':'},
1710 {U'\u223c', '~'},
1711 {U'\ua789', ':'},
1712 {U'\ufeff', 0},
1713 {U'\uff01', '!'},
1714 {U'\uff03', '#'},
1715 {U'\uff04', '$'},
1716 {U'\uff05', '%'},
1717 {U'\uff06', '&'},
1718 {U'\uff08', '('},
1719 {U'\uff09', ')'},
1720 {U'\uff0a', '*'},
1721 {U'\uff0b', '+'},
1722 {U'\uff0c', ','},
1723 {U'\uff0d', '-'},
1724 {U'\uff0e', '.'},
1725 {U'\uff0f', '/'},
1726 {U'\uff1a', ':'},
1727 {U'\uff1b', ';'},
1728 {U'\uff1c', '<'},
1729 {U'\uff1d', '='},
1730 {U'\uff1e', '>'},
1731 {U'\uff1f', '?'},
1732 {U'\uff20', '@'},
1733 {U'\uff3b', '['},
1734 {U'\uff3c', '\\'},
1735 {U'\uff3d', ']'},
1736 {U'\uff3e', '^'},
1737 {U'\uff5b', '{'},
1738 {U'\uff5c', '|'},
1739 {U'\uff5d', '}'},
1740 {U'\uff5e', '~'},
1741 {0, 0}
1742 };
1743 auto Homoglyph =
1744 std::lower_bound(std::begin(SortedHomoglyphs),
1745 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1746 if (Homoglyph->Character == C) {
1747 if (Homoglyph->LooksLike) {
1748 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1749 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1751 } else {
1752 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1754 }
1755 }
1756}
1757
1762 return;
1763
1764 bool IsExtension;
1766 bool IsIDContinue =
1767 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1768
1769 if ((IsFirst && IsIDStart) || ( && IsIDContinue))
1770 return;
1771
1772 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1773
1774 if ( || InvalidOnlyAtStart) {
1775 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1778 } else {
1779 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1782 }
1783}
1784
1785bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1787 const char *UCNPtr = CurPtr + Size;
1788 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, nullptr);
1789 if (CodePoint == 0) {
1790 return false;
1791 }
1792 bool IsExtension = false;
1793 if ((CodePoint, LangOpts, IsExtension)) {
1795 return false;
1797 ->isPreprocessedOutput())
1799 PP->getDiagnostics(), LangOpts, CodePoint,
1801 false);
1802
1803
1804
1805
1807 if (IsExtension)
1810
1813 false);
1814 }
1815
1817 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1818 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1819 CurPtr = UCNPtr;
1820 else
1821 while (CurPtr != UCNPtr)
1822 (void)getAndAdvanceChar(CurPtr, Result);
1823 return true;
1824}
1825
1826bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1827 llvm::UTF32 CodePoint;
1828
1829
1830
1831
1832 unsigned FirstCodeUnitSize;
1833 getCharAndSize(CurPtr, FirstCodeUnitSize);
1834 const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
1835 const char *UnicodePtr = CharStart;
1836
1837 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1838 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1839 &CodePoint, llvm::strictConversion);
1840 if (ConvResult != llvm::conversionOK)
1841 return false;
1842
1843 bool IsExtension = false;
1845 IsExtension)) {
1847 return false;
1848
1850 ->isPreprocessedOutput())
1852 PP->getDiagnostics(), LangOpts, CodePoint,
1853 makeCharRange(*this, CharStart, UnicodePtr), false);
1854
1855
1856
1858 if (IsExtension)
1860 PP->getDiagnostics(), CodePoint,
1864 false);
1867 }
1868
1869
1870
1871
1872 ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
1873 CurPtr = UnicodePtr;
1874 return true;
1875}
1876
1877bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1878 const char *CurPtr) {
1879 bool IsExtension = false;
1882 ->isPreprocessedOutput()) {
1883 if (IsExtension)
1888 true);
1891 }
1892
1893 MIOpt.ReadToken();
1894 return LexIdentifierContinue(Result, CurPtr);
1895 }
1896
1898 ->isPreprocessedOutput() &&
(*BufferPtr) &&
1900
1901
1902
1903
1904
1905
1906
1907
1908
1910 PP->getDiagnostics(), LangOpts, C,
1911 makeCharRange(*this, BufferPtr, CurPtr), true);
1912 BufferPtr = CurPtr;
1913 return false;
1914 }
1915
1916
1917
1918 MIOpt.ReadToken();
1919 FormTokenWithChars(Result, CurPtr, tok::unknown);
1920 return true;
1921}
1922
1923static const char *
1925 [[maybe_unused]] const char *BufferEnd) {
1926#ifdef __SSE4_2__
1927 alignas(16) static constexpr char AsciiIdentifierRange[16] = {
1928 '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
1929 };
1930 constexpr ssize_t BytesPerRegister = 16;
1931
1932 __m128i AsciiIdentifierRangeV =
1933 _mm_load_si128((const __m128i *)AsciiIdentifierRange);
1934
1935 while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1936 __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1937
1938 int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1941 CurPtr += Consumed;
1942 if (Consumed == BytesPerRegister)
1943 continue;
1944 return CurPtr;
1945 }
1946#endif
1947
1948 unsigned char C = *CurPtr;
1950 C = *++CurPtr;
1951 return CurPtr;
1952}
1953
1954bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1955
1956
1957 while (true) {
1958
1960
1961 unsigned Size;
1962
1963 unsigned char C = getCharAndSize(CurPtr, Size);
1965 CurPtr = ConsumeChar(CurPtr, Size, Result);
1966 continue;
1967 }
1968 if (C == '$') {
1969
1970 if (!LangOpts.DollarIdents)
1971 break;
1972
1974 Diag(CurPtr, diag::ext_dollar_in_identifier);
1975 CurPtr = ConsumeChar(CurPtr, Size, Result);
1976 continue;
1977 }
1978 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1979 continue;
1980 if ((C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1981 continue;
1982
1983 break;
1984 }
1985
1986 const char *IdStart = BufferPtr;
1987 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1988 Result.setRawIdentifierData(IdStart);
1989
1990
1991
1993 return true;
1994
1995
1996
1997 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1998
1999
2000
2001
2002
2003
2004 if (isCodeCompletionPoint(CurPtr)) {
2005
2006 Result.setKind(tok::code_completion);
2007
2008
2009
2010
2011
2012 assert(*CurPtr == 0 && "Completion character must be 0");
2013 ++CurPtr;
2014
2015
2016
2017 if (CurPtr < BufferEnd) {
2019 ++CurPtr;
2020 }
2021 BufferPtr = CurPtr;
2022 return true;
2023 }
2024
2025
2026
2028 return PP->HandleIdentifier(Result);
2029
2030 return true;
2031}
2032
2033
2034
// Reports whether two leading characters form a hexadecimal prefix: the first
// must be '0' and the second must be 'x' or 'X'.
//
// NOTE(review): the statements that produce CharAndSize1 and CharAndSize2 are
// missing from this listing (the embedded numbering skips 2036 and 2042).
// Presumably both are read starting at Start using LangOpts -- confirm against
// the real source before relying on this description.
2035bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
2037 char C1 = CharAndSize1.Char;
// Anything whose first character is not '0' cannot carry a hex prefix.
2038 if (C1 != '0')
2039 return false;
2040
// NOTE(review): the right-hand side of this initialisation is missing here.
2041 auto CharAndSize2 =
2043 char C2 = CharAndSize2.Char;
// Both spellings of the prefix letter are accepted.
2044 return (C2 == 'x' || C2 == 'X');
2045}
2046
2047
2048
2049
2050bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2051 unsigned Size;
2052 char C = getCharAndSize(CurPtr, Size);
2053 char PrevCh = 0;
2055 CurPtr = ConsumeChar(CurPtr, Size, Result);
2056 PrevCh = C;
2057 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
2058 CurPtr -= Size;
2059 break;
2060 }
2061 C = getCharAndSize(CurPtr, Size);
2062 }
2063
2064
2065 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2066
2067
2068 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2069 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2070 }
2071
2072
2073 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2074
2075
2076
2077 bool IsHexFloat = true;
2078 if (!LangOpts.C99) {
2079 if (!isHexaLiteral(BufferPtr, LangOpts))
2080 IsHexFloat = false;
2081 else if (!LangOpts.CPlusPlus17 &&
2082 std::find(BufferPtr, CurPtr, '_') != CurPtr)
2083 IsHexFloat = false;
2084 }
2085 if (IsHexFloat)
2086 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2087 }
2088
2089
2090 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2094 Diag(CurPtr, LangOpts.CPlusPlus
2095 ? diag::warn_cxx11_compat_digit_separator
2096 : diag::warn_c23_compat_digit_separator);
2097 CurPtr = ConsumeChar(CurPtr, Size, Result);
2098 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2099 return LexNumericConstant(Result, CurPtr);
2100 }
2101 }
2102
2103
2104 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2105 return LexNumericConstant(Result, CurPtr);
2106 if ((C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2107 return LexNumericConstant(Result, CurPtr);
2108
2109
2110 const char *TokStart = BufferPtr;
2111 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2112 Result.setLiteralData(TokStart);
2113 return true;
2114}
2115
2116
2117
2118const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2119 bool IsStringLiteral) {
2120 assert(LangOpts.CPlusPlus);
2121
2122
2123 unsigned Size;
2124 char C = getCharAndSize(CurPtr, Size);
2125 bool Consumed = false;
2126
2128 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2129 Consumed = true;
2130 else if ((C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2131 Consumed = true;
2132 else
2133 return CurPtr;
2134 }
2135
2136 if (!LangOpts.CPlusPlus11) {
2138 Diag(CurPtr,
2139 C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2140 : diag::warn_cxx11_compat_reserved_user_defined_literal)
2142 return CurPtr;
2143 }
2144
2145
2146
2147
2148
2149
2150 if (!Consumed) {
2151 bool IsUDSuffix = false;
2152 if (C == '_')
2153 IsUDSuffix = true;
2154 else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2155
2156
2157
2158 const unsigned MaxStandardSuffixLength = 3;
2159 char Buffer[MaxStandardSuffixLength] = { C };
2160 unsigned Consumed = Size;
2161 unsigned Chars = 1;
2162 while (true) {
2163 auto [Next, NextSize] =
2166
2167 const StringRef CompleteSuffix(Buffer, Chars);
2168 IsUDSuffix =
2170 break;
2171 }
2172
2173 if (Chars == MaxStandardSuffixLength)
2174
2175 break;
2176
2177 Buffer[Chars++] = Next;
2178 Consumed += NextSize;
2179 }
2180 }
2181
2182 if (!IsUDSuffix) {
2184 Diag(CurPtr, LangOpts.MSVCCompat
2185 ? diag::ext_ms_reserved_user_defined_literal
2186 : diag::ext_reserved_user_defined_literal)
2188 return CurPtr;
2189 }
2190
2191 CurPtr = ConsumeChar(CurPtr, Size, Result);
2192 }
2193
2195 while (true) {
2196 C = getCharAndSize(CurPtr, Size);
2198 CurPtr = ConsumeChar(CurPtr, Size, Result);
2199 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2200 } else if ((C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2201 } else
2202 break;
2203 }
2204
2205 return CurPtr;
2206}
2207
2208
2209
2210bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2212 const char *AfterQuote = CurPtr;
2213
2214 const char *NulCharacter = nullptr;
2215
2217 (Kind == tok::utf8_string_literal ||
2218 Kind == tok::utf16_string_literal ||
2219 Kind == tok::utf32_string_literal))
2220 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2221 : diag::warn_c99_compat_unicode_literal);
2222
2223 char C = getAndAdvanceChar(CurPtr, Result);
2224 while (C != '"') {
2225
2226
2227 if (C == '\\')
2228 C = getAndAdvanceChar(CurPtr, Result);
2229
2230 if (C == '\n' || C == '\r' ||
2231 (C == 0 && CurPtr-1 == BufferEnd)) {
2233 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2234 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2235 return true;
2236 }
2237
2238 if (C == 0) {
2239 if (isCodeCompletionPoint(CurPtr-1)) {
2241 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, false);
2242 else
2243 PP->CodeCompleteNaturalLanguage();
2244 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2245 cutOffLexing();
2246 return true;
2247 }
2248
2249 NulCharacter = CurPtr-1;
2250 }
2251 C = getAndAdvanceChar(CurPtr, Result);
2252 }
2253
2254
2255 if (LangOpts.CPlusPlus)
2256 CurPtr = LexUDSuffix(Result, CurPtr, true);
2257
2258
2260 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2261
2262
2263 const char *TokStart = BufferPtr;
2264 FormTokenWithChars(Result, CurPtr, Kind);
2265 Result.setLiteralData(TokStart);
2266 return true;
2267}
2268
2269
2270
2271bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2273
2274
2275
2276
2277
2279 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2280
2281 unsigned PrefixLen = 0;
2282
2285 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
2286 const char *Pos = &CurPtr[PrefixLen];
2287 Diag(Pos, LangOpts.CPlusPlus26
2288 ? diag::warn_cxx26_compat_raw_string_literal_character_set
2289 : diag::ext_cxx26_raw_string_literal_character_set)
2290 << StringRef(Pos, 1);
2291 }
2292 ++PrefixLen;
2293 }
2294
2295
2296 if (CurPtr[PrefixLen] != '(') {
2298 const char *PrefixEnd = &CurPtr[PrefixLen];
2299 if (PrefixLen == 16) {
2300 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2301 } else if (*PrefixEnd == '\n') {
2302 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
2303 } else {
2304 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2305 << StringRef(PrefixEnd, 1);
2306 }
2307 }
2308
2309
2310
2311
2312 while (true) {
2313 char C = *CurPtr++;
2314
2315 if (C == '"')
2316 break;
2317 if (C == 0 && CurPtr-1 == BufferEnd) {
2318 --CurPtr;
2319 break;
2320 }
2321 }
2322
2323 FormTokenWithChars(Result, CurPtr, tok::unknown);
2324 return true;
2325 }
2326
2327
2328 const char *Prefix = CurPtr;
2329 CurPtr += PrefixLen + 1;
2330
2331 while (true) {
2332 char C = *CurPtr++;
2333
2334 if (C == ')') {
2335
2336 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2337 CurPtr += PrefixLen + 1;
2338 break;
2339 }
2340 } else if (C == 0 && CurPtr-1 == BufferEnd) {
2342 Diag(BufferPtr, diag::err_unterminated_raw_string)
2343 << StringRef(Prefix, PrefixLen);
2344 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2345 return true;
2346 }
2347 }
2348
2349
2350 if (LangOpts.CPlusPlus)
2351 CurPtr = LexUDSuffix(Result, CurPtr, true);
2352
2353
2354 const char *TokStart = BufferPtr;
2355 FormTokenWithChars(Result, CurPtr, Kind);
2356 Result.setLiteralData(TokStart);
2357 return true;
2358}
2359
2360
2361
// Lexes an angle-bracketed header name, scanning from CurPtr up to the closing
// '>' and forming a tok::header_name token in Result.
//
// CurPtr is remembered as AfterLessPos, so it presumably points just past the
// opening '<' -- confirm against the caller. Every visible exit returns true.
//
// NOTE(review): two lines are missing from this listing (the embedded
// numbering skips 2373 and 2394); they are flagged where they belong.
2362bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2363
// Position of an embedded NUL character, if one is seen while scanning.
2364 const char *NulCharacter = nullptr;
2365 const char *AfterLessPos = CurPtr;
2366 char C = getAndAdvanceChar(CurPtr, Result);
2367 while (C != '>') {
2368
2369
// A backslash also consumes the character after it, so that character is
// never compared against '>' by the loop condition.
2370 if (C == '\\')
2371 C = getAndAdvanceChar(CurPtr, Result);
2372
// NOTE(review): the opening of this condition is missing from the listing.
// The visible part matches a NUL that sits exactly at BufferEnd.
2374 (C == 0 && (CurPtr - 1 == BufferEnd))) {
2375
2376
// No closing '>' was found: form a tok::less token ending at AfterLessPos
// instead of a header name.
2377 FormTokenWithChars(Result, AfterLessPos, tok::less);
2378 return true;
2379 }
2380
// A NUL that is not at BufferEnd is either the code-completion marker or a
// stray NUL inside the name.
2381 if (C == 0) {
2382 if (isCodeCompletionPoint(CurPtr - 1)) {
// Complete the include path typed so far (IsAngled = true), stop lexing,
// and hand back a tok::unknown token ending at the completion point.
2383 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, true);
2384 cutOffLexing();
2385 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2386 return true;
2387 }
// Remember where the NUL was so it can be diagnosed after the loop.
2388 NulCharacter = CurPtr-1;
2389 }
2390 C = getAndAdvanceChar(CurPtr, Result);
2391 }
2392
2393
// NOTE(review): the guard for this diagnostic is missing from the listing;
// presumably it only fires when NulCharacter is non-null.
2395 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2396
2397
// Capture the token start before FormTokenWithChars runs, then attach it as
// the token's literal data.
2398 const char *TokStart = BufferPtr;
2399 FormTokenWithChars(Result, CurPtr, tok::header_name);
2400 Result.setLiteralData(TokStart);
2401 return true;
2402}
2403
2404void Lexer::codeCompleteIncludedFile(const char *PathStart,
2405 const char *CompletionPoint,
2406 bool IsAngled) {
2407
2408 StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2409 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2410 auto Slash = PartialPath.find_last_of(SlashChars);
2411 StringRef Dir =
2412 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2413 const char *StartOfFilename =
2414 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2415
2416 PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2417 StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2418
2419
2420 while (CompletionPoint < BufferEnd) {
2421 char Next = *(CompletionPoint + 1);
2422 if (Next == 0 || Next == '\r' || Next == '\n')
2423 break;
2424 ++CompletionPoint;
2425 if (Next == (IsAngled ? '>' : '"'))
2426 break;
2427 if (SlashChars.contains(Next))
2428 break;
2429 }
2430
2431 PP->setCodeCompletionTokenRange(
2432 FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2433 FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2434 PP->CodeCompleteIncludedFile(Dir, IsAngled);
2435}
2436
2437
2438
2439bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2441
2442 const char *NulCharacter = nullptr;
2443
2445 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2446 Diag(BufferPtr, LangOpts.CPlusPlus
2447 ? diag::warn_cxx98_compat_unicode_literal
2448 : diag::warn_c99_compat_unicode_literal);
2449 else if (Kind == tok::utf8_char_constant)
2450 Diag(BufferPtr, LangOpts.CPlusPlus
2451 ? diag::warn_cxx14_compat_u8_character_literal
2452 : diag::warn_c17_compat_u8_character_literal);
2453 }
2454
2455 char C = getAndAdvanceChar(CurPtr, Result);
2456 if (C == '\'') {
2458 Diag(BufferPtr, diag::ext_empty_character);
2459 FormTokenWithChars(Result, CurPtr, tok::unknown);
2460 return true;
2461 }
2462
2463 while (C != '\'') {
2464
2465 if (C == '\\')
2466 C = getAndAdvanceChar(CurPtr, Result);
2467
2468 if (C == '\n' || C == '\r' ||
2469 (C == 0 && CurPtr-1 == BufferEnd)) {
2471 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2472 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2473 return true;
2474 }
2475
2476 if (C == 0) {
2477 if (isCodeCompletionPoint(CurPtr-1)) {
2478 PP->CodeCompleteNaturalLanguage();
2479 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2480 cutOffLexing();
2481 return true;
2482 }
2483
2484 NulCharacter = CurPtr-1;
2485 }
2486 C = getAndAdvanceChar(CurPtr, Result);
2487 }
2488
2489
2490 if (LangOpts.CPlusPlus)
2491 CurPtr = LexUDSuffix(Result, CurPtr, false);
2492
2493
2495 Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2496
2497
2498 const char *TokStart = BufferPtr;
2499 FormTokenWithChars(Result, CurPtr, Kind);
2500 Result.setLiteralData(TokStart);
2501 return true;
2502}
2503
2504
2505
2506
2507
2508bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2509 bool &TokAtPhysicalStartOfLine) {
2510
2512
2513 unsigned char Char = *CurPtr;
2514
2515 const char *lastNewLine = nullptr;
2516 auto setLastNewLine = [&](const char *Ptr) {
2517 lastNewLine = Ptr;
2518 if (!NewLinePtr)
2519 NewLinePtr = Ptr;
2520 };
2521 if (SawNewline)
2522 setLastNewLine(CurPtr - 1);
2523
2524
2525 while (true) {
2526
2528 Char = *++CurPtr;
2529
2530
2532 break;
2533
2535
2536 BufferPtr = CurPtr;
2537 return false;
2538 }
2539
2540
2541 if (*CurPtr == '\n')
2542 setLastNewLine(CurPtr);
2543 SawNewline = true;
2544 Char = *++CurPtr;
2545 }
2546
2547
2549 FormTokenWithChars(Result, CurPtr, tok::unknown);
2550 if (SawNewline) {
2551 IsAtStartOfLine = true;
2552 IsAtPhysicalStartOfLine = true;
2553 }
2554
2555 return true;
2556 }
2557
2558
2559 char PrevChar = CurPtr[-1];
2561
2563 if (SawNewline) {
2565 TokAtPhysicalStartOfLine = true;
2566
2567 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2568 if (auto *Handler = PP->getEmptylineHandler())
2569 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2571 }
2572 }
2573
2574 BufferPtr = CurPtr;
2575 return false;
2576}
2577
2578
2579
2580
2581
2582
2583
2584bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2585 bool &TokAtPhysicalStartOfLine) {
2586
2587
2588 if (!LineComment) {
2589 if (())
2590 Diag(BufferPtr, diag::ext_line_comment);
2591
2592
2593
2594 LineComment = true;
2595 }
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608 bool UnicodeDecodingAlreadyDiagnosed = false;
2609
2610 char C;
2611 while (true) {
2612 C = *CurPtr;
2613
2614 while (isASCII(C) && C != 0 &&
2615 C != '\n' && C != '\r') {
2616 C = *++CurPtr;
2617 UnicodeDecodingAlreadyDiagnosed = false;
2618 }
2619
2621 unsigned Length = llvm::getUTF8SequenceSize(
2622 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2623 if (Length == 0) {
2624 if (!UnicodeDecodingAlreadyDiagnosed && ())
2625 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2626 UnicodeDecodingAlreadyDiagnosed = true;
2627 ++CurPtr;
2628 } else {
2629 UnicodeDecodingAlreadyDiagnosed = false;
2630 CurPtr += Length;
2631 }
2632 continue;
2633 }
2634
2635 const char *NextLine = CurPtr;
2636 if (C != 0) {
2637
2638 const char *EscapePtr = CurPtr-1;
2639 bool HasSpace = false;
2641 --EscapePtr;
2642 HasSpace = true;
2643 }
2644
2645 if (*EscapePtr == '\\')
2646
2647 CurPtr = EscapePtr;
2648 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2649 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2650
2651 CurPtr = EscapePtr-2;
2652 else
2653 break;
2654
2655
2657 Diag(EscapePtr, diag::backslash_newline_space);
2658 }
2659
2660
2661
2662
2663
2664 const char *OldPtr = CurPtr;
2667 C = getAndAdvanceChar(CurPtr, Result);
2669
2670
2671
2672 if (C != 0 && CurPtr == OldPtr+1) {
2673 CurPtr = NextLine;
2674 break;
2675 }
2676
2677
2678
2679
2680 if (CurPtr != OldPtr + 1 && C != '/' &&
2681 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2682 for (; OldPtr != CurPtr; ++OldPtr)
2683 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2684
2685
2687 const char *ForwardPtr = CurPtr;
2688 while (isWhitespace(*ForwardPtr))
2689 ++ForwardPtr;
2690 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2691 break;
2692 }
2693
2695 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2696 break;
2697 }
2698 }
2699
2700 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2701 --CurPtr;
2702 break;
2703 }
2704
2705 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2706 PP->CodeCompleteNaturalLanguage();
2707 cutOffLexing();
2708 return false;
2709 }
2710 }
2711
2712
2713
2717 BufferPtr = CurPtr;
2718 return true;
2719 }
2720
2721
2723 return SaveLineComment(Result, CurPtr);
2724
2725
2726
2728 BufferPtr = CurPtr;
2729 return false;
2730 }
2731
2732
2733
2734
2735
2736
2737 NewLinePtr = CurPtr++;
2738
2739
2741 TokAtPhysicalStartOfLine = true;
2742
2744 BufferPtr = CurPtr;
2745 return false;
2746}
2747
2748
2749
2750bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2751
2752
2753 FormTokenWithChars(Result, CurPtr, tok::comment);
2754
2756 return true;
2757
2758
2759
2761 std::string Spelling = PP->getSpelling(Result, &Invalid);
2763 return true;
2764
2765 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2766 Spelling[1] = '*';
2767 Spelling += "*/";
2768
2769 Result.setKind(tok::comment);
2770 PP->CreateString(Spelling, Result,
2771 Result.getLocation(), Result.getLocation());
2772 return true;
2773}
2774
2775
2776
2777
2779 bool Trigraphs) {
2780 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2781
2782
2783 const char *TrigraphPos = nullptr;
2784
2785 const char *SpacePos = nullptr;
2786
2787 while (true) {
2788
2789 --CurPtr;
2790
2791
2792 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2793
2794 if (CurPtr[0] == CurPtr[1])
2795 return false;
2796
2797 --CurPtr;
2798 }
2799
2800
2801
2803 SpacePos = CurPtr;
2804 --CurPtr;
2805 }
2806
2807
2808 if (*CurPtr == '\\') {
2809 --CurPtr;
2810 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2811
2812 TrigraphPos = CurPtr - 2;
2813 CurPtr -= 3;
2814 } else {
2815 return false;
2816 }
2817
2818
2819
2820 if (*CurPtr == '*')
2821 break;
2822
2823 if (*CurPtr != '\n' && *CurPtr != '\r')
2824 return false;
2825 }
2826
2827 if (TrigraphPos) {
2828
2829
2830 if (!Trigraphs) {
2832 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2833 return false;
2834 }
2836 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2837 }
2838
2839
2841 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2842
2843
2845 L->Diag(SpacePos, diag::backslash_newline_space);
2846
2847 return true;
2848}
2849
2850#ifdef __SSE2__
2851#include <emmintrin.h>
2852#elif __ALTIVEC__
2854#undef bool
2855#endif
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2867 bool &TokAtPhysicalStartOfLine) {
2868
2869
2870
2871
2872
2873
2874
2875
2876 unsigned CharSize;
2877 unsigned char C = getCharAndSize(CurPtr, CharSize);
2878 CurPtr += CharSize;
2879 if (C == 0 && CurPtr == BufferEnd+1) {
2881 Diag(BufferPtr, diag::err_unterminated_block_comment);
2882 --CurPtr;
2883
2884
2885
2887 FormTokenWithChars(Result, CurPtr, tok::unknown);
2888 return true;
2889 }
2890
2891 BufferPtr = CurPtr;
2892 return false;
2893 }
2894
2895
2896
2897 if (C == '/')
2898 C = *CurPtr++;
2899
2900
2901
2902
2903
2904 bool UnicodeDecodingAlreadyDiagnosed = false;
2905
2906 while (true) {
2907
2908
2909 if (CurPtr + 24 < BufferEnd &&
2910
2911
2912 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2913
2914 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2916 goto MultiByteUTF8;
2917 C = *CurPtr++;
2918 }
2919 if (C == '/') goto FoundSlash;
2920
2921#ifdef __SSE2__
2923 while (CurPtr + 16 < BufferEnd) {
2925 if (LLVM_UNLIKELY(Mask != 0)) {
2926 goto MultiByteUTF8;
2927 }
2928
2930 Slashes));
2931 if (cmp != 0) {
2932
2933
2934
2935 CurPtr += llvm::countr_zero(cmp) + 1;
2936 goto FoundSlash;
2937 }
2938 CurPtr += 16;
2939 }
2940#elif __ALTIVEC__
2941 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2942 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2943 0x80, 0x80, 0x80, 0x80};
2944 __vector unsigned char Slashes = {
2945 '/', '/', '/', '/', '/', '/', '/', '/',
2946 '/', '/', '/', '/', '/', '/', '/', '/'
2947 };
2948 while (CurPtr + 16 < BufferEnd) {
2949 if (LLVM_UNLIKELY(
2950 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2951 goto MultiByteUTF8;
2952 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2953 break;
2954 }
2955 CurPtr += 16;
2956 }
2957
2958#else
2959 while (CurPtr + 16 < BufferEnd) {
2960 bool HasNonASCII = false;
2961 for (unsigned I = 0; I < 16; ++I)
2962 HasNonASCII |= (CurPtr[I]);
2963
2964 if (LLVM_UNLIKELY(HasNonASCII))
2965 goto MultiByteUTF8;
2966
2967 bool HasSlash = false;
2968 for (unsigned I = 0; I < 16; ++I)
2969 HasSlash |= CurPtr[I] == '/';
2970 if (HasSlash)
2971 break;
2972 CurPtr += 16;
2973 }
2974#endif
2975
2976
2977 C = *CurPtr++;
2978 }
2979
2980
2981
2982
2983 while (C != '/' && C != '\0') {
2985 UnicodeDecodingAlreadyDiagnosed = false;
2986 C = *CurPtr++;
2987 continue;
2988 }
2989 MultiByteUTF8:
2990
2991
2992 unsigned Length = llvm::getUTF8SequenceSize(
2993 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2994 if (Length == 0) {
2995 if (!UnicodeDecodingAlreadyDiagnosed && ())
2996 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2997 UnicodeDecodingAlreadyDiagnosed = true;
2998 } else {
2999 UnicodeDecodingAlreadyDiagnosed = false;
3000 CurPtr += Length - 1;
3001 }
3002 C = *CurPtr++;
3003 }
3004
3005 if (C == '/') {
3006 FoundSlash:
3007 if (CurPtr[-2] == '*')
3008 break;
3009
3010 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
3012 LangOpts.Trigraphs)) {
3013
3014
3015 break;
3016 }
3017 }
3018 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
3019
3020
3021
3023 Diag(CurPtr-1, diag::warn_nested_block_comment);
3024 }
3025 } else if (C == 0 && CurPtr == BufferEnd+1) {
3027 Diag(BufferPtr, diag::err_unterminated_block_comment);
3028
3029
3030
3031 --CurPtr;
3032
3033
3034
3036 FormTokenWithChars(Result, CurPtr, tok::unknown);
3037 return true;
3038 }
3039
3040 BufferPtr = CurPtr;
3041 return false;
3042 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
3043 PP->CodeCompleteNaturalLanguage();
3044 cutOffLexing();
3045 return false;
3046 }
3047
3048 C = *CurPtr++;
3049 }
3050
3051
3055 BufferPtr = CurPtr;
3056 return true;
3057 }
3058
3059
3061 FormTokenWithChars(Result, CurPtr, tok::comment);
3062 return true;
3063 }
3064
3065
3066
3067
3068
3070 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
3071 return false;
3072 }
3073
3074
3075 BufferPtr = CurPtr;
3077 return false;
3078}
3079
3080
3081
3082
3083
3084
3085
3088 "Must be in a preprocessing directive!");
3091
3092
3093 const char *CurPtr = BufferPtr;
3094 while (true) {
3095 char Char = getAndAdvanceChar(CurPtr, Tmp);
3096 switch (Char) {
3097 default:
3099 Result->push_back(Char);
3100 break;
3101 case 0:
3102
3103 if (CurPtr-1 != BufferEnd) {
3104 if (isCodeCompletionPoint(CurPtr-1)) {
3105 PP->CodeCompleteNaturalLanguage();
3106 cutOffLexing();
3107 return;
3108 }
3109
3110
3112 Result->push_back(Char);
3113 break;
3114 }
3115
3116 [[fallthrough]];
3117 case '\r':
3118 case '\n':
3119
3120 assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3121 BufferPtr = CurPtr-1;
3122
3123
3124 Lex(Tmp);
3125 if (Tmp.is(tok::code_completion)) {
3126 if (PP)
3127 PP->CodeCompleteNaturalLanguage();
3128 Lex(Tmp);
3129 }
3130 assert(Tmp.is(tok::eod) && "Unexpected token!");
3131
3132
3133 return;
3134 }
3135 }
3136}
3137
3138
3139
3140
3141
3142bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3143
3144
3145
3147
3149
3150 FormTokenWithChars(Result, CurPtr, tok::eod);
3151
3152
3153 if (PP)
3155 return true;
3156 }
3157
3158
3159
3161 Result.startToken();
3162 BufferPtr = BufferEnd;
3163 FormTokenWithChars(Result, BufferEnd, tok::eof);
3164 return true;
3165 }
3166
3167 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
3169
3170
3171
3173 MIOpt.ExitTopLevelConditional();
3175 }
3176
3177
3178
3179
3181 if (PP->getCodeCompletionFileLoc() != FileLoc)
3183 diag::err_pp_unterminated_conditional);
3185 }
3186
3187
3188
3189
3190 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
3191 Diag(BufferEnd, diag::warn_no_newline_eof)
3193
3194 BufferPtr = CurPtr;
3195
3196
3198}
3199
3200
3201
3202
3203std::optional Lexer::peekNextPPToken() {
3204 assert( && "How can we expand a macro from a skipping buffer?");
3205
3206 if (isDependencyDirectivesLexer()) {
3207 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3208 return std::nullopt;
3210 (void)convertDependencyDirectiveToken(
3211 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex], Result);
3213 }
3214
3215
3216
3217
3219
3220
3221 const char *TmpBufferPtr = BufferPtr;
3223 bool atStartOfLine = IsAtStartOfLine;
3224 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3225 bool leadingSpace = HasLeadingSpace;
3226
3227 Token Tok;
3229
3230
3231 BufferPtr = TmpBufferPtr;
3233 HasLeadingSpace = leadingSpace;
3234 IsAtStartOfLine = atStartOfLine;
3235 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3236
3238
3240 return std::nullopt;
3241 return Tok;
3242}
3243
3244
// Finds the terminator that ends a version-control conflict region.
//
// The search covers [CurPtr, BufferEnd) with the first TermLen characters
// skipped. The terminator is "<<<<\n" when CMK is CMK_Perforce and ">>>>>>>"
// otherwise. A match counts only if the character immediately before it is
// '\r' or '\n'.
//
// Returns a pointer to the first accepted match, or nullptr if there is none.
//
// NOTE(review): the final line of the signature, which declares the CMK
// parameter, is missing from this listing (the embedded numbering skips 3246).
3245static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3247 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
// Must stay in sync with the length of the Terminator literal chosen above.
3248 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
// Drop the first TermLen characters before searching.
3249 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3250 size_t Pos = RestOfBuffer.find(Terminator);
3251 while (Pos != StringRef::npos) {
3252
// Reject a match at offset 0 or one not preceded by a line ending, then
// resume the search just past the rejected match.
3253 if (Pos == 0 ||
3254 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3255 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3256 Pos = RestOfBuffer.find(Terminator);
3257 continue;
3258 }
// Accepted: return the address of the terminator's first character.
3259 return RestOfBuffer.data()+Pos;
3260 }
3261 return nullptr;
3262}
3263
3264
3265
3266
3267
3268bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3269
3270 if (CurPtr != BufferStart &&
3271 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3272 return false;
3273
3274
3275 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3276 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3277 return false;
3278
3279
3280
3282 return false;
3283
3285
3286
3287
3289
3290
3291 Diag(CurPtr, diag::err_conflict_marker);
3292 CurrentConflictMarkerState = Kind;
3293
3294
3295
3296 while (*CurPtr != '\r' && *CurPtr != '\n') {
3297 assert(CurPtr != BufferEnd && "Didn't find end of line");
3298 ++CurPtr;
3299 }
3300 BufferPtr = CurPtr;
3301 return true;
3302 }
3303
3304
3305 return false;
3306}
3307
3308
3309
3310
3311
// Called with CurPtr at a possible conflict-marker separator. If the closing
// terminator for the current conflict region is found, skips ahead to it.
//
// On success BufferPtr is moved to the end of the terminator's line (or to
// BufferEnd), CurrentConflictMarkerState is reset to CMK_None, and the
// function returns true. In every other case it returns false and leaves the
// lexer state untouched.
//
// NOTE(review): one guard condition is missing from this listing (the
// embedded numbering skips 3320); it is flagged where it belongs.
3312bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3313
// Only consider a position at the buffer start or right after a line ending.
3314 if (CurPtr != BufferStart &&
3315 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3316 return false;
3317
3318
3319
// NOTE(review): the condition guarding this early return is missing here.
3321 return false;
3322
3323
// Require four identical characters: CurPtr[1..3] must all equal CurPtr[0].
3324 for (unsigned i = 1; i != 4; ++i)
3325 if (CurPtr[i] != CurPtr[0])
3326 return false;
3327
3328
3329
3330
// Look for the terminator matching the current marker kind.
3331 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3332 CurrentConflictMarkerState)) {
3333 CurPtr = End;
3334
3335
// Advance to the end of the terminator's line, never past BufferEnd.
3336 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3337 ++CurPtr;
3338
3339 BufferPtr = CurPtr;
3340
3341
// The conflict region has been consumed, so clear the marker state.
3342 CurrentConflictMarkerState = CMK_None;
3343 return true;
3344 }
3345
3346 return false;
3347}
3348
3350 const char *BufferEnd) {
3351 if (CurPtr == BufferEnd)
3352 return nullptr;
3353 BufferEnd -= 1;
3354 for (; CurPtr != BufferEnd; ++CurPtr) {
3355 if (CurPtr[0] == '#' && CurPtr[1] == '>')
3356 return CurPtr + 2;
3357 }
3358 return nullptr;
3359}
3360
3361bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3362 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
3363 if ( ||
->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
3364 return false;
3366 if (!End)
3367 return false;
3368 const char *Start = CurPtr - 1;
3369 if (!LangOpts.AllowEditorPlaceholders)
3370 Diag(Start, diag::err_placeholder_in_source);
3371 Result.startToken();
3372 FormTokenWithChars(Result, End, tok::raw_identifier);
3373 Result.setRawIdentifierData(Start);
3374 PP->LookUpIdentifierInfo(Result);
3376 BufferPtr = End;
3377 return true;
3378}
3379
3380bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3381 if (PP && PP->isCodeCompletionEnabled()) {
3382 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3383 return Loc == PP->getCodeCompletionLoc();
3384 }
3385
3386 return false;
3387}
3388
3390 bool Named,
3393 unsigned DiagId;
3394 if (Opts.CPlusPlus23)
3395 DiagId = diag::warn_cxx23_delimited_escape_sequence;
3396 else if (Opts.C2y && !Named)
3397 DiagId = diag::warn_c2y_delimited_escape_sequence;
3398 else
3399 DiagId = diag::ext_delimited_escape_sequence;
3400
3401
3402
3403
3404 unsigned Ext;
3405 if (!Opts.CPlusPlus)
3406 Ext = Named ? 2 : 1 ;
3407 else
3408 Ext = 0;
3409
3410 Diags.Report(Loc, DiagId) << Named << Ext;
3411}
3412
3413std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3414 const char *SlashLoc,
3416 unsigned CharSize;
3417 char Kind = getCharAndSize(StartPtr, CharSize);
3418 assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
3419
3420 unsigned NumHexDigits;
3421 if (Kind == 'u')
3422 NumHexDigits = 4;
3423 else if (Kind == 'U')
3424 NumHexDigits = 8;
3425
3426 bool Delimited = false;
3427 bool FoundEndDelimiter = false;
3428 unsigned Count = 0;
3430
3431 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3432 if (Diagnose)
3433 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3434 return std::nullopt;
3435 }
3436
3437 const char *CurPtr = StartPtr + CharSize;
3438 const char *KindLoc = &CurPtr[-1];
3439
3440 uint32_t CodePoint = 0;
3441 while (Count != NumHexDigits || Delimited) {
3442 char C = getCharAndSize(CurPtr, CharSize);
3443 if (!Delimited && Count == 0 && C == '{') {
3444 Delimited = true;
3445 CurPtr += CharSize;
3446 continue;
3447 }
3448
3449 if (Delimited && C == '}') {
3450 CurPtr += CharSize;
3451 FoundEndDelimiter = true;
3452 break;
3453 }
3454
3455 unsigned Value = llvm::hexDigitValue(C);
3456 if (Value == std::numeric_limits::max()) {
3457 if (!Delimited)
3458 break;
3459 if (Diagnose)
3460 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3461 << StringRef(KindLoc, 1);
3462 return std::nullopt;
3463 }
3464
3465 if (CodePoint & 0xF000'0000) {
3466 if (Diagnose)
3467 Diag(KindLoc, diag::err_escape_too_large) << 0;
3468 return std::nullopt;
3469 }
3470
3471 CodePoint <<= 4;
3472 CodePoint |= Value;
3473 CurPtr += CharSize;
3474 Count++;
3475 }
3476
3477 if (Count == 0) {
3478 if (Diagnose)
3479 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3480 : diag::warn_ucn_escape_no_digits)
3481 << StringRef(KindLoc, 1);
3482 return std::nullopt;
3483 }
3484
3485 if (Delimited && Kind == 'U') {
3486 if (Diagnose)
3487 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3488 return std::nullopt;
3489 }
3490
3491 if (!Delimited && Count != NumHexDigits) {
3492 if (Diagnose) {
3493 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3494
3495 if (Count == 4 && NumHexDigits == 8) {
3496 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3497 Diag(KindLoc, diag::note_ucn_four_not_eight)
3499 }
3500 }
3501 return std::nullopt;
3502 }
3503
3504 if (Delimited && PP)
3506 PP->getLangOpts(),
3507 PP->getDiagnostics());
3508
3511
3512
3513
3514 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3515 StartPtr = CurPtr;
3516 else
3517 while (StartPtr != CurPtr)
3518 (void)getAndAdvanceChar(StartPtr, *Result);
3519 } else {
3520 StartPtr = CurPtr;
3521 }
3522 return CodePoint;
3523}
3524
/// tryReadNamedUCN - Try to read a named universal character escape of the
/// form \N{NAME}. On entry \p StartPtr points at the 'N' (this is asserted)
/// and \p SlashLoc is the location used for the "incomplete"/"empty" warnings.
///
/// Returns the matched code point, or std::nullopt when the '{' is missing,
/// the closing '}' is not found, or the name is empty. When the name lookup
/// finds no strict match the result is also empty, unless a loose match was
/// found and diagnostics are enabled, in which case the loose match is used.
///
/// NOTE(review): this listing is missing several source lines (the fused line
/// numbers skip 3527, 3529, 3555, 3572, 3580, 3591 and 3602-3603), including
/// the declarations of `Result` and `Diagnose`. The comments below describe
/// only the statements that are visible.
3525std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3526 const char *SlashLoc,
3528 unsigned CharSize;
3530
3531 char C = getCharAndSize(StartPtr, CharSize);
3532 assert(C == 'N' && "expected \\N{...}");
3533
// CurPtr is a scratch cursor; StartPtr is only updated at the very end.
3534 const char *CurPtr = StartPtr + CharSize;
// KindLoc addresses the byte just before CurPtr; it is streamed into the
// delimited-UCN warnings below as a one-character StringRef.
3535 const char *KindLoc = &CurPtr[-1];
3536
// The escape must continue with '{'; otherwise warn (if diagnosing) and fail.
3537 C = getCharAndSize(CurPtr, CharSize);
3538 if (C != '{') {
3539 if (Diagnose)
3540 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3541 return std::nullopt;
3542 }
3543 CurPtr += CharSize;
3544 const char *StartName = CurPtr;
3545 bool FoundEndDelimiter = false;
3546 llvm::SmallVector<char, 30> Buffer;
// Collect the name characters into Buffer until '}' is seen, a NUL is read
// (loop condition), or the early `break` below fires. The condition guarding
// that `break` (source line 3555) is not present in this listing.
3547 while (C) {
3548 C = getCharAndSize(CurPtr, CharSize);
3549 CurPtr += CharSize;
3550 if (C == '}') {
3551 FoundEndDelimiter = true;
3552 break;
3553 }
3554
3556 break;
3557 Buffer.push_back(C);
3558 }
3559
// A missing '}' is reported as "incomplete", an empty name as "empty".
3560 if (!FoundEndDelimiter || Buffer.empty()) {
3561 if (Diagnose)
3562 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3563 : diag::warn_delimited_ucn_incomplete)
3564 << StringRef(KindLoc, 1);
3565 return std::nullopt;
3566 }
3567
// Look the name up strictly first. The loose-matching lookup and the
// err_invalid_ucn_name diagnostic follow; the `if` that opens this region
// (source line 3572) is missing from the listing.
3568 StringRef Name(Buffer.data(), Buffer.size());
3569 std::optional<char32_t> Match =
3570 llvm::sys::unicode::nameToCodepointStrict(Name);
3571 std::optionalllvm::sys::unicode::LooseMatchingResult LooseMatch;
3573 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3574 if (Diagnose) {
// The diagnostic range covers the name only: CurPtr - CharSize steps back
// over the last character read (the closing '}').
3575 Diag(StartName, diag::err_invalid_ucn_name)
3576 << StringRef(Buffer.data(), Buffer.size())
3577 << makeCharRange(*this, StartName, CurPtr - CharSize);
3578 if (LooseMatch) {
3579 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3581 makeCharRange(*this, StartName, CurPtr - CharSize),
3582 LooseMatch->Name);
3583 }
3584 }
3585
3586
3587
3588 }
3589
3590 if (Diagnose && Match)
3592 PP->getLangOpts(),
3593 PP->getDiagnostics());
3594
3595
3596
3597
3598
// Recover with the loosely matched code point only when diagnostics are
// enabled (i.e. only when the error above has actually been reported).
3599 if (LooseMatch && Diagnose)
3600 Match = LooseMatch->CodePoint;
3601
3604
3605
3606
// If the raw distance consumed equals the name length plus 3 (presumably the
// 'N', '{' and '}' characters), StartPtr can jump straight to CurPtr.
// Otherwise the text is re-walked one character at a time through
// getAndAdvanceChar, which is handed *Result.
3607 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3608 StartPtr = CurPtr;
3609 else
3610 while (StartPtr != CurPtr)
3611 (void)getAndAdvanceChar(StartPtr, *Result);
3612 } else {
3613 StartPtr = CurPtr;
3614 }
3615 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3616}
3617
3618uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3620
3621 unsigned CharSize;
3622 std::optional<uint32_t> CodePointOpt;
3623 char Kind = getCharAndSize(StartPtr, CharSize);
3624 if (Kind == 'u' || Kind == 'U')
3625 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3626 else if (Kind == 'N')
3627 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3628
3629 if (!CodePointOpt)
3630 return 0;
3631
3632 uint32_t CodePoint = *CodePointOpt;
3633
3634
3635 if (LangOpts.AsmPreprocessor)
3636 return CodePoint;
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654 if (CodePoint < 0xA0) {
3655
3656
3658 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3659 Diag(BufferPtr, diag::err_ucn_control_character);
3660 else {
3661 char C = static_cast<char>(CodePoint);
3662 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3663 }
3664 }
3665
3666 return 0;
3667 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3668
3669
3670
3672 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3673 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3674 else
3675 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3676 }
3677 return 0;
3678 }
3679
3680 return CodePoint;
3681}
3682
3683bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3684 const char *CurPtr) {
3687 Diag(BufferPtr, diag::ext_unicode_whitespace)
3689
3691 return true;
3692 }
3693 return false;
3694}
3695
3696void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3697 IsAtStartOfLine = Result.isAtStartOfLine();
3698 HasLeadingSpace = Result.hasLeadingSpace();
3699 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3700
3701}
3702
3704 assert(!isDependencyDirectivesLexer());
3705
3706
3707 Result.startToken();
3708
3709
3710 if (IsAtStartOfLine) {
3712 IsAtStartOfLine = false;
3713 }
3714
3715 if (HasLeadingSpace) {
3717 HasLeadingSpace = false;
3718 }
3719
3720 if (HasLeadingEmptyMacro) {
3722 HasLeadingEmptyMacro = false;
3723 }
3724
3725 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3726 IsAtPhysicalStartOfLine = false;
3728 (void) isRawLex;
3729 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3730
3731 assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3732 return returnedToken;
3733}
3734
3735
3736
3737
3738
3739
/// LexTokenInternal - Lex one token starting at BufferPtr into \p Result.
///
/// Control flow visible in this listing:
///  - The first character is read and dispatched through a large switch.
///    Simple punctuators set `Kind` and break to the common tail, which calls
///    MIOpt.ReadToken() and FormTokenWithChars() and returns true.
///  - Literals, numbers and identifiers forward the return value of the
///    matching helper (LexNumericConstant, LexStringLiteral, LexCharConstant,
///    LexRawStringLiteral, LexIdentifierContinue, ...).
///  - Whitespace and comments go through the Skip* helpers; when a helper
///    returns true this function returns true, otherwise lexing restarts via
///    the SkipIgnoredUnits / LexNextToken labels.
///  - A '#' (or its "%:" digraph) that passes the start-of-line check jumps
///    to HandleDirective, which returns false unless the preprocessor
///    reports a fatal module-loader failure.
///
/// \param TokAtPhysicalStartOfLine Forwarded to the Skip* helpers and used in
///        the directive check for '#'.
///
/// NOTE(review): this listing has lost many source lines (gaps in the fused
/// line numbers), e.g. the declaration of `Kind`, the second argument line of
/// most nested ConsumeChar calls, and parts of several conditions. The
/// comments describe only what is visible.
3740bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3741LexStart:
3742 assert(.needsCleaning() && "Result needs cleaning");
3743 assert(.hasPtrData() && "Result has not been reset");
3744
3745
// Local cursor, initialised from the lexer's BufferPtr.
3746 const char *CurPtr = BufferPtr;
3747
3748
3750 do {
3751 ++CurPtr;
3753
3754
3755
3756
3758 FormTokenWithChars(Result, CurPtr, tok::unknown);
3759
3760 return true;
3761 }
3762
3763 BufferPtr = CurPtr;
3765 }
3766
3767 unsigned SizeTmp, SizeTmp2;
3768
3769
// Read the first character of the token and advance CurPtr past it.
3770 char Char = getAndAdvanceChar(CurPtr, Result);
3772
3774 NewLinePtr = nullptr;
3775
3776 switch (Char) {
// NUL byte: end of buffer, a code-completion point, or a stray null in the
// file (diagnosed, then handled through SkipWhitespace).
3777 case 0:
3778
3779 if (CurPtr-1 == BufferEnd)
3780 return LexEndOfFile(Result, CurPtr-1);
3781
3782
3783 if (isCodeCompletionPoint(CurPtr-1)) {
3784
3785 Result.startToken();
3786 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3787 return true;
3788 }
3789
3791 Diag(CurPtr-1, diag::null_in_file);
3793 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3794 return true;
3795
3796
3797
3798 goto LexNextToken;
3799
// Character code 26: with LangOpts.MicrosoftExt it is handled as end of
// file; otherwise it becomes an unknown token.
3800 case 26:
3801
3802 if (LangOpts.MicrosoftExt) {
3804 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3805 return LexEndOfFile(Result, CurPtr-1);
3806 }
3807
3808
3809 Kind = tok::unknown;
3810 break;
3811
// A '\r' directly followed by '\n' is consumed as one line ending.
3812 case '\r':
3813 if (CurPtr[0] == '\n')
3814 (void)getAndAdvanceChar(CurPtr, Result);
3815 [[fallthrough]];
3816 case '\n':
3817
3818
3820
3822
3823
3824 if (PP)
3826
3827
// This path ends the current line with a tok::eod token and records that the
// next token starts a new (physical) line.
3828 IsAtStartOfLine = true;
3829 IsAtPhysicalStartOfLine = true;
3830 NewLinePtr = CurPtr - 1;
3831
3832 Kind = tok::eod;
3833 break;
3834 }
3835
3836
3838
3839 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3840 return true;
3841
3842
3843
3844 goto LexNextToken;
3845 case ' ':
3846 case '\t':
3847 case '\f':
3848 case '\v':
3849 SkipHorizontalWhitespace:
3851 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3852 return true;
3853
// After skipping, keep consuming directly following comments and whitespace
// here instead of restarting the whole function for each of them.
3854 SkipIgnoredUnits:
3855 CurPtr = BufferPtr;
3856
3857
3858
3859 if (CurPtr[0] == '/' && CurPtr[1] == '/' && () &&
3860 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3861 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3862 return true;
3863 goto SkipIgnoredUnits;
3864 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && ()) {
3865 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3866 return true;
3867 goto SkipIgnoredUnits;
3869 goto SkipHorizontalWhitespace;
3870 }
3871
3872
3873 goto LexNextToken;
3874
3875
3876
// Digits start a numeric constant.
3877 case '0': case '1': case '2': case '3': case '4':
3878 case '5': case '6': case '7': case '8': case '9':
3879
3880 MIOpt.ReadToken();
3881 return LexNumericConstant(Result, CurPtr);
3882
3883
3884
3885
// 'u' may introduce a UTF-16 string/char literal (u"", u'', uR"") or a UTF-8
// one (u8"", u8'', u8R""), but only when CPlusPlus11 or C11 is enabled.
// Anything else is lexed as an identifier.
3886 case 'u':
3887
3888 MIOpt.ReadToken();
3889
3890 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3891 Char = getCharAndSize(CurPtr, SizeTmp);
3892
3893
3894 if (Char == '"')
3895 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3896 tok::utf16_string_literal);
3897
3898
3899 if (Char == '\'')
3900 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3901 tok::utf16_char_constant);
3902
3903
3904 if (Char == 'R' && LangOpts.RawStringLiterals &&
3905 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3906 return LexRawStringLiteral(Result,
3907 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3909 tok::utf16_string_literal);
3910
3911 if (Char == '8') {
3912 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3913
3914
3915 if (Char2 == '"')
3916 return LexStringLiteral(Result,
3917 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3919 tok::utf8_string_literal);
// u8 character constants additionally require CPlusPlus17 or C23.
3920 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3921 return LexCharConstant(
3922 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3924 tok::utf8_char_constant);
3925
3926 if (Char2 == 'R' && LangOpts.RawStringLiterals) {
3927 unsigned SizeTmp3;
3928 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3929
3930 if (Char3 == '"') {
3931 return LexRawStringLiteral(Result,
3932 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3935 tok::utf8_string_literal);
3936 }
3937 }
3938 }
3939 }
3940
3941
3942 return LexIdentifierContinue(Result, CurPtr);
3943
// 'U' may introduce a UTF-32 string/char literal (U"", U'', UR"") under the
// same CPlusPlus11/C11 condition; otherwise it starts an identifier.
3944 case 'U':
3945
3946 MIOpt.ReadToken();
3947
3948 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3949 Char = getCharAndSize(CurPtr, SizeTmp);
3950
3951
3952 if (Char == '"')
3953 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3954 tok::utf32_string_literal);
3955
3956
3957 if (Char == '\'')
3958 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3959 tok::utf32_char_constant);
3960
3961
3962 if (Char == 'R' && LangOpts.RawStringLiterals &&
3963 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3964 return LexRawStringLiteral(Result,
3965 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3967 tok::utf32_string_literal);
3968 }
3969
3970
3971 return LexIdentifierContinue(Result, CurPtr);
3972
// 'R' followed by '"' is a raw string literal when RawStringLiterals is set.
3973 case 'R':
3974
3975 MIOpt.ReadToken();
3976
3977 if (LangOpts.RawStringLiterals) {
3978 Char = getCharAndSize(CurPtr, SizeTmp);
3979
3980 if (Char == '"')
3981 return LexRawStringLiteral(Result,
3982 ConsumeChar(CurPtr, SizeTmp, Result),
3983 tok::string_literal);
3984 }
3985
3986
3987 return LexIdentifierContinue(Result, CurPtr);
3988
// 'L' may introduce a wide string (L"", LR"") or wide character (L'')
// literal; otherwise it falls through to the identifier case below.
3989 case 'L':
3990
3991 MIOpt.ReadToken();
3992 Char = getCharAndSize(CurPtr, SizeTmp);
3993
3994
3995 if (Char == '"')
3996 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3997 tok::wide_string_literal);
3998
3999
4000 if (LangOpts.RawStringLiterals && Char == 'R' &&
4001 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
4002 return LexRawStringLiteral(Result,
4003 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4005 tok::wide_string_literal);
4006
4007
4008 if (Char == '\'')
4009 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4010 tok::wide_char_constant);
4011
4012 [[fallthrough]];
4013
4014
// Remaining ASCII letters and '_' start an identifier. 'L', 'R', 'U' and 'u'
// are absent from these labels because they are handled above.
4015 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
4016 case 'H': case 'I': case 'J': case 'K': case 'M': case 'N':
4017 case 'O': case 'P': case 'Q': case 'S': case 'T':
4018 case 'V': case 'W': case 'X': case 'Y': case 'Z':
4019 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
4020 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
4021 case 'o': case 'p': case 'q': case 'r': case 's': case 't':
4022 case 'v': case 'w': case 'x': case 'y': case 'z':
4023 case '_':
4024
4025 MIOpt.ReadToken();
4026 return LexIdentifierContinue(Result, CurPtr);
4027
// '$' starts an identifier only when DollarIdents is enabled (with an
// extension diagnostic); otherwise it is an unknown token.
4028 case '$':
4029 if (LangOpts.DollarIdents) {
4031 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
4032
4033 MIOpt.ReadToken();
4034 return LexIdentifierContinue(Result, CurPtr);
4035 }
4036
4037 Kind = tok::unknown;
4038 break;
4039
4040
4041 case '\'':
4042
4043 MIOpt.ReadToken();
4044 return LexCharConstant(Result, CurPtr, tok::char_constant);
4045
4046
4047 case '"':
4048
4049 MIOpt.ReadToken();
4050 return LexStringLiteral(Result, CurPtr,
4052 : tok::string_literal);
4053
4054
// Punctuators. Each case peeks at the following character(s) with
// getCharAndSize and only consumes them once a longer token is recognised.
4055 case '?':
4056 Kind = tok::question;
4057 break;
4058 case '[':
4059 Kind = tok::l_square;
4060 break;
4061 case ']':
4062 Kind = tok::r_square;
4063 break;
4064 case '(':
4065 Kind = tok::l_paren;
4066 break;
4067 case ')':
4068 Kind = tok::r_paren;
4069 break;
4070 case '{':
4071 Kind = tok::l_brace;
4072 break;
4073 case '}':
4074 Kind = tok::r_brace;
4075 break;
// '.' : a following digit makes this a numeric constant; ".*" is only
// recognised in C++; "..." is the ellipsis.
4076 case '.':
4077 Char = getCharAndSize(CurPtr, SizeTmp);
4078 if (Char >= '0' && Char <= '9') {
4079
4080 MIOpt.ReadToken();
4081
4082 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
4083 } else if (LangOpts.CPlusPlus && Char == '*') {
4084 Kind = tok::periodstar;
4085 CurPtr += SizeTmp;
4086 } else if (Char == '.' &&
4087 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
4088 Kind = tok::ellipsis;
4089 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4091 } else {
4092 Kind = tok::period;
4093 }
4094 break;
4095 case '&':
4096 Char = getCharAndSize(CurPtr, SizeTmp);
4097 if (Char == '&') {
4098 Kind = tok::ampamp;
4099 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4100 } else if (Char == '=') {
4101 Kind = tok::ampequal;
4102 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4103 } else {
4104 Kind = tok::amp;
4105 }
4106 break;
4107 case '*':
4108 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4109 Kind = tok::starequal;
4110 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4111 } else {
4112 Kind = tok::star;
4113 }
4114 break;
4115 case '+':
4116 Char = getCharAndSize(CurPtr, SizeTmp);
4117 if (Char == '+') {
4118 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4119 Kind = tok::plusplus;
4120 } else if (Char == '=') {
4121 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4122 Kind = tok::plusequal;
4123 } else {
4124 Kind = tok::plus;
4125 }
4126 break;
// '-' : "->*" is only recognised in C++ and is tested before plain "->".
4127 case '-':
4128 Char = getCharAndSize(CurPtr, SizeTmp);
4129 if (Char == '-') {
4130 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4131 Kind = tok::minusminus;
4132 } else if (Char == '>' && LangOpts.CPlusPlus &&
4133 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {
4134 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4136 Kind = tok::arrowstar;
4137 } else if (Char == '>') {
4138 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4139 Kind = tok::arrow;
4140 } else if (Char == '=') {
4141 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4142 Kind = tok::minusequal;
4143 } else {
4144 Kind = tok::minus;
4145 }
4146 break;
4147 case '~':
4148 Kind = tok::tilde;
4149 break;
4150 case '!':
4151 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4152 Kind = tok::exclaimequal;
4153 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4154 } else {
4155 Kind = tok::exclaim;
4156 }
4157 break;
// '/' : line comment, block comment, "/=" or plain '/'.
4158 case '/':
4159
4160 Char = getCharAndSize(CurPtr, SizeTmp);
4161 if (Char == '/') {
4162
4163
4164
4165
4166
4167
4168
4169
// "//" is a comment when LineComment is enabled (and we are in C++ or not in
// traditional-cpp mode). Otherwise, unless the input is preprocessed output,
// it is still treated as a comment as long as the next character is not '*'.
4170 bool TreatAsComment =
4171 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4172 if (!TreatAsComment)
4173 if (!(PP && PP->isPreprocessedOutput()))
4174 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4175
4176 if (TreatAsComment) {
4177 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4178 TokAtPhysicalStartOfLine))
4179 return true;
4180
4181
4182
4183
4184 goto SkipIgnoredUnits;
4185 }
4186 }
4187
4188 if (Char == '*') {
4189 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4190 TokAtPhysicalStartOfLine))
4191 return true;
4192
4193
4194
4195 goto LexNextToken;
4196 }
4197
4198 if (Char == '=') {
4199 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4200 Kind = tok::slashequal;
4201 } else {
4202 Kind = tok::slash;
4203 }
4204 break;
// '%' : besides "%=", handles the digraphs "%>" (r_brace), "%:" (hash),
// "%:%:" (hashhash) and, with MicrosoftExt, "%:@" (hashat).
4205 case '%':
4206 Char = getCharAndSize(CurPtr, SizeTmp);
4207 if (Char == '=') {
4208 Kind = tok::percentequal;
4209 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4210 } else if (LangOpts.Digraphs && Char == '>') {
4211 Kind = tok::r_brace;
4212 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4213 } else if (LangOpts.Digraphs && Char == ':') {
4214 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4215 Char = getCharAndSize(CurPtr, SizeTmp);
4216 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4217 Kind = tok::hashhash;
4218 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4220 } else if (Char == '@' && LangOpts.MicrosoftExt) {
4221 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4223 Diag(BufferPtr, diag::ext_charize_microsoft);
4224 Kind = tok::hashat;
4225 } else {
4226
4227
4228
4229
// Same directive check as for a literal '#'. The middle operand of this
// condition is missing from the listing.
4230 if (TokAtPhysicalStartOfLine && && !Is_PragmaLexer)
4231 goto HandleDirective;
4232
4233 Kind = tok::hash;
4234 }
4235 } else {
4236 Kind = tok::percent;
4237 }
4238 break;
// '<' : angled header names, shifts, conflict markers, CUDA "<<<", "<=",
// "<=>", the digraphs "<:" (l_square) and "<%" (l_brace), and editor
// placeholders starting with "<#".
4239 case '<':
4240 Char = getCharAndSize(CurPtr, SizeTmp);
4242 return LexAngledStringLiteral(Result, CurPtr);
4243 } else if (Char == '<') {
4244 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4245 if (After == '=') {
4246 Kind = tok::lesslessequal;
4247 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4249 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4250
4251
4252 goto LexNextToken;
4253 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4254
4255
4256 goto LexNextToken;
4257 } else if (LangOpts.CUDA && After == '<') {
4258 Kind = tok::lesslessless;
4259 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4261 } else {
4262 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4263 Kind = tok::lessless;
4264 }
4265 } else if (Char == '=') {
4266 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4267 if (After == '>') {
// "<=>" forms tok::spaceship only with CPlusPlus20; otherwise control falls
// out of this `if` and "<=" is lexed as tok::lessequal below.
4268 if (LangOpts.CPlusPlus20) {
4270 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4271 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4273 Kind = tok::spaceship;
4274 break;
4275 }
4276
4277
4279 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4282 }
4283 }
4284 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4285 Kind = tok::lessequal;
4286 } else if (LangOpts.Digraphs && Char == ':') {
4287 if (LangOpts.CPlusPlus11 &&
4288 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4289
4290
4291
4292
4293
// In C++11, "<::" is lexed as '<' (not the "<:" digraph) unless the character
// after it is ':' or '>'.
4294 unsigned SizeTmp3;
4295 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4296 if (After != ':' && After != '>') {
4297 Kind = tok::less;
4299 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4300 break;
4301 }
4302 }
4303
4304 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4305 Kind = tok::l_square;
4306 } else if (LangOpts.Digraphs && Char == '%') {
4307 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4308 Kind = tok::l_brace;
4309 } else if (Char == '#' && SizeTmp == 1 &&
4310 lexEditorPlaceholder(Result, CurPtr)) {
4311 return true;
4312 } else {
4313 Kind = tok::less;
4314 }
4315 break;
// '>' : ">=", shifts, conflict markers and CUDA ">>>".
4316 case '>':
4317 Char = getCharAndSize(CurPtr, SizeTmp);
4318 if (Char == '=') {
4319 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4320 Kind = tok::greaterequal;
4321 } else if (Char == '>') {
4322 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4323 if (After == '=') {
4324 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4326 Kind = tok::greatergreaterequal;
4327 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4328
4329
4330 goto LexNextToken;
4331 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4332
4333 goto LexNextToken;
4334 } else if (LangOpts.CUDA && After == '>') {
4335 Kind = tok::greatergreatergreater;
4336 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4338 } else {
4339 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4340 Kind = tok::greatergreater;
4341 }
4342 } else {
4343 Kind = tok::greater;
4344 }
4345 break;
4346 case '^':
4347 Char = getCharAndSize(CurPtr, SizeTmp);
4348 if (Char == '=') {
4349 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4350 Kind = tok::caretequal;
4351 } else {
// OpenCL diagnoses "^^"; the token produced is still a single caret.
4352 if (LangOpts.OpenCL && Char == '^')
4353 Diag(CurPtr, diag::err_opencl_logical_exclusive_or);
4354 Kind = tok::caret;
4355 }
4356 break;
4357 case '|':
4358 Char = getCharAndSize(CurPtr, SizeTmp);
4359 if (Char == '=') {
4360 Kind = tok::pipeequal;
4361 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4362 } else if (Char == '|') {
4363
// "|||" may be the end of a conflict marker.
4364 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4365 goto LexNextToken;
4366 Kind = tok::pipepipe;
4367 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4368 } else {
4369 Kind = tok::pipe;
4370 }
4371 break;
// ':' : the ":>" digraph (r_square), "::" or plain ':'.
4372 case ':':
4373 Char = getCharAndSize(CurPtr, SizeTmp);
4374 if (LangOpts.Digraphs && Char == '>') {
4375 Kind = tok::r_square;
4376 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4377 } else if (Char == ':') {
4378 Kind = tok::coloncolon;
4379 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4380 } else {
4381 Kind = tok::colon;
4382 }
4383 break;
4384 case ';':
4385 Kind = tok::semi;
4386 break;
4387 case '=':
4388 Char = getCharAndSize(CurPtr, SizeTmp);
4389 if (Char == '=') {
4390
// "===" may be the end of a conflict marker.
4391 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4392 goto LexNextToken;
4393
4394 Kind = tok::equalequal;
4395 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4396 } else {
4397 Kind = tok::equal;
4398 }
4399 break;
4400 case ',':
4401 Kind = tok::comma;
4402 break;
// '#' : "##", "#@" (MicrosoftExt only), a preprocessor directive, or a plain
// hash token.
4403 case '#':
4404 Char = getCharAndSize(CurPtr, SizeTmp);
4405 if (Char == '#') {
4406 Kind = tok::hashhash;
4407 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4408 } else if (Char == '@' && LangOpts.MicrosoftExt) {
4409 Kind = tok::hashat;
4411 Diag(BufferPtr, diag::ext_charize_microsoft);
4412 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4413 } else {
4414
4415
4416
4417
// The middle operand of this condition is missing from the listing.
4418 if (TokAtPhysicalStartOfLine && && !Is_PragmaLexer)
4419 goto HandleDirective;
4420
4421 Kind = tok::hash;
4422 }
4423 break;
4424
4425 case '@':
4426
// '@' is only a real token (tok::at) when Objective-C is enabled. The
// CurPtr[-1] test looks at the raw byte just before CurPtr.
4427 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4428 Kind = tok::at;
4429 else
4430 Kind = tok::unknown;
4431 break;
4432
4433
// A backslash may start a UCN, except when preprocessing assembly. A
// successfully read code point is either Unicode whitespace (skipped) or the
// start of an identifier.
4434 case '\\':
4435 if (!LangOpts.AsmPreprocessor) {
4436 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4437 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4438 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4439 return true;
4440
4441
4442
4443 goto LexNextToken;
4444 }
4445
4446 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4447 }
4448 }
4449
4450 Kind = tok::unknown;
4451 break;
4452
// Everything else. The condition that opens the first block (source line
// 4454) is missing from the listing.
4453 default: {
4455 Kind = tok::unknown;
4456 break;
4457 }
4458
4459 llvm::UTF32 CodePoint;
4460
4461
4462
// Step back to the first byte of the character and try to decode a strict
// UTF-8 sequence starting there. convertUTF8Sequence is handed the address of
// CurPtr and BufferEnd as the limit.
4463 --CurPtr;
4464 llvm::ConversionResult Status =
4465 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4466 (const llvm::UTF8 *)BufferEnd,
4467 &CodePoint,
4468 llvm::strictConversion);
4469 if (Status == llvm::conversionOK) {
4470 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4471 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4472 return true;
4473
4474
4475
4476 goto LexNextToken;
4477 }
4478 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4479 }
4480
// Decoding failed. In the case guarded below (first line of the condition is
// missing) the single byte becomes an unknown token; otherwise the byte is
// diagnosed as invalid UTF-8, skipped, and lexing restarts.
4482 PP->isPreprocessedOutput()) {
4483 ++CurPtr;
4484 Kind = tok::unknown;
4485 break;
4486 }
4487
4488
4489
4490
4491 Diag(CurPtr, diag::err_invalid_utf8);
4492
4493 BufferPtr = CurPtr+1;
4494
4495
4496
4497 goto LexNextToken;
4498 }
4499 }
4500
4501
// Common tail for every case that set Kind and broke out of the switch.
4502 MIOpt.ReadToken();
4503
4504
4505 FormTokenWithChars(Result, CurPtr, Kind);
4506 return true;
4507
4508HandleDirective:
4509
4510
// Form the '#' token and hand the directive to the preprocessor. Returns
// false afterwards, except after a fatal module-loader failure, where the
// hash token just formed is returned (true).
4511 FormTokenWithChars(Result, CurPtr, tok::hash);
4512 PP->HandleDirective(Result);
4513
4514 if (PP->hadModuleLoaderFatalFailure())
4515
4516 return true;
4517
4518
4519 return false;
4520
4521LexNextToken:
// Restart lexing from the top of the function.
4523 goto LexStart;
4524}
4525
4526const char *Lexer::convertDependencyDirectiveToken(
4528 const char *TokPtr = BufferStart + DDTok.Offset;
4529 Result.startToken();
4534 BufferPtr = TokPtr + DDTok.Length;
4535 return TokPtr;
4536}
4537
/// LexDependencyDirectiveToken - Produce the next token from the pre-scanned
/// dependency directives (DepDirectives) instead of lexing the buffer
/// character by character. Only valid on a dependency-directives lexer.
///
/// Returns false after a '#' at start of line has been handed to
/// PP->HandleDirective (unless a fatal module-loader failure occurred);
/// otherwise returns true or forwards the result of LexEndOfFile /
/// PP->HandleIdentifier.
///
/// NOTE(review): source lines 4562, 4589, 4591 and 4611 are missing from this
/// listing, including the condition that opens the header-name block.
4538bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4539 assert(isDependencyDirectivesLexer());
4540
4541 using namespace dependency_directives_scan;
4542
4543 if (BufferPtr == BufferEnd)
4544 return LexEndOfFile(Result, BufferPtr);
4545
// Advance to the next directive whenever all tokens of the current one have
// been consumed. A pp_eof directive ends the file.
4546 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4547 if (DepDirectives.front().Kind == pp_eof)
4548 return LexEndOfFile(Result, BufferEnd);
4549 if (DepDirectives.front().Kind == tokens_present_before_eof)
4550 MIOpt.ReadToken();
4551 NextDepDirectiveTokenIndex = 0;
4552 DepDirectives = DepDirectives.drop_front();
4553 }
4554
4555 const dependency_directives_scan::Token &DDTok =
4556 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
// Every token except a '#' that is the first token of its directive is
// reported to the multiple-include optimisation as a read token.
4557 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4558
4559 MIOpt.ReadToken();
4560 }
4561
// Re-lex this token from the buffer as an angled string literal, starting one
// character after the token start. If that yields a header_name, skip every
// pre-scanned token that begins before the new BufferPtr.
4563 BufferPtr = BufferStart + DDTok.Offset;
4564 LexAngledStringLiteral(Result, BufferPtr + 1);
4565 if (Result.isNot(tok::header_name))
4566 return true;
4567
4568 while (true) {
4569 const dependency_directives_scan::Token &NextTok =
4570 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4571 if (BufferStart + NextTok.Offset >= BufferPtr)
4572 break;
4573 ++NextDepDirectiveTokenIndex;
4574 }
4575 return true;
4576 }
4577
4578 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4579
// A '#' at start of line is a directive: let the preprocessor handle it.
4580 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4581 PP->HandleDirective(Result);
4582 if (PP->hadModuleLoaderFatalFailure())
4583
4584 return true;
4585 return false;
4586 }
// Raw identifiers are given their spelling pointer and looked up through the
// preprocessor.
4587 if (Result.is(tok::raw_identifier)) {
4588 Result.setRawIdentifierData(TokPtr);
4590 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
4592 return PP->HandleIdentifier(Result);
4593 }
4594 return true;
4595 }
4596 if (Result.isLiteral()) {
4597 Result.setLiteralData(TokPtr);
4598 return true;
4599 }
4600 if (Result.is(tok::colon)) {
4601
// Two adjacent ':' tokens are merged into one '::' token. The second ':' is
// consumed from the pre-scanned token list (asserted to be a colon).
4602 if (*BufferPtr == ':') {
4603 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4604 tok::colon));
4605 ++NextDepDirectiveTokenIndex;
4606 Result.setKind(tok::coloncolon);
4607 }
4608 return true;
4609 }
4610 if (Result.is(tok::eod))
4612
4613 return true;
4614}
4615
/// LexDependencyDirectiveTokenWhileSkipping - Walk forward through the
/// pre-scanned dependency directives, one directive per loop iteration, until
/// a directive that sets `Stop` is reached at nesting depth zero. NestedIfs
/// tracks how deep the walk currently is inside nested conditionals. Only
/// valid on a dependency-directives lexer.
///
/// When the walk stops, \p Result is filled from the first token of that
/// directive (asserted to be '#'), NextDepDirectiveTokenIndex is set to 1, and
/// false is returned. If the end-of-file path is reached first, the result of
/// LexEndOfFile is returned instead.
///
/// NOTE(review): every `case` label of the switch is missing from this
/// listing, so which directive kinds map to which branch cannot be told from
/// the visible code.
4616bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4617 assert(isDependencyDirectivesLexer());
4618
4619 using namespace dependency_directives_scan;
4620
4621 bool Stop = false;
4622 unsigned NestedIfs = 0;
4623 do {
4624 DepDirectives = DepDirectives.drop_front();
4625 switch (DepDirectives.front().Kind) {
4627 llvm_unreachable("unexpected 'pp_none'");
4646 break;
// Branch that enters one more level of nesting.
4650 ++NestedIfs;
4651 break;
// Branch that stops only at depth zero and leaves the depth unchanged.
4656 if (!NestedIfs) {
4657 Stop = true;
4658 }
4659 break;
// Branch that stops at depth zero and otherwise leaves one level of nesting.
4661 if (!NestedIfs) {
4662 Stop = true;
4663 } else {
4664 --NestedIfs;
4665 }
4666 break;
// End-of-file branch.
4668 NextDepDirectiveTokenIndex = 0;
4669 return LexEndOfFile(Result, BufferEnd);
4670 }
4671 } while (!Stop);
4672
4673 const dependency_directives_scan::Token &DDTok =
4674 DepDirectives.front().Tokens.front();
4675 assert(DDTok.is(tok::hash));
4676 NextDepDirectiveTokenIndex = 1;
4677
4678 convertDependencyDirectiveToken(DDTok, Result);
4679 return false;
4680}
Defines the Diagnostic-related interfaces.
unsigned IsFirst
Indicates that this is the first token of the file.
FormatToken * Next
The next token in the unwrapped line.
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition Lexer.cpp:944
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition Lexer.cpp:1564
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition Lexer.cpp:1758
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?
Definition Lexer.cpp:1257
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition Lexer.cpp:324
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition Lexer.cpp:3245
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition Lexer.cpp:1683
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition Lexer.cpp:560
static void StringifyImpl(T &Str, char Quote)
Definition Lexer.cpp:284
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition Lexer.cpp:1185
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition Lexer.cpp:1578
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition Lexer.cpp:1648
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition Lexer.cpp:1545
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition Lexer.cpp:1632
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition Lexer.cpp:3349
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition Lexer.cpp:1551
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition Lexer.cpp:918
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition Lexer.cpp:2778
static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)
Definition Lexer.cpp:1924
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?
Definition Lexer.cpp:1238
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition Lexer.cpp:1606
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition Lexer.cpp:1654
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition Lexer.cpp:543
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
A signed integer type that is the result of subtracting two pointers.
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Concrete class used by the front-end to report problems and issues.
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
bool isKeyword(const LangOptions &LangOpts) const
Return true if this token is a keyword in the specified language.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition Lexer.cpp:1020
friend class Preprocessor
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition Lexer.cpp:1377
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
static unsigned getEscapedNewLineSize(const char *P)
getEscapedNewLineSize - Return the size of the specified escaped newline, or 0 if it is not an escape...
Definition Lexer.cpp:1276
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
void SetCommentRetentionState(bool Mode)
SetCommentRetentionState - Change the comment retention mode of the lexer to the specified mode.
static std::optional< Token > findPreviousToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments)
Finds the token that comes before the given location.
Definition Lexer.cpp:1352
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition Lexer.cpp:277
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition Lexer.cpp:1056
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition Lexer.cpp:3086
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition Lexer.cpp:870
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition Lexer.cpp:1228
const char * getBufferLocation() const
Return the current location in the buffer.
bool Lex(Token &Result)
Lex - Return the next token in the file.
Definition Lexer.cpp:3703
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
static void DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc, bool Named, const LangOptions &Opts, DiagnosticsEngine &Diags)
Diagnose use of a delimited or named escape sequence.
Definition Lexer.cpp:3389
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition Lexer.cpp:789
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition Lexer.cpp:183
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition Lexer.cpp:892
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition Lexer.cpp:951
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether the new line pointed to by Str is preceded by an escape sequence.
Definition Lexer.cpp:1134
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition Lexer.cpp:1209
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for the line that corresponds to the given location Loc.
Definition Lexer.cpp:1154
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition Lexer.cpp:451
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition Lexer.cpp:1130
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments=false)
Finds the token that comes right after the given location.
Definition Lexer.cpp:1321
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition Lexer.cpp:498
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...
Definition Lexer.cpp:608
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition Lexer.cpp:219
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition Lexer.cpp:1103
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition Lexer.cpp:242
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition Lexer.cpp:635
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition Lexer.cpp:509
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition Lexer.cpp:848
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition Lexer.cpp:309
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xxx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
SourceManager & getSourceManager() const
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
bool isMacroArgExpansion() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
IdentifierInfo * getIdentifierInfo() const
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
unsigned getLength() const
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition Lexer.cpp:69
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
tok::TokenKind getKind() const
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition Lexer.cpp:60
bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const
Determine whether the token kind starts a simple-type-specifier.
Definition Lexer.cpp:77
void startToken()
Reset all flags to cleared.
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
void setFlag(TokenFlags Flag)
Set the specified flag.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ pp_pragma_system_header
@ pp_pragma_include_alias
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
@ Match
This is not an overload because the signature exactly matches an existing declaration.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
@ CMK_None
Not within a conflict marker.
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
std::pair< FileID, unsigned > FileIDAndOffset
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
@ Result
The result type of a method or function.
const FunctionProtoType * T
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].
@ Keyword
The name has been typo-corrected to a keyword.
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + ...)
#define _mm_cmpistri(A, B, M)
Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...
#define _SIDD_LEAST_SIGNIFICANT
#define _SIDD_NEGATIVE_POLARITY
Represents a char and the number of bytes parsed to produce it.
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.
bool is(tok::TokenKind K) const