LLVM: lib/Support/UnicodeNameToCodepoint.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
19
20namespace llvm {
21namespace sys {
23
28
30
// Code point carried by this node. 0xFFFFFFFF is the default and is the value
// the lookups further down in this file compare against before reporting a
// node as carrying a result.
33 char32_t Value = 0xFFFFFFFF;
39
// NOTE(review): the operand in front of `.empty()` is missing in this copy of
// the file, so the exact validity rule cannot be confirmed from here.
41 return .empty() || Value == 0xFFFFFFFF;
42 }
44
// Accumulator for the assembled name.
46 std::string S;
47
48
49
50 S.reserve(46);
// Every visited node appends its name back to front.
52 while (N) {
53 std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
// NOTE(review): the statement that advances N is missing in this copy;
// presumably it steps to N->Parent -- confirm against upstream.
55 }
// One final reversal puts each name back in reading order and moves the
// names visited last to the front of the result.
56 std::reverse(S.begin(), S.end());
57 return S;
58 }
59};
60
// Root node setup (the declaration lines of this function are missing in this
// copy): the node is flagged as root, its children start at offset 1 and its
// own size is 1.
63 N.IsRoot = true;
64 N.ChildrenOffset = 1;
65 N.Size = 1;
66 return N;
67}
68
72
// Record the parent pointer supplied by the caller.
75 N.Parent = Parent;
// NOTE(review): the condition guarding this early return is missing in this
// copy of the file.
78 return N;
79
// Header decoding: bit 0x40 selects the long-name form, bit 0x80 says a value
// follows, and the remaining bits give a size (the low six bits, assuming
// NameInfo is a single byte -- TODO confirm).
80 bool LongName = NameInfo & 0x40;
81 bool HasValue = NameInfo & 0x80;
82 std::size_t Size = NameInfo & ~0xC0;
// NOTE(review): both branch bodies are missing in this copy.
83 if (LongName) {
87 } else {
89 }
// H, M and L are combined into one word and the low three bits are shifted
// out to obtain the value. Those low bits of L carry flags instead: 0x02
// means the node has children, 0x01 means it has a sibling.
94 N.Value = ((H << 16) | (M << 8) | L) >> 3;
95
96 bool HasChildren = L & 0x02;
97 N.HasSibling = L & 0x01;
98
// NOTE(review): the body that reads the children offset is missing here.
99 if (HasChildren) {
103 }
104 } else {
// Without a value, the children flag is bit 0x40 of H.
107 bool HasChildren = H & 0x40;
109 if (HasChildren) {
// H supplies the bits above bit 15 of the children offset. NOTE(review): the
// operands or-ed in afterwards are missing in this copy.
110 N.ChildrenOffset = (H << 16);
111 N.ChildrenOffset |=
114 }
115 }
117 return N;
118}
119
121 std::size_t &Consummed, char &PreviousCharInName,
122 bool IsPrefix = false) {
123
// Consummed is an output: the number of characters of Name covered by the
// match attempt. It is reset before anything else happens.
124 Consummed = 0;
// Strict mode is a plain prefix comparison; nothing is skipped and a match
// consumes exactly the needle length.
125 if (Strict) {
126 if (!Name.starts_with(Needle))
127 return false;
128 Consummed = Needle.size();
129 return true;
130 }
// Loose mode: an empty needle matches without consuming anything.
131 if (Needle.empty())
132 return true;
133
134 auto NamePos = Name.begin();
135 auto NeedlePos = Needle.begin();
136
// Saved so the caller-visible previous character can be restored when the
// match fails.
137 char PreviousCharInNameOrigin = PreviousCharInName;
138 char PreviousCharInNeedle = *Needle.begin();
// Helper that moves an iterator past characters loose matching skips: spaces,
// underscores and some hyphens. A hyphen qualifies only after an alphanumeric
// character; NOTE(review): the rest of that condition and the statement that
// tests the result are missing in this copy. PreviousChar is updated to the
// character just examined.
139 auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
140 bool IsPrefix = false) {
141 while (It != End) {
142 const auto Next = std::next(It);
143
144
145
147 *It == ' ' || *It == '_' ||
148 (*It == '-' && isAlnum(PreviousChar) &&
150 PreviousChar = *It;
152 break;
153 ++It;
154 }
155 return It;
156 };
157
// Skip ignorable characters on both sides, then step through both strings one
// character at a time until either is exhausted.
// NOTE(review): the comparison guarding the third break is missing in this
// copy of the file.
158 while (true) {
159 NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
160 NeedlePos =
161 IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
162 if (NeedlePos == Needle.end())
163 break;
164 if (NamePos == Name.end())
165 break;
167 break;
168 NeedlePos++;
169 NamePos++;
170 }
// The count is measured in positions of Name, so skipped characters are
// included in it.
171 Consummed = std::distance(Name.begin(), NamePos);
// Success means the whole needle was walked; on failure the previous
// character state is rolled back for the caller.
172 if (NeedlePos != Needle.end()) {
173 PreviousCharInName = PreviousCharInNameOrigin;
174 }
175 return NeedlePos == Needle.end();
176}
177
// Recursive lookup of Name starting at one node. Returns a tuple of
// (node, matched flag, value); the value is 0 whenever the flag is false.
// NOTE(review): the line carrying the function name and its first parameters,
// and the line that produces N, are missing in this copy of the file.
178static std::tuple<Node, bool, uint32_t>
180 char PreviousCharInName, BufferType &Buffer,
181 const Node *Parent = nullptr) {
183 std::size_t Consummed = 0;
// The root is accepted without a prefix test; any other node must have its
// own name at the front of Name.
184 bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
185 PreviousCharInName);
186 if (!DoesStartWith)
187 return std::make_tuple(N, false, 0);
188
// Complete match: nothing of Name is left and this node carries a value
// other than 0xFFFFFFFF.
189 if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
190 return std::make_tuple(N, true, N.Value);
191
192 if (N.hasChildren()) {
193 uint32_t ChildOffset = N.ChildrenOffset;
// Try each child in turn with the unmatched remainder of Name.
// NOTE(review): the declarations of C and Value are missing in this copy.
194 for (;;) {
196 bool Matches;
198 std::tie(C, Matches, Value) =
199 compareNode(ChildOffset, Name.substr(Consummed), Strict,
200 PreviousCharInName, Buffer, &N);
201 if (Matches) {
// The matching child appends its name to Buffer back to front, so Buffer is
// built in reverse as the recursion unwinds.
202 std::reverse_copy(C.Name.begin(), C.Name.end(),
203 std::back_inserter(Buffer));
204 return std::make_tuple(N, true, Value);
205 }
// Step over the child just examined.
206 ChildOffset += C.Size;
// NOTE(review): the operand in front of `.HasSibling` is missing in this
// copy, so the exact loop exit test cannot be confirmed from here.
207 if (.HasSibling)
208 break;
209 }
210 }
211 return std::make_tuple(N, false, 0);
212}
213
214static std::tuple<Node, bool, uint32_t>
218
219
// Table rows with three string columns each. The first column has 19 entries,
// the second 21 and the third 28; cells past the end of a shorter column are
// written as 0. Empty strings are real entries, not padding.
// NOTE(review): the declaration line that opens this table is missing in this
// copy of the file.
221 { "G", "A", "" },
222 { "GG", "AE", "G" },
223 { "N", "YA", "GG" },
224 { "D", "YAE", "GS" },
225 { "DD", "EO", "N", },
226 { "R", "E", "NJ" },
227 { "M", "YEO", "NH" },
228 { "B", "YE", "D" },
229 { "BB", "O", "L" },
230 { "S", "WA", "LG" },
231 { "SS", "WAE", "LM" },
232 { "", "OE", "LB" },
233 { "J", "YO", "LS" },
234 { "JJ", "U", "LT" },
235 { "C", "WEO", "LP" },
236 { "K", "WE", "LH" },
237 { "T", "WI", "M" },
238 { "P", "YU", "B" },
239 { "H", "EU", "BS" },
240 { 0, "YI", "S" },
241 { 0, "I", "SS" },
242 { 0, 0, "NG" },
243 { 0, 0, "J" },
244 { 0, 0, "C" },
245 { 0, 0, "K" },
246 { 0, 0, "T" },
247 { 0, 0, "P" },
248 { 0, 0, "H" }
249 };
250
251
252
253
// Base code point that nameToHangulCodePoint adds the combined L/V/T index to.
254constexpr char32_t SBase = 0xAC00;
258
260 char &PreviousInName, int &Pos, int Column) {
// Finds the best candidate of one column that Name starts with. On success
// Pos receives the index of the winning candidate, PreviousInName is updated
// and the number of consumed characters is returned. When nothing matches the
// function returns 0 and leaves Pos and PreviousInName untouched.
// NOTE(review): the first signature line, the CountPerColumn definition and
// the line that builds Syllable are missing in this copy of the file.
261 assert(Column == 0 || Column == 1 || Column == 2);
// -1 means no candidate has matched yet.
263 int Len = -1;
264 int Prev = PreviousInName;
265 for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
// A candidate no longer than the best consumed length so far is not tried.
267 if (int(Syllable.size()) <= Len)
268 continue;
269 std::size_t Consummed = 0;
// Work on a copy so a failed attempt does not disturb the caller state.
270 char PreviousInNameCopy = PreviousInName;
271 bool DoesStartWith =
272 startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);
273 if (!DoesStartWith)
274 continue;
275 Len = Consummed;
276 Pos = I;
277 Prev = PreviousInNameCopy;
278 }
279 if (Len == -1)
280 return 0;
281 PreviousInName = Prev;
282 return size_t(Len);
283}
284
// Resolves a name that begins with the fixed Hangul syllable prefix to a code
// point, or returns std::nullopt when the name does not have that shape.
// NOTE(review): the line carrying the function name and parameters is missing
// in this copy of the file.
285static std::optional<char32_t>
288
289 std::size_t Consummed = 0;
// Previous-character state threaded through the matching helpers below.
290 char NameStart = 0;
291 bool DoesStartWith =
292 startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);
293 if (!DoesStartWith)
294 return std::nullopt;
295 Name = Name.substr(Consummed);
// One lookup per column. Each call removes what it matched from the front of
// Name and stores the matched index; an index stays -1 when nothing matched.
296 int L = -1, V = -1, T = -1;
297 Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
298 Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
299 Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
// All three parts must be present and nothing may be left over.
300 if (L != -1 && V != -1 && T != -1 && Name.empty()) {
// In loose mode the matched name is written to Buffer.
// NOTE(review): the three checks below repeat the enclosing condition, and
// the statements they guard are missing in this copy.
301 if (!Strict) {
302 Buffer.append("HANGUL SYLLABLE ");
303 if (L != -1)
305 if (V != -1)
307 if (T != -1)
309 }
// Combine the three indices into a single offset from SBase.
310 return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
311 std::uint32_t(T);
312 }
313
314 return std::nullopt;
315}
316
322
323
// One row per range of generated names: a name prefix followed by two code
// points, presumably the first and last code point of the range. The check
// that uses them is not visible in this copy, so whether the second value is
// inclusive cannot be confirmed from here. The same prefix appears on several
// rows when its code points are split over more than one range.
// NOTE(review): the declaration line that opens this table is missing in this
// copy of the file.
325 {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
326 {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
327 {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
328 {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
329 {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
330 {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
331 {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
332 {"CJK UNIFIED IDEOGRAPH-", 0x2EBF0, 0x2EE5D},
333 {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
334 {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
335 {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
336 {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
337 {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
338 {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
339 {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
340 {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
341 {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
342};
343
// Resolves a name made of a table prefix followed by a number. Returns
// std::nullopt when no table entry accepts the name.
// NOTE(review): the line carrying the function name and parameters, and the
// loop header that introduces Item, are missing in this copy of the file.
344static std::optional<char32_t>
348 std::size_t Consummed = 0;
349 char NameStart = 0;
// The trailing `true` is the IsPrefix argument of startsWith.
350 bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
351 NameStart, true);
352 if (!DoesStartWith)
353 continue;
// Whatever follows the prefix is treated as the number part.
354 auto Number = Name.substr(Consummed);
355 unsigned long long V = 0;
356
// NOTE(review): the parsing and range conditions around the next two exits
// are missing in this copy. What remains shows a strict-mode failure
// returning an empty optional and another failure moving to the next entry.
357 if (Strict &&
359 return {};
361 continue;
// In loose mode the table prefix is written to Buffer. NOTE(review): the
// statement that follows the append is missing in this copy.
362 if (!Strict) {
363 Buffer.append(Item.Prefix);
365 }
366 return V;
367 }
368 return std::nullopt;
369}
370
// An empty name never matches.
373 if (Name.empty())
374 return std::nullopt;
375
// NOTE(review): the calls that produce Res are missing in this copy. The
// visible logic consults a second source only when the first produced
// nothing, and returns as soon as either has a result.
377 if (!Res)
379 if (Res)
380 return *Res;
381
// NOTE(review): the declarations next to Matches and the lookup call that
// fills it are missing in this copy.
384 bool Matches;
387 if (Matches) {
// compareNode appends node names back to front, so Buffer is reversed once
// here to get the name in reading order.
388 std::reverse(Buffer.begin(), Buffer.end());
389
390
// Special case for loose matching: when the lookup resolved to 0x116c and the
// input contains `O-E` (compared case-insensitively), Buffer is replaced with
// a fixed name. NOTE(review): the statement after the assignment and the
// final statement of the outer block are missing in this copy.
391 if (!Strict && Value == 0x116c && Name.contains_insensitive("O-E")) {
392 Buffer = "HANGUL JUNGSEONG O-E";
394 }
396 }
397 return std::nullopt;
398}
399
406
// NOTE(review): most of this definition is missing in this copy of the file.
// What remains shows that an empty Opt is turned into std::nullopt.
407std::optional
411 if (!Opt)
412 return std::nullopt;
414}
415
416
417
420
421
422
423
// Largest distance accepted into Matches so far.
// NOTE(review): the function signature and the declaration of Matches are
// missing in this copy of the file.
424 std::size_t LargestEditDistance = 0;
// One slot of headroom: an entry is inserted before the overflow is handled.
426 Matches.reserve(MaxMatchesCount + 1);
427
// Insertion helper (the line naming the lambda is missing in this copy).
// Returns false when the candidate is rejected, true otherwise.
429 char32_t Value) -> bool {
// A candidate worse than everything kept so far is rejected outright once
// the list is full; otherwise it raises the recorded maximum.
430 if (Distance > LargestEditDistance) {
431 if (Matches.size() == MaxMatchesCount)
432 return false;
433 LargestEditDistance = Distance;
434 }
435
436
// The name is filled in lazily: GetName only acts while Name is still empty.
// NOTE(review): the statement that assigns Name is missing in this copy.
437 std::string Name;
438 auto GetName = [&] {
439 if (Name.empty())
441 return Name;
442 };
443
// Locate the insertion point; the visible comparisons order entries by
// Distance and, in one branch, by Name. NOTE(review): the call line and the
// condition choosing between the two comparisons are missing in this copy.
445 Matches, Distance,
448 return a.Name < GetName();
449 return a.Distance < Distance;
450 });
// A candidate that would land past the end of a full list is dropped.
451 if (It == Matches.end() && Matches.size() == MaxMatchesCount)
452 return false;
453
// NOTE(review): the construction of M and the statement that trims the list
// back to MaxMatchesCount are missing in this copy.
455 Matches.insert(It, std::move(M));
456 if (Matches.size() > MaxMatchesCount)
458 return true;
459 };
460
461
462
// Helper that builds a transformed copy of a name character by character.
// NOTE(review): the lambda header and the loop body are missing in this copy,
// so the transformation itself cannot be described from here.
464 std::string Out;
465 Out.reserve(Name.size());
466 for (char C : Name) {
469 }
470 return Out;
471 };
473
474
// Dimensions of the distance table. NOTE(review): the operands of both
// expressions and the initial contents of Distances are missing in this copy.
475 const std::size_t Columns =
477 1;
478
479 [[maybe_unused]] static std::size_t Rows =
481
482 std::vector Distances(
484
// The table is stored flat, one row after another. Cells are of type char,
// so a stored distance is limited to the range of char.
485 auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
486 assert(Column < Columns);
488 return Distances[Row * Columns + Column];
489 };
490
// NOTE(review): the body of this loop is missing in this copy.
491 for (std::size_t I = 0; I < Columns; I++)
493
494
495
496
497
// Recursive visitor. It passes itself as its last argument so the lambda can
// call itself. Row is the next table row to fill for this node.
498 auto VisitNode = [&](const Node &N, std::size_t Row,
499 auto &VisitNode) -> void {
500 std::size_t J = 0;
// One table row is filled per processed character of the node name.
// NOTE(review): the condition that skips some characters is missing in this
// copy of the file.
501 for (; J < N.Name.size(); J++) {
503 continue;
504
505 Get(0, Row) = Row;
506
// Edit-distance recurrence: each cell is the cheapest of a deletion, an
// insertion, or a replacement that is free when the characters agree.
507 for (std::size_t I = 1; I < Columns; I++) {
508 const int Delete = Get(I - 1, Row) + 1;
509 const int Insert = Get(I, Row - 1) + 1;
510
511 const int Replace =
512 Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
513
514 Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
515 }
516
517 Row++;
518 }
519
// The last cell of the most recently filled row is the cost for this node.
// NOTE(review): the statement executed for nodes that carry a value is
// missing in this copy.
520 unsigned Cost = Get(Columns - 1, Row - 1);
521 if (N.Value != 0xFFFFFFFF) {
523 }
524
// Descend into the children, continuing from the row reached by this node.
// NOTE(review): the line that reads child C, and the operands in front of
// `.isValid()` and `.HasSibling`, are missing in this copy.
525 if (N.hasChildren()) {
526 auto ChildOffset = N.ChildrenOffset;
527 for (;;) {
529 ChildOffset += C.Size;
530 if (.isValid())
531 break;
532 VisitNode(C, Row, VisitNode);
533 if (.HasSibling)
534 break;
535 }
536 }
537 };
538
// Start the traversal at row 1. NOTE(review): the line that creates Root is
// missing in this copy.
540 VisitNode(Root, 1, VisitNode);
541 return Matches;
542}
543
544}
545
546}
547}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
ReachingDefInfo InstSet InstSet & Ignore
@ Normalize
Normalize - Normalize according to the given loops.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
void append(StringRef RHS)
Append from a StringRef.
void reserve(size_type N)
iterator insert(iterator I, T &&Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
LLVM Value Representation.
@ C
The default llvm calling convention, compatible with C.
static Node readNode(uint32_t Offset, const Node *Parent=nullptr)
Definition UnicodeNameToCodepoint.cpp:69
const std::size_t UnicodeNameToCodepointLargestNameSize
static bool startsWith(StringRef Name, StringRef Needle, bool Strict, std::size_t &Consummed, char &PreviousCharInName, bool IsPrefix=false)
Definition UnicodeNameToCodepoint.cpp:120
constexpr char32_t SBase
Definition UnicodeNameToCodepoint.cpp:254
LLVM_ABI std::optional< char32_t > nameToCodepointStrict(StringRef Name)
Maps the name or the alias of a Unicode character to its associated codepoints.
Definition UnicodeNameToCodepoint.cpp:400
LLVM_ABI SmallVector< MatchForCodepointName > nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount)
Definition UnicodeNameToCodepoint.cpp:419
constexpr uint32_t LCount
Definition UnicodeNameToCodepoint.cpp:255
const std::size_t UnicodeNameToCodepointIndexSize
static Node createRoot()
Definition UnicodeNameToCodepoint.cpp:61
SmallString< 64 > BufferType
Definition UnicodeNameToCodepoint.cpp:29
constexpr const char *const HangulSyllables[][3]
Definition UnicodeNameToCodepoint.cpp:220
LLVM_ABI std::optional< LooseMatchingResult > nameToCodepointLooseMatching(StringRef Name)
Definition UnicodeNameToCodepoint.cpp:408
static std::optional< char32_t > nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer)
Definition UnicodeNameToCodepoint.cpp:286
const uint8_t * UnicodeNameToCodepointIndex
static const GeneratedNamesData GeneratedNamesDataTable[]
Definition UnicodeNameToCodepoint.cpp:324
static std::size_t findSyllable(StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column)
Definition UnicodeNameToCodepoint.cpp:259
static std::tuple< Node, bool, uint32_t > compareNode(uint32_t Offset, StringRef Name, bool Strict, char PreviousCharInName, BufferType &Buffer, const Node *Parent=nullptr)
Definition UnicodeNameToCodepoint.cpp:179
static std::optional< char32_t > nameToCodepoint(StringRef Name, bool Strict, BufferType &Buffer)
Definition UnicodeNameToCodepoint.cpp:371
constexpr uint32_t VCount
Definition UnicodeNameToCodepoint.cpp:256
static std::optional< char32_t > nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer)
Definition UnicodeNameToCodepoint.cpp:345
const char * UnicodeNameToCodepointDict
constexpr uint32_t TCount
Definition UnicodeNameToCodepoint.cpp:257
This is an optimization pass for GlobalISel generic memory operations.
std::string utohexstr(uint64_t X, bool LowerCase=false, unsigned Width=0)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
detail::ValueMatchesPoly< M > HasValue(M Matcher)
bool isAlnum(char C)
Checks whether character C is either a decimal digit or an uppercase or lowercase letter as classifie...
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
char toUpper(char x)
Returns the corresponding uppercase character if x is lowercase.
FunctionAddr VTableAddr Next
LLVM_ABI bool getAsUnsignedInteger(StringRef Str, unsigned Radix, unsigned long long &Result)
Helper functions for StringRef::getAsInteger.
StringRef Prefix
Definition UnicodeNameToCodepoint.cpp:318
uint32_t Start
Definition UnicodeNameToCodepoint.cpp:319
uint32_t End
Definition UnicodeNameToCodepoint.cpp:320
constexpr bool isValid() const
Definition UnicodeNameToCodepoint.cpp:40
std::string fullName() const
Definition UnicodeNameToCodepoint.cpp:45
StringRef Name
Definition UnicodeNameToCodepoint.cpp:37
char32_t Value
Definition UnicodeNameToCodepoint.cpp:33
uint32_t ChildrenOffset
Definition UnicodeNameToCodepoint.cpp:34
bool IsRoot
Definition UnicodeNameToCodepoint.cpp:32
constexpr bool hasChildren() const
Definition UnicodeNameToCodepoint.cpp:43
uint32_t Size
Definition UnicodeNameToCodepoint.cpp:36
bool HasSibling
Definition UnicodeNameToCodepoint.cpp:35
const Node * Parent
Definition UnicodeNameToCodepoint.cpp:38