LLVM: lib/Support/UnicodeNameToCodepoint.cpp Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

19

20namespace llvm {

21namespace sys {

23

28

30

33 char32_t Value = 0xFFFFFFFF;

39

41 return Name.empty() || Value == 0xFFFFFFFF;

42 }

44

46 std::string S;

47

48

49

50 S.reserve(46);

51 const Node *N = this;

52 while (N) {

53 std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));

54 N = N->Parent;

55 }

56 std::reverse(S.begin(), S.end());

57 return S;

58 }

59};

60

63 N.IsRoot = true;

64 N.ChildrenOffset = 1;

65 N.Size = 1;

66 return N;

67}

68

72

75 N.Parent = Parent;

78 return N;

79

80 bool LongName = NameInfo & 0x40;

81 bool HasValue = NameInfo & 0x80;

82 std::size_t Size = NameInfo & ~0xC0;

83 if (LongName) {

87 } else {

89 }

94 N.Value = ((H << 16) | (M << 8) | L) >> 3;

95

96 bool HasChildren = L & 0x02;

97 N.HasSibling = L & 0x01;

98

99 if (HasChildren) {

103 }

104 } else {

106 N.HasSibling = H & 0x80;

107 bool HasChildren = H & 0x40;

109 if (HasChildren) {

110 N.ChildrenOffset = (H << 16);

111 N.ChildrenOffset |=

114 }

115 }

116 N.Size = Offset - Origin;

117 return N;

118}

119

121 std::size_t &Consummed, char &PreviousCharInName,

122 bool IsPrefix = false) {

123

124 Consummed = 0;

125 if (Strict) {

126 if (!Name.starts_with(Needle))

127 return false;

128 Consummed = Needle.size();

129 return true;

130 }

131 if (Needle.empty())

132 return true;

133

134 auto NamePos = Name.begin();

135 auto NeedlePos = Needle.begin();

136

137 char PreviousCharInNameOrigin = PreviousCharInName;

138 char PreviousCharInNeedle = *Needle.begin();

139 auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,

140 bool IsPrefix = false) {

141 while (It != End) {

142 const auto Next = std::next(It);

143

144

145

147 *It == ' ' || *It == '_' ||

148 (*It == '-' && isAlnum(PreviousChar) &&

150 PreviousChar = *It;

152 break;

153 ++It;

154 }

155 return It;

156 };

157

158 while (true) {

159 NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);

160 NeedlePos =

161 IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);

162 if (NeedlePos == Needle.end())

163 break;

164 if (NamePos == Name.end())

165 break;

167 break;

168 NeedlePos++;

169 NamePos++;

170 }

171 Consummed = std::distance(Name.begin(), NamePos);

172 if (NeedlePos != Needle.end()) {

173 PreviousCharInName = PreviousCharInNameOrigin;

174 }

175 return NeedlePos == Needle.end();

176}

177

178static std::tuple<Node, bool, uint32_t>

180 char PreviousCharInName, BufferType &Buffer,

181 const Node *Parent = nullptr) {

183 std::size_t Consummed = 0;

184 bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,

185 PreviousCharInName);

186 if (!DoesStartWith)

187 return std::make_tuple(N, false, 0);

188

189 if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)

190 return std::make_tuple(N, true, N.Value);

191

192 if (N.hasChildren()) {

193 uint32_t ChildOffset = N.ChildrenOffset;

194 for (;;) {

196 bool Matches;

198 std::tie(C, Matches, Value) =

199 compareNode(ChildOffset, Name.substr(Consummed), Strict,

200 PreviousCharInName, Buffer, &N);

201 if (Matches) {

202 std::reverse_copy(C.Name.begin(), C.Name.end(),

203 std::back_inserter(Buffer));

204 return std::make_tuple(N, true, Value);

205 }

206 ChildOffset += C.Size;

207 if (C.HasSibling)

208 break;

209 }

210 }

211 return std::make_tuple(N, false, 0);

212}

213

214static std::tuple<Node, bool, uint32_t>

218

219

// Name fragments used to spell Hangul syllable names. Each row holds one
// entry per column; column 0, 1 and 2 are the ones searched for the L, V and
// T parts of a syllable respectively. The columns have different lengths
// (19, 21 and 28 entries), so the shorter ones are padded with nullptr.
// An empty string is a real entry that contributes nothing to the name.
constexpr const char *const HangulSyllables[][3] = {
    {"G", "A", ""},
    {"GG", "AE", "G"},
    {"N", "YA", "GG"},
    {"D", "YAE", "GS"},
    {"DD", "EO", "N"},
    {"R", "E", "NJ"},
    {"M", "YEO", "NH"},
    {"B", "YE", "D"},
    {"BB", "O", "L"},
    {"S", "WA", "LG"},
    {"SS", "WAE", "LM"},
    {"", "OE", "LB"},
    {"J", "YO", "LS"},
    {"JJ", "U", "LT"},
    {"C", "WEO", "LP"},
    {"K", "WE", "LH"},
    {"T", "WI", "M"},
    {"P", "YU", "B"},
    {"H", "EU", "BS"},
    {nullptr, "YI", "S"},
    {nullptr, "I", "SS"},
    {nullptr, nullptr, "NG"},
    {nullptr, nullptr, "J"},
    {nullptr, nullptr, "C"},
    {nullptr, nullptr, "K"},
    {nullptr, nullptr, "T"},
    {nullptr, nullptr, "P"},
    {nullptr, nullptr, "H"},
};

250

251

252

253

254constexpr char32_t SBase = 0xAC00;

258

260 char &PreviousInName, int &Pos, int Column) {

261 assert(Column == 0 || Column == 1 || Column == 2);

263 int Len = -1;

264 int Prev = PreviousInName;

265 for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {

267 if (int(Syllable.size()) <= Len)

268 continue;

269 std::size_t Consummed = 0;

270 char PreviousInNameCopy = PreviousInName;

271 bool DoesStartWith =

272 startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);

273 if (!DoesStartWith)

274 continue;

275 Len = Consummed;

276 Pos = I;

277 Prev = PreviousInNameCopy;

278 }

279 if (Len == -1)

280 return 0;

281 PreviousInName = Prev;

282 return size_t(Len);

283}

284

285static std::optional<char32_t>

288

289 std::size_t Consummed = 0;

290 char NameStart = 0;

291 bool DoesStartWith =

292 startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);

293 if (!DoesStartWith)

294 return std::nullopt;

295 Name = Name.substr(Consummed);

296 int L = -1, V = -1, T = -1;

297 Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));

298 Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));

299 Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));

300 if (L != -1 && V != -1 && T != -1 && Name.empty()) {

301 if (!Strict) {

302 Buffer.append("HANGUL SYLLABLE ");

303 if (L != -1)

305 if (V != -1)

307 if (T != -1)

309 }

310 return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +

311 std::uint32_t(T);

312 }

313

314 return std::nullopt;

315}

316

322

323

325 {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},

326 {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},

327 {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},

328 {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},

329 {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},

330 {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},

331 {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},

332 {"CJK UNIFIED IDEOGRAPH-", 0x2EBF0, 0x2EE5D},

333 {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},

334 {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},

335 {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},

336 {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},

337 {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},

338 {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},

339 {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},

340 {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},

341 {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},

342};

343

344static std::optional<char32_t>

348 std::size_t Consummed = 0;

349 char NameStart = 0;

350 bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,

351 NameStart, true);

352 if (!DoesStartWith)

353 continue;

354 auto Number = Name.substr(Consummed);

355 unsigned long long V = 0;

356

357 if (Strict &&

359 return {};

361 continue;

362 if (!Strict) {

363 Buffer.append(Item.Prefix);

365 }

366 return V;

367 }

368 return std::nullopt;

369}

370

373 if (Name.empty())

374 return std::nullopt;

375

377 if (!Res)

379 if (Res)

380 return *Res;

381

384 bool Matches;

387 if (Matches) {

388 std::reverse(Buffer.begin(), Buffer.end());

389

390

391 if (!Strict && Value == 0x116c && Name.contains_insensitive("O-E")) {

392 Buffer = "HANGUL JUNGSEONG O-E";

394 }

396 }

397 return std::nullopt;

398}

399

406

407std::optional

411 if (!Opt)

412 return std::nullopt;

414}

415

416

417

420

421

422

423

424 std::size_t LargestEditDistance = 0;

426 Matches.reserve(MaxMatchesCount + 1);

427

429 char32_t Value) -> bool {

430 if (Distance > LargestEditDistance) {

431 if (Matches.size() == MaxMatchesCount)

432 return false;

433 LargestEditDistance = Distance;

434 }

435

436

437 std::string Name;

438 auto GetName = [&] {

439 if (Name.empty())

441 return Name;

442 };

443

445 Matches, Distance,

448 return a.Name < GetName();

449 return a.Distance < Distance;

450 });

451 if (It == Matches.end() && Matches.size() == MaxMatchesCount)

452 return false;

453

455 Matches.insert(It, std::move(M));

456 if (Matches.size() > MaxMatchesCount)

458 return true;

459 };

460

461

462

464 std::string Out;

465 Out.reserve(Name.size());

466 for (char C : Name) {

469 }

470 return Out;

471 };

473

474

475 const std::size_t Columns =

477 1;

478

479 [[maybe_unused]] static std::size_t Rows =

481

482 std::vector Distances(

484

485 auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {

486 assert(Column < Columns);

488 return Distances[Row * Columns + Column];

489 };

490

491 for (std::size_t I = 0; I < Columns; I++)

492 Get(I, 0) = I;

493

494

495

496

497

498 auto VisitNode = [&](const Node &N, std::size_t Row,

499 auto &VisitNode) -> void {

500 std::size_t J = 0;

501 for (; J < N.Name.size(); J++) {

503 continue;

504

505 Get(0, Row) = Row;

506

507 for (std::size_t I = 1; I < Columns; I++) {

508 const int Delete = Get(I - 1, Row) + 1;

509 const int Insert = Get(I, Row - 1) + 1;

510

511 const int Replace =

512 Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);

513

514 Get(I, Row) = std::min(Insert, std::min(Delete, Replace));

515 }

516

517 Row++;

518 }

519

520 unsigned Cost = Get(Columns - 1, Row - 1);

521 if (N.Value != 0xFFFFFFFF) {

522 Insert(N, Cost, N.Value);

523 }

524

525 if (N.hasChildren()) {

526 auto ChildOffset = N.ChildrenOffset;

527 for (;;) {

529 ChildOffset += C.Size;

530 if (C.isValid())

531 break;

532 VisitNode(C, Row, VisitNode);

533 if (C.HasSibling)

534 break;

535 }

536 }

537 };

538

540 VisitNode(Root, 1, VisitNode);

541 return Matches;

542}

543

544}

545

546}

547}

assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

ReachingDefInfo InstSet InstSet & Ignore

@ Normalize

Normalize - Normalize according to the given loops.

SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).

void append(StringRef RHS)

Append from a StringRef.

void reserve(size_type N)

iterator insert(iterator I, T &&Elt)

This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.

StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.

constexpr bool empty() const

empty - Check if the string is empty.

constexpr size_t size() const

size - Get the string size.

LLVM Value Representation.

@ C

The default llvm calling convention, compatible with C.

static Node readNode(uint32_t Offset, const Node *Parent=nullptr)

Definition UnicodeNameToCodepoint.cpp:69

const std::size_t UnicodeNameToCodepointLargestNameSize

static bool startsWith(StringRef Name, StringRef Needle, bool Strict, std::size_t &Consummed, char &PreviousCharInName, bool IsPrefix=false)

Definition UnicodeNameToCodepoint.cpp:120

constexpr char32_t SBase

Definition UnicodeNameToCodepoint.cpp:254

LLVM_ABI std::optional< char32_t > nameToCodepointStrict(StringRef Name)

Maps the name or the alias of a Unicode character to its associated codepoints.

Definition UnicodeNameToCodepoint.cpp:400

LLVM_ABI SmallVector< MatchForCodepointName > nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount)

Definition UnicodeNameToCodepoint.cpp:419

constexpr uint32_t LCount

Definition UnicodeNameToCodepoint.cpp:255

const std::size_t UnicodeNameToCodepointIndexSize

static Node createRoot()

Definition UnicodeNameToCodepoint.cpp:61

SmallString< 64 > BufferType

Definition UnicodeNameToCodepoint.cpp:29

constexpr const char *const HangulSyllables[][3]

Definition UnicodeNameToCodepoint.cpp:220

LLVM_ABI std::optional< LooseMatchingResult > nameToCodepointLooseMatching(StringRef Name)

Definition UnicodeNameToCodepoint.cpp:408

static std::optional< char32_t > nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer)

Definition UnicodeNameToCodepoint.cpp:286

const uint8_t * UnicodeNameToCodepointIndex

static const GeneratedNamesData GeneratedNamesDataTable[]

Definition UnicodeNameToCodepoint.cpp:324

static std::size_t findSyllable(StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column)

Definition UnicodeNameToCodepoint.cpp:259

static std::tuple< Node, bool, uint32_t > compareNode(uint32_t Offset, StringRef Name, bool Strict, char PreviousCharInName, BufferType &Buffer, const Node *Parent=nullptr)

Definition UnicodeNameToCodepoint.cpp:179

static std::optional< char32_t > nameToCodepoint(StringRef Name, bool Strict, BufferType &Buffer)

Definition UnicodeNameToCodepoint.cpp:371

constexpr uint32_t VCount

Definition UnicodeNameToCodepoint.cpp:256

static std::optional< char32_t > nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer)

Definition UnicodeNameToCodepoint.cpp:345

const char * UnicodeNameToCodepointDict

constexpr uint32_t TCount

Definition UnicodeNameToCodepoint.cpp:257

This is an optimization pass for GlobalISel generic memory operations.

std::string utohexstr(uint64_t X, bool LowerCase=false, unsigned Width=0)

bool any_of(R &&range, UnaryPredicate P)

Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.

detail::ValueMatchesPoly< M > HasValue(M Matcher)

bool isAlnum(char C)

Checks whether character C is either a decimal digit or an uppercase or lowercase letter as classified by "C" locale.

auto lower_bound(R &&Range, T &&Value)

Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...

char toUpper(char x)

Returns the corresponding uppercase character if x is lowercase.

FunctionAddr VTableAddr Next

LLVM_ABI bool getAsUnsignedInteger(StringRef Str, unsigned Radix, unsigned long long &Result)

Helper functions for StringRef::getAsInteger.

StringRef Prefix

Definition UnicodeNameToCodepoint.cpp:318

uint32_t Start

Definition UnicodeNameToCodepoint.cpp:319

uint32_t End

Definition UnicodeNameToCodepoint.cpp:320

constexpr bool isValid() const

Definition UnicodeNameToCodepoint.cpp:40

std::string fullName() const

Definition UnicodeNameToCodepoint.cpp:45

StringRef Name

Definition UnicodeNameToCodepoint.cpp:37

char32_t Value

Definition UnicodeNameToCodepoint.cpp:33

uint32_t ChildrenOffset

Definition UnicodeNameToCodepoint.cpp:34

bool IsRoot

Definition UnicodeNameToCodepoint.cpp:32

constexpr bool hasChildren() const

Definition UnicodeNameToCodepoint.cpp:43

uint32_t Size

Definition UnicodeNameToCodepoint.cpp:36

bool HasSibling

Definition UnicodeNameToCodepoint.cpp:35

const Node * Parent

Definition UnicodeNameToCodepoint.cpp:38