LLVM: lib/Support/ConvertUTF.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
65#ifdef CVTUTF_DEBUG
66#include <stdio.h>
67#endif
68#include <assert.h>
69
70
71
72
73
/*
 * ConvertUTF_DISABLE_WARNINGS / ConvertUTF_RESTORE_WARNINGS: a push/ignore and
 * pop pair that silences -Wimplicit-fallthrough.
 * NOTE(review): presumably meant to bracket the `case N:` ladders later in
 * this file that fall through without `break`; the lines that invoke the
 * macros are not visible in this listing.
 *
 * Selection logic:
 *  - __clang__ with __has_warning available: the clang pragmas are defined
 *    only if __has_warning reports that -Wimplicit-fallthrough exists.
 *    If it does not, nothing is defined in this branch.
 *  - otherwise, any compiler defining __GNUC__: the GCC diagnostic pragmas.
 *  - whatever is still undefined afterwards is defined empty by the two
 *    #ifndef fallbacks, so both macros always exist.
 */
74#if defined(__clang__) && defined(__has_warning)
75# if __has_warning("-Wimplicit-fallthrough")
76# define ConvertUTF_DISABLE_WARNINGS \
77 _Pragma("clang diagnostic push") \
78 _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
79# define ConvertUTF_RESTORE_WARNINGS \
80 _Pragma("clang diagnostic pop")
81# endif
82#elif defined(__GNUC__)
83# define ConvertUTF_DISABLE_WARNINGS \
84 _Pragma("GCC diagnostic push") \
85 _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
86# define ConvertUTF_RESTORE_WARNINGS \
87 _Pragma("GCC diagnostic pop")
88#endif
/* Fallbacks: guarantee both macros expand to nothing when not set above. */
89#ifndef ConvertUTF_DISABLE_WARNINGS
90# define ConvertUTF_DISABLE_WARNINGS
91#endif
92#ifndef ConvertUTF_RESTORE_WARNINGS
93# define ConvertUTF_RESTORE_WARNINGS
94#endif
95
97
98namespace llvm {
99
100static const int halfShift = 10;
101
104
/*
 * UTF-16 surrogate code-unit ranges, each carrying a (UTF32) cast:
 * high surrogates span 0xD800..0xDBFF and low surrogates span 0xDC00..0xDFFF
 * (both bounds inclusive, as the START/END naming indicates).
 */
105#define UNI_SUR_HIGH_START (UTF32)0xD800
106#define UNI_SUR_HIGH_END (UTF32)0xDBFF
107#define UNI_SUR_LOW_START (UTF32)0xDC00
108#define UNI_SUR_LOW_END (UTF32)0xDFFF
109
110
111
112
113
114
115
116
117
118
120 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
121 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
122 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
123 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
124 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
125 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
126 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
127 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
128};
129
130
131
132
133
134
136 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
137
138
139
140
141
142
143
144
/*
 * Lead-byte marker bits for an encoded UTF-8 sequence. Entries 2..6 are the
 * high-bit prefixes 110xxxxx (0xC0), 1110xxxx (0xE0), 11110xxx (0xF0),
 * 111110xx (0xF8) and 1111110x (0xFC); entries 0 and 1 are 0x00.
 * NOTE(review): presumably indexed by the total byte count of the sequence
 * (cf. bytesToWrite in the UTF-8 encoders) and OR-ed into the first byte.
 * The statements that read this table are not visible in this listing; confirm.
 */
145static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
146
147
148
149
150
151
152
153
154
155
156
157
158
159
161 const UTF32** sourceStart, const UTF32* sourceEnd,
164 const UTF32* source = *sourceStart;
165 UTF16* target = *targetStart;
166 while (source < sourceEnd) {
168 if (target >= targetEnd) {
170 }
171 ch = *source++;
172 if (ch <= UNI_MAX_BMP) {
173
176 --source;
178 break;
179 } else {
181 }
182 } else {
183 *target++ = (UTF16)ch;
184 }
188 } else {
190 }
191 } else {
192
193 if (target + 1 >= targetEnd) {
194 --source;
196 }
200 }
201 }
202 *sourceStart = source;
203 *targetStart = target;
204 return result;
205}
206
207
208
210 const UTF16** sourceStart, const UTF16* sourceEnd,
213 const UTF16* source = *sourceStart;
214 UTF32* target = *targetStart;
216 while (source < sourceEnd) {
217 const UTF16* oldSource = source;
218 ch = *source++;
219
221
222 if (source < sourceEnd) {
223 ch2 = *source;
224
228 ++source;
229 } else if (flags == strictConversion) {
230 --source;
232 break;
233 }
234 } else {
235 --source;
237 break;
238 }
240
242 --source;
244 break;
245 }
246 }
247 if (target >= targetEnd) {
248 source = oldSource;
250 }
251 *target++ = ch;
252 }
253 *sourceStart = source;
254 *targetStart = target;
255#ifdef CVTUTF_DEBUG
257 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
258 fflush(stderr);
259}
260#endif
261 return result;
262}
264 const UTF16** sourceStart, const UTF16* sourceEnd,
267 const UTF16* source = *sourceStart;
268 UTF8* target = *targetStart;
269 while (source < sourceEnd) {
271 unsigned short bytesToWrite = 0;
272 const UTF32 byteMask = 0xBF;
273 const UTF32 byteMark = 0x80;
274 const UTF16* oldSource = source;
275 ch = *source++;
276
278
279 if (source < sourceEnd) {
280 UTF32 ch2 = *source;
281
285 ++source;
286 } else if (flags == strictConversion) {
287 --source;
289 break;
290 }
291 } else {
292 --source;
294 break;
295 }
297
299 --source;
301 break;
302 }
303 }
304
305 if (ch < (UTF32)0x80) { bytesToWrite = 1;
306 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
307 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
308 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
309 } else { bytesToWrite = 3;
311 }
312
313 target += bytesToWrite;
314 if (target > targetEnd) {
315 source = oldSource;
317 }
318 switch (bytesToWrite) {
319 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
320 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
321 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
323 }
324 target += bytesToWrite;
325 }
326 *sourceStart = source;
327 *targetStart = target;
328 return result;
329}
330
331
332
334 const UTF32** sourceStart, const UTF32* sourceEnd,
337 const UTF32* source = *sourceStart;
338 UTF8* target = *targetStart;
339 while (source < sourceEnd) {
341 unsigned short bytesToWrite = 0;
342 const UTF32 byteMask = 0xBF;
343 const UTF32 byteMark = 0x80;
344 ch = *source++;
346
348 --source;
350 break;
351 }
352 }
353
354
355
356
357 if (ch < (UTF32)0x80) { bytesToWrite = 1;
358 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
359 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
361 } else { bytesToWrite = 3;
364 }
365
366 target += bytesToWrite;
367 if (target > targetEnd) {
368 --source;
370 }
371 switch (bytesToWrite) {
372 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
373 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
374 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
376 }
377 target += bytesToWrite;
378 }
379 *sourceStart = source;
380 *targetStart = target;
381 return result;
382}
383
384
385
386
387
388
389
390
391
392
393
394
395
396
/*
 * Body of the single-sequence UTF-8 legality check. The symbol index at the
 * end of this listing gives the signature as
 *   static Boolean isLegalUTF8(const UTF8 *source, int length)
 * at original line 397; that line and the declaration of `a` are missing here.
 *
 * Decides whether the `length` bytes starting at `source` form one legal
 * UTF-8 sequence. Returns false for any length outside 1..4, for a bad
 * trailing byte, for a second byte outside the range its lead byte allows,
 * and for an illegal lead byte; returns true otherwise.
 * The function does not check that `length` matches what the lead byte
 * implies. NOTE(review): presumably the caller derives `length` from the lead
 * byte; confirm.
 */
/* Start one past the last byte: the bytes are validated back to front. */
399 const UTF8 *srcptr = source+length;
400 switch (length) {
/* Anything other than 1..4 bytes is rejected outright. */
401 default: return false;
402
/* Cases 4 -> 3 -> 2 fall through on purpose: each step checks one trailing
 * byte, which must lie in 0x80..0xBF. */
403 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
404 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
/* After this line srcptr == source + 1, so `a` holds the second byte. */
405 case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
406
/* Narrow the allowed second-byte range for the lead bytes that need it. */
407 switch (*source) {
408
/* 0xE0: second byte must be >= 0xA0 (rules out overlong 3-byte forms). */
409 case 0xE0: if (a < 0xA0) return false; break;
/* 0xED: second byte must be <= 0x9F (rules out U+D800..U+DFFF surrogates). */
410 case 0xED: if (a > 0x9F) return false; break;
/* 0xF0: second byte must be >= 0x90 (rules out overlong 4-byte forms). */
411 case 0xF0: if (a < 0x90) return false; break;
/* 0xF4: second byte must be <= 0x8F (rules out values above U+10FFFF). */
412 case 0xF4: if (a > 0x8F) return false; break;
/* Other lead bytes: `a < 0x80` was already rejected by case 2 above, so this
 * test cannot fire here; no `break`, control falls into case 1. */
413 default: if (a < 0x80) return false;
414 }
415
/* Lead-byte check, reached for every accepted length: 0x80..0xC1 can never
 * start a sequence (continuation bytes, plus 0xC0/0xC1 overlong leads). */
416 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
417 }
/* Lead bytes above 0xF4 would encode values beyond U+10FFFF. */
418 if (*source > 0xF4) return false;
419 return true;
420}
421
422
423
424
425
426
427
430 if (length > sourceEnd - source) {
431 return false;
432 }
434}
435
436
437
438
439
442 return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
443 : 0;
444}
445
446
447
/*
 * Per the symbol index at the end of this listing, this is
 *   static unsigned findMaximalSubpartOfIllFormedUTF8Sequence(
 *       const UTF8 *source, const UTF8 *sourceEnd)
 * (the line carrying the name and first parameter is missing from the listing).
 *
 * Returns how many leading bytes of [source, sourceEnd) form the longest
 * prefix that could still begin a well-formed UTF-8 sequence:
 *   0  if the range is empty,
 *   1  if only the first byte qualifies (or the first byte itself is invalid),
 *   2  if lead byte + second byte are compatible,
 *   3  if a 4-byte lead is followed by two compatible bytes.
 * The per-lead-byte ranges below mirror the well-formed UTF-8 byte sequence
 * table of the Unicode Standard.
 * NOTE(review): the logic presupposes the sequence at `source` is already
 * known to be ill-formed (e.g. a valid 2-byte lead returns 1 without looking
 * at the second byte); confirm against the callers.
 */
448static unsigned
450 const UTF8 *sourceEnd) {
451 UTF8 b1, b2, b3;
452
454
455
456
457
458
459
460
461
462
463
/* Empty input: there is no subpart at all. */
464 if (source == sourceEnd)
465 return 0;
466
467
468
469
470
471
/* Consume the lead byte. */
472 b1 = *source;
473 ++source;
/* 2-byte lead (0xC2..0xDF): returns 1 without inspecting the next byte. */
474 if (b1 >= 0xC2 && b1 <= 0xDF) {
475
476
477
478
479 return 1;
480 }
481
/* Nothing follows the lead byte, so the subpart is that single byte. */
482 if (source == sourceEnd)
483 return 1;
484
/* Consume the second byte. */
485 b2 = *source;
486 ++source;
487
/* 3-byte leads: 2 if the second byte is in the range allowed for this lead,
 * otherwise 1. The third byte is never examined. */
488 if (b1 == 0xE0) {
489 return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
490 }
491 if (b1 >= 0xE1 && b1 <= 0xEC) {
492 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
493 }
494 if (b1 == 0xED) {
495 return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
496 }
497 if (b1 >= 0xEE && b1 <= 0xEF) {
498 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
499 }
/* 4-byte leads: a compatible second byte gives at least 2; a third byte in
 * 0x80..0xBF extends that to 3. An incompatible second byte gives 1. */
500 if (b1 == 0xF0) {
501 if (b2 >= 0x90 && b2 <= 0xBF) {
502 if (source == sourceEnd)
503 return 2;
504
505 b3 = *source;
506 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
507 }
508 return 1;
509 }
510 if (b1 >= 0xF1 && b1 <= 0xF3) {
511 if (b2 >= 0x80 && b2 <= 0xBF) {
512 if (source == sourceEnd)
513 return 2;
514
515 b3 = *source;
516 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
517 }
518 return 1;
519 }
520 if (b1 == 0xF4) {
521 if (b2 >= 0x80 && b2 <= 0x8F) {
522 if (source == sourceEnd)
523 return 2;
524
525 b3 = *source;
526 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
527 }
528 return 1;
529 }
530
/* Only bytes that can never start a sequence remain (0x80..0xC1 or >= 0xF5).
 * An ASCII lead byte (< 0x80) would trip this assert; with assertions
 * compiled out it would fall through and return 1. */
531 assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
532
533
534
535
/* An invalid lead byte is a subpart of length 1 on its own. */
536 return 1;
537}
538
539
540
541
542
543
544
548
549
550
551
552
553
554
556 while (*source != sourceEnd) {
558 if (length > sourceEnd - *source || (*source, length))
559 return false;
560 *source += length;
561 }
562 return true;
563}
564
565
566
568 const UTF8** sourceStart, const UTF8* sourceEnd,
571 const UTF8* source = *sourceStart;
572 UTF16* target = *targetStart;
573 while (source < sourceEnd) {
576 if (extraBytesToRead >= sourceEnd - source) {
578 }
579
580 if ((source, extraBytesToRead+1)) {
582 break;
583 }
584
585
586
587 switch (extraBytesToRead) {
588 case 5: ch += *source++; ch <<= 6;
589 case 4: ch += *source++; ch <<= 6;
590 case 3: ch += *source++; ch <<= 6;
591 case 2: ch += *source++; ch <<= 6;
592 case 1: ch += *source++; ch <<= 6;
593 case 0: ch += *source++;
594 }
596
597 if (target >= targetEnd) {
598 source -= (extraBytesToRead+1);
600 }
601 if (ch <= UNI_MAX_BMP) {
602
605 source -= (extraBytesToRead+1);
607 break;
608 } else {
610 }
611 } else {
612 *target++ = (UTF16)ch;
613 }
617 source -= (extraBytesToRead+1);
618 break;
619 } else {
621 }
622 } else {
623
624 if (target + 1 >= targetEnd) {
625 source -= (extraBytesToRead+1);
627 }
631 }
632 }
633 *sourceStart = source;
634 *targetStart = target;
635 return result;
636}
637
638
639
641 const UTF8** sourceStart, const UTF8* sourceEnd,
643 Boolean InputIsPartial) {
645 const UTF8* source = *sourceStart;
646 UTF32* target = *targetStart;
647 while (source < sourceEnd) {
650 if (extraBytesToRead >= sourceEnd - source) {
653 break;
654 } else {
656
657
658
659
660
662 sourceEnd);
664 continue;
665 }
666 }
667 if (target >= targetEnd) {
669 }
670
671
672 if ((source, extraBytesToRead+1)) {
675
676 break;
677 } else {
678
679
680
681
683 sourceEnd);
685 continue;
686 }
687 }
688
689
690
691 switch (extraBytesToRead) {
692 case 5: ch += *source++; ch <<= 6;
693 case 4: ch += *source++; ch <<= 6;
694 case 3: ch += *source++; ch <<= 6;
695 case 2: ch += *source++; ch <<= 6;
696 case 1: ch += *source++; ch <<= 6;
697 case 0: ch += *source++;
698 }
700
702
703
704
705
708 source -= (extraBytesToRead+1);
710 break;
711 } else {
713 }
714 } else {
715 *target++ = ch;
716 }
717 } else {
720 }
721 }
722 *sourceStart = source;
723 *targetStart = target;
724 return result;
725}
726
728 const UTF8 *sourceEnd,
729 UTF32 **targetStart,
730 UTF32 *targetEnd,
733 flags, true);
734}
735
737 const UTF8 *sourceEnd, UTF32 **targetStart,
740 flags, false);
741}
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762}
763
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
#define UNI_SUR_LOW_START
Definition ConvertUTF.cpp:107
#define UNI_SUR_HIGH_START
Definition ConvertUTF.cpp:105
#define ConvertUTF_DISABLE_WARNINGS
Definition ConvertUTF.cpp:90
#define UNI_SUR_LOW_END
Definition ConvertUTF.cpp:108
#define UNI_SUR_HIGH_END
Definition ConvertUTF.cpp:106
#define ConvertUTF_RESTORE_WARNINGS
Definition ConvertUTF.cpp:93
#define UNI_REPLACEMENT_CHAR
#define UNI_MAX_LEGAL_UTF32
This is an optimization pass for GlobalISel generic memory operations.
static ConversionResult ConvertUTF8toUTF32Impl(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags, Boolean InputIsPartial)
Definition ConvertUTF.cpp:640
static const UTF32 offsetsFromUTF8[6]
Definition ConvertUTF.cpp:135
LLVM_ABI ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a complete (non-partial) UTF8 sequence to UTF32; forwards to ConvertUTF8toUTF32Impl with InputIsPartial = false.
Definition ConvertUTF.cpp:736
static const int halfShift
Definition ConvertUTF.cpp:100
LLVM_ABI unsigned getNumBytesForUTF8(UTF8 firstByte)
Definition ConvertUTF.cpp:545
static const UTF32 halfBase
Definition ConvertUTF.cpp:102
static Boolean isLegalUTF8(const UTF8 *source, int length)
Definition ConvertUTF.cpp:397
static const char trailingBytesForUTF8[256]
Definition ConvertUTF.cpp:119
LLVM_ABI ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
Definition ConvertUTF.cpp:727
LLVM_ABI ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
Definition ConvertUTF.cpp:160
LLVM_ABI Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
Definition ConvertUTF.cpp:428
LLVM_ABI ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Definition ConvertUTF.cpp:263
LLVM_ABI ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Definition ConvertUTF.cpp:333
static const UTF32 halfMask
Definition ConvertUTF.cpp:103
LLVM_ABI Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd)
Definition ConvertUTF.cpp:555
static unsigned findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
Definition ConvertUTF.cpp:449
LLVM_ABI unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd)
Definition ConvertUTF.cpp:440
LLVM_ABI ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Definition ConvertUTF.cpp:209
static const UTF8 firstByteMark[7]
Definition ConvertUTF.cpp:145
LLVM_ABI ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
Definition ConvertUTF.cpp:567