ConvertUTF.cpp Source File (original) (raw)

65#ifdef CVTUTF_DEBUG

66#include <stdio.h>

67#endif

68#include <assert.h>

/*
 * ConvertUTF_DISABLE_WARNINGS / ConvertUTF_RESTORE_WARNINGS
 *
 * A push/pop pair that suppresses -Wimplicit-fallthrough for the code placed
 * between the two macros (the switch statements in this file fall through
 * from case to case on purpose).
 *
 *  - Clang: only when __has_warning reports that the warning exists.
 *  - Other compilers defining __GNUC__: unconditionally, via GCC pragmas.
 *  - Anything else (or Clang without the warning): both macros expand to
 *    nothing, so they are always safe to use.
 *
 * Each continuation line must follow its backslash directly; a blank line
 * after a backslash would end the macro definition early.
 */
#if defined(__clang__) && defined(__has_warning)
# if __has_warning("-Wimplicit-fallthrough")
#  define ConvertUTF_DISABLE_WARNINGS \
    _Pragma("clang diagnostic push")  \
    _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#  define ConvertUTF_RESTORE_WARNINGS \
    _Pragma("clang diagnostic pop")
# endif
#elif defined(__GNUC__)
# define ConvertUTF_DISABLE_WARNINGS \
   _Pragma("GCC diagnostic push")    \
   _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
# define ConvertUTF_RESTORE_WARNINGS \
   _Pragma("GCC diagnostic pop")
#endif

/* Fallbacks: make both macros expand to nothing when not defined above. */
#ifndef ConvertUTF_DISABLE_WARNINGS
# define ConvertUTF_DISABLE_WARNINGS
#endif
#ifndef ConvertUTF_RESTORE_WARNINGS
# define ConvertUTF_RESTORE_WARNINGS
#endif

98namespace llvm {

/*
 * Shift count of 10 bits.
 * NOTE(review): the statements that use this constant are not visible in this
 * listing; presumably it is the number of payload bits carried by each UTF-16
 * surrogate half — confirm against the surrogate-pair code.
 */
static const int halfShift = 10;

101

104

/*
 * Inclusive bounds of the two surrogate ranges, typed as UTF32:
 * high surrogates 0xD800..0xDBFF, low surrogates 0xDC00..0xDFFF.
 *
 * Each expansion is fully parenthesised so the cast cannot re-associate with
 * neighbouring operators at the point of use (for example after `sizeof`).
 */
#define UNI_SUR_HIGH_START ((UTF32)0xD800)
#define UNI_SUR_HIGH_END   ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START  ((UTF32)0xDC00)
#define UNI_SUR_LOW_END    ((UTF32)0xDFFF)

109

110

111

112

113

114

115

116

117

118

/*
 * Lookup table indexed by a byte value (0..255). The entry is the number of
 * trailing bytes associated with that byte when it is used as the first byte
 * of a UTF-8 sequence:
 *
 *   0x00..0xBF -> 0      0xC0..0xDF -> 1      0xE0..0xEF -> 2
 *   0xF0..0xF7 -> 3      0xF8..0xFB -> 4      0xFC..0xFF -> 5
 *
 * The table performs no validation: continuation bytes (0x80..0xBF) map to 0,
 * and the 0xF8..0xFF range still yields 4 or 5. Legality has to be checked
 * separately.
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

129

130

131

132

133

134

136 0x03C82080UL, 0xFA082080UL, 0x82082080UL };

137

138

139

140

141

142

143

144

145static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

146

147

148

149

150

151

152

153

154

155

156

157

158

159

161 const UTF32** sourceStart, const UTF32* sourceEnd,

164 const UTF32* source = *sourceStart;

165 UTF16* target = *targetStart;

166 while (source < sourceEnd) {

168 if (target >= targetEnd) {

170 }

171 ch = *source++;

172 if (ch <= UNI_MAX_BMP) {

173

176 --source;

178 break;

179 } else {

181 }

182 } else {

183 *target++ = (UTF16)ch;

184 }

188 } else {

190 }

191 } else {

192

193 if (target + 1 >= targetEnd) {

194 --source;

196 }

200 }

201 }

202 *sourceStart = source;

203 *targetStart = target;

204 return result;

205}

206

207

208

210 const UTF16** sourceStart, const UTF16* sourceEnd,

213 const UTF16* source = *sourceStart;

214 UTF32* target = *targetStart;

216 while (source < sourceEnd) {

217 const UTF16* oldSource = source;

218 ch = *source++;

219

221

222 if (source < sourceEnd) {

223 ch2 = *source;

224

228 ++source;

229 } else if (flags == strictConversion) {

230 --source;

232 break;

233 }

234 } else {

235 --source;

237 break;

238 }

240

242 --source;

244 break;

245 }

246 }

247 if (target >= targetEnd) {

248 source = oldSource;

250 }

251 *target++ = ch;

252 }

253 *sourceStart = source;

254 *targetStart = target;

255#ifdef CVTUTF_DEBUG

257 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);

258 fflush(stderr);

259}

260#endif

261 return result;

262}

264 const UTF16** sourceStart, const UTF16* sourceEnd,

267 const UTF16* source = *sourceStart;

268 UTF8* target = *targetStart;

269 while (source < sourceEnd) {

271 unsigned short bytesToWrite = 0;

272 const UTF32 byteMask = 0xBF;

273 const UTF32 byteMark = 0x80;

274 const UTF16* oldSource = source;

275 ch = *source++;

276

278

279 if (source < sourceEnd) {

280 UTF32 ch2 = *source;

281

285 ++source;

286 } else if (flags == strictConversion) {

287 --source;

289 break;

290 }

291 } else {

292 --source;

294 break;

295 }

297

299 --source;

301 break;

302 }

303 }

304

305 if (ch < (UTF32)0x80) { bytesToWrite = 1;

306 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;

307 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;

308 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;

309 } else { bytesToWrite = 3;

311 }

312

313 target += bytesToWrite;

314 if (target > targetEnd) {

315 source = oldSource;

317 }

318 switch (bytesToWrite) {

319 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

320 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

321 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

323 }

324 target += bytesToWrite;

325 }

326 *sourceStart = source;

327 *targetStart = target;

328 return result;

329}

330

331

332

334 const UTF32** sourceStart, const UTF32* sourceEnd,

337 const UTF32* source = *sourceStart;

338 UTF8* target = *targetStart;

339 while (source < sourceEnd) {

341 unsigned short bytesToWrite = 0;

342 const UTF32 byteMask = 0xBF;

343 const UTF32 byteMark = 0x80;

344 ch = *source++;

346

348 --source;

350 break;

351 }

352 }

353

354

355

356

357 if (ch < (UTF32)0x80) { bytesToWrite = 1;

358 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;

359 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;

361 } else { bytesToWrite = 3;

364 }

365

366 target += bytesToWrite;

367 if (target > targetEnd) {

368 --source;

370 }

371 switch (bytesToWrite) {

372 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

373 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

374 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

376 }

377 target += bytesToWrite;

378 }

379 *sourceStart = source;

380 *targetStart = target;

381 return result;

382}

383

384

385

386

387

388

389

390

391

392

393

394

395

396

399 const UTF8 *srcptr = source+length;

400 switch (length) {

401 default: return false;

402

403 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;

404 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;

405 case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;

406

407 switch (*source) {

408

409 case 0xE0: if (a < 0xA0) return false; break;

410 case 0xED: if (a > 0x9F) return false; break;

411 case 0xF0: if (a < 0x90) return false; break;

412 case 0xF4: if (a > 0x8F) return false; break;

413 default: if (a < 0x80) return false;

414 }

415

416 case 1: if (*source >= 0x80 && *source < 0xC2) return false;

417 }

418 if (*source > 0xF4) return false;

419 return true;

420}

421

422

423

424

425

426

427

430 if (length > sourceEnd - source) {

431 return false;

432 }

434}

435

436

437

438

439

442 return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length

443 : 0;

444}

445

446

447

448static unsigned

450 const UTF8 *sourceEnd) {

451 UTF8 b1, b2, b3;

452

454

455

456

457

458

459

460

461

462

463

464 if (source == sourceEnd)

465 return 0;

466

467

468

469

470

471

472 b1 = *source;

473 ++source;

474 if (b1 >= 0xC2 && b1 <= 0xDF) {

475

476

477

478

479 return 1;

480 }

481

482 if (source == sourceEnd)

483 return 1;

484

485 b2 = *source;

486 ++source;

487

488 if (b1 == 0xE0) {

489 return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;

490 }

491 if (b1 >= 0xE1 && b1 <= 0xEC) {

492 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;

493 }

494 if (b1 == 0xED) {

495 return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;

496 }

497 if (b1 >= 0xEE && b1 <= 0xEF) {

498 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;

499 }

500 if (b1 == 0xF0) {

501 if (b2 >= 0x90 && b2 <= 0xBF) {

502 if (source == sourceEnd)

503 return 2;

504

505 b3 = *source;

506 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;

507 }

508 return 1;

509 }

510 if (b1 >= 0xF1 && b1 <= 0xF3) {

511 if (b2 >= 0x80 && b2 <= 0xBF) {

512 if (source == sourceEnd)

513 return 2;

514

515 b3 = *source;

516 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;

517 }

518 return 1;

519 }

520 if (b1 == 0xF4) {

521 if (b2 >= 0x80 && b2 <= 0x8F) {

522 if (source == sourceEnd)

523 return 2;

524

525 b3 = *source;

526 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;

527 }

528 return 1;

529 }

530

531 assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);

532

533

534

535

536 return 1;

537}

538

539

540

541

542

543

544

548

549

550

551

552

553

554

556 while (*source != sourceEnd) {

558 if (length > sourceEnd - *source || isLegalUTF8 (*source, length))

559 return false;

560 *source += length;

561 }

562 return true;

563}

564

565

566

568 const UTF8** sourceStart, const UTF8* sourceEnd,

571 const UTF8* source = *sourceStart;

572 UTF16* target = *targetStart;

573 while (source < sourceEnd) {

576 if (extraBytesToRead >= sourceEnd - source) {

578 }

579

580 if ( isLegalUTF8 (source, extraBytesToRead+1)) {

582 break;

583 }

584

585

586

587 switch (extraBytesToRead) {

588 case 5: ch += *source++; ch <<= 6;

589 case 4: ch += *source++; ch <<= 6;

590 case 3: ch += *source++; ch <<= 6;

591 case 2: ch += *source++; ch <<= 6;

592 case 1: ch += *source++; ch <<= 6;

593 case 0: ch += *source++;

594 }

596

597 if (target >= targetEnd) {

598 source -= (extraBytesToRead+1);

600 }

601 if (ch <= UNI_MAX_BMP) {

602

605 source -= (extraBytesToRead+1);

607 break;

608 } else {

610 }

611 } else {

612 *target++ = (UTF16)ch;

613 }

617 source -= (extraBytesToRead+1);

618 break;

619 } else {

621 }

622 } else {

623

624 if (target + 1 >= targetEnd) {

625 source -= (extraBytesToRead+1);

627 }

631 }

632 }

633 *sourceStart = source;

634 *targetStart = target;

635 return result;

636}

637

638

639

641 const UTF8** sourceStart, const UTF8* sourceEnd,

643 Boolean InputIsPartial) {

645 const UTF8* source = *sourceStart;

646 UTF32* target = *targetStart;

647 while (source < sourceEnd) {

650 if (extraBytesToRead >= sourceEnd - source) {

653 break;

654 } else {

656

657

658

659

660

662 sourceEnd);

664 continue;

665 }

666 }

667 if (target >= targetEnd) {

669 }

670

671

672 if ( isLegalUTF8 (source, extraBytesToRead+1)) {

675

676 break;

677 } else {

678

679

680

681

683 sourceEnd);

685 continue;

686 }

687 }

688

689

690

691 switch (extraBytesToRead) {

692 case 5: ch += *source++; ch <<= 6;

693 case 4: ch += *source++; ch <<= 6;

694 case 3: ch += *source++; ch <<= 6;

695 case 2: ch += *source++; ch <<= 6;

696 case 1: ch += *source++; ch <<= 6;

697 case 0: ch += *source++;

698 }

700

702

703

704

705

708 source -= (extraBytesToRead+1);

710 break;

711 } else {

713 }

714 } else {

715 *target++ = ch;

716 }

717 } else {

720 }

721 }

722 *sourceStart = source;

723 *targetStart = target;

724 return result;

725}

726

728 const UTF8 *sourceEnd,

729 UTF32 **targetStart,

730 UTF32 *targetEnd,

733 flags, true);

734}

735

737 const UTF8 *sourceEnd, UTF32 **targetStart,

740 flags, false);

741}

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762}

763

assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

#define UNI_SUR_LOW_START

Definition ConvertUTF.cpp:107

#define UNI_SUR_HIGH_START

Definition ConvertUTF.cpp:105

#define ConvertUTF_DISABLE_WARNINGS

Definition ConvertUTF.cpp:90

#define UNI_SUR_LOW_END

Definition ConvertUTF.cpp:108

#define UNI_SUR_HIGH_END

Definition ConvertUTF.cpp:106

#define ConvertUTF_RESTORE_WARNINGS

Definition ConvertUTF.cpp:93

#define UNI_REPLACEMENT_CHAR

#define UNI_MAX_LEGAL_UTF32

This is an optimization pass for GlobalISel generic memory operations.

static ConversionResult ConvertUTF8toUTF32Impl(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags, Boolean InputIsPartial)

Definition ConvertUTF.cpp:640

static const UTF32 offsetsFromUTF8[6]

Definition ConvertUTF.cpp:135

LLVM_ABI ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)

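Convert a UTF8 sequence to UTF32 with the input treated as complete: this wrapper forwards `false` as its final (InputIsPartial) argument, whereas ConvertUTF8toUTF32Partial forwards `true`.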

Definition ConvertUTF.cpp:736

static const int halfShift

Definition ConvertUTF.cpp:100

LLVM_ABI unsigned getNumBytesForUTF8(UTF8 firstByte)

Definition ConvertUTF.cpp:545

static const UTF32 halfBase

Definition ConvertUTF.cpp:102

static Boolean isLegalUTF8(const UTF8 *source, int length)

Definition ConvertUTF.cpp:397

static const char trailingBytesForUTF8[256]

Definition ConvertUTF.cpp:119

LLVM_ABI ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)

Convert a partial UTF8 sequence to UTF32.

Definition ConvertUTF.cpp:727

LLVM_ABI ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)

Definition ConvertUTF.cpp:160

LLVM_ABI Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)

Definition ConvertUTF.cpp:428

LLVM_ABI ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)

Definition ConvertUTF.cpp:263

LLVM_ABI ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)

Definition ConvertUTF.cpp:333

static const UTF32 halfMask

Definition ConvertUTF.cpp:103

LLVM_ABI Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd)

Definition ConvertUTF.cpp:555

static unsigned findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)

Definition ConvertUTF.cpp:449

LLVM_ABI unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd)

Definition ConvertUTF.cpp:440

LLVM_ABI ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)

Definition ConvertUTF.cpp:209

static const UTF8 firstByteMark[7]

Definition ConvertUTF.cpp:145

LLVM_ABI ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)

Definition ConvertUTF.cpp:567