PostgreSQL Source Code: contrib/fuzzystrmatch/dmetaphone.c Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97#ifndef DMETAPHONE_MAIN

98

100

102

103

104#define NDEBUG

105

106#else

107

108

109#include <stdio.h>

110#include <stdlib.h>

112#include <stdarg.h>

113

114#endif

115

116#include <assert.h>

117#include <ctype.h>

118

119

121

122#ifndef DMETAPHONE_MAIN

123

124

125

126

127

129

132{

134 char *aptr,

135 *codes[2],

136 *code;

137

138#ifdef DMETAPHONE_NOSTRICT

141#endif

144

146 code = codes[0];

147 if (!code)

148 code = "";

149

151}

152

153

154

155

156

158

161{

163 char *aptr,

164 *codes[2],

165 *code;

166

167#ifdef DMETAPHONE_NOSTRICT

170#endif

173

175 code = codes[1];

176 if (!code)

177 code = "";

178

180}

181

182

183

184

185

186

/*
 * META_MALLOC(v, n, t)
 *
 * Allocate room for n objects of type t with palloc() and assign the
 * resulting pointer (cast to t *) to v.  The macro is an expression whose
 * value is the assigned pointer.  This is the definition used when
 * DMETAPHONE_MAIN is not defined.
 */
#define META_MALLOC(v,n,t) \
	((v) = (t *) palloc((n) * sizeof(t)))

189

/*
 * META_REALLOC(v, n, t)
 *
 * Resize the block currently referenced by v so that it can hold n objects
 * of type t, using repalloc(), and store the (possibly moved) pointer back
 * into v.  v appears twice in the expansion, so it must be an lvalue
 * expression without side effects.
 */
#define META_REALLOC(v,n,t) \
	((v) = (t *) repalloc((v), (n) * sizeof(t)))

192

193

194

195

196

197

198

199

/*
 * META_FREE(x)
 *
 * In this branch (DMETAPHONE_MAIN not defined) releasing is deliberately a
 * no-op: the argument is discarded by the preprocessor, so it is neither
 * evaluated nor freed.
 * NOTE(review): presumably the memory is reclaimed by the surrounding
 * palloc allocation context -- confirm before relying on it.
 */
#define META_FREE(x) ((void) true)

201#else

202

203

204

/*
 * META_MALLOC(v, n, t)
 *
 * Standalone (DMETAPHONE_MAIN) variant: allocate room for n objects of
 * type t with malloc() and assign the resulting pointer (cast to t *) to v.
 * The macro is an expression whose value is the assigned pointer, which is
 * NULL when malloc() fails -- callers must check it.
 */
#define META_MALLOC(v,n,t) \
	((v) = (t *) malloc((n) * sizeof(t)))

207

/*
 * META_REALLOC(v, n, t)
 *
 * Standalone (DMETAPHONE_MAIN) variant: resize the block referenced by v to
 * hold n objects of type t using realloc() and store the result back in v.
 *
 * v appears twice in the expansion, so it must be an lvalue expression
 * without side effects.  Because the result is assigned straight back to v,
 * a failed realloc() (NULL return) overwrites the only reference to the old
 * block; callers must test v afterwards and treat NULL as fatal.
 */
#define META_REALLOC(v,n,t) \
	((v) = (t *) realloc((v), (n) * sizeof(t)))

210

/*
 * META_FREE(x)
 *
 * Standalone (DMETAPHONE_MAIN) variant: hand x back to the C allocator with
 * free().  Passing NULL is harmless, as free(NULL) does nothing.
 */
#define META_FREE(x) free((x))

212#endif

213

214

215

216

217

218typedef struct

219{

224}

225

227

228

229

230

231

232

233

236{

238 char empty_string[] = "";

239

242

243 if (init_str == NULL)

244 init_str = empty_string;

245 s->length = strlen(init_str);

246

248

251

252 memcpy(s->str, init_str, s->length + 1);

254

255 return s;

256}

257

258

259static void

261{

262 if (s == NULL)

263 return;

264

267

269}

270

271

272static void

274{

278}

279

280

281static void

283{

284 char *i;

285

286 for (i = s->str; *i; i++)

287 *i = toupper((unsigned char) *i);

288}

289

290

291static int

293{

294 char c;

295

296 if ((pos < 0) || (pos >= s->length))

297 return 0;

298

299 c = *(s->str + pos);

300 if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||

301 (c == 'U') || (c == 'Y'))

302 return 1;

303

304 return 0;

305}

306

307

308static int

310{

311 if (strstr(s->str, "W"))

312 return 1;

313 else if (strstr(s->str, "K"))

314 return 1;

315 else if (strstr(s->str, "CZ"))

316 return 1;

317 else if (strstr(s->str, "WITZ"))

318 return 1;

319 else

320 return 0;

321}

322

323

324static char

326{

327 if ((pos < 0) || (pos >= s->length))

328 return '\0';

329

330 return ((char) *(s->str + pos));

331}

332

333

334static void

336{

337 if ((pos < 0) || (pos >= s->length))

338 return;

339

340 *(s->str + pos) = c;

341}

342

343

344

345

346

347static int

349{

351 char *pos;

352 va_list ap;

353

355 return 0;

356

358 va_start(ap, length);

359

360 do

361 {

362 test = va_arg(ap, char *);

363 if (*test && (strncmp(pos, test, length) == 0))

364 {

365 va_end(ap);

366 return 1;

367 }

368 }

369 while (strcmp(test, "") != 0);

370

371 va_end(ap);

372

373 return 0;

374}

375

376

377static void

379{

380 int add_length;

381

382 if (new_str == NULL)

383 return;

384

385 add_length = strlen(new_str);

388

389 strcat(s->str, new_str);

390 s->length += add_length;

391}

392

393

394static void

396{

397 int length;

401 int current;

402 int last;

403

404 current = 0;

405

406 length = strlen(str);

407 last = length - 1;

409

411

416

418

419

420 if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))

421 current += 1;

422

423

424 if (GetAt(original, 0) == 'X')

425 {

426 MetaphAdd(primary, "S");

428 current += 1;

429 }

430

431

432 while ((primary->length < 4) || (secondary->length < 4))

433 {

434 if (current >= length)

435 break;

436

437 switch (GetAt(original, current))

438 {

439 case 'A':

440 case 'E':

441 case 'I':

442 case 'O':

443 case 'U':

444 case 'Y':

445 if (current == 0)

446 {

447

450 }

451 current += 1;

452 break;

453

454 case 'B':

455

456

459

460 if (GetAt(original, current + 1) == 'B')

461 current += 2;

462 else

463 current += 1;

464 break;

465

466 case '\xc7':

469 current += 1;

470 break;

471

472 case 'C':

473

474 if ((current > 1)

475 && IsVowel(original, current - 2)

476 && StringAt(original, (current - 1), 3, "ACH", "")

477 && ((GetAt(original, current + 2) != 'I')

478 && ((GetAt(original, current + 2) != 'E')

479 || StringAt(original, (current - 2), 6, "BACHER",

480 "MACHER", ""))))

481 {

484 current += 2;

485 break;

486 }

487

488

489 if ((current == 0)

490 && StringAt(original, current, 6, "CAESAR", ""))

491 {

494 current += 2;

495 break;

496 }

497

498

499 if (StringAt(original, current, 4, "CHIA", ""))

500 {

503 current += 2;

504 break;

505 }

506

507 if (StringAt(original, current, 2, "CH", ""))

508 {

509

510 if ((current > 0)

511 && StringAt(original, current, 4, "CHAE", ""))

512 {

515 current += 2;

516 break;

517 }

518

519

520 if ((current == 0)

521 && (StringAt(original, (current + 1), 5,

522 "HARAC", "HARIS", "")

523 || StringAt(original, (current + 1), 3, "HOR",

524 "HYM", "HIA", "HEM", ""))

525 && StringAt(original, 0, 5, "CHORE", ""))

526 {

529 current += 2;

530 break;

531 }

532

533

534 if ((StringAt(original, 0, 4, "VAN ", "VON ", "")

535 || StringAt(original, 0, 3, "SCH", ""))

536

537 || StringAt(original, (current - 2), 6, "ORCHES",

538 "ARCHIT", "ORCHID", "")

539 || StringAt(original, (current + 2), 1, "T", "S",

540 "")

541 || ((StringAt(original, (current - 1), 1,

542 "A", "O", "U", "E", "")

543 || (current == 0))

544

545

546

547

548 && StringAt(original, (current + 2), 1, "L", "R",

549 "N", "M", "B", "H", "F", "V", "W",

550 " ", "")))

551 {

554 }

555 else

556 {

557 if (current > 0)

558 {

559 if (StringAt(original, 0, 2, "MC", ""))

560 {

561

564 }

565 else

566 {

569 }

570 }

571 else

572 {

575 }

576 }

577 current += 2;

578 break;

579 }

580

581 if (StringAt(original, current, 2, "CZ", "")

582 && StringAt(original, (current - 2), 4, "WICZ", ""))

583 {

586 current += 2;

587 break;

588 }

589

590

591 if (StringAt(original, (current + 1), 3, "CIA", ""))

592 {

595 current += 3;

596 break;

597 }

598

599

600 if (StringAt(original, current, 2, "CC", "")

601 && !((current == 1) && (GetAt(original, 0) == 'M')))

602 {

603

604 if (StringAt(original, (current + 2), 1, "I", "E", "H", "")

605 && StringAt(original, (current + 2), 2, "HU", ""))

606 {

607

608 if (((current == 1)

609 && (GetAt(original, current - 1) == 'A'))

610 || StringAt(original, (current - 1), 5, "UCCEE",

611 "UCCES", ""))

612 {

615

616 }

617 else

618 {

621 }

622 current += 3;

623 break;

624 }

625 else

626 {

629 current += 2;

630 break;

631 }

632 }

633

634 if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))

635 {

638 current += 2;

639 break;

640 }

641

642 if (StringAt(original, current, 2, "CI", "CE", "CY", ""))

643 {

644

646 (original, current, 3, "CIO", "CIE", "CIA", ""))

647 {

650 }

651 else

652 {

655 }

656 current += 2;

657 break;

658 }

659

660

663

664

665 if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))

666 current += 3;

667 else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")

668 && StringAt(original, (current + 1), 2,

669 "CE", "CI", ""))

670 current += 2;

671 else

672 current += 1;

673 break;

674

675 case 'D':

676 if (StringAt(original, current, 2, "DG", ""))

677 {

678 if (StringAt(original, (current + 2), 1,

679 "I", "E", "Y", ""))

680 {

681

684 current += 3;

685 break;

686 }

687 else

688 {

689

692 current += 2;

693 break;

694 }

695 }

696

697 if (StringAt(original, current, 2, "DT", "DD", ""))

698 {

701 current += 2;

702 break;

703 }

704

705

708 current += 1;

709 break;

710

711 case 'F':

712 if (GetAt(original, current + 1) == 'F')

713 current += 2;

714 else

715 current += 1;

718 break;

719

720 case 'G':

721 if (GetAt(original, current + 1) == 'H')

722 {

723 if ((current > 0) && IsVowel(original, current - 1))

724 {

727 current += 2;

728 break;

729 }

730

731 if (current < 3)

732 {

733

734 if (current == 0)

735 {

736 if (GetAt(original, current + 2) == 'I')

737 {

740 }

741 else

742 {

745 }

746 current += 2;

747 break;

748 }

749 }

750

751

752

753

754

755 if (((current > 1)

756 && StringAt(original, (current - 2), 1,

757 "B", "H", "D", ""))

758

759 || ((current > 2)

760 && StringAt(original, (current - 3), 1,

761 "B", "H", "D", ""))

762

763 || ((current > 3)

764 && StringAt(original, (current - 4), 1,

765 "B", "H", "")))

766 {

767 current += 2;

768 break;

769 }

770 else

771 {

772

773

774

775

776 if ((current > 2)

777 && (GetAt(original, current - 1) == 'U')

778 && StringAt(original, (current - 3), 1, "C",

779 "G", "L", "R", "T", ""))

780 {

783 }

784 else if ((current > 0)

785 && GetAt(original, current - 1) != 'I')

786 {

787

788

791 }

792

793 current += 2;

794 break;

795 }

796 }

797

798 if (GetAt(original, current + 1) == 'N')

799 {

800 if ((current == 1) && IsVowel(original, 0)

802 {

805 }

806 else

807

808 if (StringAt(original, (current + 2), 2, "EY", "")

809 && (GetAt(original, current + 1) != 'Y')

811 {

814 }

815 else

816 {

819 }

820 current += 2;

821 break;

822 }

823

824

825 if (StringAt(original, (current + 1), 2, "LI", "")

827 {

830 current += 2;

831 break;

832 }

833

834

835 if ((current == 0)

836 && ((GetAt(original, current + 1) == 'Y')

837 || StringAt(original, (current + 1), 2, "ES", "EP",

838 "EB", "EL", "EY", "IB", "IL", "IN", "IE",

839 "EI", "ER", "")))

840 {

843 current += 2;

844 break;

845 }

846

847

848 if ((StringAt(original, (current + 1), 2, "ER", "")

849 || (GetAt(original, current + 1) == 'Y'))

851 "DANGER", "RANGER", "MANGER", "")

852 && StringAt(original, (current - 1), 1, "E", "I", "")

853 && StringAt(original, (current - 1), 3, "RGY", "OGY", ""))

854 {

857 current += 2;

858 break;

859 }

860

861

862 if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")

863 || StringAt(original, (current - 1), 4,

864 "AGGI", "OGGI", ""))

865 {

866

867 if ((StringAt(original, 0, 4, "VAN ", "VON ", "")

868 || StringAt(original, 0, 3, "SCH", ""))

869 || StringAt(original, (current + 1), 2, "ET", ""))

870 {

873 }

874 else

875 {

876

878 (original, (current + 1), 4, "IER ", ""))

879 {

882 }

883 else

884 {

887 }

888 }

889 current += 2;

890 break;

891 }

892

893 if (GetAt(original, current + 1) == 'G')

894 current += 2;

895 else

896 current += 1;

899 break;

900

901 case 'H':

902

903 if (((current == 0) || IsVowel(original, current - 1))

904 && IsVowel(original, current + 1))

905 {

908 current += 2;

909 }

910 else

911

912 current += 1;

913 break;

914

915 case 'J':

916

917 if (StringAt(original, current, 4, "JOSE", "")

918 || StringAt(original, 0, 4, "SAN ", ""))

919 {

920 if (((current == 0)

921 && (GetAt(original, current + 4) == ' '))

922 || StringAt(original, 0, 4, "SAN ", ""))

923 {

926 }

927 else

928 {

931 }

932 current += 1;

933 break;

934 }

935

936 if ((current == 0)

937 && StringAt(original, current, 4, "JOSE", ""))

938 {

939 MetaphAdd(primary, "J");

941 }

942 else

943 {

944

945 if (IsVowel(original, current - 1)

947 && ((GetAt(original, current + 1) == 'A')

948 || (GetAt(original, current + 1) == 'O')))

949 {

952 }

953 else

954 {

955 if (current == last)

956 {

959 }

960 else

961 {

962 if (StringAt(original, (current + 1), 1, "L", "T",

963 "K", "S", "N", "M", "B", "Z", "")

964 && StringAt(original, (current - 1), 1,

965 "S", "K", "L", ""))

966 {

969 }

970 }

971 }

972 }

973

974 if (GetAt(original, current + 1) == 'J')

975 current += 2;

976 else

977 current += 1;

978 break;

979

980 case 'K':

981 if (GetAt(original, current + 1) == 'K')

982 current += 2;

983 else

984 current += 1;

987 break;

988

989 case 'L':

990 if (GetAt(original, current + 1) == 'L')

991 {

992

993 if (((current == (length - 3))

994 && StringAt(original, (current - 1), 4, "ILLO",

995 "ILLA", "ALLE", ""))

996 || ((StringAt(original, (last - 1), 2, "AS", "OS", "")

997 || StringAt(original, last, 1, "A", "O", ""))

998 && StringAt(original, (current - 1), 4,

999 "ALLE", "")))

1000 {

1003 current += 2;

1004 break;

1005 }

1006 current += 2;

1007 }

1008 else

1009 current += 1;

1012 break;

1013

1014 case 'M':

1015 if ((StringAt(original, (current - 1), 3, "UMB", "")

1016 && (((current + 1) == last)

1017 || StringAt(original, (current + 2), 2, "ER", "")))

1018

1019 || (GetAt(original, current + 1) == 'M'))

1020 current += 2;

1021 else

1022 current += 1;

1025 break;

1026

1027 case 'N':

1028 if (GetAt(original, current + 1) == 'N')

1029 current += 2;

1030 else

1031 current += 1;

1034 break;

1035

1036 case '\xd1':

1037 current += 1;

1040 break;

1041

1042 case 'P':

1043 if (GetAt(original, current + 1) == 'H')

1044 {

1047 current += 2;

1048 break;

1049 }

1050

1051

1052 if (StringAt(original, (current + 1), 1, "P", "B", ""))

1053 current += 2;

1054 else

1055 current += 1;

1058 break;

1059

1060 case 'Q':

1061 if (GetAt(original, current + 1) == 'Q')

1062 current += 2;

1063 else

1064 current += 1;

1067 break;

1068

1069 case 'R':

1070

1071 if ((current == last)

1073 && StringAt(original, (current - 2), 2, "IE", "")

1074 && StringAt(original, (current - 4), 2, "ME", "MA", ""))

1075 {

1078 }

1079 else

1080 {

1083 }

1084

1085 if (GetAt(original, current + 1) == 'R')

1086 current += 2;

1087 else

1088 current += 1;

1089 break;

1090

1091 case 'S':

1092

1093 if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))

1094 {

1095 current += 1;

1096 break;

1097 }

1098

1099

1100 if ((current == 0)

1101 && StringAt(original, current, 5, "SUGAR", ""))

1102 {

1105 current += 1;

1106 break;

1107 }

1108

1109 if (StringAt(original, current, 2, "SH", ""))

1110 {

1111

1113 (original, (current + 1), 4, "HEIM", "HOEK", "HOLM",

1114 "HOLZ", ""))

1115 {

1118 }

1119 else

1120 {

1123 }

1124 current += 2;

1125 break;

1126 }

1127

1128

1129 if (StringAt(original, current, 3, "SIO", "SIA", "")

1130 || StringAt(original, current, 4, "SIAN", ""))

1131 {

1133 {

1136 }

1137 else

1138 {

1141 }

1142 current += 3;

1143 break;

1144 }

1145

1146

1147

1148

1149

1150

1151 if (((current == 0)

1152 && StringAt(original, (current + 1), 1,

1153 "M", "N", "L", "W", ""))

1154 || StringAt(original, (current + 1), 1, "Z", ""))

1155 {

1158 if (StringAt(original, (current + 1), 1, "Z", ""))

1159 current += 2;

1160 else

1161 current += 1;

1162 break;

1163 }

1164

1165 if (StringAt(original, current, 2, "SC", ""))

1166 {

1167

1168 if (GetAt(original, current + 2) == 'H')

1169 {

1170

1171 if (StringAt(original, (current + 3), 2,

1172 "OO", "ER", "EN",

1173 "UY", "ED", "EM", ""))

1174 {

1175

1176 if (StringAt(original, (current + 3), 2,

1177 "ER", "EN", ""))

1178 {

1181 }

1182 else

1183 {

1186 }

1187 current += 3;

1188 break;

1189 }

1190 else

1191 {

1192 if ((current == 0) && IsVowel(original, 3)

1193 && (GetAt(original, 3) != 'W'))

1194 {

1197 }

1198 else

1199 {

1202 }

1203 current += 3;

1204 break;

1205 }

1206 }

1207

1208 if (StringAt(original, (current + 2), 1,

1209 "I", "E", "Y", ""))

1210 {

1213 current += 3;

1214 break;

1215 }

1216

1219 current += 3;

1220 break;

1221 }

1222

1223

1224 if ((current == last)

1225 && StringAt(original, (current - 2), 2, "AI", "OI", ""))

1226 {

1229 }

1230 else

1231 {

1234 }

1235

1236 if (StringAt(original, (current + 1), 1, "S", "Z", ""))

1237 current += 2;

1238 else

1239 current += 1;

1240 break;

1241

1242 case 'T':

1243 if (StringAt(original, current, 4, "TION", ""))

1244 {

1247 current += 3;

1248 break;

1249 }

1250

1251 if (StringAt(original, current, 3, "TIA", "TCH", ""))

1252 {

1255 current += 3;

1256 break;

1257 }

1258

1259 if (StringAt(original, current, 2, "TH", "")

1260 || StringAt(original, current, 3, "TTH", ""))

1261 {

1262

1263 if (StringAt(original, (current + 2), 2, "OM", "AM", "")

1264 || StringAt(original, 0, 4, "VAN ", "VON ", "")

1265 || StringAt(original, 0, 3, "SCH", ""))

1266 {

1269 }

1270 else

1271 {

1274 }

1275 current += 2;

1276 break;

1277 }

1278

1279 if (StringAt(original, (current + 1), 1, "T", "D", ""))

1280 current += 2;

1281 else

1282 current += 1;

1285 break;

1286

1287 case 'V':

1288 if (GetAt(original, current + 1) == 'V')

1289 current += 2;

1290 else

1291 current += 1;

1294 break;

1295

1296 case 'W':

1297

1298 if (StringAt(original, current, 2, "WR", ""))

1299 {

1302 current += 2;

1303 break;

1304 }

1305

1306 if ((current == 0)

1307 && (IsVowel(original, current + 1)

1308 || StringAt(original, current, 2, "WH", "")))

1309 {

1310

1311 if (IsVowel(original, current + 1))

1312 {

1315 }

1316 else

1317 {

1318

1321 }

1322 }

1323

1324

1325 if (((current == last) && IsVowel(original, current - 1))

1326 || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",

1327 "OWSKI", "OWSKY", "")

1328 || StringAt(original, 0, 3, "SCH", ""))

1329 {

1332 current += 1;

1333 break;

1334 }

1335

1336

1337 if (StringAt(original, current, 4, "WICZ", "WITZ", ""))

1338 {

1341 current += 4;

1342 break;

1343 }

1344

1345

1346 current += 1;

1347 break;

1348

1349 case 'X':

1350

1351 if (!((current == last)

1352 && (StringAt(original, (current - 3), 3,

1353 "IAU", "EAU", "")

1354 || StringAt(original, (current - 2), 2,

1355 "AU", "OU", ""))))

1356 {

1359 }

1360

1361

1362 if (StringAt(original, (current + 1), 1, "C", "X", ""))

1363 current += 2;

1364 else

1365 current += 1;

1366 break;

1367

1368 case 'Z':

1369

1370 if (GetAt(original, current + 1) == 'H')

1371 {

1374 current += 2;

1375 break;

1376 }

1377 else if (StringAt(original, (current + 1), 2,

1378 "ZO", "ZI", "ZA", "")

1380 && ((current > 0)

1381 && GetAt(original, current - 1) != 'T')))

1382 {

1385 }

1386 else

1387 {

1390 }

1391

1392 if (GetAt(original, current + 1) == 'Z')

1393 current += 2;

1394 else

1395 current += 1;

1396 break;

1397

1398 default:

1399 current += 1;

1400 }

1401

1402

1403

1404

1405

1406 }

1407

1408

1409 if (primary->length > 4)

1410 SetAt(primary, 4, '\0');

1411

1412 if (secondary->length > 4)

1413 SetAt(secondary, 4, '\0');

1414

1415 *codes = primary->str;

1416 *++codes = secondary->str;

1417

1421}

1422

1423#ifdef DMETAPHONE_MAIN

1424

1425

1426

1427main(int argc, char **argv)

1428{

1429 char *codes[2];

1430

1431 if (argc > 1)

1432 {

1434 printf("%s|%s\n", codes[0], codes[1]);

1435 }

1436}

1437

1438#endif

#define PG_GETARG_TEXT_PP(n)

#define PG_RETURN_TEXT_P(x)

int main(int argc, char **argv)

text * cstring_to_text(const char *s)

char * text_to_cstring(const text *t)