PostgreSQL Source Code: src/backend/utils/adt/levenshtein.c Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26#define MAX_LEVENSHTEIN_STRLEN 255

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66int

67#ifdef LEVENSHTEIN_LESS_EQUAL

69 const char *target, int tlen,

70 int ins_c, int del_c, int sub_c,

71 int max_d, bool trusted)

72#else

74 const char *target, int tlen,

75 int ins_c, int del_c, int sub_c,

76 bool trusted)

77#endif

78{

79 int m,

80 n;

81 int *prev;

82 int *curr;

83 int *s_char_len = NULL;

84 int j;

85 const char *y;

86

87

88

89

90

91

92#ifdef LEVENSHTEIN_LESS_EQUAL

93 int start_column,

94 stop_column;

95

96#undef START_COLUMN

97#undef STOP_COLUMN

98#define START_COLUMN start_column

99#define STOP_COLUMN stop_column

100#else

101#undef START_COLUMN

102#undef STOP_COLUMN

103#define START_COLUMN 0

104#define STOP_COLUMN m

105#endif

106

107

110

111

112

113

114

115 if (!m)

116 return n * ins_c;

117 if (!n)

118 return m * del_c;

119

120

121

122

123

124

125

126

127 if (!trusted &&

131 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),

132 errmsg("levenshtein argument exceeds maximum length of %d characters",

134

135#ifdef LEVENSHTEIN_LESS_EQUAL

136

137 start_column = 0;

138 stop_column = m + 1;

139

140

141

142

143

144

145

146 if (max_d >= 0)

147 {

148 int min_theo_d;

149 int max_theo_d;

150 int net_inserts = n - m;

151

152 min_theo_d = net_inserts < 0 ?

153 -net_inserts * del_c : net_inserts * ins_c;

154 if (min_theo_d > max_d)

155 return max_d + 1;

156 if (ins_c + del_c < sub_c)

157 sub_c = ins_c + del_c;

158 max_theo_d = min_theo_d + sub_c * Min(m, n);

159 if (max_d >= max_theo_d)

160 max_d = -1;

161 else if (ins_c + del_c > 0)

162 {

163

164

165

166

167

168

169

170

171

172

173

174

175 int slack_d = max_d - min_theo_d;

176 int best_column = net_inserts < 0 ? -net_inserts : 0;

177

178 stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;

179 if (stop_column > m)

180 stop_column = m + 1;

181 }

182 }

183#endif

184

185

186

187

188

189

190

191

192

193 if (m != slen || n != tlen)

194 {

195 int i;

196 const char *cp = source;

197

198 s_char_len = (int *) palloc((m + 1) * sizeof(int));

199 for (i = 0; i < m; ++i)

200 {

202 cp += s_char_len[i];

203 }

204 s_char_len[i] = 0;

205 }

206

207

208 ++m;

209 ++n;

210

211

212 prev = (int *) palloc(2 * m * sizeof(int));

213 curr = prev + m;

214

215

216

217

218

220 prev[i] = i * del_c;

221

222

223 for (y = target, j = 1; j < n; j++)

224 {

225 int *temp;

227 int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;

228 int i;

229

230#ifdef LEVENSHTEIN_LESS_EQUAL

231

232

233

234

235

236

237

238 if (stop_column < m)

239 {

240 prev[stop_column] = max_d + 1;

241 ++stop_column;

242 }

243

244

245

246

247

248

249

250 if (start_column == 0)

251 {

252 curr[0] = j * ins_c;

253 i = 1;

254 }

255 else

256 i = start_column;

257#else

258 curr[0] = j * ins_c;

259 i = 1;

260#endif

261

262

263

264

265

266

267

268

269 if (s_char_len != NULL)

270 {

272 {

273 int ins;

274 int del;

275 int sub;

276 int x_char_len = s_char_len[i - 1];

277

278

279

280

281

282

283

284

285

286

287 ins = prev[i] + ins_c;

288 del = curr[i - 1] + del_c;

289 if (x[x_char_len - 1] == y[y_char_len - 1]

290 && x_char_len == y_char_len &&

292 sub = prev[i - 1];

293 else

294 sub = prev[i - 1] + sub_c;

295

296

297 curr[i] = Min(ins, del);

298 curr[i] = Min(curr[i], sub);

299

300

301 x += x_char_len;

302 }

303 }

304 else

305 {

307 {

308 int ins;

309 int del;

310 int sub;

311

312

313 ins = prev[i] + ins_c;

314 del = curr[i - 1] + del_c;

315 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);

316

317

318 curr[i] = Min(ins, del);

319 curr[i] = Min(curr[i], sub);

320

321

322 x++;

323 }

324 }

325

326

327 temp = curr;

328 curr = prev;

329 prev = temp;

330

331

332 y += y_char_len;

333

334#ifdef LEVENSHTEIN_LESS_EQUAL

335

336

337

338

339

340

341

342

343 if (max_d >= 0)

344 {

345

346

347

348

349

350

351

352

353 int zp = j - (n - m);

354

355

356 while (stop_column > 0)

357 {

358 int ii = stop_column - 1;

359 int net_inserts = ii - zp;

360

361 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :

362 -net_inserts * del_c) <= max_d)

363 break;

364 stop_column--;

365 }

366

367

368 while (start_column < stop_column)

369 {

370 int net_inserts = start_column - zp;

371

372 if (prev[start_column] +

373 (net_inserts > 0 ? net_inserts * ins_c :

374 -net_inserts * del_c) <= max_d)

375 break;

376

377

378

379

380

381

382 prev[start_column] = max_d + 1;

383 curr[start_column] = max_d + 1;

384 if (start_column != 0)

385 source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;

386 start_column++;

387 }

388

389

390 if (start_column >= stop_column)

391 return max_d + 1;

392 }

393#endif

394 }

395

396

397

398

399

400 return prev[m - 1];

401}

int errcode(int sqlerrcode)

int errmsg(const char *fmt,...)

#define ereport(elevel,...)

#define MAX_LEVENSHTEIN_STRLEN

int varstr_levenshtein(const char *source, int slen, const char *target, int tlen, int ins_c, int del_c, int sub_c, bool trusted)

int pg_mbstrlen_with_len(const char *mbstr, int limit)

int pg_mblen(const char *mbstr)

static rewind_source * source

static bool rest_of_char_same(const char *s1, const char *s2, int len)

int varstr_levenshtein_less_equal(const char *source, int slen, const char *target, int tlen, int ins_c, int del_c, int sub_c, int max_d, bool trusted)