PostgreSQL Source Code: src/backend/utils/adt/levenshtein.c Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26#define MAX_LEVENSHTEIN_STRLEN 255
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66int
67#ifdef LEVENSHTEIN_LESS_EQUAL
69 const char *target, int tlen,
70 int ins_c, int del_c, int sub_c,
71 int max_d, bool trusted)
72#else
74 const char *target, int tlen,
75 int ins_c, int del_c, int sub_c,
76 bool trusted)
77#endif
78{
79 int m,
80 n;
81 int *prev;
82 int *curr;
83 int *s_char_len = NULL;
84 int j;
85 const char *y;
86
87
88
89
90
91
92#ifdef LEVENSHTEIN_LESS_EQUAL
93 int start_column,
94 stop_column;
95
96#undef START_COLUMN
97#undef STOP_COLUMN
98#define START_COLUMN start_column
99#define STOP_COLUMN stop_column
100#else
101#undef START_COLUMN
102#undef STOP_COLUMN
103#define START_COLUMN 0
104#define STOP_COLUMN m
105#endif
106
107
110
111
112
113
114
115 if (!m)
116 return n * ins_c;
117 if (!n)
118 return m * del_c;
119
120
121
122
123
124
125
126
127 if (!trusted &&
131 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
132 errmsg("levenshtein argument exceeds maximum length of %d characters",
134
135#ifdef LEVENSHTEIN_LESS_EQUAL
136
137 start_column = 0;
138 stop_column = m + 1;
139
140
141
142
143
144
145
146 if (max_d >= 0)
147 {
148 int min_theo_d;
149 int max_theo_d;
150 int net_inserts = n - m;
151
152 min_theo_d = net_inserts < 0 ?
153 -net_inserts * del_c : net_inserts * ins_c;
154 if (min_theo_d > max_d)
155 return max_d + 1;
156 if (ins_c + del_c < sub_c)
157 sub_c = ins_c + del_c;
158 max_theo_d = min_theo_d + sub_c * Min(m, n);
159 if (max_d >= max_theo_d)
160 max_d = -1;
161 else if (ins_c + del_c > 0)
162 {
163
164
165
166
167
168
169
170
171
172
173
174
175 int slack_d = max_d - min_theo_d;
176 int best_column = net_inserts < 0 ? -net_inserts : 0;
177
178 stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;
179 if (stop_column > m)
180 stop_column = m + 1;
181 }
182 }
183#endif
184
185
186
187
188
189
190
191
192
193 if (m != slen || n != tlen)
194 {
195 int i;
196 const char *cp = source;
197
198 s_char_len = (int *) palloc((m + 1) * sizeof(int));
200 {
202 cp += s_char_len[i];
203 }
204 s_char_len[i] = 0;
205 }
206
207
208 ++m;
209 ++n;
210
211
212 prev = (int *) palloc(2 * m * sizeof(int));
213 curr = prev + m;
214
215
216
217
218
221
222
223 for (y = target, j = 1; j < n; j++)
224 {
225 int *temp;
227 int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
228 int i;
229
230#ifdef LEVENSHTEIN_LESS_EQUAL
231
232
233
234
235
236
237
238 if (stop_column < m)
239 {
240 prev[stop_column] = max_d + 1;
241 ++stop_column;
242 }
243
244
245
246
247
248
249
250 if (start_column == 0)
251 {
252 curr[0] = j * ins_c;
253 i = 1;
254 }
255 else
256 i = start_column;
257#else
258 curr[0] = j * ins_c;
259 i = 1;
260#endif
261
262
263
264
265
266
267
268
269 if (s_char_len != NULL)
270 {
272 {
273 int ins;
274 int del;
275 int sub;
276 int x_char_len = s_char_len[i - 1];
277
278
279
280
281
282
283
284
285
286
287 ins = prev[i] + ins_c;
288 del = curr[i - 1] + del_c;
289 if (x[x_char_len - 1] == y[y_char_len - 1]
290 && x_char_len == y_char_len &&
292 sub = prev[i - 1];
293 else
294 sub = prev[i - 1] + sub_c;
295
296
298 curr[i] = Min(curr[i], sub);
299
300
301 x += x_char_len;
302 }
303 }
304 else
305 {
307 {
308 int ins;
309 int del;
310 int sub;
311
312
313 ins = prev[i] + ins_c;
314 del = curr[i - 1] + del_c;
315 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
316
317
319 curr[i] = Min(curr[i], sub);
320
321
322 x++;
323 }
324 }
325
326
327 temp = curr;
328 curr = prev;
329 prev = temp;
330
331
332 y += y_char_len;
333
334#ifdef LEVENSHTEIN_LESS_EQUAL
335
336
337
338
339
340
341
342
343 if (max_d >= 0)
344 {
345
346
347
348
349
350
351
352
353 int zp = j - (n - m);
354
355
356 while (stop_column > 0)
357 {
358 int ii = stop_column - 1;
359 int net_inserts = ii - zp;
360
361 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :
362 -net_inserts * del_c) <= max_d)
363 break;
364 stop_column--;
365 }
366
367
368 while (start_column < stop_column)
369 {
370 int net_inserts = start_column - zp;
371
372 if (prev[start_column] +
373 (net_inserts > 0 ? net_inserts * ins_c :
374 -net_inserts * del_c) <= max_d)
375 break;
376
377
378
379
380
381
382 prev[start_column] = max_d + 1;
383 curr[start_column] = max_d + 1;
384 if (start_column != 0)
385 source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
386 start_column++;
387 }
388
389
390 if (start_column >= stop_column)
391 return max_d + 1;
392 }
393#endif
394 }
395
396
397
398
399
400 return prev[m - 1];
401}
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
#define MAX_LEVENSHTEIN_STRLEN
int varstr_levenshtein(const char *source, int slen, const char *target, int tlen, int ins_c, int del_c, int sub_c, bool trusted)
int pg_mbstrlen_with_len(const char *mbstr, int limit)
int pg_mblen(const char *mbstr)
static rewind_source * source
static bool rest_of_char_same(const char *s1, const char *s2, int len)
int varstr_levenshtein_less_equal(const char *source, int slen, const char *target, int tlen, int ins_c, int del_c, int sub_c, int max_d, bool trusted)