Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * unaccent.c
4 : * Text search unaccent dictionary
5 : *
6 : * Copyright (c) 2009-2024, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * contrib/unaccent/unaccent.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 :
14 : #include "postgres.h"
15 :
16 : #include "catalog/pg_ts_dict.h"
17 : #include "commands/defrem.h"
18 : #include "lib/stringinfo.h"
19 : #include "tsearch/ts_cache.h"
20 : #include "tsearch/ts_locale.h"
21 : #include "tsearch/ts_public.h"
22 : #include "utils/builtins.h"
23 : #include "utils/lsyscache.h"
24 : #include "utils/syscache.h"
25 :
26 2 : PG_MODULE_MAGIC;
27 :
/*
 * The unaccent dictionary finds replacement strings with a byte-wise trie.
 * A trie node is an array of 256 TrieChar elements, indexed by the value of
 * the next input byte.  One element may carry a replacement string (used
 * when the source string ends with that byte), a link to a further node
 * (followed when more bytes remain), or both.
 *
 * The search logic ignores multibyte character boundaries.  That is safe as
 * long as both the strings stored in the trie and the strings looked up are
 * validly encoded, because then no partial-character match can occur.
 */
typedef struct TrieChar TrieChar;

struct TrieChar
{
	TrieChar   *nextChar;		/* node for the following byte, or NULL */
	char	   *replaceTo;		/* replacement bytes (length is given by
								 * replacelen), or NULL if no source string
								 * ends at this element */
	int			replacelen;		/* number of bytes in replaceTo */
};
46 :
47 : /*
48 : * placeChar - put str into trie's structure, byte by byte.
49 : *
50 : * If node is NULL, we need to make a new node, which will be returned;
51 : * otherwise the return value is the same as node.
52 : */
53 : static TrieChar *
54 33424 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
55 : const char *replaceTo, int replacelen)
56 : {
57 : TrieChar *curnode;
58 :
59 33424 : if (!node)
60 332 : node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
61 :
62 : Assert(lenstr > 0); /* else str[0] doesn't exist */
63 :
64 33424 : curnode = node + *str;
65 :
66 33424 : if (lenstr <= 1)
67 : {
68 10644 : if (curnode->replaceTo)
69 0 : ereport(WARNING,
70 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
71 : errmsg("duplicate source strings, first one will be used")));
72 : else
73 : {
74 10644 : curnode->replacelen = replacelen;
75 10644 : curnode->replaceTo = (char *) palloc(replacelen);
76 10644 : memcpy(curnode->replaceTo, replaceTo, replacelen);
77 : }
78 : }
79 : else
80 : {
81 22780 : curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
82 : replaceTo, replacelen);
83 : }
84 :
85 33424 : return node;
86 : }
87 :
88 : /*
89 : * initTrie - create trie from file.
90 : *
91 : * Function converts UTF8-encoded file into current encoding.
92 : */
93 : static TrieChar *
94 4 : initTrie(const char *filename)
95 : {
96 4 : TrieChar *volatile rootTrie = NULL;
97 4 : MemoryContext ccxt = CurrentMemoryContext;
98 : tsearch_readline_state trst;
99 : volatile bool skip;
100 :
101 4 : filename = get_tsearch_config_filename(filename, "rules");
102 4 : if (!tsearch_readline_begin(&trst, filename))
103 0 : ereport(ERROR,
104 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
105 : errmsg("could not open unaccent file \"%s\": %m",
106 : filename)));
107 :
108 : do
109 : {
110 : /*
111 : * pg_do_encoding_conversion() (called by tsearch_readline()) will
112 : * emit exception if it finds untranslatable characters in current
113 : * locale. We just skip such lines, continuing with the next.
114 : */
115 4 : skip = true;
116 :
117 4 : PG_TRY();
118 : {
119 : char *line;
120 :
121 10648 : while ((line = tsearch_readline(&trst)) != NULL)
122 : {
123 : /*----------
124 : * The format of each line must be "src" or "src trg", where
125 : * src and trg are sequences of one or more non-whitespace
126 : * characters, separated by whitespace. Whitespace at start
127 : * or end of line is ignored. If trg is omitted, an empty
128 : * string is used as the replacement. trg can be optionally
129 : * quoted, in which case whitespaces are included in it.
130 : *
131 : * We use a simple state machine, with states
132 : * 0 initial (before src)
133 : * 1 in src
134 : * 2 in whitespace after src
135 : * 3 in trg (non-quoted)
136 : * 4 in trg (quoted)
137 : * 5 in whitespace after trg
138 : * -1 syntax error detected (two strings)
139 : * -2 syntax error detected (unfinished quoted string)
140 : *----------
141 : */
142 : int state;
143 : char *ptr;
144 10644 : char *src = NULL;
145 10644 : char *trg = NULL;
146 10644 : char *trgstore = NULL;
147 : int ptrlen;
148 10644 : int srclen = 0;
149 10644 : int trglen = 0;
150 10644 : int trgstorelen = 0;
151 10644 : bool trgquoted = false;
152 :
153 10644 : state = 0;
154 54712 : for (ptr = line; *ptr; ptr += ptrlen)
155 : {
156 44068 : ptrlen = pg_mblen(ptr);
157 : /* ignore whitespace, but end src or trg */
158 44068 : if (t_isspace(ptr))
159 : {
160 20944 : if (state == 1)
161 10644 : state = 2;
162 10300 : else if (state == 3)
163 10108 : state = 5;
164 : /* whitespaces are OK in quoted area */
165 20944 : if (state != 4)
166 20864 : continue;
167 : }
168 23204 : switch (state)
169 : {
170 10644 : case 0:
171 : /* start of src */
172 10644 : src = ptr;
173 10644 : srclen = ptrlen;
174 10644 : state = 1;
175 10644 : break;
176 0 : case 1:
177 : /* continue src */
178 0 : srclen += ptrlen;
179 0 : break;
180 10220 : case 2:
181 : /* start of trg */
182 10220 : if (*ptr == '"')
183 : {
184 112 : trgquoted = true;
185 112 : state = 4;
186 : }
187 : else
188 10108 : state = 3;
189 :
190 10220 : trg = ptr;
191 10220 : trglen = ptrlen;
192 10220 : break;
193 1876 : case 3:
194 : /* continue non-quoted trg */
195 1876 : trglen += ptrlen;
196 1876 : break;
197 464 : case 4:
198 : /* continue quoted trg */
199 464 : trglen += ptrlen;
200 :
201 : /*
202 : * If this is a quote, consider it as the end of
203 : * trg except if the follow-up character is itself
204 : * a quote.
205 : */
206 464 : if (*ptr == '"')
207 : {
208 144 : if (*(ptr + 1) == '"')
209 : {
210 32 : ptr++;
211 32 : trglen += 1;
212 : }
213 : else
214 112 : state = 5;
215 : }
216 464 : break;
217 0 : default:
218 : /* bogus line format */
219 0 : state = -1;
220 0 : break;
221 : }
222 : }
223 :
224 10644 : if (state == 1 || state == 2)
225 : {
226 : /* trg was omitted, so use "" */
227 424 : trg = "";
228 424 : trglen = 0;
229 : }
230 :
231 : /* If still in a quoted area, fallback to an error */
232 10644 : if (state == 4)
233 0 : state = -2;
234 :
235 : /* If trg was quoted, remove its quotes and unescape it */
236 10644 : if (trgquoted && state > 0)
237 : {
238 : /* Ignore first and end quotes */
239 112 : trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
240 112 : trgstorelen = 0;
241 464 : for (int i = 1; i < trglen - 1; i++)
242 : {
243 352 : trgstore[trgstorelen] = trg[i];
244 352 : trgstorelen++;
245 : /* skip second double quotes */
246 352 : if (trg[i] == '"' && trg[i + 1] == '"')
247 32 : i++;
248 : }
249 : }
250 : else
251 : {
252 10532 : trgstore = (char *) palloc(sizeof(char) * trglen);
253 10532 : trgstorelen = trglen;
254 10532 : memcpy(trgstore, trg, trgstorelen);
255 : }
256 :
257 10644 : if (state > 0)
258 10644 : rootTrie = placeChar(rootTrie,
259 : (unsigned char *) src, srclen,
260 : trgstore, trgstorelen);
261 0 : else if (state == -1)
262 0 : ereport(WARNING,
263 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
264 : errmsg("invalid syntax: more than two strings in unaccent rule")));
265 0 : else if (state == -2)
266 0 : ereport(WARNING,
267 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
268 : errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
269 :
270 10644 : pfree(trgstore);
271 10644 : pfree(line);
272 : }
273 4 : skip = false;
274 : }
275 0 : PG_CATCH();
276 : {
277 : ErrorData *errdata;
278 : MemoryContext ecxt;
279 :
280 0 : ecxt = MemoryContextSwitchTo(ccxt);
281 0 : errdata = CopyErrorData();
282 0 : if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
283 : {
284 0 : FlushErrorState();
285 : }
286 : else
287 : {
288 0 : MemoryContextSwitchTo(ecxt);
289 0 : PG_RE_THROW();
290 : }
291 : }
292 4 : PG_END_TRY();
293 : }
294 4 : while (skip);
295 :
296 4 : tsearch_readline_end(&trst);
297 :
298 4 : return rootTrie;
299 : }
300 :
301 : /*
302 : * findReplaceTo - find longest possible match in trie
303 : *
304 : * On success, returns pointer to ending subnode, plus length of matched
305 : * source string in *p_matchlen. On failure, returns NULL.
306 : */
307 : static TrieChar *
308 158 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
309 : int *p_matchlen)
310 : {
311 158 : TrieChar *result = NULL;
312 158 : int matchlen = 0;
313 :
314 158 : *p_matchlen = 0; /* prevent uninitialized-variable warnings */
315 :
316 452 : while (node && matchlen < srclen)
317 : {
318 294 : node = node + src[matchlen];
319 294 : matchlen++;
320 :
321 294 : if (node->replaceTo)
322 : {
323 74 : result = node;
324 74 : *p_matchlen = matchlen;
325 : }
326 :
327 294 : node = node->nextChar;
328 : }
329 :
330 158 : return result;
331 : }
332 :
333 4 : PG_FUNCTION_INFO_V1(unaccent_init);
334 : Datum
335 4 : unaccent_init(PG_FUNCTION_ARGS)
336 : {
337 4 : List *dictoptions = (List *) PG_GETARG_POINTER(0);
338 4 : TrieChar *rootTrie = NULL;
339 4 : bool fileloaded = false;
340 : ListCell *l;
341 :
342 8 : foreach(l, dictoptions)
343 : {
344 4 : DefElem *defel = (DefElem *) lfirst(l);
345 :
346 4 : if (strcmp(defel->defname, "rules") == 0)
347 : {
348 4 : if (fileloaded)
349 0 : ereport(ERROR,
350 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
351 : errmsg("multiple Rules parameters")));
352 4 : rootTrie = initTrie(defGetString(defel));
353 4 : fileloaded = true;
354 : }
355 : else
356 : {
357 0 : ereport(ERROR,
358 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
359 : errmsg("unrecognized Unaccent parameter: \"%s\"",
360 : defel->defname)));
361 : }
362 : }
363 :
364 4 : if (!fileloaded)
365 : {
366 0 : ereport(ERROR,
367 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
368 : errmsg("missing Rules parameter")));
369 : }
370 :
371 4 : PG_RETURN_POINTER(rootTrie);
372 : }
373 :
374 4 : PG_FUNCTION_INFO_V1(unaccent_lexize);
375 : Datum
376 56 : unaccent_lexize(PG_FUNCTION_ARGS)
377 : {
378 56 : TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
379 56 : char *srcchar = (char *) PG_GETARG_POINTER(1);
380 56 : int32 len = PG_GETARG_INT32(2);
381 56 : char *srcstart = srcchar;
382 : TSLexeme *res;
383 : StringInfoData buf;
384 :
385 : /* we allocate storage for the buffer only if needed */
386 56 : buf.data = NULL;
387 :
388 214 : while (len > 0)
389 : {
390 : TrieChar *node;
391 : int matchlen;
392 :
393 158 : node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
394 : &matchlen);
395 158 : if (node && node->replaceTo)
396 : {
397 74 : if (buf.data == NULL)
398 : {
399 : /* initialize buffer */
400 50 : initStringInfo(&buf);
401 : /* insert any data we already skipped over */
402 50 : if (srcchar != srcstart)
403 12 : appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
404 : }
405 74 : appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
406 : }
407 : else
408 : {
409 84 : matchlen = pg_mblen(srcchar);
410 84 : if (buf.data != NULL)
411 36 : appendBinaryStringInfo(&buf, srcchar, matchlen);
412 : }
413 :
414 158 : srcchar += matchlen;
415 158 : len -= matchlen;
416 : }
417 :
418 : /* return a result only if we made at least one substitution */
419 56 : if (buf.data != NULL)
420 : {
421 50 : res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
422 50 : res->lexeme = buf.data;
423 50 : res->flags = TSL_FILTER;
424 : }
425 : else
426 6 : res = NULL;
427 :
428 56 : PG_RETURN_POINTER(res);
429 : }
430 :
431 : /*
432 : * Function-like wrapper for dictionary
433 : */
434 8 : PG_FUNCTION_INFO_V1(unaccent_dict);
435 : Datum
436 38 : unaccent_dict(PG_FUNCTION_ARGS)
437 : {
438 : text *str;
439 : int strArg;
440 : Oid dictOid;
441 : TSDictionaryCacheEntry *dict;
442 : TSLexeme *res;
443 :
444 38 : if (PG_NARGS() == 1)
445 : {
446 : /*
447 : * Use the "unaccent" dictionary that is in the same schema that this
448 : * function is in.
449 : */
450 20 : Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
451 20 : const char *dictname = "unaccent";
452 :
453 20 : dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
454 : PointerGetDatum(dictname),
455 : ObjectIdGetDatum(procnspid));
456 20 : if (!OidIsValid(dictOid))
457 0 : ereport(ERROR,
458 : (errcode(ERRCODE_UNDEFINED_OBJECT),
459 : errmsg("text search dictionary \"%s.%s\" does not exist",
460 : get_namespace_name(procnspid), dictname)));
461 20 : strArg = 0;
462 : }
463 : else
464 : {
465 18 : dictOid = PG_GETARG_OID(0);
466 18 : strArg = 1;
467 : }
468 38 : str = PG_GETARG_TEXT_PP(strArg);
469 :
470 38 : dict = lookup_ts_dictionary_cache(dictOid);
471 :
472 38 : res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
473 : PointerGetDatum(dict->dictData),
474 : PointerGetDatum(VARDATA_ANY(str)),
475 : Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
476 : PointerGetDatum(NULL)));
477 :
478 38 : PG_FREE_IF_COPY(str, strArg);
479 :
480 38 : if (res == NULL)
481 : {
482 4 : PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
483 : }
484 34 : else if (res->lexeme == NULL)
485 : {
486 0 : pfree(res);
487 0 : PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
488 : }
489 : else
490 : {
491 34 : text *txt = cstring_to_text(res->lexeme);
492 :
493 34 : pfree(res->lexeme);
494 34 : pfree(res);
495 :
496 34 : PG_RETURN_TEXT_P(txt);
497 : }
498 : }
|