Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * unaccent.c
4 : * Text search unaccent dictionary
5 : *
6 : * Copyright (c) 2009-2025, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * contrib/unaccent/unaccent.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 :
14 : #include "postgres.h"
15 :
16 : #include "catalog/pg_ts_dict.h"
17 : #include "commands/defrem.h"
18 : #include "lib/stringinfo.h"
19 : #include "tsearch/ts_cache.h"
20 : #include "tsearch/ts_locale.h"
21 : #include "tsearch/ts_public.h"
22 : #include "utils/builtins.h"
23 : #include "utils/lsyscache.h"
24 : #include "utils/syscache.h"
25 :
/*
 * Module magic block for this loadable module, tagged with the module's
 * name and the PG_VERSION string.
 */
PG_MODULE_MAGIC_EXT(
					.name = "unaccent",
					.version = PG_VERSION
);
30 :
/*
 * An unaccent dictionary uses a trie to find a string to replace.  Each node
 * of the trie is an array of 256 TrieChar structs; the N-th element of the
 * array corresponds to next byte value N.  That element can contain both a
 * replacement string (to be used if the source string ends with this byte)
 * and a link to another trie node (to be followed if there are more bytes).
 *
 * Note that the trie search logic pays no attention to multibyte character
 * boundaries.  This is OK as long as both the data entered into the trie and
 * the data we're trying to look up are validly encoded; no partial-character
 * matches will occur.
 */
typedef struct TrieChar
{
	/* next-level node (array of 256 entries), or NULL if no longer source */
	struct TrieChar *nextChar;
	/* replacement bytes (no NUL terminator is stored), or NULL if no rule */
	char	   *replaceTo;
	/* number of bytes in replaceTo */
	int			replacelen;
} TrieChar;
49 :
50 : /*
51 : * placeChar - put str into trie's structure, byte by byte.
52 : *
53 : * If node is NULL, we need to make a new node, which will be returned;
54 : * otherwise the return value is the same as node.
55 : */
56 : static TrieChar *
57 33424 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
58 : const char *replaceTo, int replacelen)
59 : {
60 : TrieChar *curnode;
61 :
62 33424 : if (!node)
63 332 : node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
64 :
65 : Assert(lenstr > 0); /* else str[0] doesn't exist */
66 :
67 33424 : curnode = node + *str;
68 :
69 33424 : if (lenstr <= 1)
70 : {
71 10644 : if (curnode->replaceTo)
72 0 : ereport(WARNING,
73 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
74 : errmsg("duplicate source strings, first one will be used")));
75 : else
76 : {
77 10644 : curnode->replacelen = replacelen;
78 10644 : curnode->replaceTo = (char *) palloc(replacelen);
79 10644 : memcpy(curnode->replaceTo, replaceTo, replacelen);
80 : }
81 : }
82 : else
83 : {
84 22780 : curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
85 : replaceTo, replacelen);
86 : }
87 :
88 33424 : return node;
89 : }
90 :
91 : /*
92 : * initTrie - create trie from file.
93 : *
94 : * Function converts UTF8-encoded file into current encoding.
95 : */
96 : static TrieChar *
97 4 : initTrie(const char *filename)
98 : {
99 4 : TrieChar *volatile rootTrie = NULL;
100 4 : MemoryContext ccxt = CurrentMemoryContext;
101 : tsearch_readline_state trst;
102 : volatile bool skip;
103 :
104 4 : filename = get_tsearch_config_filename(filename, "rules");
105 4 : if (!tsearch_readline_begin(&trst, filename))
106 0 : ereport(ERROR,
107 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
108 : errmsg("could not open unaccent file \"%s\": %m",
109 : filename)));
110 :
111 : do
112 : {
113 : /*
114 : * pg_do_encoding_conversion() (called by tsearch_readline()) will
115 : * emit exception if it finds untranslatable characters in current
116 : * locale. We just skip such lines, continuing with the next.
117 : */
118 4 : skip = true;
119 :
120 4 : PG_TRY();
121 : {
122 : char *line;
123 :
124 10648 : while ((line = tsearch_readline(&trst)) != NULL)
125 : {
126 : /*----------
127 : * The format of each line must be "src" or "src trg", where
128 : * src and trg are sequences of one or more non-whitespace
129 : * characters, separated by whitespace. Whitespace at start
130 : * or end of line is ignored. If trg is omitted, an empty
131 : * string is used as the replacement. trg can be optionally
132 : * quoted, in which case whitespaces are included in it.
133 : *
134 : * We use a simple state machine, with states
135 : * 0 initial (before src)
136 : * 1 in src
137 : * 2 in whitespace after src
138 : * 3 in trg (non-quoted)
139 : * 4 in trg (quoted)
140 : * 5 in whitespace after trg
141 : * -1 syntax error detected (two strings)
142 : * -2 syntax error detected (unfinished quoted string)
143 : *----------
144 : */
145 : int state;
146 : char *ptr;
147 10644 : char *src = NULL;
148 10644 : char *trg = NULL;
149 10644 : char *trgstore = NULL;
150 : int ptrlen;
151 10644 : int srclen = 0;
152 10644 : int trglen = 0;
153 10644 : int trgstorelen = 0;
154 10644 : bool trgquoted = false;
155 :
156 10644 : state = 0;
157 54712 : for (ptr = line; *ptr; ptr += ptrlen)
158 : {
159 44068 : ptrlen = pg_mblen(ptr);
160 : /* ignore whitespace, but end src or trg */
161 44068 : if (isspace((unsigned char) *ptr))
162 : {
163 20944 : if (state == 1)
164 10644 : state = 2;
165 10300 : else if (state == 3)
166 10108 : state = 5;
167 : /* whitespaces are OK in quoted area */
168 20944 : if (state != 4)
169 20864 : continue;
170 : }
171 23204 : switch (state)
172 : {
173 10644 : case 0:
174 : /* start of src */
175 10644 : src = ptr;
176 10644 : srclen = ptrlen;
177 10644 : state = 1;
178 10644 : break;
179 0 : case 1:
180 : /* continue src */
181 0 : srclen += ptrlen;
182 0 : break;
183 10220 : case 2:
184 : /* start of trg */
185 10220 : if (*ptr == '"')
186 : {
187 112 : trgquoted = true;
188 112 : state = 4;
189 : }
190 : else
191 10108 : state = 3;
192 :
193 10220 : trg = ptr;
194 10220 : trglen = ptrlen;
195 10220 : break;
196 1876 : case 3:
197 : /* continue non-quoted trg */
198 1876 : trglen += ptrlen;
199 1876 : break;
200 464 : case 4:
201 : /* continue quoted trg */
202 464 : trglen += ptrlen;
203 :
204 : /*
205 : * If this is a quote, consider it as the end of
206 : * trg except if the follow-up character is itself
207 : * a quote.
208 : */
209 464 : if (*ptr == '"')
210 : {
211 144 : if (*(ptr + 1) == '"')
212 : {
213 32 : ptr++;
214 32 : trglen += 1;
215 : }
216 : else
217 112 : state = 5;
218 : }
219 464 : break;
220 0 : default:
221 : /* bogus line format */
222 0 : state = -1;
223 0 : break;
224 : }
225 : }
226 :
227 10644 : if (state == 1 || state == 2)
228 : {
229 : /* trg was omitted, so use "" */
230 424 : trg = "";
231 424 : trglen = 0;
232 : }
233 :
234 : /* If still in a quoted area, fallback to an error */
235 10644 : if (state == 4)
236 0 : state = -2;
237 :
238 : /* If trg was quoted, remove its quotes and unescape it */
239 10644 : if (trgquoted && state > 0)
240 : {
241 : /* Ignore first and end quotes */
242 112 : trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
243 112 : trgstorelen = 0;
244 464 : for (int i = 1; i < trglen - 1; i++)
245 : {
246 352 : trgstore[trgstorelen] = trg[i];
247 352 : trgstorelen++;
248 : /* skip second double quotes */
249 352 : if (trg[i] == '"' && trg[i + 1] == '"')
250 32 : i++;
251 : }
252 : }
253 : else
254 : {
255 10532 : trgstore = (char *) palloc(sizeof(char) * trglen);
256 10532 : trgstorelen = trglen;
257 10532 : memcpy(trgstore, trg, trgstorelen);
258 : }
259 :
260 10644 : if (state > 0)
261 10644 : rootTrie = placeChar(rootTrie,
262 : (unsigned char *) src, srclen,
263 : trgstore, trgstorelen);
264 0 : else if (state == -1)
265 0 : ereport(WARNING,
266 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
267 : errmsg("invalid syntax: more than two strings in unaccent rule")));
268 0 : else if (state == -2)
269 0 : ereport(WARNING,
270 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
271 : errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
272 :
273 10644 : pfree(trgstore);
274 10644 : pfree(line);
275 : }
276 4 : skip = false;
277 : }
278 0 : PG_CATCH();
279 : {
280 : ErrorData *errdata;
281 : MemoryContext ecxt;
282 :
283 0 : ecxt = MemoryContextSwitchTo(ccxt);
284 0 : errdata = CopyErrorData();
285 0 : if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
286 : {
287 0 : FlushErrorState();
288 : }
289 : else
290 : {
291 0 : MemoryContextSwitchTo(ecxt);
292 0 : PG_RE_THROW();
293 : }
294 : }
295 4 : PG_END_TRY();
296 : }
297 4 : while (skip);
298 :
299 4 : tsearch_readline_end(&trst);
300 :
301 4 : return rootTrie;
302 : }
303 :
304 : /*
305 : * findReplaceTo - find longest possible match in trie
306 : *
307 : * On success, returns pointer to ending subnode, plus length of matched
308 : * source string in *p_matchlen. On failure, returns NULL.
309 : */
310 : static TrieChar *
311 158 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
312 : int *p_matchlen)
313 : {
314 158 : TrieChar *result = NULL;
315 158 : int matchlen = 0;
316 :
317 158 : *p_matchlen = 0; /* prevent uninitialized-variable warnings */
318 :
319 452 : while (node && matchlen < srclen)
320 : {
321 294 : node = node + src[matchlen];
322 294 : matchlen++;
323 :
324 294 : if (node->replaceTo)
325 : {
326 74 : result = node;
327 74 : *p_matchlen = matchlen;
328 : }
329 :
330 294 : node = node->nextChar;
331 : }
332 :
333 158 : return result;
334 : }
335 :
336 4 : PG_FUNCTION_INFO_V1(unaccent_init);
337 : Datum
338 4 : unaccent_init(PG_FUNCTION_ARGS)
339 : {
340 4 : List *dictoptions = (List *) PG_GETARG_POINTER(0);
341 4 : TrieChar *rootTrie = NULL;
342 4 : bool fileloaded = false;
343 : ListCell *l;
344 :
345 8 : foreach(l, dictoptions)
346 : {
347 4 : DefElem *defel = (DefElem *) lfirst(l);
348 :
349 4 : if (strcmp(defel->defname, "rules") == 0)
350 : {
351 4 : if (fileloaded)
352 0 : ereport(ERROR,
353 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
354 : errmsg("multiple Rules parameters")));
355 4 : rootTrie = initTrie(defGetString(defel));
356 4 : fileloaded = true;
357 : }
358 : else
359 : {
360 0 : ereport(ERROR,
361 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
362 : errmsg("unrecognized Unaccent parameter: \"%s\"",
363 : defel->defname)));
364 : }
365 : }
366 :
367 4 : if (!fileloaded)
368 : {
369 0 : ereport(ERROR,
370 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
371 : errmsg("missing Rules parameter")));
372 : }
373 :
374 4 : PG_RETURN_POINTER(rootTrie);
375 : }
376 :
377 4 : PG_FUNCTION_INFO_V1(unaccent_lexize);
378 : Datum
379 56 : unaccent_lexize(PG_FUNCTION_ARGS)
380 : {
381 56 : TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
382 56 : char *srcchar = (char *) PG_GETARG_POINTER(1);
383 56 : int32 len = PG_GETARG_INT32(2);
384 56 : char *srcstart = srcchar;
385 : TSLexeme *res;
386 : StringInfoData buf;
387 :
388 : /* we allocate storage for the buffer only if needed */
389 56 : buf.data = NULL;
390 :
391 214 : while (len > 0)
392 : {
393 : TrieChar *node;
394 : int matchlen;
395 :
396 158 : node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
397 : &matchlen);
398 158 : if (node && node->replaceTo)
399 : {
400 74 : if (buf.data == NULL)
401 : {
402 : /* initialize buffer */
403 50 : initStringInfo(&buf);
404 : /* insert any data we already skipped over */
405 50 : if (srcchar != srcstart)
406 12 : appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
407 : }
408 74 : appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
409 : }
410 : else
411 : {
412 84 : matchlen = pg_mblen(srcchar);
413 84 : if (buf.data != NULL)
414 36 : appendBinaryStringInfo(&buf, srcchar, matchlen);
415 : }
416 :
417 158 : srcchar += matchlen;
418 158 : len -= matchlen;
419 : }
420 :
421 : /* return a result only if we made at least one substitution */
422 56 : if (buf.data != NULL)
423 : {
424 50 : res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
425 50 : res->lexeme = buf.data;
426 50 : res->flags = TSL_FILTER;
427 : }
428 : else
429 6 : res = NULL;
430 :
431 56 : PG_RETURN_POINTER(res);
432 : }
433 :
434 : /*
435 : * Function-like wrapper for dictionary
436 : */
437 8 : PG_FUNCTION_INFO_V1(unaccent_dict);
438 : Datum
439 38 : unaccent_dict(PG_FUNCTION_ARGS)
440 : {
441 : text *str;
442 : int strArg;
443 : Oid dictOid;
444 : TSDictionaryCacheEntry *dict;
445 : TSLexeme *res;
446 :
447 38 : if (PG_NARGS() == 1)
448 : {
449 : /*
450 : * Use the "unaccent" dictionary that is in the same schema that this
451 : * function is in.
452 : */
453 20 : Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
454 20 : const char *dictname = "unaccent";
455 :
456 20 : dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
457 : PointerGetDatum(dictname),
458 : ObjectIdGetDatum(procnspid));
459 20 : if (!OidIsValid(dictOid))
460 0 : ereport(ERROR,
461 : (errcode(ERRCODE_UNDEFINED_OBJECT),
462 : errmsg("text search dictionary \"%s.%s\" does not exist",
463 : get_namespace_name(procnspid), dictname)));
464 20 : strArg = 0;
465 : }
466 : else
467 : {
468 18 : dictOid = PG_GETARG_OID(0);
469 18 : strArg = 1;
470 : }
471 38 : str = PG_GETARG_TEXT_PP(strArg);
472 :
473 38 : dict = lookup_ts_dictionary_cache(dictOid);
474 :
475 38 : res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
476 : PointerGetDatum(dict->dictData),
477 : PointerGetDatum(VARDATA_ANY(str)),
478 : Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
479 : PointerGetDatum(NULL)));
480 :
481 38 : PG_FREE_IF_COPY(str, strArg);
482 :
483 38 : if (res == NULL)
484 : {
485 4 : PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
486 : }
487 34 : else if (res->lexeme == NULL)
488 : {
489 0 : pfree(res);
490 0 : PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
491 : }
492 : else
493 : {
494 34 : text *txt = cstring_to_text(res->lexeme);
495 :
496 34 : pfree(res->lexeme);
497 34 : pfree(res);
498 :
499 34 : PG_RETURN_TEXT_P(txt);
500 : }
501 : }
|