LCOV - code coverage report
Current view: top level - contrib/unaccent - unaccent.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 159 185 85.9 %
Date: 2026-02-09 18:18:03 Functions: 10 10 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * unaccent.c
       4             :  *    Text search unaccent dictionary
       5             :  *
       6             :  * Copyright (c) 2009-2026, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    contrib/unaccent/unaccent.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : 
      14             : #include "postgres.h"
      15             : 
      16             : #include "catalog/pg_ts_dict.h"
      17             : #include "commands/defrem.h"
      18             : #include "lib/stringinfo.h"
      19             : #include "tsearch/ts_cache.h"
      20             : #include "tsearch/ts_locale.h"
      21             : #include "tsearch/ts_public.h"
      22             : #include "utils/builtins.h"
      23             : #include "utils/lsyscache.h"
      24             : #include "utils/syscache.h"
      25             : 
      26           2 : PG_MODULE_MAGIC_EXT(
      27             :                     .name = "unaccent",
      28             :                     .version = PG_VERSION
      29             : );
      30             : 
      /*
       * Trie used by an unaccent dictionary to find the string to substitute.
       *
       * A trie node is an array of 256 TrieChar entries, and entry N of a node
       * describes what happens when the next input byte has value N.  An entry
       * may carry a replacement string (applicable when the source string ends
       * at this byte), a link to a deeper node (to be followed when more bytes
       * remain), or both.
       *
       * The lookup code works on raw bytes and does not look at multibyte
       * character boundaries.  That is safe provided both the rules stored in
       * the trie and the text being looked up are validly encoded, because then
       * a match can never stop in the middle of a character.
       */
      typedef struct TrieChar TrieChar;

      struct TrieChar
      {
          TrieChar   *nextChar;       /* child node (256 entries), or NULL */
          char       *replaceTo;      /* replacement bytes, or NULL if none */
          int         replacelen;     /* number of bytes in replaceTo */
      };
      49             : 
      50             : /*
      51             :  * placeChar - put str into trie's structure, byte by byte.
      52             :  *
      53             :  * If node is NULL, we need to make a new node, which will be returned;
      54             :  * otherwise the return value is the same as node.
      55             :  */
      56             : static TrieChar *
      57       33424 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
      58             :           const char *replaceTo, int replacelen)
      59             : {
      60             :     TrieChar   *curnode;
      61             : 
      62       33424 :     if (!node)
      63         332 :         node = palloc0_array(TrieChar, 256);
      64             : 
      65             :     Assert(lenstr > 0);          /* else str[0] doesn't exist */
      66             : 
      67       33424 :     curnode = node + *str;
      68             : 
      69       33424 :     if (lenstr <= 1)
      70             :     {
      71       10644 :         if (curnode->replaceTo)
      72           0 :             ereport(WARNING,
      73             :                     (errcode(ERRCODE_CONFIG_FILE_ERROR),
      74             :                      errmsg("duplicate source strings, first one will be used")));
      75             :         else
      76             :         {
      77       10644 :             curnode->replacelen = replacelen;
      78       10644 :             curnode->replaceTo = (char *) palloc(replacelen);
      79       10644 :             memcpy(curnode->replaceTo, replaceTo, replacelen);
      80             :         }
      81             :     }
      82             :     else
      83             :     {
      84       22780 :         curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
      85             :                                       replaceTo, replacelen);
      86             :     }
      87             : 
      88       33424 :     return node;
      89             : }
      90             : 
      91             : /*
      92             :  * initTrie  - create trie from file.
      93             :  *
      94             :  * Function converts UTF8-encoded file into current encoding.
      95             :  */
      96             : static TrieChar *
      97           4 : initTrie(const char *filename)
      98             : {
      99           4 :     TrieChar   *volatile rootTrie = NULL;
     100           4 :     MemoryContext ccxt = CurrentMemoryContext;
     101             :     tsearch_readline_state trst;
     102             :     volatile bool skip;
     103             : 
     104           4 :     filename = get_tsearch_config_filename(filename, "rules");
     105           4 :     if (!tsearch_readline_begin(&trst, filename))
     106           0 :         ereport(ERROR,
     107             :                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
     108             :                  errmsg("could not open unaccent file \"%s\": %m",
     109             :                         filename)));
     110             : 
     111             :     do
     112             :     {
     113             :         /*
     114             :          * pg_do_encoding_conversion() (called by tsearch_readline()) will
     115             :          * emit exception if it finds untranslatable characters in current
     116             :          * locale. We just skip such lines, continuing with the next.
     117             :          */
     118           4 :         skip = true;
     119             : 
     120           4 :         PG_TRY();
     121             :         {
     122             :             char       *line;
     123             : 
     124       10648 :             while ((line = tsearch_readline(&trst)) != NULL)
     125             :             {
     126             :                 /*----------
     127             :                  * The format of each line must be "src" or "src trg", where
     128             :                  * src and trg are sequences of one or more non-whitespace
     129             :                  * characters, separated by whitespace.  Whitespace at start
     130             :                  * or end of line is ignored.  If trg is omitted, an empty
     131             :                  * string is used as the replacement.  trg can be optionally
     132             :                  * quoted, in which case whitespaces are included in it.
     133             :                  *
     134             :                  * We use a simple state machine, with states
     135             :                  *  0   initial (before src)
     136             :                  *  1   in src
     137             :                  *  2   in whitespace after src
     138             :                  *  3   in trg (non-quoted)
     139             :                  *  4   in trg (quoted)
     140             :                  *  5   in whitespace after trg
     141             :                  *  -1  syntax error detected (two strings)
     142             :                  *  -2  syntax error detected (unfinished quoted string)
     143             :                  *----------
     144             :                  */
     145             :                 int         state;
     146             :                 char       *ptr;
     147       10644 :                 char       *src = NULL;
     148       10644 :                 char       *trg = NULL;
     149       10644 :                 char       *trgstore = NULL;
     150             :                 int         ptrlen;
     151       10644 :                 int         srclen = 0;
     152       10644 :                 int         trglen = 0;
     153       10644 :                 int         trgstorelen = 0;
     154       10644 :                 bool        trgquoted = false;
     155             : 
     156       10644 :                 state = 0;
     157       54712 :                 for (ptr = line; *ptr; ptr += ptrlen)
     158             :                 {
     159       44068 :                     ptrlen = pg_mblen_cstr(ptr);
     160             :                     /* ignore whitespace, but end src or trg */
     161       44068 :                     if (isspace((unsigned char) *ptr))
     162             :                     {
     163       20944 :                         if (state == 1)
     164       10644 :                             state = 2;
     165       10300 :                         else if (state == 3)
     166       10108 :                             state = 5;
     167             :                         /* whitespaces are OK in quoted area */
     168       20944 :                         if (state != 4)
     169       20864 :                             continue;
     170             :                     }
     171       23204 :                     switch (state)
     172             :                     {
     173       10644 :                         case 0:
     174             :                             /* start of src */
     175       10644 :                             src = ptr;
     176       10644 :                             srclen = ptrlen;
     177       10644 :                             state = 1;
     178       10644 :                             break;
     179           0 :                         case 1:
     180             :                             /* continue src */
     181           0 :                             srclen += ptrlen;
     182           0 :                             break;
     183       10220 :                         case 2:
     184             :                             /* start of trg */
     185       10220 :                             if (*ptr == '"')
     186             :                             {
     187         112 :                                 trgquoted = true;
     188         112 :                                 state = 4;
     189             :                             }
     190             :                             else
     191       10108 :                                 state = 3;
     192             : 
     193       10220 :                             trg = ptr;
     194       10220 :                             trglen = ptrlen;
     195       10220 :                             break;
     196        1876 :                         case 3:
     197             :                             /* continue non-quoted trg */
     198        1876 :                             trglen += ptrlen;
     199        1876 :                             break;
     200         464 :                         case 4:
     201             :                             /* continue quoted trg */
     202         464 :                             trglen += ptrlen;
     203             : 
     204             :                             /*
     205             :                              * If this is a quote, consider it as the end of
     206             :                              * trg except if the follow-up character is itself
     207             :                              * a quote.
     208             :                              */
     209         464 :                             if (*ptr == '"')
     210             :                             {
     211         144 :                                 if (*(ptr + 1) == '"')
     212             :                                 {
     213          32 :                                     ptr++;
     214          32 :                                     trglen += 1;
     215             :                                 }
     216             :                                 else
     217         112 :                                     state = 5;
     218             :                             }
     219         464 :                             break;
     220           0 :                         default:
     221             :                             /* bogus line format */
     222           0 :                             state = -1;
     223           0 :                             break;
     224             :                     }
     225             :                 }
     226             : 
     227       10644 :                 if (state == 1 || state == 2)
     228             :                 {
     229             :                     /* trg was omitted, so use "" */
     230         424 :                     trg = "";
     231         424 :                     trglen = 0;
     232             :                 }
     233             : 
     234             :                 /* If still in a quoted area, fallback to an error */
     235       10644 :                 if (state == 4)
     236           0 :                     state = -2;
     237             : 
     238             :                 /* If trg was quoted, remove its quotes and unescape it */
     239       10644 :                 if (trgquoted && state > 0)
     240             :                 {
     241             :                     /* Ignore first and end quotes */
     242         112 :                     trgstore = palloc_array(char, trglen - 2);
     243         112 :                     trgstorelen = 0;
     244         464 :                     for (int i = 1; i < trglen - 1; i++)
     245             :                     {
     246         352 :                         trgstore[trgstorelen] = trg[i];
     247         352 :                         trgstorelen++;
     248             :                         /* skip second double quotes */
     249         352 :                         if (trg[i] == '"' && trg[i + 1] == '"')
     250          32 :                             i++;
     251             :                     }
     252             :                 }
     253             :                 else
     254             :                 {
     255       10532 :                     trgstore = palloc_array(char, trglen);
     256       10532 :                     trgstorelen = trglen;
     257       10532 :                     memcpy(trgstore, trg, trgstorelen);
     258             :                 }
     259             : 
     260       10644 :                 if (state > 0)
     261       10644 :                     rootTrie = placeChar(rootTrie,
     262             :                                          (unsigned char *) src, srclen,
     263             :                                          trgstore, trgstorelen);
     264           0 :                 else if (state == -1)
     265           0 :                     ereport(WARNING,
     266             :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     267             :                              errmsg("invalid syntax: more than two strings in unaccent rule")));
     268           0 :                 else if (state == -2)
     269           0 :                     ereport(WARNING,
     270             :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     271             :                              errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
     272             : 
     273       10644 :                 pfree(trgstore);
     274       10644 :                 pfree(line);
     275             :             }
     276           4 :             skip = false;
     277             :         }
     278           0 :         PG_CATCH();
     279             :         {
     280             :             ErrorData  *errdata;
     281             :             MemoryContext ecxt;
     282             : 
     283           0 :             ecxt = MemoryContextSwitchTo(ccxt);
     284           0 :             errdata = CopyErrorData();
     285           0 :             if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
     286             :             {
     287           0 :                 FlushErrorState();
     288             :             }
     289             :             else
     290             :             {
     291           0 :                 MemoryContextSwitchTo(ecxt);
     292           0 :                 PG_RE_THROW();
     293             :             }
     294             :         }
     295           4 :         PG_END_TRY();
     296             :     }
     297           4 :     while (skip);
     298             : 
     299           4 :     tsearch_readline_end(&trst);
     300             : 
     301           4 :     return rootTrie;
     302             : }
     303             : 
     304             : /*
     305             :  * findReplaceTo - find longest possible match in trie
     306             :  *
     307             :  * On success, returns pointer to ending subnode, plus length of matched
     308             :  * source string in *p_matchlen.  On failure, returns NULL.
     309             :  */
     310             : static TrieChar *
     311         158 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
     312             :               int *p_matchlen)
     313             : {
     314         158 :     TrieChar   *result = NULL;
     315         158 :     int         matchlen = 0;
     316             : 
     317         158 :     *p_matchlen = 0;            /* prevent uninitialized-variable warnings */
     318             : 
     319         452 :     while (node && matchlen < srclen)
     320             :     {
     321         294 :         node = node + src[matchlen];
     322         294 :         matchlen++;
     323             : 
     324         294 :         if (node->replaceTo)
     325             :         {
     326          74 :             result = node;
     327          74 :             *p_matchlen = matchlen;
     328             :         }
     329             : 
     330         294 :         node = node->nextChar;
     331             :     }
     332             : 
     333         158 :     return result;
     334             : }
     335             : 
     336           4 : PG_FUNCTION_INFO_V1(unaccent_init);
     337             : Datum
     338           4 : unaccent_init(PG_FUNCTION_ARGS)
     339             : {
     340           4 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     341           4 :     TrieChar   *rootTrie = NULL;
     342           4 :     bool        fileloaded = false;
     343             :     ListCell   *l;
     344             : 
     345           8 :     foreach(l, dictoptions)
     346             :     {
     347           4 :         DefElem    *defel = (DefElem *) lfirst(l);
     348             : 
     349           4 :         if (strcmp(defel->defname, "rules") == 0)
     350             :         {
     351           4 :             if (fileloaded)
     352           0 :                 ereport(ERROR,
     353             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     354             :                          errmsg("multiple Rules parameters")));
     355           4 :             rootTrie = initTrie(defGetString(defel));
     356           4 :             fileloaded = true;
     357             :         }
     358             :         else
     359             :         {
     360           0 :             ereport(ERROR,
     361             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     362             :                      errmsg("unrecognized Unaccent parameter: \"%s\"",
     363             :                             defel->defname)));
     364             :         }
     365             :     }
     366             : 
     367           4 :     if (!fileloaded)
     368             :     {
     369           0 :         ereport(ERROR,
     370             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     371             :                  errmsg("missing Rules parameter")));
     372             :     }
     373             : 
     374           4 :     PG_RETURN_POINTER(rootTrie);
     375             : }
     376             : 
     377           4 : PG_FUNCTION_INFO_V1(unaccent_lexize);
     378             : Datum
     379          56 : unaccent_lexize(PG_FUNCTION_ARGS)
     380             : {
     381          56 :     TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
     382          56 :     char       *srcchar = (char *) PG_GETARG_POINTER(1);
     383          56 :     int32       len = PG_GETARG_INT32(2);
     384          56 :     char       *srcstart = srcchar;
     385          56 :     const char *srcend = srcstart + len;
     386             :     TSLexeme   *res;
     387             :     StringInfoData buf;
     388             : 
     389             :     /* we allocate storage for the buffer only if needed */
     390          56 :     buf.data = NULL;
     391             : 
     392         214 :     while (len > 0)
     393             :     {
     394             :         TrieChar   *node;
     395             :         int         matchlen;
     396             : 
     397         158 :         node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
     398             :                              &matchlen);
     399         158 :         if (node && node->replaceTo)
     400             :         {
     401          74 :             if (buf.data == NULL)
     402             :             {
     403             :                 /* initialize buffer */
     404          50 :                 initStringInfo(&buf);
     405             :                 /* insert any data we already skipped over */
     406          50 :                 if (srcchar != srcstart)
     407          12 :                     appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
     408             :             }
     409          74 :             appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
     410             :         }
     411             :         else
     412             :         {
     413          84 :             matchlen = pg_mblen_range(srcchar, srcend);
     414          84 :             if (buf.data != NULL)
     415          36 :                 appendBinaryStringInfo(&buf, srcchar, matchlen);
     416             :         }
     417             : 
     418         158 :         srcchar += matchlen;
     419         158 :         len -= matchlen;
     420             :     }
     421             : 
     422             :     /* return a result only if we made at least one substitution */
     423          56 :     if (buf.data != NULL)
     424             :     {
     425          50 :         res = palloc0_array(TSLexeme, 2);
     426          50 :         res->lexeme = buf.data;
     427          50 :         res->flags = TSL_FILTER;
     428             :     }
     429             :     else
     430           6 :         res = NULL;
     431             : 
     432          56 :     PG_RETURN_POINTER(res);
     433             : }
     434             : 
     435             : /*
     436             :  * Function-like wrapper for dictionary
     437             :  */
     438           8 : PG_FUNCTION_INFO_V1(unaccent_dict);
     439             : Datum
     440          38 : unaccent_dict(PG_FUNCTION_ARGS)
     441             : {
     442             :     text       *str;
     443             :     int         strArg;
     444             :     Oid         dictOid;
     445             :     TSDictionaryCacheEntry *dict;
     446             :     TSLexeme   *res;
     447             : 
     448          38 :     if (PG_NARGS() == 1)
     449             :     {
     450             :         /*
     451             :          * Use the "unaccent" dictionary that is in the same schema that this
     452             :          * function is in.
     453             :          */
     454          20 :         Oid         procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
     455          20 :         const char *dictname = "unaccent";
     456             : 
     457          20 :         dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
     458             :                                   PointerGetDatum(dictname),
     459             :                                   ObjectIdGetDatum(procnspid));
     460          20 :         if (!OidIsValid(dictOid))
     461           0 :             ereport(ERROR,
     462             :                     (errcode(ERRCODE_UNDEFINED_OBJECT),
     463             :                      errmsg("text search dictionary \"%s.%s\" does not exist",
     464             :                             get_namespace_name(procnspid), dictname)));
     465          20 :         strArg = 0;
     466             :     }
     467             :     else
     468             :     {
     469          18 :         dictOid = PG_GETARG_OID(0);
     470          18 :         strArg = 1;
     471             :     }
     472          38 :     str = PG_GETARG_TEXT_PP(strArg);
     473             : 
     474          38 :     dict = lookup_ts_dictionary_cache(dictOid);
     475             : 
     476          38 :     res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
     477             :                                                      PointerGetDatum(dict->dictData),
     478             :                                                      PointerGetDatum(VARDATA_ANY(str)),
     479             :                                                      Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
     480             :                                                      PointerGetDatum(NULL)));
     481             : 
     482          38 :     PG_FREE_IF_COPY(str, strArg);
     483             : 
     484          38 :     if (res == NULL)
     485             :     {
     486           4 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     487             :     }
     488          34 :     else if (res->lexeme == NULL)
     489             :     {
     490           0 :         pfree(res);
     491           0 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     492             :     }
     493             :     else
     494             :     {
     495          34 :         text       *txt = cstring_to_text(res->lexeme);
     496             : 
     497          34 :         pfree(res->lexeme);
     498          34 :         pfree(res);
     499             : 
     500          34 :         PG_RETURN_TEXT_P(txt);
     501             :     }
     502             : }

Generated by: LCOV version 1.16