LCOV - code coverage report
Current view: top level - contrib/unaccent - unaccent.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 85.9 % 185 159
Test Date: 2026-03-02 02:14:46 Functions: 100.0 % 10 10
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * unaccent.c
       4              :  *    Text search unaccent dictionary
       5              :  *
       6              :  * Copyright (c) 2009-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    contrib/unaccent/unaccent.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : 
      14              : #include "postgres.h"
      15              : 
      16              : #include "catalog/pg_ts_dict.h"
      17              : #include "commands/defrem.h"
      18              : #include "lib/stringinfo.h"
      19              : #include "tsearch/ts_cache.h"
      20              : #include "tsearch/ts_locale.h"
      21              : #include "tsearch/ts_public.h"
      22              : #include "utils/builtins.h"
      23              : #include "utils/lsyscache.h"
      24              : #include "utils/syscache.h"
      25              : 
      26            1 : PG_MODULE_MAGIC_EXT(
      27              :                     .name = "unaccent",
      28              :                     .version = PG_VERSION
      29              : );
      30              : 
      31              : /*
      32              :  * An unaccent dictionary uses a trie to find a string to replace.  Each node
      33              :  * of the trie is an array of 256 TrieChar structs; the N-th element of the
      34              :  * array corresponds to next byte value N.  That element can contain both a
      35              :  * replacement string (to be used if the source string ends with this byte)
      36              :  * and a link to another trie node (to be followed if there are more bytes).
      37              :  *
      38              :  * Note that the trie search logic pays no attention to multibyte character
      39              :  * boundaries.  This is OK as long as both the data entered into the trie and
      40              :  * the data we're trying to look up are validly encoded; no partial-character
      41              :  * matches will occur.
      42              :  */
      43              : typedef struct TrieChar
      44              : {
      45              :     struct TrieChar *nextChar;
      46              :     char       *replaceTo;
      47              :     int         replacelen;
      48              : } TrieChar;
      49              : 
      50              : /*
      51              :  * placeChar - put str into trie's structure, byte by byte.
      52              :  *
      53              :  * If node is NULL, we need to make a new node, which will be returned;
      54              :  * otherwise the return value is the same as node.
      55              :  */
      56              : static TrieChar *
      57        16712 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
      58              :           const char *replaceTo, int replacelen)
      59              : {
      60              :     TrieChar   *curnode;
      61              : 
      62        16712 :     if (!node)
      63          166 :         node = palloc0_array(TrieChar, 256);
      64              : 
      65              :     Assert(lenstr > 0);          /* else str[0] doesn't exist */
      66              : 
      67        16712 :     curnode = node + *str;
      68              : 
      69        16712 :     if (lenstr <= 1)
      70              :     {
      71         5322 :         if (curnode->replaceTo)
      72            0 :             ereport(WARNING,
      73              :                     (errcode(ERRCODE_CONFIG_FILE_ERROR),
      74              :                      errmsg("duplicate source strings, first one will be used")));
      75              :         else
      76              :         {
      77         5322 :             curnode->replacelen = replacelen;
      78         5322 :             curnode->replaceTo = (char *) palloc(replacelen);
      79         5322 :             memcpy(curnode->replaceTo, replaceTo, replacelen);
      80              :         }
      81              :     }
      82              :     else
      83              :     {
      84        11390 :         curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
      85              :                                       replaceTo, replacelen);
      86              :     }
      87              : 
      88        16712 :     return node;
      89              : }
      90              : 
      91              : /*
      92              :  * initTrie  - create trie from file.
      93              :  *
      94              :  * Function converts UTF8-encoded file into current encoding.
      95              :  */
      96              : static TrieChar *
      97            2 : initTrie(const char *filename)
      98              : {
      99            2 :     TrieChar   *volatile rootTrie = NULL;
     100            2 :     MemoryContext ccxt = CurrentMemoryContext;
     101              :     tsearch_readline_state trst;
     102              :     volatile bool skip;
     103              : 
     104            2 :     filename = get_tsearch_config_filename(filename, "rules");
     105            2 :     if (!tsearch_readline_begin(&trst, filename))
     106            0 :         ereport(ERROR,
     107              :                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
     108              :                  errmsg("could not open unaccent file \"%s\": %m",
     109              :                         filename)));
     110              : 
     111              :     do
     112              :     {
     113              :         /*
     114              :          * pg_do_encoding_conversion() (called by tsearch_readline()) will
     115              :          * emit exception if it finds untranslatable characters in current
     116              :          * locale. We just skip such lines, continuing with the next.
     117              :          */
     118            2 :         skip = true;
     119              : 
     120            2 :         PG_TRY();
     121              :         {
     122              :             char       *line;
     123              : 
     124         5324 :             while ((line = tsearch_readline(&trst)) != NULL)
     125              :             {
     126              :                 /*----------
     127              :                  * The format of each line must be "src" or "src trg", where
     128              :                  * src and trg are sequences of one or more non-whitespace
     129              :                  * characters, separated by whitespace.  Whitespace at start
     130              :                  * or end of line is ignored.  If trg is omitted, an empty
     131              :                  * string is used as the replacement.  trg can be optionally
     132              :                  * quoted, in which case whitespaces are included in it.
     133              :                  *
     134              :                  * We use a simple state machine, with states
     135              :                  *  0   initial (before src)
     136              :                  *  1   in src
     137              :                  *  2   in whitespace after src
     138              :                  *  3   in trg (non-quoted)
     139              :                  *  4   in trg (quoted)
     140              :                  *  5   in whitespace after trg
     141              :                  *  -1  syntax error detected (two strings)
     142              :                  *  -2  syntax error detected (unfinished quoted string)
     143              :                  *----------
     144              :                  */
     145              :                 int         state;
     146              :                 char       *ptr;
     147         5322 :                 char       *src = NULL;
     148         5322 :                 char       *trg = NULL;
     149         5322 :                 char       *trgstore = NULL;
     150              :                 int         ptrlen;
     151         5322 :                 int         srclen = 0;
     152         5322 :                 int         trglen = 0;
     153         5322 :                 int         trgstorelen = 0;
     154         5322 :                 bool        trgquoted = false;
     155              : 
     156         5322 :                 state = 0;
     157        27356 :                 for (ptr = line; *ptr; ptr += ptrlen)
     158              :                 {
     159        22034 :                     ptrlen = pg_mblen_cstr(ptr);
     160              :                     /* ignore whitespace, but end src or trg */
     161        22034 :                     if (isspace((unsigned char) *ptr))
     162              :                     {
     163        10472 :                         if (state == 1)
     164         5322 :                             state = 2;
     165         5150 :                         else if (state == 3)
     166         5054 :                             state = 5;
     167              :                         /* whitespaces are OK in quoted area */
     168        10472 :                         if (state != 4)
     169        10432 :                             continue;
     170              :                     }
     171        11602 :                     switch (state)
     172              :                     {
     173         5322 :                         case 0:
     174              :                             /* start of src */
     175         5322 :                             src = ptr;
     176         5322 :                             srclen = ptrlen;
     177         5322 :                             state = 1;
     178         5322 :                             break;
     179            0 :                         case 1:
     180              :                             /* continue src */
     181            0 :                             srclen += ptrlen;
     182            0 :                             break;
     183         5110 :                         case 2:
     184              :                             /* start of trg */
     185         5110 :                             if (*ptr == '"')
     186              :                             {
     187           56 :                                 trgquoted = true;
     188           56 :                                 state = 4;
     189              :                             }
     190              :                             else
     191         5054 :                                 state = 3;
     192              : 
     193         5110 :                             trg = ptr;
     194         5110 :                             trglen = ptrlen;
     195         5110 :                             break;
     196          938 :                         case 3:
     197              :                             /* continue non-quoted trg */
     198          938 :                             trglen += ptrlen;
     199          938 :                             break;
     200          232 :                         case 4:
     201              :                             /* continue quoted trg */
     202          232 :                             trglen += ptrlen;
     203              : 
     204              :                             /*
     205              :                              * If this is a quote, consider it as the end of
     206              :                              * trg except if the follow-up character is itself
     207              :                              * a quote.
     208              :                              */
     209          232 :                             if (*ptr == '"')
     210              :                             {
     211           72 :                                 if (*(ptr + 1) == '"')
     212              :                                 {
     213           16 :                                     ptr++;
     214           16 :                                     trglen += 1;
     215              :                                 }
     216              :                                 else
     217           56 :                                     state = 5;
     218              :                             }
     219          232 :                             break;
     220            0 :                         default:
     221              :                             /* bogus line format */
     222            0 :                             state = -1;
     223            0 :                             break;
     224              :                     }
     225              :                 }
     226              : 
     227         5322 :                 if (state == 1 || state == 2)
     228              :                 {
     229              :                     /* trg was omitted, so use "" */
     230          212 :                     trg = "";
     231          212 :                     trglen = 0;
     232              :                 }
     233              : 
     234              :                 /* If still in a quoted area, fallback to an error */
     235         5322 :                 if (state == 4)
     236            0 :                     state = -2;
     237              : 
     238              :                 /* If trg was quoted, remove its quotes and unescape it */
     239         5322 :                 if (trgquoted && state > 0)
     240              :                 {
     241              :                     /* Ignore first and end quotes */
     242           56 :                     trgstore = palloc_array(char, trglen - 2);
     243           56 :                     trgstorelen = 0;
     244          232 :                     for (int i = 1; i < trglen - 1; i++)
     245              :                     {
     246          176 :                         trgstore[trgstorelen] = trg[i];
     247          176 :                         trgstorelen++;
     248              :                         /* skip second double quotes */
     249          176 :                         if (trg[i] == '"' && trg[i + 1] == '"')
     250           16 :                             i++;
     251              :                     }
     252              :                 }
     253              :                 else
     254              :                 {
     255         5266 :                     trgstore = palloc_array(char, trglen);
     256         5266 :                     trgstorelen = trglen;
     257         5266 :                     memcpy(trgstore, trg, trgstorelen);
     258              :                 }
     259              : 
     260         5322 :                 if (state > 0)
     261         5322 :                     rootTrie = placeChar(rootTrie,
     262              :                                          (unsigned char *) src, srclen,
     263              :                                          trgstore, trgstorelen);
     264            0 :                 else if (state == -1)
     265            0 :                     ereport(WARNING,
     266              :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     267              :                              errmsg("invalid syntax: more than two strings in unaccent rule")));
     268            0 :                 else if (state == -2)
     269            0 :                     ereport(WARNING,
     270              :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     271              :                              errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
     272              : 
     273         5322 :                 pfree(trgstore);
     274         5322 :                 pfree(line);
     275              :             }
     276            2 :             skip = false;
     277              :         }
     278            0 :         PG_CATCH();
     279              :         {
     280              :             ErrorData  *errdata;
     281              :             MemoryContext ecxt;
     282              : 
     283            0 :             ecxt = MemoryContextSwitchTo(ccxt);
     284            0 :             errdata = CopyErrorData();
     285            0 :             if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
     286              :             {
     287            0 :                 FlushErrorState();
     288              :             }
     289              :             else
     290              :             {
     291            0 :                 MemoryContextSwitchTo(ecxt);
     292            0 :                 PG_RE_THROW();
     293              :             }
     294              :         }
     295            2 :         PG_END_TRY();
     296              :     }
     297            2 :     while (skip);
     298              : 
     299            2 :     tsearch_readline_end(&trst);
     300              : 
     301            2 :     return rootTrie;
     302              : }
     303              : 
     304              : /*
     305              :  * findReplaceTo - find longest possible match in trie
     306              :  *
     307              :  * On success, returns pointer to ending subnode, plus length of matched
     308              :  * source string in *p_matchlen.  On failure, returns NULL.
     309              :  */
     310              : static TrieChar *
     311           79 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
     312              :               int *p_matchlen)
     313              : {
     314           79 :     TrieChar   *result = NULL;
     315           79 :     int         matchlen = 0;
     316              : 
     317           79 :     *p_matchlen = 0;            /* prevent uninitialized-variable warnings */
     318              : 
     319          226 :     while (node && matchlen < srclen)
     320              :     {
     321          147 :         node = node + src[matchlen];
     322          147 :         matchlen++;
     323              : 
     324          147 :         if (node->replaceTo)
     325              :         {
     326           37 :             result = node;
     327           37 :             *p_matchlen = matchlen;
     328              :         }
     329              : 
     330          147 :         node = node->nextChar;
     331              :     }
     332              : 
     333           79 :     return result;
     334              : }
     335              : 
     336            2 : PG_FUNCTION_INFO_V1(unaccent_init);
     337              : Datum
     338            2 : unaccent_init(PG_FUNCTION_ARGS)
     339              : {
     340            2 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     341            2 :     TrieChar   *rootTrie = NULL;
     342            2 :     bool        fileloaded = false;
     343              :     ListCell   *l;
     344              : 
     345            4 :     foreach(l, dictoptions)
     346              :     {
     347            2 :         DefElem    *defel = (DefElem *) lfirst(l);
     348              : 
     349            2 :         if (strcmp(defel->defname, "rules") == 0)
     350              :         {
     351            2 :             if (fileloaded)
     352            0 :                 ereport(ERROR,
     353              :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     354              :                          errmsg("multiple Rules parameters")));
     355            2 :             rootTrie = initTrie(defGetString(defel));
     356            2 :             fileloaded = true;
     357              :         }
     358              :         else
     359              :         {
     360            0 :             ereport(ERROR,
     361              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     362              :                      errmsg("unrecognized Unaccent parameter: \"%s\"",
     363              :                             defel->defname)));
     364              :         }
     365              :     }
     366              : 
     367            2 :     if (!fileloaded)
     368              :     {
     369            0 :         ereport(ERROR,
     370              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     371              :                  errmsg("missing Rules parameter")));
     372              :     }
     373              : 
     374            2 :     PG_RETURN_POINTER(rootTrie);
     375              : }
     376              : 
     377            2 : PG_FUNCTION_INFO_V1(unaccent_lexize);
     378              : Datum
     379           28 : unaccent_lexize(PG_FUNCTION_ARGS)
     380              : {
     381           28 :     TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
     382           28 :     char       *srcchar = (char *) PG_GETARG_POINTER(1);
     383           28 :     int32       len = PG_GETARG_INT32(2);
     384           28 :     char       *srcstart = srcchar;
     385           28 :     const char *srcend = srcstart + len;
     386              :     TSLexeme   *res;
     387              :     StringInfoData buf;
     388              : 
     389              :     /* we allocate storage for the buffer only if needed */
     390           28 :     buf.data = NULL;
     391              : 
     392          107 :     while (len > 0)
     393              :     {
     394              :         TrieChar   *node;
     395              :         int         matchlen;
     396              : 
     397           79 :         node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
     398              :                              &matchlen);
     399           79 :         if (node && node->replaceTo)
     400              :         {
     401           37 :             if (buf.data == NULL)
     402              :             {
     403              :                 /* initialize buffer */
     404           25 :                 initStringInfo(&buf);
     405              :                 /* insert any data we already skipped over */
     406           25 :                 if (srcchar != srcstart)
     407            6 :                     appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
     408              :             }
     409           37 :             appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
     410              :         }
     411              :         else
     412              :         {
     413           42 :             matchlen = pg_mblen_range(srcchar, srcend);
     414           42 :             if (buf.data != NULL)
     415           18 :                 appendBinaryStringInfo(&buf, srcchar, matchlen);
     416              :         }
     417              : 
     418           79 :         srcchar += matchlen;
     419           79 :         len -= matchlen;
     420              :     }
     421              : 
     422              :     /* return a result only if we made at least one substitution */
     423           28 :     if (buf.data != NULL)
     424              :     {
     425           25 :         res = palloc0_array(TSLexeme, 2);
     426           25 :         res->lexeme = buf.data;
     427           25 :         res->flags = TSL_FILTER;
     428              :     }
     429              :     else
     430            3 :         res = NULL;
     431              : 
     432           28 :     PG_RETURN_POINTER(res);
     433              : }
     434              : 
     435              : /*
     436              :  * Function-like wrapper for dictionary
     437              :  */
     438            4 : PG_FUNCTION_INFO_V1(unaccent_dict);
     439              : Datum
     440           19 : unaccent_dict(PG_FUNCTION_ARGS)
     441              : {
     442              :     text       *str;
     443              :     int         strArg;
     444              :     Oid         dictOid;
     445              :     TSDictionaryCacheEntry *dict;
     446              :     TSLexeme   *res;
     447              : 
     448           19 :     if (PG_NARGS() == 1)
     449              :     {
     450              :         /*
     451              :          * Use the "unaccent" dictionary that is in the same schema that this
     452              :          * function is in.
     453              :          */
     454           10 :         Oid         procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
     455           10 :         const char *dictname = "unaccent";
     456              : 
     457           10 :         dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
     458              :                                   PointerGetDatum(dictname),
     459              :                                   ObjectIdGetDatum(procnspid));
     460           10 :         if (!OidIsValid(dictOid))
     461            0 :             ereport(ERROR,
     462              :                     (errcode(ERRCODE_UNDEFINED_OBJECT),
     463              :                      errmsg("text search dictionary \"%s.%s\" does not exist",
     464              :                             get_namespace_name(procnspid), dictname)));
     465           10 :         strArg = 0;
     466              :     }
     467              :     else
     468              :     {
     469            9 :         dictOid = PG_GETARG_OID(0);
     470            9 :         strArg = 1;
     471              :     }
     472           19 :     str = PG_GETARG_TEXT_PP(strArg);
     473              : 
     474           19 :     dict = lookup_ts_dictionary_cache(dictOid);
     475              : 
     476           19 :     res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
     477              :                                                      PointerGetDatum(dict->dictData),
     478              :                                                      PointerGetDatum(VARDATA_ANY(str)),
     479              :                                                      Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
     480              :                                                      PointerGetDatum(NULL)));
     481              : 
     482           19 :     PG_FREE_IF_COPY(str, strArg);
     483              : 
     484           19 :     if (res == NULL)
     485              :     {
     486            2 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     487              :     }
     488           17 :     else if (res->lexeme == NULL)
     489              :     {
     490            0 :         pfree(res);
     491            0 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     492              :     }
     493              :     else
     494              :     {
     495           17 :         text       *txt = cstring_to_text(res->lexeme);
     496              : 
     497           17 :         pfree(res->lexeme);
     498           17 :         pfree(res);
     499              : 
     500           17 :         PG_RETURN_TEXT_P(txt);
     501              :     }
     502              : }
        

Generated by: LCOV version 2.0-1