LCOV - code coverage report
Current view: top level - contrib/unaccent - unaccent.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 158 184 85.9 %
Date: 2025-04-01 16:15:31 Functions: 10 10 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * unaccent.c
       4             :  *    Text search unaccent dictionary
       5             :  *
       6             :  * Copyright (c) 2009-2025, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    contrib/unaccent/unaccent.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : 
      14             : #include "postgres.h"
      15             : 
      16             : #include "catalog/pg_ts_dict.h"
      17             : #include "commands/defrem.h"
      18             : #include "lib/stringinfo.h"
      19             : #include "tsearch/ts_cache.h"
      20             : #include "tsearch/ts_locale.h"
      21             : #include "tsearch/ts_public.h"
      22             : #include "utils/builtins.h"
      23             : #include "utils/lsyscache.h"
      24             : #include "utils/syscache.h"
      25             : 
      26           2 : PG_MODULE_MAGIC_EXT(
      27             :                     .name = "unaccent",
      28             :                     .version = PG_VERSION
      29             : );
      30             : 
      /*
       * Replacement lookup for an unaccent dictionary is done with a byte-wise
       * trie.  A trie node is an array of 256 TrieChar cells, indexed by the
       * value of the next source byte.  A cell may hold a replacement string
       * (applicable when the source string ends on that byte), a pointer to a
       * deeper node (followed when more source bytes remain), or both.
       *
       * The search never looks at multibyte character boundaries.  That is
       * safe provided that both the strings stored in the trie and the strings
       * being looked up are validly encoded: a match can then never stop in the
       * middle of a character.
       */
      typedef struct TrieChar
      {
          struct TrieChar *nextChar;  /* next-level node, or NULL if none */
          char       *replaceTo;      /* replacement bytes, or NULL if no source
                                       * string ends at this cell */
          int         replacelen;     /* length of replaceTo, in bytes */
      } TrieChar;
      49             : 
      50             : /*
      51             :  * placeChar - put str into trie's structure, byte by byte.
      52             :  *
      53             :  * If node is NULL, we need to make a new node, which will be returned;
      54             :  * otherwise the return value is the same as node.
      55             :  */
      56             : static TrieChar *
      57       33424 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
      58             :           const char *replaceTo, int replacelen)
      59             : {
      60             :     TrieChar   *curnode;
      61             : 
      62       33424 :     if (!node)
      63         332 :         node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
      64             : 
      65             :     Assert(lenstr > 0);          /* else str[0] doesn't exist */
      66             : 
      67       33424 :     curnode = node + *str;
      68             : 
      69       33424 :     if (lenstr <= 1)
      70             :     {
      71       10644 :         if (curnode->replaceTo)
      72           0 :             ereport(WARNING,
      73             :                     (errcode(ERRCODE_CONFIG_FILE_ERROR),
      74             :                      errmsg("duplicate source strings, first one will be used")));
      75             :         else
      76             :         {
      77       10644 :             curnode->replacelen = replacelen;
      78       10644 :             curnode->replaceTo = (char *) palloc(replacelen);
      79       10644 :             memcpy(curnode->replaceTo, replaceTo, replacelen);
      80             :         }
      81             :     }
      82             :     else
      83             :     {
      84       22780 :         curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
      85             :                                       replaceTo, replacelen);
      86             :     }
      87             : 
      88       33424 :     return node;
      89             : }
      90             : 
      91             : /*
      92             :  * initTrie  - create trie from file.
      93             :  *
      94             :  * Function converts UTF8-encoded file into current encoding.
      95             :  */
      96             : static TrieChar *
      97           4 : initTrie(const char *filename)
      98             : {
      99           4 :     TrieChar   *volatile rootTrie = NULL;
     100           4 :     MemoryContext ccxt = CurrentMemoryContext;
     101             :     tsearch_readline_state trst;
     102             :     volatile bool skip;
     103             : 
     104           4 :     filename = get_tsearch_config_filename(filename, "rules");
     105           4 :     if (!tsearch_readline_begin(&trst, filename))
     106           0 :         ereport(ERROR,
     107             :                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
     108             :                  errmsg("could not open unaccent file \"%s\": %m",
     109             :                         filename)));
     110             : 
     111             :     do
     112             :     {
     113             :         /*
     114             :          * pg_do_encoding_conversion() (called by tsearch_readline()) will
     115             :          * emit exception if it finds untranslatable characters in current
     116             :          * locale. We just skip such lines, continuing with the next.
     117             :          */
     118           4 :         skip = true;
     119             : 
     120           4 :         PG_TRY();
     121             :         {
     122             :             char       *line;
     123             : 
     124       10648 :             while ((line = tsearch_readline(&trst)) != NULL)
     125             :             {
     126             :                 /*----------
     127             :                  * The format of each line must be "src" or "src trg", where
     128             :                  * src and trg are sequences of one or more non-whitespace
     129             :                  * characters, separated by whitespace.  Whitespace at start
     130             :                  * or end of line is ignored.  If trg is omitted, an empty
     131             :                  * string is used as the replacement.  trg can be optionally
     132             :                  * quoted, in which case whitespaces are included in it.
     133             :                  *
     134             :                  * We use a simple state machine, with states
     135             :                  *  0   initial (before src)
     136             :                  *  1   in src
     137             :                  *  2   in whitespace after src
     138             :                  *  3   in trg (non-quoted)
     139             :                  *  4   in trg (quoted)
     140             :                  *  5   in whitespace after trg
     141             :                  *  -1  syntax error detected (two strings)
     142             :                  *  -2  syntax error detected (unfinished quoted string)
     143             :                  *----------
     144             :                  */
     145             :                 int         state;
     146             :                 char       *ptr;
     147       10644 :                 char       *src = NULL;
     148       10644 :                 char       *trg = NULL;
     149       10644 :                 char       *trgstore = NULL;
     150             :                 int         ptrlen;
     151       10644 :                 int         srclen = 0;
     152       10644 :                 int         trglen = 0;
     153       10644 :                 int         trgstorelen = 0;
     154       10644 :                 bool        trgquoted = false;
     155             : 
     156       10644 :                 state = 0;
     157       54712 :                 for (ptr = line; *ptr; ptr += ptrlen)
     158             :                 {
     159       44068 :                     ptrlen = pg_mblen(ptr);
     160             :                     /* ignore whitespace, but end src or trg */
     161       44068 :                     if (isspace((unsigned char) *ptr))
     162             :                     {
     163       20944 :                         if (state == 1)
     164       10644 :                             state = 2;
     165       10300 :                         else if (state == 3)
     166       10108 :                             state = 5;
     167             :                         /* whitespaces are OK in quoted area */
     168       20944 :                         if (state != 4)
     169       20864 :                             continue;
     170             :                     }
     171       23204 :                     switch (state)
     172             :                     {
     173       10644 :                         case 0:
     174             :                             /* start of src */
     175       10644 :                             src = ptr;
     176       10644 :                             srclen = ptrlen;
     177       10644 :                             state = 1;
     178       10644 :                             break;
     179           0 :                         case 1:
     180             :                             /* continue src */
     181           0 :                             srclen += ptrlen;
     182           0 :                             break;
     183       10220 :                         case 2:
     184             :                             /* start of trg */
     185       10220 :                             if (*ptr == '"')
     186             :                             {
     187         112 :                                 trgquoted = true;
     188         112 :                                 state = 4;
     189             :                             }
     190             :                             else
     191       10108 :                                 state = 3;
     192             : 
     193       10220 :                             trg = ptr;
     194       10220 :                             trglen = ptrlen;
     195       10220 :                             break;
     196        1876 :                         case 3:
     197             :                             /* continue non-quoted trg */
     198        1876 :                             trglen += ptrlen;
     199        1876 :                             break;
     200         464 :                         case 4:
     201             :                             /* continue quoted trg */
     202         464 :                             trglen += ptrlen;
     203             : 
     204             :                             /*
     205             :                              * If this is a quote, consider it as the end of
     206             :                              * trg except if the follow-up character is itself
     207             :                              * a quote.
     208             :                              */
     209         464 :                             if (*ptr == '"')
     210             :                             {
     211         144 :                                 if (*(ptr + 1) == '"')
     212             :                                 {
     213          32 :                                     ptr++;
     214          32 :                                     trglen += 1;
     215             :                                 }
     216             :                                 else
     217         112 :                                     state = 5;
     218             :                             }
     219         464 :                             break;
     220           0 :                         default:
     221             :                             /* bogus line format */
     222           0 :                             state = -1;
     223           0 :                             break;
     224             :                     }
     225             :                 }
     226             : 
     227       10644 :                 if (state == 1 || state == 2)
     228             :                 {
     229             :                     /* trg was omitted, so use "" */
     230         424 :                     trg = "";
     231         424 :                     trglen = 0;
     232             :                 }
     233             : 
     234             :                 /* If still in a quoted area, fallback to an error */
     235       10644 :                 if (state == 4)
     236           0 :                     state = -2;
     237             : 
     238             :                 /* If trg was quoted, remove its quotes and unescape it */
     239       10644 :                 if (trgquoted && state > 0)
     240             :                 {
     241             :                     /* Ignore first and end quotes */
     242         112 :                     trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
     243         112 :                     trgstorelen = 0;
     244         464 :                     for (int i = 1; i < trglen - 1; i++)
     245             :                     {
     246         352 :                         trgstore[trgstorelen] = trg[i];
     247         352 :                         trgstorelen++;
     248             :                         /* skip second double quotes */
     249         352 :                         if (trg[i] == '"' && trg[i + 1] == '"')
     250          32 :                             i++;
     251             :                     }
     252             :                 }
     253             :                 else
     254             :                 {
     255       10532 :                     trgstore = (char *) palloc(sizeof(char) * trglen);
     256       10532 :                     trgstorelen = trglen;
     257       10532 :                     memcpy(trgstore, trg, trgstorelen);
     258             :                 }
     259             : 
     260       10644 :                 if (state > 0)
     261       10644 :                     rootTrie = placeChar(rootTrie,
     262             :                                          (unsigned char *) src, srclen,
     263             :                                          trgstore, trgstorelen);
     264           0 :                 else if (state == -1)
     265           0 :                     ereport(WARNING,
     266             :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     267             :                              errmsg("invalid syntax: more than two strings in unaccent rule")));
     268           0 :                 else if (state == -2)
     269           0 :                     ereport(WARNING,
     270             :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     271             :                              errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
     272             : 
     273       10644 :                 pfree(trgstore);
     274       10644 :                 pfree(line);
     275             :             }
     276           4 :             skip = false;
     277             :         }
     278           0 :         PG_CATCH();
     279             :         {
     280             :             ErrorData  *errdata;
     281             :             MemoryContext ecxt;
     282             : 
     283           0 :             ecxt = MemoryContextSwitchTo(ccxt);
     284           0 :             errdata = CopyErrorData();
     285           0 :             if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
     286             :             {
     287           0 :                 FlushErrorState();
     288             :             }
     289             :             else
     290             :             {
     291           0 :                 MemoryContextSwitchTo(ecxt);
     292           0 :                 PG_RE_THROW();
     293             :             }
     294             :         }
     295           4 :         PG_END_TRY();
     296             :     }
     297           4 :     while (skip);
     298             : 
     299           4 :     tsearch_readline_end(&trst);
     300             : 
     301           4 :     return rootTrie;
     302             : }
     303             : 
     304             : /*
     305             :  * findReplaceTo - find longest possible match in trie
     306             :  *
     307             :  * On success, returns pointer to ending subnode, plus length of matched
     308             :  * source string in *p_matchlen.  On failure, returns NULL.
     309             :  */
     310             : static TrieChar *
     311         158 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
     312             :               int *p_matchlen)
     313             : {
     314         158 :     TrieChar   *result = NULL;
     315         158 :     int         matchlen = 0;
     316             : 
     317         158 :     *p_matchlen = 0;            /* prevent uninitialized-variable warnings */
     318             : 
     319         452 :     while (node && matchlen < srclen)
     320             :     {
     321         294 :         node = node + src[matchlen];
     322         294 :         matchlen++;
     323             : 
     324         294 :         if (node->replaceTo)
     325             :         {
     326          74 :             result = node;
     327          74 :             *p_matchlen = matchlen;
     328             :         }
     329             : 
     330         294 :         node = node->nextChar;
     331             :     }
     332             : 
     333         158 :     return result;
     334             : }
     335             : 
     336           4 : PG_FUNCTION_INFO_V1(unaccent_init);
     337             : Datum
     338           4 : unaccent_init(PG_FUNCTION_ARGS)
     339             : {
     340           4 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     341           4 :     TrieChar   *rootTrie = NULL;
     342           4 :     bool        fileloaded = false;
     343             :     ListCell   *l;
     344             : 
     345           8 :     foreach(l, dictoptions)
     346             :     {
     347           4 :         DefElem    *defel = (DefElem *) lfirst(l);
     348             : 
     349           4 :         if (strcmp(defel->defname, "rules") == 0)
     350             :         {
     351           4 :             if (fileloaded)
     352           0 :                 ereport(ERROR,
     353             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     354             :                          errmsg("multiple Rules parameters")));
     355           4 :             rootTrie = initTrie(defGetString(defel));
     356           4 :             fileloaded = true;
     357             :         }
     358             :         else
     359             :         {
     360           0 :             ereport(ERROR,
     361             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     362             :                      errmsg("unrecognized Unaccent parameter: \"%s\"",
     363             :                             defel->defname)));
     364             :         }
     365             :     }
     366             : 
     367           4 :     if (!fileloaded)
     368             :     {
     369           0 :         ereport(ERROR,
     370             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     371             :                  errmsg("missing Rules parameter")));
     372             :     }
     373             : 
     374           4 :     PG_RETURN_POINTER(rootTrie);
     375             : }
     376             : 
     377           4 : PG_FUNCTION_INFO_V1(unaccent_lexize);
     378             : Datum
     379          56 : unaccent_lexize(PG_FUNCTION_ARGS)
     380             : {
     381          56 :     TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
     382          56 :     char       *srcchar = (char *) PG_GETARG_POINTER(1);
     383          56 :     int32       len = PG_GETARG_INT32(2);
     384          56 :     char       *srcstart = srcchar;
     385             :     TSLexeme   *res;
     386             :     StringInfoData buf;
     387             : 
     388             :     /* we allocate storage for the buffer only if needed */
     389          56 :     buf.data = NULL;
     390             : 
     391         214 :     while (len > 0)
     392             :     {
     393             :         TrieChar   *node;
     394             :         int         matchlen;
     395             : 
     396         158 :         node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
     397             :                              &matchlen);
     398         158 :         if (node && node->replaceTo)
     399             :         {
     400          74 :             if (buf.data == NULL)
     401             :             {
     402             :                 /* initialize buffer */
     403          50 :                 initStringInfo(&buf);
     404             :                 /* insert any data we already skipped over */
     405          50 :                 if (srcchar != srcstart)
     406          12 :                     appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
     407             :             }
     408          74 :             appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
     409             :         }
     410             :         else
     411             :         {
     412          84 :             matchlen = pg_mblen(srcchar);
     413          84 :             if (buf.data != NULL)
     414          36 :                 appendBinaryStringInfo(&buf, srcchar, matchlen);
     415             :         }
     416             : 
     417         158 :         srcchar += matchlen;
     418         158 :         len -= matchlen;
     419             :     }
     420             : 
     421             :     /* return a result only if we made at least one substitution */
     422          56 :     if (buf.data != NULL)
     423             :     {
     424          50 :         res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
     425          50 :         res->lexeme = buf.data;
     426          50 :         res->flags = TSL_FILTER;
     427             :     }
     428             :     else
     429           6 :         res = NULL;
     430             : 
     431          56 :     PG_RETURN_POINTER(res);
     432             : }
     433             : 
     434             : /*
     435             :  * Function-like wrapper for dictionary
     436             :  */
     437           8 : PG_FUNCTION_INFO_V1(unaccent_dict);
     438             : Datum
     439          38 : unaccent_dict(PG_FUNCTION_ARGS)
     440             : {
     441             :     text       *str;
     442             :     int         strArg;
     443             :     Oid         dictOid;
     444             :     TSDictionaryCacheEntry *dict;
     445             :     TSLexeme   *res;
     446             : 
     447          38 :     if (PG_NARGS() == 1)
     448             :     {
     449             :         /*
     450             :          * Use the "unaccent" dictionary that is in the same schema that this
     451             :          * function is in.
     452             :          */
     453          20 :         Oid         procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
     454          20 :         const char *dictname = "unaccent";
     455             : 
     456          20 :         dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
     457             :                                   PointerGetDatum(dictname),
     458             :                                   ObjectIdGetDatum(procnspid));
     459          20 :         if (!OidIsValid(dictOid))
     460           0 :             ereport(ERROR,
     461             :                     (errcode(ERRCODE_UNDEFINED_OBJECT),
     462             :                      errmsg("text search dictionary \"%s.%s\" does not exist",
     463             :                             get_namespace_name(procnspid), dictname)));
     464          20 :         strArg = 0;
     465             :     }
     466             :     else
     467             :     {
     468          18 :         dictOid = PG_GETARG_OID(0);
     469          18 :         strArg = 1;
     470             :     }
     471          38 :     str = PG_GETARG_TEXT_PP(strArg);
     472             : 
     473          38 :     dict = lookup_ts_dictionary_cache(dictOid);
     474             : 
     475          38 :     res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
     476             :                                                      PointerGetDatum(dict->dictData),
     477             :                                                      PointerGetDatum(VARDATA_ANY(str)),
     478             :                                                      Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
     479             :                                                      PointerGetDatum(NULL)));
     480             : 
     481          38 :     PG_FREE_IF_COPY(str, strArg);
     482             : 
     483          38 :     if (res == NULL)
     484             :     {
     485           4 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     486             :     }
     487          34 :     else if (res->lexeme == NULL)
     488             :     {
     489           0 :         pfree(res);
     490           0 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     491             :     }
     492             :     else
     493             :     {
     494          34 :         text       *txt = cstring_to_text(res->lexeme);
     495             : 
     496          34 :         pfree(res->lexeme);
     497          34 :         pfree(res);
     498             : 
     499          34 :         PG_RETURN_TEXT_P(txt);
     500             :     }
     501             : }

Generated by: LCOV version 1.14