LCOV - code coverage report
Current view: top level - contrib/unaccent - unaccent.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 158 184 85.9 %
Date: 2024-12-02 20:15:07 Functions: 10 10 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * unaccent.c
       4             :  *    Text search unaccent dictionary
       5             :  *
       6             :  * Copyright (c) 2009-2024, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    contrib/unaccent/unaccent.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : 
      14             : #include "postgres.h"
      15             : 
      16             : #include "catalog/pg_ts_dict.h"
      17             : #include "commands/defrem.h"
      18             : #include "lib/stringinfo.h"
      19             : #include "tsearch/ts_cache.h"
      20             : #include "tsearch/ts_locale.h"
      21             : #include "tsearch/ts_public.h"
      22             : #include "utils/builtins.h"
      23             : #include "utils/lsyscache.h"
      24             : #include "utils/syscache.h"
      25             : 
      26           2 : PG_MODULE_MAGIC;
      27             : 
      28             : /*
      29             :  * An unaccent dictionary uses a trie to find a string to replace.  Each node
      30             :  * of the trie is an array of 256 TrieChar structs; the N-th element of the
      31             :  * array corresponds to next byte value N.  That element can contain both a
      32             :  * replacement string (to be used if the source string ends with this byte)
      33             :  * and a link to another trie node (to be followed if there are more bytes).
      34             :  *
      35             :  * Note that the trie search logic pays no attention to multibyte character
      36             :  * boundaries.  This is OK as long as both the data entered into the trie and
      37             :  * the data we're trying to look up are validly encoded; no partial-character
      38             :  * matches will occur.
      39             :  */
      40             : typedef struct TrieChar
      41             : {
      42             :     struct TrieChar *nextChar;  /* 256-entry child node for following bytes, or NULL */
      43             :     char       *replaceTo;      /* replacement bytes if a source string ends here, else NULL; no terminator is stored */
      44             :     int         replacelen;     /* length of replaceTo in bytes */
      45             : } TrieChar;
      46             : 
      47             : /*
      48             :  * placeChar - put str into trie's structure, byte by byte.
      49             :  *
      50             :  * If node is NULL, we need to make a new node, which will be returned;
      51             :  * otherwise the return value is the same as node.
                     :  *
                     :  * str/lenstr give the remaining source bytes (lenstr must be > 0); each
                     :  * recursion level consumes one byte.  replaceTo/replacelen give the
                     :  * replacement, which is copied into the trie, so the calling code keeps
                     :  * ownership of its own buffer.  If the source string is already present,
                     :  * a WARNING is raised and the existing replacement is kept.
      52             :  */
      53             : static TrieChar *
      54       33424 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
      55             :           const char *replaceTo, int replacelen)
      56             : {
      57             :     TrieChar   *curnode;
      58             : 
      59       33424 :     if (!node)
      60         332 :         node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);  /* zeroed: no links, no replacements */
      61             : 
      62             :     Assert(lenstr > 0);          /* else str[0] doesn't exist */
      63             : 
      64       33424 :     curnode = node + *str;  /* slot for the current byte value */
      65             : 
      66       33424 :     if (lenstr <= 1)  /* last byte of the source string */
      67             :     {
      68       10644 :         if (curnode->replaceTo)
      69           0 :             ereport(WARNING,
      70             :                     (errcode(ERRCODE_CONFIG_FILE_ERROR),
      71             :                      errmsg("duplicate source strings, first one will be used")));
      72             :         else
      73             :         {
      74       10644 :             curnode->replacelen = replacelen;
      75       10644 :             curnode->replaceTo = (char *) palloc(replacelen);  /* private copy, exactly replacelen bytes */
      76       10644 :             memcpy(curnode->replaceTo, replaceTo, replacelen);
      77             :         }
      78             :     }
      79             :     else
      80             :     {
      81       22780 :         curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,  /* descend, creating the child if it is NULL */
      82             :                                       replaceTo, replacelen);
      83             :     }
      84             : 
      85       33424 :     return node;
      86             : }
      87             : 
      88             : /*
      89             :  * initTrie  - create trie from file.
      90             :  *
      91             :  * Function converts UTF8-encoded file into current encoding.
                     :  *
                     :  * The name is first resolved through get_tsearch_config_filename() with
                     :  * the string rules as second argument; failure to open the resulting
                     :  * file is an ERROR.  Each line is parsed into a source string and an
                     :  * optional replacement and stored with placeChar().  Lines with bad
                     :  * syntax only produce a WARNING, and an error with code
                     :  * ERRCODE_UNTRANSLATABLE_CHARACTER raised while reading is swallowed
                     :  * and reading resumes; any other error is re-thrown.
                     :  *
                     :  * Returns the root node, or NULL if no rule was stored.
      92             :  */
      93             : static TrieChar *
      94           4 : initTrie(const char *filename)
      95             : {
      96           4 :     TrieChar   *volatile rootTrie = NULL;
      97           4 :     MemoryContext ccxt = CurrentMemoryContext;  /* context on entry, reused in PG_CATCH */
      98             :     tsearch_readline_state trst;
      99             :     volatile bool skip;  /* true while the read loop must be re-entered */
     100             : 
     101           4 :     filename = get_tsearch_config_filename(filename, "rules");
     102           4 :     if (!tsearch_readline_begin(&trst, filename))
     103           0 :         ereport(ERROR,
     104             :                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
     105             :                  errmsg("could not open unaccent file \"%s\": %m",
     106             :                         filename)));
     107             : 
     108             :     do
     109             :     {
     110             :         /*
     111             :          * pg_do_encoding_conversion() (called by tsearch_readline()) will
     112             :          * emit exception if it finds untranslatable characters in current
     113             :          * locale. We just skip such lines, continuing with the next.
     114             :          */
     115           4 :         skip = true;  /* stays true unless the whole file gets read below */
     116             : 
     117           4 :         PG_TRY();
     118             :         {
     119             :             char       *line;
     120             : 
     121       10648 :             while ((line = tsearch_readline(&trst)) != NULL)
     122             :             {
     123             :                 /*----------
     124             :                  * The format of each line must be "src" or "src trg", where
     125             :                  * src and trg are sequences of one or more non-whitespace
     126             :                  * characters, separated by whitespace.  Whitespace at start
     127             :                  * or end of line is ignored.  If trg is omitted, an empty
     128             :                  * string is used as the replacement.  trg can be optionally
     129             :                  * quoted, in which case whitespaces are included in it.
     130             :                  *
     131             :                  * We use a simple state machine, with states
     132             :                  *  0   initial (before src)
     133             :                  *  1   in src
     134             :                  *  2   in whitespace after src
     135             :                  *  3   in trg (non-quoted)
     136             :                  *  4   in trg (quoted)
     137             :                  *  5   in whitespace after trg
     138             :                  *  -1  syntax error detected (two strings)
     139             :                  *  -2  syntax error detected (unfinished quoted string)
     140             :                  *----------
     141             :                  */
     142             :                 int         state;
     143             :                 char       *ptr;
     144       10644 :                 char       *src = NULL;  /* start of src within line */
     145       10644 :                 char       *trg = NULL;  /* start of trg within line, opening quote included */
     146       10644 :                 char       *trgstore = NULL;  /* palloc'd copy of trg, unquoted if needed */
     147             :                 int         ptrlen;  /* byte length of the current character */
     148       10644 :                 int         srclen = 0;  /* byte lengths, summed from pg_mblen() */
     149       10644 :                 int         trglen = 0;
     150       10644 :                 int         trgstorelen = 0;
     151       10644 :                 bool        trgquoted = false;
     152             : 
     153       10644 :                 state = 0;
     154       54712 :                 for (ptr = line; *ptr; ptr += ptrlen)
     155             :                 {
     156       44068 :                     ptrlen = pg_mblen(ptr);
     157             :                     /* ignore whitespace, but end src or trg */
     158       44068 :                     if (t_isspace(ptr))
     159             :                     {
     160       20944 :                         if (state == 1)
     161       10644 :                             state = 2;
     162       10300 :                         else if (state == 3)
     163       10108 :                             state = 5;
     164             :                         /* whitespaces are OK in quoted area */
     165       20944 :                         if (state != 4)
     166       20864 :                             continue;
     167             :                     }
     168       23204 :                     switch (state)
     169             :                     {
     170       10644 :                         case 0:
     171             :                             /* start of src */
     172       10644 :                             src = ptr;
     173       10644 :                             srclen = ptrlen;
     174       10644 :                             state = 1;
     175       10644 :                             break;
     176           0 :                         case 1:
     177             :                             /* continue src */
     178           0 :                             srclen += ptrlen;
     179           0 :                             break;
     180       10220 :                         case 2:
     181             :                             /* start of trg */
     182       10220 :                             if (*ptr == '"')
     183             :                             {
     184         112 :                                 trgquoted = true;
     185         112 :                                 state = 4;
     186             :                             }
     187             :                             else
     188       10108 :                                 state = 3;
     189             : 
     190       10220 :                             trg = ptr;
     191       10220 :                             trglen = ptrlen;
     192       10220 :                             break;
     193        1876 :                         case 3:
     194             :                             /* continue non-quoted trg */
     195        1876 :                             trglen += ptrlen;
     196        1876 :                             break;
     197         464 :                         case 4:
     198             :                             /* continue quoted trg */
     199         464 :                             trglen += ptrlen;
     200             : 
     201             :                             /*
     202             :                              * If this is a quote, consider it as the end of
     203             :                              * trg except if the follow-up character is itself
     204             :                              * a quote.
     205             :                              */
     206         464 :                             if (*ptr == '"')
     207             :                             {
     208         144 :                                 if (*(ptr + 1) == '"')
     209             :                                 {
     210          32 :                                     ptr++;  /* consume the second quote of the pair here */
     211          32 :                                     trglen += 1;
     212             :                                 }
     213             :                                 else
     214         112 :                                     state = 5;
     215             :                             }
     216         464 :                             break;
     217           0 :                         default:
     218             :                             /* bogus line format */
     219           0 :                             state = -1;  /* reached from state 5 or -1 on further non-space input */
     220           0 :                             break;
     221             :                     }
     222             :                 }
     223             : 
     224       10644 :                 if (state == 1 || state == 2)
     225             :                 {
     226             :                     /* trg was omitted, so use "" */
     227         424 :                     trg = "";
     228         424 :                     trglen = 0;
     229             :                 }
     230             : 
     231             :                 /* If still in a quoted area, fallback to an error */
     232       10644 :                 if (state == 4)
     233           0 :                     state = -2;
     234             : 
     235             :                 /* If trg was quoted, remove its quotes and unescape it */
     236       10644 :                 if (trgquoted && state > 0)
     237             :                 {
     238             :                     /* Ignore first and end quotes */
     239         112 :                     trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
     240         112 :                     trgstorelen = 0;
     241         464 :                     for (int i = 1; i < trglen - 1; i++)
     242             :                     {
     243         352 :                         trgstore[trgstorelen] = trg[i];
     244         352 :                         trgstorelen++;
     245             :                         /* skip second double quotes */
     246         352 :                         if (trg[i] == '"' && trg[i + 1] == '"')
     247          32 :                             i++;
     248             :                     }
     249             :                 }
     250             :                 else  /* also taken after a syntax error, so trgstore is always allocated */
     251             :                 {
     252       10532 :                     trgstore = (char *) palloc(sizeof(char) * trglen);
     253       10532 :                     trgstorelen = trglen;
     254       10532 :                     memcpy(trgstore, trg, trgstorelen);  /* NOTE(review): for a blank line trg is still NULL and the length is 0 */
     255             :                 }
     256             : 
     257       10644 :                 if (state > 0)  /* complete rule; state 0 (blank line) falls through silently */
     258       10644 :                     rootTrie = placeChar(rootTrie,
     259             :                                          (unsigned char *) src, srclen,
     260             :                                          trgstore, trgstorelen);
     261           0 :                 else if (state == -1)
     262           0 :                     ereport(WARNING,
     263             :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     264             :                              errmsg("invalid syntax: more than two strings in unaccent rule")));
     265           0 :                 else if (state == -2)
     266           0 :                     ereport(WARNING,
     267             :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     268             :                              errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
     269             : 
     270       10644 :                 pfree(trgstore);  /* placeChar() stored its own copy */
     271       10644 :                 pfree(line);
     272             :             }
     273           4 :             skip = false;  /* end of file reached: leave the do/while */
     274             :         }
     275           0 :         PG_CATCH();
     276             :         {
     277             :             ErrorData  *errdata;
     278             :             MemoryContext ecxt;
     279             : 
     280           0 :             ecxt = MemoryContextSwitchTo(ccxt);  /* copy the error data into the entry context */
     281           0 :             errdata = CopyErrorData();  /* NOTE(review): this copy is not freed in this function */
     282           0 :             if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
     283             :             {
     284           0 :                 FlushErrorState();  /* swallow it; skip is still true, so reading resumes */
     285             :             }
     286             :             else
     287             :             {
     288           0 :                 MemoryContextSwitchTo(ecxt);  /* restore the previous context before propagating */
     289           0 :                 PG_RE_THROW();
     290             :             }
     291             :         }
     292           4 :         PG_END_TRY();
     293             :     }
     294           4 :     while (skip);
     295             : 
     296           4 :     tsearch_readline_end(&trst);
     297             : 
     298           4 :     return rootTrie;
     299             : }
     300             : 
     301             : /*
     302             :  * findReplaceTo - find longest possible match in trie
     303             :  *
     304             :  * On success, returns pointer to ending subnode, plus length of matched
     305             :  * source string in *p_matchlen.  On failure, returns NULL.
                     :  *
                     :  * node is the trie node to start from (may be NULL); src/srclen are the
                     :  * bytes available for matching.  The walk consumes one byte per step and
                     :  * remembers the last element that carried a replacement, so a longer
                     :  * match overrides a shorter one.  *p_matchlen is 0 when nothing matches.
     306             :  */
     307             : static TrieChar *
     308         158 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
     309             :               int *p_matchlen)
     310             : {
     311         158 :     TrieChar   *result = NULL;
     312         158 :     int         matchlen = 0;  /* bytes consumed so far */
     313             : 
     314         158 :     *p_matchlen = 0;            /* prevent uninitialized-variable warnings */
     315             : 
     316         452 :     while (node && matchlen < srclen)
     317             :     {
     318         294 :         node = node + src[matchlen];  /* element for this byte value */
     319         294 :         matchlen++;
     320             : 
     321         294 :         if (node->replaceTo)
     322             :         {
     323          74 :             result = node;  /* longest match seen so far */
     324          74 :             *p_matchlen = matchlen;
     325             :         }
     326             : 
     327         294 :         node = node->nextChar;  /* NULL here ends the walk */
     328             :     }
     329             : 
     330         158 :     return result;
     331             : }
     332             : 
     333           4 : PG_FUNCTION_INFO_V1(unaccent_init);
                       /*
                        * unaccent_init - build the trie for a dictionary from its options.
                        *
                        * Argument 0 is a List of DefElem options.  Exactly one option named
                        * rules must be present; its string value is handed to initTrie().
                        * Any other option name, a repeated rules option, or no rules option
                        * at all raises an ERROR.  Returns the trie root as a pointer Datum.
                        */
     334             : Datum
     335           4 : unaccent_init(PG_FUNCTION_ARGS)
     336             : {
     337           4 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     338           4 :     TrieChar   *rootTrie = NULL;
     339           4 :     bool        fileloaded = false;  /* set once a rules option has been processed */
     340             :     ListCell   *l;
     341             : 
     342           8 :     foreach(l, dictoptions)
     343             :     {
     344           4 :         DefElem    *defel = (DefElem *) lfirst(l);
     345             : 
     346           4 :         if (strcmp(defel->defname, "rules") == 0)
     347             :         {
     348           4 :             if (fileloaded)
     349           0 :                 ereport(ERROR,
     350             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     351             :                          errmsg("multiple Rules parameters")));
     352           4 :             rootTrie = initTrie(defGetString(defel));
     353           4 :             fileloaded = true;
     354             :         }
     355             :         else
     356             :         {
     357           0 :             ereport(ERROR,
     358             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     359             :                      errmsg("unrecognized Unaccent parameter: \"%s\"",
     360             :                             defel->defname)));
     361             :         }
     362             :     }
     363             : 
     364           4 :     if (!fileloaded)
     365             :     {
     366           0 :         ereport(ERROR,
     367             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     368             :                  errmsg("missing Rules parameter")));
     369             :     }
     370             : 
     371           4 :     PG_RETURN_POINTER(rootTrie);  /* may be NULL if the file stored no rules */
     372             : }
     373             : 
     374           4 : PG_FUNCTION_INFO_V1(unaccent_lexize);
                       /*
                        * unaccent_lexize - apply the trie replacements to an input string.
                        *
                        * Argument 0 is the trie root (presumably the value produced by
                        * unaccent_init), argument 1 points to the input bytes and argument 2
                        * is their length.  At each position the longest matching source
                        * string is replaced; where nothing matches, one character (as
                        * measured by pg_mblen) is passed through unchanged.
                        *
                        * Returns NULL if nothing was replaced.  Otherwise returns a zeroed
                        * array of two TSLexeme whose first element holds the rewritten
                        * string and the TSL_FILTER flag; the second element stays zeroed.
                        */
     375             : Datum
     376          56 : unaccent_lexize(PG_FUNCTION_ARGS)
     377             : {
     378          56 :     TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
     379          56 :     char       *srcchar = (char *) PG_GETARG_POINTER(1);
     380          56 :     int32       len = PG_GETARG_INT32(2);
     381          56 :     char       *srcstart = srcchar;  /* kept to copy the unmodified prefix later */
     382             :     TSLexeme   *res;
     383             :     StringInfoData buf;
     384             : 
     385             :     /* we allocate storage for the buffer only if needed */
     386          56 :     buf.data = NULL;
     387             : 
     388         214 :     while (len > 0)
     389             :     {
     390             :         TrieChar   *node;
     391             :         int         matchlen;
     392             : 
     393         158 :         node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
     394             :                              &matchlen);
     395         158 :         if (node && node->replaceTo)
     396             :         {
     397          74 :             if (buf.data == NULL)
     398             :             {
     399             :                 /* initialize buffer */
     400          50 :                 initStringInfo(&buf);
     401             :                 /* insert any data we already skipped over */
     402          50 :                 if (srcchar != srcstart)
     403          12 :                     appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
     404             :             }
     405          74 :             appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
     406             :         }
     407             :         else
     408             :         {
     409          84 :             matchlen = pg_mblen(srcchar);  /* NOTE(review): not clamped to len; assumes validly encoded input */
     410          84 :             if (buf.data != NULL)  /* before the first substitution nothing needs copying */
     411          36 :                 appendBinaryStringInfo(&buf, srcchar, matchlen);
     412             :         }
     413             : 
     414         158 :         srcchar += matchlen;
     415         158 :         len -= matchlen;
     416             :     }
     417             : 
     418             :     /* return a result only if we made at least one substitution */
     419          56 :     if (buf.data != NULL)
     420             :     {
     421          50 :         res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
     422          50 :         res->lexeme = buf.data;
     423          50 :         res->flags = TSL_FILTER;
     424             :     }
     425             :     else
     426           6 :         res = NULL;
     427             : 
     428          56 :     PG_RETURN_POINTER(res);
     429             : }
     430             : 
     431             : /*
     432             :  * Function-like wrapper for dictionary
                     :  *
                     :  * Called with one argument (the text), it uses the dictionary named
                     :  * unaccent in the schema that contains this function; called with two,
                     :  * argument 0 is the dictionary OID and argument 1 is the text.  The
                     :  * lexize function of that dictionary is invoked on the bytes of the
                     :  * text.  If it yields no lexeme the input is returned as a fresh copy,
                     :  * otherwise the lexeme is converted to text and returned.
     433             :  */
     434           8 : PG_FUNCTION_INFO_V1(unaccent_dict);
     435             : Datum
     436          38 : unaccent_dict(PG_FUNCTION_ARGS)
     437             : {
     438             :     text       *str;
     439             :     int         strArg;  /* index of the text argument: 0 or 1 */
     440             :     Oid         dictOid;
     441             :     TSDictionaryCacheEntry *dict;
     442             :     TSLexeme   *res;
     443             : 
     444          38 :     if (PG_NARGS() == 1)
     445             :     {
     446             :         /*
     447             :          * Use the "unaccent" dictionary that is in the same schema that this
     448             :          * function is in.
     449             :          */
     450          20 :         Oid         procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
     451          20 :         const char *dictname = "unaccent";
     452             : 
     453          20 :         dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
     454             :                                   PointerGetDatum(dictname),
     455             :                                   ObjectIdGetDatum(procnspid));
     456          20 :         if (!OidIsValid(dictOid))
     457           0 :             ereport(ERROR,
     458             :                     (errcode(ERRCODE_UNDEFINED_OBJECT),
     459             :                      errmsg("text search dictionary \"%s.%s\" does not exist",
     460             :                             get_namespace_name(procnspid), dictname)));
     461          20 :         strArg = 0;
     462             :     }
     463             :     else
     464             :     {
     465          18 :         dictOid = PG_GETARG_OID(0);
     466          18 :         strArg = 1;
     467             :     }
     468          38 :     str = PG_GETARG_TEXT_PP(strArg);
     469             : 
     470          38 :     dict = lookup_ts_dictionary_cache(dictOid);
     471             : 
     472          38 :     res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),  /* args: dictData, input bytes, byte length, NULL */
     473             :                                                      PointerGetDatum(dict->dictData),
     474             :                                                      PointerGetDatum(VARDATA_ANY(str)),
     475             :                                                      Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
     476             :                                                      PointerGetDatum(NULL)));
     477             : 
     478          38 :     PG_FREE_IF_COPY(str, strArg);  /* str is not used below; the argument is re-fetched when needed */
     479             : 
     480          38 :     if (res == NULL)  /* dictionary made no result: hand back the input */
     481             :     {
     482           4 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     483             :     }
     484          34 :     else if (res->lexeme == NULL)  /* result array without a lexeme: same outcome */
     485             :     {
     486           0 :         pfree(res);
     487           0 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     488             :     }
     489             :     else
     490             :     {
     491          34 :         text       *txt = cstring_to_text(res->lexeme);  /* lexeme is read as a C string */
     492             : 
     493          34 :         pfree(res->lexeme);
     494          34 :         pfree(res);
     495             : 
     496          34 :         PG_RETURN_TEXT_P(txt);
     497             :     }
     498             : }

Generated by: LCOV version 1.14