LCOV - code coverage report
Current view: top level - contrib/unaccent - unaccent.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 158 184 85.9 %
Date: 2024-04-23 09:11:01 Functions: 10 10 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * unaccent.c
       4             :  *    Text search unaccent dictionary
       5             :  *
       6             :  * Copyright (c) 2009-2024, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    contrib/unaccent/unaccent.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : 
      14             : #include "postgres.h"
      15             : 
      16             : #include "catalog/namespace.h"
      17             : #include "catalog/pg_ts_dict.h"
      18             : #include "commands/defrem.h"
      19             : #include "lib/stringinfo.h"
      20             : #include "tsearch/ts_cache.h"
      21             : #include "tsearch/ts_locale.h"
      22             : #include "tsearch/ts_public.h"
      23             : #include "utils/builtins.h"
      24             : #include "utils/lsyscache.h"
      25             : #include "utils/regproc.h"
      26             : #include "utils/syscache.h"
      27             : 
      28           2 : PG_MODULE_MAGIC;
      29             : 
      30             : /*
      31             :  * An unaccent dictionary uses a trie to find a string to replace.  Each node
      32             :  * of the trie is an array of 256 TrieChar structs; the N-th element of the
      33             :  * array corresponds to next byte value N.  That element can contain both a
      34             :  * replacement string (to be used if the source string ends with this byte)
      35             :  * and a link to another trie node (to be followed if there are more bytes).
      36             :  *
      37             :  * Note that the trie search logic pays no attention to multibyte character
      38             :  * boundaries.  This is OK as long as both the data entered into the trie and
      39             :  * the data we're trying to look up are validly encoded; no partial-character
      40             :  * matches will occur.
      41             :  */
      42             : typedef struct TrieChar
      43             : {
      44             :     struct TrieChar *nextChar;
      45             :     char       *replaceTo;
      46             :     int         replacelen;
      47             : } TrieChar;
      48             : 
      49             : /*
      50             :  * placeChar - put str into trie's structure, byte by byte.
      51             :  *
      52             :  * If node is NULL, we need to make a new node, which will be returned;
      53             :  * otherwise the return value is the same as node.
      54             :  */
      55             : static TrieChar *
      56       17788 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
      57             :           const char *replaceTo, int replacelen)
      58             : {
      59             :     TrieChar   *curnode;
      60             : 
      61       17788 :     if (!node)
      62         252 :         node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
      63             : 
      64             :     Assert(lenstr > 0);          /* else str[0] doesn't exist */
      65             : 
      66       17788 :     curnode = node + *str;
      67             : 
      68       17788 :     if (lenstr <= 1)
      69             :     {
      70        6600 :         if (curnode->replaceTo)
      71           0 :             ereport(WARNING,
      72             :                     (errcode(ERRCODE_CONFIG_FILE_ERROR),
      73             :                      errmsg("duplicate source strings, first one will be used")));
      74             :         else
      75             :         {
      76        6600 :             curnode->replacelen = replacelen;
      77        6600 :             curnode->replaceTo = (char *) palloc(replacelen);
      78        6600 :             memcpy(curnode->replaceTo, replaceTo, replacelen);
      79             :         }
      80             :     }
      81             :     else
      82             :     {
      83       11188 :         curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
      84             :                                       replaceTo, replacelen);
      85             :     }
      86             : 
      87       17788 :     return node;
      88             : }
      89             : 
      90             : /*
      91             :  * initTrie  - create trie from file.
      92             :  *
      93             :  * Function converts UTF8-encoded file into current encoding.
      94             :  */
      95             : static TrieChar *
      96           4 : initTrie(const char *filename)
      97             : {
      98           4 :     TrieChar   *volatile rootTrie = NULL;
      99           4 :     MemoryContext ccxt = CurrentMemoryContext;
     100             :     tsearch_readline_state trst;
     101             :     volatile bool skip;
     102             : 
     103           4 :     filename = get_tsearch_config_filename(filename, "rules");
     104           4 :     if (!tsearch_readline_begin(&trst, filename))
     105           0 :         ereport(ERROR,
     106             :                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
     107             :                  errmsg("could not open unaccent file \"%s\": %m",
     108             :                         filename)));
     109             : 
     110             :     do
     111             :     {
     112             :         /*
     113             :          * pg_do_encoding_conversion() (called by tsearch_readline()) will
     114             :          * emit exception if it finds untranslatable characters in current
     115             :          * locale. We just skip such lines, continuing with the next.
     116             :          */
     117           4 :         skip = true;
     118             : 
     119           4 :         PG_TRY();
     120             :         {
     121             :             char       *line;
     122             : 
     123        6604 :             while ((line = tsearch_readline(&trst)) != NULL)
     124             :             {
     125             :                 /*----------
     126             :                  * The format of each line must be "src" or "src trg", where
     127             :                  * src and trg are sequences of one or more non-whitespace
     128             :                  * characters, separated by whitespace.  Whitespace at start
     129             :                  * or end of line is ignored.  If trg is omitted, an empty
     130             :                  * string is used as the replacement.  trg can be optionally
     131             :                  * quoted, in which case whitespaces are included in it.
     132             :                  *
     133             :                  * We use a simple state machine, with states
     134             :                  *  0   initial (before src)
     135             :                  *  1   in src
     136             :                  *  2   in whitespace after src
     137             :                  *  3   in trg (non-quoted)
     138             :                  *  4   in trg (quoted)
     139             :                  *  5   in whitespace after trg
     140             :                  *  -1  syntax error detected (two strings)
     141             :                  *  -2  syntax error detected (unfinished quoted string)
     142             :                  *----------
     143             :                  */
     144             :                 int         state;
     145             :                 char       *ptr;
     146        6600 :                 char       *src = NULL;
     147        6600 :                 char       *trg = NULL;
     148        6600 :                 char       *trgstore = NULL;
     149             :                 int         ptrlen;
     150        6600 :                 int         srclen = 0;
     151        6600 :                 int         trglen = 0;
     152        6600 :                 int         trgstorelen = 0;
     153        6600 :                 bool        trgquoted = false;
     154             : 
     155        6600 :                 state = 0;
     156       34492 :                 for (ptr = line; *ptr; ptr += ptrlen)
     157             :                 {
     158       27892 :                     ptrlen = pg_mblen(ptr);
     159             :                     /* ignore whitespace, but end src or trg */
     160       27892 :                     if (t_isspace(ptr))
     161             :                     {
     162       12856 :                         if (state == 1)
     163        6600 :                             state = 2;
     164        6256 :                         else if (state == 3)
     165        6064 :                             state = 5;
     166             :                         /* whitespaces are OK in quoted area */
     167       12856 :                         if (state != 4)
     168       12776 :                             continue;
     169             :                     }
     170       15116 :                     switch (state)
     171             :                     {
     172        6600 :                         case 0:
     173             :                             /* start of src */
     174        6600 :                             src = ptr;
     175        6600 :                             srclen = ptrlen;
     176        6600 :                             state = 1;
     177        6600 :                             break;
     178           0 :                         case 1:
     179             :                             /* continue src */
     180           0 :                             srclen += ptrlen;
     181           0 :                             break;
     182        6176 :                         case 2:
     183             :                             /* start of trg */
     184        6176 :                             if (*ptr == '"')
     185             :                             {
     186         112 :                                 trgquoted = true;
     187         112 :                                 state = 4;
     188             :                             }
     189             :                             else
     190        6064 :                                 state = 3;
     191             : 
     192        6176 :                             trg = ptr;
     193        6176 :                             trglen = ptrlen;
     194        6176 :                             break;
     195        1876 :                         case 3:
     196             :                             /* continue non-quoted trg */
     197        1876 :                             trglen += ptrlen;
     198        1876 :                             break;
     199         464 :                         case 4:
     200             :                             /* continue quoted trg */
     201         464 :                             trglen += ptrlen;
     202             : 
     203             :                             /*
     204             :                              * If this is a quote, consider it as the end of
     205             :                              * trg except if the follow-up character is itself
     206             :                              * a quote.
     207             :                              */
     208         464 :                             if (*ptr == '"')
     209             :                             {
     210         144 :                                 if (*(ptr + 1) == '"')
     211             :                                 {
     212          32 :                                     ptr++;
     213          32 :                                     trglen += 1;
     214             :                                 }
     215             :                                 else
     216         112 :                                     state = 5;
     217             :                             }
     218         464 :                             break;
     219           0 :                         default:
     220             :                             /* bogus line format */
     221           0 :                             state = -1;
     222           0 :                             break;
     223             :                     }
     224             :                 }
     225             : 
     226        6600 :                 if (state == 1 || state == 2)
     227             :                 {
     228             :                     /* trg was omitted, so use "" */
     229         424 :                     trg = "";
     230         424 :                     trglen = 0;
     231             :                 }
     232             : 
     233             :                 /* If still in a quoted area, fallback to an error */
     234        6600 :                 if (state == 4)
     235           0 :                     state = -2;
     236             : 
     237             :                 /* If trg was quoted, remove its quotes and unescape it */
     238        6600 :                 if (trgquoted && state > 0)
     239             :                 {
     240             :                     /* Ignore first and end quotes */
     241         112 :                     trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
     242         112 :                     trgstorelen = 0;
     243         464 :                     for (int i = 1; i < trglen - 1; i++)
     244             :                     {
     245         352 :                         trgstore[trgstorelen] = trg[i];
     246         352 :                         trgstorelen++;
     247             :                         /* skip second double quotes */
     248         352 :                         if (trg[i] == '"' && trg[i + 1] == '"')
     249          32 :                             i++;
     250             :                     }
     251             :                 }
     252             :                 else
     253             :                 {
     254        6488 :                     trgstore = (char *) palloc(sizeof(char) * trglen);
     255        6488 :                     trgstorelen = trglen;
     256        6488 :                     memcpy(trgstore, trg, trgstorelen);
     257             :                 }
     258             : 
     259        6600 :                 if (state > 0)
     260        6600 :                     rootTrie = placeChar(rootTrie,
     261             :                                          (unsigned char *) src, srclen,
     262             :                                          trgstore, trgstorelen);
     263           0 :                 else if (state == -1)
     264           0 :                     ereport(WARNING,
     265             :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     266             :                              errmsg("invalid syntax: more than two strings in unaccent rule")));
     267           0 :                 else if (state == -2)
     268           0 :                     ereport(WARNING,
     269             :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
     270             :                              errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
     271             : 
     272        6600 :                 pfree(trgstore);
     273        6600 :                 pfree(line);
     274             :             }
     275           4 :             skip = false;
     276             :         }
     277           0 :         PG_CATCH();
     278             :         {
     279             :             ErrorData  *errdata;
     280             :             MemoryContext ecxt;
     281             : 
     282           0 :             ecxt = MemoryContextSwitchTo(ccxt);
     283           0 :             errdata = CopyErrorData();
     284           0 :             if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
     285             :             {
     286           0 :                 FlushErrorState();
     287             :             }
     288             :             else
     289             :             {
     290           0 :                 MemoryContextSwitchTo(ecxt);
     291           0 :                 PG_RE_THROW();
     292             :             }
     293             :         }
     294           4 :         PG_END_TRY();
     295             :     }
     296           4 :     while (skip);
     297             : 
     298           4 :     tsearch_readline_end(&trst);
     299             : 
     300           4 :     return rootTrie;
     301             : }
     302             : 
     303             : /*
     304             :  * findReplaceTo - find longest possible match in trie
     305             :  *
     306             :  * On success, returns pointer to ending subnode, plus length of matched
     307             :  * source string in *p_matchlen.  On failure, returns NULL.
     308             :  */
     309             : static TrieChar *
     310         158 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
     311             :               int *p_matchlen)
     312             : {
     313         158 :     TrieChar   *result = NULL;
     314         158 :     int         matchlen = 0;
     315             : 
     316         158 :     *p_matchlen = 0;            /* prevent uninitialized-variable warnings */
     317             : 
     318         452 :     while (node && matchlen < srclen)
     319             :     {
     320         294 :         node = node + src[matchlen];
     321         294 :         matchlen++;
     322             : 
     323         294 :         if (node->replaceTo)
     324             :         {
     325          74 :             result = node;
     326          74 :             *p_matchlen = matchlen;
     327             :         }
     328             : 
     329         294 :         node = node->nextChar;
     330             :     }
     331             : 
     332         158 :     return result;
     333             : }
     334             : 
     335           4 : PG_FUNCTION_INFO_V1(unaccent_init);
     336             : Datum
     337           4 : unaccent_init(PG_FUNCTION_ARGS)
     338             : {
     339           4 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     340           4 :     TrieChar   *rootTrie = NULL;
     341           4 :     bool        fileloaded = false;
     342             :     ListCell   *l;
     343             : 
     344           8 :     foreach(l, dictoptions)
     345             :     {
     346           4 :         DefElem    *defel = (DefElem *) lfirst(l);
     347             : 
     348           4 :         if (strcmp(defel->defname, "rules") == 0)
     349             :         {
     350           4 :             if (fileloaded)
     351           0 :                 ereport(ERROR,
     352             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     353             :                          errmsg("multiple Rules parameters")));
     354           4 :             rootTrie = initTrie(defGetString(defel));
     355           4 :             fileloaded = true;
     356             :         }
     357             :         else
     358             :         {
     359           0 :             ereport(ERROR,
     360             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     361             :                      errmsg("unrecognized Unaccent parameter: \"%s\"",
     362             :                             defel->defname)));
     363             :         }
     364             :     }
     365             : 
     366           4 :     if (!fileloaded)
     367             :     {
     368           0 :         ereport(ERROR,
     369             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     370             :                  errmsg("missing Rules parameter")));
     371             :     }
     372             : 
     373           4 :     PG_RETURN_POINTER(rootTrie);
     374             : }
     375             : 
     376           4 : PG_FUNCTION_INFO_V1(unaccent_lexize);
     377             : Datum
     378          56 : unaccent_lexize(PG_FUNCTION_ARGS)
     379             : {
     380          56 :     TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
     381          56 :     char       *srcchar = (char *) PG_GETARG_POINTER(1);
     382          56 :     int32       len = PG_GETARG_INT32(2);
     383          56 :     char       *srcstart = srcchar;
     384             :     TSLexeme   *res;
     385             :     StringInfoData buf;
     386             : 
     387             :     /* we allocate storage for the buffer only if needed */
     388          56 :     buf.data = NULL;
     389             : 
     390         214 :     while (len > 0)
     391             :     {
     392             :         TrieChar   *node;
     393             :         int         matchlen;
     394             : 
     395         158 :         node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
     396             :                              &matchlen);
     397         158 :         if (node && node->replaceTo)
     398             :         {
     399          74 :             if (buf.data == NULL)
     400             :             {
     401             :                 /* initialize buffer */
     402          50 :                 initStringInfo(&buf);
     403             :                 /* insert any data we already skipped over */
     404          50 :                 if (srcchar != srcstart)
     405          12 :                     appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
     406             :             }
     407          74 :             appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
     408             :         }
     409             :         else
     410             :         {
     411          84 :             matchlen = pg_mblen(srcchar);
     412          84 :             if (buf.data != NULL)
     413          36 :                 appendBinaryStringInfo(&buf, srcchar, matchlen);
     414             :         }
     415             : 
     416         158 :         srcchar += matchlen;
     417         158 :         len -= matchlen;
     418             :     }
     419             : 
     420             :     /* return a result only if we made at least one substitution */
     421          56 :     if (buf.data != NULL)
     422             :     {
     423          50 :         res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
     424          50 :         res->lexeme = buf.data;
     425          50 :         res->flags = TSL_FILTER;
     426             :     }
     427             :     else
     428           6 :         res = NULL;
     429             : 
     430          56 :     PG_RETURN_POINTER(res);
     431             : }
     432             : 
     433             : /*
     434             :  * Function-like wrapper for dictionary
     435             :  */
     436           8 : PG_FUNCTION_INFO_V1(unaccent_dict);
     437             : Datum
     438          38 : unaccent_dict(PG_FUNCTION_ARGS)
     439             : {
     440             :     text       *str;
     441             :     int         strArg;
     442             :     Oid         dictOid;
     443             :     TSDictionaryCacheEntry *dict;
     444             :     TSLexeme   *res;
     445             : 
     446          38 :     if (PG_NARGS() == 1)
     447             :     {
     448             :         /*
     449             :          * Use the "unaccent" dictionary that is in the same schema that this
     450             :          * function is in.
     451             :          */
     452          20 :         Oid         procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
     453          20 :         const char *dictname = "unaccent";
     454             : 
     455          20 :         dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
     456             :                                   PointerGetDatum(dictname),
     457             :                                   ObjectIdGetDatum(procnspid));
     458          20 :         if (!OidIsValid(dictOid))
     459           0 :             ereport(ERROR,
     460             :                     (errcode(ERRCODE_UNDEFINED_OBJECT),
     461             :                      errmsg("text search dictionary \"%s.%s\" does not exist",
     462             :                             get_namespace_name(procnspid), dictname)));
     463          20 :         strArg = 0;
     464             :     }
     465             :     else
     466             :     {
     467          18 :         dictOid = PG_GETARG_OID(0);
     468          18 :         strArg = 1;
     469             :     }
     470          38 :     str = PG_GETARG_TEXT_PP(strArg);
     471             : 
     472          38 :     dict = lookup_ts_dictionary_cache(dictOid);
     473             : 
     474          38 :     res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
     475             :                                                      PointerGetDatum(dict->dictData),
     476             :                                                      PointerGetDatum(VARDATA_ANY(str)),
     477             :                                                      Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
     478             :                                                      PointerGetDatum(NULL)));
     479             : 
     480          38 :     PG_FREE_IF_COPY(str, strArg);
     481             : 
     482          38 :     if (res == NULL)
     483             :     {
     484           4 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     485             :     }
     486          34 :     else if (res->lexeme == NULL)
     487             :     {
     488           0 :         pfree(res);
     489           0 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
     490             :     }
     491             :     else
     492             :     {
     493          34 :         text       *txt = cstring_to_text(res->lexeme);
     494             : 
     495          34 :         pfree(res->lexeme);
     496          34 :         pfree(res);
     497             : 
     498          34 :         PG_RETURN_TEXT_P(txt);
     499             :     }
     500             : }

Generated by: LCOV version 1.14