LCOV - code coverage report
Current view: top level - src/backend/snowball - dict_snowball.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 70.6 % 68 48
Test Date: 2026-03-05 08:14:41 Functions: 100.0 % 6 6
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * dict_snowball.c
       4              :  *      Snowball dictionary
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    src/backend/snowball/dict_snowball.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : #include "postgres.h"
      14              : 
      15              : #include "catalog/pg_collation_d.h"
      16              : #include "commands/defrem.h"
      17              : #include "mb/pg_wchar.h"
      18              : #include "tsearch/ts_public.h"
      19              : #include "utils/formatting.h"
      20              : 
      21              : /* Some platforms define MAXINT and/or MININT, causing conflicts */
      22              : #ifdef MAXINT
      23              : #undef MAXINT
      24              : #endif
      25              : #ifdef MININT
      26              : #undef MININT
      27              : #endif
      28              : 
      29              : /* Now we can include the original Snowball snowball_runtime.h */
      30              : #include "snowball/libstemmer/snowball_runtime.h"
      31              : #include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
      32              : #include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
      33              : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
      34              : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
      35              : #include "snowball/libstemmer/stem_ISO_8859_1_dutch_porter.h"
      36              : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
      37              : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
      38              : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
      39              : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
      40              : #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
      41              : #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
      42              : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
      43              : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
      44              : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
      45              : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
      46              : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
      47              : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
      48              : #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
      49              : #include "snowball/libstemmer/stem_ISO_8859_2_polish.h"
      50              : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
      51              : #include "snowball/libstemmer/stem_UTF_8_arabic.h"
      52              : #include "snowball/libstemmer/stem_UTF_8_armenian.h"
      53              : #include "snowball/libstemmer/stem_UTF_8_basque.h"
      54              : #include "snowball/libstemmer/stem_UTF_8_catalan.h"
      55              : #include "snowball/libstemmer/stem_UTF_8_danish.h"
      56              : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
      57              : #include "snowball/libstemmer/stem_UTF_8_dutch_porter.h"
      58              : #include "snowball/libstemmer/stem_UTF_8_english.h"
      59              : #include "snowball/libstemmer/stem_UTF_8_esperanto.h"
      60              : #include "snowball/libstemmer/stem_UTF_8_estonian.h"
      61              : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
      62              : #include "snowball/libstemmer/stem_UTF_8_french.h"
      63              : #include "snowball/libstemmer/stem_UTF_8_german.h"
      64              : #include "snowball/libstemmer/stem_UTF_8_greek.h"
      65              : #include "snowball/libstemmer/stem_UTF_8_hindi.h"
      66              : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
      67              : #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
      68              : #include "snowball/libstemmer/stem_UTF_8_irish.h"
      69              : #include "snowball/libstemmer/stem_UTF_8_italian.h"
      70              : #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
      71              : #include "snowball/libstemmer/stem_UTF_8_nepali.h"
      72              : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
      73              : #include "snowball/libstemmer/stem_UTF_8_polish.h"
      74              : #include "snowball/libstemmer/stem_UTF_8_porter.h"
      75              : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
      76              : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
      77              : #include "snowball/libstemmer/stem_UTF_8_russian.h"
      78              : #include "snowball/libstemmer/stem_UTF_8_serbian.h"
      79              : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
      80              : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
      81              : #include "snowball/libstemmer/stem_UTF_8_tamil.h"
      82              : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
      83              : #include "snowball/libstemmer/stem_UTF_8_yiddish.h"
      84              : 
      85           70 : PG_MODULE_MAGIC_EXT(
      86              :                     .name = "dict_snowball",
      87              :                     .version = PG_VERSION
      88              : );
      89              : 
      90           68 : PG_FUNCTION_INFO_V1(dsnowball_init);
      91              : 
      92           68 : PG_FUNCTION_INFO_V1(dsnowball_lexize);
      93              : 
      94              : /* List of supported modules */
      95              : typedef struct stemmer_module
      96              : {
      97              :     const char *name;
      98              :     pg_enc      enc;
      99              :     struct SN_env *(*create) (void);
     100              :     void        (*close) (struct SN_env *);
     101              :     int         (*stem) (struct SN_env *);
     102              : } stemmer_module;
     103              : 
     104              : /* Args: stemmer name, PG code for encoding, Snowball's name for encoding */
     105              : #define STEMMER_MODULE(name,enc,senc) \
     106              :     {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
     107              : 
     108              : static const stemmer_module stemmer_modules[] =
     109              : {
     110              :     /*
     111              :      * Stemmers list from Snowball distribution
     112              :      */
     113              :     STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
     114              :     STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
     115              :     STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
     116              :     STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
     117              :     STEMMER_MODULE(dutch_porter, PG_LATIN1, ISO_8859_1),
     118              :     STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
     119              :     STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
     120              :     STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
     121              :     STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
     122              :     STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
     123              :     STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
     124              :     STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
     125              :     STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
     126              :     STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
     127              :     STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
     128              :     STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
     129              :     STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
     130              :     STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
     131              :     STEMMER_MODULE(polish, PG_LATIN2, ISO_8859_2),
     132              :     STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
     133              :     STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
     134              :     STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
     135              :     STEMMER_MODULE(basque, PG_UTF8, UTF_8),
     136              :     STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
     137              :     STEMMER_MODULE(danish, PG_UTF8, UTF_8),
     138              :     STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
     139              :     STEMMER_MODULE(dutch_porter, PG_UTF8, UTF_8),
     140              :     STEMMER_MODULE(english, PG_UTF8, UTF_8),
     141              :     STEMMER_MODULE(esperanto, PG_UTF8, UTF_8),
     142              :     STEMMER_MODULE(estonian, PG_UTF8, UTF_8),
     143              :     STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
     144              :     STEMMER_MODULE(french, PG_UTF8, UTF_8),
     145              :     STEMMER_MODULE(german, PG_UTF8, UTF_8),
     146              :     STEMMER_MODULE(greek, PG_UTF8, UTF_8),
     147              :     STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
     148              :     STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
     149              :     STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
     150              :     STEMMER_MODULE(irish, PG_UTF8, UTF_8),
     151              :     STEMMER_MODULE(italian, PG_UTF8, UTF_8),
     152              :     STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
     153              :     STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
     154              :     STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
     155              :     STEMMER_MODULE(porter, PG_UTF8, UTF_8),
     156              :     STEMMER_MODULE(polish, PG_UTF8, UTF_8),
     157              :     STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
     158              :     STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
     159              :     STEMMER_MODULE(russian, PG_UTF8, UTF_8),
     160              :     STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
     161              :     STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
     162              :     STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
     163              :     STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
     164              :     STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
     165              :     STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
     166              : 
     167              :     /*
     168              :      * Stemmer with PG_SQL_ASCII encoding should be valid for any server
     169              :      * encoding
     170              :      */
     171              :     STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
     172              : 
     173              :     {NULL, 0, NULL, NULL, NULL} /* list end marker */
     174              : };
     175              : 
     176              : 
     177              : typedef struct DictSnowball
     178              : {
     179              :     struct SN_env *z;
     180              :     StopList    stoplist;
     181              :     bool        needrecode;     /* needs recoding before/after call stem */
     182              :     int         (*stem) (struct SN_env *z);
     183              : 
     184              :     /*
     185              :      * snowball saves alloced memory between calls, so we should run it in our
     186              :      * private memory context. Note, init function is executed in long lived
     187              :      * context, so we just remember CurrentMemoryContext
     188              :      */
     189              :     MemoryContext dictCtx;
     190              : } DictSnowball;
     191              : 
     192              : 
     193              : static void
     194           22 : locate_stem_module(DictSnowball *d, const char *lang)
     195              : {
     196              :     const stemmer_module *m;
     197              : 
     198              :     /*
     199              :      * First, try to find exact match of stemmer module. Stemmer with
     200              :      * PG_SQL_ASCII encoding is treated as working with any server encoding
     201              :      */
     202          616 :     for (m = stemmer_modules; m->name; m++)
     203              :     {
     204          792 :         if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
     205          176 :             pg_strcasecmp(m->name, lang) == 0)
     206              :         {
     207           22 :             d->stem = m->stem;
     208           22 :             d->z = m->create();
     209           22 :             d->needrecode = false;
     210           22 :             return;
     211              :         }
     212              :     }
     213              : 
     214              :     /*
     215              :      * Second, try to find stemmer for needed language for UTF8 encoding.
     216              :      */
     217            0 :     for (m = stemmer_modules; m->name; m++)
     218              :     {
     219            0 :         if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
     220              :         {
     221            0 :             d->stem = m->stem;
     222            0 :             d->z = m->create();
     223            0 :             d->needrecode = true;
     224            0 :             return;
     225              :         }
     226              :     }
     227              : 
     228            0 :     ereport(ERROR,
     229              :             (errcode(ERRCODE_UNDEFINED_OBJECT),
     230              :              errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
     231              :                     lang, GetDatabaseEncodingName())));
     232              : }
     233              : 
     234              : Datum
     235           22 : dsnowball_init(PG_FUNCTION_ARGS)
     236              : {
     237           22 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     238              :     DictSnowball *d;
     239           22 :     bool        stoploaded = false;
     240              :     ListCell   *l;
     241              : 
     242           22 :     d = palloc0_object(DictSnowball);
     243              : 
     244           66 :     foreach(l, dictoptions)
     245              :     {
     246           44 :         DefElem    *defel = (DefElem *) lfirst(l);
     247              : 
     248           44 :         if (strcmp(defel->defname, "stopwords") == 0)
     249              :         {
     250           22 :             if (stoploaded)
     251            0 :                 ereport(ERROR,
     252              :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     253              :                          errmsg("multiple StopWords parameters")));
     254           22 :             readstoplist(defGetString(defel), &d->stoplist, str_tolower);
     255           22 :             stoploaded = true;
     256              :         }
     257           22 :         else if (strcmp(defel->defname, "language") == 0)
     258              :         {
     259           22 :             if (d->stem)
     260            0 :                 ereport(ERROR,
     261              :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     262              :                          errmsg("multiple Language parameters")));
     263           22 :             locate_stem_module(d, defGetString(defel));
     264              :         }
     265              :         else
     266              :         {
     267            0 :             ereport(ERROR,
     268              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     269              :                      errmsg("unrecognized Snowball parameter: \"%s\"",
     270              :                             defel->defname)));
     271              :         }
     272              :     }
     273              : 
     274           22 :     if (!d->stem)
     275            0 :         ereport(ERROR,
     276              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     277              :                  errmsg("missing Language parameter")));
     278              : 
     279           22 :     d->dictCtx = CurrentMemoryContext;
     280              : 
     281           22 :     PG_RETURN_POINTER(d);
     282              : }
     283              : 
     284              : Datum
     285         5159 : dsnowball_lexize(PG_FUNCTION_ARGS)
     286              : {
     287         5159 :     DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
     288         5159 :     char       *in = (char *) PG_GETARG_POINTER(1);
     289         5159 :     int32       len = PG_GETARG_INT32(2);
     290         5159 :     char       *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
     291         5159 :     TSLexeme   *res = palloc0_array(TSLexeme, 2);
     292              : 
     293              :     /*
     294              :      * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
     295              :      * surely not words in any human language.  This restriction avoids
     296              :      * wasting cycles on stuff like base64-encoded data, and it protects us
     297              :      * against possible inefficiency or misbehavior in the stemmer.  (For
     298              :      * example, the Turkish stemmer has an indefinite recursion, so it can
     299              :      * crash on long-enough strings.)  However, Snowball dictionaries are
     300              :      * defined to recognize all strings, so we can't reject the string as an
     301              :      * unknown word.
     302              :      */
     303         5159 :     if (len > 1000)
     304              :     {
     305              :         /* return the lexeme lowercased, but otherwise unmodified */
     306            0 :         res->lexeme = txt;
     307              :     }
     308         5159 :     else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
     309              :     {
     310              :         /* empty or stopword, so report as stopword */
     311         1734 :         pfree(txt);
     312              :     }
     313              :     else
     314              :     {
     315              :         MemoryContext saveCtx;
     316              : 
     317              :         /*
     318              :          * recode to utf8 if stemmer is utf8 and doesn't match server encoding
     319              :          */
     320         3425 :         if (d->needrecode)
     321              :         {
     322              :             char       *recoded;
     323              : 
     324            0 :             recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
     325            0 :             if (recoded != txt)
     326              :             {
     327            0 :                 pfree(txt);
     328            0 :                 txt = recoded;
     329              :             }
     330              :         }
     331              : 
     332              :         /* see comment about d->dictCtx */
     333         3425 :         saveCtx = MemoryContextSwitchTo(d->dictCtx);
     334         3425 :         SN_set_current(d->z, strlen(txt), (symbol *) txt);
     335         3425 :         d->stem(d->z);
     336         3425 :         MemoryContextSwitchTo(saveCtx);
     337              : 
     338         3425 :         if (d->z->p && d->z->l)
     339              :         {
     340         3425 :             txt = repalloc(txt, d->z->l + 1);
     341         3425 :             memcpy(txt, d->z->p, d->z->l);
     342         3425 :             txt[d->z->l] = '\0';
     343              :         }
     344              : 
     345              :         /* back recode if needed */
     346         3425 :         if (d->needrecode)
     347              :         {
     348              :             char       *recoded;
     349              : 
     350            0 :             recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
     351            0 :             if (recoded != txt)
     352              :             {
     353            0 :                 pfree(txt);
     354            0 :                 txt = recoded;
     355              :             }
     356              :         }
     357              : 
     358         3425 :         res->lexeme = txt;
     359              :     }
     360              : 
     361         5159 :     PG_RETURN_POINTER(res);
     362              : }
        

Generated by: LCOV version 2.0-1