LCOV - code coverage report
Current view: top level - src/backend/snowball - dict_snowball.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 48 68 70.6 %
Date: 2025-04-01 15:15:16 Functions: 6 6 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * dict_snowball.c
       4             :  *      Snowball dictionary
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/backend/snowball/dict_snowball.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "postgres.h"
      14             : 
      15             : #include "catalog/pg_collation_d.h"
      16             : #include "commands/defrem.h"
      17             : #include "mb/pg_wchar.h"
      18             : #include "tsearch/ts_public.h"
      19             : #include "utils/formatting.h"
      20             : 
      21             : /* Some platforms define MAXINT and/or MININT, causing conflicts */
      22             : #ifdef MAXINT
      23             : #undef MAXINT
      24             : #endif
      25             : #ifdef MININT
      26             : #undef MININT
      27             : #endif
      28             : 
      29             : /* Now we can include the original Snowball header.h */
      30             : #include "snowball/libstemmer/header.h"
      31             : #include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
      32             : #include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
      33             : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
      34             : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
      35             : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
      36             : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
      37             : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
      38             : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
      39             : #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
      40             : #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
      41             : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
      42             : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
      43             : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
      44             : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
      45             : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
      46             : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
      47             : #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
      48             : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
      49             : #include "snowball/libstemmer/stem_UTF_8_arabic.h"
      50             : #include "snowball/libstemmer/stem_UTF_8_armenian.h"
      51             : #include "snowball/libstemmer/stem_UTF_8_basque.h"
      52             : #include "snowball/libstemmer/stem_UTF_8_catalan.h"
      53             : #include "snowball/libstemmer/stem_UTF_8_danish.h"
      54             : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
      55             : #include "snowball/libstemmer/stem_UTF_8_english.h"
      56             : #include "snowball/libstemmer/stem_UTF_8_estonian.h"
      57             : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
      58             : #include "snowball/libstemmer/stem_UTF_8_french.h"
      59             : #include "snowball/libstemmer/stem_UTF_8_german.h"
      60             : #include "snowball/libstemmer/stem_UTF_8_greek.h"
      61             : #include "snowball/libstemmer/stem_UTF_8_hindi.h"
      62             : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
      63             : #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
      64             : #include "snowball/libstemmer/stem_UTF_8_irish.h"
      65             : #include "snowball/libstemmer/stem_UTF_8_italian.h"
      66             : #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
      67             : #include "snowball/libstemmer/stem_UTF_8_nepali.h"
      68             : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
      69             : #include "snowball/libstemmer/stem_UTF_8_porter.h"
      70             : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
      71             : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
      72             : #include "snowball/libstemmer/stem_UTF_8_russian.h"
      73             : #include "snowball/libstemmer/stem_UTF_8_serbian.h"
      74             : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
      75             : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
      76             : #include "snowball/libstemmer/stem_UTF_8_tamil.h"
      77             : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
      78             : #include "snowball/libstemmer/stem_UTF_8_yiddish.h"
      79             : 
      80         118 : PG_MODULE_MAGIC_EXT(
      81             :                     .name = "dict_snowball",
      82             :                     .version = PG_VERSION
      83             : );
      84             : 
      85         118 : PG_FUNCTION_INFO_V1(dsnowball_init);
      86             : 
      87         118 : PG_FUNCTION_INFO_V1(dsnowball_lexize);
      88             : 
      89             : /* List of supported modules */
      90             : typedef struct stemmer_module
      91             : {
      92             :     const char *name;
      93             :     pg_enc      enc;
      94             :     struct SN_env *(*create) (void);
      95             :     void        (*close) (struct SN_env *);
      96             :     int         (*stem) (struct SN_env *);
      97             : } stemmer_module;
      98             : 
      99             : /* Args: stemmer name, PG code for encoding, Snowball's name for encoding */
     100             : #define STEMMER_MODULE(name,enc,senc) \
     101             :     {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
     102             : 
     103             : static const stemmer_module stemmer_modules[] =
     104             : {
     105             :     /*
     106             :      * Stemmers list from Snowball distribution
     107             :      */
     108             :     STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
     109             :     STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
     110             :     STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
     111             :     STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
     112             :     STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
     113             :     STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
     114             :     STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
     115             :     STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
     116             :     STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
     117             :     STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
     118             :     STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
     119             :     STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
     120             :     STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
     121             :     STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
     122             :     STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
     123             :     STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
     124             :     STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
     125             :     STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
     126             :     STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
     127             :     STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
     128             :     STEMMER_MODULE(basque, PG_UTF8, UTF_8),
     129             :     STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
     130             :     STEMMER_MODULE(danish, PG_UTF8, UTF_8),
     131             :     STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
     132             :     STEMMER_MODULE(english, PG_UTF8, UTF_8),
     133             :     STEMMER_MODULE(estonian, PG_UTF8, UTF_8),
     134             :     STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
     135             :     STEMMER_MODULE(french, PG_UTF8, UTF_8),
     136             :     STEMMER_MODULE(german, PG_UTF8, UTF_8),
     137             :     STEMMER_MODULE(greek, PG_UTF8, UTF_8),
     138             :     STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
     139             :     STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
     140             :     STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
     141             :     STEMMER_MODULE(irish, PG_UTF8, UTF_8),
     142             :     STEMMER_MODULE(italian, PG_UTF8, UTF_8),
     143             :     STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
     144             :     STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
     145             :     STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
     146             :     STEMMER_MODULE(porter, PG_UTF8, UTF_8),
     147             :     STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
     148             :     STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
     149             :     STEMMER_MODULE(russian, PG_UTF8, UTF_8),
     150             :     STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
     151             :     STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
     152             :     STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
     153             :     STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
     154             :     STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
     155             :     STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
     156             : 
     157             :     /*
     158             :      * A stemmer with PG_SQL_ASCII encoding should be valid for any server
     159             :      * encoding.
     160             :      */
     161             :     STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
     162             : 
     163             :     {NULL, 0, NULL, NULL, NULL} /* list end marker */
     164             : };
     165             : 
     166             : 
     167             : typedef struct DictSnowball
     168             : {
     169             :     struct SN_env *z;
     170             :     StopList    stoplist;
     171             :     bool        needrecode;     /* needs recoding before/after call stem */
     172             :     int         (*stem) (struct SN_env *z);
     173             : 
     174             :     /*
     175             :      * Snowball keeps allocated memory between calls, so we must run it in a
     176             :      * private memory context.  Note that the init function is executed in a
     177             :      * long-lived context, so we just remember CurrentMemoryContext.
     178             :      */
     179             :     MemoryContext dictCtx;
     180             : } DictSnowball;
     181             : 
     182             : 
     183             : static void
     184          38 : locate_stem_module(DictSnowball *d, const char *lang)
     185             : {
     186             :     const stemmer_module *m;
     187             : 
     188             :     /*
     189             :      * First, look for an exact match for the stemmer module.  A stemmer with
     190             :      * PG_SQL_ASCII encoding is treated as working with any server encoding.
     191             :      */
     192         950 :     for (m = stemmer_modules; m->name; m++)
     193             :     {
     194        1216 :         if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
     195         266 :             pg_strcasecmp(m->name, lang) == 0)
     196             :         {
     197          38 :             d->stem = m->stem;
     198          38 :             d->z = m->create();
     199          38 :             d->needrecode = false;
     200          38 :             return;
     201             :         }
     202             :     }
     203             : 
     204             :     /*
     205             :      * Second, try to find a UTF8-encoding stemmer for the requested language.
     206             :      */
     207           0 :     for (m = stemmer_modules; m->name; m++)
     208             :     {
     209           0 :         if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
     210             :         {
     211           0 :             d->stem = m->stem;
     212           0 :             d->z = m->create();
     213           0 :             d->needrecode = true;
     214           0 :             return;
     215             :         }
     216             :     }
     217             : 
     218           0 :     ereport(ERROR,
     219             :             (errcode(ERRCODE_UNDEFINED_OBJECT),
     220             :              errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
     221             :                     lang, GetDatabaseEncodingName())));
     222             : }
     223             : 
     224             : Datum
     225          38 : dsnowball_init(PG_FUNCTION_ARGS)
     226             : {
     227          38 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     228             :     DictSnowball *d;
     229          38 :     bool        stoploaded = false;
     230             :     ListCell   *l;
     231             : 
     232          38 :     d = (DictSnowball *) palloc0(sizeof(DictSnowball));
     233             : 
     234         114 :     foreach(l, dictoptions)
     235             :     {
     236          76 :         DefElem    *defel = (DefElem *) lfirst(l);
     237             : 
     238          76 :         if (strcmp(defel->defname, "stopwords") == 0)
     239             :         {
     240          38 :             if (stoploaded)
     241           0 :                 ereport(ERROR,
     242             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     243             :                          errmsg("multiple StopWords parameters")));
     244          38 :             readstoplist(defGetString(defel), &d->stoplist, str_tolower);
     245          38 :             stoploaded = true;
     246             :         }
     247          38 :         else if (strcmp(defel->defname, "language") == 0)
     248             :         {
     249          38 :             if (d->stem)
     250           0 :                 ereport(ERROR,
     251             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     252             :                          errmsg("multiple Language parameters")));
     253          38 :             locate_stem_module(d, defGetString(defel));
     254             :         }
     255             :         else
     256             :         {
     257           0 :             ereport(ERROR,
     258             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     259             :                      errmsg("unrecognized Snowball parameter: \"%s\"",
     260             :                             defel->defname)));
     261             :         }
     262             :     }
     263             : 
     264          38 :     if (!d->stem)
     265           0 :         ereport(ERROR,
     266             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     267             :                  errmsg("missing Language parameter")));
     268             : 
     269          38 :     d->dictCtx = CurrentMemoryContext;
     270             : 
     271          38 :     PG_RETURN_POINTER(d);
     272             : }
     273             : 
     274             : Datum
     275       10270 : dsnowball_lexize(PG_FUNCTION_ARGS)
     276             : {
     277       10270 :     DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
     278       10270 :     char       *in = (char *) PG_GETARG_POINTER(1);
     279       10270 :     int32       len = PG_GETARG_INT32(2);
     280       10270 :     char       *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
     281       10270 :     TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
     282             : 
     283             :     /*
     284             :      * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
     285             :      * surely not words in any human language.  This restriction avoids
     286             :      * wasting cycles on stuff like base64-encoded data, and it protects us
     287             :      * against possible inefficiency or misbehavior in the stemmer.  (For
     288             :      * example, the Turkish stemmer has an indefinite recursion, so it can
     289             :      * crash on long-enough strings.)  However, Snowball dictionaries are
     290             :      * defined to recognize all strings, so we can't reject the string as an
     291             :      * unknown word.
     292             :      */
     293       10270 :     if (len > 1000)
     294             :     {
     295             :         /* return the lexeme lowercased, but otherwise unmodified */
     296           0 :         res->lexeme = txt;
     297             :     }
     298       10270 :     else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
     299             :     {
     300             :         /* empty or stopword, so report as stopword */
     301        3468 :         pfree(txt);
     302             :     }
     303             :     else
     304             :     {
     305             :         MemoryContext saveCtx;
     306             : 
     307             :         /*
     308             :          * recode to utf8 if stemmer is utf8 and doesn't match server encoding
     309             :          */
     310        6802 :         if (d->needrecode)
     311             :         {
     312             :             char       *recoded;
     313             : 
     314           0 :             recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
     315           0 :             if (recoded != txt)
     316             :             {
     317           0 :                 pfree(txt);
     318           0 :                 txt = recoded;
     319             :             }
     320             :         }
     321             : 
     322             :         /* see comment about d->dictCtx */
     323        6802 :         saveCtx = MemoryContextSwitchTo(d->dictCtx);
     324        6802 :         SN_set_current(d->z, strlen(txt), (symbol *) txt);
     325        6802 :         d->stem(d->z);
     326        6802 :         MemoryContextSwitchTo(saveCtx);
     327             : 
     328        6802 :         if (d->z->p && d->z->l)
     329             :         {
     330        6802 :             txt = repalloc(txt, d->z->l + 1);
     331        6802 :             memcpy(txt, d->z->p, d->z->l);
     332        6802 :             txt[d->z->l] = '\0';
     333             :         }
     334             : 
     335             :         /* recode back to the server encoding if needed */
     336        6802 :         if (d->needrecode)
     337             :         {
     338             :             char       *recoded;
     339             : 
     340           0 :             recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
     341           0 :             if (recoded != txt)
     342             :             {
     343           0 :                 pfree(txt);
     344           0 :                 txt = recoded;
     345             :             }
     346             :         }
     347             : 
     348        6802 :         res->lexeme = txt;
     349             :     }
     350             : 
     351       10270 :     PG_RETURN_POINTER(res);
     352             : }

Generated by: LCOV version 1.14