LCOV - code coverage report
Current view: top level - src/backend/snowball - dict_snowball.c (source / functions) Hit Total Coverage
Test: PostgreSQL 12beta1 Lines: 47 66 71.2 %
Date: 2019-06-16 15:06:48 Functions: 6 6 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * dict_snowball.c
       4             :  *      Snowball dictionary
       5             :  *
       6             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/backend/snowball/dict_snowball.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "postgres.h"
      14             : 
      15             : #include "commands/defrem.h"
      16             : #include "tsearch/ts_locale.h"
      17             : #include "tsearch/ts_utils.h"
      18             : 
      19             : /* Some platforms define MAXINT and/or MININT, causing conflicts */
      20             : #ifdef MAXINT
      21             : #undef MAXINT
      22             : #endif
      23             : #ifdef MININT
      24             : #undef MININT
      25             : #endif
      26             : 
      27             : /* Now we can include the original Snowball header.h */
      28             : #include "snowball/libstemmer/header.h"
      29             : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
      30             : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
      31             : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
      32             : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
      33             : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
      34             : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
      35             : #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
      36             : #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
      37             : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
      38             : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
      39             : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
      40             : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
      41             : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
      42             : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
      43             : #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
      44             : #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
      45             : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
      46             : #include "snowball/libstemmer/stem_UTF_8_arabic.h"
      47             : #include "snowball/libstemmer/stem_UTF_8_danish.h"
      48             : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
      49             : #include "snowball/libstemmer/stem_UTF_8_english.h"
      50             : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
      51             : #include "snowball/libstemmer/stem_UTF_8_french.h"
      52             : #include "snowball/libstemmer/stem_UTF_8_german.h"
      53             : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
      54             : #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
      55             : #include "snowball/libstemmer/stem_UTF_8_irish.h"
      56             : #include "snowball/libstemmer/stem_UTF_8_italian.h"
      57             : #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
      58             : #include "snowball/libstemmer/stem_UTF_8_nepali.h"
      59             : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
      60             : #include "snowball/libstemmer/stem_UTF_8_porter.h"
      61             : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
      62             : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
      63             : #include "snowball/libstemmer/stem_UTF_8_russian.h"
      64             : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
      65             : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
      66             : #include "snowball/libstemmer/stem_UTF_8_tamil.h"
      67             : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
      68             : 
      69         340 : PG_MODULE_MAGIC; /* marks this file as a loadable PostgreSQL module */
      70             : 
      71         340 : PG_FUNCTION_INFO_V1(dsnowball_init); /* SQL-callable: builds the DictSnowball state, see dsnowball_init() below */
      72             : 
      73         340 : PG_FUNCTION_INFO_V1(dsnowball_lexize); /* SQL-callable: stems one word, see dsnowball_lexize() below */
      74             : 
      75             : /*
                        * One entry of the list of supported modules: a language name, the
                        * encoding that variant of the stemmer works in, and the three entry
                        * points generated for it by the STEMMER_MODULE() macro.
                        */
      76             : typedef struct stemmer_module
      77             : {
      78             :     const char *name; /* language name; matched case-insensitively by locate_stem_module() */
      79             :     pg_enc      enc; /* PG encoding code this stemmer variant is listed under */
      80             :     struct SN_env *(*create) (void); /* the stemmer's *_create_env function; its result is stored in DictSnowball.z */
      81             :     void        (*close) (struct SN_env *); /* the stemmer's *_close_env function */
      82             :     int         (*stem) (struct SN_env *); /* the stemmer's *_stem function; copied into DictSnowball.stem */
      83             : } stemmer_module;
      84             : 
      85             : /*
                        * Build one stemmer_module initializer.
                        *
                        * Args: stemmer name, PG code for encoding, Snowball's name for encoding.
                        * The stemmer name is stringized to fill the "name" field, and the three
                        * function pointers are formed by token pasting as
                        * <name>_<senc>_create_env, <name>_<senc>_close_env and
                        * <name>_<senc>_stem.
                        */
      86             : #define STEMMER_MODULE(name,enc,senc) \
      87             :     {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
      88             : 
      89             : static const stemmer_module stemmer_modules[] = /* scanned front to back by locate_stem_module() */
      90             : {
      91             :     /*
      92             :      * Stemmers list from Snowball distribution
      93             :      */
      94             :     STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
      95             :     STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
      96             :     STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
      97             :     STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
      98             :     STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
      99             :     STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
     100             :     STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
     101             :     STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
     102             :     STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
     103             :     STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
     104             :     STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
     105             :     STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
     106             :     STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
     107             :     STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
     108             :     STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
     109             :     STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
     110             :     STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
     111             :     STEMMER_MODULE(arabic, PG_UTF8, UTF_8), /* PG_UTF8 entries double as the recoding fallback in locate_stem_module() */
     112             :     STEMMER_MODULE(danish, PG_UTF8, UTF_8),
     113             :     STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
     114             :     STEMMER_MODULE(english, PG_UTF8, UTF_8),
     115             :     STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
     116             :     STEMMER_MODULE(french, PG_UTF8, UTF_8),
     117             :     STEMMER_MODULE(german, PG_UTF8, UTF_8),
     118             :     STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
     119             :     STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
     120             :     STEMMER_MODULE(irish, PG_UTF8, UTF_8),
     121             :     STEMMER_MODULE(italian, PG_UTF8, UTF_8),
     122             :     STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
     123             :     STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
     124             :     STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
     125             :     STEMMER_MODULE(porter, PG_UTF8, UTF_8),
     126             :     STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
     127             :     STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
     128             :     STEMMER_MODULE(russian, PG_UTF8, UTF_8),
     129             :     STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
     130             :     STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
     131             :     STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
     132             :     STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
     133             : 
     134             :     /*
     135             :      * Stemmer with PG_SQL_ASCII encoding should be valid for any server
     136             :      * encoding
     137             :      */
     138             :     STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
     139             : 
     140             :     {NULL, 0, NULL, NULL, NULL} /* list end marker: the NULL name stops the scan loops */
     141             : };
     142             : 
     143             : 
     144             : typedef struct DictSnowball /* per-dictionary state built by dsnowball_init() and read by dsnowball_lexize() */
     145             : {
     146             :     struct SN_env *z; /* stemmer environment returned by the chosen module's create() */
     147             :     StopList    stoplist; /* filled by readstoplist() when a "stopwords" option is given */
     148             :     bool        needrecode;     /* true if words are converted to UTF8 before, and back after, the stem call */
     149             :     int         (*stem) (struct SN_env *z); /* chosen module's stem function; still NULL until "language" is processed */
     150             : 
     151             :     /*
     152             :      * snowball saves alloced memory between calls, so we should run it in our
     153             :      * private memory context. Note, init function is executed in long lived
     154             :      * context, so we just remember CurrentMemoryContext
     155             :      */
     156             :     MemoryContext dictCtx; /* context switched to around SN_set_current() and the stem call */
     157             : } DictSnowball;
     158             : 
     159             : 
     160             : static void
     161          28 : locate_stem_module(DictSnowball *d, const char *lang)
     162             : {
     163             :     const stemmer_module *m;
     164             : 
     165             :     /*
     166             :      * First, try to find exact match of stemmer module. Stemmer with
     167             :      * PG_SQL_ASCII encoding is treated as working with any server encoding
     168             :      */
     169         588 :     for (m = stemmer_modules; m->name; m++)
     170             :     {
     171         700 :         if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
     172         112 :             pg_strcasecmp(m->name, lang) == 0)
     173             :         {
     174          28 :             d->stem = m->stem;
     175          28 :             d->z = m->create();
     176          28 :             d->needrecode = false;
     177          28 :             return;
     178             :         }
     179             :     }
     180             : 
     181             :     /*
     182             :      * Second, try to find stemmer for needed language for UTF8 encoding.
     183             :      */
     184           0 :     for (m = stemmer_modules; m->name; m++)
     185             :     {
     186           0 :         if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
     187             :         {
     188           0 :             d->stem = m->stem;
     189           0 :             d->z = m->create();
     190           0 :             d->needrecode = true;
     191           0 :             return;
     192             :         }
     193             :     }
     194             : 
     195           0 :     ereport(ERROR,
     196             :             (errcode(ERRCODE_UNDEFINED_OBJECT),
     197             :              errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
     198             :                     lang, GetDatabaseEncodingName())));
     199             : }
     200             : 
     201             : Datum
     202          28 : dsnowball_init(PG_FUNCTION_ARGS)
     203             : {
     204          28 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     205             :     DictSnowball *d;
     206          28 :     bool        stoploaded = false;
     207             :     ListCell   *l;
     208             : 
     209          28 :     d = (DictSnowball *) palloc0(sizeof(DictSnowball));
     210             : 
     211          84 :     foreach(l, dictoptions)
     212             :     {
     213          56 :         DefElem    *defel = (DefElem *) lfirst(l);
     214             : 
     215          56 :         if (strcmp(defel->defname, "stopwords") == 0)
     216             :         {
     217          28 :             if (stoploaded)
     218           0 :                 ereport(ERROR,
     219             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     220             :                          errmsg("multiple StopWords parameters")));
     221          28 :             readstoplist(defGetString(defel), &d->stoplist, lowerstr);
     222          28 :             stoploaded = true;
     223             :         }
     224          28 :         else if (strcmp(defel->defname, "language") == 0)
     225             :         {
     226          28 :             if (d->stem)
     227           0 :                 ereport(ERROR,
     228             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     229             :                          errmsg("multiple Language parameters")));
     230          28 :             locate_stem_module(d, defGetString(defel));
     231             :         }
     232             :         else
     233             :         {
     234           0 :             ereport(ERROR,
     235             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     236             :                      errmsg("unrecognized Snowball parameter: \"%s\"",
     237             :                             defel->defname)));
     238             :         }
     239             :     }
     240             : 
     241          28 :     if (!d->stem)
     242           0 :         ereport(ERROR,
     243             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     244             :                  errmsg("missing Language parameter")));
     245             : 
     246          28 :     d->dictCtx = CurrentMemoryContext;
     247             : 
     248          28 :     PG_RETURN_POINTER(d);
     249             : }
     250             : 
     251             : Datum
     252        5604 : dsnowball_lexize(PG_FUNCTION_ARGS)
     253             : {
     254        5604 :     DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
     255        5604 :     char       *in = (char *) PG_GETARG_POINTER(1);
     256        5604 :     int32       len = PG_GETARG_INT32(2);
     257        5604 :     char       *txt = lowerstr_with_len(in, len);
     258        5604 :     TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
     259             : 
     260        5604 :     if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
     261             :     {
     262        1832 :         pfree(txt);
     263             :     }
     264             :     else
     265             :     {
     266             :         MemoryContext saveCtx;
     267             : 
     268             :         /*
     269             :          * recode to utf8 if stemmer is utf8 and doesn't match server encoding
     270             :          */
     271        3772 :         if (d->needrecode)
     272             :         {
     273             :             char       *recoded;
     274             : 
     275           0 :             recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
     276           0 :             if (recoded != txt)
     277             :             {
     278           0 :                 pfree(txt);
     279           0 :                 txt = recoded;
     280             :             }
     281             :         }
     282             : 
     283             :         /* see comment about d->dictCtx */
     284        3772 :         saveCtx = MemoryContextSwitchTo(d->dictCtx);
     285        3772 :         SN_set_current(d->z, strlen(txt), (symbol *) txt);
     286        3772 :         d->stem(d->z);
     287        3772 :         MemoryContextSwitchTo(saveCtx);
     288             : 
     289        3772 :         if (d->z->p && d->z->l)
     290             :         {
     291        3772 :             txt = repalloc(txt, d->z->l + 1);
     292        3772 :             memcpy(txt, d->z->p, d->z->l);
     293        3772 :             txt[d->z->l] = '\0';
     294             :         }
     295             : 
     296             :         /* back recode if needed */
     297        3772 :         if (d->needrecode)
     298             :         {
     299             :             char       *recoded;
     300             : 
     301           0 :             recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
     302           0 :             if (recoded != txt)
     303             :             {
     304           0 :                 pfree(txt);
     305           0 :                 txt = recoded;
     306             :             }
     307             :         }
     308             : 
     309        3772 :         res->lexeme = txt;
     310             :     }
     311             : 
     312        5604 :     PG_RETURN_POINTER(res);
     313             : }

Generated by: LCOV version 1.13