LCOV - code coverage report
Current view: top level - src/backend/snowball - dict_snowball.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 47 66 71.2 %
Date: 2019-08-24 07:07:03 Functions: 6 6 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * dict_snowball.c
       4             :  *      Snowball dictionary
       5             :  *
       6             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/backend/snowball/dict_snowball.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "postgres.h"
      14             : 
      15             : #include "commands/defrem.h"
      16             : #include "tsearch/ts_locale.h"
      17             : #include "tsearch/ts_utils.h"
      18             : 
      19             : /* Some platforms define MAXINT and/or MININT, causing conflicts */
      20             : #ifdef MAXINT
      21             : #undef MAXINT
      22             : #endif
      23             : #ifdef MININT
      24             : #undef MININT
      25             : #endif
      26             : 
      27             : /* Now we can include the original Snowball header.h */
      28             : #include "snowball/libstemmer/header.h"
      29             : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
      30             : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
      31             : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
      32             : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
      33             : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
      34             : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
      35             : #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
      36             : #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
      37             : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
      38             : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
      39             : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
      40             : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
      41             : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
      42             : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
      43             : #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
      44             : #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
      45             : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
      46             : #include "snowball/libstemmer/stem_UTF_8_arabic.h"
      47             : #include "snowball/libstemmer/stem_UTF_8_danish.h"
      48             : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
      49             : #include "snowball/libstemmer/stem_UTF_8_english.h"
      50             : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
      51             : #include "snowball/libstemmer/stem_UTF_8_french.h"
      52             : #include "snowball/libstemmer/stem_UTF_8_german.h"
      53             : #include "snowball/libstemmer/stem_UTF_8_greek.h"
      54             : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
      55             : #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
      56             : #include "snowball/libstemmer/stem_UTF_8_irish.h"
      57             : #include "snowball/libstemmer/stem_UTF_8_italian.h"
      58             : #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
      59             : #include "snowball/libstemmer/stem_UTF_8_nepali.h"
      60             : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
      61             : #include "snowball/libstemmer/stem_UTF_8_porter.h"
      62             : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
      63             : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
      64             : #include "snowball/libstemmer/stem_UTF_8_russian.h"
      65             : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
      66             : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
      67             : #include "snowball/libstemmer/stem_UTF_8_tamil.h"
      68             : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
      69             : 
      70         342 : PG_MODULE_MAGIC;
      71             : 
      72         342 : PG_FUNCTION_INFO_V1(dsnowball_init);
      73             : 
      74         342 : PG_FUNCTION_INFO_V1(dsnowball_lexize);
      75             : 
      76             : /* List of supported modules */
      77             : typedef struct stemmer_module
      78             : {
      79             :     const char *name;
      80             :     pg_enc      enc;
      81             :     struct SN_env *(*create) (void);
      82             :     void        (*close) (struct SN_env *);
      83             :     int         (*stem) (struct SN_env *);
      84             : } stemmer_module;
      85             : 
      86             : /* Args: stemmer name, PG code for encoding, Snowball's name for encoding */
      87             : #define STEMMER_MODULE(name,enc,senc) \
      88             :     {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
      89             : 
      90             : static const stemmer_module stemmer_modules[] =
      91             : {
      92             :     /*
      93             :      * Stemmers list from Snowball distribution
      94             :      */
      95             :     STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
      96             :     STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
      97             :     STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
      98             :     STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
      99             :     STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
     100             :     STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
     101             :     STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
     102             :     STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
     103             :     STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
     104             :     STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
     105             :     STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
     106             :     STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
     107             :     STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
     108             :     STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
     109             :     STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
     110             :     STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
     111             :     STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
     112             :     STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
     113             :     STEMMER_MODULE(danish, PG_UTF8, UTF_8),
     114             :     STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
     115             :     STEMMER_MODULE(english, PG_UTF8, UTF_8),
     116             :     STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
     117             :     STEMMER_MODULE(french, PG_UTF8, UTF_8),
     118             :     STEMMER_MODULE(german, PG_UTF8, UTF_8),
     119             :     STEMMER_MODULE(greek, PG_UTF8, UTF_8),
     120             :     STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
     121             :     STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
     122             :     STEMMER_MODULE(irish, PG_UTF8, UTF_8),
     123             :     STEMMER_MODULE(italian, PG_UTF8, UTF_8),
     124             :     STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
     125             :     STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
     126             :     STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
     127             :     STEMMER_MODULE(porter, PG_UTF8, UTF_8),
     128             :     STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
     129             :     STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
     130             :     STEMMER_MODULE(russian, PG_UTF8, UTF_8),
     131             :     STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
     132             :     STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
     133             :     STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
     134             :     STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
     135             : 
     136             :     /*
     137             :      * A stemmer with PG_SQL_ASCII encoding should be valid for any server
     138             :      * encoding
     139             :      */
     140             :     STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
     141             : 
     142             :     {NULL, 0, NULL, NULL, NULL} /* list end marker */
     143             : };
     144             : 
     145             : 
     146             : typedef struct DictSnowball
     147             : {
     148             :     struct SN_env *z;
     149             :     StopList    stoplist;
     150             :     bool        needrecode;     /* needs recoding before/after call stem */
     151             :     int         (*stem) (struct SN_env *z);
     152             : 
     153             :     /*
     154             :      * Snowball keeps allocated memory between calls, so we must run it in
     155             :      * our private memory context. Note that the init function is executed in
     156             :      * a long-lived context, so we just remember CurrentMemoryContext.
     157             :      */
     158             :     MemoryContext dictCtx;
     159             : } DictSnowball;
     160             : 
     161             : 
     162             : static void
     163          26 : locate_stem_module(DictSnowball *d, const char *lang)
     164             : {
     165             :     const stemmer_module *m;
     166             : 
     167             :     /*
     168             :      * First, try to find an exact match for the stemmer module. A stemmer with
     169             :      * PG_SQL_ASCII encoding is treated as working with any server encoding.
     170             :      */
     171         546 :     for (m = stemmer_modules; m->name; m++)
     172             :     {
     173         650 :         if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
     174         104 :             pg_strcasecmp(m->name, lang) == 0)
     175             :         {
     176          26 :             d->stem = m->stem;
     177          26 :             d->z = m->create();
     178          26 :             d->needrecode = false;
     179          26 :             return;
     180             :         }
     181             :     }
     182             : 
     183             :     /*
     184             :      * Second, try to find a stemmer for the requested language in UTF8 encoding.
     185             :      */
     186           0 :     for (m = stemmer_modules; m->name; m++)
     187             :     {
     188           0 :         if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
     189             :         {
     190           0 :             d->stem = m->stem;
     191           0 :             d->z = m->create();
     192           0 :             d->needrecode = true;
     193           0 :             return;
     194             :         }
     195             :     }
     196             : 
     197           0 :     ereport(ERROR,
     198             :             (errcode(ERRCODE_UNDEFINED_OBJECT),
     199             :              errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
     200             :                     lang, GetDatabaseEncodingName())));
     201             : }
     202             : 
     203             : Datum
     204          26 : dsnowball_init(PG_FUNCTION_ARGS)
     205             : {
     206          26 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     207             :     DictSnowball *d;
     208          26 :     bool        stoploaded = false;
     209             :     ListCell   *l;
     210             : 
     211          26 :     d = (DictSnowball *) palloc0(sizeof(DictSnowball));
     212             : 
     213          78 :     foreach(l, dictoptions)
     214             :     {
     215          52 :         DefElem    *defel = (DefElem *) lfirst(l);
     216             : 
     217          52 :         if (strcmp(defel->defname, "stopwords") == 0)
     218             :         {
     219          26 :             if (stoploaded)
     220           0 :                 ereport(ERROR,
     221             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     222             :                          errmsg("multiple StopWords parameters")));
     223          26 :             readstoplist(defGetString(defel), &d->stoplist, lowerstr);
     224          26 :             stoploaded = true;
     225             :         }
     226          26 :         else if (strcmp(defel->defname, "language") == 0)
     227             :         {
     228          26 :             if (d->stem)
     229           0 :                 ereport(ERROR,
     230             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     231             :                          errmsg("multiple Language parameters")));
     232          26 :             locate_stem_module(d, defGetString(defel));
     233             :         }
     234             :         else
     235             :         {
     236           0 :             ereport(ERROR,
     237             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     238             :                      errmsg("unrecognized Snowball parameter: \"%s\"",
     239             :                             defel->defname)));
     240             :         }
     241             :     }
     242             : 
     243          26 :     if (!d->stem)
     244           0 :         ereport(ERROR,
     245             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     246             :                  errmsg("missing Language parameter")));
     247             : 
     248          26 :     d->dictCtx = CurrentMemoryContext;
     249             : 
     250          26 :     PG_RETURN_POINTER(d);
     251             : }
     252             : 
     253             : Datum
     254        5608 : dsnowball_lexize(PG_FUNCTION_ARGS)
     255             : {
     256        5608 :     DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
     257        5608 :     char       *in = (char *) PG_GETARG_POINTER(1);
     258        5608 :     int32       len = PG_GETARG_INT32(2);
     259        5608 :     char       *txt = lowerstr_with_len(in, len);
     260        5608 :     TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
     261             : 
     262        5608 :     if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
     263             :     {
     264        1832 :         pfree(txt);
     265             :     }
     266             :     else
     267             :     {
     268             :         MemoryContext saveCtx;
     269             : 
     270             :         /*
     271             :          * recode to UTF8 if the stemmer is UTF8 and doesn't match the server encoding
     272             :          */
     273        3776 :         if (d->needrecode)
     274             :         {
     275             :             char       *recoded;
     276             : 
     277           0 :             recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
     278           0 :             if (recoded != txt)
     279             :             {
     280           0 :                 pfree(txt);
     281           0 :                 txt = recoded;
     282             :             }
     283             :         }
     284             : 
     285             :         /* see comment about d->dictCtx */
     286        3776 :         saveCtx = MemoryContextSwitchTo(d->dictCtx);
     287        3776 :         SN_set_current(d->z, strlen(txt), (symbol *) txt);
     288        3776 :         d->stem(d->z);
     289        3776 :         MemoryContextSwitchTo(saveCtx);
     290             : 
     291        3776 :         if (d->z->p && d->z->l)
     292             :         {
     293        3776 :             txt = repalloc(txt, d->z->l + 1);
     294        3776 :             memcpy(txt, d->z->p, d->z->l);
     295        3776 :             txt[d->z->l] = '\0';
     296             :         }
     297             : 
     298             :         /* recode back to the server encoding if needed */
     299        3776 :         if (d->needrecode)
     300             :         {
     301             :             char       *recoded;
     302             : 
     303           0 :             recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
     304           0 :             if (recoded != txt)
     305             :             {
     306           0 :                 pfree(txt);
     307           0 :                 txt = recoded;
     308             :             }
     309             :         }
     310             : 
     311        3776 :         res->lexeme = txt;
     312             :     }
     313             : 
     314        5608 :     PG_RETURN_POINTER(res);
     315             : }

Generated by: LCOV version 1.13