Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dict_snowball.c
4 : * Snowball dictionary
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/backend/snowball/dict_snowball.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "postgres.h"
14 :
15 : #include "catalog/pg_collation_d.h"
16 : #include "commands/defrem.h"
17 : #include "mb/pg_wchar.h"
18 : #include "tsearch/ts_public.h"
19 : #include "utils/formatting.h"
20 :
21 : /* Some platforms define MAXINT and/or MININT, causing conflicts */
22 : #ifdef MAXINT
23 : #undef MAXINT
24 : #endif
25 : #ifdef MININT
26 : #undef MININT
27 : #endif
28 :
29 : /* Now we can include the original Snowball header.h */
30 : #include "snowball/libstemmer/header.h"
31 : #include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
32 : #include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
33 : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
34 : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
35 : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
36 : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
37 : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
38 : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
39 : #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
40 : #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
41 : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
42 : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
43 : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
44 : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
45 : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
46 : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
47 : #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
48 : #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
49 : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
50 : #include "snowball/libstemmer/stem_UTF_8_arabic.h"
51 : #include "snowball/libstemmer/stem_UTF_8_armenian.h"
52 : #include "snowball/libstemmer/stem_UTF_8_basque.h"
53 : #include "snowball/libstemmer/stem_UTF_8_catalan.h"
54 : #include "snowball/libstemmer/stem_UTF_8_danish.h"
55 : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
56 : #include "snowball/libstemmer/stem_UTF_8_english.h"
57 : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
58 : #include "snowball/libstemmer/stem_UTF_8_french.h"
59 : #include "snowball/libstemmer/stem_UTF_8_german.h"
60 : #include "snowball/libstemmer/stem_UTF_8_greek.h"
61 : #include "snowball/libstemmer/stem_UTF_8_hindi.h"
62 : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
63 : #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
64 : #include "snowball/libstemmer/stem_UTF_8_irish.h"
65 : #include "snowball/libstemmer/stem_UTF_8_italian.h"
66 : #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
67 : #include "snowball/libstemmer/stem_UTF_8_nepali.h"
68 : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
69 : #include "snowball/libstemmer/stem_UTF_8_porter.h"
70 : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
71 : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
72 : #include "snowball/libstemmer/stem_UTF_8_russian.h"
73 : #include "snowball/libstemmer/stem_UTF_8_serbian.h"
74 : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
75 : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
76 : #include "snowball/libstemmer/stem_UTF_8_tamil.h"
77 : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
78 : #include "snowball/libstemmer/stem_UTF_8_yiddish.h"
79 :
/*
 * Module magic block, followed by the function-info records for the two
 * entry points defined later in this file.
 */
PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(dsnowball_init);
PG_FUNCTION_INFO_V1(dsnowball_lexize);
85 :
86 : /* List of supported modules */
87 : typedef struct stemmer_module
88 : {
89 : const char *name;
90 : pg_enc enc;
91 : struct SN_env *(*create) (void);
92 : void (*close) (struct SN_env *);
93 : int (*stem) (struct SN_env *);
94 : } stemmer_module;
95 :
/*
 * Build one stemmer_module initializer.
 *
 * lang:	stemmer name (also stringified to become the entry's name)
 * pgenc:	PostgreSQL code for the encoding
 * snowenc: Snowball's name for the encoding, used to form the symbol names
 */
#define STEMMER_MODULE(lang, pgenc, snowenc) \
	{ \
		#lang, \
		pgenc, \
		lang##_##snowenc##_create_env, \
		lang##_##snowenc##_close_env, \
		lang##_##snowenc##_stem \
	}
99 :
100 : static const stemmer_module stemmer_modules[] =
101 : {
102 : /*
103 : * Stemmers list from Snowball distribution
104 : */
105 : STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
106 : STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
107 : STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
108 : STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
109 : STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
110 : STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
111 : STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
112 : STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
113 : STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
114 : STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
115 : STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
116 : STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
117 : STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
118 : STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
119 : STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
120 : STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
121 : STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
122 : STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
123 : STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
124 : STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
125 : STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
126 : STEMMER_MODULE(basque, PG_UTF8, UTF_8),
127 : STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
128 : STEMMER_MODULE(danish, PG_UTF8, UTF_8),
129 : STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
130 : STEMMER_MODULE(english, PG_UTF8, UTF_8),
131 : STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
132 : STEMMER_MODULE(french, PG_UTF8, UTF_8),
133 : STEMMER_MODULE(german, PG_UTF8, UTF_8),
134 : STEMMER_MODULE(greek, PG_UTF8, UTF_8),
135 : STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
136 : STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
137 : STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
138 : STEMMER_MODULE(irish, PG_UTF8, UTF_8),
139 : STEMMER_MODULE(italian, PG_UTF8, UTF_8),
140 : STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
141 : STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
142 : STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
143 : STEMMER_MODULE(porter, PG_UTF8, UTF_8),
144 : STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
145 : STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
146 : STEMMER_MODULE(russian, PG_UTF8, UTF_8),
147 : STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
148 : STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
149 : STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
150 : STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
151 : STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
152 : STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
153 :
154 : /*
155 : * Stemmer with PG_SQL_ASCII encoding should be valid for any server
156 : * encoding
157 : */
158 : STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
159 :
160 : {NULL, 0, NULL, NULL, NULL} /* list end marker */
161 : };
162 :
163 :
164 : typedef struct DictSnowball
165 : {
166 : struct SN_env *z;
167 : StopList stoplist;
168 : bool needrecode; /* needs recoding before/after call stem */
169 : int (*stem) (struct SN_env *z);
170 :
171 : /*
172 : * snowball saves alloced memory between calls, so we should run it in our
173 : * private memory context. Note, init function is executed in long lived
174 : * context, so we just remember CurrentMemoryContext
175 : */
176 : MemoryContext dictCtx;
177 : } DictSnowball;
178 :
179 :
180 : static void
181 38 : locate_stem_module(DictSnowball *d, const char *lang)
182 : {
183 : const stemmer_module *m;
184 :
185 : /*
186 : * First, try to find exact match of stemmer module. Stemmer with
187 : * PG_SQL_ASCII encoding is treated as working with any server encoding
188 : */
189 988 : for (m = stemmer_modules; m->name; m++)
190 : {
191 1254 : if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
192 266 : pg_strcasecmp(m->name, lang) == 0)
193 : {
194 38 : d->stem = m->stem;
195 38 : d->z = m->create();
196 38 : d->needrecode = false;
197 38 : return;
198 : }
199 : }
200 :
201 : /*
202 : * Second, try to find stemmer for needed language for UTF8 encoding.
203 : */
204 0 : for (m = stemmer_modules; m->name; m++)
205 : {
206 0 : if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
207 : {
208 0 : d->stem = m->stem;
209 0 : d->z = m->create();
210 0 : d->needrecode = true;
211 0 : return;
212 : }
213 : }
214 :
215 0 : ereport(ERROR,
216 : (errcode(ERRCODE_UNDEFINED_OBJECT),
217 : errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
218 : lang, GetDatabaseEncodingName())));
219 : }
220 :
221 : Datum
222 38 : dsnowball_init(PG_FUNCTION_ARGS)
223 : {
224 38 : List *dictoptions = (List *) PG_GETARG_POINTER(0);
225 : DictSnowball *d;
226 38 : bool stoploaded = false;
227 : ListCell *l;
228 :
229 38 : d = (DictSnowball *) palloc0(sizeof(DictSnowball));
230 :
231 114 : foreach(l, dictoptions)
232 : {
233 76 : DefElem *defel = (DefElem *) lfirst(l);
234 :
235 76 : if (strcmp(defel->defname, "stopwords") == 0)
236 : {
237 38 : if (stoploaded)
238 0 : ereport(ERROR,
239 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
240 : errmsg("multiple StopWords parameters")));
241 38 : readstoplist(defGetString(defel), &d->stoplist, str_tolower);
242 38 : stoploaded = true;
243 : }
244 38 : else if (strcmp(defel->defname, "language") == 0)
245 : {
246 38 : if (d->stem)
247 0 : ereport(ERROR,
248 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
249 : errmsg("multiple Language parameters")));
250 38 : locate_stem_module(d, defGetString(defel));
251 : }
252 : else
253 : {
254 0 : ereport(ERROR,
255 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
256 : errmsg("unrecognized Snowball parameter: \"%s\"",
257 : defel->defname)));
258 : }
259 : }
260 :
261 38 : if (!d->stem)
262 0 : ereport(ERROR,
263 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
264 : errmsg("missing Language parameter")));
265 :
266 38 : d->dictCtx = CurrentMemoryContext;
267 :
268 38 : PG_RETURN_POINTER(d);
269 : }
270 :
271 : Datum
272 10270 : dsnowball_lexize(PG_FUNCTION_ARGS)
273 : {
274 10270 : DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
275 10270 : char *in = (char *) PG_GETARG_POINTER(1);
276 10270 : int32 len = PG_GETARG_INT32(2);
277 10270 : char *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
278 10270 : TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
279 :
280 : /*
281 : * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
282 : * surely not words in any human language. This restriction avoids
283 : * wasting cycles on stuff like base64-encoded data, and it protects us
284 : * against possible inefficiency or misbehavior in the stemmer. (For
285 : * example, the Turkish stemmer has an indefinite recursion, so it can
286 : * crash on long-enough strings.) However, Snowball dictionaries are
287 : * defined to recognize all strings, so we can't reject the string as an
288 : * unknown word.
289 : */
290 10270 : if (len > 1000)
291 : {
292 : /* return the lexeme lowercased, but otherwise unmodified */
293 0 : res->lexeme = txt;
294 : }
295 10270 : else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
296 : {
297 : /* empty or stopword, so report as stopword */
298 3468 : pfree(txt);
299 : }
300 : else
301 : {
302 : MemoryContext saveCtx;
303 :
304 : /*
305 : * recode to utf8 if stemmer is utf8 and doesn't match server encoding
306 : */
307 6802 : if (d->needrecode)
308 : {
309 : char *recoded;
310 :
311 0 : recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
312 0 : if (recoded != txt)
313 : {
314 0 : pfree(txt);
315 0 : txt = recoded;
316 : }
317 : }
318 :
319 : /* see comment about d->dictCtx */
320 6802 : saveCtx = MemoryContextSwitchTo(d->dictCtx);
321 6802 : SN_set_current(d->z, strlen(txt), (symbol *) txt);
322 6802 : d->stem(d->z);
323 6802 : MemoryContextSwitchTo(saveCtx);
324 :
325 6802 : if (d->z->p && d->z->l)
326 : {
327 6802 : txt = repalloc(txt, d->z->l + 1);
328 6802 : memcpy(txt, d->z->p, d->z->l);
329 6802 : txt[d->z->l] = '\0';
330 : }
331 :
332 : /* back recode if needed */
333 6802 : if (d->needrecode)
334 : {
335 : char *recoded;
336 :
337 0 : recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
338 0 : if (recoded != txt)
339 : {
340 0 : pfree(txt);
341 0 : txt = recoded;
342 : }
343 : }
344 :
345 6802 : res->lexeme = txt;
346 : }
347 :
348 10270 : PG_RETURN_POINTER(res);
349 : }
|