Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regc_pg_locale.c
4 : * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 : * and functions to cache the results of wholesale ctype probing.
6 : *
7 : * This file is #included by regcomp.c; it's not meant to compile standalone.
8 : *
9 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : * IDENTIFICATION
13 : * src/backend/regex/regc_pg_locale.c
14 : *
15 : *-------------------------------------------------------------------------
16 : */
17 :
18 : #include "catalog/pg_collation.h"
19 : #include "common/unicode_case.h"
20 : #include "common/unicode_category.h"
21 : #include "utils/pg_locale.h"
22 : #include "utils/pg_locale_c.h"
23 :
24 : static pg_locale_t pg_regex_locale;
25 :
26 :
27 : /*
28 : * pg_set_regex_collation: set collation for these functions to obey
29 : *
30 : * This is called when beginning compilation or execution of a regexp.
31 : * Since there's no need for reentrancy of regexp operations, it's okay
32 : * to store the results in static variables.
33 : */
34 : void
35 8042396 : pg_set_regex_collation(Oid collation)
36 : {
37 8042396 : pg_locale_t locale = 0;
38 :
39 8042396 : if (!OidIsValid(collation))
40 : {
41 : /*
42 : * This typically means that the parser could not resolve a conflict
43 : * of implicit collations, so report it that way.
44 : */
45 0 : ereport(ERROR,
46 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
47 : errmsg("could not determine which collation to use for regular expression"),
48 : errhint("Use the COLLATE clause to set the collation explicitly.")));
49 : }
50 :
51 8042396 : locale = pg_newlocale_from_collation(collation);
52 :
53 8042396 : if (!locale->deterministic)
54 24 : ereport(ERROR,
55 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
56 : errmsg("nondeterministic collations are not supported for regular expressions")));
57 :
58 8042372 : pg_regex_locale = locale;
59 8042372 : }
60 :
61 : /*
62 : * The following functions overlap with those defined in pg_locale.c. XXX:
63 : * consider refactor.
64 : */
65 :
66 : static int
67 186910 : regc_wc_isdigit(pg_wchar c)
68 : {
69 186910 : if (pg_regex_locale->ctype_is_c)
70 4260 : return (c <= (pg_wchar) 127 &&
71 2130 : (pg_char_properties[c] & PG_ISDIGIT));
72 : else
73 184780 : return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale);
74 : }
75 :
76 : static int
77 29718 : regc_wc_isalpha(pg_wchar c)
78 : {
79 29718 : if (pg_regex_locale->ctype_is_c)
80 1536 : return (c <= (pg_wchar) 127 &&
81 768 : (pg_char_properties[c] & PG_ISALPHA));
82 : else
83 28950 : return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale);
84 : }
85 :
86 : static int
87 94986 : regc_wc_isalnum(pg_wchar c)
88 : {
89 94986 : if (pg_regex_locale->ctype_is_c)
90 1524 : return (c <= (pg_wchar) 127 &&
91 762 : (pg_char_properties[c] & PG_ISALNUM));
92 : else
93 94224 : return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale);
94 : }
95 :
96 : static int
97 37634 : regc_wc_isword(pg_wchar c)
98 : {
99 : /* We define word characters as alnum class plus underscore */
100 37634 : if (c == CHR('_'))
101 24 : return 1;
102 37610 : return regc_wc_isalnum(c);
103 : }
104 :
105 : static int
106 40976 : regc_wc_isupper(pg_wchar c)
107 : {
108 40976 : if (pg_regex_locale->ctype_is_c)
109 0 : return (c <= (pg_wchar) 127 &&
110 0 : (pg_char_properties[c] & PG_ISUPPER));
111 : else
112 40976 : return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale);
113 : }
114 :
115 : static int
116 16390 : regc_wc_islower(pg_wchar c)
117 : {
118 16390 : if (pg_regex_locale->ctype_is_c)
119 0 : return (c <= (pg_wchar) 127 &&
120 0 : (pg_char_properties[c] & PG_ISLOWER));
121 : else
122 16390 : return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale);
123 : }
124 :
125 : static int
126 16390 : regc_wc_isgraph(pg_wchar c)
127 : {
128 16390 : if (pg_regex_locale->ctype_is_c)
129 0 : return (c <= (pg_wchar) 127 &&
130 0 : (pg_char_properties[c] & PG_ISGRAPH));
131 : else
132 16390 : return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale);
133 : }
134 :
135 : static int
136 16390 : regc_wc_isprint(pg_wchar c)
137 : {
138 16390 : if (pg_regex_locale->ctype_is_c)
139 0 : return (c <= (pg_wchar) 127 &&
140 0 : (pg_char_properties[c] & PG_ISPRINT));
141 : else
142 16390 : return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale);
143 : }
144 :
145 : static int
146 40966 : regc_wc_ispunct(pg_wchar c)
147 : {
148 40966 : if (pg_regex_locale->ctype_is_c)
149 0 : return (c <= (pg_wchar) 127 &&
150 0 : (pg_char_properties[c] & PG_ISPUNCT));
151 : else
152 40966 : return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale);
153 : }
154 :
155 : static int
156 76386 : regc_wc_isspace(pg_wchar c)
157 : {
158 76386 : if (pg_regex_locale->ctype_is_c)
159 0 : return (c <= (pg_wchar) 127 &&
160 0 : (pg_char_properties[c] & PG_ISSPACE));
161 : else
162 76386 : return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale);
163 : }
164 :
165 : static pg_wchar
166 10702 : regc_wc_toupper(pg_wchar c)
167 : {
168 10702 : if (pg_regex_locale->ctype_is_c)
169 : {
170 978 : if (c <= (pg_wchar) 127)
171 978 : return pg_ascii_toupper((unsigned char) c);
172 0 : return c;
173 : }
174 : else
175 9724 : return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale);
176 : }
177 :
178 : static pg_wchar
179 10706 : regc_wc_tolower(pg_wchar c)
180 : {
181 10706 : if (pg_regex_locale->ctype_is_c)
182 : {
183 978 : if (c <= (pg_wchar) 127)
184 978 : return pg_ascii_tolower((unsigned char) c);
185 0 : return c;
186 : }
187 : else
188 9728 : return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale);
189 : }
190 :
191 :
192 : /*
193 : * These functions cache the results of probing libc's ctype behavior for
194 : * all character codes of interest in a given encoding/collation. The
195 : * result is provided as a "struct cvec", but notice that the representation
196 : * is a touch different from a cvec created by regc_cvec.c: we allocate the
197 : * chrs[] and ranges[] arrays separately from the struct so that we can
198 : * realloc them larger at need. This is okay since the cvecs made here
199 : * should never be freed by freecvec().
200 : *
201 : * We use malloc not palloc since we mustn't lose control on out-of-memory;
202 : * the main regex code expects us to return a failure indication instead.
203 : */
204 :
205 : typedef int (*regc_wc_probefunc) (pg_wchar c);
206 :
207 : typedef struct pg_ctype_cache
208 : {
209 : regc_wc_probefunc probefunc; /* regc_wc_isalpha or a sibling */
210 : pg_locale_t locale; /* locale this entry is for */
211 : struct cvec cv; /* cache entry contents */
212 : struct pg_ctype_cache *next; /* chain link */
213 : } pg_ctype_cache;
214 :
215 : static pg_ctype_cache *pg_ctype_cache_list = NULL;
216 :
217 : /*
218 : * Add a chr or range to pcc->cv; return false if run out of memory
219 : */
220 : static bool
221 11794 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
222 : {
223 : chr *newchrs;
224 :
225 11794 : if (nchrs > 1)
226 : {
227 3720 : if (pcc->cv.nranges >= pcc->cv.rangespace)
228 : {
229 0 : pcc->cv.rangespace *= 2;
230 0 : newchrs = (chr *) realloc(pcc->cv.ranges,
231 0 : pcc->cv.rangespace * sizeof(chr) * 2);
232 0 : if (newchrs == NULL)
233 0 : return false;
234 0 : pcc->cv.ranges = newchrs;
235 : }
236 3720 : pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
237 3720 : pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
238 3720 : pcc->cv.nranges++;
239 : }
240 : else
241 : {
242 : assert(nchrs == 1);
243 8074 : if (pcc->cv.nchrs >= pcc->cv.chrspace)
244 : {
245 28 : pcc->cv.chrspace *= 2;
246 28 : newchrs = (chr *) realloc(pcc->cv.chrs,
247 28 : pcc->cv.chrspace * sizeof(chr));
248 28 : if (newchrs == NULL)
249 0 : return false;
250 28 : pcc->cv.chrs = newchrs;
251 : }
252 8074 : pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
253 : }
254 11794 : return true;
255 : }
256 :
257 : /*
258 : * Given a probe function (e.g., regc_wc_isalpha) get a struct cvec for all
259 : * chrs satisfying the probe function. The active collation is the one
260 : * previously set by pg_set_regex_collation. Return NULL if out of memory.
261 : *
262 : * Note that the result must not be freed or modified by caller.
263 : */
264 : static struct cvec *
265 878 : regc_ctype_get_cache(regc_wc_probefunc probefunc, int cclasscode)
266 : {
267 : pg_ctype_cache *pcc;
268 : pg_wchar max_chr;
269 : pg_wchar cur_chr;
270 : int nmatches;
271 : chr *newchrs;
272 :
273 : /*
274 : * Do we already have the answer cached?
275 : */
276 2040 : for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
277 : {
278 1762 : if (pcc->probefunc == probefunc &&
279 672 : pcc->locale == pg_regex_locale)
280 600 : return &pcc->cv;
281 : }
282 :
283 : /*
284 : * Nope, so initialize some workspace ...
285 : */
286 278 : pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
287 278 : if (pcc == NULL)
288 0 : return NULL;
289 278 : pcc->probefunc = probefunc;
290 278 : pcc->locale = pg_regex_locale;
291 278 : pcc->cv.nchrs = 0;
292 278 : pcc->cv.chrspace = 128;
293 278 : pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
294 278 : pcc->cv.nranges = 0;
295 278 : pcc->cv.rangespace = 64;
296 278 : pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
297 278 : if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
298 0 : goto out_of_memory;
299 278 : pcc->cv.cclasscode = cclasscode;
300 :
301 : /*
302 : * Decide how many character codes we ought to look through. In general
303 : * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
304 : * runtime using the "high colormap" mechanism. However, in C locale
305 : * there's no need to go further than 127, and if we only have a 1-byte
306 : * <ctype.h> API there's no need to go further than that can handle.
307 : *
308 : * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
309 : * output cvec as not having any locale-dependent behavior, since there
310 : * will be no need to do any run-time locale checks. (The #if's here
311 : * would always be true for production values of MAX_SIMPLE_CHR, but it's
312 : * useful to allow it to be small for testing purposes.)
313 : */
314 278 : if (pg_regex_locale->ctype_is_c)
315 : {
316 : #if MAX_SIMPLE_CHR >= 127
317 28 : max_chr = (pg_wchar) 127;
318 28 : pcc->cv.cclasscode = -1;
319 : #else
320 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
321 : #endif
322 : }
323 : else
324 : {
325 250 : if (pg_regex_locale->ctype->max_chr != 0 &&
326 0 : pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR)
327 : {
328 0 : max_chr = pg_regex_locale->ctype->max_chr;
329 0 : pcc->cv.cclasscode = -1;
330 : }
331 : else
332 250 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
333 : }
334 :
335 : /*
336 : * And scan 'em ...
337 : */
338 278 : nmatches = 0; /* number of consecutive matches */
339 :
340 515862 : for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
341 : {
342 515584 : if ((*probefunc) (cur_chr))
343 142080 : nmatches++;
344 373504 : else if (nmatches > 0)
345 : {
346 11770 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
347 0 : goto out_of_memory;
348 11770 : nmatches = 0;
349 : }
350 : }
351 :
352 278 : if (nmatches > 0)
353 24 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
354 0 : goto out_of_memory;
355 :
356 : /*
357 : * We might have allocated more memory than needed, if so free it
358 : */
359 278 : if (pcc->cv.nchrs == 0)
360 : {
361 112 : free(pcc->cv.chrs);
362 112 : pcc->cv.chrs = NULL;
363 112 : pcc->cv.chrspace = 0;
364 : }
365 166 : else if (pcc->cv.nchrs < pcc->cv.chrspace)
366 : {
367 166 : newchrs = (chr *) realloc(pcc->cv.chrs,
368 166 : pcc->cv.nchrs * sizeof(chr));
369 166 : if (newchrs == NULL)
370 0 : goto out_of_memory;
371 166 : pcc->cv.chrs = newchrs;
372 166 : pcc->cv.chrspace = pcc->cv.nchrs;
373 : }
374 278 : if (pcc->cv.nranges == 0)
375 : {
376 0 : free(pcc->cv.ranges);
377 0 : pcc->cv.ranges = NULL;
378 0 : pcc->cv.rangespace = 0;
379 : }
380 278 : else if (pcc->cv.nranges < pcc->cv.rangespace)
381 : {
382 278 : newchrs = (chr *) realloc(pcc->cv.ranges,
383 278 : pcc->cv.nranges * sizeof(chr) * 2);
384 278 : if (newchrs == NULL)
385 0 : goto out_of_memory;
386 278 : pcc->cv.ranges = newchrs;
387 278 : pcc->cv.rangespace = pcc->cv.nranges;
388 : }
389 :
390 : /*
391 : * Success, link it into cache chain
392 : */
393 278 : pcc->next = pg_ctype_cache_list;
394 278 : pg_ctype_cache_list = pcc;
395 :
396 278 : return &pcc->cv;
397 :
398 : /*
399 : * Failure, clean up
400 : */
401 0 : out_of_memory:
402 0 : free(pcc->cv.chrs);
403 0 : free(pcc->cv.ranges);
404 0 : free(pcc);
405 :
406 0 : return NULL;
407 : }
|