Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regc_pg_locale.c
4 : * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 : * and functions to cache the results of wholesale ctype probing.
6 : *
7 : * This file is #included by regcomp.c; it's not meant to compile standalone.
8 : *
9 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : * IDENTIFICATION
13 : * src/backend/regex/regc_pg_locale.c
14 : *
15 : *-------------------------------------------------------------------------
16 : */
17 :
18 : #include "catalog/pg_collation.h"
19 : #include "common/unicode_case.h"
20 : #include "common/unicode_category.h"
21 : #include "utils/pg_locale.h"
22 :
23 : /*
24 : * For the libc provider, to provide as much functionality as possible on a
25 : * variety of platforms without going so far as to implement everything from
26 : * scratch, we use several implementation strategies depending on the
27 : * situation:
28 : *
29 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
30 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
31 : * collations don't give a fig about multibyte characters.
32 : *
33 : * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
34 : * This assumes that every platform uses Unicode codepoints directly
35 : * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption
36 : * even for non-UTF8 encodings, which may be a problem.) On some platforms
37 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
38 : *
39 : * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
40 : * values up to 255, and punt for values above that. This is 100% correct
41 : * only in single-byte encodings such as LATINn. However, non-Unicode
42 : * multibyte encodings are mostly Far Eastern character sets for which the
43 : * properties being tested here aren't very relevant for higher code values
44 : * anyway. The difficulty with using the <wctype.h> functions with
45 : * non-Unicode multibyte encodings is that we can have no certainty that
46 : * the platform's wchar_t representation matches what we do in pg_wchar
47 : * conversions.
48 : *
49 : * As a special case, in the "default" collation, (2) and (3) force ASCII
50 : * letters to follow ASCII upcase/downcase rules, while in a non-default
51 : * collation we just let the library functions do what they will. The case
52 : * where this matters is treatment of I/i in Turkish, and the behavior is
53 : * meant to match the upper()/lower() SQL functions.
54 : *
55 : * We store the active collation setting in static variables. In principle
56 : * it could be passed down to here via the regex library's "struct vars" data
57 : * structure; but that would require somewhat invasive changes in the regex
58 : * library, and right now there's no real benefit to be gained from that.
59 : *
60 : * NB: the coding here assumes pg_wchar is an unsigned type.
61 : */
62 :
/*
 * Which implementation the pg_wc_* functions use for the currently active
 * regex collation.  The value is chosen by pg_set_regex_collation().
 */
typedef enum
{
	/* C locale (encoding independent) */
	PG_REGEX_STRATEGY_C,

	/* built-in Unicode semantics */
	PG_REGEX_STRATEGY_BUILTIN,

	/* use locale_t <wctype.h> functions */
	PG_REGEX_STRATEGY_LIBC_WIDE,

	/* use locale_t <ctype.h> functions */
	PG_REGEX_STRATEGY_LIBC_1BYTE,

	/* use ICU uchar.h functions */
	PG_REGEX_STRATEGY_ICU,
} PG_Locale_Strategy;
71 :
72 : static PG_Locale_Strategy pg_regex_strategy;
73 : static pg_locale_t pg_regex_locale;
74 :
75 : /*
76 : * Hard-wired character properties for C locale
77 : */
#define PG_ISDIGIT	0x01
#define PG_ISALPHA	0x02
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
#define PG_ISUPPER	0x04
#define PG_ISLOWER	0x08
#define PG_ISGRAPH	0x10
#define PG_ISPRINT	0x20
#define PG_ISPUNCT	0x40
#define PG_ISSPACE	0x80

/*
 * Shorthand for the property combinations that occur in the table.  These
 * exist only to keep the initializer readable and are #undef'd right after.
 */
#define PG_CPROP_PUNCT	(PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT)
#define PG_CPROP_DIGIT	(PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT)
#define PG_CPROP_UPPER	(PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT)
#define PG_CPROP_LOWER	(PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT)

/*
 * Property bits for each 7-bit ASCII code, indexed by character code.
 * Entries not listed (the non-whitespace control characters and DEL) are
 * implicitly zero, i.e. they have no properties.
 */
static const unsigned char pg_char_properties[128] = {
	/* whitespace control characters: ^I ^J ^K ^L ^M */
	['\t'] = PG_ISSPACE,
	['\n'] = PG_ISSPACE,
	['\v'] = PG_ISSPACE,
	['\f'] = PG_ISSPACE,
	['\r'] = PG_ISSPACE,

	/* the blank is printable whitespace, but not graphic */
	[' '] = PG_ISPRINT | PG_ISSPACE,

	/* punctuation between blank and the digits */
	['!'] = PG_CPROP_PUNCT, ['"'] = PG_CPROP_PUNCT, ['#'] = PG_CPROP_PUNCT,
	['$'] = PG_CPROP_PUNCT, ['%'] = PG_CPROP_PUNCT, ['&'] = PG_CPROP_PUNCT,
	['\''] = PG_CPROP_PUNCT, ['('] = PG_CPROP_PUNCT, [')'] = PG_CPROP_PUNCT,
	['*'] = PG_CPROP_PUNCT, ['+'] = PG_CPROP_PUNCT, [','] = PG_CPROP_PUNCT,
	['-'] = PG_CPROP_PUNCT, ['.'] = PG_CPROP_PUNCT, ['/'] = PG_CPROP_PUNCT,

	/* decimal digits */
	['0'] = PG_CPROP_DIGIT, ['1'] = PG_CPROP_DIGIT, ['2'] = PG_CPROP_DIGIT,
	['3'] = PG_CPROP_DIGIT, ['4'] = PG_CPROP_DIGIT, ['5'] = PG_CPROP_DIGIT,
	['6'] = PG_CPROP_DIGIT, ['7'] = PG_CPROP_DIGIT, ['8'] = PG_CPROP_DIGIT,
	['9'] = PG_CPROP_DIGIT,

	/* punctuation between the digits and the upper-case letters */
	[':'] = PG_CPROP_PUNCT, [';'] = PG_CPROP_PUNCT, ['<'] = PG_CPROP_PUNCT,
	['='] = PG_CPROP_PUNCT, ['>'] = PG_CPROP_PUNCT, ['?'] = PG_CPROP_PUNCT,
	['@'] = PG_CPROP_PUNCT,

	/* upper-case letters */
	['A'] = PG_CPROP_UPPER, ['B'] = PG_CPROP_UPPER, ['C'] = PG_CPROP_UPPER,
	['D'] = PG_CPROP_UPPER, ['E'] = PG_CPROP_UPPER, ['F'] = PG_CPROP_UPPER,
	['G'] = PG_CPROP_UPPER, ['H'] = PG_CPROP_UPPER, ['I'] = PG_CPROP_UPPER,
	['J'] = PG_CPROP_UPPER, ['K'] = PG_CPROP_UPPER, ['L'] = PG_CPROP_UPPER,
	['M'] = PG_CPROP_UPPER, ['N'] = PG_CPROP_UPPER, ['O'] = PG_CPROP_UPPER,
	['P'] = PG_CPROP_UPPER, ['Q'] = PG_CPROP_UPPER, ['R'] = PG_CPROP_UPPER,
	['S'] = PG_CPROP_UPPER, ['T'] = PG_CPROP_UPPER, ['U'] = PG_CPROP_UPPER,
	['V'] = PG_CPROP_UPPER, ['W'] = PG_CPROP_UPPER, ['X'] = PG_CPROP_UPPER,
	['Y'] = PG_CPROP_UPPER, ['Z'] = PG_CPROP_UPPER,

	/* punctuation between the upper-case and lower-case letters */
	['['] = PG_CPROP_PUNCT, ['\\'] = PG_CPROP_PUNCT, [']'] = PG_CPROP_PUNCT,
	['^'] = PG_CPROP_PUNCT, ['_'] = PG_CPROP_PUNCT, ['`'] = PG_CPROP_PUNCT,

	/* lower-case letters */
	['a'] = PG_CPROP_LOWER, ['b'] = PG_CPROP_LOWER, ['c'] = PG_CPROP_LOWER,
	['d'] = PG_CPROP_LOWER, ['e'] = PG_CPROP_LOWER, ['f'] = PG_CPROP_LOWER,
	['g'] = PG_CPROP_LOWER, ['h'] = PG_CPROP_LOWER, ['i'] = PG_CPROP_LOWER,
	['j'] = PG_CPROP_LOWER, ['k'] = PG_CPROP_LOWER, ['l'] = PG_CPROP_LOWER,
	['m'] = PG_CPROP_LOWER, ['n'] = PG_CPROP_LOWER, ['o'] = PG_CPROP_LOWER,
	['p'] = PG_CPROP_LOWER, ['q'] = PG_CPROP_LOWER, ['r'] = PG_CPROP_LOWER,
	['s'] = PG_CPROP_LOWER, ['t'] = PG_CPROP_LOWER, ['u'] = PG_CPROP_LOWER,
	['v'] = PG_CPROP_LOWER, ['w'] = PG_CPROP_LOWER, ['x'] = PG_CPROP_LOWER,
	['y'] = PG_CPROP_LOWER, ['z'] = PG_CPROP_LOWER,

	/* trailing punctuation */
	['{'] = PG_CPROP_PUNCT, ['|'] = PG_CPROP_PUNCT, ['}'] = PG_CPROP_PUNCT,
	['~'] = PG_CPROP_PUNCT,
};

#undef PG_CPROP_PUNCT
#undef PG_CPROP_DIGIT
#undef PG_CPROP_UPPER
#undef PG_CPROP_LOWER
218 :
219 :
220 : /*
221 : * pg_set_regex_collation: set collation for these functions to obey
222 : *
223 : * This is called when beginning compilation or execution of a regexp.
224 : * Since there's no need for reentrancy of regexp operations, it's okay
225 : * to store the results in static variables.
226 : */
227 : void
228 7957608 : pg_set_regex_collation(Oid collation)
229 : {
230 7957608 : pg_locale_t locale = 0;
231 : PG_Locale_Strategy strategy;
232 :
233 7957608 : if (!OidIsValid(collation))
234 : {
235 : /*
236 : * This typically means that the parser could not resolve a conflict
237 : * of implicit collations, so report it that way.
238 : */
239 0 : ereport(ERROR,
240 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
241 : errmsg("could not determine which collation to use for regular expression"),
242 : errhint("Use the COLLATE clause to set the collation explicitly.")));
243 : }
244 :
245 7957608 : if (collation == C_COLLATION_OID)
246 : {
247 : /*
248 : * Some callers expect regexes to work for C_COLLATION_OID before
249 : * catalog access is available, so we can't call
250 : * pg_newlocale_from_collation().
251 : */
252 127846 : strategy = PG_REGEX_STRATEGY_C;
253 127846 : locale = 0;
254 : }
255 : else
256 : {
257 7829762 : locale = pg_newlocale_from_collation(collation);
258 :
259 7829762 : if (!locale->deterministic)
260 24 : ereport(ERROR,
261 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
262 : errmsg("nondeterministic collations are not supported for regular expressions")));
263 :
264 7829738 : if (locale->ctype_is_c)
265 : {
266 : /*
267 : * C/POSIX collations use this path regardless of database
268 : * encoding
269 : */
270 284 : strategy = PG_REGEX_STRATEGY_C;
271 284 : locale = 0;
272 : }
273 7829454 : else if (locale->provider == COLLPROVIDER_BUILTIN)
274 : {
275 : Assert(GetDatabaseEncoding() == PG_UTF8);
276 2185254 : strategy = PG_REGEX_STRATEGY_BUILTIN;
277 : }
278 : #ifdef USE_ICU
279 5644200 : else if (locale->provider == COLLPROVIDER_ICU)
280 : {
281 942 : strategy = PG_REGEX_STRATEGY_ICU;
282 : }
283 : #endif
284 : else
285 : {
286 : Assert(locale->provider == COLLPROVIDER_LIBC);
287 5643258 : if (GetDatabaseEncoding() == PG_UTF8)
288 5643254 : strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
289 : else
290 4 : strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
291 : }
292 : }
293 :
294 7957584 : pg_regex_strategy = strategy;
295 7957584 : pg_regex_locale = locale;
296 7957584 : }
297 :
298 : static int
299 186918 : pg_wc_isdigit(pg_wchar c)
300 : {
301 186918 : switch (pg_regex_strategy)
302 : {
303 2130 : case PG_REGEX_STRATEGY_C:
304 4260 : return (c <= (pg_wchar) 127 &&
305 2130 : (pg_char_properties[c] & PG_ISDIGIT));
306 65660 : case PG_REGEX_STRATEGY_BUILTIN:
307 65660 : return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full);
308 106840 : case PG_REGEX_STRATEGY_LIBC_WIDE:
309 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
310 106840 : return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
311 : /* FALL THRU */
312 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
313 0 : return (c <= (pg_wchar) UCHAR_MAX &&
314 0 : isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
315 : break;
316 12288 : case PG_REGEX_STRATEGY_ICU:
317 : #ifdef USE_ICU
318 12288 : return u_isdigit(c);
319 : #endif
320 : break;
321 : }
322 0 : return 0; /* can't get here, but keep compiler quiet */
323 : }
324 :
325 : static int
326 17430 : pg_wc_isalpha(pg_wchar c)
327 : {
328 17430 : switch (pg_regex_strategy)
329 : {
330 768 : case PG_REGEX_STRATEGY_C:
331 1536 : return (c <= (pg_wchar) 127 &&
332 768 : (pg_char_properties[c] & PG_ISALPHA));
333 22 : case PG_REGEX_STRATEGY_BUILTIN:
334 22 : return pg_u_isalpha(c);
335 4352 : case PG_REGEX_STRATEGY_LIBC_WIDE:
336 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
337 4352 : return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
338 : /* FALL THRU */
339 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
340 0 : return (c <= (pg_wchar) UCHAR_MAX &&
341 0 : isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
342 : break;
343 12288 : case PG_REGEX_STRATEGY_ICU:
344 : #ifdef USE_ICU
345 12288 : return u_isalpha(c);
346 : #endif
347 : break;
348 : }
349 0 : return 0; /* can't get here, but keep compiler quiet */
350 : }
351 :
352 : static int
353 82698 : pg_wc_isalnum(pg_wchar c)
354 : {
355 82698 : switch (pg_regex_strategy)
356 : {
357 762 : case PG_REGEX_STRATEGY_C:
358 1524 : return (c <= (pg_wchar) 127 &&
359 762 : (pg_char_properties[c] & PG_ISALNUM));
360 32764 : case PG_REGEX_STRATEGY_BUILTIN:
361 32764 : return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full);
362 36884 : case PG_REGEX_STRATEGY_LIBC_WIDE:
363 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
364 36884 : return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
365 : /* FALL THRU */
366 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
367 0 : return (c <= (pg_wchar) UCHAR_MAX &&
368 0 : isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
369 : break;
370 12288 : case PG_REGEX_STRATEGY_ICU:
371 : #ifdef USE_ICU
372 12288 : return u_isalnum(c);
373 : #endif
374 : break;
375 : }
376 0 : return 0; /* can't get here, but keep compiler quiet */
377 : }
378 :
379 : static int
380 37634 : pg_wc_isword(pg_wchar c)
381 : {
382 : /* We define word characters as alnum class plus underscore */
383 37634 : if (c == CHR('_'))
384 24 : return 1;
385 37610 : return pg_wc_isalnum(c);
386 : }
387 :
388 : static int
389 40976 : pg_wc_isupper(pg_wchar c)
390 : {
391 40976 : switch (pg_regex_strategy)
392 : {
393 0 : case PG_REGEX_STRATEGY_C:
394 0 : return (c <= (pg_wchar) 127 &&
395 0 : (pg_char_properties[c] & PG_ISUPPER));
396 24576 : case PG_REGEX_STRATEGY_BUILTIN:
397 24576 : return pg_u_isupper(c);
398 4112 : case PG_REGEX_STRATEGY_LIBC_WIDE:
399 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
400 4112 : return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
401 : /* FALL THRU */
402 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
403 0 : return (c <= (pg_wchar) UCHAR_MAX &&
404 0 : isupper_l((unsigned char) c, pg_regex_locale->info.lt));
405 : break;
406 12288 : case PG_REGEX_STRATEGY_ICU:
407 : #ifdef USE_ICU
408 12288 : return u_isupper(c);
409 : #endif
410 : break;
411 : }
412 0 : return 0; /* can't get here, but keep compiler quiet */
413 : }
414 :
415 : static int
416 16390 : pg_wc_islower(pg_wchar c)
417 : {
418 16390 : switch (pg_regex_strategy)
419 : {
420 0 : case PG_REGEX_STRATEGY_C:
421 0 : return (c <= (pg_wchar) 127 &&
422 0 : (pg_char_properties[c] & PG_ISLOWER));
423 0 : case PG_REGEX_STRATEGY_BUILTIN:
424 0 : return pg_u_islower(c);
425 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
426 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
427 4102 : return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
428 : /* FALL THRU */
429 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
430 0 : return (c <= (pg_wchar) UCHAR_MAX &&
431 0 : islower_l((unsigned char) c, pg_regex_locale->info.lt));
432 : break;
433 12288 : case PG_REGEX_STRATEGY_ICU:
434 : #ifdef USE_ICU
435 12288 : return u_islower(c);
436 : #endif
437 : break;
438 : }
439 0 : return 0; /* can't get here, but keep compiler quiet */
440 : }
441 :
442 : static int
443 16390 : pg_wc_isgraph(pg_wchar c)
444 : {
445 16390 : switch (pg_regex_strategy)
446 : {
447 0 : case PG_REGEX_STRATEGY_C:
448 0 : return (c <= (pg_wchar) 127 &&
449 0 : (pg_char_properties[c] & PG_ISGRAPH));
450 0 : case PG_REGEX_STRATEGY_BUILTIN:
451 0 : return pg_u_isgraph(c);
452 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
453 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
454 4102 : return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
455 : /* FALL THRU */
456 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
457 0 : return (c <= (pg_wchar) UCHAR_MAX &&
458 0 : isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
459 : break;
460 12288 : case PG_REGEX_STRATEGY_ICU:
461 : #ifdef USE_ICU
462 12288 : return u_isgraph(c);
463 : #endif
464 : break;
465 : }
466 0 : return 0; /* can't get here, but keep compiler quiet */
467 : }
468 :
469 : static int
470 16390 : pg_wc_isprint(pg_wchar c)
471 : {
472 16390 : switch (pg_regex_strategy)
473 : {
474 0 : case PG_REGEX_STRATEGY_C:
475 0 : return (c <= (pg_wchar) 127 &&
476 0 : (pg_char_properties[c] & PG_ISPRINT));
477 0 : case PG_REGEX_STRATEGY_BUILTIN:
478 0 : return pg_u_isprint(c);
479 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
480 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
481 4102 : return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
482 : /* FALL THRU */
483 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
484 0 : return (c <= (pg_wchar) UCHAR_MAX &&
485 0 : isprint_l((unsigned char) c, pg_regex_locale->info.lt));
486 : break;
487 12288 : case PG_REGEX_STRATEGY_ICU:
488 : #ifdef USE_ICU
489 12288 : return u_isprint(c);
490 : #endif
491 : break;
492 : }
493 0 : return 0; /* can't get here, but keep compiler quiet */
494 : }
495 :
496 : static int
497 40966 : pg_wc_ispunct(pg_wchar c)
498 : {
499 40966 : switch (pg_regex_strategy)
500 : {
501 0 : case PG_REGEX_STRATEGY_C:
502 0 : return (c <= (pg_wchar) 127 &&
503 0 : (pg_char_properties[c] & PG_ISPUNCT));
504 24576 : case PG_REGEX_STRATEGY_BUILTIN:
505 24576 : return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full);
506 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
507 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
508 4102 : return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
509 : /* FALL THRU */
510 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
511 0 : return (c <= (pg_wchar) UCHAR_MAX &&
512 0 : ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
513 : break;
514 12288 : case PG_REGEX_STRATEGY_ICU:
515 : #ifdef USE_ICU
516 12288 : return u_ispunct(c);
517 : #endif
518 : break;
519 : }
520 0 : return 0; /* can't get here, but keep compiler quiet */
521 : }
522 :
523 : static int
524 76386 : pg_wc_isspace(pg_wchar c)
525 : {
526 76386 : switch (pg_regex_strategy)
527 : {
528 0 : case PG_REGEX_STRATEGY_C:
529 0 : return (c <= (pg_wchar) 127 &&
530 0 : (pg_char_properties[c] & PG_ISSPACE));
531 16398 : case PG_REGEX_STRATEGY_BUILTIN:
532 16398 : return pg_u_isspace(c);
533 47700 : case PG_REGEX_STRATEGY_LIBC_WIDE:
534 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
535 47700 : return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
536 : /* FALL THRU */
537 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
538 0 : return (c <= (pg_wchar) UCHAR_MAX &&
539 0 : isspace_l((unsigned char) c, pg_regex_locale->info.lt));
540 : break;
541 12288 : case PG_REGEX_STRATEGY_ICU:
542 : #ifdef USE_ICU
543 12288 : return u_isspace(c);
544 : #endif
545 : break;
546 : }
547 0 : return 0; /* can't get here, but keep compiler quiet */
548 : }
549 :
550 : static pg_wchar
551 10702 : pg_wc_toupper(pg_wchar c)
552 : {
553 10702 : switch (pg_regex_strategy)
554 : {
555 978 : case PG_REGEX_STRATEGY_C:
556 978 : if (c <= (pg_wchar) 127)
557 978 : return pg_ascii_toupper((unsigned char) c);
558 0 : return c;
559 528 : case PG_REGEX_STRATEGY_BUILTIN:
560 528 : return unicode_uppercase_simple(c);
561 9088 : case PG_REGEX_STRATEGY_LIBC_WIDE:
562 : /* force C behavior for ASCII characters, per comments above */
563 9088 : if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
564 892 : return pg_ascii_toupper((unsigned char) c);
565 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
566 8196 : return towupper_l((wint_t) c, pg_regex_locale->info.lt);
567 : /* FALL THRU */
568 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
569 : /* force C behavior for ASCII characters, per comments above */
570 0 : if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
571 0 : return pg_ascii_toupper((unsigned char) c);
572 0 : if (c <= (pg_wchar) UCHAR_MAX)
573 0 : return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
574 0 : return c;
575 108 : case PG_REGEX_STRATEGY_ICU:
576 : #ifdef USE_ICU
577 108 : return u_toupper(c);
578 : #endif
579 : break;
580 : }
581 0 : return 0; /* can't get here, but keep compiler quiet */
582 : }
583 :
584 : static pg_wchar
585 10706 : pg_wc_tolower(pg_wchar c)
586 : {
587 10706 : switch (pg_regex_strategy)
588 : {
589 978 : case PG_REGEX_STRATEGY_C:
590 978 : if (c <= (pg_wchar) 127)
591 978 : return pg_ascii_tolower((unsigned char) c);
592 0 : return c;
593 528 : case PG_REGEX_STRATEGY_BUILTIN:
594 528 : return unicode_lowercase_simple(c);
595 9092 : case PG_REGEX_STRATEGY_LIBC_WIDE:
596 : /* force C behavior for ASCII characters, per comments above */
597 9092 : if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
598 896 : return pg_ascii_tolower((unsigned char) c);
599 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
600 8196 : return towlower_l((wint_t) c, pg_regex_locale->info.lt);
601 : /* FALL THRU */
602 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
603 : /* force C behavior for ASCII characters, per comments above */
604 0 : if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
605 0 : return pg_ascii_tolower((unsigned char) c);
606 0 : if (c <= (pg_wchar) UCHAR_MAX)
607 0 : return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
608 0 : return c;
609 108 : case PG_REGEX_STRATEGY_ICU:
610 : #ifdef USE_ICU
611 108 : return u_tolower(c);
612 : #endif
613 : break;
614 : }
615 0 : return 0; /* can't get here, but keep compiler quiet */
616 : }
617 :
618 :
619 : /*
620 : * These functions cache the results of probing libc's ctype behavior for
621 : * all character codes of interest in a given encoding/collation. The
622 : * result is provided as a "struct cvec", but notice that the representation
623 : * is a touch different from a cvec created by regc_cvec.c: we allocate the
624 : * chrs[] and ranges[] arrays separately from the struct so that we can
625 : * realloc them larger at need. This is okay since the cvecs made here
626 : * should never be freed by freecvec().
627 : *
628 : * We use malloc not palloc since we mustn't lose control on out-of-memory;
629 : * the main regex code expects us to return a failure indication instead.
630 : */
631 :
632 : typedef int (*pg_wc_probefunc) (pg_wchar c);
633 :
634 : typedef struct pg_ctype_cache
635 : {
636 : pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
637 : pg_locale_t locale; /* locale this entry is for */
638 : struct cvec cv; /* cache entry contents */
639 : struct pg_ctype_cache *next; /* chain link */
640 : } pg_ctype_cache;
641 :
642 : static pg_ctype_cache *pg_ctype_cache_list = NULL;
643 :
644 : /*
645 : * Add a chr or range to pcc->cv; return false if run out of memory
646 : */
647 : static bool
648 11226 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
649 : {
650 : chr *newchrs;
651 :
652 11226 : if (nchrs > 1)
653 : {
654 3320 : if (pcc->cv.nranges >= pcc->cv.rangespace)
655 : {
656 0 : pcc->cv.rangespace *= 2;
657 0 : newchrs = (chr *) realloc(pcc->cv.ranges,
658 0 : pcc->cv.rangespace * sizeof(chr) * 2);
659 0 : if (newchrs == NULL)
660 0 : return false;
661 0 : pcc->cv.ranges = newchrs;
662 : }
663 3320 : pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
664 3320 : pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
665 3320 : pcc->cv.nranges++;
666 : }
667 : else
668 : {
669 : assert(nchrs == 1);
670 7906 : if (pcc->cv.nchrs >= pcc->cv.chrspace)
671 : {
672 28 : pcc->cv.chrspace *= 2;
673 28 : newchrs = (chr *) realloc(pcc->cv.chrs,
674 28 : pcc->cv.chrspace * sizeof(chr));
675 28 : if (newchrs == NULL)
676 0 : return false;
677 28 : pcc->cv.chrs = newchrs;
678 : }
679 7906 : pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
680 : }
681 11226 : return true;
682 : }
683 :
684 : /*
685 : * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
686 : * chrs satisfying the probe function. The active collation is the one
687 : * previously set by pg_set_regex_collation. Return NULL if out of memory.
688 : *
689 : * Note that the result must not be freed or modified by caller.
690 : */
691 : static struct cvec *
692 836 : pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
693 : {
694 : pg_ctype_cache *pcc;
695 : pg_wchar max_chr;
696 : pg_wchar cur_chr;
697 : int nmatches;
698 : chr *newchrs;
699 :
700 : /*
701 : * Do we already have the answer cached?
702 : */
703 1968 : for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
704 : {
705 1702 : if (pcc->probefunc == probefunc &&
706 642 : pcc->locale == pg_regex_locale)
707 570 : return &pcc->cv;
708 : }
709 :
710 : /*
711 : * Nope, so initialize some workspace ...
712 : */
713 266 : pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
714 266 : if (pcc == NULL)
715 0 : return NULL;
716 266 : pcc->probefunc = probefunc;
717 266 : pcc->locale = pg_regex_locale;
718 266 : pcc->cv.nchrs = 0;
719 266 : pcc->cv.chrspace = 128;
720 266 : pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
721 266 : pcc->cv.nranges = 0;
722 266 : pcc->cv.rangespace = 64;
723 266 : pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
724 266 : if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
725 0 : goto out_of_memory;
726 266 : pcc->cv.cclasscode = cclasscode;
727 :
728 : /*
729 : * Decide how many character codes we ought to look through. In general
730 : * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
731 : * runtime using the "high colormap" mechanism. However, in C locale
732 : * there's no need to go further than 127, and if we only have a 1-byte
733 : * <ctype.h> API there's no need to go further than that can handle.
734 : *
735 : * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
736 : * output cvec as not having any locale-dependent behavior, since there
737 : * will be no need to do any run-time locale checks. (The #if's here
738 : * would always be true for production values of MAX_SIMPLE_CHR, but it's
739 : * useful to allow it to be small for testing purposes.)
740 : */
741 266 : switch (pg_regex_strategy)
742 : {
743 28 : case PG_REGEX_STRATEGY_C:
744 : #if MAX_SIMPLE_CHR >= 127
745 28 : max_chr = (pg_wchar) 127;
746 28 : pcc->cv.cclasscode = -1;
747 : #else
748 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
749 : #endif
750 28 : break;
751 80 : case PG_REGEX_STRATEGY_BUILTIN:
752 80 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
753 80 : break;
754 104 : case PG_REGEX_STRATEGY_LIBC_WIDE:
755 104 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
756 104 : break;
757 0 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
758 : #if MAX_SIMPLE_CHR >= UCHAR_MAX
759 0 : max_chr = (pg_wchar) UCHAR_MAX;
760 0 : pcc->cv.cclasscode = -1;
761 : #else
762 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
763 : #endif
764 0 : break;
765 54 : case PG_REGEX_STRATEGY_ICU:
766 54 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
767 54 : break;
768 0 : default:
769 : Assert(false);
770 0 : max_chr = 0; /* can't get here, but keep compiler quiet */
771 0 : break;
772 : }
773 :
774 : /*
775 : * And scan 'em ...
776 : */
777 266 : nmatches = 0; /* number of consecutive matches */
778 :
779 491274 : for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
780 : {
781 491008 : if ((*probefunc) (cur_chr))
782 123380 : nmatches++;
783 367628 : else if (nmatches > 0)
784 : {
785 11202 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
786 0 : goto out_of_memory;
787 11202 : nmatches = 0;
788 : }
789 : }
790 :
791 266 : if (nmatches > 0)
792 24 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
793 0 : goto out_of_memory;
794 :
795 : /*
796 : * We might have allocated more memory than needed, if so free it
797 : */
798 266 : if (pcc->cv.nchrs == 0)
799 : {
800 112 : free(pcc->cv.chrs);
801 112 : pcc->cv.chrs = NULL;
802 112 : pcc->cv.chrspace = 0;
803 : }
804 154 : else if (pcc->cv.nchrs < pcc->cv.chrspace)
805 : {
806 154 : newchrs = (chr *) realloc(pcc->cv.chrs,
807 154 : pcc->cv.nchrs * sizeof(chr));
808 154 : if (newchrs == NULL)
809 0 : goto out_of_memory;
810 154 : pcc->cv.chrs = newchrs;
811 154 : pcc->cv.chrspace = pcc->cv.nchrs;
812 : }
813 266 : if (pcc->cv.nranges == 0)
814 : {
815 0 : free(pcc->cv.ranges);
816 0 : pcc->cv.ranges = NULL;
817 0 : pcc->cv.rangespace = 0;
818 : }
819 266 : else if (pcc->cv.nranges < pcc->cv.rangespace)
820 : {
821 266 : newchrs = (chr *) realloc(pcc->cv.ranges,
822 266 : pcc->cv.nranges * sizeof(chr) * 2);
823 266 : if (newchrs == NULL)
824 0 : goto out_of_memory;
825 266 : pcc->cv.ranges = newchrs;
826 266 : pcc->cv.rangespace = pcc->cv.nranges;
827 : }
828 :
829 : /*
830 : * Success, link it into cache chain
831 : */
832 266 : pcc->next = pg_ctype_cache_list;
833 266 : pg_ctype_cache_list = pcc;
834 :
835 266 : return &pcc->cv;
836 :
837 : /*
838 : * Failure, clean up
839 : */
840 0 : out_of_memory:
841 0 : free(pcc->cv.chrs);
842 0 : free(pcc->cv.ranges);
843 0 : free(pcc);
844 :
845 0 : return NULL;
846 : }
|