Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regc_pg_locale.c
4 : * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 : * and functions to cache the results of wholesale ctype probing.
6 : *
7 : * This file is #included by regcomp.c; it's not meant to compile standalone.
8 : *
9 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : * IDENTIFICATION
13 : * src/backend/regex/regc_pg_locale.c
14 : *
15 : *-------------------------------------------------------------------------
16 : */
17 :
18 : #include "catalog/pg_collation.h"
19 : #include "common/unicode_case.h"
20 : #include "common/unicode_category.h"
21 : #include "utils/pg_locale.h"
22 :
23 : /*
24 : * To provide as much functionality as possible on a variety of platforms,
25 : * without going so far as to implement everything from scratch, we use
26 : * several implementation strategies depending on the situation:
27 : *
28 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
29 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
30 : * collations don't give a fig about multibyte characters.
31 : *
32 : * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
33 : *
34 : * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
35 : * This assumes that every platform uses Unicode codepoints directly
36 : * as the wchar_t representation of Unicode. On some platforms
37 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
38 : *
39 : * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
40 : * values up to 255, and punt for values above that. This is 100% correct
41 : * only in single-byte encodings such as LATINn. However, non-Unicode
42 : * multibyte encodings are mostly Far Eastern character sets for which the
43 : * properties being tested here aren't very relevant for higher code values
44 : * anyway. The difficulty with using the <wctype.h> functions with
45 : * non-Unicode multibyte encodings is that we can have no certainty that
46 : * the platform's wchar_t representation matches what we do in pg_wchar
47 : * conversions.
48 : *
49 : * 3. Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
50 : * functions, under exactly the same cases as #2.
51 : *
52 : * There is one notable difference between cases 2 and 3: in the "default"
53 : * collation we force ASCII letters to follow ASCII upcase/downcase rules,
54 : * while in a non-default collation we just let the library functions do what
55 : * they will. The case where this matters is treatment of I/i in Turkish,
56 : * and the behavior is meant to match the upper()/lower() SQL functions.
57 : *
58 : * We store the active collation setting in static variables. In principle
59 : * it could be passed down to here via the regex library's "struct vars" data
60 : * structure; but that would require somewhat invasive changes in the regex
61 : * library, and right now there's no real benefit to be gained from that.
62 : *
63 : * NB: the coding here assumes pg_wchar is an unsigned type.
64 : */
65 :
/*
 * Implementation strategy used by the pg_wc_* functions below.  The active
 * value is chosen by pg_set_regex_collation().
 */
typedef enum
{
	/* hard-wired ASCII rules; works the same in every encoding */
	PG_REGEX_STRATEGY_C,
	/* built-in provider: Unicode semantics implemented by PostgreSQL */
	PG_REGEX_STRATEGY_BUILTIN,
	/* libc provider: locale_t variants of the <wctype.h> functions */
	PG_REGEX_STRATEGY_LIBC_WIDE,
	/* libc provider: locale_t variants of the <ctype.h> functions */
	PG_REGEX_STRATEGY_LIBC_1BYTE,
	/* ICU provider: uchar.h functions */
	PG_REGEX_STRATEGY_ICU,
} PG_Locale_Strategy;
74 :
75 : static PG_Locale_Strategy pg_regex_strategy;
76 : static pg_locale_t pg_regex_locale;
77 :
/*
 * Hard-wired character properties for C locale
 *
 * Each PG_ISxxx flag is one bit of a table entry; PG_ISALNUM is the union
 * of the digit and alpha bits.
 */
#define PG_ISDIGIT	(1 << 0)
#define PG_ISALPHA	(1 << 1)
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
#define PG_ISUPPER	(1 << 2)
#define PG_ISLOWER	(1 << 3)
#define PG_ISGRAPH	(1 << 4)
#define PG_ISPRINT	(1 << 5)
#define PG_ISPUNCT	(1 << 6)
#define PG_ISSPACE	(1 << 7)

/* Shorthand for the flag combinations that occur in the table */
#define PG_CP_BLANK	(PG_ISPRINT | PG_ISSPACE)
#define PG_CP_PUNCT	(PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT)
#define PG_CP_DIGIT	(PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_UPPER	(PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_LOWER	(PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT)

/* Indexed by ASCII code 0..127, eight entries per row */
static const unsigned char pg_char_properties[128] = {
	/* 0x00: NUL ^A ^B ^C ^D ^E ^F ^G */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x08: ^H ^I ^J ^K ^L ^M ^N ^O */
	0, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, 0, 0,
	/* 0x10: ^P ^Q ^R ^S ^T ^U ^V ^W */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x18: ^X ^Y ^Z ^[ ^\ ^] ^^ ^_ */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20: space ! " # $ % & ' */
	PG_CP_BLANK, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x28: ( ) * + , - . / */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x30: 0 1 2 3 4 5 6 7 */
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	/* 0x38: 8 9 : ; < = > ? */
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x40: @ A B C D E F G */
	PG_CP_PUNCT, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	/* 0x48: H I J K L M N O */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	/* 0x50: P Q R S T U V W */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	/* 0x58: X Y Z [ \ ] ^ _ */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x60: ` a b c d e f g */
	PG_CP_PUNCT, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	/* 0x68: h i j k l m n o */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	/* 0x70: p q r s t u v w */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	/* 0x78: x y z { | } ~ DEL */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, 0
};

#undef PG_CP_BLANK
#undef PG_CP_PUNCT
#undef PG_CP_DIGIT
#undef PG_CP_UPPER
#undef PG_CP_LOWER
221 :
222 :
223 : /*
224 : * pg_set_regex_collation: set collation for these functions to obey
225 : *
226 : * This is called when beginning compilation or execution of a regexp.
227 : * Since there's no need for reentrancy of regexp operations, it's okay
228 : * to store the results in static variables.
229 : */
230 : void
231 7447072 : pg_set_regex_collation(Oid collation)
232 : {
233 7447072 : pg_locale_t locale = 0;
234 : PG_Locale_Strategy strategy;
235 :
236 7447072 : if (!OidIsValid(collation))
237 : {
238 : /*
239 : * This typically means that the parser could not resolve a conflict
240 : * of implicit collations, so report it that way.
241 : */
242 0 : ereport(ERROR,
243 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
244 : errmsg("could not determine which collation to use for regular expression"),
245 : errhint("Use the COLLATE clause to set the collation explicitly.")));
246 : }
247 :
248 7447072 : if (collation == C_COLLATION_OID)
249 : {
250 : /*
251 : * Some callers expect regexes to work for C_COLLATION_OID before
252 : * catalog access is available, so we can't call
253 : * pg_newlocale_from_collation().
254 : */
255 103568 : strategy = PG_REGEX_STRATEGY_C;
256 103568 : locale = 0;
257 : }
258 : else
259 : {
260 7343504 : locale = pg_newlocale_from_collation(collation);
261 :
262 7343504 : if (!locale->deterministic)
263 24 : ereport(ERROR,
264 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
265 : errmsg("nondeterministic collations are not supported for regular expressions")));
266 :
267 7343480 : if (locale->ctype_is_c)
268 : {
269 : /*
270 : * C/POSIX collations use this path regardless of database
271 : * encoding
272 : */
273 284 : strategy = PG_REGEX_STRATEGY_C;
274 284 : locale = 0;
275 : }
276 7343196 : else if (locale->provider == COLLPROVIDER_BUILTIN)
277 : {
278 : Assert(GetDatabaseEncoding() == PG_UTF8);
279 2027552 : strategy = PG_REGEX_STRATEGY_BUILTIN;
280 : }
281 : #ifdef USE_ICU
282 5315644 : else if (locale->provider == COLLPROVIDER_ICU)
283 : {
284 942 : strategy = PG_REGEX_STRATEGY_ICU;
285 : }
286 : #endif
287 : else
288 : {
289 : Assert(locale->provider == COLLPROVIDER_LIBC);
290 5314702 : if (GetDatabaseEncoding() == PG_UTF8)
291 5314698 : strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
292 : else
293 4 : strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
294 : }
295 : }
296 :
297 7447048 : pg_regex_strategy = strategy;
298 7447048 : pg_regex_locale = locale;
299 7447048 : }
300 :
301 : static int
302 170524 : pg_wc_isdigit(pg_wchar c)
303 : {
304 170524 : switch (pg_regex_strategy)
305 : {
306 2130 : case PG_REGEX_STRATEGY_C:
307 4260 : return (c <= (pg_wchar) 127 &&
308 2130 : (pg_char_properties[c] & PG_ISDIGIT));
309 61556 : case PG_REGEX_STRATEGY_BUILTIN:
310 61556 : return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full);
311 94550 : case PG_REGEX_STRATEGY_LIBC_WIDE:
312 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
313 94550 : return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
314 : /* FALL THRU */
315 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
316 0 : return (c <= (pg_wchar) UCHAR_MAX &&
317 0 : isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
318 : break;
319 12288 : case PG_REGEX_STRATEGY_ICU:
320 : #ifdef USE_ICU
321 12288 : return u_isdigit(c);
322 : #endif
323 : break;
324 : }
325 0 : return 0; /* can't get here, but keep compiler quiet */
326 : }
327 :
328 : static int
329 17430 : pg_wc_isalpha(pg_wchar c)
330 : {
331 17430 : switch (pg_regex_strategy)
332 : {
333 768 : case PG_REGEX_STRATEGY_C:
334 1536 : return (c <= (pg_wchar) 127 &&
335 768 : (pg_char_properties[c] & PG_ISALPHA));
336 22 : case PG_REGEX_STRATEGY_BUILTIN:
337 22 : return pg_u_isalpha(c);
338 4352 : case PG_REGEX_STRATEGY_LIBC_WIDE:
339 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
340 4352 : return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
341 : /* FALL THRU */
342 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
343 0 : return (c <= (pg_wchar) UCHAR_MAX &&
344 0 : isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
345 : break;
346 12288 : case PG_REGEX_STRATEGY_ICU:
347 : #ifdef USE_ICU
348 12288 : return u_isalpha(c);
349 : #endif
350 : break;
351 : }
352 0 : return 0; /* can't get here, but keep compiler quiet */
353 : }
354 :
355 : static int
356 78604 : pg_wc_isalnum(pg_wchar c)
357 : {
358 78604 : switch (pg_regex_strategy)
359 : {
360 762 : case PG_REGEX_STRATEGY_C:
361 1524 : return (c <= (pg_wchar) 127 &&
362 762 : (pg_char_properties[c] & PG_ISALNUM));
363 32764 : case PG_REGEX_STRATEGY_BUILTIN:
364 32764 : return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full);
365 32790 : case PG_REGEX_STRATEGY_LIBC_WIDE:
366 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
367 32790 : return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
368 : /* FALL THRU */
369 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
370 0 : return (c <= (pg_wchar) UCHAR_MAX &&
371 0 : isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
372 : break;
373 12288 : case PG_REGEX_STRATEGY_ICU:
374 : #ifdef USE_ICU
375 12288 : return u_isalnum(c);
376 : #endif
377 : break;
378 : }
379 0 : return 0; /* can't get here, but keep compiler quiet */
380 : }
381 :
382 : static int
383 33538 : pg_wc_isword(pg_wchar c)
384 : {
385 : /* We define word characters as alnum class plus underscore */
386 33538 : if (c == CHR('_'))
387 22 : return 1;
388 33516 : return pg_wc_isalnum(c);
389 : }
390 :
391 : static int
392 40976 : pg_wc_isupper(pg_wchar c)
393 : {
394 40976 : switch (pg_regex_strategy)
395 : {
396 0 : case PG_REGEX_STRATEGY_C:
397 0 : return (c <= (pg_wchar) 127 &&
398 0 : (pg_char_properties[c] & PG_ISUPPER));
399 24576 : case PG_REGEX_STRATEGY_BUILTIN:
400 24576 : return pg_u_isupper(c);
401 4112 : case PG_REGEX_STRATEGY_LIBC_WIDE:
402 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
403 4112 : return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
404 : /* FALL THRU */
405 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
406 0 : return (c <= (pg_wchar) UCHAR_MAX &&
407 0 : isupper_l((unsigned char) c, pg_regex_locale->info.lt));
408 : break;
409 12288 : case PG_REGEX_STRATEGY_ICU:
410 : #ifdef USE_ICU
411 12288 : return u_isupper(c);
412 : #endif
413 : break;
414 : }
415 0 : return 0; /* can't get here, but keep compiler quiet */
416 : }
417 :
418 : static int
419 16390 : pg_wc_islower(pg_wchar c)
420 : {
421 16390 : switch (pg_regex_strategy)
422 : {
423 0 : case PG_REGEX_STRATEGY_C:
424 0 : return (c <= (pg_wchar) 127 &&
425 0 : (pg_char_properties[c] & PG_ISLOWER));
426 0 : case PG_REGEX_STRATEGY_BUILTIN:
427 0 : return pg_u_islower(c);
428 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
429 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
430 4102 : return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
431 : /* FALL THRU */
432 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
433 0 : return (c <= (pg_wchar) UCHAR_MAX &&
434 0 : islower_l((unsigned char) c, pg_regex_locale->info.lt));
435 : break;
436 12288 : case PG_REGEX_STRATEGY_ICU:
437 : #ifdef USE_ICU
438 12288 : return u_islower(c);
439 : #endif
440 : break;
441 : }
442 0 : return 0; /* can't get here, but keep compiler quiet */
443 : }
444 :
445 : static int
446 16390 : pg_wc_isgraph(pg_wchar c)
447 : {
448 16390 : switch (pg_regex_strategy)
449 : {
450 0 : case PG_REGEX_STRATEGY_C:
451 0 : return (c <= (pg_wchar) 127 &&
452 0 : (pg_char_properties[c] & PG_ISGRAPH));
453 0 : case PG_REGEX_STRATEGY_BUILTIN:
454 0 : return pg_u_isgraph(c);
455 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
456 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
457 4102 : return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
458 : /* FALL THRU */
459 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
460 0 : return (c <= (pg_wchar) UCHAR_MAX &&
461 0 : isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
462 : break;
463 12288 : case PG_REGEX_STRATEGY_ICU:
464 : #ifdef USE_ICU
465 12288 : return u_isgraph(c);
466 : #endif
467 : break;
468 : }
469 0 : return 0; /* can't get here, but keep compiler quiet */
470 : }
471 :
472 : static int
473 16390 : pg_wc_isprint(pg_wchar c)
474 : {
475 16390 : switch (pg_regex_strategy)
476 : {
477 0 : case PG_REGEX_STRATEGY_C:
478 0 : return (c <= (pg_wchar) 127 &&
479 0 : (pg_char_properties[c] & PG_ISPRINT));
480 0 : case PG_REGEX_STRATEGY_BUILTIN:
481 0 : return pg_u_isprint(c);
482 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
483 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
484 4102 : return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
485 : /* FALL THRU */
486 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
487 0 : return (c <= (pg_wchar) UCHAR_MAX &&
488 0 : isprint_l((unsigned char) c, pg_regex_locale->info.lt));
489 : break;
490 12288 : case PG_REGEX_STRATEGY_ICU:
491 : #ifdef USE_ICU
492 12288 : return u_isprint(c);
493 : #endif
494 : break;
495 : }
496 0 : return 0; /* can't get here, but keep compiler quiet */
497 : }
498 :
499 : static int
500 40966 : pg_wc_ispunct(pg_wchar c)
501 : {
502 40966 : switch (pg_regex_strategy)
503 : {
504 0 : case PG_REGEX_STRATEGY_C:
505 0 : return (c <= (pg_wchar) 127 &&
506 0 : (pg_char_properties[c] & PG_ISPUNCT));
507 24576 : case PG_REGEX_STRATEGY_BUILTIN:
508 24576 : return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full);
509 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
510 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
511 4102 : return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
512 : /* FALL THRU */
513 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
514 0 : return (c <= (pg_wchar) UCHAR_MAX &&
515 0 : ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
516 : break;
517 12288 : case PG_REGEX_STRATEGY_ICU:
518 : #ifdef USE_ICU
519 12288 : return u_ispunct(c);
520 : #endif
521 : break;
522 : }
523 0 : return 0; /* can't get here, but keep compiler quiet */
524 : }
525 :
526 : static int
527 76386 : pg_wc_isspace(pg_wchar c)
528 : {
529 76386 : switch (pg_regex_strategy)
530 : {
531 0 : case PG_REGEX_STRATEGY_C:
532 0 : return (c <= (pg_wchar) 127 &&
533 0 : (pg_char_properties[c] & PG_ISSPACE));
534 16398 : case PG_REGEX_STRATEGY_BUILTIN:
535 16398 : return pg_u_isspace(c);
536 47700 : case PG_REGEX_STRATEGY_LIBC_WIDE:
537 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
538 47700 : return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
539 : /* FALL THRU */
540 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
541 0 : return (c <= (pg_wchar) UCHAR_MAX &&
542 0 : isspace_l((unsigned char) c, pg_regex_locale->info.lt));
543 : break;
544 12288 : case PG_REGEX_STRATEGY_ICU:
545 : #ifdef USE_ICU
546 12288 : return u_isspace(c);
547 : #endif
548 : break;
549 : }
550 0 : return 0; /* can't get here, but keep compiler quiet */
551 : }
552 :
553 : static pg_wchar
554 10702 : pg_wc_toupper(pg_wchar c)
555 : {
556 10702 : switch (pg_regex_strategy)
557 : {
558 978 : case PG_REGEX_STRATEGY_C:
559 978 : if (c <= (pg_wchar) 127)
560 978 : return pg_ascii_toupper((unsigned char) c);
561 0 : return c;
562 528 : case PG_REGEX_STRATEGY_BUILTIN:
563 528 : return unicode_uppercase_simple(c);
564 9088 : case PG_REGEX_STRATEGY_LIBC_WIDE:
565 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
566 9088 : return towupper_l((wint_t) c, pg_regex_locale->info.lt);
567 : /* FALL THRU */
568 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
569 0 : if (c <= (pg_wchar) UCHAR_MAX)
570 0 : return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
571 0 : return c;
572 108 : case PG_REGEX_STRATEGY_ICU:
573 : #ifdef USE_ICU
574 108 : return u_toupper(c);
575 : #endif
576 : break;
577 : }
578 0 : return 0; /* can't get here, but keep compiler quiet */
579 : }
580 :
581 : static pg_wchar
582 10706 : pg_wc_tolower(pg_wchar c)
583 : {
584 10706 : switch (pg_regex_strategy)
585 : {
586 978 : case PG_REGEX_STRATEGY_C:
587 978 : if (c <= (pg_wchar) 127)
588 978 : return pg_ascii_tolower((unsigned char) c);
589 0 : return c;
590 528 : case PG_REGEX_STRATEGY_BUILTIN:
591 528 : return unicode_lowercase_simple(c);
592 9092 : case PG_REGEX_STRATEGY_LIBC_WIDE:
593 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
594 9092 : return towlower_l((wint_t) c, pg_regex_locale->info.lt);
595 : /* FALL THRU */
596 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
597 0 : if (c <= (pg_wchar) UCHAR_MAX)
598 0 : return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
599 0 : return c;
600 108 : case PG_REGEX_STRATEGY_ICU:
601 : #ifdef USE_ICU
602 108 : return u_tolower(c);
603 : #endif
604 : break;
605 : }
606 0 : return 0; /* can't get here, but keep compiler quiet */
607 : }
608 :
609 :
610 : /*
611 : * These functions cache the results of probing libc's ctype behavior for
612 : * all character codes of interest in a given encoding/collation. The
613 : * result is provided as a "struct cvec", but notice that the representation
614 : * is a touch different from a cvec created by regc_cvec.c: we allocate the
615 : * chrs[] and ranges[] arrays separately from the struct so that we can
616 : * realloc them larger at need. This is okay since the cvecs made here
617 : * should never be freed by freecvec().
618 : *
619 : * We use malloc not palloc since we mustn't lose control on out-of-memory;
620 : * the main regex code expects us to return a failure indication instead.
621 : */
622 :
623 : typedef int (*pg_wc_probefunc) (pg_wchar c);
624 :
625 : typedef struct pg_ctype_cache
626 : {
627 : pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
628 : pg_locale_t locale; /* locale this entry is for */
629 : struct cvec cv; /* cache entry contents */
630 : struct pg_ctype_cache *next; /* chain link */
631 : } pg_ctype_cache;
632 :
633 : static pg_ctype_cache *pg_ctype_cache_list = NULL;
634 :
635 : /*
636 : * Add a chr or range to pcc->cv; return false if run out of memory
637 : */
638 : static bool
639 11120 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
640 : {
641 : chr *newchrs;
642 :
643 11120 : if (nchrs > 1)
644 : {
645 3244 : if (pcc->cv.nranges >= pcc->cv.rangespace)
646 : {
647 0 : pcc->cv.rangespace *= 2;
648 0 : newchrs = (chr *) realloc(pcc->cv.ranges,
649 0 : pcc->cv.rangespace * sizeof(chr) * 2);
650 0 : if (newchrs == NULL)
651 0 : return false;
652 0 : pcc->cv.ranges = newchrs;
653 : }
654 3244 : pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
655 3244 : pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
656 3244 : pcc->cv.nranges++;
657 : }
658 : else
659 : {
660 : assert(nchrs == 1);
661 7876 : if (pcc->cv.nchrs >= pcc->cv.chrspace)
662 : {
663 28 : pcc->cv.chrspace *= 2;
664 28 : newchrs = (chr *) realloc(pcc->cv.chrs,
665 28 : pcc->cv.chrspace * sizeof(chr));
666 28 : if (newchrs == NULL)
667 0 : return false;
668 28 : pcc->cv.chrs = newchrs;
669 : }
670 7876 : pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
671 : }
672 11120 : return true;
673 : }
674 :
675 : /*
676 : * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
677 : * chrs satisfying the probe function. The active collation is the one
678 : * previously set by pg_set_regex_collation. Return NULL if out of memory.
679 : *
680 : * Note that the result must not be freed or modified by caller.
681 : */
682 : static struct cvec *
683 794 : pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
684 : {
685 : pg_ctype_cache *pcc;
686 : pg_wchar max_chr;
687 : pg_wchar cur_chr;
688 : int nmatches;
689 : chr *newchrs;
690 :
691 : /*
692 : * Do we already have the answer cached?
693 : */
694 1922 : for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
695 : {
696 1666 : if (pcc->probefunc == probefunc &&
697 610 : pcc->locale == pg_regex_locale)
698 538 : return &pcc->cv;
699 : }
700 :
701 : /*
702 : * Nope, so initialize some workspace ...
703 : */
704 256 : pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
705 256 : if (pcc == NULL)
706 0 : return NULL;
707 256 : pcc->probefunc = probefunc;
708 256 : pcc->locale = pg_regex_locale;
709 256 : pcc->cv.nchrs = 0;
710 256 : pcc->cv.chrspace = 128;
711 256 : pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
712 256 : pcc->cv.nranges = 0;
713 256 : pcc->cv.rangespace = 64;
714 256 : pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
715 256 : if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
716 0 : goto out_of_memory;
717 256 : pcc->cv.cclasscode = cclasscode;
718 :
719 : /*
720 : * Decide how many character codes we ought to look through. In general
721 : * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
722 : * runtime using the "high colormap" mechanism. However, in C locale
723 : * there's no need to go further than 127, and if we only have a 1-byte
724 : * <ctype.h> API there's no need to go further than that can handle.
725 : *
726 : * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
727 : * output cvec as not having any locale-dependent behavior, since there
728 : * will be no need to do any run-time locale checks. (The #if's here
729 : * would always be true for production values of MAX_SIMPLE_CHR, but it's
730 : * useful to allow it to be small for testing purposes.)
731 : */
732 256 : switch (pg_regex_strategy)
733 : {
734 28 : case PG_REGEX_STRATEGY_C:
735 : #if MAX_SIMPLE_CHR >= 127
736 28 : max_chr = (pg_wchar) 127;
737 28 : pcc->cv.cclasscode = -1;
738 : #else
739 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
740 : #endif
741 28 : break;
742 78 : case PG_REGEX_STRATEGY_BUILTIN:
743 78 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
744 78 : break;
745 96 : case PG_REGEX_STRATEGY_LIBC_WIDE:
746 96 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
747 96 : break;
748 0 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
749 : #if MAX_SIMPLE_CHR >= UCHAR_MAX
750 0 : max_chr = (pg_wchar) UCHAR_MAX;
751 0 : pcc->cv.cclasscode = -1;
752 : #else
753 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
754 : #endif
755 0 : break;
756 54 : case PG_REGEX_STRATEGY_ICU:
757 54 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
758 54 : break;
759 0 : default:
760 : Assert(false);
761 0 : max_chr = 0; /* can't get here, but keep compiler quiet */
762 0 : break;
763 : }
764 :
765 : /*
766 : * And scan 'em ...
767 : */
768 256 : nmatches = 0; /* number of consecutive matches */
769 :
770 470784 : for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
771 : {
772 470528 : if ((*probefunc) (cur_chr))
773 119956 : nmatches++;
774 350572 : else if (nmatches > 0)
775 : {
776 11096 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
777 0 : goto out_of_memory;
778 11096 : nmatches = 0;
779 : }
780 : }
781 :
782 256 : if (nmatches > 0)
783 24 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
784 0 : goto out_of_memory;
785 :
786 : /*
787 : * We might have allocated more memory than needed, if so free it
788 : */
789 256 : if (pcc->cv.nchrs == 0)
790 : {
791 104 : free(pcc->cv.chrs);
792 104 : pcc->cv.chrs = NULL;
793 104 : pcc->cv.chrspace = 0;
794 : }
795 152 : else if (pcc->cv.nchrs < pcc->cv.chrspace)
796 : {
797 152 : newchrs = (chr *) realloc(pcc->cv.chrs,
798 152 : pcc->cv.nchrs * sizeof(chr));
799 152 : if (newchrs == NULL)
800 0 : goto out_of_memory;
801 152 : pcc->cv.chrs = newchrs;
802 152 : pcc->cv.chrspace = pcc->cv.nchrs;
803 : }
804 256 : if (pcc->cv.nranges == 0)
805 : {
806 0 : free(pcc->cv.ranges);
807 0 : pcc->cv.ranges = NULL;
808 0 : pcc->cv.rangespace = 0;
809 : }
810 256 : else if (pcc->cv.nranges < pcc->cv.rangespace)
811 : {
812 256 : newchrs = (chr *) realloc(pcc->cv.ranges,
813 256 : pcc->cv.nranges * sizeof(chr) * 2);
814 256 : if (newchrs == NULL)
815 0 : goto out_of_memory;
816 256 : pcc->cv.ranges = newchrs;
817 256 : pcc->cv.rangespace = pcc->cv.nranges;
818 : }
819 :
820 : /*
821 : * Success, link it into cache chain
822 : */
823 256 : pcc->next = pg_ctype_cache_list;
824 256 : pg_ctype_cache_list = pcc;
825 :
826 256 : return &pcc->cv;
827 :
828 : /*
829 : * Failure, clean up
830 : */
831 0 : out_of_memory:
832 0 : free(pcc->cv.chrs);
833 0 : free(pcc->cv.ranges);
834 0 : free(pcc);
835 :
836 0 : return NULL;
837 : }
|