Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regc_pg_locale.c
4 : * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 : * and functions to cache the results of wholesale ctype probing.
6 : *
7 : * This file is #included by regcomp.c; it's not meant to compile standalone.
8 : *
9 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : * IDENTIFICATION
13 : * src/backend/regex/regc_pg_locale.c
14 : *
15 : *-------------------------------------------------------------------------
16 : */
17 :
18 : #include "catalog/pg_collation.h"
19 : #include "common/unicode_case.h"
20 : #include "common/unicode_category.h"
21 : #include "utils/pg_locale.h"
22 :
23 : /*
24 : * To provide as much functionality as possible on a variety of platforms,
25 : * without going so far as to implement everything from scratch, we use
26 : * several implementation strategies depending on the situation:
27 : *
28 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
29 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
30 : * collations don't give a fig about multibyte characters.
31 : *
32 : * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
33 : *
34 : * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
35 : * This assumes that every platform uses Unicode codepoints directly
36 : * as the wchar_t representation of Unicode. On some platforms
37 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
38 : *
39 : * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
40 : * values up to 255, and punt for values above that. This is 100% correct
41 : * only in single-byte encodings such as LATINn. However, non-Unicode
42 : * multibyte encodings are mostly Far Eastern character sets for which the
43 : * properties being tested here aren't very relevant for higher code values
44 : * anyway. The difficulty with using the <wctype.h> functions with
45 : * non-Unicode multibyte encodings is that we can have no certainty that
46 : * the platform's wchar_t representation matches what we do in pg_wchar
47 : * conversions.
48 : *
49 : * 3. In non-default collations, we use the locale_t-extended forms of the
50 : * <wctype.h> and <ctype.h> functions, under exactly the same cases as #2.
51 : *
52 : * There is one notable difference between cases 2 and 3: in the "default"
53 : * collation we force ASCII letters to follow ASCII upcase/downcase rules,
54 : * while in a non-default collation we just let the library functions do what
55 : * they will. The case where this matters is treatment of I/i in Turkish,
56 : * and the behavior is meant to match the upper()/lower() SQL functions.
57 : *
58 : * We store the active collation setting in static variables. In principle
59 : * it could be passed down to here via the regex library's "struct vars" data
60 : * structure; but that would require somewhat invasive changes in the regex
61 : * library, and right now there's no real benefit to be gained from that.
62 : *
63 : * NB: the coding here assumes pg_wchar is an unsigned type.
64 : */
65 :
/*
 * Implementation strategy selected by pg_set_regex_collation() and consulted
 * by every pg_wc_* function in this file.
 */
typedef enum
{
	/* C locale (encoding independent) */
	PG_REGEX_STRATEGY_C = 0,
	/* built-in Unicode semantics */
	PG_REGEX_STRATEGY_BUILTIN = 1,
	/* Use locale_t <wctype.h> functions */
	PG_REGEX_STRATEGY_LIBC_WIDE = 2,
	/* Use locale_t <ctype.h> functions */
	PG_REGEX_STRATEGY_LIBC_1BYTE = 3,
	/* Use ICU uchar.h functions */
	PG_REGEX_STRATEGY_ICU = 4,
} PG_Locale_Strategy;
74 :
75 : static PG_Locale_Strategy pg_regex_strategy;
76 : static pg_locale_t pg_regex_locale;
77 : static Oid pg_regex_collation;
78 :
/*
 * Hard-wired character properties for C locale
 *
 * Each PG_ISxxx flag is one bit; pg_char_properties[] holds the OR of the
 * flags that apply to each of the 128 ASCII codes.
 */
#define PG_ISDIGIT	0x01
#define PG_ISALPHA	0x02
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
#define PG_ISUPPER	0x04
#define PG_ISLOWER	0x08
#define PG_ISGRAPH	0x10
#define PG_ISPRINT	0x20
#define PG_ISPUNCT	0x40
#define PG_ISSPACE	0x80

/* Per-class flag combinations, used only to build the table below */
#define PG_CP_CTRL	0
#define PG_CP_WHITE	PG_ISSPACE
#define PG_CP_BLANK	(PG_ISPRINT | PG_ISSPACE)
#define PG_CP_PUNCT	(PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT)
#define PG_CP_DIGIT	(PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_UPPER	(PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_LOWER	(PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT)

static const unsigned char pg_char_properties[128] = {
	/* 0x00: NUL ^A ^B ^C ^D ^E ^F ^G */
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,
	/* 0x08: ^H ^I ^J ^K ^L ^M ^N ^O */
	PG_CP_CTRL, PG_CP_WHITE, PG_CP_WHITE, PG_CP_WHITE,
	PG_CP_WHITE, PG_CP_WHITE, PG_CP_CTRL, PG_CP_CTRL,
	/* 0x10: ^P ^Q ^R ^S ^T ^U ^V ^W */
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,
	/* 0x18: ^X ^Y ^Z ^[ ^\ ^] ^^ ^_ */
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,
	/* 0x20: space ! " # $ % & ' */
	PG_CP_BLANK, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x28: ( ) * + , - . / */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x30: 0 1 2 3 4 5 6 7 */
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	/* 0x38: 8 9 : ; < = > ? */
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x40: @ A B C D E F G */
	PG_CP_PUNCT, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	/* 0x48: H I J K L M N O */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	/* 0x50: P Q R S T U V W */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	/* 0x58: X Y Z [ \ ] ^ _ */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x60: ` a b c d e f g */
	PG_CP_PUNCT, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	/* 0x68: h i j k l m n o */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	/* 0x70: p q r s t u v w */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	/* 0x78: x y z { | } ~ DEL */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_CTRL
};

#undef PG_CP_CTRL
#undef PG_CP_WHITE
#undef PG_CP_BLANK
#undef PG_CP_PUNCT
#undef PG_CP_DIGIT
#undef PG_CP_UPPER
#undef PG_CP_LOWER
222 :
223 :
224 : /*
225 : * pg_set_regex_collation: set collation for these functions to obey
226 : *
227 : * This is called when beginning compilation or execution of a regexp.
228 : * Since there's no need for reentrancy of regexp operations, it's okay
229 : * to store the results in static variables.
230 : */
231 : void
232 2376544 : pg_set_regex_collation(Oid collation)
233 : {
234 2376544 : pg_locale_t locale = 0;
235 : PG_Locale_Strategy strategy;
236 :
237 2376544 : if (!OidIsValid(collation))
238 : {
239 : /*
240 : * This typically means that the parser could not resolve a conflict
241 : * of implicit collations, so report it that way.
242 : */
243 0 : ereport(ERROR,
244 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
245 : errmsg("could not determine which collation to use for regular expression"),
246 : errhint("Use the COLLATE clause to set the collation explicitly.")));
247 : }
248 :
249 2376544 : if (collation == C_COLLATION_OID)
250 : {
251 : /*
252 : * Some callers expect regexes to work for C_COLLATION_OID before
253 : * catalog access is available, so we can't call
254 : * pg_newlocale_from_collation().
255 : */
256 102802 : strategy = PG_REGEX_STRATEGY_C;
257 102802 : collation = C_COLLATION_OID;
258 : }
259 : else
260 : {
261 2273742 : locale = pg_newlocale_from_collation(collation);
262 :
263 2273742 : if (!locale->deterministic)
264 24 : ereport(ERROR,
265 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
266 : errmsg("nondeterministic collations are not supported for regular expressions")));
267 :
268 2273718 : if (locale->ctype_is_c)
269 : {
270 : /*
271 : * C/POSIX collations use this path regardless of database
272 : * encoding
273 : */
274 284 : strategy = PG_REGEX_STRATEGY_C;
275 284 : locale = 0;
276 284 : collation = C_COLLATION_OID;
277 : }
278 2273434 : else if (locale->provider == COLLPROVIDER_BUILTIN)
279 : {
280 : Assert(GetDatabaseEncoding() == PG_UTF8);
281 191336 : strategy = PG_REGEX_STRATEGY_BUILTIN;
282 : }
283 : #ifdef USE_ICU
284 2082098 : else if (locale->provider == COLLPROVIDER_ICU)
285 : {
286 942 : strategy = PG_REGEX_STRATEGY_ICU;
287 : }
288 : #endif
289 : else
290 : {
291 : Assert(locale->provider == COLLPROVIDER_LIBC);
292 2081156 : if (GetDatabaseEncoding() == PG_UTF8)
293 2081152 : strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
294 : else
295 4 : strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
296 : }
297 : }
298 :
299 2376520 : pg_regex_strategy = strategy;
300 2376520 : pg_regex_locale = locale;
301 2376520 : pg_regex_collation = collation;
302 2376520 : }
303 :
304 : static int
305 158230 : pg_wc_isdigit(pg_wchar c)
306 : {
307 158230 : switch (pg_regex_strategy)
308 : {
309 2130 : case PG_REGEX_STRATEGY_C:
310 4260 : return (c <= (pg_wchar) 127 &&
311 2130 : (pg_char_properties[c] & PG_ISDIGIT));
312 49262 : case PG_REGEX_STRATEGY_BUILTIN:
313 49262 : return pg_u_isdigit(c, true);
314 94550 : case PG_REGEX_STRATEGY_LIBC_WIDE:
315 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
316 94550 : return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
317 : /* FALL THRU */
318 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
319 0 : return (c <= (pg_wchar) UCHAR_MAX &&
320 0 : isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
321 : break;
322 12288 : case PG_REGEX_STRATEGY_ICU:
323 : #ifdef USE_ICU
324 12288 : return u_isdigit(c);
325 : #endif
326 : break;
327 : }
328 0 : return 0; /* can't get here, but keep compiler quiet */
329 : }
330 :
331 : static int
332 17430 : pg_wc_isalpha(pg_wchar c)
333 : {
334 17430 : switch (pg_regex_strategy)
335 : {
336 768 : case PG_REGEX_STRATEGY_C:
337 1536 : return (c <= (pg_wchar) 127 &&
338 768 : (pg_char_properties[c] & PG_ISALPHA));
339 22 : case PG_REGEX_STRATEGY_BUILTIN:
340 22 : return pg_u_isalpha(c);
341 4352 : case PG_REGEX_STRATEGY_LIBC_WIDE:
342 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
343 4352 : return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
344 : /* FALL THRU */
345 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
346 0 : return (c <= (pg_wchar) UCHAR_MAX &&
347 0 : isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
348 : break;
349 12288 : case PG_REGEX_STRATEGY_ICU:
350 : #ifdef USE_ICU
351 12288 : return u_isalpha(c);
352 : #endif
353 : break;
354 : }
355 0 : return 0; /* can't get here, but keep compiler quiet */
356 : }
357 :
358 : static int
359 66316 : pg_wc_isalnum(pg_wchar c)
360 : {
361 66316 : switch (pg_regex_strategy)
362 : {
363 762 : case PG_REGEX_STRATEGY_C:
364 1524 : return (c <= (pg_wchar) 127 &&
365 762 : (pg_char_properties[c] & PG_ISALNUM));
366 20476 : case PG_REGEX_STRATEGY_BUILTIN:
367 20476 : return pg_u_isalnum(c, true);
368 32790 : case PG_REGEX_STRATEGY_LIBC_WIDE:
369 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
370 32790 : return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
371 : /* FALL THRU */
372 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
373 0 : return (c <= (pg_wchar) UCHAR_MAX &&
374 0 : isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
375 : break;
376 12288 : case PG_REGEX_STRATEGY_ICU:
377 : #ifdef USE_ICU
378 12288 : return u_isalnum(c);
379 : #endif
380 : break;
381 : }
382 0 : return 0; /* can't get here, but keep compiler quiet */
383 : }
384 :
385 : static int
386 33538 : pg_wc_isword(pg_wchar c)
387 : {
388 : /* We define word characters as alnum class plus underscore */
389 33538 : if (c == CHR('_'))
390 22 : return 1;
391 33516 : return pg_wc_isalnum(c);
392 : }
393 :
394 : static int
395 28688 : pg_wc_isupper(pg_wchar c)
396 : {
397 28688 : switch (pg_regex_strategy)
398 : {
399 0 : case PG_REGEX_STRATEGY_C:
400 0 : return (c <= (pg_wchar) 127 &&
401 0 : (pg_char_properties[c] & PG_ISUPPER));
402 12288 : case PG_REGEX_STRATEGY_BUILTIN:
403 12288 : return pg_u_isupper(c);
404 4112 : case PG_REGEX_STRATEGY_LIBC_WIDE:
405 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
406 4112 : return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
407 : /* FALL THRU */
408 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
409 0 : return (c <= (pg_wchar) UCHAR_MAX &&
410 0 : isupper_l((unsigned char) c, pg_regex_locale->info.lt));
411 : break;
412 12288 : case PG_REGEX_STRATEGY_ICU:
413 : #ifdef USE_ICU
414 12288 : return u_isupper(c);
415 : #endif
416 : break;
417 : }
418 0 : return 0; /* can't get here, but keep compiler quiet */
419 : }
420 :
421 : static int
422 16390 : pg_wc_islower(pg_wchar c)
423 : {
424 16390 : switch (pg_regex_strategy)
425 : {
426 0 : case PG_REGEX_STRATEGY_C:
427 0 : return (c <= (pg_wchar) 127 &&
428 0 : (pg_char_properties[c] & PG_ISLOWER));
429 0 : case PG_REGEX_STRATEGY_BUILTIN:
430 0 : return pg_u_islower(c);
431 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
432 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
433 4102 : return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
434 : /* FALL THRU */
435 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
436 0 : return (c <= (pg_wchar) UCHAR_MAX &&
437 0 : islower_l((unsigned char) c, pg_regex_locale->info.lt));
438 : break;
439 12288 : case PG_REGEX_STRATEGY_ICU:
440 : #ifdef USE_ICU
441 12288 : return u_islower(c);
442 : #endif
443 : break;
444 : }
445 0 : return 0; /* can't get here, but keep compiler quiet */
446 : }
447 :
448 : static int
449 16390 : pg_wc_isgraph(pg_wchar c)
450 : {
451 16390 : switch (pg_regex_strategy)
452 : {
453 0 : case PG_REGEX_STRATEGY_C:
454 0 : return (c <= (pg_wchar) 127 &&
455 0 : (pg_char_properties[c] & PG_ISGRAPH));
456 0 : case PG_REGEX_STRATEGY_BUILTIN:
457 0 : return pg_u_isgraph(c);
458 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
459 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
460 4102 : return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
461 : /* FALL THRU */
462 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
463 0 : return (c <= (pg_wchar) UCHAR_MAX &&
464 0 : isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
465 : break;
466 12288 : case PG_REGEX_STRATEGY_ICU:
467 : #ifdef USE_ICU
468 12288 : return u_isgraph(c);
469 : #endif
470 : break;
471 : }
472 0 : return 0; /* can't get here, but keep compiler quiet */
473 : }
474 :
475 : static int
476 16390 : pg_wc_isprint(pg_wchar c)
477 : {
478 16390 : switch (pg_regex_strategy)
479 : {
480 0 : case PG_REGEX_STRATEGY_C:
481 0 : return (c <= (pg_wchar) 127 &&
482 0 : (pg_char_properties[c] & PG_ISPRINT));
483 0 : case PG_REGEX_STRATEGY_BUILTIN:
484 0 : return pg_u_isprint(c);
485 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
486 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
487 4102 : return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
488 : /* FALL THRU */
489 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
490 0 : return (c <= (pg_wchar) UCHAR_MAX &&
491 0 : isprint_l((unsigned char) c, pg_regex_locale->info.lt));
492 : break;
493 12288 : case PG_REGEX_STRATEGY_ICU:
494 : #ifdef USE_ICU
495 12288 : return u_isprint(c);
496 : #endif
497 : break;
498 : }
499 0 : return 0; /* can't get here, but keep compiler quiet */
500 : }
501 :
502 : static int
503 28678 : pg_wc_ispunct(pg_wchar c)
504 : {
505 28678 : switch (pg_regex_strategy)
506 : {
507 0 : case PG_REGEX_STRATEGY_C:
508 0 : return (c <= (pg_wchar) 127 &&
509 0 : (pg_char_properties[c] & PG_ISPUNCT));
510 12288 : case PG_REGEX_STRATEGY_BUILTIN:
511 12288 : return pg_u_ispunct(c, true);
512 4102 : case PG_REGEX_STRATEGY_LIBC_WIDE:
513 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
514 4102 : return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
515 : /* FALL THRU */
516 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
517 0 : return (c <= (pg_wchar) UCHAR_MAX &&
518 0 : ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
519 : break;
520 12288 : case PG_REGEX_STRATEGY_ICU:
521 : #ifdef USE_ICU
522 12288 : return u_ispunct(c);
523 : #endif
524 : break;
525 : }
526 0 : return 0; /* can't get here, but keep compiler quiet */
527 : }
528 :
529 : static int
530 76386 : pg_wc_isspace(pg_wchar c)
531 : {
532 76386 : switch (pg_regex_strategy)
533 : {
534 0 : case PG_REGEX_STRATEGY_C:
535 0 : return (c <= (pg_wchar) 127 &&
536 0 : (pg_char_properties[c] & PG_ISSPACE));
537 16398 : case PG_REGEX_STRATEGY_BUILTIN:
538 16398 : return pg_u_isspace(c);
539 47700 : case PG_REGEX_STRATEGY_LIBC_WIDE:
540 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
541 47700 : return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
542 : /* FALL THRU */
543 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
544 0 : return (c <= (pg_wchar) UCHAR_MAX &&
545 0 : isspace_l((unsigned char) c, pg_regex_locale->info.lt));
546 : break;
547 12288 : case PG_REGEX_STRATEGY_ICU:
548 : #ifdef USE_ICU
549 12288 : return u_isspace(c);
550 : #endif
551 : break;
552 : }
553 0 : return 0; /* can't get here, but keep compiler quiet */
554 : }
555 :
556 : static pg_wchar
557 10546 : pg_wc_toupper(pg_wchar c)
558 : {
559 10546 : switch (pg_regex_strategy)
560 : {
561 978 : case PG_REGEX_STRATEGY_C:
562 978 : if (c <= (pg_wchar) 127)
563 978 : return pg_ascii_toupper((unsigned char) c);
564 0 : return c;
565 372 : case PG_REGEX_STRATEGY_BUILTIN:
566 372 : return unicode_uppercase_simple(c);
567 9088 : case PG_REGEX_STRATEGY_LIBC_WIDE:
568 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
569 9088 : return towupper_l((wint_t) c, pg_regex_locale->info.lt);
570 : /* FALL THRU */
571 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
572 0 : if (c <= (pg_wchar) UCHAR_MAX)
573 0 : return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
574 0 : return c;
575 108 : case PG_REGEX_STRATEGY_ICU:
576 : #ifdef USE_ICU
577 108 : return u_toupper(c);
578 : #endif
579 : break;
580 : }
581 0 : return 0; /* can't get here, but keep compiler quiet */
582 : }
583 :
584 : static pg_wchar
585 10550 : pg_wc_tolower(pg_wchar c)
586 : {
587 10550 : switch (pg_regex_strategy)
588 : {
589 978 : case PG_REGEX_STRATEGY_C:
590 978 : if (c <= (pg_wchar) 127)
591 978 : return pg_ascii_tolower((unsigned char) c);
592 0 : return c;
593 372 : case PG_REGEX_STRATEGY_BUILTIN:
594 372 : return unicode_lowercase_simple(c);
595 9092 : case PG_REGEX_STRATEGY_LIBC_WIDE:
596 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
597 9092 : return towlower_l((wint_t) c, pg_regex_locale->info.lt);
598 : /* FALL THRU */
599 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
600 0 : if (c <= (pg_wchar) UCHAR_MAX)
601 0 : return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
602 0 : return c;
603 108 : case PG_REGEX_STRATEGY_ICU:
604 : #ifdef USE_ICU
605 108 : return u_tolower(c);
606 : #endif
607 : break;
608 : }
609 0 : return 0; /* can't get here, but keep compiler quiet */
610 : }
611 :
612 :
613 : /*
614 : * These functions cache the results of probing libc's ctype behavior for
615 : * all character codes of interest in a given encoding/collation. The
616 : * result is provided as a "struct cvec", but notice that the representation
617 : * is a touch different from a cvec created by regc_cvec.c: we allocate the
618 : * chrs[] and ranges[] arrays separately from the struct so that we can
619 : * realloc them larger at need. This is okay since the cvecs made here
620 : * should never be freed by freecvec().
621 : *
622 : * We use malloc not palloc since we mustn't lose control on out-of-memory;
623 : * the main regex code expects us to return a failure indication instead.
624 : */
625 :
626 : typedef int (*pg_wc_probefunc) (pg_wchar c);
627 :
628 : typedef struct pg_ctype_cache
629 : {
630 : pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
631 : Oid collation; /* collation this entry is for */
632 : struct cvec cv; /* cache entry contents */
633 : struct pg_ctype_cache *next; /* chain link */
634 : } pg_ctype_cache;
635 :
636 : static pg_ctype_cache *pg_ctype_cache_list = NULL;
637 :
638 : /*
639 : * Add a chr or range to pcc->cv; return false if run out of memory
640 : */
641 : static bool
642 8984 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
643 : {
644 : chr *newchrs;
645 :
646 8984 : if (nchrs > 1)
647 : {
648 2746 : if (pcc->cv.nranges >= pcc->cv.rangespace)
649 : {
650 0 : pcc->cv.rangespace *= 2;
651 0 : newchrs = (chr *) realloc(pcc->cv.ranges,
652 0 : pcc->cv.rangespace * sizeof(chr) * 2);
653 0 : if (newchrs == NULL)
654 0 : return false;
655 0 : pcc->cv.ranges = newchrs;
656 : }
657 2746 : pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
658 2746 : pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
659 2746 : pcc->cv.nranges++;
660 : }
661 : else
662 : {
663 : assert(nchrs == 1);
664 6238 : if (pcc->cv.nchrs >= pcc->cv.chrspace)
665 : {
666 22 : pcc->cv.chrspace *= 2;
667 22 : newchrs = (chr *) realloc(pcc->cv.chrs,
668 22 : pcc->cv.chrspace * sizeof(chr));
669 22 : if (newchrs == NULL)
670 0 : return false;
671 22 : pcc->cv.chrs = newchrs;
672 : }
673 6238 : pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
674 : }
675 8984 : return true;
676 : }
677 :
678 : /*
679 : * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
680 : * chrs satisfying the probe function. The active collation is the one
681 : * previously set by pg_set_regex_collation. Return NULL if out of memory.
682 : *
683 : * Note that the result must not be freed or modified by caller.
684 : */
685 : static struct cvec *
686 764 : pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
687 : {
688 : pg_ctype_cache *pcc;
689 : pg_wchar max_chr;
690 : pg_wchar cur_chr;
691 : int nmatches;
692 : chr *newchrs;
693 :
694 : /*
695 : * Do we already have the answer cached?
696 : */
697 1736 : for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
698 : {
699 1504 : if (pcc->probefunc == probefunc &&
700 580 : pcc->collation == pg_regex_collation)
701 532 : return &pcc->cv;
702 : }
703 :
704 : /*
705 : * Nope, so initialize some workspace ...
706 : */
707 232 : pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
708 232 : if (pcc == NULL)
709 0 : return NULL;
710 232 : pcc->probefunc = probefunc;
711 232 : pcc->collation = pg_regex_collation;
712 232 : pcc->cv.nchrs = 0;
713 232 : pcc->cv.chrspace = 128;
714 232 : pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
715 232 : pcc->cv.nranges = 0;
716 232 : pcc->cv.rangespace = 64;
717 232 : pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
718 232 : if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
719 0 : goto out_of_memory;
720 232 : pcc->cv.cclasscode = cclasscode;
721 :
722 : /*
723 : * Decide how many character codes we ought to look through. In general
724 : * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
725 : * runtime using the "high colormap" mechanism. However, in C locale
726 : * there's no need to go further than 127, and if we only have a 1-byte
727 : * <ctype.h> API there's no need to go further than that can handle.
728 : *
729 : * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
730 : * output cvec as not having any locale-dependent behavior, since there
731 : * will be no need to do any run-time locale checks. (The #if's here
732 : * would always be true for production values of MAX_SIMPLE_CHR, but it's
733 : * useful to allow it to be small for testing purposes.)
734 : */
735 232 : switch (pg_regex_strategy)
736 : {
737 28 : case PG_REGEX_STRATEGY_C:
738 : #if MAX_SIMPLE_CHR >= 127
739 28 : max_chr = (pg_wchar) 127;
740 28 : pcc->cv.cclasscode = -1;
741 : #else
742 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
743 : #endif
744 28 : break;
745 54 : case PG_REGEX_STRATEGY_BUILTIN:
746 54 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
747 54 : break;
748 96 : case PG_REGEX_STRATEGY_LIBC_WIDE:
749 96 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
750 96 : break;
751 0 : case PG_REGEX_STRATEGY_LIBC_1BYTE:
752 : #if MAX_SIMPLE_CHR >= UCHAR_MAX
753 0 : max_chr = (pg_wchar) UCHAR_MAX;
754 0 : pcc->cv.cclasscode = -1;
755 : #else
756 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
757 : #endif
758 0 : break;
759 54 : case PG_REGEX_STRATEGY_ICU:
760 54 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
761 54 : break;
762 0 : default:
763 : Assert(false);
764 0 : max_chr = 0; /* can't get here, but keep compiler quiet */
765 0 : break;
766 : }
767 :
768 : /*
769 : * And scan 'em ...
770 : */
771 232 : nmatches = 0; /* number of consecutive matches */
772 :
773 421608 : for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
774 : {
775 421376 : if ((*probefunc) (cur_chr))
776 107050 : nmatches++;
777 314326 : else if (nmatches > 0)
778 : {
779 8960 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
780 0 : goto out_of_memory;
781 8960 : nmatches = 0;
782 : }
783 : }
784 :
785 232 : if (nmatches > 0)
786 24 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
787 0 : goto out_of_memory;
788 :
789 : /*
790 : * We might have allocated more memory than needed, if so free it
791 : */
792 232 : if (pcc->cv.nchrs == 0)
793 : {
794 98 : free(pcc->cv.chrs);
795 98 : pcc->cv.chrs = NULL;
796 98 : pcc->cv.chrspace = 0;
797 : }
798 134 : else if (pcc->cv.nchrs < pcc->cv.chrspace)
799 : {
800 134 : newchrs = (chr *) realloc(pcc->cv.chrs,
801 134 : pcc->cv.nchrs * sizeof(chr));
802 134 : if (newchrs == NULL)
803 0 : goto out_of_memory;
804 134 : pcc->cv.chrs = newchrs;
805 134 : pcc->cv.chrspace = pcc->cv.nchrs;
806 : }
807 232 : if (pcc->cv.nranges == 0)
808 : {
809 0 : free(pcc->cv.ranges);
810 0 : pcc->cv.ranges = NULL;
811 0 : pcc->cv.rangespace = 0;
812 : }
813 232 : else if (pcc->cv.nranges < pcc->cv.rangespace)
814 : {
815 232 : newchrs = (chr *) realloc(pcc->cv.ranges,
816 232 : pcc->cv.nranges * sizeof(chr) * 2);
817 232 : if (newchrs == NULL)
818 0 : goto out_of_memory;
819 232 : pcc->cv.ranges = newchrs;
820 232 : pcc->cv.rangespace = pcc->cv.nranges;
821 : }
822 :
823 : /*
824 : * Success, link it into cache chain
825 : */
826 232 : pcc->next = pg_ctype_cache_list;
827 232 : pg_ctype_cache_list = pcc;
828 :
829 232 : return &pcc->cv;
830 :
831 : /*
832 : * Failure, clean up
833 : */
834 0 : out_of_memory:
835 0 : free(pcc->cv.chrs);
836 0 : free(pcc->cv.ranges);
837 0 : free(pcc);
838 :
839 0 : return NULL;
840 : }
|