Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regc_pg_locale.c
4 : * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 : * and functions to cache the results of wholesale ctype probing.
6 : *
7 : * This file is #included by regcomp.c; it's not meant to compile standalone.
8 : *
9 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : * IDENTIFICATION
13 : * src/backend/regex/regc_pg_locale.c
14 : *
15 : *-------------------------------------------------------------------------
16 : */
17 :
18 : #include "catalog/pg_collation.h"
19 : #include "common/unicode_case.h"
20 : #include "common/unicode_category.h"
21 : #include "utils/pg_locale.h"
22 :
23 : /*
24 : * To provide as much functionality as possible on a variety of platforms,
25 : * without going so far as to implement everything from scratch, we use
26 : * several implementation strategies depending on the situation:
27 : *
28 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
29 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
30 : * collations don't give a fig about multibyte characters.
31 : *
32 : * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
33 : *
34 : * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
35 : * This assumes that every platform uses Unicode codepoints directly
36 : * as the wchar_t representation of Unicode. On some platforms
37 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
38 : *
39 : * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
40 : * values up to 255, and punt for values above that. This is 100% correct
41 : * only in single-byte encodings such as LATINn. However, non-Unicode
42 : * multibyte encodings are mostly Far Eastern character sets for which the
43 : * properties being tested here aren't very relevant for higher code values
44 : * anyway. The difficulty with using the <wctype.h> functions with
45 : * non-Unicode multibyte encodings is that we can have no certainty that
46 : * the platform's wchar_t representation matches what we do in pg_wchar
47 : * conversions.
48 : *
 * 3. In any other collation that is backed by a libc locale_t, we use the
 * locale_t-extended forms of the <wctype.h> and <ctype.h> functions, under
 * exactly the same cases as #2.
51 : *
52 : * There is one notable difference between cases 2 and 3: in the "default"
53 : * collation we force ASCII letters to follow ASCII upcase/downcase rules,
54 : * while in a non-default collation we just let the library functions do what
55 : * they will. The case where this matters is treatment of I/i in Turkish,
56 : * and the behavior is meant to match the upper()/lower() SQL functions.
57 : *
58 : * We store the active collation setting in static variables. In principle
59 : * it could be passed down to here via the regex library's "struct vars" data
60 : * structure; but that would require somewhat invasive changes in the regex
61 : * library, and right now there's no real benefit to be gained from that.
62 : *
63 : * NB: the coding here assumes pg_wchar is an unsigned type.
64 : */
65 :
/*
 * Which implementation the pg_wc_xxx functions dispatch to.  The value is
 * chosen by pg_set_regex_collation().
 */
typedef enum
{
	PG_REGEX_LOCALE_C = 0,		/* hard-wired C-locale table, any encoding */
	PG_REGEX_BUILTIN = 1,		/* built-in Unicode semantics */
	PG_REGEX_LOCALE_WIDE = 2,	/* plain <wctype.h> functions */
	PG_REGEX_LOCALE_1BYTE = 3,	/* plain <ctype.h> functions */
	PG_REGEX_LOCALE_WIDE_L = 4, /* locale_t variants of <wctype.h> */
	PG_REGEX_LOCALE_1BYTE_L = 5,	/* locale_t variants of <ctype.h> */
	PG_REGEX_LOCALE_ICU = 6,	/* ICU uchar.h functions */
} PG_Locale_Strategy;
76 :
77 : static PG_Locale_Strategy pg_regex_strategy;
78 : static pg_locale_t pg_regex_locale;
79 : static Oid pg_regex_collation;
80 :
/*
 * Hard-wired character properties for C locale.
 *
 * Each PG_ISxxx flag is one bit; an entry of pg_char_properties[] is the OR
 * of the flags that hold for that ASCII code.
 */
#define PG_ISDIGIT	0x01
#define PG_ISALPHA	0x02
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
#define PG_ISUPPER	0x04
#define PG_ISLOWER	0x08
#define PG_ISGRAPH	0x10
#define PG_ISPRINT	0x20
#define PG_ISPUNCT	0x40
#define PG_ISSPACE	0x80

/* Shorthands for the distinct flag combinations appearing in the table */
#define PG_CP_CTL	0
#define PG_CP_WSP	(PG_ISSPACE)
#define PG_CP_BLK	(PG_ISPRINT | PG_ISSPACE)
#define PG_CP_PUN	(PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT)
#define PG_CP_DIG	(PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_UPR	(PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_LWR	(PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT)

static const unsigned char pg_char_properties[128] = {
	/* 0x00-0x07: NUL and ^A through ^G */
	PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL,
	/* 0x08-0x0F: ^H, then ^I through ^M (whitespace), then ^N and ^O */
	PG_CP_CTL, PG_CP_WSP, PG_CP_WSP, PG_CP_WSP, PG_CP_WSP, PG_CP_WSP, PG_CP_CTL, PG_CP_CTL,
	/* 0x10-0x17: ^P through ^W */
	PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL,
	/* 0x18-0x1F: ^X through ^_ */
	PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL, PG_CP_CTL,
	/* 0x20-0x27: space, then exclamation mark through apostrophe */
	PG_CP_BLK, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN,
	/* 0x28-0x2F: left parenthesis through slash */
	PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN,
	/* 0x30-0x37: digits 0 through 7 */
	PG_CP_DIG, PG_CP_DIG, PG_CP_DIG, PG_CP_DIG, PG_CP_DIG, PG_CP_DIG, PG_CP_DIG, PG_CP_DIG,
	/* 0x38-0x3F: digits 8 and 9, then colon through question mark */
	PG_CP_DIG, PG_CP_DIG, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN,
	/* 0x40-0x47: at-sign, then A through G */
	PG_CP_PUN, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR,
	/* 0x48-0x4F: H through O */
	PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR,
	/* 0x50-0x57: P through W */
	PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_UPR,
	/* 0x58-0x5F: X, Y, Z, then left bracket through underscore */
	PG_CP_UPR, PG_CP_UPR, PG_CP_UPR, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN,
	/* 0x60-0x67: backquote, then a through g */
	PG_CP_PUN, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR,
	/* 0x68-0x6F: h through o */
	PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR,
	/* 0x70-0x77: p through w */
	PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_LWR,
	/* 0x78-0x7F: x, y, z, then left brace through tilde, then DEL */
	PG_CP_LWR, PG_CP_LWR, PG_CP_LWR, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_PUN, PG_CP_CTL
};

/* The shorthands are only meant for building the table above */
#undef PG_CP_CTL
#undef PG_CP_WSP
#undef PG_CP_BLK
#undef PG_CP_PUN
#undef PG_CP_DIG
#undef PG_CP_UPR
#undef PG_CP_LWR
224 :
225 :
226 : /*
227 : * pg_set_regex_collation: set collation for these functions to obey
228 : *
229 : * This is called when beginning compilation or execution of a regexp.
230 : * Since there's no need for reentrancy of regexp operations, it's okay
231 : * to store the results in static variables.
232 : */
233 : void
234 1889728 : pg_set_regex_collation(Oid collation)
235 : {
236 1889728 : if (!OidIsValid(collation))
237 : {
238 : /*
239 : * This typically means that the parser could not resolve a conflict
240 : * of implicit collations, so report it that way.
241 : */
242 0 : ereport(ERROR,
243 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
244 : errmsg("could not determine which collation to use for regular expression"),
245 : errhint("Use the COLLATE clause to set the collation explicitly.")));
246 : }
247 :
248 1889728 : if (lc_ctype_is_c(collation))
249 : {
250 : /* C/POSIX collations use this path regardless of database encoding */
251 108374 : pg_regex_strategy = PG_REGEX_LOCALE_C;
252 108374 : pg_regex_locale = 0;
253 108374 : pg_regex_collation = C_COLLATION_OID;
254 : }
255 : else
256 : {
257 1781354 : pg_regex_locale = pg_newlocale_from_collation(collation);
258 :
259 1781354 : if (!pg_locale_deterministic(pg_regex_locale))
260 24 : ereport(ERROR,
261 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
262 : errmsg("nondeterministic collations are not supported for regular expressions")));
263 :
264 : #ifdef USE_ICU
265 1781330 : if (pg_regex_locale && pg_regex_locale->provider == COLLPROVIDER_ICU)
266 942 : pg_regex_strategy = PG_REGEX_LOCALE_ICU;
267 : else
268 : #endif
269 1780388 : if (GetDatabaseEncoding() == PG_UTF8)
270 : {
271 1780384 : if (pg_regex_locale)
272 : {
273 187994 : if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
274 187994 : pg_regex_strategy = PG_REGEX_BUILTIN;
275 : else
276 0 : pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
277 : }
278 : else
279 1592390 : pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
280 : }
281 : else
282 : {
283 4 : if (pg_regex_locale)
284 0 : pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L;
285 : else
286 4 : pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
287 : }
288 :
289 1781330 : pg_regex_collation = collation;
290 : }
291 1889704 : }
292 :
293 : static int
294 145936 : pg_wc_isdigit(pg_wchar c)
295 : {
296 145936 : switch (pg_regex_strategy)
297 : {
298 2142 : case PG_REGEX_LOCALE_C:
299 4284 : return (c <= (pg_wchar) 127 &&
300 2142 : (pg_char_properties[c] & PG_ISDIGIT));
301 45166 : case PG_REGEX_BUILTIN:
302 45166 : return pg_u_isdigit(c, true);
303 86340 : case PG_REGEX_LOCALE_WIDE:
304 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
305 86340 : return iswdigit((wint_t) c);
306 : /* FALL THRU */
307 : case PG_REGEX_LOCALE_1BYTE:
308 0 : return (c <= (pg_wchar) UCHAR_MAX &&
309 0 : isdigit((unsigned char) c));
310 0 : case PG_REGEX_LOCALE_WIDE_L:
311 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
312 0 : return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
313 : /* FALL THRU */
314 : case PG_REGEX_LOCALE_1BYTE_L:
315 0 : return (c <= (pg_wchar) UCHAR_MAX &&
316 0 : isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
317 : break;
318 12288 : case PG_REGEX_LOCALE_ICU:
319 : #ifdef USE_ICU
320 12288 : return u_isdigit(c);
321 : #endif
322 : break;
323 : }
324 0 : return 0; /* can't get here, but keep compiler quiet */
325 : }
326 :
327 : static int
328 16660 : pg_wc_isalpha(pg_wchar c)
329 : {
330 16660 : switch (pg_regex_strategy)
331 : {
332 0 : case PG_REGEX_LOCALE_C:
333 0 : return (c <= (pg_wchar) 127 &&
334 0 : (pg_char_properties[c] & PG_ISALPHA));
335 22 : case PG_REGEX_BUILTIN:
336 22 : return pg_u_isalpha(c);
337 4350 : case PG_REGEX_LOCALE_WIDE:
338 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
339 4350 : return iswalpha((wint_t) c);
340 : /* FALL THRU */
341 : case PG_REGEX_LOCALE_1BYTE:
342 0 : return (c <= (pg_wchar) UCHAR_MAX &&
343 0 : isalpha((unsigned char) c));
344 0 : case PG_REGEX_LOCALE_WIDE_L:
345 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
346 0 : return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
347 : /* FALL THRU */
348 : case PG_REGEX_LOCALE_1BYTE_L:
349 0 : return (c <= (pg_wchar) UCHAR_MAX &&
350 0 : isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
351 : break;
352 12288 : case PG_REGEX_LOCALE_ICU:
353 : #ifdef USE_ICU
354 12288 : return u_isalpha(c);
355 : #endif
356 : break;
357 : }
358 0 : return 0; /* can't get here, but keep compiler quiet */
359 : }
360 :
361 : static int
362 66316 : pg_wc_isalnum(pg_wchar c)
363 : {
364 66316 : switch (pg_regex_strategy)
365 : {
366 762 : case PG_REGEX_LOCALE_C:
367 1524 : return (c <= (pg_wchar) 127 &&
368 762 : (pg_char_properties[c] & PG_ISALNUM));
369 20476 : case PG_REGEX_BUILTIN:
370 20476 : return pg_u_isalnum(c, true);
371 32790 : case PG_REGEX_LOCALE_WIDE:
372 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
373 32790 : return iswalnum((wint_t) c);
374 : /* FALL THRU */
375 : case PG_REGEX_LOCALE_1BYTE:
376 0 : return (c <= (pg_wchar) UCHAR_MAX &&
377 0 : isalnum((unsigned char) c));
378 0 : case PG_REGEX_LOCALE_WIDE_L:
379 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
380 0 : return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
381 : /* FALL THRU */
382 : case PG_REGEX_LOCALE_1BYTE_L:
383 0 : return (c <= (pg_wchar) UCHAR_MAX &&
384 0 : isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
385 : break;
386 12288 : case PG_REGEX_LOCALE_ICU:
387 : #ifdef USE_ICU
388 12288 : return u_isalnum(c);
389 : #endif
390 : break;
391 : }
392 0 : return 0; /* can't get here, but keep compiler quiet */
393 : }
394 :
395 : static int
396 33538 : pg_wc_isword(pg_wchar c)
397 : {
398 : /* We define word characters as alnum class plus underscore */
399 33538 : if (c == CHR('_'))
400 22 : return 1;
401 33516 : return pg_wc_isalnum(c);
402 : }
403 :
404 : static int
405 28688 : pg_wc_isupper(pg_wchar c)
406 : {
407 28688 : switch (pg_regex_strategy)
408 : {
409 0 : case PG_REGEX_LOCALE_C:
410 0 : return (c <= (pg_wchar) 127 &&
411 0 : (pg_char_properties[c] & PG_ISUPPER));
412 12288 : case PG_REGEX_BUILTIN:
413 12288 : return pg_u_isupper(c);
414 4112 : case PG_REGEX_LOCALE_WIDE:
415 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
416 4112 : return iswupper((wint_t) c);
417 : /* FALL THRU */
418 : case PG_REGEX_LOCALE_1BYTE:
419 0 : return (c <= (pg_wchar) UCHAR_MAX &&
420 0 : isupper((unsigned char) c));
421 0 : case PG_REGEX_LOCALE_WIDE_L:
422 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
423 0 : return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
424 : /* FALL THRU */
425 : case PG_REGEX_LOCALE_1BYTE_L:
426 0 : return (c <= (pg_wchar) UCHAR_MAX &&
427 0 : isupper_l((unsigned char) c, pg_regex_locale->info.lt));
428 : break;
429 12288 : case PG_REGEX_LOCALE_ICU:
430 : #ifdef USE_ICU
431 12288 : return u_isupper(c);
432 : #endif
433 : break;
434 : }
435 0 : return 0; /* can't get here, but keep compiler quiet */
436 : }
437 :
438 : static int
439 16390 : pg_wc_islower(pg_wchar c)
440 : {
441 16390 : switch (pg_regex_strategy)
442 : {
443 0 : case PG_REGEX_LOCALE_C:
444 0 : return (c <= (pg_wchar) 127 &&
445 0 : (pg_char_properties[c] & PG_ISLOWER));
446 0 : case PG_REGEX_BUILTIN:
447 0 : return pg_u_islower(c);
448 4102 : case PG_REGEX_LOCALE_WIDE:
449 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
450 4102 : return iswlower((wint_t) c);
451 : /* FALL THRU */
452 : case PG_REGEX_LOCALE_1BYTE:
453 0 : return (c <= (pg_wchar) UCHAR_MAX &&
454 0 : islower((unsigned char) c));
455 0 : case PG_REGEX_LOCALE_WIDE_L:
456 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
457 0 : return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
458 : /* FALL THRU */
459 : case PG_REGEX_LOCALE_1BYTE_L:
460 0 : return (c <= (pg_wchar) UCHAR_MAX &&
461 0 : islower_l((unsigned char) c, pg_regex_locale->info.lt));
462 : break;
463 12288 : case PG_REGEX_LOCALE_ICU:
464 : #ifdef USE_ICU
465 12288 : return u_islower(c);
466 : #endif
467 : break;
468 : }
469 0 : return 0; /* can't get here, but keep compiler quiet */
470 : }
471 :
472 : static int
473 16390 : pg_wc_isgraph(pg_wchar c)
474 : {
475 16390 : switch (pg_regex_strategy)
476 : {
477 0 : case PG_REGEX_LOCALE_C:
478 0 : return (c <= (pg_wchar) 127 &&
479 0 : (pg_char_properties[c] & PG_ISGRAPH));
480 0 : case PG_REGEX_BUILTIN:
481 0 : return pg_u_isgraph(c);
482 4102 : case PG_REGEX_LOCALE_WIDE:
483 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
484 4102 : return iswgraph((wint_t) c);
485 : /* FALL THRU */
486 : case PG_REGEX_LOCALE_1BYTE:
487 0 : return (c <= (pg_wchar) UCHAR_MAX &&
488 0 : isgraph((unsigned char) c));
489 0 : case PG_REGEX_LOCALE_WIDE_L:
490 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
491 0 : return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
492 : /* FALL THRU */
493 : case PG_REGEX_LOCALE_1BYTE_L:
494 0 : return (c <= (pg_wchar) UCHAR_MAX &&
495 0 : isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
496 : break;
497 12288 : case PG_REGEX_LOCALE_ICU:
498 : #ifdef USE_ICU
499 12288 : return u_isgraph(c);
500 : #endif
501 : break;
502 : }
503 0 : return 0; /* can't get here, but keep compiler quiet */
504 : }
505 :
506 : static int
507 16390 : pg_wc_isprint(pg_wchar c)
508 : {
509 16390 : switch (pg_regex_strategy)
510 : {
511 0 : case PG_REGEX_LOCALE_C:
512 0 : return (c <= (pg_wchar) 127 &&
513 0 : (pg_char_properties[c] & PG_ISPRINT));
514 0 : case PG_REGEX_BUILTIN:
515 0 : return pg_u_isprint(c);
516 4102 : case PG_REGEX_LOCALE_WIDE:
517 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
518 4102 : return iswprint((wint_t) c);
519 : /* FALL THRU */
520 : case PG_REGEX_LOCALE_1BYTE:
521 0 : return (c <= (pg_wchar) UCHAR_MAX &&
522 0 : isprint((unsigned char) c));
523 0 : case PG_REGEX_LOCALE_WIDE_L:
524 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
525 0 : return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
526 : /* FALL THRU */
527 : case PG_REGEX_LOCALE_1BYTE_L:
528 0 : return (c <= (pg_wchar) UCHAR_MAX &&
529 0 : isprint_l((unsigned char) c, pg_regex_locale->info.lt));
530 : break;
531 12288 : case PG_REGEX_LOCALE_ICU:
532 : #ifdef USE_ICU
533 12288 : return u_isprint(c);
534 : #endif
535 : break;
536 : }
537 0 : return 0; /* can't get here, but keep compiler quiet */
538 : }
539 :
540 : static int
541 28678 : pg_wc_ispunct(pg_wchar c)
542 : {
543 28678 : switch (pg_regex_strategy)
544 : {
545 0 : case PG_REGEX_LOCALE_C:
546 0 : return (c <= (pg_wchar) 127 &&
547 0 : (pg_char_properties[c] & PG_ISPUNCT));
548 12288 : case PG_REGEX_BUILTIN:
549 12288 : return pg_u_ispunct(c, true);
550 4102 : case PG_REGEX_LOCALE_WIDE:
551 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
552 4102 : return iswpunct((wint_t) c);
553 : /* FALL THRU */
554 : case PG_REGEX_LOCALE_1BYTE:
555 0 : return (c <= (pg_wchar) UCHAR_MAX &&
556 0 : ispunct((unsigned char) c));
557 0 : case PG_REGEX_LOCALE_WIDE_L:
558 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
559 0 : return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
560 : /* FALL THRU */
561 : case PG_REGEX_LOCALE_1BYTE_L:
562 0 : return (c <= (pg_wchar) UCHAR_MAX &&
563 0 : ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
564 : break;
565 12288 : case PG_REGEX_LOCALE_ICU:
566 : #ifdef USE_ICU
567 12288 : return u_ispunct(c);
568 : #endif
569 : break;
570 : }
571 0 : return 0; /* can't get here, but keep compiler quiet */
572 : }
573 :
574 : static int
575 76386 : pg_wc_isspace(pg_wchar c)
576 : {
577 76386 : switch (pg_regex_strategy)
578 : {
579 0 : case PG_REGEX_LOCALE_C:
580 0 : return (c <= (pg_wchar) 127 &&
581 0 : (pg_char_properties[c] & PG_ISSPACE));
582 16398 : case PG_REGEX_BUILTIN:
583 16398 : return pg_u_isspace(c);
584 47700 : case PG_REGEX_LOCALE_WIDE:
585 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
586 47700 : return iswspace((wint_t) c);
587 : /* FALL THRU */
588 : case PG_REGEX_LOCALE_1BYTE:
589 0 : return (c <= (pg_wchar) UCHAR_MAX &&
590 0 : isspace((unsigned char) c));
591 0 : case PG_REGEX_LOCALE_WIDE_L:
592 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
593 0 : return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
594 : /* FALL THRU */
595 : case PG_REGEX_LOCALE_1BYTE_L:
596 0 : return (c <= (pg_wchar) UCHAR_MAX &&
597 0 : isspace_l((unsigned char) c, pg_regex_locale->info.lt));
598 : break;
599 12288 : case PG_REGEX_LOCALE_ICU:
600 : #ifdef USE_ICU
601 12288 : return u_isspace(c);
602 : #endif
603 : break;
604 : }
605 0 : return 0; /* can't get here, but keep compiler quiet */
606 : }
607 :
608 : static pg_wchar
609 10546 : pg_wc_toupper(pg_wchar c)
610 : {
611 10546 : switch (pg_regex_strategy)
612 : {
613 1056 : case PG_REGEX_LOCALE_C:
614 1056 : if (c <= (pg_wchar) 127)
615 1056 : return pg_ascii_toupper((unsigned char) c);
616 0 : return c;
617 372 : case PG_REGEX_BUILTIN:
618 372 : return unicode_uppercase_simple(c);
619 9010 : case PG_REGEX_LOCALE_WIDE:
620 : /* force C behavior for ASCII characters, per comments above */
621 9010 : if (c <= (pg_wchar) 127)
622 814 : return pg_ascii_toupper((unsigned char) c);
623 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
624 8196 : return towupper((wint_t) c);
625 : /* FALL THRU */
626 : case PG_REGEX_LOCALE_1BYTE:
627 : /* force C behavior for ASCII characters, per comments above */
628 0 : if (c <= (pg_wchar) 127)
629 0 : return pg_ascii_toupper((unsigned char) c);
630 0 : if (c <= (pg_wchar) UCHAR_MAX)
631 0 : return toupper((unsigned char) c);
632 0 : return c;
633 0 : case PG_REGEX_LOCALE_WIDE_L:
634 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
635 0 : return towupper_l((wint_t) c, pg_regex_locale->info.lt);
636 : /* FALL THRU */
637 : case PG_REGEX_LOCALE_1BYTE_L:
638 0 : if (c <= (pg_wchar) UCHAR_MAX)
639 0 : return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
640 0 : return c;
641 108 : case PG_REGEX_LOCALE_ICU:
642 : #ifdef USE_ICU
643 108 : return u_toupper(c);
644 : #endif
645 : break;
646 : }
647 0 : return 0; /* can't get here, but keep compiler quiet */
648 : }
649 :
650 : static pg_wchar
651 10550 : pg_wc_tolower(pg_wchar c)
652 : {
653 10550 : switch (pg_regex_strategy)
654 : {
655 1056 : case PG_REGEX_LOCALE_C:
656 1056 : if (c <= (pg_wchar) 127)
657 1056 : return pg_ascii_tolower((unsigned char) c);
658 0 : return c;
659 372 : case PG_REGEX_BUILTIN:
660 372 : return unicode_lowercase_simple(c);
661 9014 : case PG_REGEX_LOCALE_WIDE:
662 : /* force C behavior for ASCII characters, per comments above */
663 9014 : if (c <= (pg_wchar) 127)
664 818 : return pg_ascii_tolower((unsigned char) c);
665 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
666 8196 : return towlower((wint_t) c);
667 : /* FALL THRU */
668 : case PG_REGEX_LOCALE_1BYTE:
669 : /* force C behavior for ASCII characters, per comments above */
670 0 : if (c <= (pg_wchar) 127)
671 0 : return pg_ascii_tolower((unsigned char) c);
672 0 : if (c <= (pg_wchar) UCHAR_MAX)
673 0 : return tolower((unsigned char) c);
674 0 : return c;
675 0 : case PG_REGEX_LOCALE_WIDE_L:
676 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
677 0 : return towlower_l((wint_t) c, pg_regex_locale->info.lt);
678 : /* FALL THRU */
679 : case PG_REGEX_LOCALE_1BYTE_L:
680 0 : if (c <= (pg_wchar) UCHAR_MAX)
681 0 : return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
682 0 : return c;
683 108 : case PG_REGEX_LOCALE_ICU:
684 : #ifdef USE_ICU
685 108 : return u_tolower(c);
686 : #endif
687 : break;
688 : }
689 0 : return 0; /* can't get here, but keep compiler quiet */
690 : }
691 :
692 :
693 : /*
694 : * These functions cache the results of probing libc's ctype behavior for
695 : * all character codes of interest in a given encoding/collation. The
696 : * result is provided as a "struct cvec", but notice that the representation
697 : * is a touch different from a cvec created by regc_cvec.c: we allocate the
698 : * chrs[] and ranges[] arrays separately from the struct so that we can
699 : * realloc them larger at need. This is okay since the cvecs made here
700 : * should never be freed by freecvec().
701 : *
702 : * We use malloc not palloc since we mustn't lose control on out-of-memory;
703 : * the main regex code expects us to return a failure indication instead.
704 : */
705 :
706 : typedef int (*pg_wc_probefunc) (pg_wchar c);
707 :
708 : typedef struct pg_ctype_cache
709 : {
710 : pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
711 : Oid collation; /* collation this entry is for */
712 : struct cvec cv; /* cache entry contents */
713 : struct pg_ctype_cache *next; /* chain link */
714 : } pg_ctype_cache;
715 :
716 : static pg_ctype_cache *pg_ctype_cache_list = NULL;
717 :
718 : /*
719 : * Add a chr or range to pcc->cv; return false if run out of memory
720 : */
721 : static bool
722 8966 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
723 : {
724 : chr *newchrs;
725 :
726 8966 : if (nchrs > 1)
727 : {
728 2728 : if (pcc->cv.nranges >= pcc->cv.rangespace)
729 : {
730 0 : pcc->cv.rangespace *= 2;
731 0 : newchrs = (chr *) realloc(pcc->cv.ranges,
732 0 : pcc->cv.rangespace * sizeof(chr) * 2);
733 0 : if (newchrs == NULL)
734 0 : return false;
735 0 : pcc->cv.ranges = newchrs;
736 : }
737 2728 : pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
738 2728 : pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
739 2728 : pcc->cv.nranges++;
740 : }
741 : else
742 : {
743 : assert(nchrs == 1);
744 6238 : if (pcc->cv.nchrs >= pcc->cv.chrspace)
745 : {
746 22 : pcc->cv.chrspace *= 2;
747 22 : newchrs = (chr *) realloc(pcc->cv.chrs,
748 22 : pcc->cv.chrspace * sizeof(chr));
749 22 : if (newchrs == NULL)
750 0 : return false;
751 22 : pcc->cv.chrs = newchrs;
752 : }
753 6238 : pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
754 : }
755 8966 : return true;
756 : }
757 :
758 : /*
759 : * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
760 : * chrs satisfying the probe function. The active collation is the one
761 : * previously set by pg_set_regex_collation. Return NULL if out of memory.
762 : *
763 : * Note that the result must not be freed or modified by caller.
764 : */
765 : static struct cvec *
766 702 : pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
767 : {
768 : pg_ctype_cache *pcc;
769 : pg_wchar max_chr;
770 : pg_wchar cur_chr;
771 : int nmatches;
772 : chr *newchrs;
773 :
774 : /*
775 : * Do we already have the answer cached?
776 : */
777 1634 : for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
778 : {
779 1414 : if (pcc->probefunc == probefunc &&
780 530 : pcc->collation == pg_regex_collation)
781 482 : return &pcc->cv;
782 : }
783 :
784 : /*
785 : * Nope, so initialize some workspace ...
786 : */
787 220 : pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
788 220 : if (pcc == NULL)
789 0 : return NULL;
790 220 : pcc->probefunc = probefunc;
791 220 : pcc->collation = pg_regex_collation;
792 220 : pcc->cv.nchrs = 0;
793 220 : pcc->cv.chrspace = 128;
794 220 : pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
795 220 : pcc->cv.nranges = 0;
796 220 : pcc->cv.rangespace = 64;
797 220 : pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
798 220 : if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
799 0 : goto out_of_memory;
800 220 : pcc->cv.cclasscode = cclasscode;
801 :
802 : /*
803 : * Decide how many character codes we ought to look through. In general
804 : * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
805 : * runtime using the "high colormap" mechanism. However, in C locale
806 : * there's no need to go further than 127, and if we only have a 1-byte
807 : * <ctype.h> API there's no need to go further than that can handle.
808 : *
809 : * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
810 : * output cvec as not having any locale-dependent behavior, since there
811 : * will be no need to do any run-time locale checks. (The #if's here
812 : * would always be true for production values of MAX_SIMPLE_CHR, but it's
813 : * useful to allow it to be small for testing purposes.)
814 : */
815 220 : switch (pg_regex_strategy)
816 : {
817 22 : case PG_REGEX_LOCALE_C:
818 : #if MAX_SIMPLE_CHR >= 127
819 22 : max_chr = (pg_wchar) 127;
820 22 : pcc->cv.cclasscode = -1;
821 : #else
822 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
823 : #endif
824 22 : break;
825 52 : case PG_REGEX_BUILTIN:
826 52 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
827 52 : break;
828 92 : case PG_REGEX_LOCALE_WIDE:
829 : case PG_REGEX_LOCALE_WIDE_L:
830 92 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
831 92 : break;
832 0 : case PG_REGEX_LOCALE_1BYTE:
833 : case PG_REGEX_LOCALE_1BYTE_L:
834 : #if MAX_SIMPLE_CHR >= UCHAR_MAX
835 0 : max_chr = (pg_wchar) UCHAR_MAX;
836 0 : pcc->cv.cclasscode = -1;
837 : #else
838 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
839 : #endif
840 0 : break;
841 54 : case PG_REGEX_LOCALE_ICU:
842 54 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
843 54 : break;
844 0 : default:
845 : Assert(false);
846 0 : max_chr = 0; /* can't get here, but keep compiler quiet */
847 0 : break;
848 : }
849 :
850 : /*
851 : * And scan 'em ...
852 : */
853 220 : nmatches = 0; /* number of consecutive matches */
854 :
855 408540 : for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
856 : {
857 408320 : if ((*probefunc) (cur_chr))
858 106678 : nmatches++;
859 301642 : else if (nmatches > 0)
860 : {
861 8942 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
862 0 : goto out_of_memory;
863 8942 : nmatches = 0;
864 : }
865 : }
866 :
867 220 : if (nmatches > 0)
868 24 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
869 0 : goto out_of_memory;
870 :
871 : /*
872 : * We might have allocated more memory than needed, if so free it
873 : */
874 220 : if (pcc->cv.nchrs == 0)
875 : {
876 86 : free(pcc->cv.chrs);
877 86 : pcc->cv.chrs = NULL;
878 86 : pcc->cv.chrspace = 0;
879 : }
880 134 : else if (pcc->cv.nchrs < pcc->cv.chrspace)
881 : {
882 134 : newchrs = (chr *) realloc(pcc->cv.chrs,
883 134 : pcc->cv.nchrs * sizeof(chr));
884 134 : if (newchrs == NULL)
885 0 : goto out_of_memory;
886 134 : pcc->cv.chrs = newchrs;
887 134 : pcc->cv.chrspace = pcc->cv.nchrs;
888 : }
889 220 : if (pcc->cv.nranges == 0)
890 : {
891 0 : free(pcc->cv.ranges);
892 0 : pcc->cv.ranges = NULL;
893 0 : pcc->cv.rangespace = 0;
894 : }
895 220 : else if (pcc->cv.nranges < pcc->cv.rangespace)
896 : {
897 220 : newchrs = (chr *) realloc(pcc->cv.ranges,
898 220 : pcc->cv.nranges * sizeof(chr) * 2);
899 220 : if (newchrs == NULL)
900 0 : goto out_of_memory;
901 220 : pcc->cv.ranges = newchrs;
902 220 : pcc->cv.rangespace = pcc->cv.nranges;
903 : }
904 :
905 : /*
906 : * Success, link it into cache chain
907 : */
908 220 : pcc->next = pg_ctype_cache_list;
909 220 : pg_ctype_cache_list = pcc;
910 :
911 220 : return &pcc->cv;
912 :
913 : /*
914 : * Failure, clean up
915 : */
916 0 : out_of_memory:
917 0 : free(pcc->cv.chrs);
918 0 : free(pcc->cv.ranges);
919 0 : free(pcc);
920 :
921 0 : return NULL;
922 : }
|