Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regc_pg_locale.c
4 : * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 : * and functions to cache the results of wholesale ctype probing.
6 : *
7 : * This file is #included by regcomp.c; it's not meant to compile standalone.
8 : *
9 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : * IDENTIFICATION
13 : * src/backend/regex/regc_pg_locale.c
14 : *
15 : *-------------------------------------------------------------------------
16 : */
17 :
18 : #include "catalog/pg_collation.h"
19 : #include "utils/pg_locale.h"
20 :
21 : /*
22 : * To provide as much functionality as possible on a variety of platforms,
23 : * without going so far as to implement everything from scratch, we use
24 : * several implementation strategies depending on the situation:
25 : *
26 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
27 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
28 : * collations don't give a fig about multibyte characters.
29 : *
30 : * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
31 : *
32 : * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
33 : * This assumes that every platform uses Unicode codepoints directly
34 : * as the wchar_t representation of Unicode. On some platforms
35 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
36 : *
37 : * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
38 : * values up to 255, and punt for values above that. This is 100% correct
39 : * only in single-byte encodings such as LATINn. However, non-Unicode
40 : * multibyte encodings are mostly Far Eastern character sets for which the
41 : * properties being tested here aren't very relevant for higher code values
42 : * anyway. The difficulty with using the <wctype.h> functions with
43 : * non-Unicode multibyte encodings is that we can have no certainty that
44 : * the platform's wchar_t representation matches what we do in pg_wchar
45 : * conversions.
46 : *
47 : * 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
48 : * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
49 : * functions, under exactly the same cases as #2.
50 : *
51 : * There is one notable difference between cases 2 and 3: in the "default"
52 : * collation we force ASCII letters to follow ASCII upcase/downcase rules,
53 : * while in a non-default collation we just let the library functions do what
54 : * they will. The case where this matters is treatment of I/i in Turkish,
55 : * and the behavior is meant to match the upper()/lower() SQL functions.
56 : *
57 : * We store the active collation setting in static variables. In principle
58 : * it could be passed down to here via the regex library's "struct vars" data
59 : * structure; but that would require somewhat invasive changes in the regex
60 : * library, and right now there's no real benefit to be gained from that.
61 : *
62 : * NB: the coding here assumes pg_wchar is an unsigned type.
63 : */
64 :
/*
 * Selects which character-classification API the pg_wc_* functions in this
 * file dispatch to.  The value is chosen by pg_set_regex_collation().
 */
65 : typedef enum
66 : {
67 : PG_REGEX_LOCALE_C, /* C/POSIX: hard-wired ASCII table, any encoding */
68 : PG_REGEX_LOCALE_WIDE, /* no locale object, UTF8: <wctype.h> functions */
69 : PG_REGEX_LOCALE_1BYTE, /* no locale object, other encodings: <ctype.h> functions */
70 : PG_REGEX_LOCALE_WIDE_L, /* locale object, UTF8: locale_t <wctype.h> functions */
71 : PG_REGEX_LOCALE_1BYTE_L, /* locale object, other encodings: locale_t <ctype.h> functions */
72 : PG_REGEX_LOCALE_ICU /* ICU-provider locale: ICU uchar.h functions */
73 : } PG_Locale_Strategy;
74 :
/*
 * Active collation state.  All three are assigned by
 * pg_set_regex_collation() and read by the functions below.
 */
/* which classification API to use */
75 : static PG_Locale_Strategy pg_regex_strategy;
/* result of pg_newlocale_from_collation(); 0 for C/POSIX collations */
76 : static pg_locale_t pg_regex_locale;
/* cache key for pg_ctype_get_cache(); C_COLLATION_OID for any C/POSIX collation */
77 : static Oid pg_regex_collation;
78 :
79 : /*
80 : * Hard-wired character properties for C locale
81 : */
/*
 * Bit flags stored in pg_char_properties[].  Each class has its own bit,
 * except PG_ISALNUM, which is the union of the digit and alpha bits so that
 * a single mask test answers "digit or letter".
 */
82 : #define PG_ISDIGIT 0x01
83 : #define PG_ISALPHA 0x02
84 : #define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA)
85 : #define PG_ISUPPER 0x04
86 : #define PG_ISLOWER 0x08
87 : #define PG_ISGRAPH 0x10
88 : #define PG_ISPRINT 0x20
89 : #define PG_ISPUNCT 0x40
90 : #define PG_ISSPACE 0x80
91 :
/*
 * Character classes of the 128 ASCII codes, indexed by code value.  Each
 * entry is the OR of the PG_IS* bits that hold for that character.  The
 * table has no entries above 127, so users must range-check the code
 * (c <= 127) before subscripting.
 */
92 : static const unsigned char pg_char_properties[128] = {
93 : /* NUL */ 0,
94 : /* ^A */ 0,
95 : /* ^B */ 0,
96 : /* ^C */ 0,
97 : /* ^D */ 0,
98 : /* ^E */ 0,
99 : /* ^F */ 0,
100 : /* ^G */ 0,
101 : /* ^H */ 0,
102 : /* ^I */ PG_ISSPACE,
103 : /* ^J */ PG_ISSPACE,
104 : /* ^K */ PG_ISSPACE,
105 : /* ^L */ PG_ISSPACE,
106 : /* ^M */ PG_ISSPACE,
107 : /* ^N */ 0,
108 : /* ^O */ 0,
109 : /* ^P */ 0,
110 : /* ^Q */ 0,
111 : /* ^R */ 0,
112 : /* ^S */ 0,
113 : /* ^T */ 0,
114 : /* ^U */ 0,
115 : /* ^V */ 0,
116 : /* ^W */ 0,
117 : /* ^X */ 0,
118 : /* ^Y */ 0,
119 : /* ^Z */ 0,
120 : /* ^[ */ 0,
121 : /* ^\ */ 0,
122 : /* ^] */ 0,
123 : /* ^^ */ 0,
124 : /* ^_ */ 0,
125 : /* */ PG_ISPRINT | PG_ISSPACE,
126 : /* ! */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
127 : /* " */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
128 : /* # */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
129 : /* $ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
130 : /* % */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
131 : /* & */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
132 : /* ' */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
133 : /* ( */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
134 : /* ) */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
135 : /* * */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
136 : /* + */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
137 : /* , */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
138 : /* - */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
139 : /* . */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
140 : /* / */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
141 : /* 0 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
142 : /* 1 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
143 : /* 2 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
144 : /* 3 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
145 : /* 4 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
146 : /* 5 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
147 : /* 6 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
148 : /* 7 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
149 : /* 8 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
150 : /* 9 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
151 : /* : */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
152 : /* ; */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
153 : /* < */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
154 : /* = */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
155 : /* > */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
156 : /* ? */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
157 : /* @ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
158 : /* A */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
159 : /* B */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
160 : /* C */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
161 : /* D */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
162 : /* E */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
163 : /* F */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
164 : /* G */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
165 : /* H */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
166 : /* I */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
167 : /* J */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
168 : /* K */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
169 : /* L */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
170 : /* M */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
171 : /* N */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
172 : /* O */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
173 : /* P */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
174 : /* Q */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
175 : /* R */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
176 : /* S */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
177 : /* T */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
178 : /* U */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
179 : /* V */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
180 : /* W */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
181 : /* X */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
182 : /* Y */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
183 : /* Z */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
184 : /* [ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
185 : /* \ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
186 : /* ] */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
187 : /* ^ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
188 : /* _ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
189 : /* ` */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
190 : /* a */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
191 : /* b */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
192 : /* c */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
193 : /* d */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
194 : /* e */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
195 : /* f */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
196 : /* g */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
197 : /* h */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
198 : /* i */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
199 : /* j */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
200 : /* k */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
201 : /* l */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
202 : /* m */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
203 : /* n */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
204 : /* o */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
205 : /* p */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
206 : /* q */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
207 : /* r */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
208 : /* s */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
209 : /* t */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
210 : /* u */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
211 : /* v */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
212 : /* w */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
213 : /* x */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
214 : /* y */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
215 : /* z */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
216 : /* { */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
217 : /* | */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
218 : /* } */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
219 : /* ~ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
220 : /* DEL */ 0
221 : };
222 :
223 :
224 : /*
225 : * pg_set_regex_collation: set collation for these functions to obey
226 : *
227 : * This is called when beginning compilation or execution of a regexp.
228 : * Since there's no need for reentrancy of regexp operations, it's okay
229 : * to store the results in static variables.
230 : */
231 : void
232 1817230 : pg_set_regex_collation(Oid collation)
233 : {
234 1817230 : if (!OidIsValid(collation))
235 : {
236 : /*
237 : * This typically means that the parser could not resolve a conflict
238 : * of implicit collations, so report it that way.
239 : */
240 0 : ereport(ERROR,
241 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
242 : errmsg("could not determine which collation to use for regular expression"),
243 : errhint("Use the COLLATE clause to set the collation explicitly.")));
244 : }
245 :
246 1817230 : if (lc_ctype_is_c(collation))
247 : {
248 : /* C/POSIX collations use this path regardless of database encoding */
249 113232 : pg_regex_strategy = PG_REGEX_LOCALE_C;
250 113232 : pg_regex_locale = 0;
251 113232 : pg_regex_collation = C_COLLATION_OID;
252 : }
253 : else
254 : {
255 : /*
256 : * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T; the
257 : * case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not have to
258 : * be considered below.
259 : */
260 1703998 : pg_regex_locale = pg_newlocale_from_collation(collation);
261 :
262 1703998 : if (!pg_locale_deterministic(pg_regex_locale))
263 24 : ereport(ERROR,
264 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
265 : errmsg("nondeterministic collations are not supported for regular expressions")));
266 :
267 : #ifdef USE_ICU
268 1703974 : if (pg_regex_locale && pg_regex_locale->provider == COLLPROVIDER_ICU)
269 1703962 : pg_regex_strategy = PG_REGEX_LOCALE_ICU;
270 : else
271 : #endif
272 12 : if (GetDatabaseEncoding() == PG_UTF8)
273 : {
274 8 : if (pg_regex_locale)
275 0 : pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
276 : else
277 8 : pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
278 : }
279 : else
280 : {
281 4 : if (pg_regex_locale)
282 0 : pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L;
283 : else
284 4 : pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
285 : }
286 :
287 1703974 : pg_regex_collation = collation;
288 : }
289 1817206 : }
290 :
291 : static int
292 133642 : pg_wc_isdigit(pg_wchar c)
293 : {
294 133642 : switch (pg_regex_strategy)
295 : {
296 2142 : case PG_REGEX_LOCALE_C:
297 4284 : return (c <= (pg_wchar) 127 &&
298 2142 : (pg_char_properties[c] & PG_ISDIGIT));
299 0 : case PG_REGEX_LOCALE_WIDE:
300 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
301 0 : return iswdigit((wint_t) c);
302 : /* FALL THRU */
303 : case PG_REGEX_LOCALE_1BYTE:
304 0 : return (c <= (pg_wchar) UCHAR_MAX &&
305 0 : isdigit((unsigned char) c));
306 0 : case PG_REGEX_LOCALE_WIDE_L:
307 : #ifdef HAVE_LOCALE_T
308 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
309 0 : return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
310 : #endif
311 : /* FALL THRU */
312 : case PG_REGEX_LOCALE_1BYTE_L:
313 : #ifdef HAVE_LOCALE_T
314 0 : return (c <= (pg_wchar) UCHAR_MAX &&
315 0 : isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
316 : #endif
317 : break;
318 131500 : case PG_REGEX_LOCALE_ICU:
319 : #ifdef USE_ICU
320 131500 : return u_isdigit(c);
321 : #endif
322 : break;
323 : }
324 0 : return 0; /* can't get here, but keep compiler quiet */
325 : }
326 :
327 : static int
328 16660 : pg_wc_isalpha(pg_wchar c)
329 : {
330 16660 : switch (pg_regex_strategy)
331 : {
332 0 : case PG_REGEX_LOCALE_C:
333 0 : return (c <= (pg_wchar) 127 &&
334 0 : (pg_char_properties[c] & PG_ISALPHA));
335 0 : case PG_REGEX_LOCALE_WIDE:
336 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
337 0 : return iswalpha((wint_t) c);
338 : /* FALL THRU */
339 : case PG_REGEX_LOCALE_1BYTE:
340 0 : return (c <= (pg_wchar) UCHAR_MAX &&
341 0 : isalpha((unsigned char) c));
342 0 : case PG_REGEX_LOCALE_WIDE_L:
343 : #ifdef HAVE_LOCALE_T
344 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
345 0 : return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
346 : #endif
347 : /* FALL THRU */
348 : case PG_REGEX_LOCALE_1BYTE_L:
349 : #ifdef HAVE_LOCALE_T
350 0 : return (c <= (pg_wchar) UCHAR_MAX &&
351 0 : isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
352 : #endif
353 : break;
354 16660 : case PG_REGEX_LOCALE_ICU:
355 : #ifdef USE_ICU
356 16660 : return u_isalpha(c);
357 : #endif
358 : break;
359 : }
360 0 : return 0; /* can't get here, but keep compiler quiet */
361 : }
362 :
363 : static int
364 54028 : pg_wc_isalnum(pg_wchar c)
365 : {
366 54028 : switch (pg_regex_strategy)
367 : {
368 762 : case PG_REGEX_LOCALE_C:
369 1524 : return (c <= (pg_wchar) 127 &&
370 762 : (pg_char_properties[c] & PG_ISALNUM));
371 0 : case PG_REGEX_LOCALE_WIDE:
372 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
373 0 : return iswalnum((wint_t) c);
374 : /* FALL THRU */
375 : case PG_REGEX_LOCALE_1BYTE:
376 0 : return (c <= (pg_wchar) UCHAR_MAX &&
377 0 : isalnum((unsigned char) c));
378 0 : case PG_REGEX_LOCALE_WIDE_L:
379 : #ifdef HAVE_LOCALE_T
380 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
381 0 : return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
382 : #endif
383 : /* FALL THRU */
384 : case PG_REGEX_LOCALE_1BYTE_L:
385 : #ifdef HAVE_LOCALE_T
386 0 : return (c <= (pg_wchar) UCHAR_MAX &&
387 0 : isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
388 : #endif
389 : break;
390 53266 : case PG_REGEX_LOCALE_ICU:
391 : #ifdef USE_ICU
392 53266 : return u_isalnum(c);
393 : #endif
394 : break;
395 : }
396 0 : return 0; /* can't get here, but keep compiler quiet */
397 : }
398 :
399 : static int
400 33538 : pg_wc_isword(pg_wchar c)
401 : {
402 : /* We define word characters as alnum class plus underscore */
403 33538 : if (c == CHR('_'))
404 22 : return 1;
405 33516 : return pg_wc_isalnum(c);
406 : }
407 :
408 : static int
409 16400 : pg_wc_isupper(pg_wchar c)
410 : {
411 16400 : switch (pg_regex_strategy)
412 : {
413 0 : case PG_REGEX_LOCALE_C:
414 0 : return (c <= (pg_wchar) 127 &&
415 0 : (pg_char_properties[c] & PG_ISUPPER));
416 0 : case PG_REGEX_LOCALE_WIDE:
417 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
418 0 : return iswupper((wint_t) c);
419 : /* FALL THRU */
420 : case PG_REGEX_LOCALE_1BYTE:
421 0 : return (c <= (pg_wchar) UCHAR_MAX &&
422 0 : isupper((unsigned char) c));
423 0 : case PG_REGEX_LOCALE_WIDE_L:
424 : #ifdef HAVE_LOCALE_T
425 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
426 0 : return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
427 : #endif
428 : /* FALL THRU */
429 : case PG_REGEX_LOCALE_1BYTE_L:
430 : #ifdef HAVE_LOCALE_T
431 0 : return (c <= (pg_wchar) UCHAR_MAX &&
432 0 : isupper_l((unsigned char) c, pg_regex_locale->info.lt));
433 : #endif
434 : break;
435 16400 : case PG_REGEX_LOCALE_ICU:
436 : #ifdef USE_ICU
437 16400 : return u_isupper(c);
438 : #endif
439 : break;
440 : }
441 0 : return 0; /* can't get here, but keep compiler quiet */
442 : }
443 :
444 : static int
445 16390 : pg_wc_islower(pg_wchar c)
446 : {
447 16390 : switch (pg_regex_strategy)
448 : {
449 0 : case PG_REGEX_LOCALE_C:
450 0 : return (c <= (pg_wchar) 127 &&
451 0 : (pg_char_properties[c] & PG_ISLOWER));
452 0 : case PG_REGEX_LOCALE_WIDE:
453 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
454 0 : return iswlower((wint_t) c);
455 : /* FALL THRU */
456 : case PG_REGEX_LOCALE_1BYTE:
457 0 : return (c <= (pg_wchar) UCHAR_MAX &&
458 0 : islower((unsigned char) c));
459 0 : case PG_REGEX_LOCALE_WIDE_L:
460 : #ifdef HAVE_LOCALE_T
461 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
462 0 : return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
463 : #endif
464 : /* FALL THRU */
465 : case PG_REGEX_LOCALE_1BYTE_L:
466 : #ifdef HAVE_LOCALE_T
467 0 : return (c <= (pg_wchar) UCHAR_MAX &&
468 0 : islower_l((unsigned char) c, pg_regex_locale->info.lt));
469 : #endif
470 : break;
471 16390 : case PG_REGEX_LOCALE_ICU:
472 : #ifdef USE_ICU
473 16390 : return u_islower(c);
474 : #endif
475 : break;
476 : }
477 0 : return 0; /* can't get here, but keep compiler quiet */
478 : }
479 :
480 : static int
481 16390 : pg_wc_isgraph(pg_wchar c)
482 : {
483 16390 : switch (pg_regex_strategy)
484 : {
485 0 : case PG_REGEX_LOCALE_C:
486 0 : return (c <= (pg_wchar) 127 &&
487 0 : (pg_char_properties[c] & PG_ISGRAPH));
488 0 : case PG_REGEX_LOCALE_WIDE:
489 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
490 0 : return iswgraph((wint_t) c);
491 : /* FALL THRU */
492 : case PG_REGEX_LOCALE_1BYTE:
493 0 : return (c <= (pg_wchar) UCHAR_MAX &&
494 0 : isgraph((unsigned char) c));
495 0 : case PG_REGEX_LOCALE_WIDE_L:
496 : #ifdef HAVE_LOCALE_T
497 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
498 0 : return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
499 : #endif
500 : /* FALL THRU */
501 : case PG_REGEX_LOCALE_1BYTE_L:
502 : #ifdef HAVE_LOCALE_T
503 0 : return (c <= (pg_wchar) UCHAR_MAX &&
504 0 : isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
505 : #endif
506 : break;
507 16390 : case PG_REGEX_LOCALE_ICU:
508 : #ifdef USE_ICU
509 16390 : return u_isgraph(c);
510 : #endif
511 : break;
512 : }
513 0 : return 0; /* can't get here, but keep compiler quiet */
514 : }
515 :
516 : static int
517 16390 : pg_wc_isprint(pg_wchar c)
518 : {
519 16390 : switch (pg_regex_strategy)
520 : {
521 0 : case PG_REGEX_LOCALE_C:
522 0 : return (c <= (pg_wchar) 127 &&
523 0 : (pg_char_properties[c] & PG_ISPRINT));
524 0 : case PG_REGEX_LOCALE_WIDE:
525 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
526 0 : return iswprint((wint_t) c);
527 : /* FALL THRU */
528 : case PG_REGEX_LOCALE_1BYTE:
529 0 : return (c <= (pg_wchar) UCHAR_MAX &&
530 0 : isprint((unsigned char) c));
531 0 : case PG_REGEX_LOCALE_WIDE_L:
532 : #ifdef HAVE_LOCALE_T
533 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
534 0 : return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
535 : #endif
536 : /* FALL THRU */
537 : case PG_REGEX_LOCALE_1BYTE_L:
538 : #ifdef HAVE_LOCALE_T
539 0 : return (c <= (pg_wchar) UCHAR_MAX &&
540 0 : isprint_l((unsigned char) c, pg_regex_locale->info.lt));
541 : #endif
542 : break;
543 16390 : case PG_REGEX_LOCALE_ICU:
544 : #ifdef USE_ICU
545 16390 : return u_isprint(c);
546 : #endif
547 : break;
548 : }
549 0 : return 0; /* can't get here, but keep compiler quiet */
550 : }
551 :
552 : static int
553 16390 : pg_wc_ispunct(pg_wchar c)
554 : {
555 16390 : switch (pg_regex_strategy)
556 : {
557 0 : case PG_REGEX_LOCALE_C:
558 0 : return (c <= (pg_wchar) 127 &&
559 0 : (pg_char_properties[c] & PG_ISPUNCT));
560 0 : case PG_REGEX_LOCALE_WIDE:
561 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
562 0 : return iswpunct((wint_t) c);
563 : /* FALL THRU */
564 : case PG_REGEX_LOCALE_1BYTE:
565 0 : return (c <= (pg_wchar) UCHAR_MAX &&
566 0 : ispunct((unsigned char) c));
567 0 : case PG_REGEX_LOCALE_WIDE_L:
568 : #ifdef HAVE_LOCALE_T
569 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
570 0 : return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
571 : #endif
572 : /* FALL THRU */
573 : case PG_REGEX_LOCALE_1BYTE_L:
574 : #ifdef HAVE_LOCALE_T
575 0 : return (c <= (pg_wchar) UCHAR_MAX &&
576 0 : ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
577 : #endif
578 : break;
579 16390 : case PG_REGEX_LOCALE_ICU:
580 : #ifdef USE_ICU
581 16390 : return u_ispunct(c);
582 : #endif
583 : break;
584 : }
585 0 : return 0; /* can't get here, but keep compiler quiet */
586 : }
587 :
588 : static int
589 76386 : pg_wc_isspace(pg_wchar c)
590 : {
591 76386 : switch (pg_regex_strategy)
592 : {
593 0 : case PG_REGEX_LOCALE_C:
594 0 : return (c <= (pg_wchar) 127 &&
595 0 : (pg_char_properties[c] & PG_ISSPACE));
596 0 : case PG_REGEX_LOCALE_WIDE:
597 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
598 0 : return iswspace((wint_t) c);
599 : /* FALL THRU */
600 : case PG_REGEX_LOCALE_1BYTE:
601 0 : return (c <= (pg_wchar) UCHAR_MAX &&
602 0 : isspace((unsigned char) c));
603 0 : case PG_REGEX_LOCALE_WIDE_L:
604 : #ifdef HAVE_LOCALE_T
605 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
606 0 : return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
607 : #endif
608 : /* FALL THRU */
609 : case PG_REGEX_LOCALE_1BYTE_L:
610 : #ifdef HAVE_LOCALE_T
611 0 : return (c <= (pg_wchar) UCHAR_MAX &&
612 0 : isspace_l((unsigned char) c, pg_regex_locale->info.lt));
613 : #endif
614 : break;
615 76386 : case PG_REGEX_LOCALE_ICU:
616 : #ifdef USE_ICU
617 76386 : return u_isspace(c);
618 : #endif
619 : break;
620 : }
621 0 : return 0; /* can't get here, but keep compiler quiet */
622 : }
623 :
624 : static pg_wchar
625 10390 : pg_wc_toupper(pg_wchar c)
626 : {
627 10390 : switch (pg_regex_strategy)
628 : {
629 1056 : case PG_REGEX_LOCALE_C:
630 1056 : if (c <= (pg_wchar) 127)
631 1056 : return pg_ascii_toupper((unsigned char) c);
632 0 : return c;
633 0 : case PG_REGEX_LOCALE_WIDE:
634 : /* force C behavior for ASCII characters, per comments above */
635 0 : if (c <= (pg_wchar) 127)
636 0 : return pg_ascii_toupper((unsigned char) c);
637 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
638 0 : return towupper((wint_t) c);
639 : /* FALL THRU */
640 : case PG_REGEX_LOCALE_1BYTE:
641 : /* force C behavior for ASCII characters, per comments above */
642 0 : if (c <= (pg_wchar) 127)
643 0 : return pg_ascii_toupper((unsigned char) c);
644 0 : if (c <= (pg_wchar) UCHAR_MAX)
645 0 : return toupper((unsigned char) c);
646 0 : return c;
647 0 : case PG_REGEX_LOCALE_WIDE_L:
648 : #ifdef HAVE_LOCALE_T
649 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
650 0 : return towupper_l((wint_t) c, pg_regex_locale->info.lt);
651 : #endif
652 : /* FALL THRU */
653 : case PG_REGEX_LOCALE_1BYTE_L:
654 : #ifdef HAVE_LOCALE_T
655 0 : if (c <= (pg_wchar) UCHAR_MAX)
656 0 : return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
657 : #endif
658 0 : return c;
659 9334 : case PG_REGEX_LOCALE_ICU:
660 : #ifdef USE_ICU
661 9334 : return u_toupper(c);
662 : #endif
663 : break;
664 : }
665 0 : return 0; /* can't get here, but keep compiler quiet */
666 : }
667 :
668 : static pg_wchar
669 10394 : pg_wc_tolower(pg_wchar c)
670 : {
671 10394 : switch (pg_regex_strategy)
672 : {
673 1056 : case PG_REGEX_LOCALE_C:
674 1056 : if (c <= (pg_wchar) 127)
675 1056 : return pg_ascii_tolower((unsigned char) c);
676 0 : return c;
677 0 : case PG_REGEX_LOCALE_WIDE:
678 : /* force C behavior for ASCII characters, per comments above */
679 0 : if (c <= (pg_wchar) 127)
680 0 : return pg_ascii_tolower((unsigned char) c);
681 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
682 0 : return towlower((wint_t) c);
683 : /* FALL THRU */
684 : case PG_REGEX_LOCALE_1BYTE:
685 : /* force C behavior for ASCII characters, per comments above */
686 0 : if (c <= (pg_wchar) 127)
687 0 : return pg_ascii_tolower((unsigned char) c);
688 0 : if (c <= (pg_wchar) UCHAR_MAX)
689 0 : return tolower((unsigned char) c);
690 0 : return c;
691 0 : case PG_REGEX_LOCALE_WIDE_L:
692 : #ifdef HAVE_LOCALE_T
693 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
694 0 : return towlower_l((wint_t) c, pg_regex_locale->info.lt);
695 : #endif
696 : /* FALL THRU */
697 : case PG_REGEX_LOCALE_1BYTE_L:
698 : #ifdef HAVE_LOCALE_T
699 0 : if (c <= (pg_wchar) UCHAR_MAX)
700 0 : return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
701 : #endif
702 0 : return c;
703 9338 : case PG_REGEX_LOCALE_ICU:
704 : #ifdef USE_ICU
705 9338 : return u_tolower(c);
706 : #endif
707 : break;
708 : }
709 0 : return 0; /* can't get here, but keep compiler quiet */
710 : }
711 :
712 :
713 : /*
714 : * These functions cache the results of probing libc's ctype behavior for
715 : * all character codes of interest in a given encoding/collation. The
716 : * result is provided as a "struct cvec", but notice that the representation
717 : * is a touch different from a cvec created by regc_cvec.c: we allocate the
718 : * chrs[] and ranges[] arrays separately from the struct so that we can
719 : * realloc them larger at need. This is okay since the cvecs made here
720 : * should never be freed by freecvec().
721 : *
722 : * We use malloc not palloc since we mustn't lose control on out-of-memory;
723 : * the main regex code expects us to return a failure indication instead.
724 : */
725 :
/* Signature shared by the pg_wc_is* classification functions */
726 : typedef int (*pg_wc_probefunc) (pg_wchar c);
727 :
/*
 * One cached probe result.  An entry is identified by the pair
 * (probefunc, collation); entries are chained through "next".
 */
728 : typedef struct pg_ctype_cache
729 : {
730 : pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
731 : Oid collation; /* collation this entry is for */
732 : struct cvec cv; /* cache entry contents */
733 : struct pg_ctype_cache *next; /* chain link */
734 : } pg_ctype_cache;
735 :
/* Head of the cache chain; new entries are pushed at the front */
736 : static pg_ctype_cache *pg_ctype_cache_list = NULL;
737 :
738 : /*
739 : * Add a chr or range to pcc->cv; return false if run out of memory
740 : */
741 : static bool
742 6872 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
743 : {
744 : chr *newchrs;
745 :
746 6872 : if (nchrs > 1)
747 : {
748 2262 : if (pcc->cv.nranges >= pcc->cv.rangespace)
749 : {
750 0 : pcc->cv.rangespace *= 2;
751 0 : newchrs = (chr *) realloc(pcc->cv.ranges,
752 0 : pcc->cv.rangespace * sizeof(chr) * 2);
753 0 : if (newchrs == NULL)
754 0 : return false;
755 0 : pcc->cv.ranges = newchrs;
756 : }
757 2262 : pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
758 2262 : pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
759 2262 : pcc->cv.nranges++;
760 : }
761 : else
762 : {
763 : assert(nchrs == 1);
764 4610 : if (pcc->cv.nchrs >= pcc->cv.chrspace)
765 : {
766 16 : pcc->cv.chrspace *= 2;
767 16 : newchrs = (chr *) realloc(pcc->cv.chrs,
768 16 : pcc->cv.chrspace * sizeof(chr));
769 16 : if (newchrs == NULL)
770 0 : return false;
771 16 : pcc->cv.chrs = newchrs;
772 : }
773 4610 : pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
774 : }
775 6872 : return true;
776 : }
777 :
778 : /*
779 : * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
780 : * chrs satisfying the probe function. The active collation is the one
781 : * previously set by pg_set_regex_collation. Return NULL if out of memory.
782 : *
783 : * Note that the result must not be freed or modified by caller.
784 : */
785 : static struct cvec *
786 672 : pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
787 : {
788 : pg_ctype_cache *pcc;
789 : pg_wchar max_chr;
790 : pg_wchar cur_chr;
791 : int nmatches;
792 : chr *newchrs;
793 :
794 : /*
795 : * Do we already have the answer cached?
796 : */
797 1568 : for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
798 : {
799 1372 : if (pcc->probefunc == probefunc &&
800 524 : pcc->collation == pg_regex_collation)
801 476 : return &pcc->cv;
802 : }
803 :
804 : /*
805 : * Nope, so initialize some workspace ...
806 : */
807 196 : pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
808 196 : if (pcc == NULL)
809 0 : return NULL;
810 196 : pcc->probefunc = probefunc;
811 196 : pcc->collation = pg_regex_collation;
812 196 : pcc->cv.nchrs = 0;
813 196 : pcc->cv.chrspace = 128;
814 196 : pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
815 196 : pcc->cv.nranges = 0;
816 196 : pcc->cv.rangespace = 64;
817 196 : pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
818 196 : if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
819 0 : goto out_of_memory;
820 196 : pcc->cv.cclasscode = cclasscode;
821 :
822 : /*
823 : * Decide how many character codes we ought to look through. In general
824 : * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
825 : * runtime using the "high colormap" mechanism. However, in C locale
826 : * there's no need to go further than 127, and if we only have a 1-byte
827 : * <ctype.h> API there's no need to go further than that can handle.
828 : *
829 : * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
830 : * output cvec as not having any locale-dependent behavior, since there
831 : * will be no need to do any run-time locale checks. (The #if's here
832 : * would always be true for production values of MAX_SIMPLE_CHR, but it's
833 : * useful to allow it to be small for testing purposes.)
834 : */
835 196 : switch (pg_regex_strategy)
836 : {
837 22 : case PG_REGEX_LOCALE_C:
838 : #if MAX_SIMPLE_CHR >= 127
839 22 : max_chr = (pg_wchar) 127;
840 22 : pcc->cv.cclasscode = -1;
841 : #else
842 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
843 : #endif
844 22 : break;
845 0 : case PG_REGEX_LOCALE_WIDE:
846 : case PG_REGEX_LOCALE_WIDE_L:
847 0 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
848 0 : break;
849 0 : case PG_REGEX_LOCALE_1BYTE:
850 : case PG_REGEX_LOCALE_1BYTE_L:
851 : #if MAX_SIMPLE_CHR >= UCHAR_MAX
852 0 : max_chr = (pg_wchar) UCHAR_MAX;
853 0 : pcc->cv.cclasscode = -1;
854 : #else
855 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
856 : #endif
857 0 : break;
858 174 : case PG_REGEX_LOCALE_ICU:
859 174 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
860 174 : break;
861 0 : default:
862 0 : max_chr = 0; /* can't get here, but keep compiler quiet */
863 0 : break;
864 : }
865 :
866 : /*
867 : * And scan 'em ...
868 : */
869 196 : nmatches = 0; /* number of consecutive matches */
870 :
871 359364 : for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
872 : {
873 359168 : if ((*probefunc) (cur_chr))
874 92810 : nmatches++;
875 266358 : else if (nmatches > 0)
876 : {
877 6856 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
878 0 : goto out_of_memory;
879 6856 : nmatches = 0;
880 : }
881 : }
882 :
883 196 : if (nmatches > 0)
884 16 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
885 0 : goto out_of_memory;
886 :
887 : /*
888 : * We might have allocated more memory than needed, if so free it
889 : */
890 196 : if (pcc->cv.nchrs == 0)
891 : {
892 80 : free(pcc->cv.chrs);
893 80 : pcc->cv.chrs = NULL;
894 80 : pcc->cv.chrspace = 0;
895 : }
896 116 : else if (pcc->cv.nchrs < pcc->cv.chrspace)
897 : {
898 116 : newchrs = (chr *) realloc(pcc->cv.chrs,
899 116 : pcc->cv.nchrs * sizeof(chr));
900 116 : if (newchrs == NULL)
901 0 : goto out_of_memory;
902 116 : pcc->cv.chrs = newchrs;
903 116 : pcc->cv.chrspace = pcc->cv.nchrs;
904 : }
905 196 : if (pcc->cv.nranges == 0)
906 : {
907 0 : free(pcc->cv.ranges);
908 0 : pcc->cv.ranges = NULL;
909 0 : pcc->cv.rangespace = 0;
910 : }
911 196 : else if (pcc->cv.nranges < pcc->cv.rangespace)
912 : {
913 196 : newchrs = (chr *) realloc(pcc->cv.ranges,
914 196 : pcc->cv.nranges * sizeof(chr) * 2);
915 196 : if (newchrs == NULL)
916 0 : goto out_of_memory;
917 196 : pcc->cv.ranges = newchrs;
918 196 : pcc->cv.rangespace = pcc->cv.nranges;
919 : }
920 :
921 : /*
922 : * Success, link it into cache chain
923 : */
924 196 : pcc->next = pg_ctype_cache_list;
925 196 : pg_ctype_cache_list = pcc;
926 :
927 196 : return &pcc->cv;
928 :
929 : /*
930 : * Failure, clean up
931 : */
932 0 : out_of_memory:
933 0 : free(pcc->cv.chrs);
934 0 : free(pcc->cv.ranges);
935 0 : free(pcc);
936 :
937 0 : return NULL;
938 : }
|