Line data Source code
1 : /*
2 : * regc_locale.c --
3 : *
4 : * This file contains locale-specific regexp routines.
5 : * This file is #included by regcomp.c.
6 : *
7 : * Copyright (c) 1998 by Scriptics Corporation.
8 : *
9 : * This software is copyrighted by the Regents of the University of
10 : * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11 : * Corporation and other parties. The following terms apply to all files
12 : * associated with the software unless explicitly disclaimed in
13 : * individual files.
14 : *
15 : * The authors hereby grant permission to use, copy, modify, distribute,
16 : * and license this software and its documentation for any purpose, provided
17 : * that existing copyright notices are retained in all copies and that this
18 : * notice is included verbatim in any distributions. No written agreement,
19 : * license, or royalty fee is required for any of the authorized uses.
20 : * Modifications to this software may be copyrighted by their authors
21 : * and need not follow the licensing terms described here, provided that
22 : * the new terms are clearly indicated on the first page of each file where
23 : * they apply.
24 : *
25 : * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26 : * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27 : * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28 : * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29 : * POSSIBILITY OF SUCH DAMAGE.
30 : *
31 : * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33 : * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34 : * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35 : * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36 : * MODIFICATIONS.
37 : *
38 : * GOVERNMENT USE: If you are acquiring this software on behalf of the
39 : * U.S. government, the Government shall have only "Restricted Rights"
40 : * in the software and related documentation as defined in the Federal
41 : * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42 : * are acquiring the software on behalf of the Department of Defense, the
43 : * software shall be classified as "Commercial Computer Software" and the
44 : * Government shall have only "Restricted Rights" as defined in Clause
45 : * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46 : * authors grant the U.S. Government and others acting in its behalf
47 : * permission to use and distribute the software in accordance with the
48 : * terms specified in this license.
49 : *
50 : * src/backend/regex/regc_locale.c
51 : */
52 :
53 : /* ASCII character-name table */
54 :
/*
 * Each entry pairs a collating-element name with the single character it
 * denotes.  Several characters have more than one name.  The table is
 * terminated by an entry whose name is NULL; element() scans it linearly.
 */
static const struct cname
{
	const char *name;
	const char	code;
}			cnames[] =

{
	/* control characters */
	{"NUL", '\0'},
	{"SOH", '\001'},
	{"STX", '\002'},
	{"ETX", '\003'},
	{"EOT", '\004'},
	{"ENQ", '\005'},
	{"ACK", '\006'},
	{"BEL", '\007'},
	{"alert", '\007'},
	{"BS", '\010'},
	{"backspace", '\b'},
	{"HT", '\011'},
	{"tab", '\t'},
	{"LF", '\012'},
	{"newline", '\n'},
	{"VT", '\013'},
	{"vertical-tab", '\v'},
	{"FF", '\014'},
	{"form-feed", '\f'},
	{"CR", '\015'},
	{"carriage-return", '\r'},
	{"SO", '\016'},
	{"SI", '\017'},
	{"DLE", '\020'},
	{"DC1", '\021'},
	{"DC2", '\022'},
	{"DC3", '\023'},
	{"DC4", '\024'},
	{"NAK", '\025'},
	{"SYN", '\026'},
	{"ETB", '\027'},
	{"CAN", '\030'},
	{"EM", '\031'},
	{"SUB", '\032'},
	{"ESC", '\033'},
	{"IS4", '\034'},
	{"FS", '\034'},
	{"IS3", '\035'},
	{"GS", '\035'},
	{"IS2", '\036'},
	{"RS", '\036'},
	{"IS1", '\037'},
	{"US", '\037'},

	/* space and punctuation below the digits */
	{"space", ' '},
	{"exclamation-mark", '!'},
	{"quotation-mark", '"'},
	{"number-sign", '#'},
	{"dollar-sign", '$'},
	{"percent-sign", '%'},
	{"ampersand", '&'},
	{"apostrophe", '\''},
	{"left-parenthesis", '('},
	{"right-parenthesis", ')'},
	{"asterisk", '*'},
	{"plus-sign", '+'},
	{"comma", ','},
	{"hyphen", '-'},
	{"hyphen-minus", '-'},
	{"period", '.'},
	{"full-stop", '.'},
	{"slash", '/'},
	{"solidus", '/'},

	/* digits */
	{"zero", '0'},
	{"one", '1'},
	{"two", '2'},
	{"three", '3'},
	{"four", '4'},
	{"five", '5'},
	{"six", '6'},
	{"seven", '7'},
	{"eight", '8'},
	{"nine", '9'},

	/* remaining punctuation */
	{"colon", ':'},
	{"semicolon", ';'},
	{"less-than-sign", '<'},
	{"equals-sign", '='},
	{"greater-than-sign", '>'},
	{"question-mark", '?'},
	{"commercial-at", '@'},
	{"left-square-bracket", '['},
	{"backslash", '\\'},
	{"reverse-solidus", '\\'},
	{"right-square-bracket", ']'},
	{"circumflex", '^'},
	{"circumflex-accent", '^'},
	{"underscore", '_'},
	{"low-line", '_'},
	{"grave-accent", '`'},
	{"left-brace", '{'},
	{"left-curly-bracket", '{'},
	{"vertical-line", '|'},
	{"right-brace", '}'},
	{"right-curly-bracket", '}'},
	{"tilde", '~'},
	{"DEL", '\177'},

	/* end-of-table sentinel */
	{NULL, 0}
};
351 :
352 : /*
353 : * The following array defines the valid character class names.
354 : * The entries must match enum char_classes in regguts.h.
355 : */
356 : static const char *const classNames[NUM_CCLASSES + 1] = {
357 : "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
358 : "lower", "print", "punct", "space", "upper", "xdigit", "word",
359 : NULL
360 : };
361 :
362 : /*
363 : * We do not use the hard-wired Unicode classification tables that Tcl does.
364 : * This is because (a) we need to deal with other encodings besides Unicode,
365 : * and (b) we want to track the behavior of the libc locale routines as
366 : * closely as possible. For example, it wouldn't be unreasonable for a
367 : * locale to not consider every Unicode letter as a letter. So we build
368 : * character classification cvecs by asking libc, even for Unicode.
369 : */
370 :
371 :
372 : /*
373 : * element - map collating-element name to chr
374 : */
375 : static chr
376 48 : element(struct vars *v, /* context */
377 : const chr *startp, /* points to start of name */
378 : const chr *endp) /* points just past end of name */
379 : {
380 : const struct cname *cn;
381 : size_t len;
382 :
383 : /* generic: one-chr names stand for themselves */
384 : assert(startp < endp);
385 48 : len = endp - startp;
386 48 : if (len == 1)
387 28 : return *startp;
388 :
389 20 : NOTE(REG_ULOCALE);
390 :
391 : /* search table */
392 1360 : for (cn = cnames; cn->name != NULL; cn++)
393 : {
394 1506 : if (strlen(cn->name) == len &&
395 154 : pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
396 : {
397 12 : break; /* NOTE BREAK OUT */
398 : }
399 : }
400 20 : if (cn->name != NULL)
401 12 : return CHR(cn->code);
402 :
403 : /* couldn't find it */
404 8 : ERR(REG_ECOLLATE);
405 8 : return 0;
406 : }
407 :
408 : /*
409 : * range - supply cvec for a range, including legality check
410 : */
411 : static struct cvec *
412 574 : range(struct vars *v, /* context */
413 : chr a, /* range start */
414 : chr b, /* range end, might equal a */
415 : int cases) /* case-independent? */
416 : {
417 : int nchrs;
418 : struct cvec *cv;
419 : chr c,
420 : cc;
421 :
422 574 : if (a != b && !before(a, b))
423 : {
424 4 : ERR(REG_ERANGE);
425 4 : return NULL;
426 : }
427 :
428 570 : if (!cases)
429 : { /* easy version */
430 514 : cv = getcvec(v, 0, 1);
431 514 : NOERRN();
432 514 : addrange(cv, a, b);
433 514 : return cv;
434 : }
435 :
436 : /*
437 : * When case-independent, it's hard to decide when cvec ranges are usable,
438 : * so for now at least, we won't try. We use a range for the originally
439 : * specified chrs and then add on any case-equivalents that are outside
440 : * that range as individual chrs.
441 : *
442 : * To ensure sane behavior if someone specifies a very large range, limit
443 : * the allocation size to 100000 chrs (arbitrary) and check for overrun
444 : * inside the loop below.
445 : */
446 56 : nchrs = b - a + 1;
447 56 : if (nchrs <= 0 || nchrs > 100000)
448 0 : nchrs = 100000;
449 :
450 56 : cv = getcvec(v, nchrs, 1);
451 56 : NOERRN();
452 56 : addrange(cv, a, b);
453 :
454 8976 : for (c = a; c <= b; c++)
455 : {
456 8920 : cc = pg_wc_tolower(c);
457 9976 : if (cc != c &&
458 2110 : (before(cc, a) || before(b, cc)))
459 : {
460 514 : if (cv->nchrs >= cv->chrspace)
461 : {
462 0 : ERR(REG_ETOOBIG);
463 0 : return NULL;
464 : }
465 514 : addchr(cv, cc);
466 : }
467 8920 : cc = pg_wc_toupper(c);
468 9944 : if (cc != c &&
469 1576 : (before(cc, a) || before(b, cc)))
470 : {
471 480 : if (cv->nchrs >= cv->chrspace)
472 : {
473 0 : ERR(REG_ETOOBIG);
474 0 : return NULL;
475 : }
476 480 : addchr(cv, cc);
477 : }
478 8920 : INTERRUPT(v->re);
479 : }
480 :
481 56 : return cv;
482 : }
483 :
484 : /*
485 : * before - is chr x before chr y, for purposes of range legality?
486 : */
487 : static int /* predicate */
488 4240 : before(chr x, chr y)
489 : {
490 4240 : if (x < y)
491 1544 : return 1;
492 2696 : return 0;
493 : }
494 :
495 : /*
496 : * eclass - supply cvec for an equivalence class
497 : * Must include case counterparts on request.
498 : */
499 : static struct cvec *
500 20 : eclass(struct vars *v, /* context */
501 : chr c, /* Collating element representing the
502 : * equivalence class. */
503 : int cases) /* all cases? */
504 : {
505 : struct cvec *cv;
506 :
507 : /* crude fake equivalence class for testing */
508 20 : if ((v->cflags & REG_FAKE) && c == 'x')
509 : {
510 12 : cv = getcvec(v, 4, 0);
511 12 : addchr(cv, CHR('x'));
512 12 : addchr(cv, CHR('y'));
513 12 : if (cases)
514 : {
515 0 : addchr(cv, CHR('X'));
516 0 : addchr(cv, CHR('Y'));
517 : }
518 12 : return cv;
519 : }
520 :
521 : /* otherwise, none */
522 8 : if (cases)
523 4 : return allcases(v, c);
524 4 : cv = getcvec(v, 1, 0);
525 : assert(cv != NULL);
526 4 : addchr(cv, c);
527 4 : return cv;
528 : }
529 :
530 : /*
531 : * lookupcclass - lookup a character class identified by name
532 : *
533 : * On failure, sets an error code in *v; the result is then garbage.
534 : */
535 : static enum char_classes
536 238 : lookupcclass(struct vars *v, /* context (for returning errors) */
537 : const chr *startp, /* where the name starts */
538 : const chr *endp) /* just past the end of the name */
539 : {
540 : size_t len;
541 : const char *const *namePtr;
542 : int i;
543 :
544 : /*
545 : * Map the name to the corresponding enumerated value.
546 : */
547 238 : len = endp - startp;
548 1504 : for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
549 : {
550 2806 : if (strlen(*namePtr) == len &&
551 1310 : pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
552 230 : return (enum char_classes) i;
553 : }
554 :
555 8 : ERR(REG_ECTYPE);
556 8 : return (enum char_classes) 0;
557 : }
558 :
559 : /*
560 : * cclasscvec - supply cvec for a character class
561 : *
562 : * Must include case counterparts if "cases" is true.
563 : *
564 : * The returned cvec might be either a transient cvec gotten from getcvec(),
565 : * or a permanently cached one from pg_ctype_get_cache(). This is okay
566 : * because callers are not supposed to explicitly free the result either way.
567 : */
568 : static struct cvec *
569 842 : cclasscvec(struct vars *v, /* context */
570 : enum char_classes cclasscode, /* class to build a cvec for */
571 : int cases) /* case-independent? */
572 : {
573 842 : struct cvec *cv = NULL;
574 :
575 : /*
576 : * Remap lower and upper to alpha if the match is case insensitive.
577 : */
578 :
579 842 : if (cases &&
580 18 : (cclasscode == CC_LOWER ||
581 : cclasscode == CC_UPPER))
582 2 : cclasscode = CC_ALPHA;
583 :
584 : /*
585 : * Now compute the character class contents. For classes that are based
586 : * on the behavior of a <wctype.h> or <ctype.h> function, we use
587 : * pg_ctype_get_cache so that we can cache the results. Other classes
588 : * have definitions that are hard-wired here, and for those we just
589 : * construct a transient cvec on the fly.
590 : *
591 : * NB: keep this code in sync with cclass_column_index(), below.
592 : */
593 :
594 842 : switch (cclasscode)
595 : {
596 8 : case CC_PRINT:
597 8 : cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
598 8 : break;
599 22 : case CC_ALNUM:
600 22 : cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
601 22 : break;
602 16 : case CC_ALPHA:
603 16 : cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
604 16 : break;
605 256 : case CC_WORD:
606 256 : cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
607 256 : break;
608 2 : case CC_ASCII:
609 : /* hard-wired meaning */
610 2 : cv = getcvec(v, 0, 1);
611 2 : if (cv)
612 2 : addrange(cv, 0, 0x7f);
613 2 : break;
614 70 : case CC_BLANK:
615 : /* hard-wired meaning */
616 70 : cv = getcvec(v, 2, 0);
617 70 : addchr(cv, '\t');
618 70 : addchr(cv, ' ');
619 70 : break;
620 2 : case CC_CNTRL:
621 : /* hard-wired meaning */
622 2 : cv = getcvec(v, 0, 2);
623 2 : addrange(cv, 0x0, 0x1f);
624 2 : addrange(cv, 0x7f, 0x9f);
625 2 : break;
626 314 : case CC_DIGIT:
627 314 : cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
628 314 : break;
629 14 : case CC_PUNCT:
630 14 : cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
631 14 : break;
632 4 : case CC_XDIGIT:
633 :
634 : /*
635 : * It's not clear how to define this in non-western locales, and
636 : * even less clear that there's any particular use in trying. So
637 : * just hard-wire the meaning.
638 : */
639 4 : cv = getcvec(v, 0, 3);
640 4 : if (cv)
641 : {
642 4 : addrange(cv, '0', '9');
643 4 : addrange(cv, 'a', 'f');
644 4 : addrange(cv, 'A', 'F');
645 : }
646 4 : break;
647 100 : case CC_SPACE:
648 100 : cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
649 100 : break;
650 8 : case CC_LOWER:
651 8 : cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
652 8 : break;
653 18 : case CC_UPPER:
654 18 : cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
655 18 : break;
656 8 : case CC_GRAPH:
657 8 : cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
658 8 : break;
659 : }
660 :
661 : /* If cv is NULL now, the reason must be "out of memory" */
662 842 : if (cv == NULL)
663 0 : ERR(REG_ESPACE);
664 842 : return cv;
665 : }
666 :
667 : /*
668 : * cclass_column_index - get appropriate high colormap column index for chr
669 : */
670 : static int
671 70 : cclass_column_index(struct colormap *cm, chr c)
672 : {
673 70 : int colnum = 0;
674 :
675 : /* Shouldn't go through all these pushups for simple chrs */
676 : assert(c > MAX_SIMPLE_CHR);
677 :
678 : /*
679 : * Note: we should not see requests to consider cclasses that are not
680 : * treated as locale-specific by cclasscvec(), above.
681 : */
682 70 : if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
683 6 : colnum |= cm->classbits[CC_PRINT];
684 70 : if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
685 20 : colnum |= cm->classbits[CC_ALNUM];
686 70 : if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
687 10 : colnum |= cm->classbits[CC_ALPHA];
688 70 : if (cm->classbits[CC_WORD] && pg_wc_isword(c))
689 2 : colnum |= cm->classbits[CC_WORD];
690 : assert(cm->classbits[CC_ASCII] == 0);
691 : assert(cm->classbits[CC_BLANK] == 0);
692 : assert(cm->classbits[CC_CNTRL] == 0);
693 70 : if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
694 0 : colnum |= cm->classbits[CC_DIGIT];
695 70 : if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
696 0 : colnum |= cm->classbits[CC_PUNCT];
697 : assert(cm->classbits[CC_XDIGIT] == 0);
698 70 : if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
699 0 : colnum |= cm->classbits[CC_SPACE];
700 70 : if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
701 0 : colnum |= cm->classbits[CC_LOWER];
702 70 : if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
703 0 : colnum |= cm->classbits[CC_UPPER];
704 70 : if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
705 6 : colnum |= cm->classbits[CC_GRAPH];
706 :
707 70 : return colnum;
708 : }
709 :
710 : /*
711 : * allcases - supply cvec for all case counterparts of a chr (including itself)
712 : *
713 : * This is a shortcut, preferably an efficient one, for simple characters;
714 : * messy cases are done via range().
715 : */
716 : static struct cvec *
717 1626 : allcases(struct vars *v, /* context */
718 : chr c) /* character to get case equivs of */
719 : {
720 : struct cvec *cv;
721 : chr lc,
722 : uc;
723 :
724 1626 : lc = pg_wc_tolower(c);
725 1626 : uc = pg_wc_toupper(c);
726 :
727 1626 : cv = getcvec(v, 2, 0);
728 1626 : addchr(cv, lc);
729 1626 : if (lc != uc)
730 1366 : addchr(cv, uc);
731 1626 : return cv;
732 : }
733 :
734 : /*
735 : * cmp - chr-substring compare
736 : *
737 : * Backrefs need this. It should preferably be efficient.
738 : * Note that it does not need to report anything except equal/unequal.
739 : * Note also that the length is exact, and the comparison should not
740 : * stop at embedded NULs!
741 : */
742 : static int /* 0 for equal, nonzero for unequal */
743 1284 : cmp(const chr *x, const chr *y, /* strings to compare */
744 : size_t len) /* exact length of comparison */
745 : {
746 1284 : return memcmp(VS(x), VS(y), len * sizeof(chr));
747 : }
748 :
749 : /*
750 : * casecmp - case-independent chr-substring compare
751 : *
752 : * REG_ICASE backrefs need this. It should preferably be efficient.
753 : * Note that it does not need to report anything except equal/unequal.
754 : * Note also that the length is exact, and the comparison should not
755 : * stop at embedded NULs!
756 : */
757 : static int /* 0 for equal, nonzero for unequal */
758 2 : casecmp(const chr *x, const chr *y, /* strings to compare */
759 : size_t len) /* exact length of comparison */
760 : {
761 4 : for (; len > 0; len--, x++, y++)
762 : {
763 2 : if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
764 0 : return 1;
765 : }
766 2 : return 0;
767 : }
|