|           Line data    Source code 
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regc_pg_locale.c
       4             :  *    ctype functions adapted to work on pg_wchar (a/k/a chr),
       5             :  *    and functions to cache the results of wholesale ctype probing.
       6             :  *
       7             :  * This file is #included by regcomp.c; it's not meant to compile standalone.
       8             :  *
       9             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      10             :  * Portions Copyright (c) 1994, Regents of the University of California
      11             :  *
      12             :  * IDENTIFICATION
      13             :  *    src/backend/regex/regc_pg_locale.c
      14             :  *
      15             :  *-------------------------------------------------------------------------
      16             :  */
      17             : 
      18             : #include "catalog/pg_collation.h"
      19             : #include "common/unicode_case.h"
      20             : #include "common/unicode_category.h"
      21             : #include "utils/pg_locale.h"
      22             : #include "utils/pg_locale_c.h"
      23             : 
/*
 * Locale chosen by the most recent pg_set_regex_collation() call.  The
 * regc_wc_* functions and the ctype cache in this file consult it.
 */
static pg_locale_t pg_regex_locale;

/*
 * Stand-in locale with both the collate and ctype "is C" flags set.
 * pg_set_regex_collation() points pg_regex_locale here for C_COLLATION_OID
 * and for any looked-up locale whose ctype_is_c flag is set.
 */
static struct pg_locale_struct dummy_c_locale = {
    .collate_is_c = true,
    .ctype_is_c = true,
};
      30             : 
      31             : 
      32             : /*
      33             :  * pg_set_regex_collation: set collation for these functions to obey
      34             :  *
      35             :  * This is called when beginning compilation or execution of a regexp.
      36             :  * Since there's no need for reentrancy of regexp operations, it's okay
      37             :  * to store the results in static variables.
      38             :  */
      39             : void
      40     8062808 : pg_set_regex_collation(Oid collation)
      41             : {
      42     8062808 :     pg_locale_t locale = 0;
      43             : 
      44     8062808 :     if (!OidIsValid(collation))
      45             :     {
      46             :         /*
      47             :          * This typically means that the parser could not resolve a conflict
      48             :          * of implicit collations, so report it that way.
      49             :          */
      50           0 :         ereport(ERROR,
      51             :                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
      52             :                  errmsg("could not determine which collation to use for regular expression"),
      53             :                  errhint("Use the COLLATE clause to set the collation explicitly.")));
      54             :     }
      55             : 
      56     8062808 :     if (collation == C_COLLATION_OID)
      57             :     {
      58             :         /*
      59             :          * Some callers expect regexes to work for C_COLLATION_OID before
      60             :          * catalog access is available, so we can't call
      61             :          * pg_newlocale_from_collation().
      62             :          */
      63      150144 :         locale = &dummy_c_locale;
      64             :     }
      65             :     else
      66             :     {
      67     7912664 :         locale = pg_newlocale_from_collation(collation);
      68             : 
      69     7912664 :         if (!locale->deterministic)
      70          24 :             ereport(ERROR,
      71             :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
      72             :                      errmsg("nondeterministic collations are not supported for regular expressions")));
      73             : 
      74     7912640 :         if (locale->ctype_is_c)
      75             :         {
      76             :             /*
      77             :              * C/POSIX collations use this path regardless of database
      78             :              * encoding
      79             :              */
      80         284 :             locale = &dummy_c_locale;
      81             :         }
      82             :     }
      83             : 
      84     8062784 :     pg_regex_locale = locale;
      85     8062784 : }
      86             : 
      87             : /*
      88             :  * The following functions overlap with those defined in pg_locale.c. XXX:
      89             :  * consider refactor.
      90             :  */
      91             : 
      92             : static int
      93      186910 : regc_wc_isdigit(pg_wchar c)
      94             : {
      95      186910 :     if (pg_regex_locale->ctype_is_c)
      96        4260 :         return (c <= (pg_wchar) 127 &&
      97        2130 :                 (pg_char_properties[c] & PG_ISDIGIT));
      98             :     else
      99      184780 :         return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale);
     100             : }
     101             : 
     102             : static int
     103       29718 : regc_wc_isalpha(pg_wchar c)
     104             : {
     105       29718 :     if (pg_regex_locale->ctype_is_c)
     106        1536 :         return (c <= (pg_wchar) 127 &&
     107         768 :                 (pg_char_properties[c] & PG_ISALPHA));
     108             :     else
     109       28950 :         return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale);
     110             : }
     111             : 
     112             : static int
     113       94986 : regc_wc_isalnum(pg_wchar c)
     114             : {
     115       94986 :     if (pg_regex_locale->ctype_is_c)
     116        1524 :         return (c <= (pg_wchar) 127 &&
     117         762 :                 (pg_char_properties[c] & PG_ISALNUM));
     118             :     else
     119       94224 :         return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale);
     120             : }
     121             : 
     122             : static int
     123       37634 : regc_wc_isword(pg_wchar c)
     124             : {
     125             :     /* We define word characters as alnum class plus underscore */
     126       37634 :     if (c == CHR('_'))
     127          24 :         return 1;
     128       37610 :     return regc_wc_isalnum(c);
     129             : }
     130             : 
     131             : static int
     132       40976 : regc_wc_isupper(pg_wchar c)
     133             : {
     134       40976 :     if (pg_regex_locale->ctype_is_c)
     135           0 :         return (c <= (pg_wchar) 127 &&
     136           0 :                 (pg_char_properties[c] & PG_ISUPPER));
     137             :     else
     138       40976 :         return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale);
     139             : }
     140             : 
     141             : static int
     142       16390 : regc_wc_islower(pg_wchar c)
     143             : {
     144       16390 :     if (pg_regex_locale->ctype_is_c)
     145           0 :         return (c <= (pg_wchar) 127 &&
     146           0 :                 (pg_char_properties[c] & PG_ISLOWER));
     147             :     else
     148       16390 :         return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale);
     149             : }
     150             : 
     151             : static int
     152       16390 : regc_wc_isgraph(pg_wchar c)
     153             : {
     154       16390 :     if (pg_regex_locale->ctype_is_c)
     155           0 :         return (c <= (pg_wchar) 127 &&
     156           0 :                 (pg_char_properties[c] & PG_ISGRAPH));
     157             :     else
     158       16390 :         return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale);
     159             : }
     160             : 
     161             : static int
     162       16390 : regc_wc_isprint(pg_wchar c)
     163             : {
     164       16390 :     if (pg_regex_locale->ctype_is_c)
     165           0 :         return (c <= (pg_wchar) 127 &&
     166           0 :                 (pg_char_properties[c] & PG_ISPRINT));
     167             :     else
     168       16390 :         return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale);
     169             : }
     170             : 
     171             : static int
     172       40966 : regc_wc_ispunct(pg_wchar c)
     173             : {
     174       40966 :     if (pg_regex_locale->ctype_is_c)
     175           0 :         return (c <= (pg_wchar) 127 &&
     176           0 :                 (pg_char_properties[c] & PG_ISPUNCT));
     177             :     else
     178       40966 :         return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale);
     179             : }
     180             : 
     181             : static int
     182       76386 : regc_wc_isspace(pg_wchar c)
     183             : {
     184       76386 :     if (pg_regex_locale->ctype_is_c)
     185           0 :         return (c <= (pg_wchar) 127 &&
     186           0 :                 (pg_char_properties[c] & PG_ISSPACE));
     187             :     else
     188       76386 :         return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale);
     189             : }
     190             : 
     191             : static pg_wchar
     192       10702 : regc_wc_toupper(pg_wchar c)
     193             : {
     194       10702 :     if (pg_regex_locale->ctype_is_c)
     195             :     {
     196         978 :         if (c <= (pg_wchar) 127)
     197         978 :             return pg_ascii_toupper((unsigned char) c);
     198           0 :         return c;
     199             :     }
     200             :     else
     201        9724 :         return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale);
     202             : }
     203             : 
     204             : static pg_wchar
     205       10706 : regc_wc_tolower(pg_wchar c)
     206             : {
     207       10706 :     if (pg_regex_locale->ctype_is_c)
     208             :     {
     209         978 :         if (c <= (pg_wchar) 127)
     210         978 :             return pg_ascii_tolower((unsigned char) c);
     211           0 :         return c;
     212             :     }
     213             :     else
     214        9728 :         return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale);
     215             : }
     216             : 
     217             : 
     218             : /*
     219             :  * These functions cache the results of probing libc's ctype behavior for
     220             :  * all character codes of interest in a given encoding/collation.  The
     221             :  * result is provided as a "struct cvec", but notice that the representation
     222             :  * is a touch different from a cvec created by regc_cvec.c: we allocate the
     223             :  * chrs[] and ranges[] arrays separately from the struct so that we can
     224             :  * realloc them larger at need.  This is okay since the cvecs made here
     225             :  * should never be freed by freecvec().
     226             :  *
     227             :  * We use malloc not palloc since we mustn't lose control on out-of-memory;
     228             :  * the main regex code expects us to return a failure indication instead.
     229             :  */
     230             : 
/*
 * Signature of a character-class probe: takes a character code and returns
 * nonzero if the code belongs to the class being probed.
 */
typedef int (*regc_wc_probefunc) (pg_wchar c);

/*
 * One cached probe result.  regc_ctype_get_cache() looks entries up by the
 * (probefunc, locale) pair and walks the list through "next".
 */
typedef struct pg_ctype_cache
{
    regc_wc_probefunc probefunc;    /* regc_wc_isalpha or a sibling */
    pg_locale_t locale;         /* locale this entry is for */
    struct cvec cv;             /* cache entry contents */
    struct pg_ctype_cache *next;    /* chain link */
} pg_ctype_cache;

/* Head of the cache list; new entries are linked in at the front */
static pg_ctype_cache *pg_ctype_cache_list = NULL;
     242             : 
     243             : /*
     244             :  * Add a chr or range to pcc->cv; return false if run out of memory
     245             :  */
     246             : static bool
     247       11794 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
     248             : {
     249             :     chr        *newchrs;
     250             : 
     251       11794 :     if (nchrs > 1)
     252             :     {
     253        3720 :         if (pcc->cv.nranges >= pcc->cv.rangespace)
     254             :         {
     255           0 :             pcc->cv.rangespace *= 2;
     256           0 :             newchrs = (chr *) realloc(pcc->cv.ranges,
     257           0 :                                       pcc->cv.rangespace * sizeof(chr) * 2);
     258           0 :             if (newchrs == NULL)
     259           0 :                 return false;
     260           0 :             pcc->cv.ranges = newchrs;
     261             :         }
     262        3720 :         pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
     263        3720 :         pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
     264        3720 :         pcc->cv.nranges++;
     265             :     }
     266             :     else
     267             :     {
     268             :         assert(nchrs == 1);
     269        8074 :         if (pcc->cv.nchrs >= pcc->cv.chrspace)
     270             :         {
     271          28 :             pcc->cv.chrspace *= 2;
     272          28 :             newchrs = (chr *) realloc(pcc->cv.chrs,
     273          28 :                                       pcc->cv.chrspace * sizeof(chr));
     274          28 :             if (newchrs == NULL)
     275           0 :                 return false;
     276          28 :             pcc->cv.chrs = newchrs;
     277             :         }
     278        8074 :         pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
     279             :     }
     280       11794 :     return true;
     281             : }
     282             : 
     283             : /*
     284             :  * Given a probe function (e.g., regc_wc_isalpha) get a struct cvec for all
     285             :  * chrs satisfying the probe function.  The active collation is the one
     286             :  * previously set by pg_set_regex_collation.  Return NULL if out of memory.
     287             :  *
     288             :  * Note that the result must not be freed or modified by caller.
     289             :  */
     290             : static struct cvec *
     291         878 : regc_ctype_get_cache(regc_wc_probefunc probefunc, int cclasscode)
     292             : {
     293             :     pg_ctype_cache *pcc;
     294             :     pg_wchar    max_chr;
     295             :     pg_wchar    cur_chr;
     296             :     int         nmatches;
     297             :     chr        *newchrs;
     298             : 
     299             :     /*
     300             :      * Do we already have the answer cached?
     301             :      */
     302        2040 :     for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
     303             :     {
     304        1762 :         if (pcc->probefunc == probefunc &&
     305         672 :             pcc->locale == pg_regex_locale)
     306         600 :             return &pcc->cv;
     307             :     }
     308             : 
     309             :     /*
     310             :      * Nope, so initialize some workspace ...
     311             :      */
     312         278 :     pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
     313         278 :     if (pcc == NULL)
     314           0 :         return NULL;
     315         278 :     pcc->probefunc = probefunc;
     316         278 :     pcc->locale = pg_regex_locale;
     317         278 :     pcc->cv.nchrs = 0;
     318         278 :     pcc->cv.chrspace = 128;
     319         278 :     pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
     320         278 :     pcc->cv.nranges = 0;
     321         278 :     pcc->cv.rangespace = 64;
     322         278 :     pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
     323         278 :     if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
     324           0 :         goto out_of_memory;
     325         278 :     pcc->cv.cclasscode = cclasscode;
     326             : 
     327             :     /*
     328             :      * Decide how many character codes we ought to look through.  In general
     329             :      * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
     330             :      * runtime using the "high colormap" mechanism.  However, in C locale
     331             :      * there's no need to go further than 127, and if we only have a 1-byte
     332             :      * <ctype.h> API there's no need to go further than that can handle.
     333             :      *
     334             :      * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
     335             :      * output cvec as not having any locale-dependent behavior, since there
     336             :      * will be no need to do any run-time locale checks.  (The #if's here
     337             :      * would always be true for production values of MAX_SIMPLE_CHR, but it's
     338             :      * useful to allow it to be small for testing purposes.)
     339             :      */
     340         278 :     if (pg_regex_locale->ctype_is_c)
     341             :     {
     342             : #if MAX_SIMPLE_CHR >= 127
     343          28 :         max_chr = (pg_wchar) 127;
     344          28 :         pcc->cv.cclasscode = -1;
     345             : #else
     346             :         max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     347             : #endif
     348             :     }
     349             :     else
     350             :     {
     351         250 :         if (pg_regex_locale->ctype->max_chr != 0 &&
     352           0 :             pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR)
     353             :         {
     354           0 :             max_chr = pg_regex_locale->ctype->max_chr;
     355           0 :             pcc->cv.cclasscode = -1;
     356             :         }
     357             :         else
     358         250 :             max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     359             :     }
     360             : 
     361             :     /*
     362             :      * And scan 'em ...
     363             :      */
     364         278 :     nmatches = 0;               /* number of consecutive matches */
     365             : 
     366      515862 :     for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
     367             :     {
     368      515584 :         if ((*probefunc) (cur_chr))
     369      142080 :             nmatches++;
     370      373504 :         else if (nmatches > 0)
     371             :         {
     372       11770 :             if (!store_match(pcc, cur_chr - nmatches, nmatches))
     373           0 :                 goto out_of_memory;
     374       11770 :             nmatches = 0;
     375             :         }
     376             :     }
     377             : 
     378         278 :     if (nmatches > 0)
     379          24 :         if (!store_match(pcc, cur_chr - nmatches, nmatches))
     380           0 :             goto out_of_memory;
     381             : 
     382             :     /*
     383             :      * We might have allocated more memory than needed, if so free it
     384             :      */
     385         278 :     if (pcc->cv.nchrs == 0)
     386             :     {
     387         112 :         free(pcc->cv.chrs);
     388         112 :         pcc->cv.chrs = NULL;
     389         112 :         pcc->cv.chrspace = 0;
     390             :     }
     391         166 :     else if (pcc->cv.nchrs < pcc->cv.chrspace)
     392             :     {
     393         166 :         newchrs = (chr *) realloc(pcc->cv.chrs,
     394         166 :                                   pcc->cv.nchrs * sizeof(chr));
     395         166 :         if (newchrs == NULL)
     396           0 :             goto out_of_memory;
     397         166 :         pcc->cv.chrs = newchrs;
     398         166 :         pcc->cv.chrspace = pcc->cv.nchrs;
     399             :     }
     400         278 :     if (pcc->cv.nranges == 0)
     401             :     {
     402           0 :         free(pcc->cv.ranges);
     403           0 :         pcc->cv.ranges = NULL;
     404           0 :         pcc->cv.rangespace = 0;
     405             :     }
     406         278 :     else if (pcc->cv.nranges < pcc->cv.rangespace)
     407             :     {
     408         278 :         newchrs = (chr *) realloc(pcc->cv.ranges,
     409         278 :                                   pcc->cv.nranges * sizeof(chr) * 2);
     410         278 :         if (newchrs == NULL)
     411           0 :             goto out_of_memory;
     412         278 :         pcc->cv.ranges = newchrs;
     413         278 :         pcc->cv.rangespace = pcc->cv.nranges;
     414             :     }
     415             : 
     416             :     /*
     417             :      * Success, link it into cache chain
     418             :      */
     419         278 :     pcc->next = pg_ctype_cache_list;
     420         278 :     pg_ctype_cache_list = pcc;
     421             : 
     422         278 :     return &pcc->cv;
     423             : 
     424             :     /*
     425             :      * Failure, clean up
     426             :      */
     427           0 : out_of_memory:
     428           0 :     free(pcc->cv.chrs);
     429           0 :     free(pcc->cv.ranges);
     430           0 :     free(pcc);
     431             : 
     432           0 :     return NULL;
     433             : }
 |