LCOV - code coverage report
Current view: top level - src/backend/regex - regc_pg_locale.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 75.9 % 158 120
Test Date: 2026-03-02 08:16:13 Functions: 100.0 % 15 15
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * regc_pg_locale.c
       4              :  *    ctype functions adapted to work on pg_wchar (a/k/a chr),
       5              :  *    and functions to cache the results of wholesale ctype probing.
       6              :  *
       7              :  * This file is #included by regcomp.c; it's not meant to compile standalone.
       8              :  *
       9              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      10              :  * Portions Copyright (c) 1994, Regents of the University of California
      11              :  *
      12              :  * IDENTIFICATION
      13              :  *    src/backend/regex/regc_pg_locale.c
      14              :  *
      15              :  *-------------------------------------------------------------------------
      16              :  */
      17              : 
      18              : #include "catalog/pg_collation.h"
      19              : #include "common/unicode_case.h"
      20              : #include "common/unicode_category.h"
      21              : #include "utils/pg_locale.h"
      22              : #include "utils/pg_locale_c.h"
      23              : 
      24              : static pg_locale_t pg_regex_locale;
      25              : 
      26              : 
      27              : /*
      28              :  * pg_set_regex_collation: set collation for these functions to obey
      29              :  *
      30              :  * This is called when beginning compilation or execution of a regexp.
      31              :  * Since there's no need for reentrancy of regexp operations, it's okay
      32              :  * to store the results in static variables.
      33              :  */
      34              : void
      35      4166639 : pg_set_regex_collation(Oid collation)
      36              : {
      37      4166639 :     pg_locale_t locale = 0;
      38              : 
      39      4166639 :     if (!OidIsValid(collation))
      40              :     {
      41              :         /*
      42              :          * This typically means that the parser could not resolve a conflict
      43              :          * of implicit collations, so report it that way.
      44              :          */
      45            0 :         ereport(ERROR,
      46              :                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
      47              :                  errmsg("could not determine which collation to use for regular expression"),
      48              :                  errhint("Use the COLLATE clause to set the collation explicitly.")));
      49              :     }
      50              : 
      51      4166639 :     locale = pg_newlocale_from_collation(collation);
      52              : 
      53      4166639 :     if (!locale->deterministic)
      54           12 :         ereport(ERROR,
      55              :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
      56              :                  errmsg("nondeterministic collations are not supported for regular expressions")));
      57              : 
      58      4166627 :     pg_regex_locale = locale;
      59      4166627 : }
      60              : 
      61              : /*
      62              :  * The following functions overlap with those defined in pg_locale.c. XXX:
      63              :  * consider refactor.
      64              :  */
      65              : 
      66              : static int
      67        93455 : regc_wc_isdigit(pg_wchar c)
      68              : {
      69        93455 :     if (pg_regex_locale->ctype_is_c)
      70         2130 :         return (c <= (pg_wchar) 127 &&
      71         1065 :                 (pg_char_properties[c] & PG_ISDIGIT));
      72              :     else
      73        92390 :         return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale);
      74              : }
      75              : 
      76              : static int
      77        14859 : regc_wc_isalpha(pg_wchar c)
      78              : {
      79        14859 :     if (pg_regex_locale->ctype_is_c)
      80          768 :         return (c <= (pg_wchar) 127 &&
      81          384 :                 (pg_char_properties[c] & PG_ISALPHA));
      82              :     else
      83        14475 :         return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale);
      84              : }
      85              : 
      86              : static int
      87        47493 : regc_wc_isalnum(pg_wchar c)
      88              : {
      89        47493 :     if (pg_regex_locale->ctype_is_c)
      90          762 :         return (c <= (pg_wchar) 127 &&
      91          381 :                 (pg_char_properties[c] & PG_ISALNUM));
      92              :     else
      93        47112 :         return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale);
      94              : }
      95              : 
      96              : static int
      97        18817 : regc_wc_isword(pg_wchar c)
      98              : {
      99              :     /* We define word characters as alnum class plus underscore */
     100        18817 :     if (c == CHR('_'))
     101           12 :         return 1;
     102        18805 :     return regc_wc_isalnum(c);
     103              : }
     104              : 
     105              : static int
     106        20488 : regc_wc_isupper(pg_wchar c)
     107              : {
     108        20488 :     if (pg_regex_locale->ctype_is_c)
     109            0 :         return (c <= (pg_wchar) 127 &&
     110            0 :                 (pg_char_properties[c] & PG_ISUPPER));
     111              :     else
     112        20488 :         return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale);
     113              : }
     114              : 
     115              : static int
     116         8195 : regc_wc_islower(pg_wchar c)
     117              : {
     118         8195 :     if (pg_regex_locale->ctype_is_c)
     119            0 :         return (c <= (pg_wchar) 127 &&
     120            0 :                 (pg_char_properties[c] & PG_ISLOWER));
     121              :     else
     122         8195 :         return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale);
     123              : }
     124              : 
     125              : static int
     126         8195 : regc_wc_isgraph(pg_wchar c)
     127              : {
     128         8195 :     if (pg_regex_locale->ctype_is_c)
     129            0 :         return (c <= (pg_wchar) 127 &&
     130            0 :                 (pg_char_properties[c] & PG_ISGRAPH));
     131              :     else
     132         8195 :         return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale);
     133              : }
     134              : 
     135              : static int
     136         8195 : regc_wc_isprint(pg_wchar c)
     137              : {
     138         8195 :     if (pg_regex_locale->ctype_is_c)
     139            0 :         return (c <= (pg_wchar) 127 &&
     140            0 :                 (pg_char_properties[c] & PG_ISPRINT));
     141              :     else
     142         8195 :         return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale);
     143              : }
     144              : 
     145              : static int
     146        20483 : regc_wc_ispunct(pg_wchar c)
     147              : {
     148        20483 :     if (pg_regex_locale->ctype_is_c)
     149            0 :         return (c <= (pg_wchar) 127 &&
     150            0 :                 (pg_char_properties[c] & PG_ISPUNCT));
     151              :     else
     152        20483 :         return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale);
     153              : }
     154              : 
     155              : static int
     156        38193 : regc_wc_isspace(pg_wchar c)
     157              : {
     158        38193 :     if (pg_regex_locale->ctype_is_c)
     159            0 :         return (c <= (pg_wchar) 127 &&
     160            0 :                 (pg_char_properties[c] & PG_ISSPACE));
     161              :     else
     162        38193 :         return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale);
     163              : }
     164              : 
     165              : static pg_wchar
     166         5351 : regc_wc_toupper(pg_wchar c)
     167              : {
     168         5351 :     if (pg_regex_locale->ctype_is_c)
     169              :     {
     170          489 :         if (c <= (pg_wchar) 127)
     171          489 :             return pg_ascii_toupper((unsigned char) c);
     172            0 :         return c;
     173              :     }
     174              :     else
     175         4862 :         return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale);
     176              : }
     177              : 
     178              : static pg_wchar
     179         5353 : regc_wc_tolower(pg_wchar c)
     180              : {
     181         5353 :     if (pg_regex_locale->ctype_is_c)
     182              :     {
     183          489 :         if (c <= (pg_wchar) 127)
     184          489 :             return pg_ascii_tolower((unsigned char) c);
     185            0 :         return c;
     186              :     }
     187              :     else
     188         4864 :         return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale);
     189              : }
     190              : 
     191              : 
     192              : /*
     193              :  * These functions cache the results of probing libc's ctype behavior for
     194              :  * all character codes of interest in a given encoding/collation.  The
     195              :  * result is provided as a "struct cvec", but notice that the representation
     196              :  * is a touch different from a cvec created by regc_cvec.c: we allocate the
     197              :  * chrs[] and ranges[] arrays separately from the struct so that we can
     198              :  * realloc them larger at need.  This is okay since the cvecs made here
     199              :  * should never be freed by freecvec().
     200              :  *
     201              :  * We use malloc not palloc since we mustn't lose control on out-of-memory;
     202              :  * the main regex code expects us to return a failure indication instead.
     203              :  */
     204              : 
     205              : typedef int (*regc_wc_probefunc) (pg_wchar c);
     206              : 
     207              : typedef struct pg_ctype_cache
     208              : {
     209              :     regc_wc_probefunc probefunc;    /* regc_wc_isalpha or a sibling */
     210              :     pg_locale_t locale;         /* locale this entry is for */
     211              :     struct cvec cv;             /* cache entry contents */
     212              :     struct pg_ctype_cache *next;    /* chain link */
     213              : } pg_ctype_cache;
     214              : 
     215              : static pg_ctype_cache *pg_ctype_cache_list = NULL;
     216              : 
     217              : /*
     218              :  * Add a chr or range to pcc->cv; return false if run out of memory
     219              :  */
     220              : static bool
     221         5897 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
     222              : {
     223              :     chr        *newchrs;
     224              : 
     225         5897 :     if (nchrs > 1)
     226              :     {
     227         1860 :         if (pcc->cv.nranges >= pcc->cv.rangespace)
     228              :         {
     229            0 :             pcc->cv.rangespace *= 2;
     230            0 :             newchrs = (chr *) realloc(pcc->cv.ranges,
     231            0 :                                       pcc->cv.rangespace * sizeof(chr) * 2);
     232            0 :             if (newchrs == NULL)
     233            0 :                 return false;
     234            0 :             pcc->cv.ranges = newchrs;
     235              :         }
     236         1860 :         pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
     237         1860 :         pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
     238         1860 :         pcc->cv.nranges++;
     239              :     }
     240              :     else
     241              :     {
     242              :         assert(nchrs == 1);
     243         4037 :         if (pcc->cv.nchrs >= pcc->cv.chrspace)
     244              :         {
     245           14 :             pcc->cv.chrspace *= 2;
     246           14 :             newchrs = (chr *) realloc(pcc->cv.chrs,
     247           14 :                                       pcc->cv.chrspace * sizeof(chr));
     248           14 :             if (newchrs == NULL)
     249            0 :                 return false;
     250           14 :             pcc->cv.chrs = newchrs;
     251              :         }
     252         4037 :         pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
     253              :     }
     254         5897 :     return true;
     255              : }
     256              : 
     257              : /*
     258              :  * Given a probe function (e.g., regc_wc_isalpha) get a struct cvec for all
     259              :  * chrs satisfying the probe function.  The active collation is the one
     260              :  * previously set by pg_set_regex_collation.  Return NULL if out of memory.
     261              :  *
     262              :  * Note that the result must not be freed or modified by caller.
     263              :  */
     264              : static struct cvec *
     265          439 : regc_ctype_get_cache(regc_wc_probefunc probefunc, int cclasscode)
     266              : {
     267              :     pg_ctype_cache *pcc;
     268              :     pg_wchar    max_chr;
     269              :     pg_wchar    cur_chr;
     270              :     int         nmatches;
     271              :     chr        *newchrs;
     272              : 
     273              :     /*
     274              :      * Do we already have the answer cached?
     275              :      */
     276         1020 :     for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
     277              :     {
     278          881 :         if (pcc->probefunc == probefunc &&
     279          336 :             pcc->locale == pg_regex_locale)
     280          300 :             return &pcc->cv;
     281              :     }
     282              : 
     283              :     /*
     284              :      * Nope, so initialize some workspace ...
     285              :      */
     286          139 :     pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
     287          139 :     if (pcc == NULL)
     288            0 :         return NULL;
     289          139 :     pcc->probefunc = probefunc;
     290          139 :     pcc->locale = pg_regex_locale;
     291          139 :     pcc->cv.nchrs = 0;
     292          139 :     pcc->cv.chrspace = 128;
     293          139 :     pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
     294          139 :     pcc->cv.nranges = 0;
     295          139 :     pcc->cv.rangespace = 64;
     296          139 :     pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
     297          139 :     if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
     298            0 :         goto out_of_memory;
     299          139 :     pcc->cv.cclasscode = cclasscode;
     300              : 
     301              :     /*
     302              :      * Decide how many character codes we ought to look through.  In general
     303              :      * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
     304              :      * runtime using the "high colormap" mechanism.  However, in C locale
     305              :      * there's no need to go further than 127, and if we only have a 1-byte
     306              :      * <ctype.h> API there's no need to go further than that can handle.
     307              :      *
     308              :      * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
     309              :      * output cvec as not having any locale-dependent behavior, since there
     310              :      * will be no need to do any run-time locale checks.  (The #if's here
     311              :      * would always be true for production values of MAX_SIMPLE_CHR, but it's
     312              :      * useful to allow it to be small for testing purposes.)
     313              :      */
     314          139 :     if (pg_regex_locale->ctype_is_c)
     315              :     {
     316              : #if MAX_SIMPLE_CHR >= 127
     317           14 :         max_chr = (pg_wchar) 127;
     318           14 :         pcc->cv.cclasscode = -1;
     319              : #else
     320              :         max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     321              : #endif
     322              :     }
     323          125 :     else if (GetDatabaseEncoding() == PG_UTF8)
     324              :     {
     325          125 :         max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     326              :     }
     327              :     else
     328              :     {
     329              : #if MAX_SIMPLE_CHR >= UCHAR_MAX
     330            0 :         max_chr = (pg_wchar) UCHAR_MAX;
     331            0 :         pcc->cv.cclasscode = -1;
     332              : #else
     333              :         max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     334              : #endif
     335              :     }
     336              : 
     337              :     /*
     338              :      * And scan 'em ...
     339              :      */
     340          139 :     nmatches = 0;               /* number of consecutive matches */
     341              : 
     342       257931 :     for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
     343              :     {
     344       257792 :         if ((*probefunc) (cur_chr))
     345        71209 :             nmatches++;
     346       186583 :         else if (nmatches > 0)
     347              :         {
     348         5885 :             if (!store_match(pcc, cur_chr - nmatches, nmatches))
     349            0 :                 goto out_of_memory;
     350         5885 :             nmatches = 0;
     351              :         }
     352              :     }
     353              : 
     354          139 :     if (nmatches > 0)
     355           12 :         if (!store_match(pcc, cur_chr - nmatches, nmatches))
     356            0 :             goto out_of_memory;
     357              : 
     358              :     /*
     359              :      * We might have allocated more memory than needed, if so free it
     360              :      */
     361          139 :     if (pcc->cv.nchrs == 0)
     362              :     {
     363           56 :         free(pcc->cv.chrs);
     364           56 :         pcc->cv.chrs = NULL;
     365           56 :         pcc->cv.chrspace = 0;
     366              :     }
     367           83 :     else if (pcc->cv.nchrs < pcc->cv.chrspace)
     368              :     {
     369           83 :         newchrs = (chr *) realloc(pcc->cv.chrs,
     370           83 :                                   pcc->cv.nchrs * sizeof(chr));
     371           83 :         if (newchrs == NULL)
     372            0 :             goto out_of_memory;
     373           83 :         pcc->cv.chrs = newchrs;
     374           83 :         pcc->cv.chrspace = pcc->cv.nchrs;
     375              :     }
     376          139 :     if (pcc->cv.nranges == 0)
     377              :     {
     378            0 :         free(pcc->cv.ranges);
     379            0 :         pcc->cv.ranges = NULL;
     380            0 :         pcc->cv.rangespace = 0;
     381              :     }
     382          139 :     else if (pcc->cv.nranges < pcc->cv.rangespace)
     383              :     {
     384          139 :         newchrs = (chr *) realloc(pcc->cv.ranges,
     385          139 :                                   pcc->cv.nranges * sizeof(chr) * 2);
     386          139 :         if (newchrs == NULL)
     387            0 :             goto out_of_memory;
     388          139 :         pcc->cv.ranges = newchrs;
     389          139 :         pcc->cv.rangespace = pcc->cv.nranges;
     390              :     }
     391              : 
     392              :     /*
     393              :      * Success, link it into cache chain
     394              :      */
     395          139 :     pcc->next = pg_ctype_cache_list;
     396          139 :     pg_ctype_cache_list = pcc;
     397              : 
     398          139 :     return &pcc->cv;
     399              : 
     400              :     /*
     401              :      * Failure, clean up
     402              :      */
     403            0 : out_of_memory:
     404            0 :     free(pcc->cv.chrs);
     405            0 :     free(pcc->cv.ranges);
     406            0 :     free(pcc);
     407              : 
     408            0 :     return NULL;
     409              : }
        

Generated by: LCOV version 2.0-1