LCOV - code coverage report
Current view: top level - src/backend/regex - regc_pg_locale.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 120 159 75.5 %
Date: 2025-11-26 13:17:43 Functions: 15 15 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regc_pg_locale.c
       4             :  *    ctype functions adapted to work on pg_wchar (a/k/a chr),
       5             :  *    and functions to cache the results of wholesale ctype probing.
       6             :  *
       7             :  * This file is #included by regcomp.c; it's not meant to compile standalone.
       8             :  *
       9             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      10             :  * Portions Copyright (c) 1994, Regents of the University of California
      11             :  *
      12             :  * IDENTIFICATION
      13             :  *    src/backend/regex/regc_pg_locale.c
      14             :  *
      15             :  *-------------------------------------------------------------------------
      16             :  */
      17             : 
      18             : #include "catalog/pg_collation.h"
      19             : #include "common/unicode_case.h"
      20             : #include "common/unicode_category.h"
      21             : #include "utils/pg_locale.h"
      22             : #include "utils/pg_locale_c.h"
      23             : 
      24             : static pg_locale_t pg_regex_locale;
      25             : 
      26             : 
      27             : /*
      28             :  * pg_set_regex_collation: set collation for these functions to obey
      29             :  *
      30             :  * This is called when beginning compilation or execution of a regexp.
      31             :  * Since there's no need for reentrancy of regexp operations, it's okay
      32             :  * to store the results in static variables.
      33             :  */
      34             : void
      35     8042396 : pg_set_regex_collation(Oid collation)
      36             : {
      37     8042396 :     pg_locale_t locale = 0;
      38             : 
      39     8042396 :     if (!OidIsValid(collation))
      40             :     {
      41             :         /*
      42             :          * This typically means that the parser could not resolve a conflict
      43             :          * of implicit collations, so report it that way.
      44             :          */
      45           0 :         ereport(ERROR,
      46             :                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
      47             :                  errmsg("could not determine which collation to use for regular expression"),
      48             :                  errhint("Use the COLLATE clause to set the collation explicitly.")));
      49             :     }
      50             : 
      51     8042396 :     locale = pg_newlocale_from_collation(collation);
      52             : 
      53     8042396 :     if (!locale->deterministic)
      54          24 :         ereport(ERROR,
      55             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
      56             :                  errmsg("nondeterministic collations are not supported for regular expressions")));
      57             : 
      58     8042372 :     pg_regex_locale = locale;
      59     8042372 : }
      60             : 
      61             : /*
      62             :  * The following functions overlap with those defined in pg_locale.c. XXX:
      63             :  * consider refactor.
      64             :  */
      65             : 
      66             : static int
      67      186910 : regc_wc_isdigit(pg_wchar c)
      68             : {
      69      186910 :     if (pg_regex_locale->ctype_is_c)
      70        4260 :         return (c <= (pg_wchar) 127 &&
      71        2130 :                 (pg_char_properties[c] & PG_ISDIGIT));
      72             :     else
      73      184780 :         return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale);
      74             : }
      75             : 
      76             : static int
      77       29718 : regc_wc_isalpha(pg_wchar c)
      78             : {
      79       29718 :     if (pg_regex_locale->ctype_is_c)
      80        1536 :         return (c <= (pg_wchar) 127 &&
      81         768 :                 (pg_char_properties[c] & PG_ISALPHA));
      82             :     else
      83       28950 :         return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale);
      84             : }
      85             : 
      86             : static int
      87       94986 : regc_wc_isalnum(pg_wchar c)
      88             : {
      89       94986 :     if (pg_regex_locale->ctype_is_c)
      90        1524 :         return (c <= (pg_wchar) 127 &&
      91         762 :                 (pg_char_properties[c] & PG_ISALNUM));
      92             :     else
      93       94224 :         return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale);
      94             : }
      95             : 
      96             : static int
      97       37634 : regc_wc_isword(pg_wchar c)
      98             : {
      99             :     /* We define word characters as alnum class plus underscore */
     100       37634 :     if (c == CHR('_'))
     101          24 :         return 1;
     102       37610 :     return regc_wc_isalnum(c);
     103             : }
     104             : 
     105             : static int
     106       40976 : regc_wc_isupper(pg_wchar c)
     107             : {
     108       40976 :     if (pg_regex_locale->ctype_is_c)
     109           0 :         return (c <= (pg_wchar) 127 &&
     110           0 :                 (pg_char_properties[c] & PG_ISUPPER));
     111             :     else
     112       40976 :         return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale);
     113             : }
     114             : 
     115             : static int
     116       16390 : regc_wc_islower(pg_wchar c)
     117             : {
     118       16390 :     if (pg_regex_locale->ctype_is_c)
     119           0 :         return (c <= (pg_wchar) 127 &&
     120           0 :                 (pg_char_properties[c] & PG_ISLOWER));
     121             :     else
     122       16390 :         return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale);
     123             : }
     124             : 
     125             : static int
     126       16390 : regc_wc_isgraph(pg_wchar c)
     127             : {
     128       16390 :     if (pg_regex_locale->ctype_is_c)
     129           0 :         return (c <= (pg_wchar) 127 &&
     130           0 :                 (pg_char_properties[c] & PG_ISGRAPH));
     131             :     else
     132       16390 :         return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale);
     133             : }
     134             : 
     135             : static int
     136       16390 : regc_wc_isprint(pg_wchar c)
     137             : {
     138       16390 :     if (pg_regex_locale->ctype_is_c)
     139           0 :         return (c <= (pg_wchar) 127 &&
     140           0 :                 (pg_char_properties[c] & PG_ISPRINT));
     141             :     else
     142       16390 :         return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale);
     143             : }
     144             : 
     145             : static int
     146       40966 : regc_wc_ispunct(pg_wchar c)
     147             : {
     148       40966 :     if (pg_regex_locale->ctype_is_c)
     149           0 :         return (c <= (pg_wchar) 127 &&
     150           0 :                 (pg_char_properties[c] & PG_ISPUNCT));
     151             :     else
     152       40966 :         return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale);
     153             : }
     154             : 
     155             : static int
     156       76386 : regc_wc_isspace(pg_wchar c)
     157             : {
     158       76386 :     if (pg_regex_locale->ctype_is_c)
     159           0 :         return (c <= (pg_wchar) 127 &&
     160           0 :                 (pg_char_properties[c] & PG_ISSPACE));
     161             :     else
     162       76386 :         return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale);
     163             : }
     164             : 
     165             : static pg_wchar
     166       10702 : regc_wc_toupper(pg_wchar c)
     167             : {
     168       10702 :     if (pg_regex_locale->ctype_is_c)
     169             :     {
     170         978 :         if (c <= (pg_wchar) 127)
     171         978 :             return pg_ascii_toupper((unsigned char) c);
     172           0 :         return c;
     173             :     }
     174             :     else
     175        9724 :         return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale);
     176             : }
     177             : 
     178             : static pg_wchar
     179       10706 : regc_wc_tolower(pg_wchar c)
     180             : {
     181       10706 :     if (pg_regex_locale->ctype_is_c)
     182             :     {
     183         978 :         if (c <= (pg_wchar) 127)
     184         978 :             return pg_ascii_tolower((unsigned char) c);
     185           0 :         return c;
     186             :     }
     187             :     else
     188        9728 :         return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale);
     189             : }
     190             : 
     191             : 
     192             : /*
     193             :  * These functions cache the results of probing libc's ctype behavior for
     194             :  * all character codes of interest in a given encoding/collation.  The
     195             :  * result is provided as a "struct cvec", but notice that the representation
     196             :  * is a touch different from a cvec created by regc_cvec.c: we allocate the
     197             :  * chrs[] and ranges[] arrays separately from the struct so that we can
     198             :  * realloc them larger at need.  This is okay since the cvecs made here
     199             :  * should never be freed by freecvec().
     200             :  *
     201             :  * We use malloc not palloc since we mustn't lose control on out-of-memory;
     202             :  * the main regex code expects us to return a failure indication instead.
     203             :  */
     204             : 
     205             : typedef int (*regc_wc_probefunc) (pg_wchar c);
     206             : 
     207             : typedef struct pg_ctype_cache
     208             : {
     209             :     regc_wc_probefunc probefunc;    /* regc_wc_isalpha or a sibling */
     210             :     pg_locale_t locale;         /* locale this entry is for */
     211             :     struct cvec cv;             /* cache entry contents */
     212             :     struct pg_ctype_cache *next;    /* chain link */
     213             : } pg_ctype_cache;
     214             : 
     215             : static pg_ctype_cache *pg_ctype_cache_list = NULL;
     216             : 
     217             : /*
     218             :  * Add a chr or range to pcc->cv; return false if run out of memory
     219             :  */
     220             : static bool
     221       11794 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
     222             : {
     223             :     chr        *newchrs;
     224             : 
     225       11794 :     if (nchrs > 1)
     226             :     {
     227        3720 :         if (pcc->cv.nranges >= pcc->cv.rangespace)
     228             :         {
     229           0 :             pcc->cv.rangespace *= 2;
     230           0 :             newchrs = (chr *) realloc(pcc->cv.ranges,
     231           0 :                                       pcc->cv.rangespace * sizeof(chr) * 2);
     232           0 :             if (newchrs == NULL)
     233           0 :                 return false;
     234           0 :             pcc->cv.ranges = newchrs;
     235             :         }
     236        3720 :         pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
     237        3720 :         pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
     238        3720 :         pcc->cv.nranges++;
     239             :     }
     240             :     else
     241             :     {
     242             :         assert(nchrs == 1);
     243        8074 :         if (pcc->cv.nchrs >= pcc->cv.chrspace)
     244             :         {
     245          28 :             pcc->cv.chrspace *= 2;
     246          28 :             newchrs = (chr *) realloc(pcc->cv.chrs,
     247          28 :                                       pcc->cv.chrspace * sizeof(chr));
     248          28 :             if (newchrs == NULL)
     249           0 :                 return false;
     250          28 :             pcc->cv.chrs = newchrs;
     251             :         }
     252        8074 :         pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
     253             :     }
     254       11794 :     return true;
     255             : }
     256             : 
     257             : /*
     258             :  * Given a probe function (e.g., regc_wc_isalpha) get a struct cvec for all
     259             :  * chrs satisfying the probe function.  The active collation is the one
     260             :  * previously set by pg_set_regex_collation.  Return NULL if out of memory.
     261             :  *
     262             :  * Note that the result must not be freed or modified by caller.
     263             :  */
     264             : static struct cvec *
     265         878 : regc_ctype_get_cache(regc_wc_probefunc probefunc, int cclasscode)
     266             : {
     267             :     pg_ctype_cache *pcc;
     268             :     pg_wchar    max_chr;
     269             :     pg_wchar    cur_chr;
     270             :     int         nmatches;
     271             :     chr        *newchrs;
     272             : 
     273             :     /*
     274             :      * Do we already have the answer cached?
     275             :      */
     276        2040 :     for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
     277             :     {
     278        1762 :         if (pcc->probefunc == probefunc &&
     279         672 :             pcc->locale == pg_regex_locale)
     280         600 :             return &pcc->cv;
     281             :     }
     282             : 
     283             :     /*
     284             :      * Nope, so initialize some workspace ...
     285             :      */
     286         278 :     pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
     287         278 :     if (pcc == NULL)
     288           0 :         return NULL;
     289         278 :     pcc->probefunc = probefunc;
     290         278 :     pcc->locale = pg_regex_locale;
     291         278 :     pcc->cv.nchrs = 0;
     292         278 :     pcc->cv.chrspace = 128;
     293         278 :     pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
     294         278 :     pcc->cv.nranges = 0;
     295         278 :     pcc->cv.rangespace = 64;
     296         278 :     pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
     297         278 :     if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
     298           0 :         goto out_of_memory;
     299         278 :     pcc->cv.cclasscode = cclasscode;
     300             : 
     301             :     /*
     302             :      * Decide how many character codes we ought to look through.  In general
     303             :      * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
     304             :      * runtime using the "high colormap" mechanism.  However, in C locale
     305             :      * there's no need to go further than 127, and if we only have a 1-byte
     306             :      * <ctype.h> API there's no need to go further than that can handle.
     307             :      *
     308             :      * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
     309             :      * output cvec as not having any locale-dependent behavior, since there
     310             :      * will be no need to do any run-time locale checks.  (The #if's here
     311             :      * would always be true for production values of MAX_SIMPLE_CHR, but it's
     312             :      * useful to allow it to be small for testing purposes.)
     313             :      */
     314         278 :     if (pg_regex_locale->ctype_is_c)
     315             :     {
     316             : #if MAX_SIMPLE_CHR >= 127
     317          28 :         max_chr = (pg_wchar) 127;
     318          28 :         pcc->cv.cclasscode = -1;
     319             : #else
     320             :         max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     321             : #endif
     322             :     }
     323             :     else
     324             :     {
     325         250 :         if (pg_regex_locale->ctype->max_chr != 0 &&
     326           0 :             pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR)
     327             :         {
     328           0 :             max_chr = pg_regex_locale->ctype->max_chr;
     329           0 :             pcc->cv.cclasscode = -1;
     330             :         }
     331             :         else
     332         250 :             max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     333             :     }
     334             : 
     335             :     /*
     336             :      * And scan 'em ...
     337             :      */
     338         278 :     nmatches = 0;               /* number of consecutive matches */
     339             : 
     340      515862 :     for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
     341             :     {
     342      515584 :         if ((*probefunc) (cur_chr))
     343      142080 :             nmatches++;
     344      373504 :         else if (nmatches > 0)
     345             :         {
     346       11770 :             if (!store_match(pcc, cur_chr - nmatches, nmatches))
     347           0 :                 goto out_of_memory;
     348       11770 :             nmatches = 0;
     349             :         }
     350             :     }
     351             : 
     352         278 :     if (nmatches > 0)
     353          24 :         if (!store_match(pcc, cur_chr - nmatches, nmatches))
     354           0 :             goto out_of_memory;
     355             : 
     356             :     /*
     357             :      * We might have allocated more memory than needed, if so free it
     358             :      */
     359         278 :     if (pcc->cv.nchrs == 0)
     360             :     {
     361         112 :         free(pcc->cv.chrs);
     362         112 :         pcc->cv.chrs = NULL;
     363         112 :         pcc->cv.chrspace = 0;
     364             :     }
     365         166 :     else if (pcc->cv.nchrs < pcc->cv.chrspace)
     366             :     {
     367         166 :         newchrs = (chr *) realloc(pcc->cv.chrs,
     368         166 :                                   pcc->cv.nchrs * sizeof(chr));
     369         166 :         if (newchrs == NULL)
     370           0 :             goto out_of_memory;
     371         166 :         pcc->cv.chrs = newchrs;
     372         166 :         pcc->cv.chrspace = pcc->cv.nchrs;
     373             :     }
     374         278 :     if (pcc->cv.nranges == 0)
     375             :     {
     376           0 :         free(pcc->cv.ranges);
     377           0 :         pcc->cv.ranges = NULL;
     378           0 :         pcc->cv.rangespace = 0;
     379             :     }
     380         278 :     else if (pcc->cv.nranges < pcc->cv.rangespace)
     381             :     {
     382         278 :         newchrs = (chr *) realloc(pcc->cv.ranges,
     383         278 :                                   pcc->cv.nranges * sizeof(chr) * 2);
     384         278 :         if (newchrs == NULL)
     385           0 :             goto out_of_memory;
     386         278 :         pcc->cv.ranges = newchrs;
     387         278 :         pcc->cv.rangespace = pcc->cv.nranges;
     388             :     }
     389             : 
     390             :     /*
     391             :      * Success, link it into cache chain
     392             :      */
     393         278 :     pcc->next = pg_ctype_cache_list;
     394         278 :     pg_ctype_cache_list = pcc;
     395             : 
     396         278 :     return &pcc->cv;
     397             : 
     398             :     /*
     399             :      * Failure, clean up
     400             :      */
     401           0 : out_of_memory:
     402           0 :     free(pcc->cv.chrs);
     403           0 :     free(pcc->cv.ranges);
     404           0 :     free(pcc);
     405             : 
     406           0 :     return NULL;
     407             : }

Generated by: LCOV version 1.16