LCOV - code coverage report
Current view: top level - src/backend/regex - regc_pg_locale.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 201 295 68.1 %
Date: 2025-04-24 12:15:10 Functions: 15 15 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regc_pg_locale.c
       4             :  *    ctype functions adapted to work on pg_wchar (a/k/a chr),
       5             :  *    and functions to cache the results of wholesale ctype probing.
       6             :  *
       7             :  * This file is #included by regcomp.c; it's not meant to compile standalone.
       8             :  *
       9             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      10             :  * Portions Copyright (c) 1994, Regents of the University of California
      11             :  *
      12             :  * IDENTIFICATION
      13             :  *    src/backend/regex/regc_pg_locale.c
      14             :  *
      15             :  *-------------------------------------------------------------------------
      16             :  */
      17             : 
      18             : #include "catalog/pg_collation.h"
      19             : #include "common/unicode_case.h"
      20             : #include "common/unicode_category.h"
      21             : #include "utils/pg_locale.h"
      22             : 
      23             : /*
      24             :  * For the libc provider, to provide as much functionality as possible on a
      25             :  * variety of platforms without going so far as to implement everything from
      26             :  * scratch, we use several implementation strategies depending on the
      27             :  * situation:
      28             :  *
      29             :  * 1. In C/POSIX collations, we use hard-wired code.  We can't depend on
      30             :  * the <ctype.h> functions since those will obey LC_CTYPE.  Note that these
      31             :  * collations don't give a fig about multibyte characters.
      32             :  *
      33             :  * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
      34             :  * This assumes that every platform uses Unicode codepoints directly
      35             :  * as the wchar_t representation of Unicode.  (XXX: ICU makes this assumption
      36             :  * even for non-UTF8 encodings, which may be a problem.)  On some platforms
      37             :  * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
      38             :  *
      39             :  * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
      40             :  * values up to 255, and punt for values above that.  This is 100% correct
      41             :  * only in single-byte encodings such as LATINn.  However, non-Unicode
      42             :  * multibyte encodings are mostly Far Eastern character sets for which the
      43             :  * properties being tested here aren't very relevant for higher code values
      44             :  * anyway.  The difficulty with using the <wctype.h> functions with
      45             :  * non-Unicode multibyte encodings is that we can have no certainty that
      46             :  * the platform's wchar_t representation matches what we do in pg_wchar
      47             :  * conversions.
      48             :  *
      49             :  * As a special case, in the "default" collation, (2) and (3) force ASCII
      50             :  * letters to follow ASCII upcase/downcase rules, while in a non-default
      51             :  * collation we just let the library functions do what they will.  The case
      52             :  * where this matters is treatment of I/i in Turkish, and the behavior is
      53             :  * meant to match the upper()/lower() SQL functions.
      54             :  *
      55             :  * We store the active collation setting in static variables.  In principle
      56             :  * it could be passed down to here via the regex library's "struct vars" data
      57             :  * structure; but that would require somewhat invasive changes in the regex
      58             :  * library, and right now there's no real benefit to be gained from that.
      59             :  *
      60             :  * NB: the coding here assumes pg_wchar is an unsigned type.
      61             :  */
      62             : /* Implementation the pg_wc_* functions dispatch to; chosen by pg_set_regex_collation(). */
      63             : typedef enum
      64             : {
      65             :     PG_REGEX_STRATEGY_C,        /* C locale: hard-wired ASCII rules (encoding independent) */
      66             :     PG_REGEX_STRATEGY_BUILTIN,  /* built-in Unicode semantics (pg_u_* functions) */
      67             :     PG_REGEX_STRATEGY_LIBC_WIDE,    /* Use locale_t <wctype.h> functions (libc provider, UTF8 encoding) */
      68             :     PG_REGEX_STRATEGY_LIBC_1BYTE,   /* Use locale_t <ctype.h> functions (libc provider, other encodings) */
      69             :     PG_REGEX_STRATEGY_ICU,      /* Use ICU uchar.h functions (selected only when built with USE_ICU) */
      70             : } PG_Locale_Strategy;
      71             : /* Written by pg_set_regex_collation(); the locale is stored as 0 whenever the strategy is PG_REGEX_STRATEGY_C. */
      72             : static PG_Locale_Strategy pg_regex_strategy;
      73             : static pg_locale_t pg_regex_locale;
      74             : 
      75             : /*
      76             :  * Hard-wired character properties for C locale: bit flags stored in pg_char_properties[]
      77             :  */
      78             : #define PG_ISDIGIT  0x01    /* decimal digit */
      79             : #define PG_ISALPHA  0x02    /* letter */
      80             : #define PG_ISALNUM  (PG_ISDIGIT | PG_ISALPHA)   /* mask, not a bit of its own: matches either flag */
      81             : #define PG_ISUPPER  0x04    /* upper-case letter */
      82             : #define PG_ISLOWER  0x08    /* lower-case letter */
      83             : #define PG_ISGRAPH  0x10    /* graphic character */
      84             : #define PG_ISPRINT  0x20    /* printable character */
      85             : #define PG_ISPUNCT  0x40    /* punctuation */
      86             : #define PG_ISSPACE  0x80    /* white space */
      87             : /* C-locale property flags (PG_IS* bits) for each 7-bit ASCII code; index only with values 0..127. */
      88             : static const unsigned char pg_char_properties[128] = {
      89             :      /* NUL */ 0,
      90             :      /* ^A */ 0,
      91             :      /* ^B */ 0,
      92             :      /* ^C */ 0,
      93             :      /* ^D */ 0,
      94             :      /* ^E */ 0,
      95             :      /* ^F */ 0,
      96             :      /* ^G */ 0,
      97             :      /* ^H */ 0,
      98             :      /* ^I */ PG_ISSPACE,
      99             :      /* ^J */ PG_ISSPACE,
     100             :      /* ^K */ PG_ISSPACE,
     101             :      /* ^L */ PG_ISSPACE,
     102             :      /* ^M */ PG_ISSPACE,
     103             :      /* ^N */ 0,
     104             :      /* ^O */ 0,
     105             :      /* ^P */ 0,
     106             :      /* ^Q */ 0,
     107             :      /* ^R */ 0,
     108             :      /* ^S */ 0,
     109             :      /* ^T */ 0,
     110             :      /* ^U */ 0,
     111             :      /* ^V */ 0,
     112             :      /* ^W */ 0,
     113             :      /* ^X */ 0,
     114             :      /* ^Y */ 0,
     115             :      /* ^Z */ 0,
     116             :      /* ^[ */ 0,
     117             :      /* ^\ */ 0,
     118             :      /* ^] */ 0,
     119             :      /* ^^ */ 0,
     120             :      /* ^_ */ 0,
     121             :      /* SP (space): printable and white space, but not graphic */ PG_ISPRINT | PG_ISSPACE,
     122             :      /* !  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     123             :      /* "  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     124             :      /* #  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     125             :      /* $  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     126             :      /* %  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     127             :      /* &  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     128             :      /* '  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     129             :      /* (  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     130             :      /* )  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     131             :      /* *  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     132             :      /* +  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     133             :      /* ,  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     134             :      /* -  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     135             :      /* .  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     136             :      /* /  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     137             :      /* 0  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     138             :      /* 1  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     139             :      /* 2  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     140             :      /* 3  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     141             :      /* 4  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     142             :      /* 5  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     143             :      /* 6  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     144             :      /* 7  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     145             :      /* 8  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     146             :      /* 9  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
     147             :      /* :  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     148             :      /* ;  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     149             :      /* <  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     150             :      /* =  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     151             :      /* >  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     152             :      /* ?  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     153             :      /* @  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     154             :      /* A  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     155             :      /* B  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     156             :      /* C  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     157             :      /* D  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     158             :      /* E  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     159             :      /* F  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     160             :      /* G  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     161             :      /* H  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     162             :      /* I  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     163             :      /* J  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     164             :      /* K  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     165             :      /* L  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     166             :      /* M  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     167             :      /* N  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     168             :      /* O  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     169             :      /* P  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     170             :      /* Q  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     171             :      /* R  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     172             :      /* S  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     173             :      /* T  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     174             :      /* U  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     175             :      /* V  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     176             :      /* W  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     177             :      /* X  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     178             :      /* Y  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     179             :      /* Z  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
     180             :      /* [  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     181             :      /* \  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     182             :      /* ]  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     183             :      /* ^  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     184             :      /* _  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     185             :      /* `  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     186             :      /* a  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     187             :      /* b  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     188             :      /* c  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     189             :      /* d  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     190             :      /* e  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     191             :      /* f  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     192             :      /* g  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     193             :      /* h  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     194             :      /* i  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     195             :      /* j  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     196             :      /* k  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     197             :      /* l  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     198             :      /* m  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     199             :      /* n  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     200             :      /* o  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     201             :      /* p  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     202             :      /* q  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     203             :      /* r  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     204             :      /* s  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     205             :      /* t  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     206             :      /* u  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     207             :      /* v  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     208             :      /* w  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     209             :      /* x  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     210             :      /* y  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     211             :      /* z  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
     212             :      /* {  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     213             :      /* |  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     214             :      /* }  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     215             :      /* ~  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
     216             :      /* DEL */ 0
     217             : };
     218             : 
     219             : 
     220             : /*
     221             :  * pg_set_regex_collation: set collation for these functions to obey
     222             :  *
     223             :  * Called when beginning compilation or execution of a regexp.  Raises ERROR for
     224             :  * an invalid or nondeterministic collation; otherwise records the strategy and
     225             :  * locale in static variables, which is okay since regexp operations need no reentrancy.
     226             :  */
     227             : void
     228     7957608 : pg_set_regex_collation(Oid collation)
     229             : {
     230     7957608 :     pg_locale_t locale = 0;     /* remains 0 when the C strategy is chosen */
     231             :     PG_Locale_Strategy strategy;
     232             : 
     233     7957608 :     if (!OidIsValid(collation))
     234             :     {
     235             :         /*
     236             :          * This typically means that the parser could not resolve a conflict
     237             :          * of implicit collations, so report it that way.
     238             :          */
     239           0 :         ereport(ERROR,
     240             :                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
     241             :                  errmsg("could not determine which collation to use for regular expression"),
     242             :                  errhint("Use the COLLATE clause to set the collation explicitly.")));
     243             :     }
     244             : 
     245     7957608 :     if (collation == C_COLLATION_OID)
     246             :     {
     247             :         /*
     248             :          * Some callers expect regexes to work for C_COLLATION_OID before
     249             :          * catalog access is available, so we can't call
     250             :          * pg_newlocale_from_collation().
     251             :          */
     252      127846 :         strategy = PG_REGEX_STRATEGY_C;
     253      127846 :         locale = 0;
     254             :     }
     255             :     else
     256             :     {
     257     7829762 :         locale = pg_newlocale_from_collation(collation);
     258             : 
     259     7829762 :         if (!locale->deterministic)     /* rejected before any strategy is chosen */
     260          24 :             ereport(ERROR,
     261             :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     262             :                      errmsg("nondeterministic collations are not supported for regular expressions")));
     263             : 
     264     7829738 :         if (locale->ctype_is_c)
     265             :         {
     266             :             /*
     267             :              * C/POSIX collations use this path regardless of database
     268             :              * encoding
     269             :              */
     270         284 :             strategy = PG_REGEX_STRATEGY_C;
     271         284 :             locale = 0;         /* the C strategy is always stored with a zero locale */
     272             :         }
     273     7829454 :         else if (locale->provider == COLLPROVIDER_BUILTIN)
     274             :         {
     275             :             Assert(GetDatabaseEncoding() == PG_UTF8);
     276     2185254 :             strategy = PG_REGEX_STRATEGY_BUILTIN;
     277             :         }
     278             : #ifdef USE_ICU
     279     5644200 :         else if (locale->provider == COLLPROVIDER_ICU)
     280             :         {
     281         942 :             strategy = PG_REGEX_STRATEGY_ICU;
     282             :         }
     283             : #endif
     284             :         else
     285             :         {
     286             :             Assert(locale->provider == COLLPROVIDER_LIBC);
     287     5643258 :             if (GetDatabaseEncoding() == PG_UTF8)   /* UTF8 gets <wctype.h>, anything else <ctype.h> */
     288     5643254 :                 strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
     289             :             else
     290           4 :                 strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
     291             :         }
     292             :     }
     293             : 
     294     7957584 :     pg_regex_strategy = strategy;   /* publish the choice for the pg_wc_* functions */
     295     7957584 :     pg_regex_locale = locale;
     296     7957584 : }
     297             : /* pg_wc_isdigit: nonzero if c is a digit under the strategy set by pg_set_regex_collation(), else 0 */
     298             : static int
     299      186918 : pg_wc_isdigit(pg_wchar c)
     300             : {
     301      186918 :     switch (pg_regex_strategy)
     302             :     {
     303        2130 :         case PG_REGEX_STRATEGY_C:
     304        4260 :             return (c <= (pg_wchar) 127 &&  /* the property table covers ASCII only */
     305        2130 :                     (pg_char_properties[c] & PG_ISDIGIT));
     306       65660 :         case PG_REGEX_STRATEGY_BUILTIN:
     307       65660 :             return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full);    /* NOTE(review): 2nd arg presumably selects POSIX-style classification; confirm in unicode_category.h */
     308      106840 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     309             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)     /* only pass c along if wchar_t can hold it */
     310      106840 :                 return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
     311             :             /* FALL THRU: c > 0xFFFF here, so the c <= UCHAR_MAX test below is expected to fail and yield 0 */
     312             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     313           0 :             return (c <= (pg_wchar) UCHAR_MAX &&
     314           0 :                     isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
     315             :             break;              /* not reached: follows an unconditional return */
     316       12288 :         case PG_REGEX_STRATEGY_ICU:
     317             : #ifdef USE_ICU
     318       12288 :             return u_isdigit(c);
     319             : #endif
     320             :             break;              /* live only in non-ICU builds, where pg_set_regex_collation() never selects this strategy */
     321             :     }
     322           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     323             : }
     324             : /* pg_wc_isalpha: nonzero if c is alphabetic under the strategy set by pg_set_regex_collation(), else 0 */
     325             : static int
     326       17430 : pg_wc_isalpha(pg_wchar c)
     327             : {
     328       17430 :     switch (pg_regex_strategy)
     329             :     {
     330         768 :         case PG_REGEX_STRATEGY_C:
     331        1536 :             return (c <= (pg_wchar) 127 &&  /* the property table covers ASCII only */
     332         768 :                     (pg_char_properties[c] & PG_ISALPHA));
     333          22 :         case PG_REGEX_STRATEGY_BUILTIN:
     334          22 :             return pg_u_isalpha(c);
     335        4352 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     336             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)     /* only pass c along if wchar_t can hold it */
     337        4352 :                 return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
     338             :             /* FALL THRU: c > 0xFFFF here, so the c <= UCHAR_MAX test below is expected to fail and yield 0 */
     339             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     340           0 :             return (c <= (pg_wchar) UCHAR_MAX &&
     341           0 :                     isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
     342             :             break;              /* not reached: follows an unconditional return */
     343       12288 :         case PG_REGEX_STRATEGY_ICU:
     344             : #ifdef USE_ICU
     345       12288 :             return u_isalpha(c);
     346             : #endif
     347             :             break;              /* live only in non-ICU builds, where pg_set_regex_collation() never selects this strategy */
     348             :     }
     349           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     350             : }
     351             : /* pg_wc_isalnum: nonzero if c is a letter or digit under the strategy set by pg_set_regex_collation(), else 0 */
     352             : static int
     353       82698 : pg_wc_isalnum(pg_wchar c)
     354             : {
     355       82698 :     switch (pg_regex_strategy)
     356             :     {
     357         762 :         case PG_REGEX_STRATEGY_C:
     358        1524 :             return (c <= (pg_wchar) 127 &&  /* the property table covers ASCII only */
     359         762 :                     (pg_char_properties[c] & PG_ISALNUM));  /* PG_ISALNUM is a two-bit mask: digit or alpha */
     360       32764 :         case PG_REGEX_STRATEGY_BUILTIN:
     361       32764 :             return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full);    /* NOTE(review): 2nd arg presumably selects POSIX-style classification; confirm in unicode_category.h */
     362       36884 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     363             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)     /* only pass c along if wchar_t can hold it */
     364       36884 :                 return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
     365             :             /* FALL THRU: c > 0xFFFF here, so the c <= UCHAR_MAX test below is expected to fail and yield 0 */
     366             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     367           0 :             return (c <= (pg_wchar) UCHAR_MAX &&
     368           0 :                     isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
     369             :             break;              /* not reached: follows an unconditional return */
     370       12288 :         case PG_REGEX_STRATEGY_ICU:
     371             : #ifdef USE_ICU
     372       12288 :             return u_isalnum(c);
     373             : #endif
     374             :             break;              /* live only in non-ICU builds, where pg_set_regex_collation() never selects this strategy */
     375             :     }
     376           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     377             : }
     378             : /* pg_wc_isword: returns 1 for underscore, otherwise whatever pg_wc_isalnum() returns for c */
     379             : static int
     380       37634 : pg_wc_isword(pg_wchar c)
     381             : {
     382             :     /* We define word characters as alnum class plus underscore */
     383       37634 :     if (c == CHR('_'))
     384          24 :         return 1;
     385       37610 :     return pg_wc_isalnum(c);    /* every other character defers to the alnum test */
     386             : }
     387             : /* pg_wc_isupper: nonzero if c is an upper-case letter under the strategy set by pg_set_regex_collation(), else 0 */
     388             : static int
     389       40976 : pg_wc_isupper(pg_wchar c)
     390             : {
     391       40976 :     switch (pg_regex_strategy)
     392             :     {
     393           0 :         case PG_REGEX_STRATEGY_C:
     394           0 :             return (c <= (pg_wchar) 127 &&  /* the property table covers ASCII only */
     395           0 :                     (pg_char_properties[c] & PG_ISUPPER));
     396       24576 :         case PG_REGEX_STRATEGY_BUILTIN:
     397       24576 :             return pg_u_isupper(c);
     398        4112 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     399             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)     /* only pass c along if wchar_t can hold it */
     400        4112 :                 return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
     401             :             /* FALL THRU: c > 0xFFFF here, so the c <= UCHAR_MAX test below is expected to fail and yield 0 */
     402             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     403           0 :             return (c <= (pg_wchar) UCHAR_MAX &&
     404           0 :                     isupper_l((unsigned char) c, pg_regex_locale->info.lt));
     405             :             break;              /* not reached: follows an unconditional return */
     406       12288 :         case PG_REGEX_STRATEGY_ICU:
     407             : #ifdef USE_ICU
     408       12288 :             return u_isupper(c);
     409             : #endif
     410             :             break;              /* live only in non-ICU builds, where pg_set_regex_collation() never selects this strategy */
     411             :     }
     412           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     413             : }
     414             : /* pg_wc_islower: nonzero if c is a lower-case letter under the strategy set by pg_set_regex_collation(), else 0 */
     415             : static int
     416       16390 : pg_wc_islower(pg_wchar c)
     417             : {
     418       16390 :     switch (pg_regex_strategy)
     419             :     {
     420           0 :         case PG_REGEX_STRATEGY_C:
     421           0 :             return (c <= (pg_wchar) 127 &&  /* the property table covers ASCII only */
     422           0 :                     (pg_char_properties[c] & PG_ISLOWER));
     423           0 :         case PG_REGEX_STRATEGY_BUILTIN:
     424           0 :             return pg_u_islower(c);
     425        4102 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     426             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)     /* only pass c along if wchar_t can hold it */
     427        4102 :                 return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
     428             :             /* FALL THRU: c > 0xFFFF here, so the c <= UCHAR_MAX test below is expected to fail and yield 0 */
     429             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     430           0 :             return (c <= (pg_wchar) UCHAR_MAX &&
     431           0 :                     islower_l((unsigned char) c, pg_regex_locale->info.lt));
     432             :             break;              /* not reached: follows an unconditional return */
     433       12288 :         case PG_REGEX_STRATEGY_ICU:
     434             : #ifdef USE_ICU
     435       12288 :             return u_islower(c);
     436             : #endif
     437             :             break;              /* live only in non-ICU builds, where pg_set_regex_collation() never selects this strategy */
     438             :     }
     439           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     440             : }
     441             : /* pg_wc_isgraph: nonzero if c is a graphic character under the strategy set by pg_set_regex_collation(), else 0 */
     442             : static int
     443       16390 : pg_wc_isgraph(pg_wchar c)
     444             : {
     445       16390 :     switch (pg_regex_strategy)
     446             :     {
     447           0 :         case PG_REGEX_STRATEGY_C:
     448           0 :             return (c <= (pg_wchar) 127 &&  /* the property table covers ASCII only */
     449           0 :                     (pg_char_properties[c] & PG_ISGRAPH));
     450           0 :         case PG_REGEX_STRATEGY_BUILTIN:
     451           0 :             return pg_u_isgraph(c);
     452        4102 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     453             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)     /* only pass c along if wchar_t can hold it */
     454        4102 :                 return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
     455             :             /* FALL THRU: c > 0xFFFF here, so the c <= UCHAR_MAX test below is expected to fail and yield 0 */
     456             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     457           0 :             return (c <= (pg_wchar) UCHAR_MAX &&
     458           0 :                     isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
     459             :             break;              /* not reached: follows an unconditional return */
     460       12288 :         case PG_REGEX_STRATEGY_ICU:
     461             : #ifdef USE_ICU
     462       12288 :             return u_isgraph(c);
     463             : #endif
     464             :             break;              /* live only in non-ICU builds, where pg_set_regex_collation() never selects this strategy */
     465             :     }
     466           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     467             : }
     468             : /* pg_wc_isprint: nonzero if c is printable under the strategy set by pg_set_regex_collation(), else 0 */
     469             : static int
     470       16390 : pg_wc_isprint(pg_wchar c)
     471             : {
     472       16390 :     switch (pg_regex_strategy)
     473             :     {
     474           0 :         case PG_REGEX_STRATEGY_C:
     475           0 :             return (c <= (pg_wchar) 127 &&  /* the property table covers ASCII only */
     476           0 :                     (pg_char_properties[c] & PG_ISPRINT));
     477           0 :         case PG_REGEX_STRATEGY_BUILTIN:
     478           0 :             return pg_u_isprint(c);
     479        4102 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     480             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)     /* only pass c along if wchar_t can hold it */
     481        4102 :                 return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
     482             :             /* FALL THRU: c > 0xFFFF here, so the c <= UCHAR_MAX test below is expected to fail and yield 0 */
     483             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     484           0 :             return (c <= (pg_wchar) UCHAR_MAX &&
     485           0 :                     isprint_l((unsigned char) c, pg_regex_locale->info.lt));
     486             :             break;              /* not reached: follows an unconditional return */
     487       12288 :         case PG_REGEX_STRATEGY_ICU:
     488             : #ifdef USE_ICU
     489       12288 :             return u_isprint(c);
     490             : #endif
     491             :             break;              /* live only in non-ICU builds, where pg_set_regex_collation() never selects this strategy */
     492             :     }
     493           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     494             : }
     495             : /* pg_wc_ispunct: nonzero if c is punctuation under the strategy set by pg_set_regex_collation(), else 0 */
     496             : static int
     497       40966 : pg_wc_ispunct(pg_wchar c)
     498             : {
     499       40966 :     switch (pg_regex_strategy)
     500             :     {
     501           0 :         case PG_REGEX_STRATEGY_C:
     502           0 :             return (c <= (pg_wchar) 127 &&  /* the property table covers ASCII only */
     503           0 :                     (pg_char_properties[c] & PG_ISPUNCT));
     504       24576 :         case PG_REGEX_STRATEGY_BUILTIN:
     505       24576 :             return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full);    /* NOTE(review): 2nd arg presumably selects POSIX-style classification; confirm in unicode_category.h */
     506        4102 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     507             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)     /* only pass c along if wchar_t can hold it */
     508        4102 :                 return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
     509             :             /* FALL THRU: c > 0xFFFF here, so the c <= UCHAR_MAX test below is expected to fail and yield 0 */
     510             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     511           0 :             return (c <= (pg_wchar) UCHAR_MAX &&
     512           0 :                     ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
     513             :             break;              /* not reached: follows an unconditional return */
     514       12288 :         case PG_REGEX_STRATEGY_ICU:
     515             : #ifdef USE_ICU
     516       12288 :             return u_ispunct(c);
     517             : #endif
     518             :             break;              /* live only in non-ICU builds, where pg_set_regex_collation() never selects this strategy */
     519             :     }
     520           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     521             : }
     522             : /* pg_wc_isspace: nonzero if c is white space under the strategy set by pg_set_regex_collation(), else 0 */
     523             : static int
     524       76386 : pg_wc_isspace(pg_wchar c)
     525             : {
     526       76386 :     switch (pg_regex_strategy)
     527             :     {
     528           0 :         case PG_REGEX_STRATEGY_C:
     529           0 :             return (c <= (pg_wchar) 127 &&  /* the property table covers ASCII only */
     530           0 :                     (pg_char_properties[c] & PG_ISSPACE));
     531       16398 :         case PG_REGEX_STRATEGY_BUILTIN:
     532       16398 :             return pg_u_isspace(c);
     533       47700 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     534             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)     /* only pass c along if wchar_t can hold it */
     535       47700 :                 return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
     536             :             /* FALL THRU: c > 0xFFFF here, so the c <= UCHAR_MAX test below is expected to fail and yield 0 */
     537             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     538           0 :             return (c <= (pg_wchar) UCHAR_MAX &&
     539           0 :                     isspace_l((unsigned char) c, pg_regex_locale->info.lt));
     540             :             break;              /* not reached: follows an unconditional return */
     541       12288 :         case PG_REGEX_STRATEGY_ICU:
     542             : #ifdef USE_ICU
     543       12288 :             return u_isspace(c);
     544             : #endif
     545             :             break;              /* live only in non-ICU builds, where pg_set_regex_collation() never selects this strategy */
     546             :     }
     547           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     548             : }
     549             : 
     550             : static pg_wchar
     551       10702 : pg_wc_toupper(pg_wchar c)
     552             : {
                           /*
                            * Map a single chr to upper case according to the current
                            * pg_regex_strategy.  Codes that the selected method cannot
                            * handle are returned unchanged.
                            */
     553       10702 :     switch (pg_regex_strategy)
     554             :     {
     555         978 :         case PG_REGEX_STRATEGY_C:
                                   /* ASCII-only mapping; codes above 127 pass through untouched */
     556         978 :             if (c <= (pg_wchar) 127)
     557         978 :                 return pg_ascii_toupper((unsigned char) c);
     558           0 :             return c;
     559         528 :         case PG_REGEX_STRATEGY_BUILTIN:
     560         528 :             return unicode_uppercase_simple(c);
     561        9088 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     562             :             /* force C behavior for ASCII characters, per comments above */
     563        9088 :             if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
     564         892 :                 return pg_ascii_toupper((unsigned char) c);
                                   /*
                                    * The wide-character call is used only when wchar_t has at
                                    * least 4 bytes or c fits in 16 bits; otherwise continue into
                                    * the 1-byte case below.
                                    */
     565             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
     566        8196 :                 return towupper_l((wint_t) c, pg_regex_locale->info.lt);
     567             :             /* FALL THRU */
     568             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     569             :             /* force C behavior for ASCII characters, per comments above */
     570           0 :             if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
     571           0 :                 return pg_ascii_toupper((unsigned char) c);
                                   /* the 1-byte API covers 0..UCHAR_MAX only; larger codes pass through */
     572           0 :             if (c <= (pg_wchar) UCHAR_MAX)
     573           0 :                 return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
     574           0 :             return c;
     575         108 :         case PG_REGEX_STRATEGY_ICU:
                                   /* the ICU call exists only under USE_ICU; otherwise this case just breaks */
     576             : #ifdef USE_ICU
     577         108 :             return u_toupper(c);
     578             : #endif
     579             :             break;
     580             :     }
                           /*
                            * Reached only by leaving the switch: the ICU case in a build
                            * without USE_ICU, or a strategy value with no case label.  Note
                            * that this path returns 0 rather than c.
                            */
     581           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     582             : }
     583             : 
     584             : static pg_wchar
     585       10706 : pg_wc_tolower(pg_wchar c)
     586             : {
                           /*
                            * Map a single chr to lower case according to the current
                            * pg_regex_strategy.  Codes that the selected method cannot
                            * handle are returned unchanged.
                            */
     587       10706 :     switch (pg_regex_strategy)
     588             :     {
     589         978 :         case PG_REGEX_STRATEGY_C:
                                   /* ASCII-only mapping; codes above 127 pass through untouched */
     590         978 :             if (c <= (pg_wchar) 127)
     591         978 :                 return pg_ascii_tolower((unsigned char) c);
     592           0 :             return c;
     593         528 :         case PG_REGEX_STRATEGY_BUILTIN:
     594         528 :             return unicode_lowercase_simple(c);
     595        9092 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     596             :             /* force C behavior for ASCII characters, per comments above */
     597        9092 :             if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
     598         896 :                 return pg_ascii_tolower((unsigned char) c);
                                   /*
                                    * The wide-character call is used only when wchar_t has at
                                    * least 4 bytes or c fits in 16 bits; otherwise continue into
                                    * the 1-byte case below.
                                    */
     599             :             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
     600        8196 :                 return towlower_l((wint_t) c, pg_regex_locale->info.lt);
     601             :             /* FALL THRU */
     602             :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     603             :             /* force C behavior for ASCII characters, per comments above */
     604           0 :             if (pg_regex_locale->is_default && c <= (pg_wchar) 127)
     605           0 :                 return pg_ascii_tolower((unsigned char) c);
                                   /* the 1-byte API covers 0..UCHAR_MAX only; larger codes pass through */
     606           0 :             if (c <= (pg_wchar) UCHAR_MAX)
     607           0 :                 return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
     608           0 :             return c;
     609         108 :         case PG_REGEX_STRATEGY_ICU:
                                   /* the ICU call exists only under USE_ICU; otherwise this case just breaks */
     610             : #ifdef USE_ICU
     611         108 :             return u_tolower(c);
     612             : #endif
     613             :             break;
     614             :     }
                           /*
                            * Reached only by leaving the switch: the ICU case in a build
                            * without USE_ICU, or a strategy value with no case label.  Note
                            * that this path returns 0 rather than c.
                            */
     615           0 :     return 0;                   /* can't get here, but keep compiler quiet */
     616             : }
     617             : 
     618             : 
     619             : /*
     620             :  * These functions cache the results of probing libc's ctype behavior for
     621             :  * all character codes of interest in a given encoding/collation.  The
     622             :  * result is provided as a "struct cvec", but notice that the representation
     623             :  * is a touch different from a cvec created by regc_cvec.c: we allocate the
     624             :  * chrs[] and ranges[] arrays separately from the struct so that we can
     625             :  * realloc them larger at need.  This is okay since the cvecs made here
     626             :  * should never be freed by freecvec().
     627             :  *
     628             :  * We use malloc not palloc since we mustn't lose control on out-of-memory;
     629             :  * the main regex code expects us to return a failure indication instead.
     630             :  */
     631             : 
     632             : typedef int (*pg_wc_probefunc) (pg_wchar c);    /* per-chr predicate; pg_ctype_get_cache treats nonzero as a match */
     633             : 
     634             : typedef struct pg_ctype_cache
     635             : {
                           /*
                            * One cached probe result.  pg_ctype_get_cache looks entries up
                            * by the (probefunc, locale) pair, comparing both as pointers.
                            */
     636             :     pg_wc_probefunc probefunc;  /* pg_wc_isalpha or a sibling */
     637             :     pg_locale_t locale;         /* locale this entry is for */
     638             :     struct cvec cv;             /* cache entry contents: matching chrs and ranges */
     639             :     struct pg_ctype_cache *next;    /* chain link; NULL ends the list */
     640             : } pg_ctype_cache;
     641             : 
     642             : static pg_ctype_cache *pg_ctype_cache_list = NULL;  /* head of the cache chain; new entries are pushed at the front */
     643             : 
     644             : /*
     645             :  * Add a chr or range to pcc->cv; return false if run out of memory
                        *
                        * chr1 is the first matching code and nchrs the length of the run.
                        * A run longer than one code is stored as the pair
                        * (chr1, chr1 + nchrs - 1) in cv.ranges; a run of exactly one code is
                        * appended to cv.chrs.  Whichever array is full is doubled with
                        * realloc first.
                        *
                        * On realloc failure the old array is still attached to pcc (so the
                        * caller can free it), but the corresponding space counter has already
                        * been doubled; the entry must not be used further.
     646             :  */
     647             : static bool
     648       11226 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
     649             : {
     650             :     chr        *newchrs;
     651             : 
     652       11226 :     if (nchrs > 1)
     653             :     {
                               /* grow the ranges array if needed; each range takes two chr slots */
     654        3320 :         if (pcc->cv.nranges >= pcc->cv.rangespace)
     655             :         {
     656           0 :             pcc->cv.rangespace *= 2;
     657           0 :             newchrs = (chr *) realloc(pcc->cv.ranges,
     658           0 :                                       pcc->cv.rangespace * sizeof(chr) * 2);
     659           0 :             if (newchrs == NULL)
     660           0 :                 return false;
     661           0 :             pcc->cv.ranges = newchrs;
     662             :         }
                               /* store the inclusive low and high ends of the range */
     663        3320 :         pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
     664        3320 :         pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
     665        3320 :         pcc->cv.nranges++;
     666             :     }
     667             :     else
     668             :     {
     669             :         assert(nchrs == 1);
                               /* grow the single-chr array if needed */
     670        7906 :         if (pcc->cv.nchrs >= pcc->cv.chrspace)
     671             :         {
     672          28 :             pcc->cv.chrspace *= 2;
     673          28 :             newchrs = (chr *) realloc(pcc->cv.chrs,
     674          28 :                                       pcc->cv.chrspace * sizeof(chr));
     675          28 :             if (newchrs == NULL)
     676           0 :                 return false;
     677          28 :             pcc->cv.chrs = newchrs;
     678             :         }
     679        7906 :         pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
     680             :     }
     681       11226 :     return true;
     682             : }
     683             : 
     684             : /*
     685             :  * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
     686             :  * chrs satisfying the probe function.  The active collation is the one
     687             :  * previously set by pg_set_regex_collation.  Return NULL if out of memory.
     688             :  *
     689             :  * Note that the result must not be freed or modified by caller.
                        *
                        * Cached entries are matched on the probe function pointer and the
                        * pg_regex_locale pointer only; cclasscode is not part of the lookup.
                        * For a new entry, cclasscode is stored in the cvec, except that it is
                        * replaced by -1 when the scan limit comes from the strategy rather
                        * than from MAX_SIMPLE_CHR (see the switch below).
     690             :  */
     691             : static struct cvec *
     692         836 : pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
     693             : {
     694             :     pg_ctype_cache *pcc;
     695             :     pg_wchar    max_chr;
     696             :     pg_wchar    cur_chr;
     697             :     int         nmatches;
     698             :     chr        *newchrs;
     699             : 
     700             :     /*
     701             :      * Do we already have the answer cached?
     702             :      */
     703        1968 :     for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
     704             :     {
     705        1702 :         if (pcc->probefunc == probefunc &&
     706         642 :             pcc->locale == pg_regex_locale)
     707         570 :             return &pcc->cv;
     708             :     }
     709             : 
     710             :     /*
     711             :      * Nope, so initialize some workspace ...
     712             :      */
     713         266 :     pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
     714         266 :     if (pcc == NULL)
     715           0 :         return NULL;
     716         266 :     pcc->probefunc = probefunc;
     717         266 :     pcc->locale = pg_regex_locale;
     718         266 :     pcc->cv.nchrs = 0;
     719         266 :     pcc->cv.chrspace = 128;
     720         266 :     pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
     721         266 :     pcc->cv.nranges = 0;
     722         266 :     pcc->cv.rangespace = 64;
     723         266 :     pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
                           /*
                            * Both array pointers are assigned before this check, so the
                            * cleanup code can free them unconditionally (free(NULL) is a
                            * no-op).
                            */
     724         266 :     if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
     725           0 :         goto out_of_memory;
     726         266 :     pcc->cv.cclasscode = cclasscode;
     727             : 
     728             :     /*
     729             :      * Decide how many character codes we ought to look through.  In general
     730             :      * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
     731             :      * runtime using the "high colormap" mechanism.  However, in C locale
     732             :      * there's no need to go further than 127, and if we only have a 1-byte
     733             :      * <ctype.h> API there's no need to go further than that can handle.
     734             :      *
     735             :      * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
     736             :      * output cvec as not having any locale-dependent behavior, since there
     737             :      * will be no need to do any run-time locale checks.  (The #if's here
     738             :      * would always be true for production values of MAX_SIMPLE_CHR, but it's
     739             :      * useful to allow it to be small for testing purposes.)
     740             :      */
     741         266 :     switch (pg_regex_strategy)
     742             :     {
     743          28 :         case PG_REGEX_STRATEGY_C:
     744             : #if MAX_SIMPLE_CHR >= 127
     745          28 :             max_chr = (pg_wchar) 127;
     746          28 :             pcc->cv.cclasscode = -1;
     747             : #else
     748             :             max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     749             : #endif
     750          28 :             break;
     751          80 :         case PG_REGEX_STRATEGY_BUILTIN:
     752          80 :             max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     753          80 :             break;
     754         104 :         case PG_REGEX_STRATEGY_LIBC_WIDE:
     755         104 :             max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     756         104 :             break;
     757           0 :         case PG_REGEX_STRATEGY_LIBC_1BYTE:
     758             : #if MAX_SIMPLE_CHR >= UCHAR_MAX
     759           0 :             max_chr = (pg_wchar) UCHAR_MAX;
     760           0 :             pcc->cv.cclasscode = -1;
     761             : #else
     762             :             max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     763             : #endif
     764           0 :             break;
     765          54 :         case PG_REGEX_STRATEGY_ICU:
     766          54 :             max_chr = (pg_wchar) MAX_SIMPLE_CHR;
     767          54 :             break;
     768           0 :         default:
     769             :             Assert(false);
     770           0 :             max_chr = 0;        /* can't get here, but keep compiler quiet */
     771           0 :             break;
     772             :     }
     773             : 
     774             :     /*
     775             :      * And scan 'em ...
                            *
                            * Consecutive matching codes are counted in nmatches and emitted
                            * as one entry when the run ends: a single chr for a run of one,
                            * a range otherwise (see store_match).
     776             :      */
     777         266 :     nmatches = 0;               /* number of consecutive matches */
     778             : 
     779      491274 :     for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
     780             :     {
     781      491008 :         if ((*probefunc) (cur_chr))
     782      123380 :             nmatches++;
     783      367628 :         else if (nmatches > 0)
     784             :         {
                                   /* the run that just ended began nmatches codes before cur_chr */
     785       11202 :             if (!store_match(pcc, cur_chr - nmatches, nmatches))
     786           0 :                 goto out_of_memory;
     787       11202 :             nmatches = 0;
     788             :         }
     789             :     }
     790             : 
                           /*
                            * Flush a run that extends through max_chr.  After the loop,
                            * cur_chr is one past the last code scanned, so the same start
                            * computation applies.
                            */
     791         266 :     if (nmatches > 0)
     792          24 :         if (!store_match(pcc, cur_chr - nmatches, nmatches))
     793           0 :             goto out_of_memory;
     794             : 
     795             :     /*
     796             :      * We might have allocated more memory than needed, if so free it
                            *
                            * An unused array is freed and its pointer set to NULL; a partly
                            * used one is shrunk to exactly the number of entries stored.
     797             :      */
     798         266 :     if (pcc->cv.nchrs == 0)
     799             :     {
     800         112 :         free(pcc->cv.chrs);
     801         112 :         pcc->cv.chrs = NULL;
     802         112 :         pcc->cv.chrspace = 0;
     803             :     }
     804         154 :     else if (pcc->cv.nchrs < pcc->cv.chrspace)
     805             :     {
     806         154 :         newchrs = (chr *) realloc(pcc->cv.chrs,
     807         154 :                                   pcc->cv.nchrs * sizeof(chr));
     808         154 :         if (newchrs == NULL)
     809           0 :             goto out_of_memory;
     810         154 :         pcc->cv.chrs = newchrs;
     811         154 :         pcc->cv.chrspace = pcc->cv.nchrs;
     812             :     }
     813         266 :     if (pcc->cv.nranges == 0)
     814             :     {
     815           0 :         free(pcc->cv.ranges);
     816           0 :         pcc->cv.ranges = NULL;
     817           0 :         pcc->cv.rangespace = 0;
     818             :     }
     819         266 :     else if (pcc->cv.nranges < pcc->cv.rangespace)
     820             :     {
     821         266 :         newchrs = (chr *) realloc(pcc->cv.ranges,
     822         266 :                                   pcc->cv.nranges * sizeof(chr) * 2);
     823         266 :         if (newchrs == NULL)
     824           0 :             goto out_of_memory;
     825         266 :         pcc->cv.ranges = newchrs;
     826         266 :         pcc->cv.rangespace = pcc->cv.nranges;
     827             :     }
     828             : 
     829             :     /*
     830             :      * Success, link it into cache chain
     831             :      */
     832         266 :     pcc->next = pg_ctype_cache_list;
     833         266 :     pg_ctype_cache_list = pcc;
     834             : 
     835         266 :     return &pcc->cv;
     836             : 
     837             :     /*
     838             :      * Failure, clean up
                            *
                            * The entry has not been linked into pg_ctype_cache_list at any
                            * point that jumps here, so freeing it leaves the cache unchanged.
     839             :      */
     840           0 : out_of_memory:
     841           0 :     free(pcc->cv.chrs);
     842           0 :     free(pcc->cv.ranges);
     843           0 :     free(pcc);
     844             : 
     845           0 :     return NULL;
     846             : }

Generated by: LCOV version 1.14