LCOV - code coverage report
Current view: top level - src/backend/utils/adt - regexp.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18beta1 Lines: 589 651 90.5 %
Date: 2025-06-07 07:18:12 Functions: 50 51 98.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regexp.c
       4             :  *    Postgres' interface to the regular expression package.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/adt/regexp.c
      12             :  *
      13             :  *      Alistair Crooks added the code for the regex caching
      14             :  *      agc - cached the regular expressions used - there's a good chance
      15             :  *      that we'll get a hit, so this saves a compile step for every
      16             :  *      attempted match. I haven't actually measured the speed improvement,
      17             :  *      but it `looks' a lot quicker visually when watching regression
      18             :  *      test output.
      19             :  *
      20             :  *      agc - incorporated Keith Bostic's Berkeley regex code into
      21             :  *      the tree for all ports. To distinguish this regex code from any that
      22             :  *      is existent on a platform, I've prepended the string "pg_" to
      23             :  *      the functions regcomp, regerror, regexec and regfree.
      24             :  *      Fixed a bug that was originally a typo by me, where `i' was used
      25             :  *      instead of `oldest' when compiling regular expressions - benign
      26             :  *      results mostly, although occasionally it bit you...
      27             :  *
      28             :  *-------------------------------------------------------------------------
      29             :  */
      30             : #include "postgres.h"
      31             : 
      32             : #include "catalog/pg_type.h"
      33             : #include "funcapi.h"
      34             : #include "regex/regex.h"
      35             : #include "utils/array.h"
      36             : #include "utils/builtins.h"
      37             : #include "utils/memutils.h"
      38             : #include "utils/varlena.h"
      39             : 
      40             : #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
      41             :     (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
      42             : 
      43             : 
      44             : /* all the options of interest for regex functions */
      45             : typedef struct pg_re_flags
      46             : {
      47             :     int         cflags;         /* compile flags for Spencer's regex code */
      48             :     bool        glob;           /* do it globally (for each occurrence) */
      49             : } pg_re_flags;
      50             : 
      51             : /* cross-call state for regexp_match and regexp_split functions */
      52             : typedef struct regexp_matches_ctx
      53             : {
      54             :     text       *orig_str;       /* data string in original TEXT form */
      55             :     int         nmatches;       /* number of places where pattern matched */
      56             :     int         npatterns;      /* number of capturing subpatterns */
      57             :     /* We store start char index and end+1 char index for each match */
      58             :     /* so the number of entries in match_locs is nmatches * npatterns * 2 */
      59             :     int        *match_locs;     /* 0-based character indexes */
      60             :     int         next_match;     /* 0-based index of next match to process */
      61             :     /* workspace for build_regexp_match_result() */
      62             :     Datum      *elems;          /* has npatterns elements */
      63             :     bool       *nulls;          /* has npatterns elements */
      64             :     pg_wchar   *wide_str;       /* wide-char version of original string */
      65             :     char       *conv_buf;       /* conversion buffer, if needed */
      66             :     int         conv_bufsiz;    /* size thereof */
      67             : } regexp_matches_ctx;
      68             : 
      69             : /*
      70             :  * We cache precompiled regular expressions using a "self organizing list"
      71             :  * structure, in which recently-used items tend to be near the front.
      72             :  * Whenever we use an entry, it's moved up to the front of the list.
      73             :  * Over time, an item's average position corresponds to its frequency of use.
      74             :  *
      75             :  * When we first create an entry, it's inserted at the front of
      76             :  * the array, dropping the entry at the end of the array if necessary to
      77             :  * make room.  (This might seem to be weighting the new entry too heavily,
      78             :  * but if we insert new entries further back, we'll be unable to adjust to
      79             :  * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
      80             :  * never-before-seen items used circularly.  We ought to be able to handle
      81             :  * that case, so we have to insert at the front.)
      82             :  *
      83             :  * Knuth mentions a variant strategy in which a used item is moved up just
      84             :  * one place in the list.  Although he says this uses fewer comparisons on
      85             :  * average, it seems not to adapt very well to the situation where you have
      86             :  * both some reusable patterns and a steady stream of non-reusable patterns.
      87             :  * A reusable pattern that isn't used at least as often as non-reusable
      88             :  * patterns are seen will "fail to keep up" and will drop off the end of the
      89             :  * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
      90             :  * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
      91             :  */
      92             : 
      93             : /* this is the maximum number of cached regular expressions */
      94             : #ifndef MAX_CACHED_RES
      95             : #define MAX_CACHED_RES  32
      96             : #endif
      97             : 
      98             : /* A parent memory context for regular expressions. */
      99             : static MemoryContext RegexpCacheMemoryContext;
     100             : 
     101             : /* this structure describes one cached regular expression */
     102             : typedef struct cached_re_str
     103             : {
     104             :     MemoryContext cre_context;  /* memory context for this regexp */
     105             :     char       *cre_pat;        /* original RE (NUL-terminated, but use cre_pat_len) */
     106             :     int         cre_pat_len;    /* length of original RE, in bytes */
     107             :     int         cre_flags;      /* compile flags: extended,icase etc */
     108             :     Oid         cre_collation;  /* collation to use */
     109             :     regex_t     cre_re;         /* the compiled regular expression */
     110             : } cached_re_str;
     111             : 
     112             : static int  num_res = 0;        /* # of cached re's */
     113             : static cached_re_str re_array[MAX_CACHED_RES];  /* cached re's */
     114             : 
     115             : 
     116             : /* Local functions */
     117             : static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
     118             :                                                 pg_re_flags *re_flags,
     119             :                                                 int start_search,
     120             :                                                 Oid collation,
     121             :                                                 bool use_subpatterns,
     122             :                                                 bool ignore_degenerate,
     123             :                                                 bool fetching_unmatched);
     124             : static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
     125             : static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
     126             : 
     127             : 
     128             : /*
     129             :  * RE_compile_and_cache - compile a RE, caching if possible
     130             :  *
     131             :  * Returns regex_t * pointing into the cache array; a later call may move or discard it
     132             :  *
     133             :  *  text_re --- the pattern, expressed as a TEXT object
     134             :  *  cflags --- compile options for the pattern
     135             :  *  collation --- collation to use for LC_CTYPE-dependent behavior
     136             :  *
     137             :  * Pattern is given in the database encoding.  We internally convert to
     138             :  * an array of pg_wchar, which is what Spencer's regex package wants.
     139             :  */
     140             : regex_t *
     141     7106042 : RE_compile_and_cache(text *text_re, int cflags, Oid collation)
     142             : {
     143     7106042 :     int         text_re_len = VARSIZE_ANY_EXHDR(text_re);
     144     7106042 :     char       *text_re_val = VARDATA_ANY(text_re);
     145             :     pg_wchar   *pattern;
     146             :     int         pattern_len;
     147             :     int         i;
     148             :     int         regcomp_result;
     149             :     cached_re_str re_temp;
     150             :     char        errMsg[100];
     151             :     MemoryContext oldcontext;
     152             : 
     153             :     /*
     154             :      * Look for a match among previously compiled REs.  Since the data
     155             :      * structure is self-organizing with most-used entries at the front, our
     156             :      * search strategy can just be to scan from the front.
     157             :      */
     158     7694224 :     for (i = 0; i < num_res; i++)
     159             :     {
     160     7688032 :         if (re_array[i].cre_pat_len == text_re_len &&
     161     7114612 :             re_array[i].cre_flags == cflags &&
     162     7113426 :             re_array[i].cre_collation == collation &&
     163     7113050 :             memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
     164             :         {
     165             :             /*
     166             :              * Found a match; move it to front if not there already.
     167             :              */
     168     7099850 :             if (i > 0)
     169             :             {
     170      465226 :                 re_temp = re_array[i];
     171      465226 :                 memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
     172      465226 :                 re_array[0] = re_temp;
     173             :             }
     174             : 
     175     7099850 :             return &re_array[0].cre_re;
     176             :         }
     177             :     }
     178             : 
     179             :     /* Set up the cache memory on first go through. */
     180        6192 :     if (unlikely(RegexpCacheMemoryContext == NULL))
     181        1546 :         RegexpCacheMemoryContext =
     182        1546 :             AllocSetContextCreate(TopMemoryContext,
     183             :                                   "RegexpCacheMemoryContext",
     184             :                                   ALLOCSET_SMALL_SIZES);
     185             : 
     186             :     /*
     187             :      * Couldn't find it, so try to compile the new RE.  To avoid leaking
     188             :      * resources on failure, we build into the re_temp local.
     189             :      */
     190             : 
     191             :     /* Convert pattern string to wide characters */
     192        6192 :     pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
     193        6192 :     pattern_len = pg_mb2wchar_with_len(text_re_val,
     194             :                                        pattern,
     195             :                                        text_re_len);
     196             : 
     197             :     /*
     198             :      * Make a memory context for this compiled regexp.  This is initially a
     199             :      * child of the current memory context, so it will be cleaned up
     200             :      * automatically if compilation is interrupted and throws an ERROR. We'll
     201             :      * re-parent it under the longer lived cache context if we make it to the
     202             :      * bottom of this function.
     203             :      */
     204        6192 :     re_temp.cre_context = AllocSetContextCreate(CurrentMemoryContext,
     205             :                                                 "RegexpMemoryContext",
     206             :                                                 ALLOCSET_SMALL_SIZES);
     207        6192 :     oldcontext = MemoryContextSwitchTo(re_temp.cre_context);
     208             : 
     209        6192 :     regcomp_result = pg_regcomp(&re_temp.cre_re,
     210             :                                 pattern,
     211             :                                 pattern_len,
     212             :                                 cflags,
     213             :                                 collation);
     214             : 
     215        6168 :     pfree(pattern);
     216             : 
     217        6168 :     if (regcomp_result != REG_OKAY)
     218             :     {
     219             :         /* re didn't compile (no need for pg_regfree, if so) */
     220          36 :         pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
     221          36 :         ereport(ERROR,
     222             :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     223             :                  errmsg("invalid regular expression: %s", errMsg)));
     224             :     }
     225             : 
     226             :     /* Copy the pattern into the per-regexp memory context. */
     227        6132 :     re_temp.cre_pat = palloc(text_re_len + 1);
     228        6132 :     memcpy(re_temp.cre_pat, text_re_val, text_re_len);
     229             : 
     230             :     /*
     231             :      * NUL-terminate it only for the benefit of the identifier used for the
     232             :      * memory context, visible in the pg_backend_memory_contexts view.
     233             :      */
     234        6132 :     re_temp.cre_pat[text_re_len] = 0;
     235        6132 :     MemoryContextSetIdentifier(re_temp.cre_context, re_temp.cre_pat);
     236             : 
     237        6132 :     re_temp.cre_pat_len = text_re_len;
     238        6132 :     re_temp.cre_flags = cflags;
     239        6132 :     re_temp.cre_collation = collation;
     240             : 
     241             :     /*
     242             :      * Okay, we have a valid new item in re_temp; insert it into the storage
     243             :      * array.  Discard last entry if needed.
     244             :      */
     245        6132 :     if (num_res >= MAX_CACHED_RES)
     246             :     {
     247         870 :         --num_res;
     248             :         Assert(num_res < MAX_CACHED_RES);
     249             :         /* Delete the memory context holding the regexp and pattern. */
     250         870 :         MemoryContextDelete(re_array[num_res].cre_context);
     251             :     }
     252             : 
     253             :     /* Re-parent the memory context to our long-lived cache context. */
     254        6132 :     MemoryContextSetParent(re_temp.cre_context, RegexpCacheMemoryContext);
     255             : 
     256        6132 :     if (num_res > 0)
     257        4586 :         memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
     258             : 
     259        6132 :     re_array[0] = re_temp;
     260        6132 :     num_res++;
     261             : 
     262        6132 :     MemoryContextSwitchTo(oldcontext);
     263             : 
     264        6132 :     return &re_array[0].cre_re;
     265             : }
     266             : 
     267             : /*
     268             :  * RE_wchar_execute - execute a RE on pg_wchar data
     269             :  *
     270             :  * Returns true on match, false on no match
     271             :  *
     272             :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     273             :  *  data --- the data to match against (need not be null-terminated)
     274             :  *  data_len --- the length of the data, in pg_wchar elements
     275             :  *  start_search --- the offset in the data at which to start searching
     276             :  *  nmatch, pmatch  --- optional return area for match details
     277             :  *
     278             :  * Data is given as array of pg_wchar which is what Spencer's regex package
     279             :  * wants.
     280             :  */
     281             : static bool
     282     7964542 : RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
     283             :                  int start_search, int nmatch, regmatch_t *pmatch)
     284             : {
     285             :     int         regexec_result;
     286             :     char        errMsg[100];
     287             : 
     288             :     /* Perform RE match and return result */
     289     7964542 :     regexec_result = pg_regexec(re,
     290             :                                 data,
     291             :                                 data_len,
     292             :                                 start_search,
     293             :                                 NULL,   /* no details */
     294             :                                 nmatch,
     295             :                                 pmatch,
     296             :                                 0);
     297             : 
     298     7964542 :     if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
     299             :     {
     300             :         /* re failed??? */
     301           0 :         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
     302           0 :         ereport(ERROR,
     303             :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     304             :                  errmsg("regular expression failed: %s", errMsg)));
     305             :     }
     306             : 
     307     7964542 :     return (regexec_result == REG_OKAY);
     308             : }
     309             : 
     310             : /*
     311             :  * RE_execute - execute a RE
     312             :  *
     313             :  * Returns true on match, false on no match
     314             :  *
     315             :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     316             :  *  dat --- the data to match against (need not be null-terminated)
     317             :  *  dat_len --- the length of the data string, in bytes
     318             :  *  nmatch, pmatch  --- optional return area for match details
     319             :  *
     320             :  * Data is given in the database encoding.  We internally
     321             :  * convert to array of pg_wchar which is what Spencer's regex package wants.
     322             :  */
     323             : static bool
     324     6867584 : RE_execute(regex_t *re, char *dat, int dat_len,
     325             :            int nmatch, regmatch_t *pmatch)
     326             : {
     327             :     pg_wchar   *data;
     328             :     int         data_len;
     329             :     bool        match;
     330             : 
     331             :     /* Convert data string to wide characters */
     332     6867584 :     data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
     333     6867584 :     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
     334             : 
     335             :     /* Perform RE match and return result */
     336     6867584 :     match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
     337             : 
     338     6867584 :     pfree(data);
     339     6867584 :     return match;
     340             : }
     341             : 
     342             : /*
     343             :  * RE_compile_and_execute - compile and execute a RE
     344             :  *
     345             :  * Returns true on match, false on no match
     346             :  *
     347             :  *  text_re --- the pattern, expressed as a TEXT object
     348             :  *  dat --- the data to match against (need not be null-terminated)
     349             :  *  dat_len --- the length of the data string, in bytes
     350             :  *  cflags --- compile options for the pattern
     351             :  *  collation --- collation to use for LC_CTYPE-dependent behavior
     352             :  *  nmatch, pmatch  --- optional return area for match details
     353             :  *
     354             :  * Both pattern and data are given in the database encoding.  We internally
     355             :  * convert to array of pg_wchar which is what Spencer's regex package wants.
     356             :  */
     357             : bool
     358     6865972 : RE_compile_and_execute(text *text_re, char *dat, int dat_len,
     359             :                        int cflags, Oid collation,
     360             :                        int nmatch, regmatch_t *pmatch)
     361             : {
     362             :     regex_t    *re;
     363             : 
     364             :     /* Use REG_NOSUB if caller does not want sub-match details */
     365     6865972 :     if (nmatch < 2)
     366     6865972 :         cflags |= REG_NOSUB;
     367             : 
     368             :     /* Compile RE */
     369     6865972 :     re = RE_compile_and_cache(text_re, cflags, collation);
     370             : 
     371     6865948 :     return RE_execute(re, dat, dat_len, nmatch, pmatch);
     372             : }
     373             : 
     374             : 
     375             : /*
     376             :  * parse_re_flags - parse the options argument of regexp_match and friends
     377             :  *
     378             :  *  flags --- output argument, filled with desired options
     379             :  *  opts --- TEXT object, or NULL for defaults
     380             :  *
     381             :  * This accepts all the options allowed by any of the callers; callers that
     382             :  * don't want some have to reject them after the fact.
     383             :  */
     384             : static void
     385      209506 : parse_re_flags(pg_re_flags *flags, text *opts)
     386             : {
     387             :     /* regex flavor is always folded into the compile flags */
     388      209506 :     flags->cflags = REG_ADVANCED;
     389      209506 :     flags->glob = false;
     390             : 
     391      209506 :     if (opts)
     392             :     {
     393        4614 :         char       *opt_p = VARDATA_ANY(opts);
     394        4614 :         int         opt_len = VARSIZE_ANY_EXHDR(opts);
     395             :         int         i;
     396             : 
     397       10322 :         for (i = 0; i < opt_len; i++)
     398             :         {
     399        5732 :             switch (opt_p[i])
     400             :             {
     401        4292 :                 case 'g':
     402        4292 :                     flags->glob = true;
     403        4292 :                     break;
     404           0 :                 case 'b':       /* BREs (but why???) */
     405           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
     406           0 :                     break;
     407          10 :                 case 'c':       /* case sensitive */
     408          10 :                     flags->cflags &= ~REG_ICASE;
     409          10 :                     break;
     410           0 :                 case 'e':       /* plain EREs */
     411           0 :                     flags->cflags |= REG_EXTENDED;
     412           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
     413           0 :                     break;
     414         292 :                 case 'i':       /* case insensitive */
     415         292 :                     flags->cflags |= REG_ICASE;
     416         292 :                     break;
     417        1096 :                 case 'm':       /* Perloid synonym for n */
     418             :                 case 'n':       /* \n affects ^ $ . [^ */
     419        1096 :                     flags->cflags |= REG_NEWLINE;
     420        1096 :                     break;
     421           0 :                 case 'p':       /* ~Perl, \n affects . [^ */
     422           0 :                     flags->cflags |= REG_NLSTOP;
     423           0 :                     flags->cflags &= ~REG_NLANCH;
     424           0 :                     break;
     425           0 :                 case 'q':       /* literal string */
     426           0 :                     flags->cflags |= REG_QUOTE;
     427           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
     428           0 :                     break;
     429          12 :                 case 's':       /* single line, \n ordinary */
     430          12 :                     flags->cflags &= ~REG_NEWLINE;
     431          12 :                     break;
     432           0 :                 case 't':       /* tight syntax */
     433           0 :                     flags->cflags &= ~REG_EXPANDED;
     434           0 :                     break;
     435           0 :                 case 'w':       /* weird, \n affects ^ $ only */
     436           0 :                     flags->cflags &= ~REG_NLSTOP;
     437           0 :                     flags->cflags |= REG_NLANCH;
     438           0 :                     break;
     439           6 :                 case 'x':       /* expanded syntax */
     440           6 :                     flags->cflags |= REG_EXPANDED;
     441           6 :                     break;
     442          24 :                 default:
     443          24 :                     ereport(ERROR,
     444             :                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     445             :                              errmsg("invalid regular expression option: \"%.*s\"",
     446             :                                     pg_mblen(opt_p + i), opt_p + i)));
     447             :                     break;
     448             :             }
     449             :         }
     450             :     }
     451      209482 : }
     452             : 
     453             : 
     454             : /*
     455             :  *  interface routines called by the function manager
     456             :  */
     457             : 
     458             : Datum
     459     6410928 : nameregexeq(PG_FUNCTION_ARGS)
     460             : {
     461     6410928 :     Name        n = PG_GETARG_NAME(0);
     462     6410928 :     text       *p = PG_GETARG_TEXT_PP(1);
     463             : 
     464     6410928 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     465             :                                           NameStr(*n),
     466             :                                           strlen(NameStr(*n)),
     467             :                                           REG_ADVANCED,
     468             :                                           PG_GET_COLLATION(),
     469             :                                           0, NULL));
     470             : }
     471             : 
     472             : Datum
     473       23038 : nameregexne(PG_FUNCTION_ARGS)
     474             : {
     475       23038 :     Name        n = PG_GETARG_NAME(0);
     476       23038 :     text       *p = PG_GETARG_TEXT_PP(1);
     477             : 
     478       23038 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     479             :                                            NameStr(*n),
     480             :                                            strlen(NameStr(*n)),
     481             :                                            REG_ADVANCED,
     482             :                                            PG_GET_COLLATION(),
     483             :                                            0, NULL));
     484             : }
     485             : 
     486             : Datum
     487      389854 : textregexeq(PG_FUNCTION_ARGS)
     488             : {
     489      389854 :     text       *s = PG_GETARG_TEXT_PP(0);
     490      389854 :     text       *p = PG_GETARG_TEXT_PP(1);
     491             : 
     492      389854 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     493             :                                           VARDATA_ANY(s),
     494             :                                           VARSIZE_ANY_EXHDR(s),
     495             :                                           REG_ADVANCED,
     496             :                                           PG_GET_COLLATION(),
     497             :                                           0, NULL));
     498             : }
     499             : 
     500             : Datum
     501       34152 : textregexne(PG_FUNCTION_ARGS)
     502             : {
     503       34152 :     text       *s = PG_GETARG_TEXT_PP(0);
     504       34152 :     text       *p = PG_GETARG_TEXT_PP(1);
     505             : 
     506       34152 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     507             :                                            VARDATA_ANY(s),
     508             :                                            VARSIZE_ANY_EXHDR(s),
     509             :                                            REG_ADVANCED,
     510             :                                            PG_GET_COLLATION(),
     511             :                                            0, NULL));
     512             : }
     513             : 
     514             : 
     515             : /*
     516             :  *  Routines that perform the same regexp matches, but ignoring case.
     517             :  *  For this, we pass the REG_ICASE flag to pg_regcomp.
     518             :  */
     519             : 
     520             : 
     521             : Datum
     522        7446 : nameicregexeq(PG_FUNCTION_ARGS)
     523             : {
     524        7446 :     Name        n = PG_GETARG_NAME(0);
     525        7446 :     text       *p = PG_GETARG_TEXT_PP(1);
     526             : 
     527        7446 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     528             :                                           NameStr(*n),
     529             :                                           strlen(NameStr(*n)),
     530             :                                           REG_ADVANCED | REG_ICASE,
     531             :                                           PG_GET_COLLATION(),
     532             :                                           0, NULL));
     533             : }
     534             : 
     535             : Datum
     536           6 : nameicregexne(PG_FUNCTION_ARGS)
     537             : {
     538           6 :     Name        n = PG_GETARG_NAME(0);
     539           6 :     text       *p = PG_GETARG_TEXT_PP(1);
     540             : 
     541           6 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     542             :                                            NameStr(*n),
     543             :                                            strlen(NameStr(*n)),
     544             :                                            REG_ADVANCED | REG_ICASE,
     545             :                                            PG_GET_COLLATION(),
     546             :                                            0, NULL));
     547             : }
     548             : 
     549             : Datum
     550         220 : texticregexeq(PG_FUNCTION_ARGS)
     551             : {
     552         220 :     text       *s = PG_GETARG_TEXT_PP(0);
     553         220 :     text       *p = PG_GETARG_TEXT_PP(1);
     554             : 
     555         220 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     556             :                                           VARDATA_ANY(s),
     557             :                                           VARSIZE_ANY_EXHDR(s),
     558             :                                           REG_ADVANCED | REG_ICASE,
     559             :                                           PG_GET_COLLATION(),
     560             :                                           0, NULL));
     561             : }
     562             : 
     563             : Datum
     564          28 : texticregexne(PG_FUNCTION_ARGS)
     565             : {
     566          28 :     text       *s = PG_GETARG_TEXT_PP(0);
     567          28 :     text       *p = PG_GETARG_TEXT_PP(1);
     568             : 
     569          28 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     570             :                                            VARDATA_ANY(s),
     571             :                                            VARSIZE_ANY_EXHDR(s),
     572             :                                            REG_ADVANCED | REG_ICASE,
     573             :                                            PG_GET_COLLATION(),
     574             :                                            0, NULL));
     575             : }
     576             : 
     577             : 
     578             : /*
     579             :  * textregexsubstr()
     580             :  *      Return a substring matched by a regular expression.
     581             :  */
     582             : Datum
     583        1636 : textregexsubstr(PG_FUNCTION_ARGS)
     584             : {
     585        1636 :     text       *s = PG_GETARG_TEXT_PP(0);
     586        1636 :     text       *p = PG_GETARG_TEXT_PP(1);
     587             :     regex_t    *re;
     588             :     regmatch_t  pmatch[2];
     589             :     int         so,
     590             :                 eo;
     591             : 
     592             :     /* Compile RE */
     593        1636 :     re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
     594             : 
     595             :     /*
     596             :      * We pass two regmatch_t structs to get info about the overall match and
     597             :      * the match for the first parenthesized subexpression (if any). If there
     598             :      * is a parenthesized subexpression, we return what it matched; else
     599             :      * return what the whole regexp matched.
     600             :      */
     601        3272 :     if (!RE_execute(re,
     602        3272 :                     VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
     603             :                     2, pmatch))
     604           6 :         PG_RETURN_NULL();       /* definitely no match */
     605             : 
     606        1630 :     if (re->re_nsub > 0)
     607             :     {
     608             :         /* has parenthesized subexpressions, use the first one */
     609        1522 :         so = pmatch[1].rm_so;
     610        1522 :         eo = pmatch[1].rm_eo;
     611             :     }
     612             :     else
     613             :     {
     614             :         /* no parenthesized subexpression, use whole match */
     615         108 :         so = pmatch[0].rm_so;
     616         108 :         eo = pmatch[0].rm_eo;
     617             :     }
     618             : 
     619             :     /*
     620             :      * It is possible to have a match to the whole pattern but no match for a
     621             :      * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
     622             :      * there is no subexpression match.  So this extra test for match failure
     623             :      * is not redundant.
     624             :      */
     625        1630 :     if (so < 0 || eo < 0)
     626           6 :         PG_RETURN_NULL();
     627             : 
     628        1624 :     return DirectFunctionCall3(text_substr,
     629             :                                PointerGetDatum(s),
     630             :                                Int32GetDatum(so + 1),
     631             :                                Int32GetDatum(eo - so));
     632             : }
     633             : 
     634             : /*
     635             :  * textregexreplace_noopt()
     636             :  *      Return the source string with the regular expression match replaced.
     637             :  *
     638             :  * This version doesn't have an option argument: we default to case
     639             :  * sensitive match, replace the first instance only.
     640             :  */
     641             : Datum
     642       12710 : textregexreplace_noopt(PG_FUNCTION_ARGS)
     643             : {
     644       12710 :     text       *s = PG_GETARG_TEXT_PP(0);
     645       12710 :     text       *p = PG_GETARG_TEXT_PP(1);
     646       12710 :     text       *r = PG_GETARG_TEXT_PP(2);
     647             : 
     648       12710 :     PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
     649             :                                          REG_ADVANCED, PG_GET_COLLATION(),
     650             :                                          0, 1));
     651             : }
     652             : 
     653             : /*
     654             :  * textregexreplace()
     655             :  *      Return the source string with regular expression match(es) replaced.
     656             :  */
     657             : Datum
     658        4220 : textregexreplace(PG_FUNCTION_ARGS)
     659             : {
     660        4220 :     text       *s = PG_GETARG_TEXT_PP(0);
     661        4220 :     text       *p = PG_GETARG_TEXT_PP(1);
     662        4220 :     text       *r = PG_GETARG_TEXT_PP(2);
     663        4220 :     text       *opt = PG_GETARG_TEXT_PP(3);
     664             :     pg_re_flags flags;
     665             : 
     666             :     /*
     667             :      * regexp_replace() with four arguments will be preferentially resolved as
     668             :      * this form when the fourth argument is of type UNKNOWN.  However, the
     669             :      * user might have intended to call textregexreplace_extended_no_n.  If we
     670             :      * see flags that look like an integer, emit the same error that
     671             :      * parse_re_flags would, but add a HINT about how to fix it.
     672             :      */
     673        4220 :     if (VARSIZE_ANY_EXHDR(opt) > 0)
     674             :     {
     675        4220 :         char       *opt_p = VARDATA_ANY(opt);
     676             : 
     677        4220 :         if (*opt_p >= '0' && *opt_p <= '9')
     678           6 :             ereport(ERROR,
     679             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     680             :                      errmsg("invalid regular expression option: \"%.*s\"",
     681             :                             pg_mblen(opt_p), opt_p),
     682             :                      errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
     683             :     }
     684             : 
     685        4214 :     parse_re_flags(&flags, opt);
     686             : 
     687        4208 :     PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
     688             :                                          flags.cflags, PG_GET_COLLATION(),
     689             :                                          0, flags.glob ? 0 : 1));
     690             : }
     691             : 
     692             : /*
     693             :  * textregexreplace_extended()
     694             :  *      Return the source string with regular expression match(es) replaced.
     695             :  *      Extends textregexreplace by allowing a start position and the
     696             :  *      choice of the occurrence to replace (0 means all occurrences).
     697             :  */
     698             : Datum
     699          66 : textregexreplace_extended(PG_FUNCTION_ARGS)
     700             : {
     701          66 :     text       *s = PG_GETARG_TEXT_PP(0);
     702          66 :     text       *p = PG_GETARG_TEXT_PP(1);
     703          66 :     text       *r = PG_GETARG_TEXT_PP(2);
     704          66 :     int         start = 1;
     705          66 :     int         n = 1;
     706          66 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
     707             :     pg_re_flags re_flags;
     708             : 
     709             :     /* Collect optional parameters */
     710          66 :     if (PG_NARGS() > 3)
     711             :     {
     712          66 :         start = PG_GETARG_INT32(3);
     713          66 :         if (start <= 0)
     714           6 :             ereport(ERROR,
     715             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     716             :                      errmsg("invalid value for parameter \"%s\": %d",
     717             :                             "start", start)));
     718             :     }
     719          60 :     if (PG_NARGS() > 4)
     720             :     {
     721          54 :         n = PG_GETARG_INT32(4);
     722          54 :         if (n < 0)
     723           6 :             ereport(ERROR,
     724             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     725             :                      errmsg("invalid value for parameter \"%s\": %d",
     726             :                             "n", n)));
     727             :     }
     728             : 
     729             :     /* Determine options */
     730          54 :     parse_re_flags(&re_flags, flags);
     731             : 
     732             :     /* If N was not specified, deduce it from the 'g' flag */
     733          54 :     if (PG_NARGS() <= 4)
     734           6 :         n = re_flags.glob ? 0 : 1;
     735             : 
     736             :     /* Do the replacement(s) */
     737          54 :     PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
     738             :                                          re_flags.cflags, PG_GET_COLLATION(),
     739             :                                          start - 1, n));
     740             : }
     741             : 
     742             : /* This is separate to keep the opr_sanity regression test from complaining */
     743             : Datum
     744           6 : textregexreplace_extended_no_n(PG_FUNCTION_ARGS)
     745             : {
     746           6 :     return textregexreplace_extended(fcinfo);
     747             : }
     748             : 
     749             : /* This is separate to keep the opr_sanity regression test from complaining */
     750             : Datum
     751           6 : textregexreplace_extended_no_flags(PG_FUNCTION_ARGS)
     752             : {
     753           6 :     return textregexreplace_extended(fcinfo);
     754             : }
     755             : 
     756             : /*
     757             :  * similar_to_escape(), similar_escape()
     758             :  *
     759             :  * Convert a SQL "SIMILAR TO" regexp pattern to POSIX style, so it can be
     760             :  * used by our regexp engine.
     761             :  *
     762             :  * similar_escape_internal() is the common workhorse for three SQL-exposed
     763             :  * functions.  esc_text can be passed as NULL to select the default escape
     764             :  * (which is '\'), or as an empty string to select no escape character.
     765             :  */
     766             : static text *
     767         180 : similar_escape_internal(text *pat_text, text *esc_text)
     768             : {
     769             :     text       *result;
     770             :     char       *p,
     771             :                *e,
     772             :                *r;
     773             :     int         plen,
     774             :                 elen;
     775         180 :     bool        afterescape = false;
     776         180 :     int         nquotes = 0;
     777         180 :     int         charclass_depth = 0;    /* Nesting level of character classes,
     778             :                                          * encompassed by square brackets */
     779         180 :     int         charclass_start = 0;    /* State of the character class start,
     780             :                                          * for carets */
     781             : 
     782         180 :     p = VARDATA_ANY(pat_text);
     783         180 :     plen = VARSIZE_ANY_EXHDR(pat_text);
     784         180 :     if (esc_text == NULL)
     785             :     {
     786             :         /* No ESCAPE clause provided; default to backslash as escape */
     787          88 :         e = "\\";
     788          88 :         elen = 1;
     789             :     }
     790             :     else
     791             :     {
     792          92 :         e = VARDATA_ANY(esc_text);
     793          92 :         elen = VARSIZE_ANY_EXHDR(esc_text);
     794          92 :         if (elen == 0)
     795           6 :             e = NULL;           /* no escape character */
     796          86 :         else if (elen > 1)
     797             :         {
     798           6 :             int         escape_mblen = pg_mbstrlen_with_len(e, elen);
     799             : 
     800           6 :             if (escape_mblen > 1)
     801           6 :                 ereport(ERROR,
     802             :                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     803             :                          errmsg("invalid escape string"),
     804             :                          errhint("Escape string must be empty or one character.")));
     805             :         }
     806             :     }
     807             : 
     808             :     /*----------
     809             :      * We surround the transformed input string with
     810             :      *          ^(?: ... )$
     811             :      * which requires some explanation.  We need "^" and "$" to force
     812             :      * the pattern to match the entire input string as per the SQL spec.
     813             :      * The "(?:" and ")" are a non-capturing set of parens; we have to have
     814             :      * parens in case the string contains "|", else the "^" and "$" will
     815             :      * be bound into the first and last alternatives which is not what we
     816             :      * want, and the parens must be non-capturing because we don't want them
     817             :      * to count when selecting output for SUBSTRING.
     818             :      *
     819             :      * When the pattern is divided into three parts by escape-double-quotes,
     820             :      * what we emit is
     821             :      *          ^(?:part1){1,1}?(part2){1,1}(?:part3)$
     822             :      * which requires even more explanation.  The "{1,1}?" on part1 makes it
     823             :      * non-greedy so that it will match the smallest possible amount of text
     824             :      * not the largest, as required by SQL.  The plain parens around part2
     825             :      * are capturing parens so that that part is what controls the result of
     826             :      * SUBSTRING.  The "{1,1}" forces part2 to be greedy, so that it matches
     827             :      * the largest possible amount of text; hence part3 must match the
     828             :      * smallest amount of text, as required by SQL.  We don't need an explicit
     829             :      * greediness marker on part3.  Note that this also confines the effects
     830             :      * of any "|" characters to the respective part, which is what we want.
     831             :      *
     832             :      * The SQL spec says that SUBSTRING's pattern must contain exactly two
     833             :      * escape-double-quotes, but we only complain if there are more than two.
     834             :      * With none, we act as though part1 and part3 are empty; with one, we
     835             :      * act as though part3 is empty.  Both behaviors fall out of omitting
     836             :      * the relevant part separators in the above expansion.  If the result
     837             :      * of this function is used in a plain regexp match (SIMILAR TO), the
     838             :      * escape-double-quotes have no effect on the match behavior.
     839             :      *----------
     840             :      */
     841             : 
     842             :     /*
     843             :      * We need room for the prefix/postfix and part separators, plus as many
     844             :      * as 3 output bytes per input byte; since the input is at most 1GB this
     845             :      * can't overflow size_t.
     846             :      */
     847         174 :     result = (text *) palloc(VARHDRSZ + 23 + 3 * (size_t) plen);
     848         174 :     r = VARDATA(result);
     849             : 
     850         174 :     *r++ = '^';
     851         174 :     *r++ = '(';
     852         174 :     *r++ = '?';
     853         174 :     *r++ = ':';
     854             : 
     855        1804 :     while (plen > 0)
     856             :     {
     857        1636 :         char        pchar = *p;
     858             : 
     859             :         /*
     860             :          * If both the escape character and the current character from the
     861             :          * pattern are multi-byte, we need to take the slow path.
     862             :          *
     863             :          * But if one of them is single-byte, we can process the pattern one
     864             :          * byte at a time, ignoring multi-byte characters.  (This works
     865             :          * because all server-encodings have the property that a valid
     866             :          * multi-byte character representation cannot contain the
     867             :          * representation of a valid single-byte character.)
     868             :          */
     869             : 
     870        1636 :         if (elen > 1)
     871             :         {
     872           0 :             int         mblen = pg_mblen(p);
     873             : 
     874           0 :             if (mblen > 1)
     875             :             {
     876             :                 /* slow, multi-byte path */
     877           0 :                 if (afterescape)
     878             :                 {
     879           0 :                     *r++ = '\\';
     880           0 :                     memcpy(r, p, mblen);
     881           0 :                     r += mblen;
     882           0 :                     afterescape = false;
     883             :                 }
     884           0 :                 else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
     885             :                 {
     886             :                     /* SQL escape character; do not send to output */
     887           0 :                     afterescape = true;
     888             :                 }
     889             :                 else
     890             :                 {
     891             :                     /*
     892             :                      * We know it's a multi-byte character, so we don't need
     893             :                      * to do all the comparisons to single-byte characters
     894             :                      * that we do below.
     895             :                      */
     896           0 :                     memcpy(r, p, mblen);
     897           0 :                     r += mblen;
     898             :                 }
     899             : 
     900           0 :                 p += mblen;
     901           0 :                 plen -= mblen;
     902             : 
     903           0 :                 continue;
     904             :             }
     905             :         }
     906             : 
     907             :         /* fast path */
     908        1636 :         if (afterescape)
     909             :         {
     910         160 :             if (pchar == '"' && charclass_depth < 1)    /* escape-double-quote? */
     911             :             {
     912             :                 /* emit appropriate part separator, per notes above */
     913         124 :                 if (nquotes == 0)
     914             :                 {
     915          62 :                     *r++ = ')';
     916          62 :                     *r++ = '{';
     917          62 :                     *r++ = '1';
     918          62 :                     *r++ = ',';
     919          62 :                     *r++ = '1';
     920          62 :                     *r++ = '}';
     921          62 :                     *r++ = '?';
     922          62 :                     *r++ = '(';
     923             :                 }
     924          62 :                 else if (nquotes == 1)
     925             :                 {
     926          56 :                     *r++ = ')';
     927          56 :                     *r++ = '{';
     928          56 :                     *r++ = '1';
     929          56 :                     *r++ = ',';
     930          56 :                     *r++ = '1';
     931          56 :                     *r++ = '}';
     932          56 :                     *r++ = '(';
     933          56 :                     *r++ = '?';
     934          56 :                     *r++ = ':';
     935             :                 }
     936             :                 else
     937           6 :                     ereport(ERROR,
     938             :                             (errcode(ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER),
     939             :                              errmsg("SQL regular expression may not contain more than two escape-double-quote separators")));
     940         118 :                 nquotes++;
     941             :             }
     942             :             else
     943             :             {
     944             :                 /*
     945             :                  * We allow any character at all to be escaped; notably, this
     946             :                  * allows access to POSIX character-class escapes such as
     947             :                  * "\d".  The SQL spec is considerably more restrictive.
     948             :                  */
     949          36 :                 *r++ = '\\';
     950          36 :                 *r++ = pchar;
     951             :             }
     952         154 :             afterescape = false;
     953             :         }
     954        1476 :         else if (e && pchar == *e)
     955             :         {
     956             :             /* SQL escape character; do not send to output */
     957         160 :             afterescape = true;
     958             :         }
     959        1316 :         else if (charclass_depth > 0)
     960             :         {
     961         606 :             if (pchar == '\\')
     962           0 :                 *r++ = '\\';
     963         606 :             *r++ = pchar;
     964             : 
     965             :             /*
     966             :              * Ignore a closing bracket at the start of a character class.
     967             :              * Such a bracket is taken literally rather than closing the
     968             :              * class.  "charclass_start" is 1 right at the beginning of a
     969             :              * class and 2 after an initial caret.
     970             :              */
     971         606 :             if (pchar == ']' && charclass_start > 2)
     972         132 :                 charclass_depth--;
     973         474 :             else if (pchar == '[')
     974          72 :                 charclass_depth++;
     975             : 
     976             :             /*
     977             :              * If there is a caret right after the opening bracket, it negates
     978             :              * the character class, but a following closing bracket should
     979             :              * still be treated as a normal character.  That holds only for
     980             :              * the first caret, so only the values 1 and 2 mean that closing
     981             :              * brackets should be taken literally.
     982             :              */
     983         606 :             if (pchar == '^')
     984          60 :                 charclass_start++;
     985             :             else
     986         546 :                 charclass_start = 3;    /* definitely past the start */
     987             :         }
     988         710 :         else if (pchar == '[')
     989             :         {
     990             :             /* start of a character class */
     991          60 :             *r++ = pchar;
     992          60 :             charclass_depth++;
     993          60 :             charclass_start = 1;
     994             :         }
     995         650 :         else if (pchar == '%')
     996             :         {
     997         126 :             *r++ = '.';
     998         126 :             *r++ = '*';
     999             :         }
    1000         524 :         else if (pchar == '_')
    1001          64 :             *r++ = '.';
    1002         460 :         else if (pchar == '(')
    1003             :         {
    1004             :             /* convert to non-capturing parenthesis */
    1005          30 :             *r++ = '(';
    1006          30 :             *r++ = '?';
    1007          30 :             *r++ = ':';
    1008             :         }
    1009         430 :         else if (pchar == '\\' || pchar == '.' ||
    1010         390 :                  pchar == '^' || pchar == '$')
    1011             :         {
    1012          52 :             *r++ = '\\';
    1013          52 :             *r++ = pchar;
    1014             :         }
    1015             :         else
    1016         378 :             *r++ = pchar;
    1017        1630 :         p++, plen--;
    1018             :     }
    1019             : 
    1020         168 :     *r++ = ')';
    1021         168 :     *r++ = '$';
    1022             : 
    1023         168 :     SET_VARSIZE(result, r - ((char *) result));
    1024             : 
    1025         168 :     return result;
    1026             : }
    1027             : 
    1028             : /*
    1029             :  * similar_to_escape(pattern, escape)
    1030             :  */
    1031             : Datum
    1032          92 : similar_to_escape_2(PG_FUNCTION_ARGS)
    1033             : {
    1034          92 :     text       *pat_text = PG_GETARG_TEXT_PP(0);
    1035          92 :     text       *esc_text = PG_GETARG_TEXT_PP(1);
    1036             :     text       *result;
    1037             : 
    1038          92 :     result = similar_escape_internal(pat_text, esc_text);
    1039             : 
    1040          80 :     PG_RETURN_TEXT_P(result);
    1041             : }
    1042             : 
    1043             : /*
    1044             :  * similar_to_escape(pattern)
    1045             :  * Uses the default escape character (backslash).
    1046             :  */
    1047             : Datum
    1048          88 : similar_to_escape_1(PG_FUNCTION_ARGS)
    1049             : {
    1050          88 :     text       *pat_text = PG_GETARG_TEXT_PP(0);
    1051             :     text       *result;
    1052             : 
    1053          88 :     result = similar_escape_internal(pat_text, NULL);
    1054             : 
    1055          88 :     PG_RETURN_TEXT_P(result);
    1056             : }
    1057             : 
    1058             : /*
    1059             :  * similar_escape(pattern, escape)
    1060             :  *
    1061             :  * Legacy function for compatibility with views stored using the
    1062             :  * pre-v13 expansion of SIMILAR TO.  Unlike the above functions, this
    1063             :  * is non-strict, which leads to not-per-spec handling of "ESCAPE NULL".
    1064             :  */
    1065             : Datum
    1066           0 : similar_escape(PG_FUNCTION_ARGS)
    1067             : {
    1068             :     text       *pat_text;
    1069             :     text       *esc_text;
    1070             :     text       *result;
    1071             : 
    1072             :     /* This function is not strict, so must test explicitly */
    1073           0 :     if (PG_ARGISNULL(0))
    1074           0 :         PG_RETURN_NULL();
    1075           0 :     pat_text = PG_GETARG_TEXT_PP(0);
    1076             : 
    1077           0 :     if (PG_ARGISNULL(1))
    1078           0 :         esc_text = NULL;        /* use default escape character */
    1079             :     else
    1080           0 :         esc_text = PG_GETARG_TEXT_PP(1);
    1081             : 
    1082           0 :     result = similar_escape_internal(pat_text, esc_text);
    1083             : 
    1084           0 :     PG_RETURN_TEXT_P(result);
    1085             : }
    1086             : 
    1087             : /*
    1088             :  * regexp_count()
    1089             :  *      Return the number of matches of a pattern within a string.
                     :  *
                     :  * Arguments: 0 = string, 1 = pattern, 2 = start (optional), 3 = flags
                     :  * (optional).  start defaults to 1 and must be positive, otherwise an
                     :  * error is raised; it is handed on as the zero-based offset start - 1.
                     :  * The result is the nmatches field of the context built by
                     :  * setup_regexp_matches().
    1090             :  */
    1091             : Datum
    1092          48 : regexp_count(PG_FUNCTION_ARGS)
    1093             : {
    1094          48 :     text       *str = PG_GETARG_TEXT_PP(0);
    1095          48 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1096          48 :     int         start = 1;      /* default when argument 2 is absent */
    1097          48 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(3);
    1098             :     pg_re_flags re_flags;
    1099             :     regexp_matches_ctx *matchctx;
    1100             : 
    1101             :     /* Collect optional parameters */
    1102          48 :     if (PG_NARGS() > 2)
    1103             :     {
    1104          42 :         start = PG_GETARG_INT32(2);
    1105          42 :         if (start <= 0)
    1106          12 :             ereport(ERROR,
    1107             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1108             :                      errmsg("invalid value for parameter \"%s\": %d",
    1109             :                             "start", start)));
    1110             :     }
    1111             : 
    1112             :     /* Determine options */
    1113          36 :     parse_re_flags(&re_flags, flags);
    1114             :     /* User mustn't specify 'g' */
    1115          36 :     if (re_flags.glob)
    1116           0 :         ereport(ERROR,
    1117             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1118             :         /* translator: %s is a SQL function name */
    1119             :                  errmsg("%s does not support the \"global\" option",
    1120             :                         "regexp_count()")));
    1121             :     /*
                     :      * But we find all the matches anyway: with glob set, the search loop
                     :      * in setup_regexp_matches() does not stop after the first match.
                     :      */
    1122          36 :     re_flags.glob = true;
    1123             : 
    1124             :     /* Do the matching */
    1125          36 :     matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
    1126             :                                     PG_GET_COLLATION(),
    1127             :                                     false,  /* can ignore subexprs */
    1128             :                                     false, false);
    1129             : 
    1130          36 :     PG_RETURN_INT32(matchctx->nmatches);
    1131             : }
    1132             : 
    1133             : /*
                     :  * regexp_count_no_start()
                     :  *      Entry point that hands fcinfo unchanged to regexp_count(), which
                     :  *      uses PG_NARGS() to see which optional arguments are present.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1134             : Datum
    1135           6 : regexp_count_no_start(PG_FUNCTION_ARGS)
    1136             : {
    1137           6 :     return regexp_count(fcinfo);
    1138             : }
    1139             : 
    1140             : /*
                     :  * regexp_count_no_flags()
                     :  *      Entry point that hands fcinfo unchanged to regexp_count(), which
                     :  *      fetches the flags argument only if it exists.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1141             : Datum
    1142          30 : regexp_count_no_flags(PG_FUNCTION_ARGS)
    1143             : {
    1144          30 :     return regexp_count(fcinfo);
    1145             : }
    1146             : 
    1147             : /*
    1148             :  * regexp_instr()
    1149             :  *      Return the match's position within the string
                     :  *
                     :  * Arguments: 0 = string, 1 = pattern, 2 = start, 3 = n (which match),
                     :  * 4 = endoption, 5 = flags, 6 = subexpr; everything after the pattern is
                     :  * optional.  start and n must be positive, endoption must be 0 or 1, and
                     :  * subexpr must not be negative; violations raise an error.
                     :  *
                     :  * The result is the stored match location plus one.  Zero is returned
                     :  * when n exceeds the number of matches, when subexpr exceeds npatterns,
                     :  * or when the selected location is negative.
    1150             :  */
    1151             : Datum
    1152         156 : regexp_instr(PG_FUNCTION_ARGS)
    1153             : {
    1154         156 :     text       *str = PG_GETARG_TEXT_PP(0);
    1155         156 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1156         156 :     int         start = 1;      /* defaults used when an argument is absent */
    1157         156 :     int         n = 1;
    1158         156 :     int         endoption = 0;
    1159         156 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
    1160         156 :     int         subexpr = 0;
    1161             :     int         pos;
    1162             :     pg_re_flags re_flags;
    1163             :     regexp_matches_ctx *matchctx;
    1164             : 
    1165             :     /* Collect optional parameters */
    1166         156 :     if (PG_NARGS() > 2)
    1167             :     {
    1168         138 :         start = PG_GETARG_INT32(2);
    1169         138 :         if (start <= 0)
    1170           6 :             ereport(ERROR,
    1171             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1172             :                      errmsg("invalid value for parameter \"%s\": %d",
    1173             :                             "start", start)));
    1174             :     }
    1175         150 :     if (PG_NARGS() > 3)
    1176             :     {
    1177         126 :         n = PG_GETARG_INT32(3);
    1178         126 :         if (n <= 0)
    1179           6 :             ereport(ERROR,
    1180             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1181             :                      errmsg("invalid value for parameter \"%s\": %d",
    1182             :                             "n", n)));
    1183             :     }
    1184         144 :     if (PG_NARGS() > 4)
    1185             :     {
    1186         108 :         endoption = PG_GETARG_INT32(4);
    1187         108 :         if (endoption != 0 && endoption != 1)
    1188          12 :             ereport(ERROR,
    1189             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1190             :                      errmsg("invalid value for parameter \"%s\": %d",
    1191             :                             "endoption", endoption)));
    1192             :     }
    1193         132 :     if (PG_NARGS() > 6)     /* argument 5 (flags) was fetched above */
    1194             :     {
    1195          84 :         subexpr = PG_GETARG_INT32(6);
    1196          84 :         if (subexpr < 0)
    1197           6 :             ereport(ERROR,
    1198             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1199             :                      errmsg("invalid value for parameter \"%s\": %d",
    1200             :                             "subexpr", subexpr)));
    1201             :     }
    1202             : 
    1203             :     /* Determine options */
    1204         126 :     parse_re_flags(&re_flags, flags);
    1205             :     /* User mustn't specify 'g' */
    1206         126 :     if (re_flags.glob)
    1207           6 :         ereport(ERROR,
    1208             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1209             :         /* translator: %s is a SQL function name */
    1210             :                  errmsg("%s does not support the \"global\" option",
    1211             :                         "regexp_instr()")));
    1212             :     /*
                     :      * But we find all the matches anyway, since the n'th one is picked
                     :      * out of the collected locations below.
                     :      */
    1213         120 :     re_flags.glob = true;
    1214             : 
    1215             :     /* Do the matching */
    1216         120 :     matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
    1217             :                                     PG_GET_COLLATION(),
    1218             :                                     (subexpr > 0),   /* need submatches? */
    1219             :                                     false, false);
    1220             : 
    1221             :     /* When n exceeds matches return 0 (includes case of no matches) */
    1222         120 :     if (n > matchctx->nmatches)
    1223          12 :         PG_RETURN_INT32(0);
    1224             : 
    1225             :     /* When subexpr exceeds number of subexpressions return 0 */
    1226         108 :     if (subexpr > matchctx->npatterns)
    1227          12 :         PG_RETURN_INT32(0);
    1228             : 
    1229             :     /*
                     :      * Select the appropriate match position to return.  match_locs holds
                     :      * npatterns (start, end) pairs per match, so the pair index is
                     :      * doubled and endoption 1 steps from the start entry to the end entry.
                     :      */
    1230          96 :     pos = (n - 1) * matchctx->npatterns;
    1231          96 :     if (subexpr > 0)
    1232          54 :         pos += subexpr - 1;
    1233          96 :     pos *= 2;
    1234          96 :     if (endoption == 1)
    1235          30 :         pos += 1;
    1236             : 
    1237          96 :     if (matchctx->match_locs[pos] >= 0)
    1238          90 :         PG_RETURN_INT32(matchctx->match_locs[pos] + 1);
    1239             :     else
    1240           6 :         PG_RETURN_INT32(0);     /* position not identifiable */
    1241             : }
    1242             : 
    1243             : /*
                     :  * regexp_instr_no_start()
                     :  *      Entry point that hands fcinfo unchanged to regexp_instr(), which
                     :  *      uses PG_NARGS() to see which optional arguments are present.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1244             : Datum
    1245          18 : regexp_instr_no_start(PG_FUNCTION_ARGS)
    1246             : {
    1247          18 :     return regexp_instr(fcinfo);
    1248             : }
    1249             : 
    1250             : /*
                     :  * regexp_instr_no_n()
                     :  *      Entry point that hands fcinfo unchanged to regexp_instr(), which
                     :  *      uses PG_NARGS() to see which optional arguments are present.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1251             : Datum
    1252           6 : regexp_instr_no_n(PG_FUNCTION_ARGS)
    1253             : {
    1254           6 :     return regexp_instr(fcinfo);
    1255             : }
    1256             : 
    1257             : /*
                     :  * regexp_instr_no_endoption()
                     :  *      Entry point that hands fcinfo unchanged to regexp_instr(), which
                     :  *      uses PG_NARGS() to see which optional arguments are present.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1258             : Datum
    1259          24 : regexp_instr_no_endoption(PG_FUNCTION_ARGS)
    1260             : {
    1261          24 :     return regexp_instr(fcinfo);
    1262             : }
    1263             : 
    1264             : /*
                     :  * regexp_instr_no_flags()
                     :  *      Entry point that hands fcinfo unchanged to regexp_instr(), which
                     :  *      fetches the flags argument only if it exists.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1265             : Datum
    1266          12 : regexp_instr_no_flags(PG_FUNCTION_ARGS)
    1267             : {
    1268          12 :     return regexp_instr(fcinfo);
    1269             : }
    1270             : 
    1271             : /*
                     :  * regexp_instr_no_subexpr()
                     :  *      Entry point that hands fcinfo unchanged to regexp_instr(), which
                     :  *      reads subexpr only when PG_NARGS() exceeds 6.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1272             : Datum
    1273          12 : regexp_instr_no_subexpr(PG_FUNCTION_ARGS)
    1274             : {
    1275          12 :     return regexp_instr(fcinfo);
    1276             : }
    1277             : 
    1278             : /*
    1279             :  * regexp_like()
    1280             :  *      Test for a pattern match within a string.
                     :  *
                     :  * Arguments: 0 = string, 1 = pattern, 2 = flags (optional).  The 'g'
                     :  * flag is rejected with an error.  The boolean result comes straight
                     :  * from RE_compile_and_execute(), called with the compile flags that
                     :  * parse_re_flags() produced.
    1281             :  */
    1282             : Datum
    1283          30 : regexp_like(PG_FUNCTION_ARGS)
    1284             : {
    1285          30 :     text       *str = PG_GETARG_TEXT_PP(0);
    1286          30 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1287          30 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1288             :     pg_re_flags re_flags;
    1289             : 
    1290             :     /* Determine options */
    1291          30 :     parse_re_flags(&re_flags, flags);
    1292             :     /* User mustn't specify 'g' */
    1293          30 :     if (re_flags.glob)
    1294           6 :         ereport(ERROR,
    1295             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1296             :         /* translator: %s is a SQL function name */
    1297             :                  errmsg("%s does not support the \"global\" option",
    1298             :                         "regexp_like()")));
    1299             : 
    1300             :     /* Otherwise it's like textregexeq/texticregexeq */
    1301          24 :     PG_RETURN_BOOL(RE_compile_and_execute(pattern,
    1302             :                                           VARDATA_ANY(str),
    1303             :                                           VARSIZE_ANY_EXHDR(str),
    1304             :                                           re_flags.cflags,
    1305             :                                           PG_GET_COLLATION(),
    1306             :                                           0, NULL));
    1307             : }
    1308             : 
    1309             : /*
                     :  * regexp_like_no_flags()
                     :  *      Entry point that hands fcinfo unchanged to regexp_like(), which
                     :  *      fetches the flags argument only if it exists.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1310             : Datum
    1311           6 : regexp_like_no_flags(PG_FUNCTION_ARGS)
    1312             : {
    1313           6 :     return regexp_like(fcinfo);
    1314             : }
    1315             : 
    1316             : /*
    1317             :  * regexp_match()
    1318             :  *      Return the first substring(s) matching a pattern within a string.
                     :  *
                     :  * Arguments: 0 = string, 1 = pattern, 2 = flags (optional).  The 'g'
                     :  * flag is rejected with an error that points at regexp_matches.
                     :  * Returns NULL when nothing matches; otherwise returns the array that
                     :  * build_regexp_match_result() forms from the single recorded match.
    1319             :  */
    1320             : Datum
    1321        2528 : regexp_match(PG_FUNCTION_ARGS)
    1322             : {
    1323        2528 :     text       *orig_str = PG_GETARG_TEXT_PP(0);
    1324        2528 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1325        2528 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1326             :     pg_re_flags re_flags;
    1327             :     regexp_matches_ctx *matchctx;
    1328             : 
    1329             :     /* Determine options */
    1330        2528 :     parse_re_flags(&re_flags, flags);
    1331             :     /* User mustn't specify 'g' */
    1332        2528 :     if (re_flags.glob)
    1333           8 :         ereport(ERROR,
    1334             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1335             :         /* translator: %s is a SQL function name */
    1336             :                  errmsg("%s does not support the \"global\" option",
    1337             :                         "regexp_match()"),
    1338             :                  errhint("Use the regexp_matches function instead.")));
    1339             : 
    1340        2520 :     matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, 0,
    1341             :                                     PG_GET_COLLATION(), true, false, false);
    1342             : 
    1343        2520 :     if (matchctx->nmatches == 0)
    1344         130 :         PG_RETURN_NULL();
    1345             : 
    1346             :     Assert(matchctx->nmatches == 1);    /* search stops after one match when glob is off */
    1347             : 
    1348             :     /* Create workspace that build_regexp_match_result needs */
    1349        2390 :     matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
    1350        2390 :     matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
    1351             : 
    1352        2390 :     PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx)));
    1353             : }
    1354             : 
    1355             : /*
                     :  * regexp_match_no_flags()
                     :  *      Entry point that hands fcinfo unchanged to regexp_match(), which
                     :  *      fetches the flags argument only if it exists.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1356             : Datum
    1357        2498 : regexp_match_no_flags(PG_FUNCTION_ARGS)
    1358             : {
    1359        2498 :     return regexp_match(fcinfo);
    1360             : }
    1361             : 
    1362             : /*
    1363             :  * regexp_matches()
    1364             :  *      Return a table of all matches of a pattern within a string.
                     :  *
                     :  * Arguments: 0 = string, 1 = pattern, 2 = flags (optional).  On the
                     :  * first call all matching is done at once and the resulting context is
                     :  * stored in funcctx->user_fctx.  Each call then returns the array for
                     :  * match number next_match and advances it, finishing once next_match
                     :  * reaches nmatches.
    1365             :  */
    1366             : Datum
    1367        2610 : regexp_matches(PG_FUNCTION_ARGS)
    1368             : {
    1369             :     FuncCallContext *funcctx;
    1370             :     regexp_matches_ctx *matchctx;
    1371             : 
    1372        2610 :     if (SRF_IS_FIRSTCALL())
    1373             :     {
    1374        1950 :         text       *pattern = PG_GETARG_TEXT_PP(1);
    1375        1950 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1376             :         pg_re_flags re_flags;
    1377             :         MemoryContext oldcontext;
    1378             : 
    1379        1950 :         funcctx = SRF_FIRSTCALL_INIT();
    1380        1950 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1381             : 
    1382             :         /* Determine options */
    1383        1950 :         parse_re_flags(&re_flags, flags);
    1384             : 
    1385             :         /* be sure to copy the input string into the multi-call ctx */
    1386        1944 :         matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
    1387             :                                         &re_flags, 0,
    1388             :                                         PG_GET_COLLATION(),
    1389             :                                         true, false, false);
    1390             : 
    1391             :         /* Pre-create workspace that build_regexp_match_result needs */
    1392        1932 :         matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
    1393        1932 :         matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
    1394             : 
    1395        1932 :         MemoryContextSwitchTo(oldcontext);
    1396        1932 :         funcctx->user_fctx = matchctx;  /* picked up again on every call below */
    1397             :     }
    1398             : 
    1399        2592 :     funcctx = SRF_PERCALL_SETUP();
    1400        2592 :     matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
    1401             : 
    1402        2592 :     if (matchctx->next_match < matchctx->nmatches)
    1403             :     {
    1404             :         ArrayType  *result_ary;
    1405             : 
    1406         660 :         result_ary = build_regexp_match_result(matchctx);
    1407         660 :         matchctx->next_match++;
    1408         660 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
    1409             :     }
    1410             : 
    1411        1932 :     SRF_RETURN_DONE(funcctx);
    1412             : }
    1413             : 
    1414             : /*
                     :  * regexp_matches_no_flags()
                     :  *      Entry point that hands fcinfo unchanged to regexp_matches(), which
                     :  *      fetches the flags argument only if it exists.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1415             : Datum
    1416        2286 : regexp_matches_no_flags(PG_FUNCTION_ARGS)
    1417             : {
    1418        2286 :     return regexp_matches(fcinfo);
    1419             : }
    1420             : 
    1421             : /*
    1422             :  * setup_regexp_matches --- do the initial matching for regexp_match,
    1423             :  *      regexp_split, and related functions
    1424             :  *
    1425             :  * To avoid having to re-find the compiled pattern on each call, we do
    1426             :  * all the matching in one swoop.  The returned regexp_matches_ctx contains
    1427             :  * the locations of all the substrings matching the pattern.
    1428             :  *
    1429             :  * start_search: the character (not byte) offset in orig_str at which to
    1430             :  * begin the search.  Returned positions are relative to orig_str anyway.
    1431             :  * use_subpatterns: collect data about matches to parenthesized subexpressions.
    1432             :  * ignore_degenerate: ignore zero-length matches.
    1433             :  * fetching_unmatched: caller wants to fetch unmatched substrings.
    1434             :  *
    1435             :  * We don't currently assume that fetching_unmatched is exclusive of fetching
    1436             :  * the matched text too; if it's set, the conversion buffer is large enough to
    1437             :  * fetch any single matched or unmatched string, but not any larger
    1438             :  * substring.  (In practice, when splitting the matches are usually small
    1439             :  * anyway, and it didn't seem worth complicating the code further.)
                     :  *
                     :  * On return, match_locs holds npatterns (start, end) offset pairs for
                     :  * each recorded match, followed by one final entry containing wide_len,
                     :  * the length of the converted wide-character string.  The context is
                     :  * zero-filled when allocated; its elems and nulls fields are not set
                     :  * here.
    1440             :  */
    1441             : static regexp_matches_ctx *
    1442      205158 : setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
    1443             :                      int start_search,
    1444             :                      Oid collation,
    1445             :                      bool use_subpatterns,
    1446             :                      bool ignore_degenerate,
    1447             :                      bool fetching_unmatched)
    1448             : {
    1449      205158 :     regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
    1450      205158 :     int         eml = pg_database_encoding_max_length();
    1451             :     int         orig_len;
    1452             :     pg_wchar   *wide_str;
    1453             :     int         wide_len;
    1454             :     int         cflags;
    1455             :     regex_t    *cpattern;
    1456             :     regmatch_t *pmatch;
    1457             :     int         pmatch_len;
    1458             :     int         array_len;
    1459             :     int         array_idx;
    1460             :     int         prev_match_end;
    1461             :     int         prev_valid_match_end;
    1462      205158 :     int         maxlen = 0;     /* largest fetch length in characters */
    1463             : 
    1464             :     /* save original string --- we'll extract result substrings from it */
    1465      205158 :     matchctx->orig_str = orig_str;
    1466             : 
    1467             :     /* convert string to pg_wchar form for matching */
    1468      205158 :     orig_len = VARSIZE_ANY_EXHDR(orig_str);
    1469      205158 :     wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
    1470      205158 :     wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
    1471             : 
    1472             :     /* set up the compiled pattern */
    1473      205158 :     cflags = re_flags->cflags;
    1474      205158 :     if (!use_subpatterns)
    1475      200592 :         cflags |= REG_NOSUB;
    1476      205158 :     cpattern = RE_compile_and_cache(pattern, cflags, collation);
    1477             : 
    1478             :     /*
                     :      * do we want to remember subpatterns?  If not, or if the pattern has
                     :      * none, each match is recorded as a single pair for the whole match.
                     :      */
    1479      205146 :     if (use_subpatterns && cpattern->re_nsub > 0)
    1480             :     {
    1481        2682 :         matchctx->npatterns = cpattern->re_nsub;
    1482        2682 :         pmatch_len = cpattern->re_nsub + 1;
    1483             :     }
    1484             :     else
    1485             :     {
    1486      202464 :         use_subpatterns = false;
    1487      202464 :         matchctx->npatterns = 1;
    1488      202464 :         pmatch_len = 1;
    1489             :     }
    1490             : 
    1491             :     /* temporary output space for RE package */
    1492      205146 :     pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
    1493             : 
    1494             :     /*
    1495             :      * the real output space (grown dynamically if needed)
    1496             :      *
    1497             :      * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
    1498             :      * than at 2^27
    1499             :      */
    1500      205146 :     array_len = re_flags->glob ? 255 : 31;
    1501      205146 :     matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
    1502      205146 :     array_idx = 0;
    1503             : 
    1504             :     /* search for the pattern, perhaps repeatedly */
    1505      205146 :     prev_match_end = 0;
    1506      205146 :     prev_valid_match_end = 0;
    1507     1096958 :     while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
    1508             :                             pmatch_len, pmatch))
    1509             :     {
    1510             :         /*
    1511             :          * If requested, ignore degenerate matches, which are zero-length
    1512             :          * matches occurring at the start or end of a string or just after a
    1513             :          * previous match.
    1514             :          */
    1515      894708 :         if (!ignore_degenerate ||
    1516      891262 :             (pmatch[0].rm_so < wide_len &&
    1517      891220 :              pmatch[0].rm_eo > prev_match_end))
    1518             :         {
    1519             :             /*
                     :              * enlarge output space if needed; the extra +1 keeps room
                     :              * for the end-of-string entry stored after the loop
                     :              */
    1520      894888 :             while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
    1521             :             {
    1522         360 :                 array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
    1523         360 :                 if (array_len > MaxAllocSize / sizeof(int))
    1524           0 :                     ereport(ERROR,
    1525             :                             (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    1526             :                              errmsg("too many regular expression matches")));
    1527         360 :                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
    1528             :                                                         sizeof(int) * array_len);
    1529             :             }
    1530             : 
    1531             :             /*
                     :              * save this match's locations: one pair per subexpression
                     :              * (pmatch[1..npatterns]) when tracking subpatterns, else the
                     :              * whole-match pair from pmatch[0]
                     :              */
    1532      894528 :             if (use_subpatterns)
    1533             :             {
    1534             :                 int         i;
    1535             : 
    1536        7908 :                 for (i = 1; i <= matchctx->npatterns; i++)
    1537             :                 {
    1538        5338 :                     int         so = pmatch[i].rm_so;
    1539        5338 :                     int         eo = pmatch[i].rm_eo;
    1540             : 
    1541        5338 :                     matchctx->match_locs[array_idx++] = so;
    1542        5338 :                     matchctx->match_locs[array_idx++] = eo;
    1543        5338 :                     if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
    1544        3432 :                         maxlen = (eo - so);
    1545             :                 }
    1546             :             }
    1547             :             else
    1548             :             {
    1549      891958 :                 int         so = pmatch[0].rm_so;
    1550      891958 :                 int         eo = pmatch[0].rm_eo;
    1551             : 
    1552      891958 :                 matchctx->match_locs[array_idx++] = so;
    1553      891958 :                 matchctx->match_locs[array_idx++] = eo;
    1554      891958 :                 if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
    1555      200906 :                     maxlen = (eo - so);
    1556             :             }
    1557      894528 :             matchctx->nmatches++;
    1558             : 
    1559             :             /*
    1560             :              * check length of unmatched portion between end of previous valid
    1561             :              * (nondegenerate, or degenerate but not ignored) match and start
    1562             :              * of current one
    1563             :              */
    1564      894528 :             if (fetching_unmatched &&
    1565      891082 :                 pmatch[0].rm_so >= 0 &&
    1566      891082 :                 (pmatch[0].rm_so - prev_valid_match_end) > maxlen)
    1567      380858 :                 maxlen = (pmatch[0].rm_so - prev_valid_match_end);
    1568      894528 :             prev_valid_match_end = pmatch[0].rm_eo;
    1569             :         }
    1570      894708 :         prev_match_end = pmatch[0].rm_eo;   /* updated even for ignored matches */
    1571             : 
    1572             :         /* if not glob, stop after one match */
    1573      894708 :         if (!re_flags->glob)
    1574        2830 :             break;
    1575             : 
    1576             :         /*
    1577             :          * Advance search position.  Normally we start the next search at the
    1578             :          * end of the previous match; but if the match was of zero length, we
    1579             :          * have to advance by one character, or we'd just find the same match
    1580             :          * again.
    1581             :          */
    1582      891878 :         start_search = prev_match_end;
    1583      891878 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    1584        1176 :             start_search++;
    1585      891878 :         if (start_search > wide_len)
    1586          66 :             break;
    1587             :     }
    1588             : 
    1589             :     /*
    1590             :      * check length of unmatched portion between end of last match and end of
    1591             :      * input string
    1592             :      */
    1593      205146 :     if (fetching_unmatched &&
    1594      200454 :         (wide_len - prev_valid_match_end) > maxlen)
    1595          74 :         maxlen = (wide_len - prev_valid_match_end);
    1596             : 
    1597             :     /*
    1598             :      * Keep a note of the end position of the string for the benefit of
    1599             :      * splitting code.
    1600             :      */
    1601      205146 :     matchctx->match_locs[array_idx] = wide_len;
    1602             : 
    1603      205146 :     if (eml > 1)
    1604             :     {
    1605      205146 :         int64       maxsiz = eml * (int64) maxlen;
    1606             :         int         conv_bufsiz;
    1607             : 
    1608             :         /*
    1609             :          * Make the conversion buffer large enough for any substring of
    1610             :          * interest.
    1611             :          *
    1612             :          * Worst case: assume we need the maximum size (maxlen*eml), but take
    1613             :          * advantage of the fact that the original string length in bytes is
    1614             :          * an upper bound on the byte length of any fetched substring (and we
    1615             :          * know that len+1 is safe to allocate because the varlena header is
    1616             :          * longer than 1 byte).
    1617             :          */
    1618      205146 :         if (maxsiz > orig_len)
    1619      200820 :             conv_bufsiz = orig_len + 1;
    1620             :         else
    1621        4326 :             conv_bufsiz = maxsiz + 1;   /* safe since maxsiz < 2^30 */
    1622             : 
    1623      205146 :         matchctx->conv_buf = palloc(conv_bufsiz);
    1624      205146 :         matchctx->conv_bufsiz = conv_bufsiz;
    1625      205146 :         matchctx->wide_str = wide_str;
    1626             :     }
    1627             :     else
    1628             :     {
    1629             :         /* No need to keep the wide string if we're in a single-byte charset. */
    1630           0 :         pfree(wide_str);
    1631           0 :         matchctx->wide_str = NULL;
    1632           0 :         matchctx->conv_buf = NULL;
    1633           0 :         matchctx->conv_bufsiz = 0;
    1634             :     }
    1635             : 
    1636             :     /* Clean up temp storage */
    1637      205146 :     pfree(pmatch);
    1638             : 
    1639      205146 :     return matchctx;
    1640             : }
    1641             : 
    1642             : /*
    1643             :  * build_regexp_match_result - build output array for current match
                     :  *
                     :  * The current match is the one numbered matchctx->next_match; this
                     :  * function does not advance that counter.  The elems and nulls arrays
                     :  * in matchctx are used as workspace and must already have room for
                     :  * npatterns entries.  A pair with a negative start or end offset
                     :  * becomes a NULL array element.  Substrings are converted from
                     :  * wide_str through conv_buf when conv_buf is set, and otherwise
                     :  * extracted from orig_str with text_substr.
    1644             :  */
    1645             : static ArrayType *
    1646        3050 : build_regexp_match_result(regexp_matches_ctx *matchctx)
    1647             : {
    1648        3050 :     char       *buf = matchctx->conv_buf;
    1649        3050 :     Datum      *elems = matchctx->elems;
    1650        3050 :     bool       *nulls = matchctx->nulls;
    1651             :     int         dims[1];
    1652             :     int         lbs[1];
    1653             :     int         loc;
    1654             :     int         i;
    1655             : 
    1656             :     /*
                     :      * Extract matching substrings from the original string; loc indexes
                     :      * the first (start, end) pair belonging to the current match
                     :      */
    1657        3050 :     loc = matchctx->next_match * matchctx->npatterns * 2;
    1658        8598 :     for (i = 0; i < matchctx->npatterns; i++)
    1659             :     {
    1660        5548 :         int         so = matchctx->match_locs[loc++];
    1661        5548 :         int         eo = matchctx->match_locs[loc++];
    1662             : 
    1663        5548 :         if (so < 0 || eo < 0)
    1664             :         {
    1665           6 :             elems[i] = (Datum) 0;
    1666           6 :             nulls[i] = true;
    1667             :         }
    1668        5542 :         else if (buf)
    1669             :         {
    1670        5542 :             int         len = pg_wchar2mb_with_len(matchctx->wide_str + so,
    1671             :                                                    buf,
    1672             :                                                    eo - so);
    1673             : 
    1674             :             Assert(len < matchctx->conv_bufsiz);
    1675        5542 :             elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
    1676        5542 :             nulls[i] = false;
    1677             :         }
    1678             :         else
    1679             :         {
    1680           0 :             elems[i] = DirectFunctionCall3(text_substr,
    1681             :                                            PointerGetDatum(matchctx->orig_str),
    1682             :                                            Int32GetDatum(so + 1),
    1683             :                                            Int32GetDatum(eo - so));
    1684           0 :             nulls[i] = false;
    1685             :         }
    1686             :     }
    1687             : 
    1688             :     /* And form an array */
    1689        3050 :     dims[0] = matchctx->npatterns;
    1690        3050 :     lbs[0] = 1;
    1691             :     /* XXX: this hardcodes assumptions about the text type */
    1692        3050 :     return construct_md_array(elems, nulls, 1, dims, lbs,
    1693             :                               TEXTOID, -1, false, TYPALIGN_INT);
    1694             : }
    1695             : 
    1696             : /*
    1697             :  * regexp_split_to_table()
    1698             :  *      Split the string at matches of the pattern, returning the
    1699             :  *      split-out substrings as a table.
                     :  *
                     :  * Arguments: 0 = input string, 1 = pattern, 2 = flags (optional; fetched
                     :  * with PG_GETARG_TEXT_PP_IF_EXISTS).
                     :  *
                     :  * This is a set-returning function.  On the first call it parses the
                     :  * flags, rejects an explicit 'g' flag with ERRCODE_INVALID_PARAMETER_VALUE,
                     :  * then forces re_flags.glob on and runs setup_regexp_matches() while
                     :  * funcctx->multi_call_memory_ctx is the current memory context.  The
                     :  * resulting context is stashed in funcctx->user_fctx for later calls.
                     :  *
                     :  * Every call (including the first) returns one piece built by
                     :  * build_regexp_split_result() for as long as next_match <= nmatches, and
                     :  * advances next_match afterwards.  The comparison is "<=" rather than "<"
                     :  * because the text following the last match is also a piece.
                     :  * NOTE(review): this yields nmatches + 1 rows provided next_match starts
                     :  * at zero; that initialization happens in setup_regexp_matches(), which
                     :  * is not visible here -- confirm.
    1700             :  */
    1701             : Datum
    1702         622 : regexp_split_to_table(PG_FUNCTION_ARGS)
    1703             : {
    1704             :     FuncCallContext *funcctx;
    1705             :     regexp_matches_ctx *splitctx;
    1706             : 
    1707         622 :     if (SRF_IS_FIRSTCALL())
    1708             :     {
    1709          52 :         text       *pattern = PG_GETARG_TEXT_PP(1);
    1710          52 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1711             :         pg_re_flags re_flags;
    1712             :         MemoryContext oldcontext;
    1713             : 
    1714          52 :         funcctx = SRF_FIRSTCALL_INIT();
    1715          52 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1716             : 
    1717             :         /* Determine options */
    1718          52 :         parse_re_flags(&re_flags, flags);
    1719             :         /* User mustn't specify 'g' */
    1720          46 :         if (re_flags.glob)
    1721           6 :             ereport(ERROR,
    1722             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1723             :             /* translator: %s is a SQL function name */
    1724             :                      errmsg("%s does not support the \"global\" option",
    1725             :                             "regexp_split_to_table()")));
    1726             :         /* But we find all the matches anyway, since every match is a split point */
    1727          40 :         re_flags.glob = true;
    1728             : 
    1729             :         /* be sure to copy the input string into the multi-call ctx */
    1730          40 :         splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
    1731             :                                         &re_flags, 0,
    1732             :                                         PG_GET_COLLATION(),
    1733             :                                         false, true, true);
    1734             : 
    1735          40 :         MemoryContextSwitchTo(oldcontext);
    1736          40 :         funcctx->user_fctx = splitctx;
    1737             :     }
    1738             : 
    1739         610 :     funcctx = SRF_PERCALL_SETUP();
    1740         610 :     splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
    1741             : 
    1742         610 :     if (splitctx->next_match <= splitctx->nmatches)
    1743             :     {
    1744         570 :         Datum       result = build_regexp_split_result(splitctx);
    1745             : 
    1746         570 :         splitctx->next_match++;
    1747         570 :         SRF_RETURN_NEXT(funcctx, result);
    1748             :     }
    1749             : 
    1750          40 :     SRF_RETURN_DONE(funcctx);
    1751             : }
    1752             : 
    1753             : /*
                     :  * regexp_split_to_table_no_flags()
                     :  *      Entry point that passes fcinfo through unchanged to
                     :  *      regexp_split_to_table() and returns its result.  The callee reads
                     :  *      its flags argument via PG_GETARG_TEXT_PP_IF_EXISTS(2), so it is
                     :  *      presumably safe to reach it with only two arguments -- confirm
                     :  *      against that macro's definition.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1754             : Datum
    1755         552 : regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
    1756             : {
    1757         552 :     return regexp_split_to_table(fcinfo);
    1758             : }
    1759             : 
    1760             : /*
    1761             :  * regexp_split_to_array()
    1762             :  *      Split the string at matches of the pattern, returning the
    1763             :  *      split-out substrings as an array.
                     :  *
                     :  * Arguments: 0 = input string, 1 = pattern, 2 = flags (optional; fetched
                     :  * with PG_GETARG_TEXT_PP_IF_EXISTS).
                     :  *
                     :  * An explicit 'g' flag is rejected with ERRCODE_INVALID_PARAMETER_VALUE;
                     :  * re_flags.glob is then forced on before calling setup_regexp_matches().
                     :  *
                     :  * Unlike regexp_split_to_table(), all pieces are produced in one call:
                     :  * the loop runs while next_match <= nmatches, appending each piece from
                     :  * build_regexp_split_result() to an ArrayBuildState with element type
                     :  * TEXTOID (the "false" passed to accumArrayResult marks the element as
                     :  * not null).  The "<=" covers the text after the last match.  The array
                     :  * is built in CurrentMemoryContext.
    1764             :  */
    1765             : Datum
    1766      200426 : regexp_split_to_array(PG_FUNCTION_ARGS)
    1767             : {
    1768      200426 :     ArrayBuildState *astate = NULL;
    1769             :     pg_re_flags re_flags;
    1770             :     regexp_matches_ctx *splitctx;
    1771             : 
    1772             :     /* Determine options */
    1773      200426 :     parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2));
    1774             :     /* User mustn't specify 'g' */
    1775      200420 :     if (re_flags.glob)
    1776           6 :         ereport(ERROR,
    1777             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1778             :         /* translator: %s is a SQL function name */
    1779             :                  errmsg("%s does not support the \"global\" option",
    1780             :                         "regexp_split_to_array()")));
    1781             :     /* But we find all the matches anyway, since every match is a split point */
    1782      200414 :     re_flags.glob = true;
    1783             : 
    1784      200414 :     splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
    1785      200414 :                                     PG_GETARG_TEXT_PP(1),
    1786             :                                     &re_flags, 0,
    1787             :                                     PG_GET_COLLATION(),
    1788             :                                     false, true, true);
    1789             : 
    1790     1291380 :     while (splitctx->next_match <= splitctx->nmatches)
    1791             :     {
    1792     1090966 :         astate = accumArrayResult(astate,
    1793             :                                   build_regexp_split_result(splitctx),
    1794             :                                   false,
    1795             :                                   TEXTOID,
    1796             :                                   CurrentMemoryContext);
    1797     1090966 :         splitctx->next_match++;
    1798             :     }
    1799             : 
    1800      200414 :     PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
    1801             : }
    1802             : 
    1803             : /*
                     :  * regexp_split_to_array_no_flags()
                     :  *      Entry point that passes fcinfo through unchanged to
                     :  *      regexp_split_to_array() and returns its result.  The callee reads
                     :  *      its flags argument via PG_GETARG_TEXT_PP_IF_EXISTS(2), so it is
                     :  *      presumably safe to reach it with only two arguments -- confirm
                     :  *      against that macro's definition.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1804             : Datum
    1805      200384 : regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
    1806             : {
    1807      200384 :     return regexp_split_to_array(fcinfo);
    1808             : }
    1809             : 
    1810             : /*
    1811             :  * build_regexp_split_result - build output string for current match
    1812             :  *
    1813             :  * We return the string between the current match and the previous one,
    1814             :  * or the string after the last match when next_match == nmatches.
                     :  *
                     :  * The piece runs from startpos to endpos, both taken from match_locs:
                     :  *   startpos = match_locs[next_match * 2 - 1], the end of the previous
                     :  *              match, or 0 when next_match is 0 (no previous match);
                     :  *   endpos   = match_locs[next_match * 2], the start of the current match.
                     :  * NOTE(review): for next_match == nmatches this reads one entry past the
                     :  * last real match; presumably setup_regexp_matches() stores an
                     :  * end-of-string position there -- confirm, it is not visible here.
                     :  *
                     :  * A negative startpos, or an endpos that lies before startpos, is
                     :  * reported with elog(ERROR).
                     :  *
                     :  * Positions index into wide_str, i.e. they count pg_wchar elements.
                     :  * If the context has a conversion buffer (conv_buf), the wide characters
                     :  * in [startpos, endpos) are converted into it and the result is wrapped
                     :  * as a text datum; the Assert checks the converted length against
                     :  * conv_bufsiz.  Otherwise the piece is cut out of orig_str with
                     :  * text_substr, whose start argument is 1-based (hence startpos + 1).
    1815             :  */
    1816             : static Datum
    1817     1091536 : build_regexp_split_result(regexp_matches_ctx *splitctx)
    1818             : {
    1819     1091536 :     char       *buf = splitctx->conv_buf;
    1820             :     int         startpos;
    1821             :     int         endpos;
    1822             : 
    1823     1091536 :     if (splitctx->next_match > 0)
    1824      891082 :         startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
    1825             :     else
    1826      200454 :         startpos = 0;
    1827     1091536 :     if (startpos < 0)
    1828           0 :         elog(ERROR, "invalid match ending position");
    1829             : 
    1830     1091536 :     endpos = splitctx->match_locs[splitctx->next_match * 2];
    1831     1091536 :     if (endpos < startpos)
    1832           0 :         elog(ERROR, "invalid match starting position");
    1833             : 
    1834     1091536 :     if (buf)
    1835             :     {
    1836             :         int         len;
    1837             : 
    1838     1091536 :         len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
    1839             :                                    buf,
    1840             :                                    endpos - startpos);
    1841             :         Assert(len < splitctx->conv_bufsiz);
    1842     1091536 :         return PointerGetDatum(cstring_to_text_with_len(buf, len));
    1843             :     }
    1844             :     else
    1845             :     {
    1846           0 :         return DirectFunctionCall3(text_substr,
    1847             :                                    PointerGetDatum(splitctx->orig_str),
    1848             :                                    Int32GetDatum(startpos + 1),
    1849             :                                    Int32GetDatum(endpos - startpos));
    1850             :     }
    1851             : }
    1852             : 
    1853             : /*
    1854             :  * regexp_substr()
    1855             :  *      Return the substring that matches a regular expression pattern
                     :  *
                     :  * Arguments (those past the second are read only when PG_NARGS() says
                     :  * they were passed):
                     :  *      0  str      string to search
                     :  *      1  pattern  regular expression
                     :  *      2  start    1-based search start position; default 1, must be > 0
                     :  *      3  n        which match to return; default 1, must be > 0
                     :  *      4  flags    option letters; 'g' is rejected
                     :  *      5  subexpr  parenthesized subexpression to return, or 0 for the
                     :  *                  whole match; default 0, must be >= 0
                     :  * An out-of-range start, n or subexpr, and an explicit 'g' flag, all
                     :  * raise ERRCODE_INVALID_PARAMETER_VALUE.
                     :  *
                     :  * Returns SQL NULL when n exceeds the number of matches found, when
                     :  * subexpr exceeds matchctx->npatterns, or when either recorded offset of
                     :  * the selected (sub)match is negative.
                     :  *
                     :  * match_locs is indexed here as npatterns (start, end) pairs per match:
                     :  * the pair index is (n - 1) * npatterns, plus subexpr - 1 when a
                     :  * subexpression was requested, and is doubled to get the array offset.
                     :  * NOTE(review): with subexpr == 0 submatches are not requested, so
                     :  * npatterns is presumably 1 and the pair describes the whole match --
                     :  * confirm in setup_regexp_matches(), which is not visible here.
    1856             :  */
    1857             : Datum
    1858         108 : regexp_substr(PG_FUNCTION_ARGS)
    1859             : {
    1860         108 :     text       *str = PG_GETARG_TEXT_PP(0);
    1861         108 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1862         108 :     int         start = 1;
    1863         108 :     int         n = 1;
    1864         108 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(4);
    1865         108 :     int         subexpr = 0;
    1866             :     int         so,
    1867             :                 eo,
    1868             :                 pos;
    1869             :     pg_re_flags re_flags;
    1870             :     regexp_matches_ctx *matchctx;
    1871             : 
    1872             :     /* Collect optional parameters */
    1873         108 :     if (PG_NARGS() > 2)
    1874             :     {
    1875          90 :         start = PG_GETARG_INT32(2);
    1876          90 :         if (start <= 0)
    1877           6 :             ereport(ERROR,
    1878             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1879             :                      errmsg("invalid value for parameter \"%s\": %d",
    1880             :                             "start", start)));
    1881             :     }
    1882         102 :     if (PG_NARGS() > 3)
    1883             :     {
    1884          78 :         n = PG_GETARG_INT32(3);
    1885          78 :         if (n <= 0)
    1886           6 :             ereport(ERROR,
    1887             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1888             :                      errmsg("invalid value for parameter \"%s\": %d",
    1889             :                             "n", n)));
    1890             :     }
    1891          96 :     if (PG_NARGS() > 5)
    1892             :     {
    1893          48 :         subexpr = PG_GETARG_INT32(5);
    1894          48 :         if (subexpr < 0)
    1895           6 :             ereport(ERROR,
    1896             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1897             :                      errmsg("invalid value for parameter \"%s\": %d",
    1898             :                             "subexpr", subexpr)));
    1899             :     }
    1900             : 
    1901             :     /* Determine options */
    1902          90 :     parse_re_flags(&re_flags, flags);
    1903             :     /* User mustn't specify 'g' */
    1904          90 :     if (re_flags.glob)
    1905           6 :         ereport(ERROR,
    1906             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1907             :         /* translator: %s is a SQL function name */
    1908             :                  errmsg("%s does not support the \"global\" option",
    1909             :                         "regexp_substr()")));
    1910             :     /* But we find all the matches anyway, so that the n'th one can be picked */
    1911          84 :     re_flags.glob = true;
    1912             : 
    1913             :     /* Do the matching; the search begins at the 0-based offset start - 1 */
    1914          84 :     matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
    1915             :                                     PG_GET_COLLATION(),
    1916             :                                     (subexpr > 0),   /* need submatches? */
    1917             :                                     false, false);
    1918             : 
    1919             :     /* When n exceeds matches return NULL (includes case of no matches) */
    1920          84 :     if (n > matchctx->nmatches)
    1921          12 :         PG_RETURN_NULL();
    1922             : 
    1923             :     /* When subexpr exceeds number of subexpressions return NULL */
    1924          72 :     if (subexpr > matchctx->npatterns)
    1925           6 :         PG_RETURN_NULL();
    1926             : 
    1927             :     /* Select the appropriate match position to return */
    1928          66 :     pos = (n - 1) * matchctx->npatterns;
    1929          66 :     if (subexpr > 0)
    1930          30 :         pos += subexpr - 1;
    1931          66 :     pos *= 2;
    1932          66 :     so = matchctx->match_locs[pos];
    1933          66 :     eo = matchctx->match_locs[pos + 1];
    1934             : 
    1935          66 :     if (so < 0 || eo < 0)
    1936           6 :         PG_RETURN_NULL();       /* unidentifiable location */
    1937             : 
    1938          60 :     PG_RETURN_DATUM(DirectFunctionCall3(text_substr,
    1939             :                                         PointerGetDatum(matchctx->orig_str),
    1940             :                                         Int32GetDatum(so + 1),
    1941             :                                         Int32GetDatum(eo - so)));
    1942             : }
    1943             : 
    1944             : /*
                     :  * regexp_substr_no_start()
                     :  *      Entry point that passes fcinfo through unchanged to regexp_substr()
                     :  *      and returns its result.  regexp_substr() checks PG_NARGS() before
                     :  *      reading each optional argument, so arguments that were not passed
                     :  *      keep their defaults there.
                     :  *      NOTE(review): presumably the SQL form without "start" -- confirm
                     :  *      against the catalog entry.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1945             : Datum
    1946          18 : regexp_substr_no_start(PG_FUNCTION_ARGS)
    1947             : {
    1948          18 :     return regexp_substr(fcinfo);
    1949             : }
    1950             : 
    1951             : /*
                     :  * regexp_substr_no_n()
                     :  *      Entry point that passes fcinfo through unchanged to regexp_substr()
                     :  *      and returns its result.  regexp_substr() checks PG_NARGS() before
                     :  *      reading each optional argument, so arguments that were not passed
                     :  *      keep their defaults there.
                     :  *      NOTE(review): presumably the SQL form without "n" -- confirm
                     :  *      against the catalog entry.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1952             : Datum
    1953           6 : regexp_substr_no_n(PG_FUNCTION_ARGS)
    1954             : {
    1955           6 :     return regexp_substr(fcinfo);
    1956             : }
    1957             : 
    1958             : /*
                     :  * regexp_substr_no_flags()
                     :  *      Entry point that passes fcinfo through unchanged to regexp_substr()
                     :  *      and returns its result.  regexp_substr() fetches flags with
                     :  *      PG_GETARG_TEXT_PP_IF_EXISTS(4) and checks PG_NARGS() before reading
                     :  *      the other optional arguments.
                     :  *      NOTE(review): presumably the SQL form without "flags" -- confirm
                     :  *      against the catalog entry.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1959             : Datum
    1960          24 : regexp_substr_no_flags(PG_FUNCTION_ARGS)
    1961             : {
    1962          24 :     return regexp_substr(fcinfo);
    1963             : }
    1964             : 
    1965             : /*
                     :  * regexp_substr_no_subexpr()
                     :  *      Entry point that passes fcinfo through unchanged to regexp_substr()
                     :  *      and returns its result.  regexp_substr() only reads subexpr when
                     :  *      PG_NARGS() > 5 and otherwise leaves it at 0.
                     :  *      NOTE(review): presumably the SQL form without "subexpr" -- confirm
                     :  *      against the catalog entry.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining
                     :  */
    1966             : Datum
    1967          12 : regexp_substr_no_subexpr(PG_FUNCTION_ARGS)
    1968             : {
    1969          12 :     return regexp_substr(fcinfo);
    1970             : }
    1971             : 
    1972             : /*
    1973             :  * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
    1974             :  *
    1975             :  * The result is NULL if there is no fixed prefix, else a palloc'd string.
    1976             :  * If it is an exact match, not just a prefix, *exact is returned as true.
                     :  *
                     :  * Parameters:
                     :  *      text_re           the regular expression
                     :  *      case_insensitive  adds REG_ICASE to the compile flags when true
                     :  *      collation         passed on to RE_compile_and_cache()
                     :  *      exact             output flag; set to false on entry and to true
                     :  *                        only when pg_regprefix() reports REG_EXACT
                     :  *
                     :  * The regexp is compiled with REG_ADVANCED | REG_NOSUB (plus REG_ICASE if
                     :  * requested) and handed to pg_regprefix().  REG_NOMATCH yields NULL;
                     :  * REG_PREFIX and REG_EXACT fall through to the conversion step; any other
                     :  * result code is turned into an ERRCODE_INVALID_REGULAR_EXPRESSION error
                     :  * carrying the pg_regerror() text.
                     :  *
                     :  * The pg_wchar prefix returned by pg_regprefix() is converted into a
                     :  * buffer of pg_database_encoding_max_length() * slen + 1 bytes and then
                     :  * pfree'd.  NOTE(review): the extra byte is presumably for a terminating
                     :  * NUL written by pg_wchar2mb_with_len() -- confirm.
    1977             :  */
    1978             : char *
    1979       16304 : regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
    1980             :                     bool *exact)
    1981             : {
    1982             :     char       *result;
    1983             :     regex_t    *re;
    1984             :     int         cflags;
    1985             :     int         re_result;
    1986             :     pg_wchar   *str;
    1987             :     size_t      slen;
    1988             :     size_t      maxlen;
    1989             :     char        errMsg[100];
    1990             : 
    1991       16304 :     *exact = false;             /* default result */
    1992             : 
    1993             :     /* Compile RE */
    1994       16304 :     cflags = REG_ADVANCED;
    1995       16304 :     if (case_insensitive)
    1996          62 :         cflags |= REG_ICASE;
    1997             : 
    1998       16304 :     re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
    1999             : 
    2000             :     /* Examine it to see if there's a fixed prefix */
    2001       16280 :     re_result = pg_regprefix(re, &str, &slen);
    2002             : 
    2003       16280 :     switch (re_result)
    2004             :     {
    2005         764 :         case REG_NOMATCH:
    2006         764 :             return NULL;
    2007             : 
    2008        2782 :         case REG_PREFIX:
    2009             :             /* continue with wchar conversion */
    2010        2782 :             break;
    2011             : 
    2012       12734 :         case REG_EXACT:
    2013       12734 :             *exact = true;
    2014             :             /* continue with wchar conversion */
    2015       12734 :             break;
    2016             : 
    2017           0 :         default:
    2018             :             /* re failed??? */
    2019           0 :             pg_regerror(re_result, re, errMsg, sizeof(errMsg));
    2020           0 :             ereport(ERROR,
    2021             :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    2022             :                      errmsg("regular expression failed: %s", errMsg)));
    2023             :             break;
    2024             :     }
    2025             : 
    2026             :     /* Convert pg_wchar result back to database encoding */
    2027       15516 :     maxlen = pg_database_encoding_max_length() * slen + 1;
    2028       15516 :     result = (char *) palloc(maxlen);
    2029       15516 :     slen = pg_wchar2mb_with_len(str, result, slen);
    2030             :     Assert(slen < maxlen);
    2031             : 
    2032       15516 :     pfree(str);
    2033             : 
    2034       15516 :     return result;
    2035             : }

Generated by: LCOV version 1.16