LCOV - PostgreSQL 19devel - src/backend/utils/adt/regexp.c

LCOV - code coverage report

Current view:	top level - src/backend/utils/adt - regexp.c (source / functions)		Coverage	Total	Hit
Test:	PostgreSQL 19devel	Lines:	91.9 %	655	602
Test Date:	2026-02-27 05:14:50	Functions:	98.0 %	51	50
Legend:	Lines: hit not hit

            Line data    Source code

       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * regexp.c
       4              :  *    Postgres' interface to the regular expression package.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/utils/adt/regexp.c
      12              :  *
      13              :  *      Alistair Crooks added the code for the regex caching
      14              :  *      agc - cached the regular expressions used - there's a good chance
      15              :  *      that we'll get a hit, so this saves a compile step for every
      16              :  *      attempted match. I haven't actually measured the speed improvement,
      17              :  *      but it `looks' a lot quicker visually when watching regression
      18              :  *      test output.
      19              :  *
      20              :  *      agc - incorporated Keith Bostic's Berkeley regex code into
      21              :  *      the tree for all ports. To distinguish this regex code from any that
      22              :  *      is existent on a platform, I've prepended the string "pg_" to
      23              :  *      the functions regcomp, regerror, regexec and regfree.
      24              :  *      Fixed a bug that was originally a typo by me, where `i' was used
      25              :  *      instead of `oldest' when compiling regular expressions - benign
      26              :  *      results mostly, although occasionally it bit you...
      27              :  *
      28              :  *-------------------------------------------------------------------------
      29              :  */
      30              : #include "postgres.h"
      31              : 
      32              : #include "catalog/pg_type.h"
      33              : #include "funcapi.h"
      34              : #include "regex/regex.h"
      35              : #include "utils/array.h"
      36              : #include "utils/builtins.h"
      37              : #include "utils/memutils.h"
      38              : #include "utils/varlena.h"
      39              : 
      40              : #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
      41              :     (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
      42              : 
      43              : 
      44              : /* all the options of interest for regex functions */
      45              : typedef struct pg_re_flags
      46              : {
      47              :     int         cflags;         /* compile flags for Spencer's regex code */
      48              :     bool        glob;           /* do it globally (for each occurrence) */
      49              : } pg_re_flags;
      50              : 
      51              : /* cross-call state for regexp_match and regexp_split functions */
      52              : typedef struct regexp_matches_ctx
      53              : {
      54              :     text       *orig_str;       /* data string in original TEXT form */
      55              :     int         nmatches;       /* number of places where pattern matched */
      56              :     int         npatterns;      /* number of capturing subpatterns */
      57              :     /* We store start char index and end+1 char index for each match */
      58              :     /* so the number of entries in match_locs is nmatches * npatterns * 2 */
      59              :     int        *match_locs;     /* 0-based character indexes */
      60              :     int         next_match;     /* 0-based index of next match to process */
      61              :     /* workspace for build_regexp_match_result() */
      62              :     Datum      *elems;          /* has npatterns elements */
      63              :     bool       *nulls;          /* has npatterns elements */
      64              :     pg_wchar   *wide_str;       /* wide-char version of original string */
      65              :     char       *conv_buf;       /* conversion buffer, if needed */
      66              :     int         conv_bufsiz;    /* size thereof */
      67              : } regexp_matches_ctx;
      68              : 
      69              : /*
      70              :  * We cache precompiled regular expressions using a "self organizing list"
      71              :  * structure, in which recently-used items tend to be near the front.
      72              :  * Whenever we use an entry, it's moved up to the front of the list.
      73              :  * Over time, an item's average position corresponds to its frequency of use.
      74              :  *
      75              :  * When we first create an entry, it's inserted at the front of
      76              :  * the array, dropping the entry at the end of the array if necessary to
      77              :  * make room.  (This might seem to be weighting the new entry too heavily,
      78              :  * but if we insert new entries further back, we'll be unable to adjust to
      79              :  * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
      80              :  * never-before-seen items used circularly.  We ought to be able to handle
      81              :  * that case, so we have to insert at the front.)
      82              :  *
      83              :  * Knuth mentions a variant strategy in which a used item is moved up just
      84              :  * one place in the list.  Although he says this uses fewer comparisons on
      85              :  * average, it seems not to adapt very well to the situation where you have
      86              :  * both some reusable patterns and a steady stream of non-reusable patterns.
      87              :  * A reusable pattern that isn't used at least as often as non-reusable
      88              :  * patterns are seen will "fail to keep up" and will drop off the end of the
      89              :  * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
      90              :  * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
      91              :  */
      92              : 
      93              : /* this is the maximum number of cached regular expressions */
      94              : #ifndef MAX_CACHED_RES
      95              : #define MAX_CACHED_RES  32
      96              : #endif
      97              : 
      98              : /* A parent memory context for regular expressions. */
      99              : static MemoryContext RegexpCacheMemoryContext;
     100              : 
     101              : /* this structure describes one cached regular expression */
     102              : typedef struct cached_re_str
     103              : {
     104              :     MemoryContext cre_context;  /* memory context for this regexp */
     105              :     char       *cre_pat;        /* original RE (not null terminated!) */
     106              :     int         cre_pat_len;    /* length of original RE, in bytes */
     107              :     int         cre_flags;      /* compile flags: extended,icase etc */
     108              :     Oid         cre_collation;  /* collation to use */
     109              :     regex_t     cre_re;         /* the compiled regular expression */
     110              : } cached_re_str;
     111              : 
     112              : static int  num_res = 0;        /* # of cached re's */
     113              : static cached_re_str re_array[MAX_CACHED_RES];  /* cached re's */
     114              : 
     115              : 
     116              : /* Local functions */
     117              : static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
     118              :                                                 pg_re_flags *re_flags,
     119              :                                                 int start_search,
     120              :                                                 Oid collation,
     121              :                                                 bool use_subpatterns,
     122              :                                                 bool ignore_degenerate,
     123              :                                                 bool fetching_unmatched);
     124              : static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
     125              : static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
     126              : 
     127              : 
     128              : /*
     129              :  * RE_compile_and_cache - compile a RE, caching if possible
     130              :  *
     131              :  * Returns regex_t *
     132              :  *
     133              :  *  text_re --- the pattern, expressed as a TEXT object
     134              :  *  cflags --- compile options for the pattern
     135              :  *  collation --- collation to use for LC_CTYPE-dependent behavior
     136              :  *
     137              :  * Pattern is given in the database encoding.  We internally convert to
     138              :  * an array of pg_wchar, which is what Spencer's regex package wants.
     139              :  */
     140              : regex_t *
     141      3700933 : RE_compile_and_cache(text *text_re, int cflags, Oid collation)
     142              : {
     143      3700933 :     int         text_re_len = VARSIZE_ANY_EXHDR(text_re);
     144      3700933 :     char       *text_re_val = VARDATA_ANY(text_re);
     145              :     pg_wchar   *pattern;
     146              :     int         pattern_len;
     147              :     int         i;
     148              :     int         regcomp_result;
     149              :     cached_re_str re_temp;
     150              :     char        errMsg[100];
     151              :     MemoryContext oldcontext;
     152              : 
     153              :     /*
     154              :      * Look for a match among previously compiled REs.  Since the data
     155              :      * structure is self-organizing with most-used entries at the front, our
     156              :      * search strategy can just be to scan from the front.
     157              :      */
     158      3998696 :     for (i = 0; i < num_res; i++)
     159              :     {
     160      3995432 :         if (re_array[i].cre_pat_len == text_re_len &&
     161      3705195 :             re_array[i].cre_flags == cflags &&
     162      3704596 :             re_array[i].cre_collation == collation &&
     163      3704408 :             memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
     164              :         {
     165              :             /*
     166              :              * Found a match; move it to front if not there already.
     167              :              */
     168      3697669 :             if (i > 0)
     169              :             {
     170       228141 :                 re_temp = re_array[i];
     171       228141 :                 memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
     172       228141 :                 re_array[0] = re_temp;
     173              :             }
     174              : 
     175      3697669 :             return &re_array[0].cre_re;
     176              :         }
     177              :     }
     178              : 
     179              :     /* Set up the cache memory on first go through. */
     180         3264 :     if (unlikely(RegexpCacheMemoryContext == NULL))
     181          853 :         RegexpCacheMemoryContext =
     182          853 :             AllocSetContextCreate(TopMemoryContext,
     183              :                                   "RegexpCacheMemoryContext",
     184              :                                   ALLOCSET_SMALL_SIZES);
     185              : 
     186              :     /*
     187              :      * Couldn't find it, so try to compile the new RE.  To avoid leaking
     188              :      * resources on failure, we build into the re_temp local.
     189              :      */
     190              : 
     191              :     /* Convert pattern string to wide characters */
     192         3264 :     pattern = palloc_array(pg_wchar, text_re_len + 1);
     193         3264 :     pattern_len = pg_mb2wchar_with_len(text_re_val,
     194              :                                        pattern,
     195              :                                        text_re_len);
     196              : 
     197              :     /*
     198              :      * Make a memory context for this compiled regexp.  This is initially a
     199              :      * child of the current memory context, so it will be cleaned up
     200              :      * automatically if compilation is interrupted and throws an ERROR. We'll
     201              :      * re-parent it under the longer lived cache context if we make it to the
     202              :      * bottom of this function.
     203              :      */
     204         3264 :     re_temp.cre_context = AllocSetContextCreate(CurrentMemoryContext,
     205              :                                                 "RegexpMemoryContext",
     206              :                                                 ALLOCSET_SMALL_SIZES);
     207         3264 :     oldcontext = MemoryContextSwitchTo(re_temp.cre_context);
     208              : 
     209         3264 :     regcomp_result = pg_regcomp(&re_temp.cre_re,
     210              :                                 pattern,
     211              :                                 pattern_len,
     212              :                                 cflags,
     213              :                                 collation);
     214              : 
     215         3252 :     pfree(pattern);
     216              : 
     217         3252 :     if (regcomp_result != REG_OKAY)
     218              :     {
     219              :         /* re didn't compile (no need for pg_regfree, if so) */
     220           18 :         pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
     221           18 :         ereport(ERROR,
     222              :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     223              :                  errmsg("invalid regular expression: %s", errMsg)));
     224              :     }
     225              : 
     226              :     /* Copy the pattern into the per-regexp memory context. */
     227         3234 :     re_temp.cre_pat = palloc(text_re_len + 1);
     228         3234 :     memcpy(re_temp.cre_pat, text_re_val, text_re_len);
     229              : 
     230              :     /*
     231              :      * NUL-terminate it only for the benefit of the identifier used for the
     232              :      * memory context, visible in the pg_backend_memory_contexts view.
     233              :      */
     234         3234 :     re_temp.cre_pat[text_re_len] = 0;
     235         3234 :     MemoryContextSetIdentifier(re_temp.cre_context, re_temp.cre_pat);
     236              : 
     237         3234 :     re_temp.cre_pat_len = text_re_len;
     238         3234 :     re_temp.cre_flags = cflags;
     239         3234 :     re_temp.cre_collation = collation;
     240              : 
     241              :     /*
     242              :      * Okay, we have a valid new item in re_temp; insert it into the storage
     243              :      * array.  Discard last entry if needed.
     244              :      */
     245         3234 :     if (num_res >= MAX_CACHED_RES)
     246              :     {
     247          444 :         --num_res;
     248              :         Assert(num_res < MAX_CACHED_RES);
     249              :         /* Delete the memory context holding the regexp and pattern. */
     250          444 :         MemoryContextDelete(re_array[num_res].cre_context);
     251              :     }
     252              : 
     253              :     /* Re-parent the memory context to our long-lived cache context. */
     254         3234 :     MemoryContextSetParent(re_temp.cre_context, RegexpCacheMemoryContext);
     255              : 
     256         3234 :     if (num_res > 0)
     257         2381 :         memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
     258              : 
     259         3234 :     re_array[0] = re_temp;
     260         3234 :     num_res++;
     261              : 
     262         3234 :     MemoryContextSwitchTo(oldcontext);
     263              : 
     264         3234 :     return &re_array[0].cre_re;
     265              : }
     266              : 
     267              : /*
     268              :  * RE_wchar_execute - execute a RE on pg_wchar data
     269              :  *
     270              :  * Returns true on match, false on no match
     271              :  *
     272              :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     273              :  *  data --- the data to match against (need not be null-terminated)
     274              :  *  data_len --- the length of the data string
     275              :  *  start_search -- the offset in the data to start searching
     276              :  *  nmatch, pmatch  --- optional return area for match details
     277              :  *
     278              :  * Data is given as array of pg_wchar which is what Spencer's regex package
     279              :  * wants.
     280              :  */
     281              : static bool
     282      4128935 : RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
     283              :                  int start_search, int nmatch, regmatch_t *pmatch)
     284              : {
     285              :     int         regexec_result;
     286              :     char        errMsg[100];
     287              : 
     288              :     /* Perform RE match and return result */
     289      4128935 :     regexec_result = pg_regexec(re,
     290              :                                 data,
     291              :                                 data_len,
     292              :                                 start_search,
     293              :                                 NULL,   /* no details */
     294              :                                 nmatch,
     295              :                                 pmatch,
     296              :                                 0);
     297              : 
     298      4128935 :     if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
     299              :     {
     300              :         /* re failed??? */
     301            0 :         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
     302            0 :         ereport(ERROR,
     303              :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     304              :                  errmsg("regular expression failed: %s", errMsg)));
     305              :     }
     306              : 
     307      4128935 :     return (regexec_result == REG_OKAY);
     308              : }
     309              : 
     310              : /*
     311              :  * RE_execute - execute a RE
     312              :  *
     313              :  * Returns true on match, false on no match
     314              :  *
     315              :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     316              :  *  dat --- the data to match against (need not be null-terminated)
     317              :  *  dat_len --- the length of the data string
     318              :  *  nmatch, pmatch  --- optional return area for match details
     319              :  *
     320              :  * Data is given in the database encoding.  We internally
     321              :  * convert to array of pg_wchar which is what Spencer's regex package wants.
     322              :  */
     323              : static bool
     324      3580450 : RE_execute(regex_t *re, char *dat, int dat_len,
     325              :            int nmatch, regmatch_t *pmatch)
     326              : {
     327              :     pg_wchar   *data;
     328              :     int         data_len;
     329              :     bool        match;
     330              : 
     331              :     /* Convert data string to wide characters */
     332      3580450 :     data = palloc_array(pg_wchar, dat_len + 1);
     333      3580450 :     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
     334              : 
     335              :     /* Perform RE match and return result */
     336      3580450 :     match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
     337              : 
     338      3580450 :     pfree(data);
     339      3580450 :     return match;
     340              : }
     341              : 
     342              : /*
     343              :  * RE_compile_and_execute - compile and execute a RE
     344              :  *
     345              :  * Returns true on match, false on no match
     346              :  *
     347              :  *  text_re --- the pattern, expressed as a TEXT object
     348              :  *  dat --- the data to match against (need not be null-terminated)
     349              :  *  dat_len --- the length of the data string
     350              :  *  cflags --- compile options for the pattern
     351              :  *  collation --- collation to use for LC_CTYPE-dependent behavior
     352              :  *  nmatch, pmatch  --- optional return area for match details
     353              :  *
     354              :  * Both pattern and data are given in the database encoding.  We internally
     355              :  * convert to array of pg_wchar which is what Spencer's regex package wants.
     356              :  */
     357              : bool
     358      3579639 : RE_compile_and_execute(text *text_re, char *dat, int dat_len,
     359              :                        int cflags, Oid collation,
     360              :                        int nmatch, regmatch_t *pmatch)
     361              : {
     362              :     regex_t    *re;
     363              : 
     364              :     /* Use REG_NOSUB if caller does not want sub-match details */
     365      3579639 :     if (nmatch < 2)
     366      3579639 :         cflags |= REG_NOSUB;
     367              : 
     368              :     /* Compile RE */
     369      3579639 :     re = RE_compile_and_cache(text_re, cflags, collation);
     370              : 
     371      3579627 :     return RE_execute(re, dat, dat_len, nmatch, pmatch);
     372              : }
     373              : 
     374              : 
     375              : /*
     376              :  * parse_re_flags - parse the options argument of regexp_match and friends
     377              :  *
     378              :  *  flags --- output argument, filled with desired options
     379              :  *  opts --- TEXT object, or NULL for defaults
     380              :  *
     381              :  * This accepts all the options allowed by any of the callers; callers that
     382              :  * don't want some have to reject them after the fact.
     383              :  */
     384              : static void
     385       104802 : parse_re_flags(pg_re_flags *flags, text *opts)
     386              : {
     387              :     /* regex flavor is always folded into the compile flags */
     388       104802 :     flags->cflags = REG_ADVANCED;
     389       104802 :     flags->glob = false;
     390              : 
     391       104802 :     if (opts)
     392              :     {
     393         2351 :         char       *opt_p = VARDATA_ANY(opts);
     394         2351 :         int         opt_len = VARSIZE_ANY_EXHDR(opts);
     395              :         int         i;
     396              : 
     397         5290 :         for (i = 0; i < opt_len; i++)
     398              :         {
     399         2951 :             switch (opt_p[i])
     400              :             {
     401         2190 :                 case 'g':
     402         2190 :                     flags->glob = true;
     403         2190 :                     break;
     404            0 :                 case 'b':       /* BREs (but why???) */
     405            0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
     406            0 :                     break;
     407            5 :                 case 'c':       /* case sensitive */
     408            5 :                     flags->cflags &= ~REG_ICASE;
     409            5 :                     break;
     410            0 :                 case 'e':       /* plain EREs */
     411            0 :                     flags->cflags |= REG_EXTENDED;
     412            0 :                     flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
     413            0 :                     break;
     414          146 :                 case 'i':       /* case insensitive */
     415          146 :                     flags->cflags |= REG_ICASE;
     416          146 :                     break;
     417          589 :                 case 'm':       /* Perloid synonym for n */
     418              :                 case 'n':       /* \n affects ^ $ . [^ */
     419          589 :                     flags->cflags |= REG_NEWLINE;
     420          589 :                     break;
     421            0 :                 case 'p':       /* ~Perl, \n affects . [^ */
     422            0 :                     flags->cflags |= REG_NLSTOP;
     423            0 :                     flags->cflags &= ~REG_NLANCH;
     424            0 :                     break;
     425            0 :                 case 'q':       /* literal string */
     426            0 :                     flags->cflags |= REG_QUOTE;
     427            0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
     428            0 :                     break;
     429            6 :                 case 's':       /* single line, \n ordinary */
     430            6 :                     flags->cflags &= ~REG_NEWLINE;
     431            6 :                     break;
     432            0 :                 case 't':       /* tight syntax */
     433            0 :                     flags->cflags &= ~REG_EXPANDED;
     434            0 :                     break;
     435            0 :                 case 'w':       /* weird, \n affects ^ $ only */
     436            0 :                     flags->cflags &= ~REG_NLSTOP;
     437            0 :                     flags->cflags |= REG_NLANCH;
     438            0 :                     break;
     439            3 :                 case 'x':       /* expanded syntax */
     440            3 :                     flags->cflags |= REG_EXPANDED;
     441            3 :                     break;
     442           12 :                 default:
     443           12 :                     ereport(ERROR,
     444              :                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     445              :                              errmsg("invalid regular expression option: \"%.*s\"",
     446              :                                     pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i)));
     447              :                     break;
     448              :             }
     449              :         }
     450              :     }
     451       104790 : }
     452              : 
     453              : 
     454              : /*
     455              :  *  interface routines called by the function manager
     456              :  */
     457              : 
     458              : Datum
     459      3346920 : nameregexeq(PG_FUNCTION_ARGS)
     460              : {
     461      3346920 :     Name        n = PG_GETARG_NAME(0);
     462      3346920 :     text       *p = PG_GETARG_TEXT_PP(1);
     463              : 
     464      3346920 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     465              :                                           NameStr(*n),
     466              :                                           strlen(NameStr(*n)),
     467              :                                           REG_ADVANCED,
     468              :                                           PG_GET_COLLATION(),
     469              :                                           0, NULL));
     470              : }
     471              : 
     472              : Datum
     473        14759 : nameregexne(PG_FUNCTION_ARGS)
     474              : {
     475        14759 :     Name        n = PG_GETARG_NAME(0);
     476        14759 :     text       *p = PG_GETARG_TEXT_PP(1);
     477              : 
     478        14759 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     479              :                                            NameStr(*n),
     480              :                                            strlen(NameStr(*n)),
     481              :                                            REG_ADVANCED,
     482              :                                            PG_GET_COLLATION(),
     483              :                                            0, NULL));
     484              : }
     485              : 
     486              : Datum
     487       196864 : textregexeq(PG_FUNCTION_ARGS)
     488              : {
     489       196864 :     text       *s = PG_GETARG_TEXT_PP(0);
     490       196864 :     text       *p = PG_GETARG_TEXT_PP(1);
     491              : 
     492       196864 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     493              :                                           VARDATA_ANY(s),
     494              :                                           VARSIZE_ANY_EXHDR(s),
     495              :                                           REG_ADVANCED,
     496              :                                           PG_GET_COLLATION(),
     497              :                                           0, NULL));
     498              : }
     499              : 
     500              : Datum
     501        17076 : textregexne(PG_FUNCTION_ARGS)
     502              : {
     503        17076 :     text       *s = PG_GETARG_TEXT_PP(0);
     504        17076 :     text       *p = PG_GETARG_TEXT_PP(1);
     505              : 
     506        17076 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     507              :                                            VARDATA_ANY(s),
     508              :                                            VARSIZE_ANY_EXHDR(s),
     509              :                                            REG_ADVANCED,
     510              :                                            PG_GET_COLLATION(),
     511              :                                            0, NULL));
     512              : }
     513              : 
     514              : 
     515              : /*
     516              :  *  routines that use the regexp stuff, but ignore the case.
     517              :  *  for this, we use the REG_ICASE flag to pg_regcomp
     518              :  */
     519              : 
     520              : 
     521              : Datum
     522         3743 : nameicregexeq(PG_FUNCTION_ARGS)
     523              : {
     524         3743 :     Name        n = PG_GETARG_NAME(0);
     525         3743 :     text       *p = PG_GETARG_TEXT_PP(1);
     526              : 
     527         3743 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     528              :                                           NameStr(*n),
     529              :                                           strlen(NameStr(*n)),
     530              :                                           REG_ADVANCED | REG_ICASE,
     531              :                                           PG_GET_COLLATION(),
     532              :                                           0, NULL));
     533              : }
     534              : 
     535              : Datum
     536            3 : nameicregexne(PG_FUNCTION_ARGS)
     537              : {
     538            3 :     Name        n = PG_GETARG_NAME(0);
     539            3 :     text       *p = PG_GETARG_TEXT_PP(1);
     540              : 
     541            3 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     542              :                                            NameStr(*n),
     543              :                                            strlen(NameStr(*n)),
     544              :                                            REG_ADVANCED | REG_ICASE,
     545              :                                            PG_GET_COLLATION(),
     546              :                                            0, NULL));
     547              : }
     548              : 
     549              : Datum
     550          110 : texticregexeq(PG_FUNCTION_ARGS)
     551              : {
     552          110 :     text       *s = PG_GETARG_TEXT_PP(0);
     553          110 :     text       *p = PG_GETARG_TEXT_PP(1);
     554              : 
     555          110 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     556              :                                           VARDATA_ANY(s),
     557              :                                           VARSIZE_ANY_EXHDR(s),
     558              :                                           REG_ADVANCED | REG_ICASE,
     559              :                                           PG_GET_COLLATION(),
     560              :                                           0, NULL));
     561              : }
     562              : 
     563              : Datum
     564           14 : texticregexne(PG_FUNCTION_ARGS)
     565              : {
     566           14 :     text       *s = PG_GETARG_TEXT_PP(0);
     567           14 :     text       *p = PG_GETARG_TEXT_PP(1);
     568              : 
     569           14 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     570              :                                            VARDATA_ANY(s),
     571              :                                            VARSIZE_ANY_EXHDR(s),
     572              :                                            REG_ADVANCED | REG_ICASE,
     573              :                                            PG_GET_COLLATION(),
     574              :                                            0, NULL));
     575              : }
     576              : 
     577              : 
     578              : /*
     579              :  * textregexsubstr()
     580              :  *      Return a substring matched by a regular expression.
     581              :  */
     582              : Datum
     583          823 : textregexsubstr(PG_FUNCTION_ARGS)
     584              : {
     585          823 :     text       *s = PG_GETARG_TEXT_PP(0);
     586          823 :     text       *p = PG_GETARG_TEXT_PP(1);
     587              :     regex_t    *re;
     588              :     regmatch_t  pmatch[2];
     589              :     int         so,
     590              :                 eo;
     591              : 
     592              :     /* Compile RE */
     593          823 :     re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
     594              : 
     595              :     /*
     596              :      * We pass two regmatch_t structs to get info about the overall match and
     597              :      * the match for the first parenthesized subexpression (if any). If there
     598              :      * is a parenthesized subexpression, we return what it matched; else
     599              :      * return what the whole regexp matched.
     600              :      */
     601          823 :     if (!RE_execute(re,
     602          823 :                     VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
     603              :                     2, pmatch))
     604            6 :         PG_RETURN_NULL();       /* definitely no match */
     605              : 
     606          817 :     if (re->re_nsub > 0)
     607              :     {
     608              :         /* has parenthesized subexpressions, use the first one */
     609          761 :         so = pmatch[1].rm_so;
     610          761 :         eo = pmatch[1].rm_eo;
     611              :     }
     612              :     else
     613              :     {
     614              :         /* no parenthesized subexpression, use whole match */
     615           56 :         so = pmatch[0].rm_so;
     616           56 :         eo = pmatch[0].rm_eo;
     617              :     }
     618              : 
     619              :     /*
     620              :      * It is possible to have a match to the whole pattern but no match for a
     621              :      * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
     622              :      * there is no subexpression match.  So this extra test for match failure
     623              :      * is not redundant.
     624              :      */
     625          817 :     if (so < 0 || eo < 0)
     626            3 :         PG_RETURN_NULL();
     627              : 
     628          814 :     return DirectFunctionCall3(text_substr,
     629              :                                PointerGetDatum(s),
     630              :                                Int32GetDatum(so + 1),
     631              :                                Int32GetDatum(eo - so));
     632              : }
     633              : 
     634              : /*
     635              :  * textregexreplace_noopt()
     636              :  *      Return a string matched by a regular expression, with replacement.
     637              :  *
     638              :  * This version doesn't have an option argument: we default to case
     639              :  * sensitive match, replace the first instance only.
     640              :  */
     641              : Datum
     642         7227 : textregexreplace_noopt(PG_FUNCTION_ARGS)
     643              : {
     644         7227 :     text       *s = PG_GETARG_TEXT_PP(0);
     645         7227 :     text       *p = PG_GETARG_TEXT_PP(1);
     646         7227 :     text       *r = PG_GETARG_TEXT_PP(2);
     647              : 
     648         7227 :     PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
     649              :                                          REG_ADVANCED, PG_GET_COLLATION(),
     650              :                                          0, 1));
     651              : }
     652              : 
     653              : /*
     654              :  * textregexreplace()
     655              :  *      Return a string matched by a regular expression, with replacement.
     656              :  */
     657              : Datum
     658         2154 : textregexreplace(PG_FUNCTION_ARGS)
     659              : {
     660         2154 :     text       *s = PG_GETARG_TEXT_PP(0);
     661         2154 :     text       *p = PG_GETARG_TEXT_PP(1);
     662         2154 :     text       *r = PG_GETARG_TEXT_PP(2);
     663         2154 :     text       *opt = PG_GETARG_TEXT_PP(3);
     664              :     pg_re_flags flags;
     665              : 
     666              :     /*
     667              :      * regexp_replace() with four arguments will be preferentially resolved as
     668              :      * this form when the fourth argument is of type UNKNOWN.  However, the
     669              :      * user might have intended to call textregexreplace_extended_no_n.  If we
     670              :      * see flags that look like an integer, emit the same error that
     671              :      * parse_re_flags would, but add a HINT about how to fix it.
     672              :      */
     673         2154 :     if (VARSIZE_ANY_EXHDR(opt) > 0)
     674              :     {
     675         2154 :         char       *opt_p = VARDATA_ANY(opt);
     676         2154 :         const char *end_p = opt_p + VARSIZE_ANY_EXHDR(opt);
     677              : 
     678         2154 :         if (*opt_p >= '0' && *opt_p <= '9')
     679            3 :             ereport(ERROR,
     680              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     681              :                      errmsg("invalid regular expression option: \"%.*s\"",
     682              :                             pg_mblen_range(opt_p, end_p), opt_p),
     683              :                      errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
     684              :     }
     685              : 
     686         2151 :     parse_re_flags(&flags, opt);
     687              : 
     688         2148 :     PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
     689              :                                          flags.cflags, PG_GET_COLLATION(),
     690              :                                          0, flags.glob ? 0 : 1));
     691              : }
     692              : 
     693              : /*
     694              :  * textregexreplace_extended()
     695              :  *      Return a string matched by a regular expression, with replacement.
     696              :  *      Extends textregexreplace by allowing a start position and the
     697              :  *      choice of the occurrence to replace (0 means all occurrences).
     698              :  */
     699              : Datum
     700           33 : textregexreplace_extended(PG_FUNCTION_ARGS)
     701              : {
     702           33 :     text       *s = PG_GETARG_TEXT_PP(0);
     703           33 :     text       *p = PG_GETARG_TEXT_PP(1);
     704           33 :     text       *r = PG_GETARG_TEXT_PP(2);
     705           33 :     int         start = 1;
     706           33 :     int         n = 1;
     707           33 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
     708              :     pg_re_flags re_flags;
     709              : 
     710              :     /* Collect optional parameters */
     711           33 :     if (PG_NARGS() > 3)
     712              :     {
     713           33 :         start = PG_GETARG_INT32(3);
     714           33 :         if (start <= 0)
     715            3 :             ereport(ERROR,
     716              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     717              :                      errmsg("invalid value for parameter \"%s\": %d",
     718              :                             "start", start)));
     719              :     }
     720           30 :     if (PG_NARGS() > 4)
     721              :     {
     722           27 :         n = PG_GETARG_INT32(4);
     723           27 :         if (n < 0)
     724            3 :             ereport(ERROR,
     725              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     726              :                      errmsg("invalid value for parameter \"%s\": %d",
     727              :                             "n", n)));
     728              :     }
     729              : 
     730              :     /* Determine options */
     731           27 :     parse_re_flags(&re_flags, flags);
     732              : 
     733              :     /* If N was not specified, deduce it from the 'g' flag */
     734           27 :     if (PG_NARGS() <= 4)
     735            3 :         n = re_flags.glob ? 0 : 1;
     736              : 
     737              :     /* Do the replacement(s) */
     738           27 :     PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
     739              :                                          re_flags.cflags, PG_GET_COLLATION(),
     740              :                                          start - 1, n));
     741              : }
     742              : 
     743              : /* This is separate to keep the opr_sanity regression test from complaining */
     744              : Datum
     745            3 : textregexreplace_extended_no_n(PG_FUNCTION_ARGS)
     746              : {
     747            3 :     return textregexreplace_extended(fcinfo);
     748              : }
     749              : 
     750              : /* This is separate to keep the opr_sanity regression test from complaining */
     751              : Datum
     752            3 : textregexreplace_extended_no_flags(PG_FUNCTION_ARGS)
     753              : {
     754            3 :     return textregexreplace_extended(fcinfo);
     755              : }
     756              : 
     757              : /*
     758              :  * similar_to_escape(), similar_escape()
     759              :  *
     760              :  * Convert a SQL "SIMILAR TO" regexp pattern to POSIX style, so it can be
     761              :  * used by our regexp engine.
     762              :  *
     763              :  * similar_escape_internal() is the common workhorse for three SQL-exposed
     764              :  * functions.  esc_text can be passed as NULL to select the default escape
     765              :  * (which is '\'), or as an empty string to select no escape character.
     766              :  */
     767              : static text *
     768           96 : similar_escape_internal(text *pat_text, text *esc_text)
     769              : {
     770              :     text       *result;
     771              :     char       *p,
     772              :                *e,
     773              :                *r;
     774              :     int         plen,
     775              :                 elen;
     776              :     const char *pend;
     777           96 :     bool        afterescape = false;
     778           96 :     int         nquotes = 0;
     779           96 :     int         bracket_depth = 0;  /* square bracket nesting level */
     780           96 :     int         charclass_pos = 0;  /* position inside a character class */
     781              : 
     782           96 :     p = VARDATA_ANY(pat_text);
     783           96 :     plen = VARSIZE_ANY_EXHDR(pat_text);
     784           96 :     pend = p + plen;
     785           96 :     if (esc_text == NULL)
     786              :     {
     787              :         /* No ESCAPE clause provided; default to backslash as escape */
     788           44 :         e = "\\";
     789           44 :         elen = 1;
     790              :     }
     791              :     else
     792              :     {
     793           52 :         e = VARDATA_ANY(esc_text);
     794           52 :         elen = VARSIZE_ANY_EXHDR(esc_text);
     795           52 :         if (elen == 0)
     796            3 :             e = NULL;           /* no escape character */
     797           49 :         else if (elen > 1)
     798              :         {
     799            6 :             int         escape_mblen = pg_mbstrlen_with_len(e, elen);
     800              : 
     801            6 :             if (escape_mblen > 1)
     802            3 :                 ereport(ERROR,
     803              :                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     804              :                          errmsg("invalid escape string"),
     805              :                          errhint("Escape string must be empty or one character.")));
     806              :         }
     807              :     }
     808              : 
     809              :     /*----------
     810              :      * We surround the transformed input string with
     811              :      *          ^(?: ... )$
     812              :      * which requires some explanation.  We need "^" and "$" to force
     813              :      * the pattern to match the entire input string as per the SQL spec.
     814              :      * The "(?:" and ")" are a non-capturing set of parens; we have to have
     815              :      * parens in case the string contains "|", else the "^" and "$" will
     816              :      * be bound into the first and last alternatives which is not what we
     817              :      * want, and the parens must be non capturing because we don't want them
     818              :      * to count when selecting output for SUBSTRING.
     819              :      *
     820              :      * When the pattern is divided into three parts by escape-double-quotes,
     821              :      * what we emit is
     822              :      *          ^(?:part1){1,1}?(part2){1,1}(?:part3)$
     823              :      * which requires even more explanation.  The "{1,1}?" on part1 makes it
     824              :      * non-greedy so that it will match the smallest possible amount of text
     825              :      * not the largest, as required by SQL.  The plain parens around part2
     826              :      * are capturing parens so that that part is what controls the result of
     827              :      * SUBSTRING.  The "{1,1}" forces part2 to be greedy, so that it matches
     828              :      * the largest possible amount of text; hence part3 must match the
     829              :      * smallest amount of text, as required by SQL.  We don't need an explicit
     830              :      * greediness marker on part3.  Note that this also confines the effects
     831              :      * of any "|" characters to the respective part, which is what we want.
     832              :      *
     833              :      * The SQL spec says that SUBSTRING's pattern must contain exactly two
     834              :      * escape-double-quotes, but we only complain if there's more than two.
     835              :      * With none, we act as though part1 and part3 are empty; with one, we
     836              :      * act as though part3 is empty.  Both behaviors fall out of omitting
     837              :      * the relevant part separators in the above expansion.  If the result
     838              :      * of this function is used in a plain regexp match (SIMILAR TO), the
     839              :      * escape-double-quotes have no effect on the match behavior.
     840              :      *
     841              :      * While we don't fully validate character classes (bracket expressions),
     842              :      * we do need to parse them well enough to know where they end.
     843              :      * "charclass_pos" tracks where we are in a character class.
     844              :      * Its value is uninteresting when bracket_depth is 0.
     845              :      * But when bracket_depth > 0, it will be
     846              :      *   1: right after the opening '[' (a following '^' will negate
     847              :      *      the class, while ']' is a literal character)
     848              :      *   2: right after a '^' after the opening '[' (']' is still a literal
     849              :      *      character)
     850              :      *   3 or more: further inside the character class (']' ends the class)
     851              :      *----------
     852              :      */
     853              : 
     854              :     /*
     855              :      * We need room for the prefix/postfix and part separators, plus as many
     856              :      * as 3 output bytes per input byte; since the input is at most 1GB this
     857              :      * can't overflow size_t.
     858              :      */
     859           93 :     result = (text *) palloc(VARHDRSZ + 23 + 3 * (size_t) plen);
     860           93 :     r = VARDATA(result);
     861              : 
     862           93 :     *r++ = '^';
     863           93 :     *r++ = '(';
     864           93 :     *r++ = '?';
     865           93 :     *r++ = ':';
     866              : 
     867          926 :     while (plen > 0)
     868              :     {
     869          836 :         char        pchar = *p;
     870              : 
     871              :         /*
     872              :          * If both the escape character and the current character from the
     873              :          * pattern are multi-byte, we need to take the slow path.
     874              :          *
     875              :          * But if one of them is single-byte, we can process the pattern one
     876              :          * byte at a time, ignoring multi-byte characters.  (This works
     877              :          * because all server-encodings have the property that a valid
     878              :          * multi-byte character representation cannot contain the
     879              :          * representation of a valid single-byte character.)
     880              :          */
     881              : 
     882          836 :         if (elen > 1)
     883              :         {
     884            3 :             int         mblen = pg_mblen_range(p, pend);
     885              : 
     886            3 :             if (mblen > 1)
     887              :             {
     888              :                 /* slow, multi-byte path */
     889            3 :                 if (afterescape)
     890              :                 {
     891            0 :                     *r++ = '\\';
     892            0 :                     memcpy(r, p, mblen);
     893            0 :                     r += mblen;
     894            0 :                     afterescape = false;
     895              :                 }
     896            3 :                 else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
     897              :                 {
     898              :                     /* SQL escape character; do not send to output */
     899            0 :                     afterescape = true;
     900              :                 }
     901              :                 else
     902              :                 {
     903              :                     /*
     904              :                      * We know it's a multi-byte character, so we don't need
     905              :                      * to do all the comparisons to single-byte characters
     906              :                      * that we do below.
     907              :                      */
     908            3 :                     memcpy(r, p, mblen);
     909            3 :                     r += mblen;
     910              :                 }
     911              : 
     912            3 :                 p += mblen;
     913            3 :                 plen -= mblen;
     914              : 
     915            3 :                 continue;
     916              :             }
     917              :         }
     918              : 
     919              :         /* fast path */
     920          833 :         if (afterescape)
     921              :         {
     922           83 :             if (pchar == '"' && bracket_depth < 1)  /* escape-double-quote? */
     923              :             {
     924              :                 /* emit appropriate part separator, per notes above */
     925           62 :                 if (nquotes == 0)
     926              :                 {
     927           31 :                     *r++ = ')';
     928           31 :                     *r++ = '{';
     929           31 :                     *r++ = '1';
     930           31 :                     *r++ = ',';
     931           31 :                     *r++ = '1';
     932           31 :                     *r++ = '}';
     933           31 :                     *r++ = '?';
     934           31 :                     *r++ = '(';
     935              :                 }
     936           31 :                 else if (nquotes == 1)
     937              :                 {
     938           28 :                     *r++ = ')';
     939           28 :                     *r++ = '{';
     940           28 :                     *r++ = '1';
     941           28 :                     *r++ = ',';
     942           28 :                     *r++ = '1';
     943           28 :                     *r++ = '}';
     944           28 :                     *r++ = '(';
     945           28 :                     *r++ = '?';
     946           28 :                     *r++ = ':';
     947              :                 }
     948              :                 else
     949            3 :                     ereport(ERROR,
     950              :                             (errcode(ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER),
     951              :                              errmsg("SQL regular expression may not contain more than two escape-double-quote separators")));
     952           59 :                 nquotes++;
     953              :             }
     954              :             else
     955              :             {
     956              :                 /*
     957              :                  * We allow any character at all to be escaped; notably, this
     958              :                  * allows access to POSIX character-class escapes such as
     959              :                  * "\d".  The SQL spec is considerably more restrictive.
     960              :                  */
     961           21 :                 *r++ = '\\';
     962           21 :                 *r++ = pchar;
     963              : 
     964              :                 /*
     965              :                  * If we encounter an escaped character in a character class,
     966              :                  * we are no longer at the beginning.
     967              :                  */
     968           21 :                 charclass_pos = 3;
     969              :             }
     970           80 :             afterescape = false;
     971              :         }
     972          750 :         else if (e && pchar == *e)
     973              :         {
     974              :             /* SQL escape character; do not send to output */
     975           83 :             afterescape = true;
     976              :         }
     977          667 :         else if (bracket_depth > 0)
     978              :         {
     979              :             /* inside a character class */
     980          306 :             if (pchar == '\\')
     981              :             {
     982              :                 /*
     983              :                  * If we're here, backslash is not the SQL escape character,
     984              :                  * so treat it as a literal class element, which requires
     985              :                  * doubling it.  (This matches our behavior for backslashes
     986              :                  * outside character classes.)
     987              :                  */
     988            0 :                 *r++ = '\\';
     989              :             }
     990          306 :             *r++ = pchar;
     991              : 
     992              :             /* parse the character class well enough to identify ending ']' */
     993          306 :             if (pchar == ']' && charclass_pos > 2)
     994              :             {
     995              :                 /* found the real end of a bracket pair */
     996           69 :                 bracket_depth--;
     997              :                 /* don't reset charclass_pos, this may be an inner bracket */
     998              :             }
     999          237 :             else if (pchar == '[')
    1000              :             {
    1001              :                 /* start of a nested bracket pair */
    1002           36 :                 bracket_depth++;
    1003              : 
    1004              :                 /*
    1005              :                  * We are no longer at the beginning of a character class.
    1006              :                  * (The nested bracket pair is a collating element, not a
    1007              :                  * character class in its own right.)
    1008              :                  */
    1009           36 :                 charclass_pos = 3;
    1010              :             }
    1011          201 :             else if (pchar == '^')
    1012              :             {
    1013              :                 /*
    1014              :                  * A caret right after the opening bracket negates the
    1015              :                  * character class.  In that case, the following will
    1016              :                  * increment charclass_pos from 1 to 2, so that a following
    1017              :                  * ']' is still a literal character and does not end the
    1018              :                  * character class.  If we are further inside a character
    1019              :                  * class, charclass_pos might get incremented past 3, which is
    1020              :                  * fine.
    1021              :                  */
    1022           30 :                 charclass_pos++;
    1023              :             }
    1024              :             else
    1025              :             {
    1026              :                 /*
    1027              :                  * Anything else (including a backslash or leading ']') is an
    1028              :                  * element of the character class, so we are no longer at the
    1029              :                  * beginning of the class.
    1030              :                  */
    1031          171 :                 charclass_pos = 3;
    1032              :             }
    1033              :         }
    1034          361 :         else if (pchar == '[')
    1035              :         {
    1036              :             /* start of a character class */
    1037           33 :             *r++ = pchar;
    1038           33 :             bracket_depth = 1;
    1039           33 :             charclass_pos = 1;
    1040              :         }
    1041          328 :         else if (pchar == '%')
    1042              :         {
    1043           66 :             *r++ = '.';
    1044           66 :             *r++ = '*';
    1045              :         }
    1046          262 :         else if (pchar == '_')
    1047           32 :             *r++ = '.';
    1048          230 :         else if (pchar == '(')
    1049              :         {
    1050              :             /* convert to non-capturing parenthesis */
    1051           15 :             *r++ = '(';
    1052           15 :             *r++ = '?';
    1053           15 :             *r++ = ':';
    1054              :         }
    1055          215 :         else if (pchar == '\\' || pchar == '.' ||
    1056          195 :                  pchar == '^' || pchar == '$')
    1057              :         {
    1058           26 :             *r++ = '\\';
    1059           26 :             *r++ = pchar;
    1060              :         }
    1061              :         else
    1062          189 :             *r++ = pchar;
    1063          830 :         p++, plen--;
    1064              :     }
    1065              : 
    1066           90 :     *r++ = ')';
    1067           90 :     *r++ = '$';
    1068              : 
    1069           90 :     SET_VARSIZE(result, r - ((char *) result));
    1070              : 
    1071           90 :     return result;
    1072              : }
    1073              : 
    1074              : /*
    1075              :  * similar_to_escape(pattern, escape)
    1076              :  */
    1077              : Datum
    1078           52 : similar_to_escape_2(PG_FUNCTION_ARGS)
    1079              : {
    1080           52 :     text       *pat_text = PG_GETARG_TEXT_PP(0);
    1081           52 :     text       *esc_text = PG_GETARG_TEXT_PP(1);
    1082              :     text       *result;
    1083              : 
    1084           52 :     result = similar_escape_internal(pat_text, esc_text);
    1085              : 
    1086           46 :     PG_RETURN_TEXT_P(result);
    1087              : }
    1088              : 
    1089              : /*
    1090              :  * similar_to_escape(pattern)
    1091              :  * Inserts a default escape character.
    1092              :  */
    1093              : Datum
    1094           44 : similar_to_escape_1(PG_FUNCTION_ARGS)
    1095              : {
    1096           44 :     text       *pat_text = PG_GETARG_TEXT_PP(0);
    1097              :     text       *result;
    1098              : 
    1099           44 :     result = similar_escape_internal(pat_text, NULL);
    1100              : 
    1101           44 :     PG_RETURN_TEXT_P(result);
    1102              : }
    1103              : 
    1104              : /*
    1105              :  * similar_escape(pattern, escape)
    1106              :  *
    1107              :  * Legacy function for compatibility with views stored using the
    1108              :  * pre-v13 expansion of SIMILAR TO.  Unlike the above functions, this
    1109              :  * is non-strict, which leads to not-per-spec handling of "ESCAPE NULL".
    1110              :  */
    1111              : Datum
    1112            0 : similar_escape(PG_FUNCTION_ARGS)
    1113              : {
    1114              :     text       *pat_text;
    1115              :     text       *esc_text;
    1116              :     text       *result;
    1117              : 
    1118              :     /* This function is not strict, so must test explicitly */
    1119            0 :     if (PG_ARGISNULL(0))
    1120            0 :         PG_RETURN_NULL();
    1121            0 :     pat_text = PG_GETARG_TEXT_PP(0);
    1122              : 
    1123            0 :     if (PG_ARGISNULL(1))
    1124            0 :         esc_text = NULL;        /* use default escape character */
    1125              :     else
    1126            0 :         esc_text = PG_GETARG_TEXT_PP(1);
    1127              : 
    1128            0 :     result = similar_escape_internal(pat_text, esc_text);
    1129              : 
    1130            0 :     PG_RETURN_TEXT_P(result);
    1131              : }
    1132              : 
    1133              : /*
    1134              :  * regexp_count()
    1135              :  *      Return the number of matches of a pattern within a string.
    1136              :  */
    1137              : Datum
    1138           24 : regexp_count(PG_FUNCTION_ARGS)
    1139              : {
    1140           24 :     text       *str = PG_GETARG_TEXT_PP(0);
    1141           24 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1142           24 :     int         start = 1;
    1143           24 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(3);
    1144              :     pg_re_flags re_flags;
    1145              :     regexp_matches_ctx *matchctx;
    1146              : 
    1147              :     /* Collect optional parameters */
    1148           24 :     if (PG_NARGS() > 2)
    1149              :     {
    1150           21 :         start = PG_GETARG_INT32(2);
    1151           21 :         if (start <= 0)
    1152            6 :             ereport(ERROR,
    1153              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1154              :                      errmsg("invalid value for parameter \"%s\": %d",
    1155              :                             "start", start)));
    1156              :     }
    1157              : 
    1158              :     /* Determine options */
    1159           18 :     parse_re_flags(&re_flags, flags);
    1160              :     /* User mustn't specify 'g' */
    1161           18 :     if (re_flags.glob)
    1162            0 :         ereport(ERROR,
    1163              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1164              :         /* translator: %s is a SQL function name */
    1165              :                  errmsg("%s does not support the \"global\" option",
    1166              :                         "regexp_count()")));
    1167              :     /* But we find all the matches anyway */
    1168           18 :     re_flags.glob = true;
    1169              : 
    1170              :     /* Do the matching */
    1171           18 :     matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
    1172              :                                     PG_GET_COLLATION(),
    1173              :                                     false,  /* can ignore subexprs */
    1174              :                                     false, false);
    1175              : 
    1176           18 :     PG_RETURN_INT32(matchctx->nmatches);
    1177              : }
    1178              : 
    1179              : /* This is separate to keep the opr_sanity regression test from complaining */
    1180              : Datum
    1181            3 : regexp_count_no_start(PG_FUNCTION_ARGS)
    1182              : {
    1183            3 :     return regexp_count(fcinfo);
    1184              : }
    1185              : 
    1186              : /* This is separate to keep the opr_sanity regression test from complaining */
    1187              : Datum
    1188           15 : regexp_count_no_flags(PG_FUNCTION_ARGS)
    1189              : {
    1190           15 :     return regexp_count(fcinfo);
    1191              : }
    1192              : 
    1193              : /*
    1194              :  * regexp_instr()
    1195              :  *      Return the match's position within the string
    1196              :  */
    1197              : Datum
    1198           78 : regexp_instr(PG_FUNCTION_ARGS)
    1199              : {
    1200           78 :     text       *str = PG_GETARG_TEXT_PP(0);
    1201           78 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1202           78 :     int         start = 1;
    1203           78 :     int         n = 1;
    1204           78 :     int         endoption = 0;
    1205           78 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
    1206           78 :     int         subexpr = 0;
    1207              :     int         pos;
    1208              :     pg_re_flags re_flags;
    1209              :     regexp_matches_ctx *matchctx;
    1210              : 
    1211              :     /* Collect optional parameters */
    1212           78 :     if (PG_NARGS() > 2)
    1213              :     {
    1214           69 :         start = PG_GETARG_INT32(2);
    1215           69 :         if (start <= 0)
    1216            3 :             ereport(ERROR,
    1217              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1218              :                      errmsg("invalid value for parameter \"%s\": %d",
    1219              :                             "start", start)));
    1220              :     }
    1221           75 :     if (PG_NARGS() > 3)
    1222              :     {
    1223           63 :         n = PG_GETARG_INT32(3);
    1224           63 :         if (n <= 0)
    1225            3 :             ereport(ERROR,
    1226              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1227              :                      errmsg("invalid value for parameter \"%s\": %d",
    1228              :                             "n", n)));
    1229              :     }
    1230           72 :     if (PG_NARGS() > 4)
    1231              :     {
    1232           54 :         endoption = PG_GETARG_INT32(4);
    1233           54 :         if (endoption != 0 && endoption != 1)
    1234            6 :             ereport(ERROR,
    1235              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1236              :                      errmsg("invalid value for parameter \"%s\": %d",
    1237              :                             "endoption", endoption)));
    1238              :     }
    1239           66 :     if (PG_NARGS() > 6)
    1240              :     {
    1241           42 :         subexpr = PG_GETARG_INT32(6);
    1242           42 :         if (subexpr < 0)
    1243            3 :             ereport(ERROR,
    1244              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1245              :                      errmsg("invalid value for parameter \"%s\": %d",
    1246              :                             "subexpr", subexpr)));
    1247              :     }
    1248              : 
    1249              :     /* Determine options */
    1250           63 :     parse_re_flags(&re_flags, flags);
    1251              :     /* User mustn't specify 'g' */
    1252           63 :     if (re_flags.glob)
    1253            3 :         ereport(ERROR,
    1254              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1255              :         /* translator: %s is a SQL function name */
    1256              :                  errmsg("%s does not support the \"global\" option",
    1257              :                         "regexp_instr()")));
    1258              :     /* But we find all the matches anyway */
    1259           60 :     re_flags.glob = true;
    1260              : 
    1261              :     /* Do the matching */
    1262           60 :     matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
    1263              :                                     PG_GET_COLLATION(),
    1264              :                                     (subexpr > 0),   /* need submatches? */
    1265              :                                     false, false);
    1266              : 
    1267              :     /* When n exceeds matches return 0 (includes case of no matches) */
    1268           60 :     if (n > matchctx->nmatches)
    1269            6 :         PG_RETURN_INT32(0);
    1270              : 
    1271              :     /* When subexpr exceeds number of subexpressions return 0 */
    1272           54 :     if (subexpr > matchctx->npatterns)
    1273            6 :         PG_RETURN_INT32(0);
    1274              : 
    1275              :     /* Select the appropriate match position to return */
    1276           48 :     pos = (n - 1) * matchctx->npatterns;
    1277           48 :     if (subexpr > 0)
    1278           27 :         pos += subexpr - 1;
    1279           48 :     pos *= 2;
    1280           48 :     if (endoption == 1)
    1281           15 :         pos += 1;
    1282              : 
    1283           48 :     if (matchctx->match_locs[pos] >= 0)
    1284           45 :         PG_RETURN_INT32(matchctx->match_locs[pos] + 1);
    1285              :     else
    1286            3 :         PG_RETURN_INT32(0);     /* position not identifiable */
    1287              : }
    1288              : 
    1289              : /* This is separate to keep the opr_sanity regression test from complaining */
    1290              : Datum
    1291            9 : regexp_instr_no_start(PG_FUNCTION_ARGS)
    1292              : {
    1293            9 :     return regexp_instr(fcinfo);
    1294              : }
    1295              : 
    1296              : /* This is separate to keep the opr_sanity regression test from complaining */
    1297              : Datum
    1298            3 : regexp_instr_no_n(PG_FUNCTION_ARGS)
    1299              : {
    1300            3 :     return regexp_instr(fcinfo);
    1301              : }
    1302              : 
    1303              : /* This is separate to keep the opr_sanity regression test from complaining */
    1304              : Datum
    1305           12 : regexp_instr_no_endoption(PG_FUNCTION_ARGS)
    1306              : {
    1307           12 :     return regexp_instr(fcinfo);
    1308              : }
    1309              : 
    1310              : /* This is separate to keep the opr_sanity regression test from complaining */
    1311              : Datum
    1312            6 : regexp_instr_no_flags(PG_FUNCTION_ARGS)
    1313              : {
    1314            6 :     return regexp_instr(fcinfo);
    1315              : }
    1316              : 
    1317              : /* This is separate to keep the opr_sanity regression test from complaining */
    1318              : Datum
    1319            6 : regexp_instr_no_subexpr(PG_FUNCTION_ARGS)
    1320              : {
    1321            6 :     return regexp_instr(fcinfo);
    1322              : }
    1323              : 
    1324              : /*
    1325              :  * regexp_like()
    1326              :  *      Test for a pattern match within a string.
    1327              :  */
    1328              : Datum
    1329           15 : regexp_like(PG_FUNCTION_ARGS)
    1330              : {
    1331           15 :     text       *str = PG_GETARG_TEXT_PP(0);
    1332           15 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1333           15 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1334              :     pg_re_flags re_flags;
    1335              : 
    1336              :     /* Determine options */
    1337           15 :     parse_re_flags(&re_flags, flags);
    1338              :     /* User mustn't specify 'g' */
    1339           15 :     if (re_flags.glob)
    1340            3 :         ereport(ERROR,
    1341              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1342              :         /* translator: %s is a SQL function name */
    1343              :                  errmsg("%s does not support the \"global\" option",
    1344              :                         "regexp_like()")));
    1345              : 
    1346              :     /* Otherwise it's like textregexeq/texticregexeq */
    1347           12 :     PG_RETURN_BOOL(RE_compile_and_execute(pattern,
    1348              :                                           VARDATA_ANY(str),
    1349              :                                           VARSIZE_ANY_EXHDR(str),
    1350              :                                           re_flags.cflags,
    1351              :                                           PG_GET_COLLATION(),
    1352              :                                           0, NULL));
    1353              : }
    1354              : 
    1355              : /* This is separate to keep the opr_sanity regression test from complaining */
    1356              : Datum
    1357            3 : regexp_like_no_flags(PG_FUNCTION_ARGS)
    1358              : {
    1359            3 :     return regexp_like(fcinfo);
    1360              : }
    1361              : 
    1362              : /*
    1363              :  * regexp_match()
    1364              :  *      Return the first substring(s) matching a pattern within a string.
    1365              :  */
    1366              : Datum
    1367         1270 : regexp_match(PG_FUNCTION_ARGS)
    1368              : {
    1369         1270 :     text       *orig_str = PG_GETARG_TEXT_PP(0);
    1370         1270 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1371         1270 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1372              :     pg_re_flags re_flags;
    1373              :     regexp_matches_ctx *matchctx;
    1374              : 
    1375              :     /* Determine options */
    1376         1270 :     parse_re_flags(&re_flags, flags);
    1377              :     /* User mustn't specify 'g' */
    1378         1270 :     if (re_flags.glob)
    1379            4 :         ereport(ERROR,
    1380              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1381              :         /* translator: %s is a SQL function name */
    1382              :                  errmsg("%s does not support the \"global\" option",
    1383              :                         "regexp_match()"),
    1384              :                  errhint("Use the regexp_matches function instead.")));
    1385              : 
    1386         1266 :     matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, 0,
    1387              :                                     PG_GET_COLLATION(), true, false, false);
    1388              : 
    1389         1266 :     if (matchctx->nmatches == 0)
    1390           65 :         PG_RETURN_NULL();
    1391              : 
    1392              :     Assert(matchctx->nmatches == 1);
    1393              : 
    1394              :     /* Create workspace that build_regexp_match_result needs */
    1395         1201 :     matchctx->elems = palloc_array(Datum, matchctx->npatterns);
    1396         1201 :     matchctx->nulls = palloc_array(bool, matchctx->npatterns);
    1397              : 
    1398         1201 :     PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx)));
    1399              : }
    1400              : 
    1401              : /* This is separate to keep the opr_sanity regression test from complaining */
    1402              : Datum
    1403         1255 : regexp_match_no_flags(PG_FUNCTION_ARGS)
    1404              : {
    1405         1255 :     return regexp_match(fcinfo);
    1406              : }
    1407              : 
    1408              : /*
    1409              :  * regexp_matches()
    1410              :  *      Return a table of all matches of a pattern within a string.
    1411              :  */
    1412              : Datum
    1413         1305 : regexp_matches(PG_FUNCTION_ARGS)
    1414              : {
    1415              :     FuncCallContext *funcctx;
    1416              :     regexp_matches_ctx *matchctx;
    1417              : 
    1418         1305 :     if (SRF_IS_FIRSTCALL())
    1419              :     {
    1420          975 :         text       *pattern = PG_GETARG_TEXT_PP(1);
    1421          975 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1422              :         pg_re_flags re_flags;
    1423              :         MemoryContext oldcontext;
    1424              : 
    1425          975 :         funcctx = SRF_FIRSTCALL_INIT();
    1426          975 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1427              : 
    1428              :         /* Determine options */
    1429          975 :         parse_re_flags(&re_flags, flags);
    1430              : 
    1431              :         /* be sure to copy the input string into the multi-call ctx */
    1432          972 :         matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
    1433              :                                         &re_flags, 0,
    1434              :                                         PG_GET_COLLATION(),
    1435              :                                         true, false, false);
    1436              : 
    1437              :         /* Pre-create workspace that build_regexp_match_result needs */
    1438          966 :         matchctx->elems = palloc_array(Datum, matchctx->npatterns);
    1439          966 :         matchctx->nulls = palloc_array(bool, matchctx->npatterns);
    1440              : 
    1441          966 :         MemoryContextSwitchTo(oldcontext);
    1442          966 :         funcctx->user_fctx = matchctx;
    1443              :     }
    1444              : 
    1445         1296 :     funcctx = SRF_PERCALL_SETUP();
    1446         1296 :     matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
    1447              : 
    1448         1296 :     if (matchctx->next_match < matchctx->nmatches)
    1449              :     {
    1450              :         ArrayType  *result_ary;
    1451              : 
    1452          330 :         result_ary = build_regexp_match_result(matchctx);
    1453          330 :         matchctx->next_match++;
    1454          330 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
    1455              :     }
    1456              : 
    1457          966 :     SRF_RETURN_DONE(funcctx);
    1458              : }
    1459              : 
    1460              : /* This is separate to keep the opr_sanity regression test from complaining */
    1461              : Datum
    1462         1143 : regexp_matches_no_flags(PG_FUNCTION_ARGS)
    1463              : {
    1464         1143 :     return regexp_matches(fcinfo);
    1465              : }
    1466              : 
    1467              : /*
    1468              :  * setup_regexp_matches --- do the initial matching for regexp_match,
    1469              :  *      regexp_split, and related functions
    1470              :  *
    1471              :  * To avoid having to re-find the compiled pattern on each call, we do
    1472              :  * all the matching in one swoop.  The returned regexp_matches_ctx contains
    1473              :  * the locations of all the substrings matching the pattern.
    1474              :  *
    1475              :  * start_search: the character (not byte) offset in orig_str at which to
    1476              :  * begin the search.  Returned positions are relative to orig_str anyway.
    1477              :  * use_subpatterns: collect data about matches to parenthesized subexpressions.
    1478              :  * ignore_degenerate: ignore zero-length matches.
    1479              :  * fetching_unmatched: caller wants to fetch unmatched substrings.
    1480              :  *
    1481              :  * We don't currently assume that fetching_unmatched is exclusive of fetching
    1482              :  * the matched text too; if it's set, the conversion buffer is large enough to
    1483              :  * fetch any single matched or unmatched string, but not any larger
    1484              :  * substring.  (In practice, when splitting the matches are usually small
    1485              :  * anyway, and it didn't seem worth complicating the code further.)
    1486              :  */
    1487              : static regexp_matches_ctx *
    1488       102584 : setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
    1489              :                      int start_search,
    1490              :                      Oid collation,
    1491              :                      bool use_subpatterns,
    1492              :                      bool ignore_degenerate,
    1493              :                      bool fetching_unmatched)
    1494              : {
    1495       102584 :     regexp_matches_ctx *matchctx = palloc0_object(regexp_matches_ctx);
    1496       102584 :     int         eml = pg_database_encoding_max_length();
    1497              :     int         orig_len;
    1498              :     pg_wchar   *wide_str;
    1499              :     int         wide_len;
    1500              :     int         cflags;
    1501              :     regex_t    *cpattern;
    1502              :     regmatch_t *pmatch;
    1503              :     int         pmatch_len;
    1504              :     int         array_len;
    1505              :     int         array_idx;
    1506              :     int         prev_match_end;
    1507              :     int         prev_valid_match_end;
    1508       102584 :     int         maxlen = 0;     /* largest fetch length in characters */
    1509              : 
    1510              :     /* save original string --- we'll extract result substrings from it */
    1511       102584 :     matchctx->orig_str = orig_str;
    1512              : 
    1513              :     /* convert string to pg_wchar form for matching */
    1514       102584 :     orig_len = VARSIZE_ANY_EXHDR(orig_str);
    1515       102584 :     wide_str = palloc_array(pg_wchar, orig_len + 1);
    1516       102584 :     wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
    1517              : 
    1518              :     /* set up the compiled pattern */
    1519       102584 :     cflags = re_flags->cflags;
    1520       102584 :     if (!use_subpatterns)
    1521       100295 :         cflags |= REG_NOSUB;
    1522       102584 :     cpattern = RE_compile_and_cache(pattern, cflags, collation);
    1523              : 
    1524              :     /* do we want to remember subpatterns? */
    1525       102578 :     if (use_subpatterns && cpattern->re_nsub > 0)
    1526              :     {
    1527         1347 :         matchctx->npatterns = cpattern->re_nsub;
    1528         1347 :         pmatch_len = cpattern->re_nsub + 1;
    1529              :     }
    1530              :     else
    1531              :     {
    1532       101231 :         use_subpatterns = false;
    1533       101231 :         matchctx->npatterns = 1;
    1534       101231 :         pmatch_len = 1;
    1535              :     }
    1536              : 
    1537              :     /* temporary output space for RE package */
    1538       102578 :     pmatch = palloc_array(regmatch_t, pmatch_len);
    1539              : 
    1540              :     /*
    1541              :      * the real output space (grown dynamically if needed)
    1542              :      *
    1543              :      * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
    1544              :      * than at 2^27
    1545              :      */
    1546       102578 :     array_len = re_flags->glob ? 255 : 31;
    1547       102578 :     matchctx->match_locs = palloc_array(int, array_len);
    1548       102578 :     array_idx = 0;
    1549              : 
    1550              :     /* search for the pattern, perhaps repeatedly */
    1551       102578 :     prev_match_end = 0;
    1552       102578 :     prev_valid_match_end = 0;
    1553       548485 :     while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
    1554              :                             pmatch_len, pmatch))
    1555              :     {
    1556              :         /*
    1557              :          * If requested, ignore degenerate matches, which are zero-length
    1558              :          * matches occurring at the start or end of a string or just after a
    1559              :          * previous match.
    1560              :          */
    1561       447361 :         if (!ignore_degenerate ||
    1562       445632 :             (pmatch[0].rm_so < wide_len &&
    1563       445611 :              pmatch[0].rm_eo > prev_match_end))
    1564              :         {
    1565              :             /* enlarge output space if needed */
    1566       447451 :             while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
    1567              :             {
    1568          180 :                 array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
    1569          180 :                 if (array_len > MaxAllocSize / sizeof(int))
    1570            0 :                     ereport(ERROR,
    1571              :                             (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    1572              :                              errmsg("too many regular expression matches")));
    1573          180 :                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
    1574              :                                                         sizeof(int) * array_len);
    1575              :             }
    1576              : 
    1577              :             /* save this match's locations */
    1578       447271 :             if (use_subpatterns)
    1579              :             {
    1580              :                 int         i;
    1581              : 
    1582         3972 :                 for (i = 1; i <= matchctx->npatterns; i++)
    1583              :                 {
    1584         2681 :                     int         so = pmatch[i].rm_so;
    1585         2681 :                     int         eo = pmatch[i].rm_eo;
    1586              : 
    1587         2681 :                     matchctx->match_locs[array_idx++] = so;
    1588         2681 :                     matchctx->match_locs[array_idx++] = eo;
    1589         2681 :                     if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
    1590         1722 :                         maxlen = (eo - so);
    1591              :                 }
    1592              :             }
    1593              :             else
    1594              :             {
    1595       445980 :                 int         so = pmatch[0].rm_so;
    1596       445980 :                 int         eo = pmatch[0].rm_eo;
    1597              : 
    1598       445980 :                 matchctx->match_locs[array_idx++] = so;
    1599       445980 :                 matchctx->match_locs[array_idx++] = eo;
    1600       445980 :                 if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
    1601       100452 :                     maxlen = (eo - so);
    1602              :             }
    1603       447271 :             matchctx->nmatches++;
    1604              : 
    1605              :             /*
    1606              :              * check length of unmatched portion between end of previous valid
    1607              :              * (nondegenerate, or degenerate but not ignored) match and start
    1608              :              * of current one
    1609              :              */
    1610       447271 :             if (fetching_unmatched &&
    1611       445542 :                 pmatch[0].rm_so >= 0 &&
    1612       445542 :                 (pmatch[0].rm_so - prev_valid_match_end) > maxlen)
    1613       190390 :                 maxlen = (pmatch[0].rm_so - prev_valid_match_end);
    1614       447271 :             prev_valid_match_end = pmatch[0].rm_eo;
    1615              :         }
    1616       447361 :         prev_match_end = pmatch[0].rm_eo;
    1617              : 
    1618              :         /* if not glob, stop after one match */
    1619       447361 :         if (!re_flags->glob)
    1620         1421 :             break;
    1621              : 
    1622              :         /*
    1623              :          * Advance search position.  Normally we start the next search at the
    1624              :          * end of the previous match; but if the match was of zero length, we
    1625              :          * have to advance by one character, or we'd just find the same match
    1626              :          * again.
    1627              :          */
    1628       445940 :         start_search = prev_match_end;
    1629       445940 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    1630          588 :             start_search++;
    1631       445940 :         if (start_search > wide_len)
    1632           33 :             break;
    1633              :     }
    1634              : 
    1635              :     /*
    1636              :      * check length of unmatched portion between end of last match and end of
    1637              :      * input string
    1638              :      */
    1639       102578 :     if (fetching_unmatched &&
    1640       100226 :         (wide_len - prev_valid_match_end) > maxlen)
    1641           35 :         maxlen = (wide_len - prev_valid_match_end);
    1642              : 
    1643              :     /*
    1644              :      * Keep a note of the end position of the string for the benefit of
    1645              :      * splitting code.
    1646              :      */
    1647       102578 :     matchctx->match_locs[array_idx] = wide_len;
    1648              : 
    1649       102578 :     if (eml > 1)
    1650              :     {
    1651       102578 :         int64       maxsiz = eml * (int64) maxlen;
    1652              :         int         conv_bufsiz;
    1653              : 
    1654              :         /*
    1655              :          * Make the conversion buffer large enough for any substring of
    1656              :          * interest.
    1657              :          *
    1658              :          * Worst case: assume we need the maximum size (maxlen*eml), but take
    1659              :          * advantage of the fact that the original string length in bytes is
    1660              :          * an upper bound on the byte length of any fetched substring (and we
    1661              :          * know that len+1 is safe to allocate because the varlena header is
    1662              :          * longer than 1 byte).
    1663              :          */
    1664       102578 :         if (maxsiz > orig_len)
    1665       100409 :             conv_bufsiz = orig_len + 1;
    1666              :         else
    1667         2169 :             conv_bufsiz = maxsiz + 1;   /* safe since maxsiz < 2^30 */
    1668              : 
    1669       102578 :         matchctx->conv_buf = palloc(conv_bufsiz);
    1670       102578 :         matchctx->conv_bufsiz = conv_bufsiz;
    1671       102578 :         matchctx->wide_str = wide_str;
    1672              :     }
    1673              :     else
    1674              :     {
    1675              :         /* No need to keep the wide string if we're in a single-byte charset. */
    1676            0 :         pfree(wide_str);
    1677            0 :         matchctx->wide_str = NULL;
    1678            0 :         matchctx->conv_buf = NULL;
    1679            0 :         matchctx->conv_bufsiz = 0;
    1680              :     }
    1681              : 
    1682              :     /* Clean up temp storage */
    1683       102578 :     pfree(pmatch);
    1684              : 
    1685       102578 :     return matchctx;
    1686              : }
    1687              : 
    1688              : /*
    1689              :  * build_regexp_match_result - build output array for current match
    1690              :  */
    1691              : static ArrayType *
    1692         1531 : build_regexp_match_result(regexp_matches_ctx *matchctx)
    1693              : {
    1694         1531 :     char       *buf = matchctx->conv_buf;
    1695         1531 :     Datum      *elems = matchctx->elems;
    1696         1531 :     bool       *nulls = matchctx->nulls;
    1697              :     int         dims[1];
    1698              :     int         lbs[1];
    1699              :     int         loc;
    1700              :     int         i;
    1701              : 
    1702              :     /* Extract matching substrings from the original string */
    1703         1531 :     loc = matchctx->next_match * matchctx->npatterns * 2;
    1704         4317 :     for (i = 0; i < matchctx->npatterns; i++)
    1705              :     {
    1706         2786 :         int         so = matchctx->match_locs[loc++];
    1707         2786 :         int         eo = matchctx->match_locs[loc++];
    1708              : 
    1709         2786 :         if (so < 0 || eo < 0)
    1710              :         {
    1711            3 :             elems[i] = (Datum) 0;
    1712            3 :             nulls[i] = true;
    1713              :         }
    1714         2783 :         else if (buf)
    1715              :         {
    1716         2783 :             int         len = pg_wchar2mb_with_len(matchctx->wide_str + so,
    1717              :                                                    buf,
    1718              :                                                    eo - so);
    1719              : 
    1720              :             Assert(len < matchctx->conv_bufsiz);
    1721         2783 :             elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
    1722         2783 :             nulls[i] = false;
    1723              :         }
    1724              :         else
    1725              :         {
    1726            0 :             elems[i] = DirectFunctionCall3(text_substr,
    1727              :                                            PointerGetDatum(matchctx->orig_str),
    1728              :                                            Int32GetDatum(so + 1),
    1729              :                                            Int32GetDatum(eo - so));
    1730            0 :             nulls[i] = false;
    1731              :         }
    1732              :     }
    1733              : 
    1734              :     /* And form an array */
    1735         1531 :     dims[0] = matchctx->npatterns;
    1736         1531 :     lbs[0] = 1;
    1737              :     /* XXX: this hardcodes assumptions about the text type */
    1738         1531 :     return construct_md_array(elems, nulls, 1, dims, lbs,
    1739              :                               TEXTOID, -1, false, TYPALIGN_INT);
    1740              : }
    1741              : 
    1742              : /*
    1743              :  * regexp_split_to_table()
    1744              :  *      Split the string at matches of the pattern, returning the
    1745              :  *      split-out substrings as a table.
    1746              :  */
    1747              : Datum
    1748          311 : regexp_split_to_table(PG_FUNCTION_ARGS)
    1749              : {
    1750              :     FuncCallContext *funcctx;
    1751              :     regexp_matches_ctx *splitctx;
    1752              : 
    1753          311 :     if (SRF_IS_FIRSTCALL())
    1754              :     {
    1755           26 :         text       *pattern = PG_GETARG_TEXT_PP(1);
    1756           26 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1757              :         pg_re_flags re_flags;
    1758              :         MemoryContext oldcontext;
    1759              : 
    1760           26 :         funcctx = SRF_FIRSTCALL_INIT();
    1761           26 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1762              : 
    1763              :         /* Determine options */
    1764           26 :         parse_re_flags(&re_flags, flags);
    1765              :         /* User mustn't specify 'g' */
    1766           23 :         if (re_flags.glob)
    1767            3 :             ereport(ERROR,
    1768              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1769              :             /* translator: %s is a SQL function name */
    1770              :                      errmsg("%s does not support the \"global\" option",
    1771              :                             "regexp_split_to_table()")));
    1772              :         /* But we find all the matches anyway */
    1773           20 :         re_flags.glob = true;
    1774              : 
    1775              :         /* be sure to copy the input string into the multi-call ctx */
    1776           20 :         splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
    1777              :                                         &re_flags, 0,
    1778              :                                         PG_GET_COLLATION(),
    1779              :                                         false, true, true);
    1780              : 
    1781           20 :         MemoryContextSwitchTo(oldcontext);
    1782           20 :         funcctx->user_fctx = splitctx;
    1783              :     }
    1784              : 
    1785          305 :     funcctx = SRF_PERCALL_SETUP();
    1786          305 :     splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
    1787              : 
    1788          305 :     if (splitctx->next_match <= splitctx->nmatches)
    1789              :     {
    1790          285 :         Datum       result = build_regexp_split_result(splitctx);
    1791              : 
    1792          285 :         splitctx->next_match++;
    1793          285 :         SRF_RETURN_NEXT(funcctx, result);
    1794              :     }
    1795              : 
    1796           20 :     SRF_RETURN_DONE(funcctx);
    1797              : }
    1798              : 
    1799              : /* This is separate to keep the opr_sanity regression test from complaining */
    1800              : Datum
    1801          276 : regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
    1802              : {
    1803          276 :     return regexp_split_to_table(fcinfo);
    1804              : }
    1805              : 
    1806              : /*
    1807              :  * regexp_split_to_array()
    1808              :  *      Split the string at matches of the pattern, returning the
    1809              :  *      split-out substrings as an array.
    1810              :  */
    1811              : Datum
    1812       100212 : regexp_split_to_array(PG_FUNCTION_ARGS)
    1813              : {
    1814       100212 :     ArrayBuildState *astate = NULL;
    1815              :     pg_re_flags re_flags;
    1816              :     regexp_matches_ctx *splitctx;
    1817              : 
    1818              :     /* Determine options */
    1819       100212 :     parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2));
    1820              :     /* User mustn't specify 'g' */
    1821       100209 :     if (re_flags.glob)
    1822            3 :         ereport(ERROR,
    1823              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1824              :         /* translator: %s is a SQL function name */
    1825              :                  errmsg("%s does not support the \"global\" option",
    1826              :                         "regexp_split_to_array()")));
    1827              :     /* But we find all the matches anyway */
    1828       100206 :     re_flags.glob = true;
    1829              : 
    1830       100206 :     splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
    1831       100206 :                                     PG_GETARG_TEXT_PP(1),
    1832              :                                     &re_flags, 0,
    1833              :                                     PG_GET_COLLATION(),
    1834              :                                     false, true, true);
    1835              : 
    1836       645689 :     while (splitctx->next_match <= splitctx->nmatches)
    1837              :     {
    1838       545483 :         astate = accumArrayResult(astate,
    1839              :                                   build_regexp_split_result(splitctx),
    1840              :                                   false,
    1841              :                                   TEXTOID,
    1842              :                                   CurrentMemoryContext);
    1843       545483 :         splitctx->next_match++;
    1844              :     }
    1845              : 
    1846       100206 :     PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
    1847              : }
    1848              : 
    1849              : /* This is separate to keep the opr_sanity regression test from complaining */
    1850              : Datum
    1851       100191 : regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
    1852              : {
    1853       100191 :     return regexp_split_to_array(fcinfo);
    1854              : }
    1855              : 
    1856              : /*
    1857              :  * build_regexp_split_result - build output string for current match
    1858              :  *
    1859              :  * We return the string between the current match and the previous one,
    1860              :  * or the string after the last match when next_match == nmatches.
    1861              :  */
    1862              : static Datum
    1863       545768 : build_regexp_split_result(regexp_matches_ctx *splitctx)
    1864              : {
    1865       545768 :     char       *buf = splitctx->conv_buf;
    1866              :     int         startpos;
    1867              :     int         endpos;
    1868              : 
    1869       545768 :     if (splitctx->next_match > 0)
    1870       445542 :         startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
    1871              :     else
    1872       100226 :         startpos = 0;
    1873       545768 :     if (startpos < 0)
    1874            0 :         elog(ERROR, "invalid match ending position");
    1875              : 
    1876       545768 :     endpos = splitctx->match_locs[splitctx->next_match * 2];
    1877       545768 :     if (endpos < startpos)
    1878            0 :         elog(ERROR, "invalid match starting position");
    1879              : 
    1880       545768 :     if (buf)
    1881              :     {
    1882              :         int         len;
    1883              : 
    1884       545768 :         len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
    1885              :                                    buf,
    1886              :                                    endpos - startpos);
    1887              :         Assert(len < splitctx->conv_bufsiz);
    1888       545768 :         return PointerGetDatum(cstring_to_text_with_len(buf, len));
    1889              :     }
    1890              :     else
    1891              :     {
    1892            0 :         return DirectFunctionCall3(text_substr,
    1893              :                                    PointerGetDatum(splitctx->orig_str),
    1894              :                                    Int32GetDatum(startpos + 1),
    1895              :                                    Int32GetDatum(endpos - startpos));
    1896              :     }
    1897              : }
    1898              : 
    1899              : /*
    1900              :  * regexp_substr()
    1901              :  *      Return the substring that matches a regular expression pattern
    1902              :  */
    1903              : Datum
    1904           54 : regexp_substr(PG_FUNCTION_ARGS)
    1905              : {
    1906           54 :     text       *str = PG_GETARG_TEXT_PP(0);
    1907           54 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1908           54 :     int         start = 1;
    1909           54 :     int         n = 1;
    1910           54 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(4);
    1911           54 :     int         subexpr = 0;
    1912              :     int         so,
    1913              :                 eo,
    1914              :                 pos;
    1915              :     pg_re_flags re_flags;
    1916              :     regexp_matches_ctx *matchctx;
    1917              : 
    1918              :     /* Collect optional parameters */
    1919           54 :     if (PG_NARGS() > 2)
    1920              :     {
    1921           45 :         start = PG_GETARG_INT32(2);
    1922           45 :         if (start <= 0)
    1923            3 :             ereport(ERROR,
    1924              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1925              :                      errmsg("invalid value for parameter \"%s\": %d",
    1926              :                             "start", start)));
    1927              :     }
    1928           51 :     if (PG_NARGS() > 3)
    1929              :     {
    1930           39 :         n = PG_GETARG_INT32(3);
    1931           39 :         if (n <= 0)
    1932            3 :             ereport(ERROR,
    1933              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1934              :                      errmsg("invalid value for parameter \"%s\": %d",
    1935              :                             "n", n)));
    1936              :     }
    1937           48 :     if (PG_NARGS() > 5)
    1938              :     {
    1939           24 :         subexpr = PG_GETARG_INT32(5);
    1940           24 :         if (subexpr < 0)
    1941            3 :             ereport(ERROR,
    1942              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1943              :                      errmsg("invalid value for parameter \"%s\": %d",
    1944              :                             "subexpr", subexpr)));
    1945              :     }
    1946              : 
    1947              :     /* Determine options */
    1948           45 :     parse_re_flags(&re_flags, flags);
    1949              :     /* User mustn't specify 'g' */
    1950           45 :     if (re_flags.glob)
    1951            3 :         ereport(ERROR,
    1952              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1953              :         /* translator: %s is a SQL function name */
    1954              :                  errmsg("%s does not support the \"global\" option",
    1955              :                         "regexp_substr()")));
    1956              :     /* But we find all the matches anyway */
    1957           42 :     re_flags.glob = true;
    1958              : 
    1959              :     /* Do the matching */
    1960           42 :     matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
    1961              :                                     PG_GET_COLLATION(),
    1962              :                                     (subexpr > 0),   /* need submatches? */
    1963              :                                     false, false);
    1964              : 
    1965              :     /* When n exceeds matches return NULL (includes case of no matches) */
    1966           42 :     if (n > matchctx->nmatches)
    1967            6 :         PG_RETURN_NULL();
    1968              : 
    1969              :     /* When subexpr exceeds number of subexpressions return NULL */
    1970           36 :     if (subexpr > matchctx->npatterns)
    1971            3 :         PG_RETURN_NULL();
    1972              : 
    1973              :     /* Select the appropriate match position to return */
    1974           33 :     pos = (n - 1) * matchctx->npatterns;
    1975           33 :     if (subexpr > 0)
    1976           15 :         pos += subexpr - 1;
    1977           33 :     pos *= 2;
    1978           33 :     so = matchctx->match_locs[pos];
    1979           33 :     eo = matchctx->match_locs[pos + 1];
    1980              : 
    1981           33 :     if (so < 0 || eo < 0)
    1982            3 :         PG_RETURN_NULL();       /* unidentifiable location */
    1983              : 
    1984           30 :     PG_RETURN_DATUM(DirectFunctionCall3(text_substr,
    1985              :                                         PointerGetDatum(matchctx->orig_str),
    1986              :                                         Int32GetDatum(so + 1),
    1987              :                                         Int32GetDatum(eo - so)));
    1988              : }
    1989              : 
    1990              : /* This is separate to keep the opr_sanity regression test from complaining */
    1991              : Datum
    1992            9 : regexp_substr_no_start(PG_FUNCTION_ARGS)
    1993              : {
    1994            9 :     return regexp_substr(fcinfo);
    1995              : }
    1996              : 
    1997              : /* This is separate to keep the opr_sanity regression test from complaining */
    1998              : Datum
    1999            3 : regexp_substr_no_n(PG_FUNCTION_ARGS)
    2000              : {
    2001            3 :     return regexp_substr(fcinfo);
    2002              : }
    2003              : 
    2004              : /* This is separate to keep the opr_sanity regression test from complaining */
    2005              : Datum
    2006           12 : regexp_substr_no_flags(PG_FUNCTION_ARGS)
    2007              : {
    2008           12 :     return regexp_substr(fcinfo);
    2009              : }
    2010              : 
    2011              : /* This is separate to keep the opr_sanity regression test from complaining */
    2012              : Datum
    2013            6 : regexp_substr_no_subexpr(PG_FUNCTION_ARGS)
    2014              : {
    2015            6 :     return regexp_substr(fcinfo);
    2016              : }
    2017              : 
    2018              : /*
    2019              :  * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
    2020              :  *
    2021              :  * The result is NULL if there is no fixed prefix, else a palloc'd string.
    2022              :  * If it is an exact match, not just a prefix, *exact is returned as true.
    2023              :  */
    2024              : char *
    2025         8485 : regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
    2026              :                     bool *exact)
    2027              : {
    2028              :     char       *result;
    2029              :     regex_t    *re;
    2030              :     int         cflags;
    2031              :     int         re_result;
    2032              :     pg_wchar   *str;
    2033              :     size_t      slen;
    2034              :     size_t      maxlen;
    2035              :     char        errMsg[100];
    2036              : 
    2037         8485 :     *exact = false;             /* default result */
    2038              : 
    2039              :     /* Compile RE */
    2040         8485 :     cflags = REG_ADVANCED;
    2041         8485 :     if (case_insensitive)
    2042           31 :         cflags |= REG_ICASE;
    2043              : 
    2044         8485 :     re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
    2045              : 
    2046              :     /* Examine it to see if there's a fixed prefix */
    2047         8473 :     re_result = pg_regprefix(re, &str, &slen);
    2048              : 
    2049         8473 :     switch (re_result)
    2050              :     {
    2051          384 :         case REG_NOMATCH:
    2052          384 :             return NULL;
    2053              : 
    2054         1578 :         case REG_PREFIX:
    2055              :             /* continue with wchar conversion */
    2056         1578 :             break;
    2057              : 
    2058         6511 :         case REG_EXACT:
    2059         6511 :             *exact = true;
    2060              :             /* continue with wchar conversion */
    2061         6511 :             break;
    2062              : 
    2063            0 :         default:
    2064              :             /* re failed??? */
    2065            0 :             pg_regerror(re_result, re, errMsg, sizeof(errMsg));
    2066            0 :             ereport(ERROR,
    2067              :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    2068              :                      errmsg("regular expression failed: %s", errMsg)));
    2069              :             break;
    2070              :     }
    2071              : 
    2072              :     /* Convert pg_wchar result back to database encoding */
    2073         8089 :     maxlen = pg_database_encoding_max_length() * slen + 1;
    2074         8089 :     result = (char *) palloc(maxlen);
    2075         8089 :     slen = pg_wchar2mb_with_len(str, result, slen);
    2076              :     Assert(slen < maxlen);
    2077              : 
    2078         8089 :     pfree(str);
    2079              : 
    2080         8089 :     return result;
    2081              : }

Generated by: LCOV version 2.0-1