LCOV - code coverage report
Current view: top level - src/backend/utils/adt - regexp.c (source / functions) Hit Total Coverage
Test: PostgreSQL 15beta1 Lines: 563 642 87.7 %
Date: 2022-05-18 02:09:37 Functions: 49 51 96.1 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regexp.c
       4             :  *    Postgres' interface to the regular expression package.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/adt/regexp.c
      12             :  *
      13             :  *      Alistair Crooks added the code for the regex caching
      14             :  *      agc - cached the regular expressions used - there's a good chance
      15             :  *      that we'll get a hit, so this saves a compile step for every
      16             :  *      attempted match. I haven't actually measured the speed improvement,
      17             :  *      but it `looks' a lot quicker visually when watching regression
      18             :  *      test output.
      19             :  *
      20             :  *      agc - incorporated Keith Bostic's Berkeley regex code into
      21             :  *      the tree for all ports. To distinguish this regex code from any that
      22             :  *      is existent on a platform, I've prepended the string "pg_" to
      23             :  *      the functions regcomp, regerror, regexec and regfree.
      24             :  *      Fixed a bug that was originally a typo by me, where `i' was used
      25             :  *      instead of `oldest' when compiling regular expressions - benign
      26             :  *      results mostly, although occasionally it bit you...
      27             :  *
      28             :  *-------------------------------------------------------------------------
      29             :  */
      30             : #include "postgres.h"
      31             : 
      32             : #include "catalog/pg_type.h"
      33             : #include "funcapi.h"
      34             : #include "miscadmin.h"
      35             : #include "regex/regex.h"
      36             : #include "utils/array.h"
      37             : #include "utils/builtins.h"
      38             : #include "utils/memutils.h"
      39             : #include "utils/varlena.h"
      40             : 
      41             : #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
      42             :     (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
      43             : 
      44             : 
      45             : /* all the options of interest for regex functions */
      46             : typedef struct pg_re_flags
      47             : {
      48             :     int         cflags;         /* compile flags for Spencer's regex code */
      49             :     bool        glob;           /* do it globally (for each occurrence) */
      50             : } pg_re_flags;
      51             : 
      52             : /* cross-call state for regexp_match and regexp_split functions */
      53             : typedef struct regexp_matches_ctx
      54             : {
      55             :     text       *orig_str;       /* data string in original TEXT form */
      56             :     int         nmatches;       /* number of places where pattern matched */
      57             :     int         npatterns;      /* number of capturing subpatterns */
      58             :     /* We store start char index and end+1 char index for each match */
      59             :     /* so the number of entries in match_locs is nmatches * npatterns * 2 */
      60             :     int        *match_locs;     /* 0-based character indexes */
      61             :     int         next_match;     /* 0-based index of next match to process */
      62             :     /* workspace for build_regexp_match_result() */
      63             :     Datum      *elems;          /* has npatterns elements */
      64             :     bool       *nulls;          /* has npatterns elements */
      65             :     pg_wchar   *wide_str;       /* wide-char version of original string */
      66             :     char       *conv_buf;       /* conversion buffer, if needed */
      67             :     int         conv_bufsiz;    /* size thereof */
      68             : } regexp_matches_ctx;
      69             : 
      70             : /*
      71             :  * We cache precompiled regular expressions using a "self organizing list"
      72             :  * structure, in which recently-used items tend to be near the front.
      73             :  * Whenever we use an entry, it's moved up to the front of the list.
      74             :  * Over time, an item's average position corresponds to its frequency of use.
      75             :  *
      76             :  * When we first create an entry, it's inserted at the front of
      77             :  * the array, dropping the entry at the end of the array if necessary to
      78             :  * make room.  (This might seem to be weighting the new entry too heavily,
      79             :  * but if we insert new entries further back, we'll be unable to adjust to
      80             :  * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
      81             :  * never-before-seen items used circularly.  We ought to be able to handle
      82             :  * that case, so we have to insert at the front.)
      83             :  *
      84             :  * Knuth mentions a variant strategy in which a used item is moved up just
      85             :  * one place in the list.  Although he says this uses fewer comparisons on
      86             :  * average, it seems not to adapt very well to the situation where you have
      87             :  * both some reusable patterns and a steady stream of non-reusable patterns.
      88             :  * A reusable pattern that isn't used at least as often as non-reusable
      89             :  * patterns are seen will "fail to keep up" and will drop off the end of the
      90             :  * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
      91             :  * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
      92             :  */
      93             : 
      94             : /* this is the maximum number of cached regular expressions */
      95             : #ifndef MAX_CACHED_RES
      96             : #define MAX_CACHED_RES  32
      97             : #endif
      98             : 
      99             : /* this structure describes one cached regular expression */
     100             : typedef struct cached_re_str
     101             : {
     102             :     char       *cre_pat;        /* original RE (not null terminated!) */
     103             :     int         cre_pat_len;    /* length of original RE, in bytes */
     104             :     int         cre_flags;      /* compile flags: extended,icase etc */
     105             :     Oid         cre_collation;  /* collation to use */
     106             :     regex_t     cre_re;         /* the compiled regular expression */
     107             : } cached_re_str;
     108             : 
     109             : static int  num_res = 0;        /* # of cached re's */
     110             : static cached_re_str re_array[MAX_CACHED_RES];  /* cached re's */
     111             : 
     112             : 
     113             : /* Local functions */
     114             : static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
     115             :                                                 pg_re_flags *flags,
     116             :                                                 int start_search,
     117             :                                                 Oid collation,
     118             :                                                 bool use_subpatterns,
     119             :                                                 bool ignore_degenerate,
     120             :                                                 bool fetching_unmatched);
     121             : static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
     122             : static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
     123             : 
     124             : 
     125             : /*
     126             :  * RE_compile_and_cache - compile a RE, caching if possible
     127             :  *
     128             :  * Returns regex_t *
     129             :  *
     130             :  *  text_re --- the pattern, expressed as a TEXT object
     131             :  *  cflags --- compile options for the pattern
     132             :  *  collation --- collation to use for LC_CTYPE-dependent behavior
     133             :  *
     134             :  * Pattern is given in the database encoding.  We internally convert to
     135             :  * an array of pg_wchar, which is what Spencer's regex package wants.
     136             :  */
     137             : regex_t *
     138      852278 : RE_compile_and_cache(text *text_re, int cflags, Oid collation)
     139             : {
     140      852278 :     int         text_re_len = VARSIZE_ANY_EXHDR(text_re);
     141      852278 :     char       *text_re_val = VARDATA_ANY(text_re);
     142             :     pg_wchar   *pattern;
     143             :     int         pattern_len;
     144             :     int         i;
     145             :     int         regcomp_result;
     146             :     cached_re_str re_temp;
     147             :     char        errMsg[100];
     148             : 
     149             :     /*
     150             :      * Look for a match among previously compiled REs.  Since the data
     151             :      * structure is self-organizing with most-used entries at the front, our
     152             :      * search strategy can just be to scan from the front.
     153             :      */
     154     1345626 :     for (i = 0; i < num_res; i++)
     155             :     {
     156     1340290 :         if (re_array[i].cre_pat_len == text_re_len &&
     157      854674 :             re_array[i].cre_flags == cflags &&
     158      853416 :             re_array[i].cre_collation == collation &&
     159      853312 :             memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
     160             :         {
     161             :             /*
     162             :              * Found a match; move it to front if not there already.
     163             :              */
     164      846942 :             if (i > 0)
     165             :             {
     166      420376 :                 re_temp = re_array[i];
     167      420376 :                 memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
     168      420376 :                 re_array[0] = re_temp;
     169             :             }
     170             : 
     171      846942 :             return &re_array[0].cre_re;
     172             :         }
     173             :     }
     174             : 
     175             :     /*
     176             :      * Couldn't find it, so try to compile the new RE.  To avoid leaking
     177             :      * resources on failure, we build into the re_temp local.
     178             :      */
     179             : 
     180             :     /* Convert pattern string to wide characters */
     181        5336 :     pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
     182        5336 :     pattern_len = pg_mb2wchar_with_len(text_re_val,
     183             :                                        pattern,
     184             :                                        text_re_len);
     185             : 
     186        5336 :     regcomp_result = pg_regcomp(&re_temp.cre_re,
     187             :                                 pattern,
     188             :                                 pattern_len,
     189             :                                 cflags,
     190             :                                 collation);
     191             : 
     192        5336 :     pfree(pattern);
     193             : 
     194        5336 :     if (regcomp_result != REG_OKAY)
     195             :     {
     196             :         /* re didn't compile (no need for pg_regfree, if so) */
     197             : 
     198             :         /*
     199             :          * Here and in other places in this file, do CHECK_FOR_INTERRUPTS
     200             :          * before reporting a regex error.  This is so that if the regex
     201             :          * library aborts and returns REG_CANCEL, we don't print an error
     202             :          * message that implies the regex was invalid.
     203             :          */
     204          42 :         CHECK_FOR_INTERRUPTS();
     205             : 
     206          42 :         pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
     207          42 :         ereport(ERROR,
     208             :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     209             :                  errmsg("invalid regular expression: %s", errMsg)));
     210             :     }
     211             : 
     212             :     /*
     213             :      * We use malloc/free for the cre_pat field because the storage has to
     214             :      * persist across transactions, and because we want to get control back on
     215             :      * out-of-memory.  The Max() is because some malloc implementations return
     216             :      * NULL for malloc(0).
     217             :      */
     218        5294 :     re_temp.cre_pat = malloc(Max(text_re_len, 1));
     219        5294 :     if (re_temp.cre_pat == NULL)
     220             :     {
     221           0 :         pg_regfree(&re_temp.cre_re);
     222           0 :         ereport(ERROR,
     223             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     224             :                  errmsg("out of memory")));
     225             :     }
     226        5294 :     memcpy(re_temp.cre_pat, text_re_val, text_re_len);
     227        5294 :     re_temp.cre_pat_len = text_re_len;
     228        5294 :     re_temp.cre_flags = cflags;
     229        5294 :     re_temp.cre_collation = collation;
     230             : 
     231             :     /*
     232             :      * Okay, we have a valid new item in re_temp; insert it into the storage
     233             :      * array.  Discard last entry if needed.
     234             :      */
     235        5294 :     if (num_res >= MAX_CACHED_RES)
     236             :     {
     237         714 :         --num_res;
     238             :         Assert(num_res < MAX_CACHED_RES);
     239         714 :         pg_regfree(&re_array[num_res].cre_re);
     240         714 :         free(re_array[num_res].cre_pat);
     241             :     }
     242             : 
     243        5294 :     if (num_res > 0)
     244        3892 :         memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
     245             : 
     246        5294 :     re_array[0] = re_temp;
     247        5294 :     num_res++;
     248             : 
     249        5294 :     return &re_array[0].cre_re;
     250             : }
     251             : 
     252             : /*
     253             :  * RE_wchar_execute - execute a RE on pg_wchar data
     254             :  *
     255             :  * Returns true on match, false on no match
     256             :  *
     257             :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     258             :  *  data --- the data to match against (need not be null-terminated)
     259             :  *  data_len --- the length of the data string
     260             :  *  start_search -- the offset in the data to start searching
     261             :  *  nmatch, pmatch  --- optional return area for match details
     262             :  *
     263             :  * Data is given as array of pg_wchar which is what Spencer's regex package
     264             :  * wants.
     265             :  */
     266             : static bool
     267     1632956 : RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
     268             :                  int start_search, int nmatch, regmatch_t *pmatch)
     269             : {
     270             :     int         regexec_result;
     271             :     char        errMsg[100];
     272             : 
     273             :     /* Perform RE match and return result */
     274     1632956 :     regexec_result = pg_regexec(re,
     275             :                                 data,
     276             :                                 data_len,
     277             :                                 start_search,
     278             :                                 NULL,   /* no details */
     279             :                                 nmatch,
     280             :                                 pmatch,
     281             :                                 0);
     282             : 
     283     1632956 :     if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
     284             :     {
     285             :         /* re failed??? */
     286           0 :         CHECK_FOR_INTERRUPTS();
     287           0 :         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
     288           0 :         ereport(ERROR,
     289             :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     290             :                  errmsg("regular expression failed: %s", errMsg)));
     291             :     }
     292             : 
     293     1632956 :     return (regexec_result == REG_OKAY);
     294             : }
     295             : 
     296             : /*
     297             :  * RE_execute - execute a RE
     298             :  *
     299             :  * Returns true on match, false on no match
     300             :  *
     301             :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     302             :  *  dat --- the data to match against (need not be null-terminated)
     303             :  *  dat_len --- the length of the data string
     304             :  *  nmatch, pmatch  --- optional return area for match details
     305             :  *
     306             :  * Data is given in the database encoding.  We internally
     307             :  * convert to array of pg_wchar which is what Spencer's regex package wants.
     308             :  */
     309             : static bool
     310      627806 : RE_execute(regex_t *re, char *dat, int dat_len,
     311             :            int nmatch, regmatch_t *pmatch)
     312             : {
     313             :     pg_wchar   *data;
     314             :     int         data_len;
     315             :     bool        match;
     316             : 
     317             :     /* Convert data string to wide characters */
     318      627806 :     data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
     319      627806 :     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
     320             : 
     321             :     /* Perform RE match and return result */
     322      627806 :     match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
     323             : 
     324      627806 :     pfree(data);
     325      627806 :     return match;
     326             : }
     327             : 
     328             : /*
     329             :  * RE_compile_and_execute - compile and execute a RE
     330             :  *
     331             :  * Returns true on match, false on no match
     332             :  *
     333             :  *  text_re --- the pattern, expressed as a TEXT object
     334             :  *  dat --- the data to match against (need not be null-terminated)
     335             :  *  dat_len --- the length of the data string
     336             :  *  cflags --- compile options for the pattern
     337             :  *  collation --- collation to use for LC_CTYPE-dependent behavior
     338             :  *  nmatch, pmatch  --- optional return area for match details
     339             :  *
     340             :  * Both pattern and data are given in the database encoding.  We internally
     341             :  * convert to array of pg_wchar which is what Spencer's regex package wants.
     342             :  */
     343             : bool
     344      625756 : RE_compile_and_execute(text *text_re, char *dat, int dat_len,
     345             :                        int cflags, Oid collation,
     346             :                        int nmatch, regmatch_t *pmatch)
     347             : {
     348             :     regex_t    *re;
     349             : 
     350             :     /* Use REG_NOSUB if caller does not want sub-match details */
     351      625756 :     if (nmatch < 2)
     352      625756 :         cflags |= REG_NOSUB;
     353             : 
     354             :     /* Compile RE */
     355      625756 :     re = RE_compile_and_cache(text_re, cflags, collation);
     356             : 
     357      625732 :     return RE_execute(re, dat, dat_len, nmatch, pmatch);
     358             : }
     359             : 
     360             : 
     361             : /*
     362             :  * parse_re_flags - parse the options argument of regexp_match and friends
     363             :  *
     364             :  *  flags --- output argument, filled with desired options
     365             :  *  opts --- TEXT object, or NULL for defaults
     366             :  *
     367             :  * This accepts all the options allowed by any of the callers; callers that
     368             :  * don't want some have to reject them after the fact.
     369             :  */
     370             : static void
     371      204922 : parse_re_flags(pg_re_flags *flags, text *opts)
     372             : {
     373             :     /* regex flavor is always folded into the compile flags */
     374      204922 :     flags->cflags = REG_ADVANCED;
     375      204922 :     flags->glob = false;
     376             : 
     377      204922 :     if (opts)
     378             :     {
     379        1998 :         char       *opt_p = VARDATA_ANY(opts);
     380        1998 :         int         opt_len = VARSIZE_ANY_EXHDR(opts);
     381             :         int         i;
     382             : 
     383        5092 :         for (i = 0; i < opt_len; i++)
     384             :         {
     385        3118 :             switch (opt_p[i])
     386             :             {
     387        1676 :                 case 'g':
     388        1676 :                     flags->glob = true;
     389        1676 :                     break;
     390           0 :                 case 'b':       /* BREs (but why???) */
     391           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
     392           0 :                     break;
     393          10 :                 case 'c':       /* case sensitive */
     394          10 :                     flags->cflags &= ~REG_ICASE;
     395          10 :                     break;
     396           0 :                 case 'e':       /* plain EREs */
     397           0 :                     flags->cflags |= REG_EXTENDED;
     398           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
     399           0 :                     break;
     400         292 :                 case 'i':       /* case insensitive */
     401         292 :                     flags->cflags |= REG_ICASE;
     402         292 :                     break;
     403        1098 :                 case 'm':       /* Perloid synonym for n */
     404             :                 case 'n':       /* \n affects ^ $ . [^ */
     405        1098 :                     flags->cflags |= REG_NEWLINE;
     406        1098 :                     break;
     407           0 :                 case 'p':       /* ~Perl, \n affects . [^ */
     408           0 :                     flags->cflags |= REG_NLSTOP;
     409           0 :                     flags->cflags &= ~REG_NLANCH;
     410           0 :                     break;
     411           0 :                 case 'q':       /* literal string */
     412           0 :                     flags->cflags |= REG_QUOTE;
     413           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
     414           0 :                     break;
     415          12 :                 case 's':       /* single line, \n ordinary */
     416          12 :                     flags->cflags &= ~REG_NEWLINE;
     417          12 :                     break;
     418           0 :                 case 't':       /* tight syntax */
     419           0 :                     flags->cflags &= ~REG_EXPANDED;
     420           0 :                     break;
     421           0 :                 case 'w':       /* weird, \n affects ^ $ only */
     422           0 :                     flags->cflags &= ~REG_NLSTOP;
     423           0 :                     flags->cflags |= REG_NLANCH;
     424           0 :                     break;
     425           6 :                 case 'x':       /* expanded syntax */
     426           6 :                     flags->cflags |= REG_EXPANDED;
     427           6 :                     break;
     428          24 :                 default:
     429          24 :                     ereport(ERROR,
     430             :                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     431             :                              errmsg("invalid regular expression option: \"%.*s\"",
     432             :                                     pg_mblen(opt_p + i), opt_p + i)));
     433             :                     break;
     434             :             }
     435             :         }
     436             :     }
     437      204898 : }
     438             : 
     439             : 
     440             : /*
     441             :  *  interface routines called by the function manager
     442             :  */
     443             : 
     444             : Datum
     445      207388 : nameregexeq(PG_FUNCTION_ARGS)
     446             : {
     447      207388 :     Name        n = PG_GETARG_NAME(0);
     448      207388 :     text       *p = PG_GETARG_TEXT_PP(1);
     449             : 
     450      207388 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     451             :                                           NameStr(*n),
     452             :                                           strlen(NameStr(*n)),
     453             :                                           REG_ADVANCED,
     454             :                                           PG_GET_COLLATION(),
     455             :                                           0, NULL));
     456             : }
     457             : 
     458             : Datum
     459        9122 : nameregexne(PG_FUNCTION_ARGS)
     460             : {
     461        9122 :     Name        n = PG_GETARG_NAME(0);
     462        9122 :     text       *p = PG_GETARG_TEXT_PP(1);
     463             : 
     464        9122 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     465             :                                            NameStr(*n),
     466             :                                            strlen(NameStr(*n)),
     467             :                                            REG_ADVANCED,
     468             :                                            PG_GET_COLLATION(),
     469             :                                            0, NULL));
     470             : }
     471             : 
     472             : Datum
     473      374738 : textregexeq(PG_FUNCTION_ARGS)
     474             : {
     475      374738 :     text       *s = PG_GETARG_TEXT_PP(0);
     476      374738 :     text       *p = PG_GETARG_TEXT_PP(1);
     477             : 
     478      374738 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     479             :                                           VARDATA_ANY(s),
     480             :                                           VARSIZE_ANY_EXHDR(s),
     481             :                                           REG_ADVANCED,
     482             :                                           PG_GET_COLLATION(),
     483             :                                           0, NULL));
     484             : }
     485             : 
     486             : Datum
     487       34110 : textregexne(PG_FUNCTION_ARGS)
     488             : {
     489       34110 :     text       *s = PG_GETARG_TEXT_PP(0);
     490       34110 :     text       *p = PG_GETARG_TEXT_PP(1);
     491             : 
     492       34110 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     493             :                                            VARDATA_ANY(s),
     494             :                                            VARSIZE_ANY_EXHDR(s),
     495             :                                            REG_ADVANCED,
     496             :                                            PG_GET_COLLATION(),
     497             :                                            0, NULL));
     498             : }
     499             : 
     500             : 
     501             : /*
     502             :  *  routines that use the regexp stuff, but ignore the case.
     503             :  *  for this, we use the REG_ICASE flag to pg_regcomp
     504             :  */
     505             : 
     506             : 
     507             : Datum
     508           0 : nameicregexeq(PG_FUNCTION_ARGS)
     509             : {
     510           0 :     Name        n = PG_GETARG_NAME(0);
     511           0 :     text       *p = PG_GETARG_TEXT_PP(1);
     512             : 
     513           0 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     514             :                                           NameStr(*n),
     515             :                                           strlen(NameStr(*n)),
     516             :                                           REG_ADVANCED | REG_ICASE,
     517             :                                           PG_GET_COLLATION(),
     518             :                                           0, NULL));
     519             : }
     520             : 
     521             : Datum
     522           6 : nameicregexne(PG_FUNCTION_ARGS)
     523             : {
     524           6 :     Name        n = PG_GETARG_NAME(0);
     525           6 :     text       *p = PG_GETARG_TEXT_PP(1);
     526             : 
     527           6 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     528             :                                            NameStr(*n),
     529             :                                            strlen(NameStr(*n)),
     530             :                                            REG_ADVANCED | REG_ICASE,
     531             :                                            PG_GET_COLLATION(),
     532             :                                            0, NULL));
     533             : }
     534             : 
     535             : Datum
     536          76 : texticregexeq(PG_FUNCTION_ARGS)
     537             : {
     538          76 :     text       *s = PG_GETARG_TEXT_PP(0);
     539          76 :     text       *p = PG_GETARG_TEXT_PP(1);
     540             : 
     541          76 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     542             :                                           VARDATA_ANY(s),
     543             :                                           VARSIZE_ANY_EXHDR(s),
     544             :                                           REG_ADVANCED | REG_ICASE,
     545             :                                           PG_GET_COLLATION(),
     546             :                                           0, NULL));
     547             : }
     548             : 
     549             : Datum
     550          16 : texticregexne(PG_FUNCTION_ARGS)
     551             : {
     552          16 :     text       *s = PG_GETARG_TEXT_PP(0);
     553          16 :     text       *p = PG_GETARG_TEXT_PP(1);
     554             : 
     555          16 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     556             :                                            VARDATA_ANY(s),
     557             :                                            VARSIZE_ANY_EXHDR(s),
     558             :                                            REG_ADVANCED | REG_ICASE,
     559             :                                            PG_GET_COLLATION(),
     560             :                                            0, NULL));
     561             : }
     562             : 
     563             : 
     564             : /*
     565             :  * textregexsubstr()
     566             :  *      Return a substring matched by a regular expression.
     567             :  */
     568             : Datum
     569        2074 : textregexsubstr(PG_FUNCTION_ARGS)
     570             : {
     571        2074 :     text       *s = PG_GETARG_TEXT_PP(0);
     572        2074 :     text       *p = PG_GETARG_TEXT_PP(1);
     573             :     regex_t    *re;
     574             :     regmatch_t  pmatch[2];
     575             :     int         so,
     576             :                 eo;
     577             : 
     578             :     /* Compile RE */
     579        2074 :     re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
     580             : 
     581             :     /*
     582             :      * We pass two regmatch_t structs to get info about the overall match and
     583             :      * the match for the first parenthesized subexpression (if any). If there
     584             :      * is a parenthesized subexpression, we return what it matched; else
     585             :      * return what the whole regexp matched.
     586             :      */
     587        4148 :     if (!RE_execute(re,
     588        4148 :                     VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
     589             :                     2, pmatch))
     590           6 :         PG_RETURN_NULL();       /* definitely no match */
     591             : 
     592        2068 :     if (re->re_nsub > 0)
     593             :     {
     594             :         /* has parenthesized subexpressions, use the first one */
     595        1522 :         so = pmatch[1].rm_so;
     596        1522 :         eo = pmatch[1].rm_eo;
     597             :     }
     598             :     else
     599             :     {
     600             :         /* no parenthesized subexpression, use whole match */
     601         546 :         so = pmatch[0].rm_so;
     602         546 :         eo = pmatch[0].rm_eo;
     603             :     }
     604             : 
     605             :     /*
     606             :      * It is possible to have a match to the whole pattern but no match for a
     607             :      * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
     608             :      * there is no subexpression match.  So this extra test for match failure
     609             :      * is not redundant.
     610             :      */
     611        2068 :     if (so < 0 || eo < 0)
     612           6 :         PG_RETURN_NULL();
     613             : 
     614        2062 :     return DirectFunctionCall3(text_substr,
     615             :                                PointerGetDatum(s),
     616             :                                Int32GetDatum(so + 1),
     617             :                                Int32GetDatum(eo - so));
     618             : }
     619             : 
     620             : /*
     621             :  * textregexreplace_noopt()
     622             :  *      Return a string matched by a regular expression, with replacement.
     623             :  *
     624             :  * This version doesn't have an option argument: we default to case
     625             :  * sensitive match, replace the first instance only.
     626             :  */
     627             : Datum
     628        7472 : textregexreplace_noopt(PG_FUNCTION_ARGS)
     629             : {
     630        7472 :     text       *s = PG_GETARG_TEXT_PP(0);
     631        7472 :     text       *p = PG_GETARG_TEXT_PP(1);
     632        7472 :     text       *r = PG_GETARG_TEXT_PP(2);
     633             : 
     634        7472 :     PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
     635             :                                          REG_ADVANCED, PG_GET_COLLATION(),
     636             :                                          0, 1));
     637             : }
     638             : 
     639             : /*
     640             :  * textregexreplace()
     641             :  *      Return a string matched by a regular expression, with replacement.
     642             :  */
     643             : Datum
     644        1604 : textregexreplace(PG_FUNCTION_ARGS)
     645             : {
     646        1604 :     text       *s = PG_GETARG_TEXT_PP(0);
     647        1604 :     text       *p = PG_GETARG_TEXT_PP(1);
     648        1604 :     text       *r = PG_GETARG_TEXT_PP(2);
     649        1604 :     text       *opt = PG_GETARG_TEXT_PP(3);
     650             :     pg_re_flags flags;
     651             : 
     652             :     /*
     653             :      * regexp_replace() with four arguments will be preferentially resolved as
     654             :      * this form when the fourth argument is of type UNKNOWN.  However, the
     655             :      * user might have intended to call textregexreplace_extended_no_n.  If we
     656             :      * see flags that look like an integer, emit the same error that
     657             :      * parse_re_flags would, but add a HINT about how to fix it.
     658             :      */
     659        1604 :     if (VARSIZE_ANY_EXHDR(opt) > 0)
     660             :     {
     661        1604 :         char       *opt_p = VARDATA_ANY(opt);
     662             : 
     663        1604 :         if (*opt_p >= '0' && *opt_p <= '9')
     664           6 :             ereport(ERROR,
     665             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     666             :                      errmsg("invalid regular expression option: \"%.*s\"",
     667             :                             pg_mblen(opt_p), opt_p),
     668             :                      errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
     669             :     }
     670             : 
     671        1598 :     parse_re_flags(&flags, opt);
     672             : 
     673        1592 :     PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
     674             :                                          flags.cflags, PG_GET_COLLATION(),
     675             :                                          0, flags.glob ? 0 : 1));
     676             : }
     677             : 
     678             : /*
     679             :  * textregexreplace_extended()
     680             :  *      Return a string matched by a regular expression, with replacement.
     681             :  *      Extends textregexreplace by allowing a start position and the
     682             :  *      choice of the occurrence to replace (0 means all occurrences).
     683             :  */
     684             : Datum
     685          66 : textregexreplace_extended(PG_FUNCTION_ARGS)
     686             : {
     687          66 :     text       *s = PG_GETARG_TEXT_PP(0);
     688          66 :     text       *p = PG_GETARG_TEXT_PP(1);
     689          66 :     text       *r = PG_GETARG_TEXT_PP(2);
     690          66 :     int         start = 1;
     691          66 :     int         n = 1;
     692          66 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
     693             :     pg_re_flags re_flags;
     694             : 
     695             :     /* Collect optional parameters */
     696          66 :     if (PG_NARGS() > 3)
     697             :     {
     698          66 :         start = PG_GETARG_INT32(3);
     699          66 :         if (start <= 0)
     700           6 :             ereport(ERROR,
     701             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     702             :                      errmsg("invalid value for parameter \"%s\": %d",
     703             :                             "start", start)));
     704             :     }
     705          60 :     if (PG_NARGS() > 4)
     706             :     {
     707          54 :         n = PG_GETARG_INT32(4);
     708          54 :         if (n < 0)
     709           6 :             ereport(ERROR,
     710             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     711             :                      errmsg("invalid value for parameter \"%s\": %d",
     712             :                             "n", n)));
     713             :     }
     714             : 
     715             :     /* Determine options */
     716          54 :     parse_re_flags(&re_flags, flags);
     717             : 
     718             :     /* If N was not specified, deduce it from the 'g' flag */
     719          54 :     if (PG_NARGS() <= 4)
     720           6 :         n = re_flags.glob ? 0 : 1;
     721             : 
     722             :     /* Do the replacement(s) */
     723          54 :     PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
     724             :                                          re_flags.cflags, PG_GET_COLLATION(),
     725             :                                          start - 1, n));
     726             : }
     727             : 
     728             : /* This is separate to keep the opr_sanity regression test from complaining */
     729             : Datum
     730           6 : textregexreplace_extended_no_n(PG_FUNCTION_ARGS)
     731             : {
     732           6 :     return textregexreplace_extended(fcinfo);
     733             : }
     734             : 
     735             : /* This is separate to keep the opr_sanity regression test from complaining */
     736             : Datum
     737           6 : textregexreplace_extended_no_flags(PG_FUNCTION_ARGS)
     738             : {
     739           6 :     return textregexreplace_extended(fcinfo);
     740             : }
     741             : 
     742             : /*
     743             :  * similar_to_escape(), similar_escape()
     744             :  *
     745             :  * Convert a SQL "SIMILAR TO" regexp pattern to POSIX style, so it can be
     746             :  * used by our regexp engine.
     747             :  *
     748             :  * similar_escape_internal() is the common workhorse for three SQL-exposed
     749             :  * functions.  esc_text can be passed as NULL to select the default escape
     750             :  * (which is '\'), or as an empty string to select no escape character.
     751             :  */
     752             : static text *
     753         114 : similar_escape_internal(text *pat_text, text *esc_text)
     754             : {
     755             :     text       *result;
     756             :     char       *p,
     757             :                *e,
     758             :                *r;
     759             :     int         plen,
     760             :                 elen;
     761         114 :     bool        afterescape = false;
     762         114 :     bool        incharclass = false;
     763         114 :     int         nquotes = 0;
     764             : 
     765         114 :     p = VARDATA_ANY(pat_text);
     766         114 :     plen = VARSIZE_ANY_EXHDR(pat_text);
     767         114 :     if (esc_text == NULL)
     768             :     {
     769             :         /* No ESCAPE clause provided; default to backslash as escape */
     770          22 :         e = "\\";
     771          22 :         elen = 1;
     772             :     }
     773             :     else
     774             :     {
     775          92 :         e = VARDATA_ANY(esc_text);
     776          92 :         elen = VARSIZE_ANY_EXHDR(esc_text);
     777          92 :         if (elen == 0)
     778           6 :             e = NULL;           /* no escape character */
     779          86 :         else if (elen > 1)
     780             :         {
     781           6 :             int         escape_mblen = pg_mbstrlen_with_len(e, elen);
     782             : 
     783           6 :             if (escape_mblen > 1)
     784           6 :                 ereport(ERROR,
     785             :                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     786             :                          errmsg("invalid escape string"),
     787             :                          errhint("Escape string must be empty or one character.")));
     788             :         }
     789             :     }
     790             : 
     791             :     /*----------
     792             :      * We surround the transformed input string with
     793             :      *          ^(?: ... )$
     794             :      * which requires some explanation.  We need "^" and "$" to force
     795             :      * the pattern to match the entire input string as per the SQL spec.
     796             :      * The "(?:" and ")" are a non-capturing set of parens; we have to have
     797             :      * parens in case the string contains "|", else the "^" and "$" will
     798             :      * be bound into the first and last alternatives which is not what we
     799             :      * want, and the parens must be non capturing because we don't want them
     800             :      * to count when selecting output for SUBSTRING.
     801             :      *
     802             :      * When the pattern is divided into three parts by escape-double-quotes,
     803             :      * what we emit is
     804             :      *          ^(?:part1){1,1}?(part2){1,1}(?:part3)$
     805             :      * which requires even more explanation.  The "{1,1}?" on part1 makes it
     806             :      * non-greedy so that it will match the smallest possible amount of text
     807             :      * not the largest, as required by SQL.  The plain parens around part2
     808             :      * are capturing parens so that that part is what controls the result of
     809             :      * SUBSTRING.  The "{1,1}" forces part2 to be greedy, so that it matches
     810             :      * the largest possible amount of text; hence part3 must match the
     811             :      * smallest amount of text, as required by SQL.  We don't need an explicit
     812             :      * greediness marker on part3.  Note that this also confines the effects
     813             :      * of any "|" characters to the respective part, which is what we want.
     814             :      *
     815             :      * The SQL spec says that SUBSTRING's pattern must contain exactly two
     816             :      * escape-double-quotes, but we only complain if there's more than two.
     817             :      * With none, we act as though part1 and part3 are empty; with one, we
     818             :      * act as though part3 is empty.  Both behaviors fall out of omitting
     819             :      * the relevant part separators in the above expansion.  If the result
     820             :      * of this function is used in a plain regexp match (SIMILAR TO), the
     821             :      * escape-double-quotes have no effect on the match behavior.
     822             :      *----------
     823             :      */
     824             : 
     825             :     /*
     826             :      * We need room for the prefix/postfix and part separators, plus as many
     827             :      * as 3 output bytes per input byte; since the input is at most 1GB this
     828             :      * can't overflow size_t.
     829             :      */
     830         108 :     result = (text *) palloc(VARHDRSZ + 23 + 3 * (size_t) plen);
     831         108 :     r = VARDATA(result);
     832             : 
     833         108 :     *r++ = '^';
     834         108 :     *r++ = '(';
     835         108 :     *r++ = '?';
     836         108 :     *r++ = ':';
     837             : 
     838         904 :     while (plen > 0)
     839             :     {
     840         802 :         char        pchar = *p;
     841             : 
     842             :         /*
     843             :          * If both the escape character and the current character from the
     844             :          * pattern are multi-byte, we need to take the slow path.
     845             :          *
     846             :          * But if one of them is single-byte, we can process the pattern one
     847             :          * byte at a time, ignoring multi-byte characters.  (This works
     848             :          * because all server-encodings have the property that a valid
     849             :          * multi-byte character representation cannot contain the
     850             :          * representation of a valid single-byte character.)
     851             :          */
     852             : 
     853         802 :         if (elen > 1)
     854             :         {
     855           0 :             int         mblen = pg_mblen(p);
     856             : 
     857           0 :             if (mblen > 1)
     858             :             {
     859             :                 /* slow, multi-byte path */
     860           0 :                 if (afterescape)
     861             :                 {
     862           0 :                     *r++ = '\\';
     863           0 :                     memcpy(r, p, mblen);
     864           0 :                     r += mblen;
     865           0 :                     afterescape = false;
     866             :                 }
     867           0 :                 else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
     868             :                 {
     869             :                     /* SQL escape character; do not send to output */
     870           0 :                     afterescape = true;
     871             :                 }
     872             :                 else
     873             :                 {
     874             :                     /*
     875             :                      * We know it's a multi-byte character, so we don't need
     876             :                      * to do all the comparisons to single-byte characters
     877             :                      * that we do below.
     878             :                      */
     879           0 :                     memcpy(r, p, mblen);
     880           0 :                     r += mblen;
     881             :                 }
     882             : 
     883           0 :                 p += mblen;
     884           0 :                 plen -= mblen;
     885             : 
     886           0 :                 continue;
     887             :             }
     888             :         }
     889             : 
     890             :         /* fast path */
     891         802 :         if (afterescape)
     892             :         {
     893         142 :             if (pchar == '"' && !incharclass)  /* escape-double-quote? */
     894             :             {
     895             :                 /* emit appropriate part separator, per notes above */
     896         124 :                 if (nquotes == 0)
     897             :                 {
     898          62 :                     *r++ = ')';
     899          62 :                     *r++ = '{';
     900          62 :                     *r++ = '1';
     901          62 :                     *r++ = ',';
     902          62 :                     *r++ = '1';
     903          62 :                     *r++ = '}';
     904          62 :                     *r++ = '?';
     905          62 :                     *r++ = '(';
     906             :                 }
     907          62 :                 else if (nquotes == 1)
     908             :                 {
     909          56 :                     *r++ = ')';
     910          56 :                     *r++ = '{';
     911          56 :                     *r++ = '1';
     912          56 :                     *r++ = ',';
     913          56 :                     *r++ = '1';
     914          56 :                     *r++ = '}';
     915          56 :                     *r++ = '(';
     916          56 :                     *r++ = '?';
     917          56 :                     *r++ = ':';
     918             :                 }
     919             :                 else
     920           6 :                     ereport(ERROR,
     921             :                             (errcode(ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER),
     922             :                              errmsg("SQL regular expression may not contain more than two escape-double-quote separators")));
     923         118 :                 nquotes++;
     924             :             }
     925             :             else
     926             :             {
     927             :                 /*
     928             :                  * We allow any character at all to be escaped; notably, this
     929             :                  * allows access to POSIX character-class escapes such as
     930             :                  * "\d".  The SQL spec is considerably more restrictive.
     931             :                  */
     932          18 :                 *r++ = '\\';
     933          18 :                 *r++ = pchar;
     934             :             }
     935         136 :             afterescape = false;
     936             :         }
     937         660 :         else if (e && pchar == *e)
     938             :         {
     939             :             /* SQL escape character; do not send to output */
     940         142 :             afterescape = true;
     941             :         }
     942         518 :         else if (incharclass)
     943             :         {
     944           0 :             if (pchar == '\\')
     945           0 :                 *r++ = '\\';
     946           0 :             *r++ = pchar;
     947           0 :             if (pchar == ']')
     948           0 :                 incharclass = false;
     949             :         }
     950         518 :         else if (pchar == '[')
     951             :         {
     952           0 :             *r++ = pchar;
     953           0 :             incharclass = true;
     954             :         }
     955         518 :         else if (pchar == '%')
     956             :         {
     957          90 :             *r++ = '.';
     958          90 :             *r++ = '*';
     959             :         }
     960         428 :         else if (pchar == '_')
     961          52 :             *r++ = '.';
     962         376 :         else if (pchar == '(')
     963             :         {
     964             :             /* convert to non-capturing parenthesis */
     965          18 :             *r++ = '(';
     966          18 :             *r++ = '?';
     967          18 :             *r++ = ':';
     968             :         }
     969         358 :         else if (pchar == '\\' || pchar == '.' ||
     970         348 :                  pchar == '^' || pchar == '$')
     971             :         {
     972          10 :             *r++ = '\\';
     973          10 :             *r++ = pchar;
     974             :         }
     975             :         else
     976         348 :             *r++ = pchar;
     977         796 :         p++, plen--;
     978             :     }
     979             : 
     980         102 :     *r++ = ')';
     981         102 :     *r++ = '$';
     982             : 
     983         102 :     SET_VARSIZE(result, r - ((char *) result));
     984             : 
     985         102 :     return result;
     986             : }
     987             : 
     988             : /*
     989             :  * similar_to_escape(pattern, escape)
     990             :  */
     991             : Datum
     992          92 : similar_to_escape_2(PG_FUNCTION_ARGS)
     993             : {
     994          92 :     text       *pat_text = PG_GETARG_TEXT_PP(0);
     995          92 :     text       *esc_text = PG_GETARG_TEXT_PP(1);
     996             :     text       *result;
     997             : 
     998          92 :     result = similar_escape_internal(pat_text, esc_text);
     999             : 
    1000          80 :     PG_RETURN_TEXT_P(result);
    1001             : }
    1002             : 
    1003             : /*
    1004             :  * similar_to_escape(pattern)
    1005             :  * Inserts a default escape character.
    1006             :  */
    1007             : Datum
    1008          22 : similar_to_escape_1(PG_FUNCTION_ARGS)
    1009             : {
    1010          22 :     text       *pat_text = PG_GETARG_TEXT_PP(0);
    1011             :     text       *result;
    1012             : 
    1013          22 :     result = similar_escape_internal(pat_text, NULL);
    1014             : 
    1015          22 :     PG_RETURN_TEXT_P(result);
    1016             : }
    1017             : 
    1018             : /*
    1019             :  * similar_escape(pattern, escape)
    1020             :  *
    1021             :  * Legacy function for compatibility with views stored using the
    1022             :  * pre-v13 expansion of SIMILAR TO.  Unlike the above functions, this
    1023             :  * is non-strict, which leads to not-per-spec handling of "ESCAPE NULL".
    1024             :  */
    1025             : Datum
    1026           0 : similar_escape(PG_FUNCTION_ARGS)
    1027             : {
    1028             :     text       *pat_text;
    1029             :     text       *esc_text;
    1030             :     text       *result;
    1031             : 
    1032             :     /* This function is not strict, so must test explicitly */
    1033           0 :     if (PG_ARGISNULL(0))
    1034           0 :         PG_RETURN_NULL();
    1035           0 :     pat_text = PG_GETARG_TEXT_PP(0);
    1036             : 
    1037           0 :     if (PG_ARGISNULL(1))
    1038           0 :         esc_text = NULL;        /* use default escape character */
    1039             :     else
    1040           0 :         esc_text = PG_GETARG_TEXT_PP(1);
    1041             : 
    1042           0 :     result = similar_escape_internal(pat_text, esc_text);
    1043             : 
    1044           0 :     PG_RETURN_TEXT_P(result);
    1045             : }
    1046             : 
    1047             : /*
    1048             :  * regexp_count()
    1049             :  *      Return the number of matches of a pattern within a string.
    1050             :  */
    1051             : Datum
    1052          48 : regexp_count(PG_FUNCTION_ARGS)
    1053             : {
    1054          48 :     text       *str = PG_GETARG_TEXT_PP(0);
    1055          48 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1056          48 :     int         start = 1;
    1057          48 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(3);
    1058             :     pg_re_flags re_flags;
    1059             :     regexp_matches_ctx *matchctx;
    1060             : 
    1061             :     /* Collect optional parameters */
    1062          48 :     if (PG_NARGS() > 2)
    1063             :     {
    1064          42 :         start = PG_GETARG_INT32(2);
    1065          42 :         if (start <= 0)
    1066          12 :             ereport(ERROR,
    1067             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1068             :                      errmsg("invalid value for parameter \"%s\": %d",
    1069             :                             "start", start)));
    1070             :     }
    1071             : 
    1072             :     /* Determine options */
    1073          36 :     parse_re_flags(&re_flags, flags);
    1074             :     /* User mustn't specify 'g' */
    1075          36 :     if (re_flags.glob)
    1076           0 :         ereport(ERROR,
    1077             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1078             :         /* translator: %s is a SQL function name */
    1079             :                  errmsg("%s does not support the \"global\" option",
    1080             :                         "regexp_count()")));
    1081             :     /* But we find all the matches anyway */
    1082          36 :     re_flags.glob = true;
    1083             : 
    1084             :     /* Do the matching */
    1085          36 :     matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
    1086             :                                     PG_GET_COLLATION(),
    1087             :                                     false,  /* can ignore subexprs */
    1088             :                                     false, false);
    1089             : 
    1090          36 :     PG_RETURN_INT32(matchctx->nmatches);
    1091             : }
    1092             : 
    1093             : /* This is separate to keep the opr_sanity regression test from complaining */
    1094             : Datum
    1095           6 : regexp_count_no_start(PG_FUNCTION_ARGS)
    1096             : {
    1097           6 :     return regexp_count(fcinfo);
    1098             : }
    1099             : 
    1100             : /* This is separate to keep the opr_sanity regression test from complaining */
    1101             : Datum
    1102          30 : regexp_count_no_flags(PG_FUNCTION_ARGS)
    1103             : {
    1104          30 :     return regexp_count(fcinfo);
    1105             : }
    1106             : 
    1107             : /*
    1108             :  * regexp_instr()
    1109             :  *      Return the match's position within the string
                     :  *
                     :  * Arguments (by position in fcinfo):
                     :  *  0  str       - text to search
                     :  *  1  pattern   - regular expression
                     :  *  2  start     - 1-based position at which to begin searching;
                     :  *                 default 1, must be > 0
                     :  *  3  n         - which match to report; default 1, must be > 0
                     :  *  4  endoption - 0 reports the stored start location of the match,
                     :  *                 1 reports the stored end location; default 0
                     :  *  5  flags     - regex option letters; 'g' is rejected
                     :  *  6  subexpr   - 0 selects the whole match, k > 0 selects the k'th
                     :  *                 subexpression; default 0, must be >= 0
                     :  *
                     :  * Arguments 2, 3, 4 and 6 are read only when PG_NARGS() says they were
                     :  * passed.  An out-of-range value raises ERRCODE_INVALID_PARAMETER_VALUE.
                     :  *
                     :  * The result is the selected location plus one, or 0 when there are
                     :  * fewer than n matches, when subexpr exceeds the number of patterns
                     :  * reported by setup_regexp_matches(), or when the selected location is
                     :  * negative.
    1110             :  */
    1111             : Datum
    1112         156 : regexp_instr(PG_FUNCTION_ARGS)
    1113             : {
    1114         156 :     text       *str = PG_GETARG_TEXT_PP(0);
    1115         156 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1116         156 :     int         start = 1;
    1117         156 :     int         n = 1;
    1118         156 :     int         endoption = 0;
    1119         156 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
    1120         156 :     int         subexpr = 0;
    1121             :     int         pos;
    1122             :     pg_re_flags re_flags;
    1123             :     regexp_matches_ctx *matchctx;
    1124             : 
    1125             :     /* Collect optional parameters, validating each one that is present */
    1126         156 :     if (PG_NARGS() > 2)
    1127             :     {
    1128         138 :         start = PG_GETARG_INT32(2);
    1129         138 :         if (start <= 0)
    1130           6 :             ereport(ERROR,
    1131             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1132             :                      errmsg("invalid value for parameter \"%s\": %d",
    1133             :                             "start", start)));
    1134             :     }
    1135         150 :     if (PG_NARGS() > 3)
    1136             :     {
    1137         126 :         n = PG_GETARG_INT32(3);
    1138         126 :         if (n <= 0)
    1139           6 :             ereport(ERROR,
    1140             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1141             :                      errmsg("invalid value for parameter \"%s\": %d",
    1142             :                             "n", n)));
    1143             :     }
    1144         144 :     if (PG_NARGS() > 4)
    1145             :     {
    1146         108 :         endoption = PG_GETARG_INT32(4);
    1147         108 :         if (endoption != 0 && endoption != 1)
    1148          12 :             ereport(ERROR,
    1149             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1150             :                      errmsg("invalid value for parameter \"%s\": %d",
    1151             :                             "endoption", endoption)));
    1152             :     }
    1153         132 :     if (PG_NARGS() > 6)
    1154             :     {
    1155          84 :         subexpr = PG_GETARG_INT32(6);
    1156          84 :         if (subexpr < 0)
    1157           6 :             ereport(ERROR,
    1158             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1159             :                      errmsg("invalid value for parameter \"%s\": %d",
    1160             :                             "subexpr", subexpr)));
    1161             :     }
    1162             : 
    1163             :     /* Determine options */
    1164         126 :     parse_re_flags(&re_flags, flags);
    1165             :     /* User mustn't specify 'g' */
    1166         126 :     if (re_flags.glob)
    1167           6 :         ereport(ERROR,
    1168             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1169             :         /* translator: %s is a SQL function name */
    1170             :                  errmsg("%s does not support the \"global\" option",
    1171             :                         "regexp_instr()")));
    1172             :     /* But we find all the matches anyway, so that the n'th can be selected */
    1173         120 :     re_flags.glob = true;
    1174             : 
    1175             :     /* Do the matching; start is 1-based, the search offset passed is start - 1 */
    1176         120 :     matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
    1177             :                                     PG_GET_COLLATION(),
    1178             :                                     (subexpr > 0),   /* need submatches? */
    1179             :                                     false, false);
    1180             : 
    1181             :     /* When n exceeds matches return 0 (includes case of no matches) */
    1182         120 :     if (n > matchctx->nmatches)
    1183          12 :         PG_RETURN_INT32(0);
    1184             : 
    1185             :     /*
                     :      * When subexpr exceeds number of subexpressions return 0.  Note that
                     :      * setup_regexp_matches() sets npatterns to 1 when it collected no
                     :      * subexpression data, so subexpr = 1 is not rejected here for a
                     :      * pattern with no subexpressions; the whole-match location is used.
                     :      */
    1186         108 :     if (subexpr > matchctx->npatterns)
    1187          12 :         PG_RETURN_INT32(0);
    1188             : 
    1189             :     /*
                     :      * Select the appropriate match position to return.  match_locs holds
                     :      * npatterns (start, end) pairs per match: skip the pairs of the first
                     :      * n - 1 matches, then the pairs of earlier subexpressions, double the
                     :      * pair index to get an array index, and add one to pick the end slot.
                     :      */
    1190          96 :     pos = (n - 1) * matchctx->npatterns;
    1191          96 :     if (subexpr > 0)
    1192          54 :         pos += subexpr - 1;
    1193          96 :     pos *= 2;
    1194          96 :     if (endoption == 1)
    1195          30 :         pos += 1;
    1196             : 
    1197          96 :     if (matchctx->match_locs[pos] >= 0)
    1198          90 :         PG_RETURN_INT32(matchctx->match_locs[pos] + 1);
    1199             :     else
    1200           6 :         PG_RETURN_INT32(0);     /* position not identifiable */
    1201             : }
    1202             : 
    1203             : /*
                     :  * regexp_instr_no_start()
                     :  *      Pass-through entry point: hands fcinfo unchanged to regexp_instr(),
                     :  *      which uses PG_NARGS() to see which optional arguments were passed.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * NOTE(review): presumably registered for the call form that omits
                     :  * "start" and everything after it; catalog entry not visible -- confirm.
                     :  */
    1204             : Datum
    1205          18 : regexp_instr_no_start(PG_FUNCTION_ARGS)
    1206             : {
    1207          18 :     return regexp_instr(fcinfo);
    1208             : }
    1209             : 
    1210             : /*
                     :  * regexp_instr_no_n()
                     :  *      Pass-through entry point: hands fcinfo unchanged to regexp_instr(),
                     :  *      which uses PG_NARGS() to see which optional arguments were passed.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * NOTE(review): presumably registered for the call form that omits
                     :  * "n" and everything after it; catalog entry not visible -- confirm.
                     :  */
    1211             : Datum
    1212           6 : regexp_instr_no_n(PG_FUNCTION_ARGS)
    1213             : {
    1214           6 :     return regexp_instr(fcinfo);
    1215             : }
    1216             : 
    1217             : /*
                     :  * regexp_instr_no_endoption()
                     :  *      Pass-through entry point: hands fcinfo unchanged to regexp_instr(),
                     :  *      which uses PG_NARGS() to see which optional arguments were passed.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * NOTE(review): presumably registered for the call form that omits
                     :  * "endoption" and everything after it; catalog entry not visible --
                     :  * confirm.
                     :  */
    1218             : Datum
    1219          24 : regexp_instr_no_endoption(PG_FUNCTION_ARGS)
    1220             : {
    1221          24 :     return regexp_instr(fcinfo);
    1222             : }
    1223             : 
    1224             : /*
                     :  * regexp_instr_no_flags()
                     :  *      Pass-through entry point: hands fcinfo unchanged to regexp_instr(),
                     :  *      which uses PG_NARGS() to see which optional arguments were passed.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * NOTE(review): presumably registered for the call form that omits
                     :  * "flags" and everything after it; catalog entry not visible -- confirm.
                     :  */
    1225             : Datum
    1226          12 : regexp_instr_no_flags(PG_FUNCTION_ARGS)
    1227             : {
    1228          12 :     return regexp_instr(fcinfo);
    1229             : }
    1230             : 
    1231             : /*
                     :  * regexp_instr_no_subexpr()
                     :  *      Pass-through entry point: hands fcinfo unchanged to regexp_instr(),
                     :  *      which uses PG_NARGS() to see which optional arguments were passed.
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * NOTE(review): presumably registered for the call form that omits
                     :  * "subexpr"; catalog entry not visible -- confirm.
                     :  */
    1232             : Datum
    1233          12 : regexp_instr_no_subexpr(PG_FUNCTION_ARGS)
    1234             : {
    1235          12 :     return regexp_instr(fcinfo);
    1236             : }
    1237             : 
    1238             : /*
    1239             :  * regexp_like()
    1240             :  *      Test for a pattern match within a string.
                     :  *
                     :  * Arguments: 0 = string, 1 = pattern, 2 = flags (fetched with
                     :  * PG_GETARG_TEXT_PP_IF_EXISTS, so it may be absent).  The 'g' flag is
                     :  * rejected with ERRCODE_INVALID_PARAMETER_VALUE.
                     :  *
                     :  * Returns, as a boolean, the result of RE_compile_and_execute() applied
                     :  * to the string's data bytes with the parsed cflags and the call's
                     :  * collation.
    1241             :  */
    1242             : Datum
    1243          30 : regexp_like(PG_FUNCTION_ARGS)
    1244             : {
    1245          30 :     text       *str = PG_GETARG_TEXT_PP(0);
    1246          30 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1247          30 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1248             :     pg_re_flags re_flags;
    1249             : 
    1250             :     /* Determine options */
    1251          30 :     parse_re_flags(&re_flags, flags);
    1252             :     /* User mustn't specify 'g' */
    1253          30 :     if (re_flags.glob)
    1254           6 :         ereport(ERROR,
    1255             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1256             :         /* translator: %s is a SQL function name */
    1257             :                  errmsg("%s does not support the \"global\" option",
    1258             :                         "regexp_like()")));
    1259             : 
    1260             :     /*
                     :      * Otherwise it's like textregexeq/texticregexeq.
                     :      * NOTE(review): the trailing "0, NULL" presumably means that no match
                     :      * locations are requested -- confirm against RE_compile_and_execute().
                     :      */
    1261          24 :     PG_RETURN_BOOL(RE_compile_and_execute(pattern,
    1262             :                                           VARDATA_ANY(str),
    1263             :                                           VARSIZE_ANY_EXHDR(str),
    1264             :                                           re_flags.cflags,
    1265             :                                           PG_GET_COLLATION(),
    1266             :                                           0, NULL));
    1267             : }
    1268             : 
    1269             : /*
                     :  * regexp_like_no_flags()
                     :  *      Pass-through entry point: hands fcinfo unchanged to regexp_like().
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * NOTE(review): presumably registered for the two-argument call form;
                     :  * regexp_like() fetches flags with PG_GETARG_TEXT_PP_IF_EXISTS(2), which
                     :  * presumably yields NULL when the argument is absent -- confirm.
                     :  */
    1270             : Datum
    1271           6 : regexp_like_no_flags(PG_FUNCTION_ARGS)
    1272             : {
    1273           6 :     return regexp_like(fcinfo);
    1274             : }
    1275             : 
    1276             : /*
    1277             :  * regexp_match()
    1278             :  *      Return the first substring(s) matching a pattern within a string.
                     :  *
                     :  * Arguments: 0 = string, 1 = pattern, 2 = flags (may be absent).  The
                     :  * 'g' flag is rejected, with a hint to use regexp_matches instead.
                     :  *
                     :  * Returns SQL NULL when nothing matches; otherwise the array produced
                     :  * by build_regexp_match_result() for the single match found.
    1279             :  */
    1280             : Datum
    1281        2532 : regexp_match(PG_FUNCTION_ARGS)
    1282             : {
    1283        2532 :     text       *orig_str = PG_GETARG_TEXT_PP(0);
    1284        2532 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1285        2532 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1286             :     pg_re_flags re_flags;
    1287             :     regexp_matches_ctx *matchctx;
    1288             : 
    1289             :     /* Determine options */
    1290        2532 :     parse_re_flags(&re_flags, flags);
    1291             :     /* User mustn't specify 'g' */
    1292        2532 :     if (re_flags.glob)
    1293           8 :         ereport(ERROR,
    1294             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1295             :         /* translator: %s is a SQL function name */
    1296             :                  errmsg("%s does not support the \"global\" option",
    1297             :                         "regexp_match()"),
    1298             :                  errhint("Use the regexp_matches function instead.")));
    1299             : 
    1300        2524 :     matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, 0,
    1301             :                                     PG_GET_COLLATION(), true, false, false);
    1302             : 
    1303        2524 :     if (matchctx->nmatches == 0)
    1304         170 :         PG_RETURN_NULL();
    1305             : 
    1306             :     Assert(matchctx->nmatches == 1);    /* glob is off, so the search stopped after one match */
    1307             : 
    1308             :     /* Create workspace that build_regexp_match_result needs */
    1309        2354 :     matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
    1310        2354 :     matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
    1311             : 
    1312        2354 :     PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx)));
    1313             : }
    1314             : 
    1315             : /*
                     :  * regexp_match_no_flags()
                     :  *      Pass-through entry point: hands fcinfo unchanged to regexp_match().
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * NOTE(review): presumably registered for the two-argument call form;
                     :  * catalog entry not visible -- confirm.
                     :  */
    1316             : Datum
    1317        2502 : regexp_match_no_flags(PG_FUNCTION_ARGS)
    1318             : {
    1319        2502 :     return regexp_match(fcinfo);
    1320             : }
    1321             : 
    1322             : /*
    1323             :  * regexp_matches()
    1324             :  *      Return a table of all matches of a pattern within a string.
                     :  *
                     :  * Set-returning function.  Arguments: 0 = string, 1 = pattern,
                     :  * 2 = flags (may be absent).  The first call runs the whole search via
                     :  * setup_regexp_matches() and saves the result in funcctx->user_fctx;
                     :  * every call then returns one array per stored match until next_match
                     :  * reaches nmatches.  Unlike regexp_match(), the 'g' flag is not rejected
                     :  * here.
    1325             :  */
    1326             : Datum
    1327         678 : regexp_matches(PG_FUNCTION_ARGS)
    1328             : {
    1329             :     FuncCallContext *funcctx;
    1330             :     regexp_matches_ctx *matchctx;
    1331             : 
    1332         678 :     if (SRF_IS_FIRSTCALL())
    1333             :     {
    1334         288 :         text       *pattern = PG_GETARG_TEXT_PP(1);
    1335         288 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1336             :         pg_re_flags re_flags;
    1337             :         MemoryContext oldcontext;
    1338             : 
    1339         288 :         funcctx = SRF_FIRSTCALL_INIT();
    1340         288 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1341             : 
    1342             :         /* Determine options */
    1343         288 :         parse_re_flags(&re_flags, flags);
    1344             : 
    1345             :         /* be sure to copy the input string into the multi-call ctx */
    1346         282 :         matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
    1347             :                                         &re_flags, 0,
    1348             :                                         PG_GET_COLLATION(),
    1349             :                                         true, false, false);
    1350             : 
    1351             :         /* Pre-create workspace that build_regexp_match_result needs */
    1352         270 :         matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
    1353         270 :         matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
    1354             : 
    1355         270 :         MemoryContextSwitchTo(oldcontext);
    1356         270 :         funcctx->user_fctx = (void *) matchctx;
    1357             :     }
    1358             : 
    1359         660 :     funcctx = SRF_PERCALL_SETUP();
    1360         660 :     matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
    1361             : 
    1362         660 :     if (matchctx->next_match < matchctx->nmatches)
    1363             :     {
    1364             :         ArrayType  *result_ary;
    1365             : 
    1366         390 :         result_ary = build_regexp_match_result(matchctx);
    1367         390 :         matchctx->next_match++;     /* the builder reads next_match but does not advance it */
    1368         390 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
    1369             :     }
    1370             : 
    1371         270 :     SRF_RETURN_DONE(funcctx);
    1372             : }
    1373             : 
    1374             : /*
                     :  * regexp_matches_no_flags()
                     :  *      Pass-through entry point: hands fcinfo unchanged to regexp_matches().
                     :  *
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * NOTE(review): presumably registered for the two-argument call form;
                     :  * catalog entry not visible -- confirm.
                     :  */
    1375             : Datum
    1376         354 : regexp_matches_no_flags(PG_FUNCTION_ARGS)
    1377             : {
    1378         354 :     return regexp_matches(fcinfo);
    1379             : }
    1380             : 
    1381             : /*
    1382             :  * setup_regexp_matches --- do the initial matching for regexp_match,
    1383             :  *      regexp_split, and related functions
    1384             :  *
    1385             :  * To avoid having to re-find the compiled pattern on each call, we do
    1386             :  * all the matching in one swoop.  The returned regexp_matches_ctx contains
    1387             :  * the locations of all the substrings matching the pattern.
    1388             :  *
    1389             :  * start_search: the character (not byte) offset in orig_str at which to
    1390             :  * begin the search.  Returned positions are relative to orig_str anyway.
    1391             :  * use_subpatterns: collect data about matches to parenthesized subexpressions.
    1392             :  * ignore_degenerate: ignore zero-length matches.
    1393             :  * fetching_unmatched: caller wants to fetch unmatched substrings.
    1394             :  *
    1395             :  * We don't currently assume that fetching_unmatched is exclusive of fetching
    1396             :  * the matched text too; if it's set, the conversion buffer is large enough to
    1397             :  * fetch any single matched or unmatched string, but not any larger
    1398             :  * substring.  (In practice, when splitting the matches are usually small
    1399             :  * anyway, and it didn't seem worth complicating the code further.)
                     :  *
                     :  * Other arguments:
                     :  * orig_str: the text to search.  Only the pointer is stored in the
                     :  * result, so the caller must keep it valid while the result is in use.
                     :  * pattern, collation: handed to RE_compile_and_cache().
                     :  * re_flags: cflags go to the compiler (REG_NOSUB is added when
                     :  * use_subpatterns is false); glob chooses between finding every match
                     :  * and stopping after the first one.
                     :  *
                     :  * Result layout: match_locs holds, for each stored match, npatterns
                     :  * (start, end) pairs of character offsets, followed by one final entry
                     :  * holding the string's length in characters.  When subexpression data
                     :  * is collected, only the subexpression locations are stored, not the
                     :  * location of the overall match; otherwise npatterns is 1 and the pair
                     :  * is the overall match.  The elems and nulls fields are not assigned
                     :  * here; callers that need them allocate them.
    1400             :  */
    1401             : static regexp_matches_ctx *
    1402      203190 : setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
    1403             :                      int start_search,
    1404             :                      Oid collation,
    1405             :                      bool use_subpatterns,
    1406             :                      bool ignore_degenerate,
    1407             :                      bool fetching_unmatched)
    1408             : {
    1409      203190 :     regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
    1410      203190 :     int         eml = pg_database_encoding_max_length();
    1411             :     int         orig_len;
    1412             :     pg_wchar   *wide_str;
    1413             :     int         wide_len;
    1414             :     int         cflags;
    1415             :     regex_t    *cpattern;
    1416             :     regmatch_t *pmatch;
    1417             :     int         pmatch_len;
    1418             :     int         array_len;
    1419             :     int         array_idx;
    1420             :     int         prev_match_end;
    1421             :     int         prev_valid_match_end;
    1422      203190 :     int         maxlen = 0;     /* largest fetch length in characters */
    1423             : 
    1424             :     /* save original string --- we'll extract result substrings from it */
    1425      203190 :     matchctx->orig_str = orig_str;
    1426             : 
    1427             :     /* convert string to pg_wchar form for matching */
    1428      203190 :     orig_len = VARSIZE_ANY_EXHDR(orig_str);
    1429      203190 :     wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
    1430      203190 :     wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
    1431             : 
    1432             :     /* set up the compiled pattern */
    1433      203190 :     cflags = re_flags->cflags;
    1434      203190 :     if (!use_subpatterns)
    1435      200282 :         cflags |= REG_NOSUB;
    1436      203190 :     cpattern = RE_compile_and_cache(pattern, cflags, collation);
    1437             : 
    1438             :     /*
                     :      * do we want to remember subpatterns?  If the pattern turns out to
                     :      * have none, fall back to recording just the overall match, with
                     :      * npatterns reported as 1.
                     :      */
    1439      203178 :     if (use_subpatterns && cpattern->re_nsub > 0)
    1440             :     {
    1441        2686 :         matchctx->npatterns = cpattern->re_nsub;
    1442        2686 :         pmatch_len = cpattern->re_nsub + 1;
    1443             :     }
    1444             :     else
    1445             :     {
    1446      200492 :         use_subpatterns = false;
    1447      200492 :         matchctx->npatterns = 1;
    1448      200492 :         pmatch_len = 1;
    1449             :     }
    1450             : 
    1451             :     /* temporary output space for RE package */
    1452      203178 :     pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
    1453             : 
    1454             :     /*
    1455             :      * the real output space (grown dynamically if needed)
    1456             :      *
    1457             :      * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
    1458             :      * than at 2^27
    1459             :      */
    1460      203178 :     array_len = re_flags->glob ? 255 : 31;
    1461      203178 :     matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
    1462      203178 :     array_idx = 0;
    1463             : 
    1464             :     /* search for the pattern, perhaps repeatedly */
    1465      203178 :     prev_match_end = 0;
    1466      203178 :     prev_valid_match_end = 0;
    1467     1005150 :     while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
    1468             :                             pmatch_len, pmatch))
    1469             :     {
    1470             :         /*
    1471             :          * If requested, ignore degenerate matches, which are zero-length
    1472             :          * matches occurring at the start or end of a string or just after a
    1473             :          * previous match.
                     :          *
                     :          * As coded, a match is kept under ignore_degenerate only if it
                     :          * starts before the end of the string and ends beyond the end of
                     :          * the previous match.
    1474             :          */
    1475      804562 :         if (!ignore_degenerate ||
    1476      801422 :             (pmatch[0].rm_so < wide_len &&
    1477      801380 :              pmatch[0].rm_eo > prev_match_end))
    1478             :         {
    1479             :             /*
                     :              * enlarge output space if needed; the "+ 1" keeps a slot free
                     :              * for the end-of-string entry stored after the loop
                     :              */
    1480      804382 :             while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
    1481             :             {
    1482           0 :                 array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
    1483           0 :                 if (array_len > MaxAllocSize / sizeof(int))
    1484           0 :                     ereport(ERROR,
    1485             :                             (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    1486             :                              errmsg("too many regular expression matches")));
    1487           0 :                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
    1488             :                                                         sizeof(int) * array_len);
    1489             :             }
    1490             : 
    1491             :             /* save this match's locations */
    1492      804382 :             if (use_subpatterns)
    1493             :             {
    1494             :                 int         i;
    1495             : 
    1496        7800 :                 for (i = 1; i <= matchctx->npatterns; i++)  /* pmatch[0], the overall match, is not stored */
    1497             :                 {
    1498        5266 :                     int         so = pmatch[i].rm_so;
    1499        5266 :                     int         eo = pmatch[i].rm_eo;
    1500             : 
    1501        5266 :                     matchctx->match_locs[array_idx++] = so;
    1502        5266 :                     matchctx->match_locs[array_idx++] = eo;
    1503        5266 :                     if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
    1504        3384 :                         maxlen = (eo - so);
    1505             :                 }
    1506             :             }
    1507             :             else
    1508             :             {
    1509      801848 :                 int         so = pmatch[0].rm_so;
    1510      801848 :                 int         eo = pmatch[0].rm_eo;
    1511             : 
    1512      801848 :                 matchctx->match_locs[array_idx++] = so;
    1513      801848 :                 matchctx->match_locs[array_idx++] = eo;
    1514      801848 :                 if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
    1515      200326 :                     maxlen = (eo - so);
    1516             :             }
    1517      804382 :             matchctx->nmatches++;
    1518             : 
    1519             :             /*
    1520             :              * check length of unmatched portion between end of previous valid
    1521             :              * (nondegenerate, or degenerate but not ignored) match and start
    1522             :              * of current one
    1523             :              */
    1524      804382 :             if (fetching_unmatched &&
    1525      801242 :                 pmatch[0].rm_so >= 0 &&
    1526      801242 :                 (pmatch[0].rm_so - prev_valid_match_end) > maxlen)
    1527      380142 :                 maxlen = (pmatch[0].rm_so - prev_valid_match_end);
    1528      804382 :             prev_valid_match_end = pmatch[0].rm_eo;
    1529             :         }
    1530      804562 :         prev_match_end = pmatch[0].rm_eo;   /* updated even when the match was ignored */
    1531             : 
    1532             :         /* if not glob, stop after one match */
    1533      804562 :         if (!re_flags->glob)
    1534        2524 :             break;
    1535             : 
    1536             :         /*
    1537             :          * Advance search position.  Normally we start the next search at the
    1538             :          * end of the previous match; but if the match was of zero length, we
    1539             :          * have to advance by one character, or we'd just find the same match
    1540             :          * again.
    1541             :          */
    1542      802038 :         start_search = prev_match_end;
    1543      802038 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    1544        1176 :             start_search++;
    1545      802038 :         if (start_search > wide_len)
    1546          66 :             break;
    1547             :     }
    1548             : 
    1549             :     /*
    1550             :      * check length of unmatched portion between end of last match and end of
    1551             :      * input string
    1552             :      */
    1553      203178 :     if (fetching_unmatched &&
    1554      200144 :         (wide_len - prev_valid_match_end) > maxlen)
    1555          28 :         maxlen = (wide_len - prev_valid_match_end);
    1556             : 
    1557             :     /*
    1558             :      * Keep a note of the end position of the string for the benefit of
    1559             :      * splitting code.
    1560             :      */
    1561      203178 :     matchctx->match_locs[array_idx] = wide_len;
    1562             : 
    1563      203178 :     if (eml > 1)
    1564             :     {
    1565      203178 :         int64       maxsiz = eml * (int64) maxlen;
    1566             :         int         conv_bufsiz;
    1567             : 
    1568             :         /*
    1569             :          * Make the conversion buffer large enough for any substring of
    1570             :          * interest.
    1571             :          *
    1572             :          * Worst case: assume we need the maximum size (maxlen*eml), but take
    1573             :          * advantage of the fact that the original string length in bytes is
    1574             :          * an upper bound on the byte length of any fetched substring (and we
    1575             :          * know that len+1 is safe to allocate because the varlena header is
    1576             :          * longer than 1 byte).
    1577             :          */
    1578      203178 :         if (maxsiz > orig_len)
    1579      200480 :             conv_bufsiz = orig_len + 1;
    1580             :         else
    1581        2698 :             conv_bufsiz = maxsiz + 1;   /* safe since maxsiz < 2^30 */
    1582             : 
    1583      203178 :         matchctx->conv_buf = palloc(conv_bufsiz);
    1584      203178 :         matchctx->conv_bufsiz = conv_bufsiz;
    1585      203178 :         matchctx->wide_str = wide_str;
    1586             :     }
    1587             :     else
    1588             :     {
    1589             :         /*
                     :          * No need to keep the wide string if we're in a single-byte charset.
                     :          * With conv_buf left NULL, build_regexp_match_result() extracts
                     :          * substrings from orig_str with text_substr instead.
                     :          */
    1590           0 :         pfree(wide_str);
    1591           0 :         matchctx->wide_str = NULL;
    1592           0 :         matchctx->conv_buf = NULL;
    1593           0 :         matchctx->conv_bufsiz = 0;
    1594             :     }
    1595             : 
    1596             :     /* Clean up temp storage */
    1597      203178 :     pfree(pmatch);
    1598             : 
    1599      203178 :     return matchctx;
    1600             : }
    1601             : 
    1602             : /*
    1603             :  * build_regexp_match_result - build output array for current match
                     :  *
                     :  * "Current" means matchctx->next_match, which is read but not advanced
                     :  * here.  matchctx->elems and matchctx->nulls are used as workspace and
                     :  * must already point to room for npatterns entries.
                     :  *
                     :  * The result is a one-dimensional text array with lower bound 1 and
                     :  * npatterns elements; an element is NULL when its stored start or end
                     :  * location is negative.
    1604             :  */
    1605             : static ArrayType *
    1606        2744 : build_regexp_match_result(regexp_matches_ctx *matchctx)
    1607             : {
    1608        2744 :     char       *buf = matchctx->conv_buf;
    1609        2744 :     Datum      *elems = matchctx->elems;
    1610        2744 :     bool       *nulls = matchctx->nulls;
    1611             :     int         dims[1];
    1612             :     int         lbs[1];
    1613             :     int         loc;
    1614             :     int         i;
    1615             : 
    1616             :     /*
                     :      * Extract matching substrings from the original string.  Each match
                     :      * occupies npatterns (start, end) pairs in match_locs, so this match's
                     :      * first pair is at next_match * npatterns * 2.
                     :      */
    1617        2744 :     loc = matchctx->next_match * matchctx->npatterns * 2;
    1618        7950 :     for (i = 0; i < matchctx->npatterns; i++)
    1619             :     {
    1620        5206 :         int         so = matchctx->match_locs[loc++];
    1621        5206 :         int         eo = matchctx->match_locs[loc++];
    1622             : 
    1623        5206 :         if (so < 0 || eo < 0)
    1624             :         {
    1625           6 :             elems[i] = (Datum) 0;
    1626           6 :             nulls[i] = true;
    1627             :         }
    1628        5200 :         else if (buf)   /* conversion buffer present: convert from the saved wide string */
    1629             :         {
    1630        5200 :             int         len = pg_wchar2mb_with_len(matchctx->wide_str + so,
    1631             :                                                    buf,
    1632             :                                                    eo - so);
    1633             : 
    1634             :             Assert(len < matchctx->conv_bufsiz);
    1635        5200 :             elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
    1636        5200 :             nulls[i] = false;
    1637             :         }
    1638             :         else    /* no conversion buffer: take the substring of orig_str directly */
    1639             :         {
    1640           0 :             elems[i] = DirectFunctionCall3(text_substr,
    1641             :                                            PointerGetDatum(matchctx->orig_str),
    1642             :                                            Int32GetDatum(so + 1),
    1643             :                                            Int32GetDatum(eo - so));
    1644           0 :             nulls[i] = false;
    1645             :         }
    1646             :     }
    1647             : 
    1648             :     /* And form an array */
    1649        2744 :     dims[0] = matchctx->npatterns;
    1650        2744 :     lbs[0] = 1;
    1651             :     /* XXX: this hardcodes assumptions about the text type */
    1652        2744 :     return construct_md_array(elems, nulls, 1, dims, lbs,
    1653             :                               TEXTOID, -1, false, TYPALIGN_INT);
    1654             : }
    1655             : 
    1656             : /*
    1657             :  * regexp_split_to_table()
    1658             :  *      Split the string at matches of the pattern, returning the
    1659             :  *      split-out substrings as a table.
                     :  *
                     :  * Set-returning function.  Arguments: 0 = string, 1 = pattern,
                     :  * 2 = flags (may be absent); the 'g' flag is rejected.  The first call
                     :  * does all the matching (no subpatterns, degenerate matches ignored,
                     :  * unmatched text fetchable) and saves the context in
                     :  * funcctx->user_fctx; each call then returns one value from
                     :  * build_regexp_split_result().  Because the loop test is
                     :  * next_match <= nmatches, nmatches + 1 rows are produced.
    1660             :  */
    1661             : Datum
    1662         622 : regexp_split_to_table(PG_FUNCTION_ARGS)
    1663             : {
    1664             :     FuncCallContext *funcctx;
    1665             :     regexp_matches_ctx *splitctx;
    1666             : 
    1667         622 :     if (SRF_IS_FIRSTCALL())
    1668             :     {
    1669          52 :         text       *pattern = PG_GETARG_TEXT_PP(1);
    1670          52 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1671             :         pg_re_flags re_flags;
    1672             :         MemoryContext oldcontext;
    1673             : 
    1674          52 :         funcctx = SRF_FIRSTCALL_INIT();
    1675          52 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1676             : 
    1677             :         /* Determine options */
    1678          52 :         parse_re_flags(&re_flags, flags);
    1679             :         /* User mustn't specify 'g' */
    1680          46 :         if (re_flags.glob)
    1681           6 :             ereport(ERROR,
    1682             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1683             :             /* translator: %s is a SQL function name */
    1684             :                      errmsg("%s does not support the \"global\" option",
    1685             :                             "regexp_split_to_table()")));
    1686             :         /* But we find all the matches anyway */
    1687          40 :         re_flags.glob = true;
    1688             : 
    1689             :         /* be sure to copy the input string into the multi-call ctx */
    1690          40 :         splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
    1691             :                                         &re_flags, 0,
    1692             :                                         PG_GET_COLLATION(),
    1693             :                                         false, true, true);
    1694             : 
    1695          40 :         MemoryContextSwitchTo(oldcontext);
    1696          40 :         funcctx->user_fctx = (void *) splitctx;
    1697             :     }
    1698             : 
    1699         610 :     funcctx = SRF_PERCALL_SETUP();
    1700         610 :     splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
    1701             : 
    1702         610 :     if (splitctx->next_match <= splitctx->nmatches)    /* "<=": one more piece than there are matches */
    1703             :     {
    1704         570 :         Datum       result = build_regexp_split_result(splitctx);
    1705             : 
    1706         570 :         splitctx->next_match++;
    1707         570 :         SRF_RETURN_NEXT(funcctx, result);
    1708             :     }
    1709             : 
    1710          40 :     SRF_RETURN_DONE(funcctx);
    1711             : }
    1712             : 
    1713             : /*
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * It only forwards fcinfo: regexp_split_to_table() reads its flags argument
                     :  * through PG_GETARG_TEXT_PP_IF_EXISTS(2), so the missing argument is dealt
                     :  * with there.
                     :  */
    1714             : Datum
    1715         552 : regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
    1716             : {
    1717         552 :     return regexp_split_to_table(fcinfo);
    1718             : }
    1719             : 
    1720             : /*
    1721             :  * regexp_split_to_array()
    1722             :  *      Split the string at matches of the pattern, returning the
    1723             :  *      split-out substrings as an array.
                     :  *
                     :  * Arguments by position: 0 = string to split, 1 = pattern, 2 = flags
                     :  * (read only if present).  The 'g' flag is rejected with an error, but is
                     :  * then forced on internally because splitting needs every match.  Each
                     :  * piece is accumulated as a TEXTOID element and the whole set is returned
                     :  * as one array built in CurrentMemoryContext.
    1724             :  */
    1725             : Datum
    1726      200116 : regexp_split_to_array(PG_FUNCTION_ARGS)
    1727             : {
    1728      200116 :     ArrayBuildState *astate = NULL;
    1729             :     pg_re_flags re_flags;
    1730             :     regexp_matches_ctx *splitctx;
    1731             : 
    1732             :     /* Determine options */
    1733      200116 :     parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2));
    1734             :     /* User mustn't specify 'g' */
    1735      200110 :     if (re_flags.glob)
    1736           6 :         ereport(ERROR,
    1737             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1738             :         /* translator: %s is a SQL function name */
    1739             :                  errmsg("%s does not support the \"global\" option",
    1740             :                         "regexp_split_to_array()")));
    1741             :     /* But we find all the matches anyway, so turn the flag on ourselves */
    1742      200104 :     re_flags.glob = true;
    1743             :     /* 4th argument 0: presumably the search start offset (regexp_substr passes start - 1 there) */
    1744      200104 :     splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
    1745      200104 :                                     PG_GETARG_TEXT_PP(1),
    1746             :                                     &re_flags, 0,
    1747             :                                     PG_GET_COLLATION(),
    1748             :                                     false, true, true);
    1749             :     /* "<=" on purpose: next_match == nmatches yields the text after the last match */
    1750     1200920 :     while (splitctx->next_match <= splitctx->nmatches)
    1751             :     {
    1752     1000816 :         astate = accumArrayResult(astate,
    1753             :                                   build_regexp_split_result(splitctx),
    1754             :                                   false,
    1755             :                                   TEXTOID,
    1756             :                                   CurrentMemoryContext);
    1757     1000816 :         splitctx->next_match++;
    1758             :     }
    1759             : 
    1760      200104 :     PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
    1761             : }
    1762             : 
    1763             : /*
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * It only forwards fcinfo: regexp_split_to_array() reads its flags argument
                     :  * through PG_GETARG_TEXT_PP_IF_EXISTS(2), so the missing argument is dealt
                     :  * with there.
                     :  */
    1764             : Datum
    1765      200074 : regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
    1766             : {
    1767      200074 :     return regexp_split_to_array(fcinfo);
    1768             : }
    1769             : 
    1770             : /*
    1771             :  * build_regexp_split_result - build output string for current match
    1772             :  *
    1773             :  * We return the string between the current match and the previous one,
    1774             :  * or the string after the last match when next_match == nmatches.
                     :  *
                     :  * match_locs[] is read as (start, end) pairs, one pair per match: the
                     :  * piece begins at the previous pair's end (or at 0 for the first piece)
                     :  * and stops at the current pair's start.  The result is a text Datum.
                     :  * next_match is not advanced here; the callers do that.
    1775             :  */
    1776             : static Datum
    1777     1001386 : build_regexp_split_result(regexp_matches_ctx *splitctx)
    1778             : {
    1779     1001386 :     char       *buf = splitctx->conv_buf;
    1780             :     int         startpos;
    1781             :     int         endpos;
    1782             :     /* piece starts where the previous match ended, or at 0 if there is none */
    1783     1001386 :     if (splitctx->next_match > 0)
    1784      801242 :         startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
    1785             :     else
    1786      200144 :         startpos = 0;
    1787     1001386 :     if (startpos < 0)
    1788           0 :         elog(ERROR, "invalid match ending position");
    1789             :     /* piece stops where the current match starts */
    1790     1001386 :     endpos = splitctx->match_locs[splitctx->next_match * 2];
    1791     1001386 :     if (endpos < startpos)
    1792           0 :         elog(ERROR, "invalid match starting position");
    1793             :     /* with a conversion buffer, convert the wide-char slice; otherwise slice orig_str */
    1794     1001386 :     if (buf)
    1795             :     {
    1796             :         int         len;
    1797             : 
    1798     1001386 :         len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
    1799             :                                    buf,
    1800             :                                    endpos - startpos);
    1801             :         Assert(len < splitctx->conv_bufsiz);
    1802     1001386 :         return PointerGetDatum(cstring_to_text_with_len(buf, len));
    1803             :     }
    1804             :     else
    1805             :     {
    1806           0 :         return DirectFunctionCall3(text_substr,
    1807             :                                    PointerGetDatum(splitctx->orig_str),
    1808             :                                    Int32GetDatum(startpos + 1),    /* +1: text_substr start is presumably 1-based -- confirm */
    1809             :                                    Int32GetDatum(endpos - startpos));
    1810             :     }
    1811             : }
    1812             : 
    1813             : /*
    1814             :  * regexp_substr()
    1815             :  *      Return the substring that matches a regular expression pattern
                     :  *
                     :  * Arguments by position: 0 = string, 1 = pattern, 2 = start (default 1,
                     :  * must be > 0), 3 = n, the match occurrence wanted (default 1, must be
                     :  * > 0), 4 = flags (read only if present; 'g' is rejected), 5 = subexpr
                     :  * (default 0, must be >= 0).  Out-of-range values raise
                     :  * ERRCODE_INVALID_PARAMETER_VALUE.
                     :  *
                     :  * Returns SQL NULL when there are fewer than n matches, when subexpr
                     :  * exceeds the number of subexpressions, or when the selected location is
                     :  * negative; otherwise the selected substring of the original string.
    1816             :  */
    1817             : Datum
    1818         108 : regexp_substr(PG_FUNCTION_ARGS)
    1819             : {
    1820         108 :     text       *str = PG_GETARG_TEXT_PP(0);
    1821         108 :     text       *pattern = PG_GETARG_TEXT_PP(1);
    1822         108 :     int         start = 1;
    1823         108 :     int         n = 1;
    1824         108 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(4);
    1825         108 :     int         subexpr = 0;
    1826             :     int         so,
    1827             :                 eo,
    1828             :                 pos;
    1829             :     pg_re_flags re_flags;
    1830             :     regexp_matches_ctx *matchctx;
    1831             : 
    1832             :     /* Collect optional parameters; each keeps its default when the argument is absent */
    1833         108 :     if (PG_NARGS() > 2)
    1834             :     {
    1835          90 :         start = PG_GETARG_INT32(2);
    1836          90 :         if (start <= 0)
    1837           6 :             ereport(ERROR,
    1838             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1839             :                      errmsg("invalid value for parameter \"%s\": %d",
    1840             :                             "start", start)));
    1841             :     }
    1842         102 :     if (PG_NARGS() > 3)
    1843             :     {
    1844          78 :         n = PG_GETARG_INT32(3);
    1845          78 :         if (n <= 0)
    1846           6 :             ereport(ERROR,
    1847             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1848             :                      errmsg("invalid value for parameter \"%s\": %d",
    1849             :                             "n", n)));
    1850             :     }
    1851          96 :     if (PG_NARGS() > 5)
    1852             :     {
    1853          48 :         subexpr = PG_GETARG_INT32(5);
    1854          48 :         if (subexpr < 0)
    1855           6 :             ereport(ERROR,
    1856             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1857             :                      errmsg("invalid value for parameter \"%s\": %d",
    1858             :                             "subexpr", subexpr)));
    1859             :     }
    1860             : 
    1861             :     /* Determine options */
    1862          90 :     parse_re_flags(&re_flags, flags);
    1863             :     /* User mustn't specify 'g' */
    1864          90 :     if (re_flags.glob)
    1865           6 :         ereport(ERROR,
    1866             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1867             :         /* translator: %s is a SQL function name */
    1868             :                  errmsg("%s does not support the \"global\" option",
    1869             :                         "regexp_substr()")));
    1870             :     /* But we find all the matches anyway, so the n'th one can be picked below */
    1871          84 :     re_flags.glob = true;
    1872             : 
    1873             :     /* Do the matching, passing start - 1 as the position argument */
    1874          84 :     matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
    1875             :                                     PG_GET_COLLATION(),
    1876             :                                     (subexpr > 0),   /* need submatches? */
    1877             :                                     false, false);
    1878             : 
    1879             :     /* When n exceeds matches return NULL (includes case of no matches) */
    1880          84 :     if (n > matchctx->nmatches)
    1881          12 :         PG_RETURN_NULL();
    1882             : 
    1883             :     /* When subexpr exceeds number of subexpressions return NULL */
    1884          72 :     if (subexpr > matchctx->npatterns)
    1885           6 :         PG_RETURN_NULL();
    1886             : 
    1887             :     /*
                     :      * Select the appropriate match position to return.  match_locs is
                     :      * indexed as npatterns (start, end) pairs per match: skip to the n'th
                     :      * match, then to the wanted subexpression, then double the pair index
                     :      * to get the array offset of its start.
                     :      */
    1888          66 :     pos = (n - 1) * matchctx->npatterns;
    1889          66 :     if (subexpr > 0)
    1890          30 :         pos += subexpr - 1;
    1891          66 :     pos *= 2;
    1892          66 :     so = matchctx->match_locs[pos];
    1893          66 :     eo = matchctx->match_locs[pos + 1];
    1894             : 
    1895          66 :     if (so < 0 || eo < 0)
    1896           6 :         PG_RETURN_NULL();       /* unidentifiable location */
    1897             :     /* extract eo - so units beginning at so + 1 from the original string */
    1898          60 :     PG_RETURN_DATUM(DirectFunctionCall3(text_substr,
    1899             :                                         PointerGetDatum(matchctx->orig_str),
    1900             :                                         Int32GetDatum(so + 1),
    1901             :                                         Int32GetDatum(eo - so)));
    1902             : }
    1903             : 
    1904             : /*
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * It only forwards fcinfo; regexp_substr() checks PG_NARGS() and keeps its
                     :  * defaults for arguments that were not supplied.
                     :  */
    1905             : Datum
    1906          18 : regexp_substr_no_start(PG_FUNCTION_ARGS)
    1907             : {
    1908          18 :     return regexp_substr(fcinfo);
    1909             : }
    1910             : 
    1911             : /*
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * It only forwards fcinfo; regexp_substr() checks PG_NARGS() and keeps its
                     :  * defaults for arguments that were not supplied.
                     :  */
    1912             : Datum
    1913           6 : regexp_substr_no_n(PG_FUNCTION_ARGS)
    1914             : {
    1915           6 :     return regexp_substr(fcinfo);
    1916             : }
    1917             : 
    1918             : /*
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * It only forwards fcinfo; regexp_substr() reads its flags argument through
                     :  * PG_GETARG_TEXT_PP_IF_EXISTS(4) and checks PG_NARGS() for the others.
                     :  */
    1919             : Datum
    1920          24 : regexp_substr_no_flags(PG_FUNCTION_ARGS)
    1921             : {
    1922          24 :     return regexp_substr(fcinfo);
    1923             : }
    1924             : 
    1925             : /*
                     :  * This is separate to keep the opr_sanity regression test from complaining.
                     :  * It only forwards fcinfo; regexp_substr() reads subexpr only when
                     :  * PG_NARGS() > 5 and otherwise leaves it at 0.
                     :  */
    1926             : Datum
    1927          12 : regexp_substr_no_subexpr(PG_FUNCTION_ARGS)
    1928             : {
    1929          12 :     return regexp_substr(fcinfo);
    1930             : }
    1931             : 
    1932             : /*
    1933             :  * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
    1934             :  *
    1935             :  * The result is NULL if there is no fixed prefix, else a palloc'd string.
    1936             :  * If it is an exact match, not just a prefix, *exact is returned as true.
                     :  *
                     :  * text_re is compiled through RE_compile_and_cache with REG_ADVANCED and
                     :  * REG_NOSUB, plus REG_ICASE when case_insensitive is set, under the given
                     :  * collation.  *exact is always written: it stays false unless
                     :  * pg_regprefix reports REG_EXACT.  Any pg_regprefix result other than
                     :  * REG_NOMATCH, REG_PREFIX or REG_EXACT is raised as
                     :  * ERRCODE_INVALID_REGULAR_EXPRESSION.
    1937             :  */
    1938             : char *
    1939       12026 : regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
    1940             :                     bool *exact)
    1941             : {
    1942             :     char       *result;
    1943             :     regex_t    *re;
    1944             :     int         cflags;
    1945             :     int         re_result;
    1946             :     pg_wchar   *str;
    1947             :     size_t      slen;
    1948             :     size_t      maxlen;
    1949             :     char        errMsg[100];
    1950             : 
    1951       12026 :     *exact = false;             /* default result */
    1952             : 
    1953             :     /* Compile RE */
    1954       12026 :     cflags = REG_ADVANCED;
    1955       12026 :     if (case_insensitive)
    1956          32 :         cflags |= REG_ICASE;
    1957             : 
    1958       12026 :     re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
    1959             : 
    1960             :     /* Examine it to see if there's a fixed prefix; on success str/slen receive it */
    1961       12026 :     re_result = pg_regprefix(re, &str, &slen);
    1962             : 
    1963       12026 :     switch (re_result)
    1964             :     {
    1965         648 :         case REG_NOMATCH:
    1966         648 :             return NULL;
    1967             : 
    1968        1100 :         case REG_PREFIX:
    1969             :             /* continue with wchar conversion */
    1970        1100 :             break;
    1971             : 
    1972       10278 :         case REG_EXACT:
    1973       10278 :             *exact = true;
    1974             :             /* continue with wchar conversion */
    1975       10278 :             break;
    1976             : 
    1977           0 :         default:
    1978             :             /* re failed??? */
    1979           0 :             CHECK_FOR_INTERRUPTS();
    1980           0 :             pg_regerror(re_result, re, errMsg, sizeof(errMsg));
    1981           0 :             ereport(ERROR,
    1982             :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    1983             :                      errmsg("regular expression failed: %s", errMsg)));
    1984             :             break;
    1985             :     }
    1986             : 
    1987             :     /*
                     :      * Convert pg_wchar result back to database encoding.  The buffer is
                     :      * sized for the worst case: the encoding's maximum length per
                     :      * character times slen, plus one extra byte (presumably for a
                     :      * terminator -- confirm against pg_wchar2mb_with_len).
                     :      */
    1988       11378 :     maxlen = pg_database_encoding_max_length() * slen + 1;
    1989       11378 :     result = (char *) palloc(maxlen);
    1990       11378 :     slen = pg_wchar2mb_with_len(str, result, slen);
    1991             :     Assert(slen < maxlen);
    1992             :     /* NOTE(review): released with free(), not pfree() -- presumably pg_regprefix malloc's str; confirm */
    1993       11378 :     free(str);
    1994             : 
    1995       11378 :     return result;
    1996             : }

Generated by: LCOV version 1.14