LCOV - code coverage report
Current view: top level - src/backend/utils/adt - regexp.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 398 475 83.8 %
Date: 2019-09-22 07:07:17 Functions: 30 32 93.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regexp.c
       4             :  *    Postgres' interface to the regular expression package.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/adt/regexp.c
      12             :  *
      13             :  *      Alistair Crooks added the code for the regex caching
      14             :  *      agc - cached the regular expressions used - there's a good chance
      15             :  *      that we'll get a hit, so this saves a compile step for every
      16             :  *      attempted match. I haven't actually measured the speed improvement,
      17             :  *      but it `looks' a lot quicker visually when watching regression
      18             :  *      test output.
      19             :  *
      20             :  *      agc - incorporated Keith Bostic's Berkeley regex code into
      21             :  *      the tree for all ports. To distinguish this regex code from any that
      22             :  *      is existent on a platform, I've prepended the string "pg_" to
      23             :  *      the functions regcomp, regerror, regexec and regfree.
      24             :  *      Fixed a bug that was originally a typo by me, where `i' was used
      25             :  *      instead of `oldest' when compiling regular expressions - benign
      26             :  *      results mostly, although occasionally it bit you...
      27             :  *
      28             :  *-------------------------------------------------------------------------
      29             :  */
      30             : #include "postgres.h"
      31             : 
      32             : #include "catalog/pg_type.h"
      33             : #include "funcapi.h"
      34             : #include "miscadmin.h"
      35             : #include "regex/regex.h"
      36             : #include "utils/array.h"
      37             : #include "utils/builtins.h"
      38             : #include "utils/memutils.h"
      39             : #include "utils/varlena.h"
      40             : 
      41             : #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
      42             :     (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
      43             : 
      44             : 
      45             : /* all the options of interest for regex functions */
      46             : typedef struct pg_re_flags
      47             : {
      48             :     int         cflags;         /* compile flags for Spencer's regex code */
      49             :     bool        glob;           /* do it globally (for each occurrence) */
      50             : } pg_re_flags;
      51             : 
      52             : /* cross-call state for regexp_match and regexp_split functions */
      53             : typedef struct regexp_matches_ctx
      54             : {
      55             :     text       *orig_str;       /* data string in original TEXT form */
      56             :     int         nmatches;       /* number of places where pattern matched */
      57             :     int         npatterns;      /* number of capturing subpatterns */
      58             :     /* We store start char index and end+1 char index for each match */
      59             :     /* so the number of entries in match_locs is nmatches * npatterns * 2 */
      60             :     int        *match_locs;     /* 0-based character indexes */
      61             :     int         next_match;     /* 0-based index of next match to process */
      62             :     /* workspace for build_regexp_match_result() */
      63             :     Datum      *elems;          /* has npatterns elements */
      64             :     bool       *nulls;          /* has npatterns elements */
      65             :     pg_wchar   *wide_str;       /* wide-char version of original string */
      66             :     char       *conv_buf;       /* conversion buffer */
      67             :     int         conv_bufsiz;    /* size thereof */
      68             : } regexp_matches_ctx;
      69             : 
      70             : /*
      71             :  * We cache precompiled regular expressions using a "self organizing list"
      72             :  * structure, in which recently-used items tend to be near the front.
      73             :  * Whenever we use an entry, it's moved up to the front of the list.
      74             :  * Over time, an item's average position corresponds to its frequency of use.
      75             :  *
      76             :  * When we first create an entry, it's inserted at the front of
      77             :  * the array, dropping the entry at the end of the array if necessary to
      78             :  * make room.  (This might seem to be weighting the new entry too heavily,
      79             :  * but if we insert new entries further back, we'll be unable to adjust to
      80             :  * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
      81             :  * never-before-seen items used circularly.  We ought to be able to handle
      82             :  * that case, so we have to insert at the front.)
      83             :  *
      84             :  * Knuth mentions a variant strategy in which a used item is moved up just
      85             :  * one place in the list.  Although he says this uses fewer comparisons on
      86             :  * average, it seems not to adapt very well to the situation where you have
      87             :  * both some reusable patterns and a steady stream of non-reusable patterns.
      88             :  * A reusable pattern that isn't used at least as often as non-reusable
      89             :  * patterns are seen will "fail to keep up" and will drop off the end of the
      90             :  * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
      91             :  * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
      92             :  */
      93             : 
      94             : /* this is the maximum number of cached regular expressions */
      95             : #ifndef MAX_CACHED_RES
      96             : #define MAX_CACHED_RES  32
      97             : #endif
      98             : 
      99             : /* this structure describes one cached regular expression */
     100             : typedef struct cached_re_str
     101             : {
     102             :     char       *cre_pat;        /* original RE (not null terminated!) */
     103             :     int         cre_pat_len;    /* length of original RE, in bytes */
     104             :     int         cre_flags;      /* compile flags: extended,icase etc */
     105             :     Oid         cre_collation;  /* collation to use */
     106             :     regex_t     cre_re;         /* the compiled regular expression */
     107             : } cached_re_str;
     108             : 
     109             : static int  num_res = 0;        /* # of cached re's */
     110             : static cached_re_str re_array[MAX_CACHED_RES];  /* cached re's */
     111             : 
     112             : 
     113             : /* Local functions */
     114             : static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
     115             :                                                 pg_re_flags *flags,
     116             :                                                 Oid collation,
     117             :                                                 bool use_subpatterns,
     118             :                                                 bool ignore_degenerate,
     119             :                                                 bool fetching_unmatched);
     120             : static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
     121             : static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
     122             : 
     123             : 
     124             : /*
     125             :  * RE_compile_and_cache - compile a RE, caching if possible
     126             :  *
     127             :  * Returns regex_t *
     128             :  *
     129             :  *  text_re --- the pattern, expressed as a TEXT object
     130             :  *  cflags --- compile options for the pattern
     131             :  *  collation --- collation to use for LC_CTYPE-dependent behavior
     132             :  *
     133             :  * Pattern is given in the database encoding.  We internally convert to
     134             :  * an array of pg_wchar, which is what Spencer's regex package wants.
     135             :  */
     136             : regex_t *
     137      612344 : RE_compile_and_cache(text *text_re, int cflags, Oid collation)
     138             : {
     139      612344 :     int         text_re_len = VARSIZE_ANY_EXHDR(text_re);
     140      612344 :     char       *text_re_val = VARDATA_ANY(text_re);
     141             :     pg_wchar   *pattern;
     142             :     int         pattern_len;
     143             :     int         i;
     144             :     int         regcomp_result;
     145             :     cached_re_str re_temp;
     146             :     char        errMsg[100];
     147             : 
     148             :     /*
     149             :      * Look for a match among previously compiled REs.  Since the data
     150             :      * structure is self-organizing with most-used entries at the front, our
     151             :      * search strategy can just be to scan from the front.
     152             :      */
     153     1056864 :     for (i = 0; i < num_res; i++)
     154             :     {
     155     1666372 :         if (re_array[i].cre_pat_len == text_re_len &&
     156     1223826 :             re_array[i].cre_flags == cflags &&
     157     1223372 :             re_array[i].cre_collation == collation &&
     158      611610 :             memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
     159             :         {
     160             :             /*
     161             :              * Found a match; move it to front if not there already.
     162             :              */
     163      609788 :             if (i > 0)
     164             :             {
     165      426882 :                 re_temp = re_array[i];
     166      426882 :                 memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
     167      426882 :                 re_array[0] = re_temp;
     168             :             }
     169             : 
     170      609788 :             return &re_array[0].cre_re;
     171             :         }
     172             :     }
     173             : 
     174             :     /*
     175             :      * Couldn't find it, so try to compile the new RE.  To avoid leaking
     176             :      * resources on failure, we build into the re_temp local.
     177             :      */
     178             : 
     179             :     /* Convert pattern string to wide characters */
     180        2556 :     pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
     181        2556 :     pattern_len = pg_mb2wchar_with_len(text_re_val,
     182             :                                        pattern,
     183             :                                        text_re_len);
     184             : 
     185        2556 :     regcomp_result = pg_regcomp(&re_temp.cre_re,
     186             :                                 pattern,
     187             :                                 pattern_len,
     188             :                                 cflags,
     189             :                                 collation);
     190             : 
     191        2556 :     pfree(pattern);
     192             : 
     193        2556 :     if (regcomp_result != REG_OKAY)
     194             :     {
     195             :         /* re didn't compile (no need for pg_regfree, if so) */
     196             : 
     197             :         /*
     198             :          * Here and in other places in this file, do CHECK_FOR_INTERRUPTS
     199             :          * before reporting a regex error.  This is so that if the regex
     200             :          * library aborts and returns REG_CANCEL, we don't print an error
     201             :          * message that implies the regex was invalid.
     202             :          */
     203          28 :         CHECK_FOR_INTERRUPTS();
     204             : 
     205          28 :         pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
     206          28 :         ereport(ERROR,
     207             :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     208             :                  errmsg("invalid regular expression: %s", errMsg)));
     209             :     }
     210             : 
     211             :     /*
     212             :      * We use malloc/free for the cre_pat field because the storage has to
     213             :      * persist across transactions, and because we want to get control back on
     214             :      * out-of-memory.  The Max() is because some malloc implementations return
     215             :      * NULL for malloc(0).
     216             :      */
     217        2528 :     re_temp.cre_pat = malloc(Max(text_re_len, 1));
     218        2528 :     if (re_temp.cre_pat == NULL)
     219             :     {
     220           0 :         pg_regfree(&re_temp.cre_re);
     221           0 :         ereport(ERROR,
     222             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     223             :                  errmsg("out of memory")));
     224             :     }
     225        2528 :     memcpy(re_temp.cre_pat, text_re_val, text_re_len);
     226        2528 :     re_temp.cre_pat_len = text_re_len;
     227        2528 :     re_temp.cre_flags = cflags;
     228        2528 :     re_temp.cre_collation = collation;
     229             : 
     230             :     /*
     231             :      * Okay, we have a valid new item in re_temp; insert it into the storage
     232             :      * array.  Discard last entry if needed.
     233             :      */
     234        2528 :     if (num_res >= MAX_CACHED_RES)
     235             :     {
     236         160 :         --num_res;
     237             :         Assert(num_res < MAX_CACHED_RES);
     238         160 :         pg_regfree(&re_array[num_res].cre_re);
     239         160 :         free(re_array[num_res].cre_pat);
     240             :     }
     241             : 
     242        2528 :     if (num_res > 0)
     243        1722 :         memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
     244             : 
     245        2528 :     re_array[0] = re_temp;
     246        2528 :     num_res++;
     247             : 
     248        2528 :     return &re_array[0].cre_re;
     249             : }
     250             : 
     251             : /*
     252             :  * RE_wchar_execute - execute a RE on pg_wchar data
     253             :  *
     254             :  * Returns true on match, false on no match
     255             :  *
     256             :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     257             :  *  data --- the data to match against (need not be null-terminated)
     258             :  *  data_len --- the length of the data string
     259             :  *  start_search -- the offset in the data to start searching
     260             :  *  nmatch, pmatch  --- optional return area for match details
     261             :  *
     262             :  * Data is given as array of pg_wchar which is what Spencer's regex package
     263             :  * wants.
     264             :  */
     265             : static bool
     266     1407136 : RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
     267             :                  int start_search, int nmatch, regmatch_t *pmatch)
     268             : {
     269             :     int         regexec_result;
     270             :     char        errMsg[100];
     271             : 
     272             :     /* Perform RE match and return result */
     273     1407136 :     regexec_result = pg_regexec(re,
     274             :                                 data,
     275             :                                 data_len,
     276             :                                 start_search,
     277             :                                 NULL,   /* no details */
     278             :                                 nmatch,
     279             :                                 pmatch,
     280             :                                 0);
     281             : 
     282     1407136 :     if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
     283             :     {
     284             :         /* re failed??? */
     285           0 :         CHECK_FOR_INTERRUPTS();
     286           0 :         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
     287           0 :         ereport(ERROR,
     288             :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     289             :                  errmsg("regular expression failed: %s", errMsg)));
     290             :     }
     291             : 
     292     1407136 :     return (regexec_result == REG_OKAY);
     293             : }
     294             : 
     295             : /*
     296             :  * RE_execute - execute a RE
     297             :  *
     298             :  * Returns true on match, false on no match
     299             :  *
     300             :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     301             :  *  dat --- the data to match against (need not be null-terminated)
     302             :  *  dat_len --- the length of the data string
     303             :  *  nmatch, pmatch  --- optional return area for match details
     304             :  *
     305             :  * Data is given in the database encoding.  We internally
     306             :  * convert to array of pg_wchar which is what Spencer's regex package wants.
     307             :  */
     308             : static bool
     309      405326 : RE_execute(regex_t *re, char *dat, int dat_len,
     310             :            int nmatch, regmatch_t *pmatch)
     311             : {
     312             :     pg_wchar   *data;
     313             :     int         data_len;
     314             :     bool        match;
     315             : 
     316             :     /* Convert data string to wide characters */
     317      405326 :     data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
     318      405326 :     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
     319             : 
     320             :     /* Perform RE match and return result */
     321      405326 :     match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
     322             : 
     323      405326 :     pfree(data);
     324      405326 :     return match;
     325             : }
     326             : 
     327             : /*
     328             :  * RE_compile_and_execute - compile and execute a RE
     329             :  *
     330             :  * Returns true on match, false on no match
     331             :  *
     332             :  *  text_re --- the pattern, expressed as a TEXT object
     333             :  *  dat --- the data to match against (need not be null-terminated)
     334             :  *  dat_len --- the length of the data string
     335             :  *  cflags --- compile options for the pattern
     336             :  *  collation --- collation to use for LC_CTYPE-dependent behavior
     337             :  *  nmatch, pmatch  --- optional return area for match details
     338             :  *
     339             :  * Both pattern and data are given in the database encoding.  We internally
     340             :  * convert to array of pg_wchar which is what Spencer's regex package wants.
     341             :  */
     342             : bool
     343      404962 : RE_compile_and_execute(text *text_re, char *dat, int dat_len,
     344             :                        int cflags, Oid collation,
     345             :                        int nmatch, regmatch_t *pmatch)
     346             : {
     347             :     regex_t    *re;
     348             : 
     349             :     /* Compile RE */
     350      404962 :     re = RE_compile_and_cache(text_re, cflags, collation);
     351             : 
     352      404946 :     return RE_execute(re, dat, dat_len, nmatch, pmatch);
     353             : }
     354             : 
     355             : 
     356             : /*
     357             :  * parse_re_flags - parse the options argument of regexp_match and friends
     358             :  *
     359             :  *  flags --- output argument, filled with desired options
     360             :  *  opts --- TEXT object, or NULL for defaults
     361             :  *
     362             :  * This accepts all the options allowed by any of the callers; callers that
     363             :  * don't want some have to reject them after the fact.
     364             :  */
     365             : static void
     366      201478 : parse_re_flags(pg_re_flags *flags, text *opts)
     367             : {
     368             :     /* regex flavor is always folded into the compile flags */
     369      201478 :     flags->cflags = REG_ADVANCED;
     370      201478 :     flags->glob = false;
     371             : 
     372      201478 :     if (opts)
     373             :     {
     374         854 :         char       *opt_p = VARDATA_ANY(opts);
     375         854 :         int         opt_len = VARSIZE_ANY_EXHDR(opts);
     376             :         int         i;
     377             : 
     378        2374 :         for (i = 0; i < opt_len; i++)
     379             :         {
     380        1536 :             switch (opt_p[i])
     381             :             {
     382             :                 case 'g':
     383         746 :                     flags->glob = true;
     384         746 :                     break;
     385             :                 case 'b':       /* BREs (but why???) */
     386           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
     387           0 :                     break;
     388             :                 case 'c':       /* case sensitive */
     389          10 :                     flags->cflags &= ~REG_ICASE;
     390          10 :                     break;
     391             :                 case 'e':       /* plain EREs */
     392           0 :                     flags->cflags |= REG_EXTENDED;
     393           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
     394           0 :                     break;
     395             :                 case 'i':       /* case insensitive */
     396         116 :                     flags->cflags |= REG_ICASE;
     397         116 :                     break;
     398             :                 case 'm':       /* Perloid synonym for n */
     399             :                 case 'n':       /* \n affects ^ $ . [^ */
     400         642 :                     flags->cflags |= REG_NEWLINE;
     401         642 :                     break;
     402             :                 case 'p':       /* ~Perl, \n affects . [^ */
     403           0 :                     flags->cflags |= REG_NLSTOP;
     404           0 :                     flags->cflags &= ~REG_NLANCH;
     405           0 :                     break;
     406             :                 case 'q':       /* literal string */
     407           0 :                     flags->cflags |= REG_QUOTE;
     408           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
     409           0 :                     break;
     410             :                 case 's':       /* single line, \n ordinary */
     411           6 :                     flags->cflags &= ~REG_NEWLINE;
     412           6 :                     break;
     413             :                 case 't':       /* tight syntax */
     414           0 :                     flags->cflags &= ~REG_EXPANDED;
     415           0 :                     break;
     416             :                 case 'w':       /* weird, \n affects ^ $ only */
     417           0 :                     flags->cflags &= ~REG_NLSTOP;
     418           0 :                     flags->cflags |= REG_NLANCH;
     419           0 :                     break;
     420             :                 case 'x':       /* expanded syntax */
     421           0 :                     flags->cflags |= REG_EXPANDED;
     422           0 :                     break;
     423             :                 default:
     424          16 :                     ereport(ERROR,
     425             :                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     426             :                              errmsg("invalid regular expression option: \"%c\"",
     427             :                                     opt_p[i])));
     428             :                     break;
     429             :             }
     430             :         }
     431             :     }
     432      201462 : }
     433             : 
     434             : 
     435             : /*
     436             :  *  interface routines called by the function manager
     437             :  */
     438             : 
     439             : Datum
     440       71618 : nameregexeq(PG_FUNCTION_ARGS)
     441             : {
     442       71618 :     Name        n = PG_GETARG_NAME(0);
     443       71618 :     text       *p = PG_GETARG_TEXT_PP(1);
     444             : 
     445       71618 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     446             :                                           NameStr(*n),
     447             :                                           strlen(NameStr(*n)),
     448             :                                           REG_ADVANCED,
     449             :                                           PG_GET_COLLATION(),
     450             :                                           0, NULL));
     451             : }
     452             : 
     453             : Datum
     454        7848 : nameregexne(PG_FUNCTION_ARGS)
     455             : {
     456        7848 :     Name        n = PG_GETARG_NAME(0);
     457        7848 :     text       *p = PG_GETARG_TEXT_PP(1);
     458             : 
     459        7848 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     460             :                                            NameStr(*n),
     461             :                                            strlen(NameStr(*n)),
     462             :                                            REG_ADVANCED,
     463             :                                            PG_GET_COLLATION(),
     464             :                                            0, NULL));
     465             : }
     466             : 
     467             : Datum
     468      302476 : textregexeq(PG_FUNCTION_ARGS)
     469             : {
     470      302476 :     text       *s = PG_GETARG_TEXT_PP(0);
     471      302476 :     text       *p = PG_GETARG_TEXT_PP(1);
     472             : 
     473      302476 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     474             :                                           VARDATA_ANY(s),
     475             :                                           VARSIZE_ANY_EXHDR(s),
     476             :                                           REG_ADVANCED,
     477             :                                           PG_GET_COLLATION(),
     478             :                                           0, NULL));
     479             : }
     480             : 
     481             : Datum
     482       22740 : textregexne(PG_FUNCTION_ARGS)
     483             : {
     484       22740 :     text       *s = PG_GETARG_TEXT_PP(0);
     485       22740 :     text       *p = PG_GETARG_TEXT_PP(1);
     486             : 
     487       22740 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     488             :                                            VARDATA_ANY(s),
     489             :                                            VARSIZE_ANY_EXHDR(s),
     490             :                                            REG_ADVANCED,
     491             :                                            PG_GET_COLLATION(),
     492             :                                            0, NULL));
     493             : }
     494             : 
     495             : 
     496             : /*
     497             :  *  routines that use the regexp stuff, but ignore the case.
     498             :  *  for this, we use the REG_ICASE flag to pg_regcomp
     499             :  */
     500             : 
     501             : 
     502             : Datum
     503           0 : nameicregexeq(PG_FUNCTION_ARGS)
     504             : {
     505           0 :     Name        n = PG_GETARG_NAME(0);
     506           0 :     text       *p = PG_GETARG_TEXT_PP(1);
     507             : 
     508           0 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     509             :                                           NameStr(*n),
     510             :                                           strlen(NameStr(*n)),
     511             :                                           REG_ADVANCED | REG_ICASE,
     512             :                                           PG_GET_COLLATION(),
     513             :                                           0, NULL));
     514             : }
     515             : 
     516             : Datum
     517           4 : nameicregexne(PG_FUNCTION_ARGS)
     518             : {
     519           4 :     Name        n = PG_GETARG_NAME(0);
     520           4 :     text       *p = PG_GETARG_TEXT_PP(1);
     521             : 
     522           4 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     523             :                                            NameStr(*n),
     524             :                                            strlen(NameStr(*n)),
     525             :                                            REG_ADVANCED | REG_ICASE,
     526             :                                            PG_GET_COLLATION(),
     527             :                                            0, NULL));
     528             : }
     529             : 
     530             : Datum
     531          76 : texticregexeq(PG_FUNCTION_ARGS)    /* true when the text (arg 0) matches the regex (arg 1), ignoring case */
     532             : {
     533          76 :     text       *s = PG_GETARG_TEXT_PP(0);
     534          76 :     text       *p = PG_GETARG_TEXT_PP(1);
     535             :     /*
                     :      * The subject is handed over as a data pointer plus a length that
                     :      * excludes the varlena header.  REG_ICASE selects case-insensitive
                     :      * matching.  No match locations are requested (0, NULL); only the
                     :      * boolean outcome is returned.
                     :      */
     536          76 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     537             :                                           VARDATA_ANY(s),
     538             :                                           VARSIZE_ANY_EXHDR(s),
     539             :                                           REG_ADVANCED | REG_ICASE,
     540             :                                           PG_GET_COLLATION(),
     541             :                                           0, NULL));
     542             : }
     543             : 
     544             : Datum
     545          16 : texticregexne(PG_FUNCTION_ARGS)    /* true when the text (arg 0) does NOT match the regex (arg 1), ignoring case */
     546             : {
     547          16 :     text       *s = PG_GETARG_TEXT_PP(0);
     548          16 :     text       *p = PG_GETARG_TEXT_PP(1);
     549             :     /*
                     :      * Same call as texticregexeq() makes, with the result inverted by the
                     :      * leading '!'.  The subject is passed as a data pointer plus a length
                     :      * that excludes the varlena header; no match locations are requested.
                     :      */
     550          16 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,
     551             :                                            VARDATA_ANY(s),
     552             :                                            VARSIZE_ANY_EXHDR(s),
     553             :                                            REG_ADVANCED | REG_ICASE,
     554             :                                            PG_GET_COLLATION(),
     555             :                                            0, NULL));
     556             : }
     557             : 
     558             : 
     559             : /*
     560             :  * textregexsubstr()
     561             :  *      Return a substring matched by a regular expression.
                     :  *
                     :  * Argument 0 is the string to search and argument 1 is the pattern, which
                     :  * is compiled with REG_ADVANCED only (no case folding).  The result is the
                     :  * text matched by the first parenthesized subexpression if the pattern
                     :  * has one, else the text matched by the whole pattern.  SQL NULL is
                     :  * returned when the pattern does not match, or when the chosen match
                     :  * entry carries a negative offset (see the comment in the body).
     562             :  */
     563             : Datum
     564         380 : textregexsubstr(PG_FUNCTION_ARGS)
     565             : {
     566         380 :     text       *s = PG_GETARG_TEXT_PP(0);
     567         380 :     text       *p = PG_GETARG_TEXT_PP(1);
     568             :     regex_t    *re;
     569             :     regmatch_t  pmatch[2];  /* [0] = whole match, [1] = first subexpression */
     570             :     int         so,         /* start offset of the piece to return */
     571             :                 eo;         /* end offset of the piece to return */
     572             : 
     573             :     /* Compile RE */
     574         380 :     re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
     575             : 
     576             :     /*
     577             :      * We pass two regmatch_t structs to get info about the overall match and
     578             :      * the match for the first parenthesized subexpression (if any). If there
     579             :      * is a parenthesized subexpression, we return what it matched; else
     580             :      * return what the whole regexp matched.
     581             :      */
     582        1520 :     if (!RE_execute(re,
     583        1520 :                     VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
     584             :                     2, pmatch))
     585           4 :         PG_RETURN_NULL();       /* definitely no match */
     586             : 
     587         376 :     if (re->re_nsub > 0)
     588             :     {
     589             :         /* has parenthesized subexpressions, use the first one */
     590          46 :         so = pmatch[1].rm_so;
     591          46 :         eo = pmatch[1].rm_eo;
     592             :     }
     593             :     else
     594             :     {
     595             :         /* no parenthesized subexpression, use whole match */
     596         330 :         so = pmatch[0].rm_so;
     597         330 :         eo = pmatch[0].rm_eo;
     598             :     }
     599             : 
     600             :     /*
     601             :      * It is possible to have a match to the whole pattern but no match for a
     602             :      * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
     603             :      * there is no subexpression match.  So this extra test for match failure
     604             :      * is not redundant.
     605             :      */
     606         376 :     if (so < 0 || eo < 0)
     607           0 :         PG_RETURN_NULL();
     608             :     /*
                     :      * Hand the extraction to text_substr(): the start is passed as so + 1
                     :      * (presumably because so is zero-based while text_substr counts from
                     :      * one -- confirm against text_substr), and the third argument is the
                     :      * length eo - so rather than an end position.
                     :      */
     609         376 :     return DirectFunctionCall3(text_substr,
     610             :                                PointerGetDatum(s),
     611             :                                Int32GetDatum(so + 1),
     612             :                                Int32GetDatum(eo - so));
     613             : }
     614             : 
     615             : /*
     616             :  * textregexreplace_noopt()
     617             :  *      Return a string matched by a regular expression, with replacement.
     618             :  *
     619             :  * This version doesn't have an option argument: we default to case
     620             :  * sensitive match, replace the first instance only.
                     :  *
                     :  * Argument 0 is the source string, argument 1 the pattern and argument 2
                     :  * the replacement text.  The pattern is compiled with REG_ADVANCED alone,
                     :  * and "false" is passed as the last argument of replace_text_regexp().
     621             :  */
     622             : Datum
     623         290 : textregexreplace_noopt(PG_FUNCTION_ARGS)
     624             : {
     625         290 :     text       *s = PG_GETARG_TEXT_PP(0);
     626         290 :     text       *p = PG_GETARG_TEXT_PP(1);
     627         290 :     text       *r = PG_GETARG_TEXT_PP(2);
     628             :     regex_t    *re;
     629             : 
     630         290 :     re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
     631             :     /* the compiled regex is handed over as an untyped pointer */
     632         290 :     PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
     633             : }
     634             : 
     635             : /*
     636             :  * textregexreplace()
     637             :  *      Return a string matched by a regular expression, with replacement.
                     :  *
                     :  * Argument 0 is the source string, argument 1 the pattern, argument 2 the
                     :  * replacement text and argument 3 an option string.  The options are
                     :  * decoded by parse_re_flags(); the resulting cflags are used to compile
                     :  * the pattern, and the "glob" setting is forwarded to
                     :  * replace_text_regexp() as its last argument.
     638             :  */
     639             : Datum
     640         712 : textregexreplace(PG_FUNCTION_ARGS)
     641             : {
     642         712 :     text       *s = PG_GETARG_TEXT_PP(0);
     643         712 :     text       *p = PG_GETARG_TEXT_PP(1);
     644         712 :     text       *r = PG_GETARG_TEXT_PP(2);
     645         712 :     text       *opt = PG_GETARG_TEXT_PP(3);
     646             :     regex_t    *re;
     647             :     pg_re_flags flags;
     648             :     /* NOTE(review): parse_re_flags presumably errors out on bad options -- confirm */
     649         712 :     parse_re_flags(&flags, opt);
     650             : 
     651         708 :     re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
     652             : 
     653         708 :     PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
     654             : }
     655             : 
     656             : /*
     657             :  * similar_to_escape(), similar_escape()
     658             :  *
     659             :  * Convert a SQL "SIMILAR TO" regexp pattern to POSIX style, so it can be
     660             :  * used by our regexp engine.
     661             :  *
     662             :  * similar_escape_internal() is the common workhorse for three SQL-exposed
     663             :  * functions.  esc_text can be passed as NULL to select the default escape
     664             :  * (which is '\'), or as an empty string to select no escape character.
                     :  *
                     :  * The result is a freshly palloc'd text value holding the converted
                     :  * pattern.  An error is raised if the escape string is longer than one
                     :  * character, or if the pattern contains more than two escape-double-quote
                     :  * separators.
     665             :  */
     666             : static text *
     667          74 : similar_escape_internal(text *pat_text, text *esc_text)
     668             : {
     669             :     text       *result;
     670             :     char       *p,          /* read cursor in the input pattern */
     671             :                *e,          /* escape character bytes, or NULL for "no escape" */
     672             :                *r;          /* write cursor in the output buffer */
     673             :     int         plen,       /* input bytes still to be processed */
     674             :                 elen;       /* length of the escape string in bytes */
     675          74 :     bool        afterescape = false;    /* previous character was the escape */
     676          74 :     bool        incharclass = false;    /* currently between '[' and ']' */
     677          74 :     int         nquotes = 0;    /* escape-double-quote separators seen so far */
     678             : 
     679          74 :     p = VARDATA_ANY(pat_text);
     680          74 :     plen = VARSIZE_ANY_EXHDR(pat_text);
     681          74 :     if (esc_text == NULL)
     682             :     {
     683             :         /* No ESCAPE clause provided; default to backslash as escape */
     684          16 :         e = "\\";
     685          16 :         elen = 1;
     686             :     }
     687             :     else
     688             :     {
     689          58 :         e = VARDATA_ANY(esc_text);
     690          58 :         elen = VARSIZE_ANY_EXHDR(esc_text);
     691          58 :         if (elen == 0)
     692           4 :             e = NULL;           /* no escape character */
     693          54 :         else if (elen > 1)      /* several bytes: acceptable only as one multi-byte character */
     694             :         {
     695           4 :             int         escape_mblen = pg_mbstrlen_with_len(e, elen);
     696             : 
     697           4 :             if (escape_mblen > 1)
     698           4 :                 ereport(ERROR,
     699             :                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     700             :                          errmsg("invalid escape string"),
     701             :                          errhint("Escape string must be empty or one character.")));
     702             :         }
     703             :     }
     704             : 
     705             :     /*----------
     706             :      * We surround the transformed input string with
     707             :      *          ^(?: ... )$
     708             :      * which requires some explanation.  We need "^" and "$" to force
     709             :      * the pattern to match the entire input string as per the SQL spec.
     710             :      * The "(?:" and ")" are a non-capturing set of parens; we have to have
     711             :      * parens in case the string contains "|", else the "^" and "$" will
     712             :      * be bound into the first and last alternatives which is not what we
     713             :      * want, and the parens must be non capturing because we don't want them
     714             :      * to count when selecting output for SUBSTRING.
     715             :      *
     716             :      * When the pattern is divided into three parts by escape-double-quotes,
     717             :      * what we emit is
     718             :      *          ^(?:part1){1,1}?(part2){1,1}(?:part3)$
     719             :      * which requires even more explanation.  The "{1,1}?" on part1 makes it
     720             :      * non-greedy so that it will match the smallest possible amount of text
     721             :      * not the largest, as required by SQL.  The plain parens around part2
     722             :      * are capturing parens so that that part is what controls the result of
     723             :      * SUBSTRING.  The "{1,1}" forces part2 to be greedy, so that it matches
     724             :      * the largest possible amount of text; hence part3 must match the
     725             :      * smallest amount of text, as required by SQL.  We don't need an explicit
     726             :      * greediness marker on part3.  Note that this also confines the effects
     727             :      * of any "|" characters to the respective part, which is what we want.
     728             :      *
     729             :      * The SQL spec says that SUBSTRING's pattern must contain exactly two
     730             :      * escape-double-quotes, but we only complain if there's more than two.
     731             :      * With none, we act as though part1 and part3 are empty; with one, we
     732             :      * act as though part3 is empty.  Both behaviors fall out of omitting
     733             :      * the relevant part separators in the above expansion.  If the result
     734             :      * of this function is used in a plain regexp match (SIMILAR TO), the
     735             :      * escape-double-quotes have no effect on the match behavior.
     736             :      *----------
     737             :      */
     738             : 
     739             :     /*
     740             :      * We need room for the prefix/postfix and part separators, plus as many
     741             :      * as 3 output bytes per input byte; since the input is at most 1GB this
     742             :      * can't overflow size_t.
                     :      *
                     :      * The constant 23 is the fixed text emitted below: 4 bytes of prefix
                     :      * "^(?:", 8 bytes for the first part separator, 9 bytes for the second
                     :      * one, and 2 bytes of postfix ")$".  The factor 3 is reached by '(',
                     :      * which is expanded to the three bytes "(?:".
     743             :      */
     744          70 :     result = (text *) palloc(VARHDRSZ + 23 + 3 * (size_t) plen);
     745          70 :     r = VARDATA(result);
     746             : 
     747          70 :     *r++ = '^';
     748          70 :     *r++ = '(';
     749          70 :     *r++ = '?';
     750          70 :     *r++ = ':';
     751             : 
     752         638 :     while (plen > 0)
     753             :     {
     754         502 :         char        pchar = *p;
     755             : 
     756             :         /*
     757             :          * If both the escape character and the current character from the
     758             :          * pattern are multi-byte, we need to take the slow path.
     759             :          *
     760             :          * But if one of them is single-byte, we can process the pattern one
     761             :          * byte at a time, ignoring multi-byte characters.  (This works
     762             :          * because all server-encodings have the property that a valid
     763             :          * multi-byte character representation cannot contain the
     764             :          * representation of a valid single-byte character.)
                     :          *
                     :          * Note that with a multi-byte escape and a single-byte pattern
                     :          * character, control falls through to the fast path, whose escape
                     :          * test compares pchar with the first byte of the escape only; the
                     :          * encoding property above is what keeps that test from matching.
     765             :          */
     766             : 
     767         502 :         if (elen > 1)
     768             :         {
     769           0 :             int         mblen = pg_mblen(p);
     770             : 
     771           0 :             if (mblen > 1)
     772             :             {
     773             :                 /* slow, multi-byte path */
     774           0 :                 if (afterescape)
     775             :                 {
     776           0 :                     *r++ = '\\';
     777           0 :                     memcpy(r, p, mblen);
     778           0 :                     r += mblen;
     779           0 :                     afterescape = false;
     780             :                 }
     781           0 :                 else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
     782             :                 {
     783             :                     /* SQL escape character; do not send to output */
     784           0 :                     afterescape = true;
     785             :                 }
     786             :                 else
     787             :                 {
     788             :                     /*
     789             :                      * We know it's a multi-byte character, so we don't need
     790             :                      * to do all the comparisons to single-byte characters
     791             :                      * that we do below.
     792             :                      */
     793           0 :                     memcpy(r, p, mblen);
     794           0 :                     r += mblen;
     795             :                 }
     796             : 
     797           0 :                 p += mblen;     /* consume the whole multi-byte character */
     798           0 :                 plen -= mblen;
     799             : 
     800           0 :                 continue;
     801             :             }
     802             :         }
     803             : 
     804             :         /* fast path */
     805         502 :         if (afterescape)
     806             :         {
     807          88 :             if (pchar == '"' && !incharclass)  /* escape-double-quote? */
     808             :             {
     809             :                 /* emit appropriate part separator, per notes above */
     810          76 :                 if (nquotes == 0)
     811             :                 {
     812          38 :                     *r++ = ')';
     813          38 :                     *r++ = '{';
     814          38 :                     *r++ = '1';
     815          38 :                     *r++ = ',';
     816          38 :                     *r++ = '1';
     817          38 :                     *r++ = '}';
     818          38 :                     *r++ = '?';
     819          38 :                     *r++ = '(';
     820             :                 }
     821          38 :                 else if (nquotes == 1)
     822             :                 {
     823          34 :                     *r++ = ')';
     824          34 :                     *r++ = '{';
     825          34 :                     *r++ = '1';
     826          34 :                     *r++ = ',';
     827          34 :                     *r++ = '1';
     828          34 :                     *r++ = '}';
     829          34 :                     *r++ = '(';
     830          34 :                     *r++ = '?';
     831          34 :                     *r++ = ':';
     832             :                 }
     833             :                 else
     834           4 :                     ereport(ERROR,
     835             :                             (errcode(ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER),
     836             :                              errmsg("SQL regular expression may not contain more than two escape-double-quote separators")));
     837          72 :                 nquotes++;
     838             :             }
     839             :             else
     840             :             {
     841             :                 /*
     842             :                  * We allow any character at all to be escaped; notably, this
     843             :                  * allows access to POSIX character-class escapes such as
     844             :                  * "\d".  The SQL spec is considerably more restrictive.
     845             :                  */
     846          12 :                 *r++ = '\\';
     847          12 :                 *r++ = pchar;
     848             :             }
     849          84 :             afterescape = false;
     850             :         }
     851         414 :         else if (e && pchar == *e)
     852             :         {
     853             :             /* SQL escape character; do not send to output */
     854          88 :             afterescape = true;
     855             :         }
     856         326 :         else if (incharclass)   /* inside [...]: copy through, doubling only a backslash */
     857             :         {
     858           0 :             if (pchar == '\\')
     859           0 :                 *r++ = '\\';
     860           0 :             *r++ = pchar;
     861           0 :             if (pchar == ']')
     862           0 :                 incharclass = false;
     863             :         }
     864         326 :         else if (pchar == '[')  /* a bracket expression starts here */
     865             :         {
     866           0 :             *r++ = pchar;
     867           0 :             incharclass = true;
     868             :         }
     869         326 :         else if (pchar == '%')  /* '%' is emitted as ".*" */
     870             :         {
     871          58 :             *r++ = '.';
     872          58 :             *r++ = '*';
     873             :         }
     874         268 :         else if (pchar == '_')  /* '_' is emitted as "." */
     875          32 :             *r++ = '.';
     876         236 :         else if (pchar == '(')
     877             :         {
     878             :             /* convert to non-capturing parenthesis */
     879           8 :             *r++ = '(';
     880           8 :             *r++ = '?';
     881           8 :             *r++ = ':';
     882             :         }
     883         228 :         else if (pchar == '\\' || pchar == '.' ||
     884         220 :                  pchar == '^' || pchar == '$')
     885             :         {
                     :             /* emit these four with a backslash in front of them */
     886           8 :             *r++ = '\\';
     887           8 :             *r++ = pchar;
     888             :         }
     889             :         else
     890         220 :             *r++ = pchar;
     891         498 :         p++, plen--;            /* the fast path consumes exactly one byte */
     892             :     }
     893             : 
     894          66 :     *r++ = ')';
     895          66 :     *r++ = '$';
     896             :     /* measured from the start of result, so the size includes the header */
     897          66 :     SET_VARSIZE(result, r - ((char *) result));
     898             : 
     899          66 :     return result;
     900             : }
     901             : 
     902             : /*
     903             :  * similar_to_escape(pattern, escape)
                     :  *
                     :  * Two-argument form: both the pattern (arg 0) and the escape string
                     :  * (arg 1) are fetched without any NULL test and handed to
                     :  * similar_escape_internal(), whose result is returned as text.
     904             :  */
     905             : Datum
     906          58 : similar_to_escape_2(PG_FUNCTION_ARGS)
     907             : {
     908          58 :     text       *pat_text = PG_GETARG_TEXT_PP(0);
     909          58 :     text       *esc_text = PG_GETARG_TEXT_PP(1);
     910             :     text       *result;
     911             : 
     912          58 :     result = similar_escape_internal(pat_text, esc_text);
     913             : 
     914          50 :     PG_RETURN_TEXT_P(result);
     915             : }
     916             : 
     917             : /*
     918             :  * similar_to_escape(pattern)
     919             :  * Inserts a default escape character.
                     :  *
                     :  * The default is selected by passing NULL as the escape argument of
                     :  * similar_escape_internal().
     920             :  */
     921             : Datum
     922          16 : similar_to_escape_1(PG_FUNCTION_ARGS)
     923             : {
     924          16 :     text       *pat_text = PG_GETARG_TEXT_PP(0);
     925             :     text       *result;
     926             : 
     927          16 :     result = similar_escape_internal(pat_text, NULL);
     928             : 
     929          16 :     PG_RETURN_TEXT_P(result);
     930             : }
     931             : 
     932             : /*
     933             :  * similar_escape(pattern, escape)
     934             :  *
     935             :  * Legacy function for compatibility with views stored using the
     936             :  * pre-v13 expansion of SIMILAR TO.  Unlike the above functions, this
     937             :  * is non-strict, which leads to not-per-spec handling of "ESCAPE NULL".
                     :  *
                     :  * Concretely: a NULL pattern yields a NULL result, while a NULL escape
                     :  * argument is treated as a request for the default escape character.
     938             :  */
     939             : Datum
     940           0 : similar_escape(PG_FUNCTION_ARGS)
     941             : {
     942             :     text       *pat_text;
     943             :     text       *esc_text;
     944             :     text       *result;
     945             : 
     946             :     /* This function is not strict, so must test explicitly */
     947           0 :     if (PG_ARGISNULL(0))
     948           0 :         PG_RETURN_NULL();
     949           0 :     pat_text = PG_GETARG_TEXT_PP(0);
     950             : 
     951           0 :     if (PG_ARGISNULL(1))
     952           0 :         esc_text = NULL;        /* use default escape character */
     953             :     else
     954           0 :         esc_text = PG_GETARG_TEXT_PP(1);
     955             : 
     956           0 :     result = similar_escape_internal(pat_text, esc_text);
     957             : 
     958           0 :     PG_RETURN_TEXT_P(result);
     959             : }
     960             : 
     961             : /*
     962             :  * regexp_match()
     963             :  *      Return the first substring(s) matching a pattern within a string.
                     :  *
                     :  * Argument 0 is the string to search, argument 1 the pattern, and
                     :  * argument 2 an option string that is fetched only if it exists.  The
                     :  * "global" option is rejected with an error.  Returns SQL NULL when there
                     :  * is no match; otherwise returns the array built by
                     :  * build_regexp_match_result().
     964             :  */
     965             : Datum
     966         442 : regexp_match(PG_FUNCTION_ARGS)
     967             : {
     968         442 :     text       *orig_str = PG_GETARG_TEXT_PP(0);
     969         442 :     text       *pattern = PG_GETARG_TEXT_PP(1);
     970         442 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
     971             :     pg_re_flags re_flags;
     972             :     regexp_matches_ctx *matchctx;
     973             : 
     974             :     /* Determine options */
     975         442 :     parse_re_flags(&re_flags, flags);
     976             :     /* User mustn't specify 'g' */
     977         442 :     if (re_flags.glob)
     978           6 :         ereport(ERROR,
     979             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     980             :         /* translator: %s is a SQL function name */
     981             :                  errmsg("%s does not support the \"global\" option",
     982             :                         "regexp_match()"),
     983             :                  errhint("Use the regexp_matches function instead.")));
     984             :     /* the three bools are use_subpatterns, ignore_degenerate, fetching_unmatched */
     985         436 :     matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
     986             :                                     PG_GET_COLLATION(), true, false, false);
     987             : 
     988         436 :     if (matchctx->nmatches == 0)
     989         138 :         PG_RETURN_NULL();
     990             :     /* 'g' was rejected above, so at most one match is expected here */
     991             :     Assert(matchctx->nmatches == 1);
     992             : 
     993             :     /* Create workspace that build_regexp_match_result needs */
     994         298 :     matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
     995         298 :     matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
     996             : 
     997         298 :     PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx)));
     998             : }
     999             : 
    1000             : /* This is separate to keep the opr_sanity regression test from complaining */
    1001             : Datum
    1002         416 : regexp_match_no_flags(PG_FUNCTION_ARGS)
    1003             : {
    1004         416 :     return regexp_match(fcinfo);    /* same call info; regexp_match fetches the flags only if present */
    1005             : }
    1006             : 
    1007             : /*
    1008             :  * regexp_matches()
    1009             :  *      Return a table of all matches of a pattern within a string.
                     :  *
                     :  * Argument 0 is the string to search, argument 1 the pattern, and
                     :  * argument 2 an option string that is fetched only if it exists.  On the
                     :  * first call all matching is done by setup_regexp_matches() and the
                     :  * resulting context is stored in funcctx->user_fctx.  Each call, the
                     :  * first one included, then returns the next match as an array, until
                     :  * next_match reaches nmatches and the function reports that it is done.
    1010             :  */
    1011             : Datum
    1012         464 : regexp_matches(PG_FUNCTION_ARGS)
    1013             : {
    1014             :     FuncCallContext *funcctx;
    1015             :     regexp_matches_ctx *matchctx;
    1016             : 
    1017         464 :     if (SRF_IS_FIRSTCALL())
    1018             :     {
    1019         198 :         text       *pattern = PG_GETARG_TEXT_PP(1);
    1020         198 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1021             :         pg_re_flags re_flags;
    1022             :         MemoryContext oldcontext;
    1023             : 
    1024         198 :         funcctx = SRF_FIRSTCALL_INIT();
    1025         198 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1026             : 
    1027             :         /* Determine options */
    1028         198 :         parse_re_flags(&re_flags, flags);
    1029             : 
    1030             :         /* be sure to copy the input string into the multi-call ctx */
    1031         194 :         matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
    1032             :                                         &re_flags,
    1033             :                                         PG_GET_COLLATION(),
    1034             :                                         true, false, false);    /* use_subpatterns, ignore_degenerate, fetching_unmatched */
    1035             : 
    1036             :         /* Pre-create workspace that build_regexp_match_result needs */
    1037         186 :         matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
    1038         186 :         matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
    1039             : 
    1040         186 :         MemoryContextSwitchTo(oldcontext);
    1041         186 :         funcctx->user_fctx = (void *) matchctx;     /* retrieved again on every call below */
    1042             :     }
    1043             : 
    1044         452 :     funcctx = SRF_PERCALL_SETUP();
    1045         452 :     matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
    1046             : 
    1047         452 :     if (matchctx->next_match < matchctx->nmatches)
    1048             :     {
    1049             :         ArrayType  *result_ary;
    1050             : 
    1051         266 :         result_ary = build_regexp_match_result(matchctx);
    1052         266 :         matchctx->next_match++;     /* the following call moves on to the next match */
    1053         266 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
    1054             :     }
    1055             :     /* every stored match has been returned */
    1056         186 :     SRF_RETURN_DONE(funcctx);
    1057             : }
    1058             : 
    1059             : /* This is separate to keep the opr_sanity regression test from complaining */
    1060             : Datum
    1061         236 : regexp_matches_no_flags(PG_FUNCTION_ARGS)
    1062             : {
    1063         236 :     return regexp_matches(fcinfo);  /* same call info; regexp_matches fetches the flags only if present */
    1064             : }
    1065             : 
    1066             : /*
    1067             :  * setup_regexp_matches --- do the initial matching for regexp_match
    1068             :  *      and regexp_split functions
    1069             :  *
    1070             :  * To avoid having to re-find the compiled pattern on each call, we do
    1071             :  * all the matching in one swoop.  The returned regexp_matches_ctx contains
    1072             :  * the locations of all the substrings matching the pattern.
    1073             :  *
    1074             :  * The three bool parameters have only two patterns (one for matching, one for
    1075             :  * splitting) but it seems clearer to distinguish the functionality this way
    1076             :  * than to key it all off one "is_split" flag. We don't currently assume that
    1077             :  * fetching_unmatched is exclusive of fetching the matched text too; if it's
    1078             :  * set, the conversion buffer is large enough to fetch any single matched or
    1079             :  * unmatched string, but not any larger substring. (In practice, when splitting
    1080             :  * the matches are usually small anyway, and it didn't seem worth complicating
    1081             :  * the code further.)
    1082             :  */
    1083             : static regexp_matches_ctx *
    1084      200740 : setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
    1085             :                      Oid collation,
    1086             :                      bool use_subpatterns,
    1087             :                      bool ignore_degenerate,
    1088             :                      bool fetching_unmatched)
    1089             : {
    1090      200740 :     regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
    1091      200740 :     int         eml = pg_database_encoding_max_length();
    1092             :     int         orig_len;
    1093             :     pg_wchar   *wide_str;
    1094             :     int         wide_len;
    1095             :     regex_t    *cpattern;
    1096             :     regmatch_t *pmatch;
    1097             :     int         pmatch_len;
    1098             :     int         array_len;
    1099             :     int         array_idx;
    1100             :     int         prev_match_end;
    1101             :     int         prev_valid_match_end;
    1102             :     int         start_search;
    1103      200740 :     int         maxlen = 0;     /* largest fetch length in characters */
    1104             : 
    1105             :     /* save original string --- we'll extract result substrings from it */
    1106      200740 :     matchctx->orig_str = orig_str;
    1107             : 
    1108             :     /* convert string to pg_wchar form for matching */
    1109      200740 :     orig_len = VARSIZE_ANY_EXHDR(orig_str);
    1110      200740 :     wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
    1111      200740 :     wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
    1112             : 
    1113             :     /* set up the compiled pattern */
    1114      200740 :     cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
    1115             : 
    1116             :     /* do we want to remember subpatterns? */
    1117      200732 :     if (use_subpatterns && cpattern->re_nsub > 0)
    1118             :     {
    1119         506 :         matchctx->npatterns = cpattern->re_nsub;
    1120         506 :         pmatch_len = cpattern->re_nsub + 1;
    1121             :     }
    1122             :     else
    1123             :     {
    1124      200226 :         use_subpatterns = false;
    1125      200226 :         matchctx->npatterns = 1;
    1126      200226 :         pmatch_len = 1;
    1127             :     }
    1128             : 
    1129             :     /* temporary output space for RE package */
    1130      200732 :     pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
    1131             : 
    1132             :     /*
    1133             :      * the real output space (grown dynamically if needed)
    1134             :      *
    1135             :      * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
    1136             :      * than at 2^27
    1137             :      */
    1138      200732 :     array_len = re_flags->glob ? 255 : 31;
    1139      200732 :     matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
    1140      200732 :     array_idx = 0;
    1141             : 
    1142             :     /* search for the pattern, perhaps repeatedly */
    1143      200732 :     prev_match_end = 0;
    1144      200732 :     prev_valid_match_end = 0;
    1145      200732 :     start_search = 0;
    1146     1202542 :     while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
    1147             :                             pmatch_len, pmatch))
    1148             :     {
    1149             :         /*
    1150             :          * If requested, ignore degenerate matches, which are zero-length
    1151             :          * matches occurring at the start or end of a string or just after a
    1152             :          * previous match.
    1153             :          */
    1154     1602512 :         if (!ignore_degenerate ||
    1155     1601920 :             (pmatch[0].rm_so < wide_len &&
    1156      800946 :              pmatch[0].rm_eo > prev_match_end))
    1157             :         {
    1158             :             /* enlarge output space if needed */
    1159     1602836 :             while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
    1160             :             {
    1161           0 :                 array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
    1162           0 :                 if (array_len > MaxAllocSize / sizeof(int))
    1163           0 :                     ereport(ERROR,
    1164             :                             (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    1165             :                              errmsg("too many regular expression matches")));
    1166           0 :                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
    1167             :                                                         sizeof(int) * array_len);
    1168             :             }
    1169             : 
    1170             :             /* save this match's locations */
    1171      801418 :             if (use_subpatterns)
    1172             :             {
    1173             :                 int         i;
    1174             : 
    1175        1148 :                 for (i = 1; i <= matchctx->npatterns; i++)
    1176             :                 {
    1177         772 :                     int         so = pmatch[i].rm_so;
    1178         772 :                     int         eo = pmatch[i].rm_eo;
    1179             : 
    1180         772 :                     matchctx->match_locs[array_idx++] = so;
    1181         772 :                     matchctx->match_locs[array_idx++] = eo;
    1182         772 :                     if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
    1183         496 :                         maxlen = (eo - so);
    1184             :                 }
    1185             :             }
    1186             :             else
    1187             :             {
    1188      801042 :                 int         so = pmatch[0].rm_so;
    1189      801042 :                 int         eo = pmatch[0].rm_eo;
    1190             : 
    1191      801042 :                 matchctx->match_locs[array_idx++] = so;
    1192      801042 :                 matchctx->match_locs[array_idx++] = eo;
    1193      801042 :                 if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
    1194      200134 :                     maxlen = (eo - so);
    1195             :             }
    1196      801418 :             matchctx->nmatches++;
    1197             : 
    1198             :             /*
    1199             :              * check length of unmatched portion between end of previous valid
    1200             :              * (nondegenerate, or degenerate but not ignored) match and start
    1201             :              * of current one
    1202             :              */
    1203     1602272 :             if (fetching_unmatched &&
    1204     1601708 :                 pmatch[0].rm_so >= 0 &&
    1205      800854 :                 (pmatch[0].rm_so - prev_valid_match_end) > maxlen)
    1206      380112 :                 maxlen = (pmatch[0].rm_so - prev_valid_match_end);
    1207      801418 :             prev_valid_match_end = pmatch[0].rm_eo;
    1208             :         }
    1209      801538 :         prev_match_end = pmatch[0].rm_eo;
    1210             : 
    1211             :         /* if not glob, stop after one match */
    1212      801538 :         if (!re_flags->glob)
    1213         416 :             break;
    1214             : 
    1215             :         /*
    1216             :          * Advance search position.  Normally we start the next search at the
    1217             :          * end of the previous match; but if the match was of zero length, we
    1218             :          * have to advance by one character, or we'd just find the same match
    1219             :          * again.
    1220             :          */
    1221      801122 :         start_search = prev_match_end;
    1222      801122 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    1223         784 :             start_search++;
    1224      801122 :         if (start_search > wide_len)
    1225          44 :             break;
    1226             :     }
    1227             : 
    1228             :     /*
    1229             :      * check length of unmatched portion between end of last match and end of
    1230             :      * input string
    1231             :      */
    1232      400842 :     if (fetching_unmatched &&
    1233      200110 :         (wide_len - prev_valid_match_end) > maxlen)
    1234          20 :         maxlen = (wide_len - prev_valid_match_end);
    1235             : 
    1236             :     /*
    1237             :      * Keep a note of the end position of the string for the benefit of
    1238             :      * splitting code.
    1239             :      */
    1240      200732 :     matchctx->match_locs[array_idx] = wide_len;
    1241             : 
    1242      200732 :     if (eml > 1)
    1243             :     {
    1244      200732 :         int64       maxsiz = eml * (int64) maxlen;
    1245             :         int         conv_bufsiz;
    1246             : 
    1247             :         /*
    1248             :          * Make the conversion buffer large enough for any substring of
    1249             :          * interest.
    1250             :          *
    1251             :          * Worst case: assume we need the maximum size (maxlen*eml), but take
    1252             :          * advantage of the fact that the original string length in bytes is
    1253             :          * an upper bound on the byte length of any fetched substring (and we
    1254             :          * know that len+1 is safe to allocate because the varlena header is
    1255             :          * longer than 1 byte).
    1256             :          */
    1257      200732 :         if (maxsiz > orig_len)
    1258      200204 :             conv_bufsiz = orig_len + 1;
    1259             :         else
    1260         528 :             conv_bufsiz = maxsiz + 1;   /* safe since maxsiz < 2^30 */
    1261             : 
    1262      200732 :         matchctx->conv_buf = palloc(conv_bufsiz);
    1263      200732 :         matchctx->conv_bufsiz = conv_bufsiz;
    1264      200732 :         matchctx->wide_str = wide_str;
    1265             :     }
    1266             :     else
    1267             :     {
    1268             :         /* No need to keep the wide string if we're in a single-byte charset. */
    1269           0 :         pfree(wide_str);
    1270           0 :         matchctx->wide_str = NULL;
    1271           0 :         matchctx->conv_buf = NULL;
    1272           0 :         matchctx->conv_bufsiz = 0;
    1273             :     }
    1274             : 
    1275             :     /* Clean up temp storage */
    1276      200732 :     pfree(pmatch);
    1277             : 
    1278      200732 :     return matchctx;
    1279             : }
    1280             : 
    1281             : /*
    1282             :  * build_regexp_match_result - build output array for current match
    1283             :  */
    1284             : static ArrayType *
    1285         564 : build_regexp_match_result(regexp_matches_ctx *matchctx)
    1286             : {
    1287         564 :     char       *buf = matchctx->conv_buf;
    1288         564 :     int         bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
    1289         564 :     Datum      *elems = matchctx->elems;
    1290         564 :     bool       *nulls = matchctx->nulls;
    1291             :     int         dims[1];
    1292             :     int         lbs[1];
    1293             :     int         loc;
    1294             :     int         i;
    1295             : 
    1296             :     /* Extract matching substrings from the original string */
    1297         564 :     loc = matchctx->next_match * matchctx->npatterns * 2;
    1298        1524 :     for (i = 0; i < matchctx->npatterns; i++)
    1299             :     {
    1300         960 :         int         so = matchctx->match_locs[loc++];
    1301         960 :         int         eo = matchctx->match_locs[loc++];
    1302             : 
    1303         960 :         if (so < 0 || eo < 0)
    1304             :         {
    1305           4 :             elems[i] = (Datum) 0;
    1306           4 :             nulls[i] = true;
    1307             :         }
    1308         956 :         else if (buf)
    1309             :         {
    1310         956 :             int         len = pg_wchar2mb_with_len(matchctx->wide_str + so,
    1311             :                                                    buf,
    1312             :                                                    eo - so);
    1313             : 
    1314             :             Assert(len < bufsiz);
    1315         956 :             elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
    1316         956 :             nulls[i] = false;
    1317             :         }
    1318             :         else
    1319             :         {
    1320           0 :             elems[i] = DirectFunctionCall3(text_substr,
    1321             :                                            PointerGetDatum(matchctx->orig_str),
    1322             :                                            Int32GetDatum(so + 1),
    1323             :                                            Int32GetDatum(eo - so));
    1324           0 :             nulls[i] = false;
    1325             :         }
    1326             :     }
    1327             : 
    1328             :     /* And form an array */
    1329         564 :     dims[0] = matchctx->npatterns;
    1330         564 :     lbs[0] = 1;
    1331             :     /* XXX: this hardcodes assumptions about the text type */
    1332         564 :     return construct_md_array(elems, nulls, 1, dims, lbs,
    1333             :                               TEXTOID, -1, false, 'i');
    1334             : }
    1335             : 
    1336             : /*
    1337             :  * regexp_split_to_table()
    1338             :  *      Split the string at matches of the pattern, returning the
    1339             :  *      split-out substrings as a table.
    1340             :  */
    1341             : Datum
    1342         424 : regexp_split_to_table(PG_FUNCTION_ARGS)
    1343             : {
    1344             :     FuncCallContext *funcctx;
    1345             :     regexp_matches_ctx *splitctx;
    1346             : 
    1347         424 :     if (SRF_IS_FIRSTCALL())
    1348             :     {
    1349          38 :         text       *pattern = PG_GETARG_TEXT_PP(1);
    1350          38 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1351             :         pg_re_flags re_flags;
    1352             :         MemoryContext oldcontext;
    1353             : 
    1354          38 :         funcctx = SRF_FIRSTCALL_INIT();
    1355          38 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1356             : 
    1357             :         /* Determine options */
    1358          38 :         parse_re_flags(&re_flags, flags);
    1359             :         /* User mustn't specify 'g' */
    1360          34 :         if (re_flags.glob)
    1361           4 :             ereport(ERROR,
    1362             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1363             :             /* translator: %s is a SQL function name */
    1364             :                      errmsg("%s does not support the \"global\" option",
    1365             :                             "regexp_split_to_table()")));
    1366             :         /* But we find all the matches anyway */
    1367          30 :         re_flags.glob = true;
    1368             : 
    1369             :         /* be sure to copy the input string into the multi-call ctx */
    1370          30 :         splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
    1371             :                                         &re_flags,
    1372             :                                         PG_GET_COLLATION(),
    1373             :                                         false, true, true);
    1374             : 
    1375          30 :         MemoryContextSwitchTo(oldcontext);
    1376          30 :         funcctx->user_fctx = (void *) splitctx;
    1377             :     }
    1378             : 
    1379         416 :     funcctx = SRF_PERCALL_SETUP();
    1380         416 :     splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
    1381             : 
    1382         416 :     if (splitctx->next_match <= splitctx->nmatches)
    1383             :     {
    1384         386 :         Datum       result = build_regexp_split_result(splitctx);
    1385             : 
    1386         386 :         splitctx->next_match++;
    1387         386 :         SRF_RETURN_NEXT(funcctx, result);
    1388             :     }
    1389             : 
    1390          30 :     SRF_RETURN_DONE(funcctx);
    1391             : }
    1392             : 
    1393             : /* This is separate to keep the opr_sanity regression test from complaining */
    1394             : Datum
    1395         368 : regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
    1396             : {
    1397         368 :     return regexp_split_to_table(fcinfo);
    1398             : }
    1399             : 
    1400             : /*
    1401             :  * regexp_split_to_array()
    1402             :  *      Split the string at matches of the pattern, returning the
    1403             :  *      split-out substrings as an array.
    1404             :  */
    1405             : Datum
    1406      200088 : regexp_split_to_array(PG_FUNCTION_ARGS)
    1407             : {
    1408      200088 :     ArrayBuildState *astate = NULL;
    1409             :     pg_re_flags re_flags;
    1410             :     regexp_matches_ctx *splitctx;
    1411             : 
    1412             :     /* Determine options */
    1413      200088 :     parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2));
    1414             :     /* User mustn't specify 'g' */
    1415      200084 :     if (re_flags.glob)
    1416           4 :         ereport(ERROR,
    1417             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1418             :         /* translator: %s is a SQL function name */
    1419             :                  errmsg("%s does not support the \"global\" option",
    1420             :                         "regexp_split_to_array()")));
    1421             :     /* But we find all the matches anyway */
    1422      200080 :     re_flags.glob = true;
    1423             : 
    1424      400160 :     splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
    1425      200080 :                                     PG_GETARG_TEXT_PP(1),
    1426             :                                     &re_flags,
    1427             :                                     PG_GET_COLLATION(),
    1428             :                                     false, true, true);
    1429             : 
    1430     1400738 :     while (splitctx->next_match <= splitctx->nmatches)
    1431             :     {
    1432     1000578 :         astate = accumArrayResult(astate,
    1433             :                                   build_regexp_split_result(splitctx),
    1434             :                                   false,
    1435             :                                   TEXTOID,
    1436             :                                   CurrentMemoryContext);
    1437     1000578 :         splitctx->next_match++;
    1438             :     }
    1439             : 
    1440      200080 :     PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
    1441             : }
    1442             : 
    1443             : /* This is separate to keep the opr_sanity regression test from complaining */
    1444             : Datum
    1445      200052 : regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
    1446             : {
    1447      200052 :     return regexp_split_to_array(fcinfo);
    1448             : }
    1449             : 
    1450             : /*
    1451             :  * build_regexp_split_result - build output string for current match
    1452             :  *
    1453             :  * We return the string between the current match and the previous one,
    1454             :  * or the string after the last match when next_match == nmatches.
    1455             :  */
    1456             : static Datum
    1457     1000964 : build_regexp_split_result(regexp_matches_ctx *splitctx)
    1458             : {
    1459     1000964 :     char       *buf = splitctx->conv_buf;
    1460             :     int         startpos;
    1461             :     int         endpos;
    1462             : 
    1463     1000964 :     if (splitctx->next_match > 0)
    1464      800854 :         startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
    1465             :     else
    1466      200110 :         startpos = 0;
    1467     1000964 :     if (startpos < 0)
    1468           0 :         elog(ERROR, "invalid match ending position");
    1469             : 
    1470     1000964 :     if (buf)
    1471             :     {
    1472     1000964 :         int         bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
    1473             :         int         len;
    1474             : 
    1475     1000964 :         endpos = splitctx->match_locs[splitctx->next_match * 2];
    1476     1000964 :         if (endpos < startpos)
    1477           0 :             elog(ERROR, "invalid match starting position");
    1478     1000964 :         len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
    1479             :                                    buf,
    1480             :                                    endpos - startpos);
    1481             :         Assert(len < bufsiz);
    1482     1000964 :         return PointerGetDatum(cstring_to_text_with_len(buf, len));
    1483             :     }
    1484             :     else
    1485             :     {
    1486           0 :         endpos = splitctx->match_locs[splitctx->next_match * 2];
    1487           0 :         if (endpos < startpos)
    1488           0 :             elog(ERROR, "invalid match starting position");
    1489           0 :         return DirectFunctionCall3(text_substr,
    1490             :                                    PointerGetDatum(splitctx->orig_str),
    1491             :                                    Int32GetDatum(startpos + 1),
    1492             :                                    Int32GetDatum(endpos - startpos));
    1493             :     }
    1494             : }
    1495             : 
    1496             : /*
    1497             :  * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
    1498             :  *
    1499             :  * The result is NULL if there is no fixed prefix, else a palloc'd string.
    1500             :  * If it is an exact match, not just a prefix, *exact is returned as true.
    1501             :  */
    1502             : char *
    1503        5188 : regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
    1504             :                     bool *exact)
    1505             : {
    1506             :     char       *result;
    1507             :     regex_t    *re;
    1508             :     int         cflags;
    1509             :     int         re_result;
    1510             :     pg_wchar   *str;
    1511             :     size_t      slen;
    1512             :     size_t      maxlen;
    1513             :     char        errMsg[100];
    1514             : 
    1515        5188 :     *exact = false;             /* default result */
    1516             : 
    1517             :     /* Compile RE */
    1518        5188 :     cflags = REG_ADVANCED;
    1519        5188 :     if (case_insensitive)
    1520          32 :         cflags |= REG_ICASE;
    1521             : 
    1522        5188 :     re = RE_compile_and_cache(text_re, cflags, collation);
    1523             : 
    1524             :     /* Examine it to see if there's a fixed prefix */
    1525        5188 :     re_result = pg_regprefix(re, &str, &slen);
    1526             : 
    1527        5188 :     switch (re_result)
    1528             :     {
    1529             :         case REG_NOMATCH:
    1530         402 :             return NULL;
    1531             : 
    1532             :         case REG_PREFIX:
    1533             :             /* continue with wchar conversion */
    1534         642 :             break;
    1535             : 
    1536             :         case REG_EXACT:
    1537        4144 :             *exact = true;
    1538             :             /* continue with wchar conversion */
    1539        4144 :             break;
    1540             : 
    1541             :         default:
    1542             :             /* re failed??? */
    1543           0 :             CHECK_FOR_INTERRUPTS();
    1544           0 :             pg_regerror(re_result, re, errMsg, sizeof(errMsg));
    1545           0 :             ereport(ERROR,
    1546             :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    1547             :                      errmsg("regular expression failed: %s", errMsg)));
    1548             :             break;
    1549             :     }
    1550             : 
    1551             :     /* Convert pg_wchar result back to database encoding */
    1552        4786 :     maxlen = pg_database_encoding_max_length() * slen + 1;
    1553        4786 :     result = (char *) palloc(maxlen);
    1554        4786 :     slen = pg_wchar2mb_with_len(str, result, slen);
    1555             :     Assert(slen < maxlen);
    1556             : 
    1557        4786 :     free(str);
    1558             : 
    1559        4786 :     return result;
    1560             : }

Generated by: LCOV version 1.13