LCOV - code coverage report
Current view: top level - src/backend/utils/adt - varlena.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 90.9 % 1898 1726
Test Date: 2026-05-31 10:17:02 Functions: 92.5 % 146 135
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * varlena.c
       4              :  *    Functions for the variable-length built-in types.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/utils/adt/varlena.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : #include "postgres.h"
      16              : 
      17              : #include <ctype.h>
      18              : #include <limits.h>
      19              : 
      20              : #include "access/detoast.h"
      21              : #include "access/toast_compression.h"
      22              : #include "access/tupmacs.h"
      23              : #include "catalog/pg_collation.h"
      24              : #include "catalog/pg_type.h"
      25              : #include "common/hashfn.h"
      26              : #include "common/int.h"
      27              : #include "common/unicode_category.h"
      28              : #include "common/unicode_norm.h"
      29              : #include "common/unicode_version.h"
      30              : #include "funcapi.h"
      31              : #include "lib/hyperloglog.h"
      32              : #include "libpq/pqformat.h"
      33              : #include "miscadmin.h"
      34              : #include "nodes/execnodes.h"
      35              : #include "parser/scansup.h"
      36              : #include "port/pg_bswap.h"
      37              : #include "regex/regex.h"
      38              : #include "utils/builtins.h"
      39              : #include "utils/guc.h"
      40              : #include "utils/lsyscache.h"
      41              : #include "utils/memutils.h"
      42              : #include "utils/pg_locale.h"
      43              : #include "utils/sortsupport.h"
      44              : #include "utils/tuplestore.h"
      45              : #include "utils/varlena.h"
      46              : 
      47              : typedef varlena VarString;
      48              : 
      49              : /*
      50              :  * State for text_position_* functions.
      51              :  */
      52              : typedef struct
      53              : {
      54              :     pg_locale_t locale;         /* collation used for substring matching */
      55              :     bool        is_multibyte_char_in_char;  /* need to check char boundaries? */
      56              :     bool        greedy;         /* find longest possible substring? */
      57              : 
      58              :     char       *str1;           /* haystack string */
      59              :     char       *str2;           /* needle string */
      60              :     int         len1;           /* string lengths in bytes */
      61              :     int         len2;
      62              : 
      63              :     /* Skip table for Boyer-Moore-Horspool search algorithm: */
      64              :     int         skiptablemask;  /* mask for ANDing with skiptable subscripts */
      65              :     int         skiptable[256]; /* skip distance for given mismatched char */
      66              : 
      67              :     /*
      68              :      * Note that with nondeterministic collations, the length of the last
      69              :      * match is not necessarily equal to the length of the "needle" passed in.
      70              :      */
      71              :     char       *last_match;     /* pointer to last match in 'str1' */
      72              :     int         last_match_len; /* length of last match */
      73              :     int         last_match_len_tmp; /* same but for internal use */
      74              : 
      75              :     /*
      76              :      * Sometimes we need to convert the byte position of a match to a
      77              :      * character position.  These store the last position that was converted,
      78              :      * so that on the next call, we can continue from that point, rather than
      79              :      * count characters from the very beginning.
      80              :      */
      81              :     char       *refpoint;       /* pointer within original haystack string */
      82              :     int         refpos;         /* 0-based character offset of the same point */
      83              : } TextPositionState;
      84              : 
      85              : typedef struct
      86              : {
      87              :     char       *buf1;           /* 1st string, or abbreviation original string
      88              :                                  * buf */
      89              :     char       *buf2;           /* 2nd string, or abbreviation strxfrm() buf */
      90              :     int         buflen1;        /* Allocated length of buf1 */
      91              :     int         buflen2;        /* Allocated length of buf2 */
      92              :     int         last_len1;      /* Length of last buf1 string/strxfrm() input */
      93              :     int         last_len2;      /* Length of last buf2 string/strxfrm() blob */
      94              :     int         last_returned;  /* Last comparison result (cache) */
      95              :     bool        cache_blob;     /* Does buf2 contain strxfrm() blob, etc? */
      96              :     bool        collate_c;
      97              :     Oid         typid;          /* Actual datatype (text/bpchar/name) */
      98              :     hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
      99              :     hyperLogLogState full_card; /* Full key cardinality state */
     100              :     double      prop_card;      /* Required cardinality proportion */
     101              :     pg_locale_t locale;
     102              : } VarStringSortSupport;
     103              : 
     104              : /*
     105              :  * Output data for split_text(): we output either to an array or a table.
     106              :  * tupstore and tupdesc must be set up in advance to output to a table.
     107              :  */
     108              : typedef struct
     109              : {
     110              :     ArrayBuildState *astate;
     111              :     Tuplestorestate *tupstore;
     112              :     TupleDesc   tupdesc;
     113              : } SplitTextOutputData;
     114              : 
     115              : /*
     116              :  * This should be large enough that most strings will fit, but small enough
     117              :  * that we feel comfortable putting it on the stack
     118              :  */
     119              : #define TEXTBUFLEN      1024
     120              : 
     121              : #define DatumGetVarStringP(X)       ((VarString *) PG_DETOAST_DATUM(X))
     122              : #define DatumGetVarStringPP(X)      ((VarString *) PG_DETOAST_DATUM_PACKED(X))
     123              : 
     124              : static int  varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
     125              : static int  bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
     126              : static int  namefastcmp_c(Datum x, Datum y, SortSupport ssup);
     127              : static int  varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
     128              : static int  namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
     129              : static int  varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
     130              : static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
     131              : static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
     132              : static int32 text_length(Datum str);
     133              : static text *text_catenate(text *t1, text *t2);
     134              : static text *text_substring(Datum str,
     135              :                             int32 start,
     136              :                             int32 length,
     137              :                             bool length_not_specified);
     138              : static int  pg_mbcharcliplen_chars(const char *mbstr, int len, int limit);
     139              : static text *text_overlay(text *t1, text *t2, int sp, int sl);
     140              : static int  text_position(text *t1, text *t2, Oid collid);
     141              : static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
     142              : static bool text_position_next(TextPositionState *state);
     143              : static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
     144              : static char *text_position_get_match_ptr(TextPositionState *state);
     145              : static int  text_position_get_match_pos(TextPositionState *state);
     146              : static void text_position_cleanup(TextPositionState *state);
     147              : static void check_collation_set(Oid collid);
     148              : static int  text_cmp(text *arg1, text *arg2, Oid collid);
     149              : static void appendStringInfoText(StringInfo str, const text *t);
     150              : static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
     151              : static void split_text_accum_result(SplitTextOutputData *tstate,
     152              :                                     text *field_value,
     153              :                                     text *null_string,
     154              :                                     Oid collation);
     155              : static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
     156              :                                     const char *fldsep, const char *null_string);
     157              : static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
     158              : static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
     159              :                                      int *value);
     160              : static const char *text_format_parse_format(const char *start_ptr,
     161              :                                             const char *end_ptr,
     162              :                                             int *argpos, int *widthpos,
     163              :                                             int *flags, int *width);
     164              : static void text_format_string_conversion(StringInfo buf, char conversion,
     165              :                                           FmgrInfo *typOutputInfo,
     166              :                                           Datum value, bool isNull,
     167              :                                           int flags, int width);
     168              : static void text_format_append_string(StringInfo buf, const char *str,
     169              :                                       int flags, int width);
     170              : 
     171              : 
     172              : /*****************************************************************************
     173              :  *   CONVERSION ROUTINES EXPORTED FOR USE BY C CODE                          *
     174              :  *****************************************************************************/
     175              : 
     176              : /*
     177              :  * cstring_to_text
     178              :  *
     179              :  * Create a text value from a null-terminated C string.
     180              :  *
     181              :  * The new text value is freshly palloc'd with a full-size VARHDR.
     182              :  */
     183              : text *
     184     15493059 : cstring_to_text(const char *s)
     185              : {
     186     15493059 :     return cstring_to_text_with_len(s, strlen(s));
     187              : }
     188              : 
     189              : /*
     190              :  * cstring_to_text_with_len
     191              :  *
     192              :  * Same as cstring_to_text except the caller specifies the string length;
     193              :  * the string need not be null-terminated.
     194              :  */
     195              : text *
     196     17058355 : cstring_to_text_with_len(const char *s, int len)
     197              : {
     198     17058355 :     text       *result = (text *) palloc(len + VARHDRSZ);
     199              : 
     200     17058355 :     SET_VARSIZE(result, len + VARHDRSZ);
     201     17058355 :     memcpy(VARDATA(result), s, len);
     202              : 
     203     17058355 :     return result;
     204              : }
     205              : 
     206              : /*
     207              :  * text_to_cstring
     208              :  *
     209              :  * Create a palloc'd, null-terminated C string from a text value.
     210              :  *
     211              :  * We support being passed a compressed or toasted text value.
     212              :  * This is a bit bogus since such values shouldn't really be referred to as
     213              :  * "text *", but it seems useful for robustness.  If we didn't handle that
     214              :  * case here, we'd need another routine that did, anyway.
     215              :  */
     216              : char *
     217     10401281 : text_to_cstring(const text *t)
     218              : {
     219              :     /* must cast away the const, unfortunately */
     220     10401281 :     text       *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
     221     10401281 :     int         len = VARSIZE_ANY_EXHDR(tunpacked);
     222              :     char       *result;
     223              : 
     224     10401281 :     result = (char *) palloc(len + 1);
     225     10401281 :     memcpy(result, VARDATA_ANY(tunpacked), len);
     226     10401281 :     result[len] = '\0';
     227              : 
     228     10401281 :     if (tunpacked != t)
     229        29936 :         pfree(tunpacked);
     230              : 
     231     10401281 :     return result;
     232              : }
     233              : 
     234              : /*
     235              :  * text_to_cstring_buffer
     236              :  *
     237              :  * Copy a text value into a caller-supplied buffer of size dst_len.
     238              :  *
     239              :  * The text string is truncated if necessary to fit.  The result is
     240              :  * guaranteed null-terminated (unless dst_len == 0).
     241              :  *
     242              :  * We support being passed a compressed or toasted text value.
     243              :  * This is a bit bogus since such values shouldn't really be referred to as
     244              :  * "text *", but it seems useful for robustness.  If we didn't handle that
     245              :  * case here, we'd need another routine that did, anyway.
     246              :  */
     247              : void
     248          727 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
     249              : {
     250              :     /* must cast away the const, unfortunately */
     251          727 :     text       *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
     252          727 :     size_t      src_len = VARSIZE_ANY_EXHDR(srcunpacked);
     253              : 
     254          727 :     if (dst_len > 0)
     255              :     {
     256          727 :         dst_len--;
     257          727 :         if (dst_len >= src_len)
     258          727 :             dst_len = src_len;
     259              :         else                    /* ensure truncation is encoding-safe */
     260            0 :             dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
     261          727 :         memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
     262          727 :         dst[dst_len] = '\0';
     263              :     }
     264              : 
     265          727 :     if (srcunpacked != src)
     266            0 :         pfree(srcunpacked);
     267          727 : }
     268              : 
     269              : 
     270              : /*****************************************************************************
     271              :  *   USER I/O ROUTINES                                                       *
     272              :  *****************************************************************************/
     273              : 
     274              : /*
     275              :  *      textin          - converts cstring to internal representation
     276              :  */
     277              : Datum
     278     13197522 : textin(PG_FUNCTION_ARGS)
     279              : {
     280     13197522 :     char       *inputText = PG_GETARG_CSTRING(0);
     281              : 
     282     13197522 :     PG_RETURN_TEXT_P(cstring_to_text(inputText));
     283              : }
     284              : 
     285              : /*
     286              :  *      textout         - converts internal representation to cstring
     287              :  */
     288              : Datum
     289      4799817 : textout(PG_FUNCTION_ARGS)
     290              : {
     291      4799817 :     Datum       txt = PG_GETARG_DATUM(0);
     292              : 
     293      4799817 :     PG_RETURN_CSTRING(TextDatumGetCString(txt));
     294              : }
     295              : 
     296              : /*
     297              :  *      textrecv            - converts external binary format to text
     298              :  */
     299              : Datum
     300           27 : textrecv(PG_FUNCTION_ARGS)
     301              : {
     302           27 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     303              :     text       *result;
     304              :     char       *str;
     305              :     int         nbytes;
     306              : 
     307           27 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     308              : 
     309           27 :     result = cstring_to_text_with_len(str, nbytes);
     310           27 :     pfree(str);
     311           27 :     PG_RETURN_TEXT_P(result);
     312              : }
     313              : 
     314              : /*
     315              :  *      textsend            - converts text to binary format
     316              :  */
     317              : Datum
     318         2394 : textsend(PG_FUNCTION_ARGS)
     319              : {
     320         2394 :     text       *t = PG_GETARG_TEXT_PP(0);
     321              :     StringInfoData buf;
     322              : 
     323         2394 :     pq_begintypsend(&buf);
     324         2394 :     pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
     325         2394 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     326              : }
     327              : 
     328              : 
     329              : /*
     330              :  *      unknownin           - converts cstring to internal representation
     331              :  */
     332              : Datum
     333            0 : unknownin(PG_FUNCTION_ARGS)
     334              : {
     335            0 :     char       *str = PG_GETARG_CSTRING(0);
     336              : 
     337              :     /* representation is same as cstring */
     338            0 :     PG_RETURN_CSTRING(pstrdup(str));
     339              : }
     340              : 
     341              : /*
     342              :  *      unknownout          - converts internal representation to cstring
     343              :  */
     344              : Datum
     345          656 : unknownout(PG_FUNCTION_ARGS)
     346              : {
     347              :     /* representation is same as cstring */
     348          656 :     char       *str = PG_GETARG_CSTRING(0);
     349              : 
     350          656 :     PG_RETURN_CSTRING(pstrdup(str));
     351              : }
     352              : 
     353              : /*
     354              :  *      unknownrecv         - converts external binary format to unknown
     355              :  */
     356              : Datum
     357            0 : unknownrecv(PG_FUNCTION_ARGS)
     358              : {
     359            0 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     360              :     char       *str;
     361              :     int         nbytes;
     362              : 
     363            0 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     364              :     /* representation is same as cstring */
     365            0 :     PG_RETURN_CSTRING(str);
     366              : }
     367              : 
     368              : /*
     369              :  *      unknownsend         - converts unknown to binary format
     370              :  */
     371              : Datum
     372            0 : unknownsend(PG_FUNCTION_ARGS)
     373              : {
     374              :     /* representation is same as cstring */
     375            0 :     char       *str = PG_GETARG_CSTRING(0);
     376              :     StringInfoData buf;
     377              : 
     378            0 :     pq_begintypsend(&buf);
     379            0 :     pq_sendtext(&buf, str, strlen(str));
     380            0 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     381              : }
     382              : 
     383              : 
     384              : /* ========== PUBLIC ROUTINES ========== */
     385              : 
     386              : /*
     387              :  * textlen -
     388              :  *    returns the logical length of a text*
     389              :  *     (which is less than the VARSIZE of the text*)
     390              :  */
     391              : Datum
     392       286151 : textlen(PG_FUNCTION_ARGS)
     393              : {
     394       286151 :     Datum       str = PG_GETARG_DATUM(0);
     395              : 
     396              :     /* try to avoid decompressing argument */
     397       286151 :     PG_RETURN_INT32(text_length(str));
     398              : }
     399              : 
     400              : /*
     401              :  * text_length -
     402              :  *  Does the real work for textlen()
     403              :  *
     404              :  *  This is broken out so it can be called directly by other string processing
     405              :  *  functions.  Note that the argument is passed as a Datum, to indicate that
     406              :  *  it may still be in compressed form.  We can avoid decompressing it at all
     407              :  *  in some cases.
     408              :  */
     409              : static int32
     410       286161 : text_length(Datum str)
     411              : {
     412              :     /* fastpath when max encoding length is one */
     413       286161 :     if (pg_database_encoding_max_length() == 1)
     414           10 :         return (toast_raw_datum_size(str) - VARHDRSZ);
     415              :     else
     416              :     {
     417       286151 :         text       *t = DatumGetTextPP(str);
     418              : 
     419       286151 :         return (pg_mbstrlen_with_len(VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t)));
     420              :     }
     421              : }
     422              : 
     423              : /*
     424              :  * textoctetlen -
     425              :  *    returns the physical length of a text*
     426              :  *     (which is less than the VARSIZE of the text*)
     427              :  */
     428              : Datum
     429           45 : textoctetlen(PG_FUNCTION_ARGS)
     430              : {
     431           45 :     Datum       str = PG_GETARG_DATUM(0);
     432              : 
     433              :     /* We need not detoast the input at all */
     434           45 :     PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
     435              : }
     436              : 
     437              : /*
     438              :  * textcat -
     439              :  *    takes two text* and returns a text* that is the concatenation of
     440              :  *    the two.
     441              :  *
     442              :  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
     443              :  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
     444              :  * Allocate space for output in all cases.
     445              :  * XXX - thomas 1997-07-10
     446              :  */
     447              : Datum
     448      1362072 : textcat(PG_FUNCTION_ARGS)
     449              : {
     450      1362072 :     text       *t1 = PG_GETARG_TEXT_PP(0);
     451      1362072 :     text       *t2 = PG_GETARG_TEXT_PP(1);
     452              : 
     453      1362072 :     PG_RETURN_TEXT_P(text_catenate(t1, t2));
     454              : }
     455              : 
     456              : /*
     457              :  * text_catenate
     458              :  *  Guts of textcat(), broken out so it can be used by other functions
     459              :  *
     460              :  * Arguments can be in short-header form, but not compressed or out-of-line
     461              :  */
     462              : static text *
     463      1362128 : text_catenate(text *t1, text *t2)
     464              : {
     465              :     text       *result;
     466              :     int         len1,
     467              :                 len2,
     468              :                 len;
     469              :     char       *ptr;
     470              : 
     471      1362128 :     len1 = VARSIZE_ANY_EXHDR(t1);
     472      1362128 :     len2 = VARSIZE_ANY_EXHDR(t2);
     473              : 
     474              :     /* paranoia ... probably should throw error instead? */
     475      1362128 :     if (len1 < 0)
     476            0 :         len1 = 0;
     477      1362128 :     if (len2 < 0)
     478            0 :         len2 = 0;
     479              : 
     480      1362128 :     len = len1 + len2 + VARHDRSZ;
     481      1362128 :     result = (text *) palloc(len);
     482              : 
     483              :     /* Set size of result string... */
     484      1362128 :     SET_VARSIZE(result, len);
     485              : 
     486              :     /* Fill data field of result string... */
     487      1362128 :     ptr = VARDATA(result);
     488      1362128 :     if (len1 > 0)
     489      1360405 :         memcpy(ptr, VARDATA_ANY(t1), len1);
     490      1362128 :     if (len2 > 0)
     491      1361987 :         memcpy(ptr + len1, VARDATA_ANY(t2), len2);
     492              : 
     493      1362128 :     return result;
     494              : }
     495              : 
     496              : /*
     497              :  * charlen_to_bytelen()
     498              :  *  Compute the number of bytes occupied by n characters starting at *p
     499              :  *
     500              :  * The caller shall ensure there are n complete characters.  Callers achieve
     501              :  * this by deriving "n" from regmatch_t findings from searching a wchar array.
     502              :  * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex
     503              :  * matches will end no later than the last complete character.  (The string
     504              :  * need not be null-terminated.)
     505              :  */
     506              : static int
     507        11572 : charlen_to_bytelen(const char *p, int n)
     508              : {
     509        11572 :     if (pg_database_encoding_max_length() == 1)
     510              :     {
     511              :         /* Optimization for single-byte encodings */
     512           96 :         return n;
     513              :     }
     514              :     else
     515              :     {
     516              :         const char *s;
     517              : 
     518      3087142 :         for (s = p; n > 0; n--)
     519      3075666 :             s += pg_mblen_unbounded(s); /* caller verified encoding */
     520              : 
     521        11476 :         return s - p;
     522              :     }
     523              : }
     524              : 
     525              : /*
     526              :  * text_substr()
     527              :  * Return a substring starting at the specified position.
     528              :  * - thomas 1997-12-31
     529              :  *
     530              :  * Input:
     531              :  *  - string
     532              :  *  - starting position (is one-based)
     533              :  *  - string length
     534              :  *
     535              :  * If the starting position is zero or less, then return from the start of the string
     536              :  *  adjusting the length to be consistent with the "negative start" per SQL.
     537              :  * If the length is less than zero, return the remaining string.
     538              :  *
     539              :  * Added multibyte support.
     540              :  * - Tatsuo Ishii 1998-4-21
     541              :  * Changed behavior if starting position is less than one to conform to SQL behavior.
     542              :  * Formerly returned the entire string; now returns a portion.
     543              :  * - Thomas Lockhart 1998-12-10
     544              :  * Now uses faster TOAST-slicing interface
     545              :  * - John Gray 2002-02-22
     546              :  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
     547              :  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
     548              :  * error; if E < 1, return '', not entire string). Fixed MB-related bug where,
     549              :  * when S > LC and S < LC + 4, garbage characters were sometimes returned.
     550              :  * - Joe Conway 2002-08-10
     551              :  */
     552              : Datum
     553       369962 : text_substr(PG_FUNCTION_ARGS)
     554              : {
     555       369962 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     556              :                                     PG_GETARG_INT32(1),
     557              :                                     PG_GETARG_INT32(2),
     558              :                                     false));
     559              : }
     560              : 
     561              : /*
     562              :  * text_substr_no_len -
     563              :  *    Wrapper to avoid opr_sanity failure due to
     564              :  *    one function accepting a different number of args.
     565              :  */
     566              : Datum
     567           24 : text_substr_no_len(PG_FUNCTION_ARGS)
     568              : {
     569           24 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     570              :                                     PG_GETARG_INT32(1),
     571              :                                     -1, true));
     572              : }
     573              : 
     574              : /*
     575              :  * text_substring -
     576              :  *  Does the real work for text_substr() and text_substr_no_len()
     577              :  *
     578              :  *  start is a 1-based character position (zero or negative is accepted).
     579              :  *  length is a character count; it is ignored when length_not_specified is
     580              :  *  true, in which case the substring runs to the end of the string.
     581              :  *  str is passed as a Datum because it may still be in compressed/toasted
     582              :  *  form; we can avoid detoasting all of it in some cases.  Other string
     583              :  *  functions call this directly.  The result is always a freshly palloc'd datum.
     584              :  */
     585              : static text *
     586       396730 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
     587              : {
     588       396730 :     int32       eml = pg_database_encoding_max_length();
     589       396730 :     int32       S = start;      /* start position */
     590              :     int32       S1;             /* adjusted start position */
     591              :     int32       L1;             /* adjusted substring length */
     592              :     int32       E;              /* end position, exclusive */
     593              : 
     594              :     /*
     595              :      * SQL99 says S can be zero or negative (which we don't document), but we
     596              :      * still must fetch from the start of the string.
     597              :      * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org
     598              :      */
     599       396730 :     S1 = Max(S, 1);     /* clamp the start to the first character */
     600              : 
     601              :     /* life is easy if the encoding max length is 1 */
     602       396730 :     if (eml == 1)
     603              :     {
     604           11 :         if (length_not_specified)   /* special case - get length to end of
     605              :                                      * string */
     606            0 :             L1 = -1;
     607           11 :         else if (length < 0)
     608              :         {
     609              :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     610            0 :             ereport(ERROR,
     611              :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     612              :                      errmsg("negative substring length not allowed")));
     613              :             L1 = -1;            /* silence stupider compilers */
     614              :         }
     615           11 :         else if (pg_add_s32_overflow(S, length, &E))
     616              :         {
     617              :             /*
     618              :              * L could be large enough for S + L to overflow, in which case
     619              :              * the substring must run to end of string.
     620              :              */
     621            0 :             L1 = -1;
     622              :         }
     623              :         else
     624              :         {
     625              :             /*
     626              :              * A zero or negative value for the end position can happen if the
     627              :              * start was negative or one. SQL99 says to return a zero-length
     628              :              * string.
     629              :              */
     630           11 :             if (E < 1)
     631            0 :                 return cstring_to_text("");
     632              : 
     633           11 :             L1 = E - S1;
     634              :         }
     635              : 
     636              :         /*
     637              :          * If the start position is past the end of the string, SQL99 says to
     638              :          * return a zero-length string -- DatumGetTextPSlice() will do that
     639              :          * for us.  We need only convert S1 to zero-based starting position.
     640              :          */
     641           11 :         return DatumGetTextPSlice(str, S1 - 1, L1);
     642              :     }
     643       396719 :     else if (eml > 1)
     644              :     {
     645              :         /*
     646              :          * When encoding max length is > 1, we can't get LC without
     647              :          * detoasting, so we'll grab a conservatively large slice now and go
     648              :          * back later to do the right thing
     649              :          */
     650              :         int32       slice_start;
     651              :         int32       slice_size;
     652              :         int32       slice_strlen;
     653              :         int32       slice_len;
     654              :         text       *slice;
     655              :         int32       E1;
     656              :         int32       i;
     657              :         char       *p;
     658              :         char       *s;
     659              :         text       *ret;
     660              : 
     661              :         /*
     662              :          * We need to start at position zero because there is no way to know
     663              :          * in advance which byte offset corresponds to the supplied start
     664              :          * position.
     665              :          */
     666       396719 :         slice_start = 0;
     667              : 
     668       396719 :         if (length_not_specified)   /* special case - get length to end of
     669              :                                      * string */
     670           52 :             E = slice_size = L1 = -1;
     671       396667 :         else if (length < 0)
     672              :         {
     673              :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     674            8 :             ereport(ERROR,
     675              :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     676              :                      errmsg("negative substring length not allowed")));
     677              :             E = slice_size = L1 = -1;   /* silence stupider compilers */
     678              :         }
     679       396659 :         else if (pg_add_s32_overflow(S, length, &E))
     680              :         {
     681              :             /*
     682              :              * L could be large enough for S + L to overflow, in which case
     683              :              * the substring must run to end of string.
     684              :              */
     685            5 :             slice_size = L1 = -1;
     686              :         }
     687              :         else
     688              :         {
     689              :             /*
     690              :              * Ending at position 1, exclusive, obviously yields an empty
     691              :              * string.  A zero or negative value can happen if the start was
     692              :              * negative or one. SQL99 says to return a zero-length string.
     693              :              */
     694       396654 :             if (E <= 1)
     695            8 :                 return cstring_to_text("");
     696              : 
     697              :             /*
     698              :              * if E is past the end of the string, the tuple toaster will
     699              :              * truncate the length for us
     700              :              */
     701       396646 :             L1 = E - S1;
     702              : 
     703              :             /*
     704              :              * Total slice size in bytes can't be any longer than the
     705              :              * inclusive end position times the encoding max length.  If that
     706              :              * overflows, we can just use -1.
     707              :              */
     708       396646 :             if (pg_mul_s32_overflow(E - 1, eml, &slice_size))
     709            5 :                 slice_size = -1;
     710              :         }
     711              : 
     712              :         /*
     713              :          * If we're working with an untoasted source, no need to do an extra
     714              :          * copying step.
     715              :          */
     716       793318 :         if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
     717       396615 :             VARATT_IS_EXTERNAL(DatumGetPointer(str)))
     718          240 :             slice = DatumGetTextPSlice(str, slice_start, slice_size);
     719              :         else
     720       396463 :             slice = (text *) DatumGetPointer(str);
     721              : 
     722              :         /* see if we got back an empty string */
     723       396703 :         slice_len = VARSIZE_ANY_EXHDR(slice);
     724       396703 :         if (slice_len == 0)
     725              :         {
     726            0 :             if (slice != (text *) DatumGetPointer(str))
     727            0 :                 pfree(slice);
     728            0 :             return cstring_to_text("");
     729              :         }
     730              : 
     731              :         /*
     732              :          * Now we can get the actual length of the slice in MB characters,
     733              :          * stopping at the end of the substring.  Continuing beyond the
     734              :          * substring end could find an incomplete character attributable
     735              :          * solely to DatumGetTextPSlice() chopping in the middle of a
     736              :          * character, and it would be superfluous work at best.
     737              :          */
     738       396695 :         slice_strlen =
     739       396703 :             (slice_size == -1 ?
     740       396703 :              pg_mbstrlen_with_len(VARDATA_ANY(slice), slice_len) :
     741       396641 :              pg_mbcharcliplen_chars(VARDATA_ANY(slice), slice_len, E - 1));
     742              : 
     743              :         /*
     744              :          * Check that the start position wasn't > slice_strlen. If so, SQL99
     745              :          * says to return a zero-length string.
     746              :          */
     747       396695 :         if (S1 > slice_strlen)
     748              :         {
     749           25 :             if (slice != (text *) DatumGetPointer(str))
     750            4 :                 pfree(slice);
     751           25 :             return cstring_to_text("");
     752              :         }
     753              : 
     754              :         /*
     755              :          * Adjust L1 and E1 now that we know the slice string length. Again
     756              :          * remember that S1 is one based, and slice_start is zero based.
     757              :          */
     758       396670 :         if (L1 > -1)
     759       396626 :             E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
     760              :         else
     761           44 :             E1 = slice_start + 1 + slice_strlen;
     762              : 
     763              :         /*
     764              :          * Find the start position in the slice; remember S1 is not zero based
     765              :          */
     766       396670 :         p = VARDATA_ANY(slice);
     767      4234847 :         for (i = 0; i < S1 - 1; i++)
     768      3838177 :             p += pg_mblen_unbounded(p);
     769              : 
     770              :         /* hang onto a pointer to our start position */
     771       396670 :         s = p;
     772              : 
     773              :         /*
     774              :          * Count the actual bytes used by the substring of the requested
     775              :          * length.
     776              :          */
     777      6237601 :         for (i = S1; i < E1; i++)
     778      5840931 :             p += pg_mblen_unbounded(p);
     779              : 
     780       396670 :         ret = (text *) palloc(VARHDRSZ + (p - s));  /* p - s is the substring's byte length */
     781       396670 :         SET_VARSIZE(ret, VARHDRSZ + (p - s));
     782       396670 :         memcpy(VARDATA(ret), s, (p - s));
     783              : 
     784       396670 :         if (slice != (text *) DatumGetPointer(str))
     785          232 :             pfree(slice);
     786              : 
     787       396670 :         return ret;
     788              :     }
     789              :     else
     790            0 :         elog(ERROR, "invalid backend encoding: encoding max length < 1");
     791              : 
     792              :     /* not reached: suppress compiler warning */
     793              :     return NULL;
     794              : }
     795              : 
     796              : /*
     797              :  * pg_mbcharcliplen_chars -
     798              :  *  Mirror pg_mbcharcliplen(), except return value unit is chars, not bytes.
     799              :  *  Returns the number of characters counted, which is at most limit.
     800              :  *  This mirrors all the dubious historical behavior, so it's static to
     801              :  *  discourage proliferation.  The assertions are specific to the one caller.
     802              :  */
     803              : static int
     804       396641 : pg_mbcharcliplen_chars(const char *mbstr, int len, int limit)
     805              : {
     806       396641 :     int         nch = 0;    /* characters counted so far */
     807              :     int         l;
     808              : 
     809              :     Assert(len > 0);
     810              :     Assert(limit > 0);
     811              :     Assert(pg_database_encoding_max_length() > 1);
     812              : 
     813      8111268 :     while (len > 0 && *mbstr)   /* stop when len is used up or at a zero byte */
     814              :     {
     815      8110933 :         l = pg_mblen_with_len(mbstr, len);
     816      8110925 :         nch++;
     817      8110925 :         if (nch == limit)   /* counted as many characters as the caller wants */
     818       396298 :             break;
     819      7714627 :         len -= l;
     820      7714627 :         mbstr += l;
     821              :     }
     822       396633 :     return nch;
     823              : }
     824              : 
     825              : /*
     826              :  * textoverlay
     827              :  *  Replace specified substring of first string with second
     828              :  *
     829              :  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
     830              :  * This code is a direct implementation of what the standard says.
     831              :  */
     832              : Datum
     833           18 : textoverlay(PG_FUNCTION_ARGS)
     834              : {
     835           18 :     text       *t1 = PG_GETARG_TEXT_PP(0);  /* string to be modified */
     836           18 :     text       *t2 = PG_GETARG_TEXT_PP(1);  /* replacement string */
     837           18 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
     838           18 :     int         sl = PG_GETARG_INT32(3);    /* substring length */
     839              : 
     840           18 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
     841              : }
     842              : 
     843              : Datum
     844           10 : textoverlay_no_len(PG_FUNCTION_ARGS)    /* three-argument form: no explicit substring length */
     845              : {
     846           10 :     text       *t1 = PG_GETARG_TEXT_PP(0);  /* string to be modified */
     847           10 :     text       *t2 = PG_GETARG_TEXT_PP(1);  /* replacement string */
     848           10 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
     849              :     int         sl;
     850              : 
     851           10 :     sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
     852           10 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
     853              : }
     854              : 
     855              : static text *
     856           28 : text_overlay(text *t1, text *t2, int sp, int sl)    /* shared worker for textoverlay() and textoverlay_no_len() */
     857              : {
     858              :     text       *result;
     859              :     text       *s1;         /* part of t1 before position sp */
     860              :     text       *s2;         /* part of t1 from position sp + sl onward */
     861              :     int         sp_pl_sl;   /* sp + sl, computed with an overflow check */
     862              : 
     863              :     /*
     864              :      * Check for possible integer-overflow cases.  For negative sp, throw a
     865              :      * "substring length" error because that's what should be expected
     866              :      * according to the spec's definition of OVERLAY().
     867              :      */
     868           28 :     if (sp <= 0)
     869            0 :         ereport(ERROR,
     870              :                 (errcode(ERRCODE_SUBSTRING_ERROR),
     871              :                  errmsg("negative substring length not allowed")));
     872           28 :     if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
     873            0 :         ereport(ERROR,
     874              :                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
     875              :                  errmsg("integer out of range")));
     876              : 
     877           28 :     s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
     878           28 :     s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
     879           28 :     result = text_catenate(s1, t2);         /* s1 followed by t2 ... */
     880           28 :     result = text_catenate(result, s2);     /* ... followed by s2 */
     881              : 
     882           28 :     return result;
     883              : }
     884              : 
     885              : /*
     886              :  * textpos -
     887              :  *    Return the position of the specified substring.
     888              :  *    Implements the SQL POSITION() function.
     889              :  *    Ref: A Guide To The SQL Standard, Date & Darwen, 1997
     890              :  * - thomas 1997-07-27
     891              :  */
     892              : Datum
     893           87 : textpos(PG_FUNCTION_ARGS)
     894              : {
     895           87 :     text       *str = PG_GETARG_TEXT_PP(0);         /* string to be searched */
     896           87 :     text       *search_str = PG_GETARG_TEXT_PP(1);  /* pattern to look for */
     897              : 
     898           87 :     PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
     899              : }
     900              : 
     901              : /*
     902              :  * text_position -
     903              :  *  Does the real work for textpos()
     904              :  *
     905              :  * Inputs:
     906              :  *      t1 - string to be searched
     907              :  *      t2 - pattern to match within t1
     908              :  *      collid - collation to search under (checked by check_collation_set())
     909              :  * Result:
     910              :  *      1-based character index of the first match, or 0 if no match.
     911              :  *
     912              :  *  This is broken out so it can be called directly by other string processing
     913              :  *  functions.
     914              :  */
     915              : static int
     916           87 : text_position(text *t1, text *t2, Oid collid)
     917              : {
     918              :     TextPositionState state;
     919              :     int         result;
     920              : 
     921           87 :     check_collation_set(collid);
     922              : 
     923              :     /* Empty needle always matches at position 1 */
     924           87 :     if (VARSIZE_ANY_EXHDR(t2) < 1)
     925           10 :         return 1;
     926              : 
     927              :     /* Otherwise, can't match if haystack is shorter than needle */
     928           77 :     if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
     929           13 :         pg_newlocale_from_collation(collid)->deterministic) /* shortcut is taken only for deterministic collations */
     930           13 :         return 0;
     931              : 
     932           64 :     text_position_setup(t1, t2, collid, &state);
     933              :     /* don't need greedy mode here */
     934           64 :     state.greedy = false;
     935              : 
     936           64 :     if (!text_position_next(&state))
     937           14 :         result = 0;
     938              :     else
     939           50 :         result = text_position_get_match_pos(&state);
     940           64 :     text_position_cleanup(&state);
     941           64 :     return result;
     942              : }
     943              : 
     944              : 
     945              : /*
     946              :  * text_position_setup, text_position_next, text_position_cleanup -
     947              :  *  Component steps of text_position()
     948              :  *
     949              :  * These are broken out so that a string can be efficiently searched for
     950              :  * multiple occurrences of the same pattern.  text_position_next may be
     951              :  * called multiple times, and it advances to the next match on each call.
     952              :  * text_position_get_match_ptr() and text_position_get_match_pos() return
     953              :  * a pointer or 1-based character position of the last match, respectively.
     954              :  *
     955              :  * The "state" variable is normally just a local variable in the caller.
     956              :  *
     957              :  * NOTE: text_position_next skips over the matched portion.  For example,
     958              :  * searching for "xx" in "xxx" returns only one match, not two.
     959              :  */
     960              : 
     961              : static void
     962         1180 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
     963              : {
     964         1180 :     int         len1 = VARSIZE_ANY_EXHDR(t1);   /* haystack length */
     965         1180 :     int         len2 = VARSIZE_ANY_EXHDR(t2);   /* needle length */
     966              : 
     967         1180 :     check_collation_set(collid);
     968              : 
     969         1180 :     state->locale = pg_newlocale_from_collation(collid);
     970              : 
     971              :     /*
     972              :      * Most callers need greedy mode, but some might want to unset this to
     973              :      * optimize.
     974              :      */
     975         1180 :     state->greedy = true;
     976              : 
     977              :     Assert(len2 > 0);
     978              : 
     979              :     /*
     980              :      * Even with a multi-byte encoding, we perform the search using the raw
     981              :      * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
     982              :      * because in UTF-8 the byte sequence of one character cannot contain
     983              :      * another character.  For other multi-byte encodings, we do the search
     984              :      * initially as a simple byte search, ignoring multibyte issues, but
     985              :      * verify afterwards that the match we found is at a character boundary,
     986              :      * and continue the search if it was a false match.
     987              :      */
     988         1180 :     if (pg_database_encoding_max_length() == 1)
     989           54 :         state->is_multibyte_char_in_char = false;
     990         1126 :     else if (GetDatabaseEncoding() == PG_UTF8)
     991         1126 :         state->is_multibyte_char_in_char = false;
     992              :     else
     993            0 :         state->is_multibyte_char_in_char = true;
     994              : 
     995         1180 :     state->str1 = VARDATA_ANY(t1);
     996         1180 :     state->str2 = VARDATA_ANY(t2);
     997         1180 :     state->len1 = len1;
     998         1180 :     state->len2 = len2;
     999         1180 :     state->last_match = NULL;       /* no match found yet */
    1000         1180 :     state->refpoint = state->str1;  /* cursor that text_position_next() steps forward one character at a time */
    1001         1180 :     state->refpos = 0;              /* bumped once for each character refpoint steps over */
    1002              : 
    1003              :     /*
    1004              :      * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
    1005              :      * notes we use the terminology that the "haystack" is the string to be
    1006              :      * searched (t1) and the "needle" is the pattern being sought (t2).
    1007              :      *
    1008              :      * If the needle is empty or bigger than the haystack then there is no
    1009              :      * point in wasting cycles initializing the table.  We also choose not to
    1010              :      * use B-M-H for needles of length 1, since the skip table can't possibly
    1011              :      * save anything in that case.
    1012              :      *
    1013              :      * (With nondeterministic collations, the search is already
    1014              :      * multibyte-aware, so we don't need this.)
    1015              :      */
    1016         1180 :     if (len1 >= len2 && len2 > 1 && state->locale->deterministic)  /* otherwise skiptable and skiptablemask are left unset here */
    1017              :     {
    1018          926 :         int         searchlength = len1 - len2;
    1019              :         int         skiptablemask;
    1020              :         int         last;
    1021              :         int         i;
    1022          926 :         const char *str2 = state->str2;
    1023              : 
    1024              :         /*
    1025              :          * First we must determine how much of the skip table to use.  The
    1026              :          * declaration of TextPositionState allows up to 256 elements, but for
    1027              :          * short search problems we don't really want to have to initialize so
    1028              :          * many elements --- it would take too long in comparison to the
    1029              :          * actual search time.  So we choose a useful skip table size based on
    1030              :          * the haystack length minus the needle length.  The closer the needle
    1031              :          * length is to the haystack length the less useful skipping becomes.
    1032              :          *
    1033              :          * Note: since we use bit-masking to select table elements, the skip
    1034              :          * table size MUST be a power of 2, and so the mask must be 2^N-1.
    1035              :          */
    1036          926 :         if (searchlength < 16)
    1037          112 :             skiptablemask = 3;
    1038          814 :         else if (searchlength < 64)
    1039           24 :             skiptablemask = 7;
    1040          790 :         else if (searchlength < 128)
    1041           15 :             skiptablemask = 15;
    1042          775 :         else if (searchlength < 512)
    1043          187 :             skiptablemask = 31;
    1044          588 :         else if (searchlength < 2048)
    1045          434 :             skiptablemask = 63;
    1046          154 :         else if (searchlength < 4096)
    1047          112 :             skiptablemask = 127;
    1048              :         else
    1049           42 :             skiptablemask = 255;
    1050          926 :         state->skiptablemask = skiptablemask;
    1051              : 
    1052              :         /*
    1053              :          * Initialize the skip table.  We set all elements to the needle
    1054              :          * length, since this is the correct skip distance for any character
    1055              :          * not found in the needle.
    1056              :          */
    1057        60654 :         for (i = 0; i <= skiptablemask; i++)
    1058        59728 :             state->skiptable[i] = len2;
    1059              : 
    1060              :         /*
    1061              :          * Now examine the needle.  For each character except the last one,
    1062              :          * set the corresponding table element to the appropriate skip
    1063              :          * distance.  Note that when two characters share the same skip table
    1064              :          * entry, the one later in the needle must determine the skip
    1065              :          * distance.
    1066              :          */
    1067          926 :         last = len2 - 1;
    1068              : 
    1069        11191 :         for (i = 0; i < last; i++)
    1070        10265 :             state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
    1071              :     }
    1072         1180 : }
    1073              : 
    1074              : /*
    1075              :  * Advance to the next match, starting from the end of the previous match
    1076              :  * (or the beginning of the string, on first call).  Returns true if a match
    1077              :  * is found.
    1078              :  *
    1079              :  * Note that this refuses to match an empty-string needle.  Most callers
    1080              :  * will have handled that case specially and we'll never see it here.
    1081              :  */
    1082              : static bool
    1083         5562 : text_position_next(TextPositionState *state)
    1084              : {
    1085         5562 :     int         needle_len = state->len2;
    1086              :     char       *start_ptr;  /* where the next byte search begins */
    1087              :     char       *matchptr;   /* candidate match returned by the byte search */
    1088              : 
    1089         5562 :     if (needle_len <= 0)
    1090            0 :         return false;           /* result for empty pattern */
    1091              : 
    1092              :     /* Start from the point right after the previous match. */
    1093         5562 :     if (state->last_match)
    1094         4368 :         start_ptr = state->last_match + state->last_match_len;
    1095              :     else
    1096         1194 :         start_ptr = state->str1;
    1097              : 
    1098         5562 : retry:                          /* re-entered after a false match inside a multibyte character */
    1099         5562 :     matchptr = text_position_next_internal(start_ptr, state);
    1100              : 
    1101         5562 :     if (!matchptr)
    1102         1116 :         return false;
    1103              : 
    1104              :     /*
    1105              :      * Found a match for the byte sequence.  If this is a multibyte encoding,
    1106              :      * where one character's byte sequence can appear inside a longer
    1107              :      * multi-byte character, we need to verify that the match was at a
    1108              :      * character boundary, not in the middle of a multi-byte character.
    1109              :      */
    1110         4446 :     if (state->is_multibyte_char_in_char && state->locale->deterministic)
    1111              :     {
    1112            0 :         const char *haystack_end = state->str1 + state->len1;
    1113              : 
    1114              :         /* Walk one character at a time, until we reach the match. */
    1115              : 
    1116              :         /* the search should never move backwards. */
    1117              :         Assert(state->refpoint <= matchptr);
    1118              : 
    1119            0 :         while (state->refpoint < matchptr)
    1120              :         {
    1121              :             /* step to next character. */
    1122            0 :             state->refpoint += pg_mblen_range(state->refpoint, haystack_end);
    1123            0 :             state->refpos++;
    1124              : 
    1125              :             /*
    1126              :              * If we stepped over the match's start position, then it was a
    1127              :              * false positive, where the byte sequence appeared in the middle
    1128              :              * of a multi-byte character.  Skip it, and continue the search at
    1129              :              * the next character boundary.
    1130              :              */
    1131            0 :             if (state->refpoint > matchptr)
    1132              :             {
    1133            0 :                 start_ptr = state->refpoint;
    1134            0 :                 goto retry;
    1135              :             }
    1136              :         }
    1137              :     }
    1138              : 
    1139         4446 :     state->last_match = matchptr;
    1140         4446 :     state->last_match_len = state->last_match_len_tmp;  /* length recorded by text_position_next_internal() */
    1141         4446 :     return true;
    1142              : }
    1143              : 
    1144              : /*
    1145              :  * Subroutine of text_position_next().  This searches for the raw byte
    1146              :  * sequence, ignoring any multi-byte encoding issues.  Returns the first
    1147              :  * match starting at 'start_ptr', or NULL if no match is found.
                         *
                         * One of three search strategies is used, chosen from the state:
                         *  - nondeterministic collation: collation-aware scan via pg_strncoll();
                         *  - one-byte needle: plain linear byte scan;
                         *  - otherwise: backward comparison driven by state->skiptable.
                         *
                         * Side effect: state->last_match_len_tmp is set to the byte length of
                         * the match.  That is the needle length, except that the
                         * nondeterministic path may record a different length.
                         *
                         * The caller must make sure that start_ptr lies within the haystack
                         * (one past the end is allowed) and that the needle is nonempty; both
                         * conditions are only checked by assertions.
    1148              :  */
    1149              : static char *
    1150         5562 : text_position_next_internal(char *start_ptr, TextPositionState *state)
    1151              : {
    1152         5562 :     int         haystack_len = state->len1;
    1153         5562 :     int         needle_len = state->len2;
    1154         5562 :     int         skiptablemask = state->skiptablemask;
    1155         5562 :     const char *haystack = state->str1;
    1156         5562 :     const char *needle = state->str2;
    1157         5562 :     const char *haystack_end = &haystack[haystack_len];
    1158              :     const char *hptr;
    1159              : 
    1160              :     Assert(start_ptr >= haystack && start_ptr <= haystack_end);
    1161              :     Assert(needle_len > 0);
    1162              : 
                            /* Default match length; the nondeterministic path may override it. */
    1163         5562 :     state->last_match_len_tmp = needle_len;
    1164              : 
    1165         5562 :     if (!state->locale->deterministic)
    1166              :     {
    1167              :         /*
    1168              :          * With a nondeterministic collation, we have to use an unoptimized
    1169              :          * route.  We walk through the haystack and see if at each position
    1170              :          * there is a substring of the remaining string that is equal to the
    1171              :          * needle under the given collation.
    1172              :          *
    1173              :          * Note, the found substring could have a different length than the
    1174              :          * needle.  Callers that want to skip over the found string need to
    1175              :          * read the length of the found substring from last_match_len rather
    1176              :          * than just using the length of their needle.
    1177              :          *
    1178              :          * Most callers will require "greedy" semantics, meaning that we need
    1179              :          * to find the longest such substring, not the shortest.  For callers
    1180              :          * that don't need greedy semantics, we can finish on the first match.
    1181              :          *
    1182              :          * This loop depends on the assumption that the needle is nonempty and
    1183              :          * any matching substring must also be nonempty.  (Even if the
    1184              :          * collation would accept an empty match, returning one would send
    1185              :          * callers that search for successive matches into an infinite loop.)
    1186              :          */
    1187          176 :         const char *result_hptr = NULL;
    1188              : 
    1189          176 :         hptr = start_ptr;
    1190          482 :         while (hptr < haystack_end)
    1191              :         {
    1192              :             const char *test_end;
    1193              : 
    1194              :             /*
    1195              :              * First check the common case that there is a match in the
    1196              :              * haystack of exactly the length of the needle.
    1197              :              */
    1198          403 :             if (!state->greedy &&
    1199           72 :                 haystack_end - hptr >= needle_len &&
    1200           36 :                 pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
    1201            8 :                 return (char *) hptr;
    1202              : 
    1203              :             /*
    1204              :              * Else check if any of the non-empty substrings starting at hptr
    1205              :              * compare equal to the needle.
                                 *
                                 * test_end grows one character at a time, so in greedy mode the
                                 * last match recorded for this hptr is the longest one.
    1206              :              */
    1207          395 :             test_end = hptr;
    1208              :             do
    1209              :             {
    1210         1583 :                 test_end += pg_mblen_range(test_end, haystack_end);
    1211         1583 :                 if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
    1212              :                 {
    1213           97 :                     state->last_match_len_tmp = (test_end - hptr);
    1214           97 :                     result_hptr = hptr;
    1215           97 :                     if (!state->greedy)
    1216            0 :                         break;
    1217              :                 }
    1218         1583 :             } while (test_end < haystack_end);
    1219              : 
                                    /* A match starting at hptr was recorded; stop scanning. */
    1220          395 :             if (result_hptr)
    1221           89 :                 break;
    1222              : 
                                    /* No match starts here; try the next character boundary. */
    1223          306 :             hptr += pg_mblen_range(hptr, haystack_end);
    1224              :         }
    1225              : 
    1226          168 :         return (char *) result_hptr;
    1227              :     }
    1228         5386 :     else if (needle_len == 1)
    1229              :     {
    1230              :         /* No point in using B-M-H for a one-character needle */
    1231          597 :         char        nchar = *needle;
    1232              : 
    1233          597 :         hptr = start_ptr;
    1234         4246 :         while (hptr < haystack_end)
    1235              :         {
    1236         4108 :             if (*hptr == nchar)
    1237          459 :                 return (char *) hptr;
    1238         3649 :             hptr++;
    1239              :         }
    1240              :     }
    1241              :     else
    1242              :     {
    1243         4789 :         const char *needle_last = &needle[needle_len - 1];
    1244              : 
    1245              :         /* Point hptr at the last byte of the first candidate alignment */
    1246         4789 :         hptr = start_ptr + needle_len - 1;
    1247       115895 :         while (hptr < haystack_end)
    1248              :         {
    1249              :             /* Match the needle scanning *backward* */
    1250              :             const char *nptr;
    1251              :             const char *p;
    1252              : 
    1253       114996 :             nptr = needle_last;
    1254       114996 :             p = hptr;
    1255       172354 :             while (*nptr == *p)
    1256              :             {
    1257              :                 /* Matched it all?  If so, return pointer to start of match */
    1258        61248 :                 if (nptr == needle)
    1259         3890 :                     return (char *) p;
    1260        57358 :                 nptr--, p--;
    1261              :             }
    1262              : 
    1263              :             /*
    1264              :              * No match, so use the haystack char at hptr to decide how far to
    1265              :              * advance.  If the needle had any occurrence of that character
    1266              :              * (or more precisely, one sharing the same skiptable entry)
    1267              :              * before its last character, then we advance far enough to align
    1268              :              * the last such needle character with that haystack position.
    1269              :              * Otherwise we can advance by the whole needle length.
    1270              :              */
    1271       111106 :             hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
    1272              :         }
    1273              :     }
    1274              : 
    1275         1037 :     return NULL;                /* not found */
    1276              : }
    1277              : 
    1278              : /*
    1279              :  * Return a pointer to the current match.
    1280              :  *
    1281              :  * The returned pointer points into the original haystack string.
                         *
                         * This simply hands back state->last_match, without modifying the
                         * state.  The value is NULL right after text_position_reset().
    1282              :  */
    1283              : static char *
    1284         4359 : text_position_get_match_ptr(TextPositionState *state)
    1285              : {
    1286         4359 :     return state->last_match;
    1287              : }
    1288              : 
    1289              : /*
    1290              :  * Return the offset of the current match.
    1291              :  *
    1292              :  * The offset is in characters, 1-based.
                         *
                         * Note that this is not a pure accessor.  It counts only the characters
                         * lying between the cached reference point (state->refpoint) and the
                         * match, adds that count to state->refpos, and then moves the reference
                         * point up to the match, so that the same stretch of text is not
                         * counted again by a later call.
    1293              :  */
    1294              : static int
    1295           50 : text_position_get_match_pos(TextPositionState *state)
    1296              : {
    1297              :     /* Convert the byte position to char position. */
    1298          100 :     state->refpos += pg_mbstrlen_with_len(state->refpoint,
    1299           50 :                                           state->last_match - state->refpoint);
    1300           50 :     state->refpoint = state->last_match;
                            /* refpos counts characters preceding refpoint; report it 1-based */
    1301           50 :     return state->refpos + 1;
    1302              : }
    1303              : 
    1304              : /*
    1305              :  * Reset search state to the initial state installed by text_position_setup.
    1306              :  *
    1307              :  * The next call to text_position_next will search from the beginning
    1308              :  * of the string.
                         *
                         * Only last_match, refpoint and refpos are rewound here; no other field
                         * of the state is touched.
    1309              :  */
    1310              : static void
    1311           14 : text_position_reset(TextPositionState *state)
    1312              : {
    1313           14 :     state->last_match = NULL;
    1314           14 :     state->refpoint = state->str1;
    1315           14 :     state->refpos = 0;
    1316           14 : }
    1317              : 
                        /*
                         * Counterpart of the setup step for a text-position search state.
                         *
                         * Nothing needs to be released at present, so the body is intentionally
                         * empty.
                         */
    1318              : static void
    1319         1180 : text_position_cleanup(TextPositionState *state)
    1320              : {
    1321              :     /* no cleanup needed */
    1322         1180 : }
    1323              : 
    1324              : 
                        /*
                         * Verify that a usable collation was supplied.
                         *
                         * Does nothing when OidIsValid(collid) holds.  Otherwise it reports
                         * ERRCODE_INDETERMINATE_COLLATION at ERROR level, with a hint to use an
                         * explicit COLLATE clause.
                         */
    1325              : static void
    1326     11803340 : check_collation_set(Oid collid)
    1327              : {
    1328     11803340 :     if (!OidIsValid(collid))
    1329              :     {
    1330              :         /*
    1331              :          * This typically means that the parser could not resolve a conflict
    1332              :          * of implicit collations, so report it that way.
    1333              :          */
    1334           20 :         ereport(ERROR,
    1335              :                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
    1336              :                  errmsg("could not determine which collation to use for string comparison"),
    1337              :                  errhint("Use the COLLATE clause to set the collation explicitly.")));
    1338              :     }
    1339     11803320 : }
    1340              : 
    1341              : /*
    1342              :  * varstr_cmp()
    1343              :  *
    1344              :  * Comparison function for text strings with given lengths, using the
    1345              :  * appropriate locale. Returns an integer less than, equal to, or greater than
    1346              :  * zero, indicating whether arg1 is less than, equal to, or greater than arg2.
    1347              :  *
                         * The strings are described by explicit byte lengths (len1, len2); those
                         * lengths are what is handed to memcmp() and pg_strncoll().  The
                         * collation is validated first by check_collation_set(), which raises an
                         * error for an invalid collid.
                         *
    1348              :  * Note: many functions that depend on this are marked leakproof; therefore,
    1349              :  * avoid reporting the actual contents of the input when throwing errors.
    1350              :  * All errors herein should be things that can't happen except on corrupt
    1351              :  * data, anyway; otherwise we will have trouble with indexing strings that
    1352              :  * would cause them.
    1353              :  */
    1354              : int
    1355      6004988 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
    1356              : {
    1357              :     int         result;
    1358              :     pg_locale_t mylocale;
    1359              : 
    1360      6004988 :     check_collation_set(collid);
    1361              : 
    1362      6004976 :     mylocale = pg_newlocale_from_collation(collid);
    1363              : 
    1364      6004976 :     if (mylocale->collate_is_c)
    1365              :     {
                                /*
                                 * Plain byte-wise ordering.  If one string is a prefix of the
                                 * other, the shorter one sorts first.
                                 */
    1366      2194839 :         result = memcmp(arg1, arg2, Min(len1, len2));
    1367      2194839 :         if ((result == 0) && (len1 != len2))
    1368        87019 :             result = (len1 < len2) ? -1 : 1;
    1369              :     }
    1370              :     else
    1371              :     {
    1372              :         /*
    1373              :          * memcmp() can't tell us which of two unequal strings sorts first,
    1374              :          * but it's a cheap way to tell if they're equal.  Testing shows that
    1375              :          * memcmp() followed by strcoll() is only trivially slower than
    1376              :          * strcoll() by itself, so we don't lose much if this doesn't work out
    1377              :          * very often, and if it does - for example, because there are many
    1378              :          * equal strings in the input - then we win big by avoiding expensive
    1379              :          * collation-aware comparisons.
    1380              :          */
    1381      3810137 :         if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
    1382      1059673 :             return 0;
    1383              : 
    1384      2750464 :         result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
    1385              : 
                                /*
                                 * For a deterministic collation, strings that the collation
                                 * considers equal but that differ bytewise are ordered by their
                                 * bytes, using the same rule as in the C-collation branch.  For a
                                 * nondeterministic collation the zero result is kept as is.
                                 */
    1386              :         /* Break tie if necessary. */
    1387      2750464 :         if (result == 0 && mylocale->deterministic)
    1388              :         {
    1389            0 :             result = memcmp(arg1, arg2, Min(len1, len2));
    1390            0 :             if ((result == 0) && (len1 != len2))
    1391            0 :                 result = (len1 < len2) ? -1 : 1;
    1392              :         }
    1393              :     }
    1394              : 
    1395      4945303 :     return result;
    1396              : }
    1397              : 
    1398              : /*
    1399              :  * text_cmp()
    1400              :  * Internal comparison function for text strings.
    1401              :  * Returns a negative, zero, or positive value, exactly as varstr_cmp()
                         * does; this function does not normalize the result to -1/0/1.
                         *
                         * The payload pointers and lengths are taken with VARDATA_ANY() and
                         * VARSIZE_ANY_EXHDR().  Nothing is freed here; that is left to the
                         * caller.
    1402              :  */
    1403              : static int
    1404      4742825 : text_cmp(text *arg1, text *arg2, Oid collid)
    1405              : {
    1406              :     char       *a1p,
    1407              :                *a2p;
    1408              :     int         len1,
    1409              :                 len2;
    1410              : 
    1411      4742825 :     a1p = VARDATA_ANY(arg1);
    1412      4742825 :     a2p = VARDATA_ANY(arg2);
    1413              : 
    1414      4742825 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1415      4742825 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1416              : 
    1417      4742825 :     return varstr_cmp(a1p, len1, a2p, len2, collid);
    1418              : }
    1419              : 
    1420              : /*
    1421              :  * Comparison functions for text strings.
    1422              :  *
    1423              :  * Note: btree indexes need these routines not to leak memory; therefore,
    1424              :  * be careful to free working copies of toasted datums.  Most places don't
    1425              :  * need to be so careful.
    1426              :  */
    1427              : 
                        /*
                         * texteq
                         *      fmgr-callable equality test for two text values.
                         *
                         * With a deterministic collation, equality is decided bytewise: values
                         * whose raw sizes differ are unequal without further inspection,
                         * otherwise the payloads are compared with memcmp().  With a
                         * nondeterministic collation the answer is text_cmp() == 0.
                         *
                         * Raises an error (through check_collation_set) if no valid collation
                         * was supplied with the call.
                         */
    1428              : Datum
    1429      5320211 : texteq(PG_FUNCTION_ARGS)
    1430              : {
    1431      5320211 :     Oid         collid = PG_GET_COLLATION();
    1432      5320211 :     pg_locale_t mylocale = 0;
    1433              :     bool        result;
    1434              : 
    1435      5320211 :     check_collation_set(collid);
    1436              : 
    1437      5320211 :     mylocale = pg_newlocale_from_collation(collid);
    1438              : 
    1439      5320211 :     if (mylocale->deterministic)
    1440              :     {
    1441      5313961 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1442      5313961 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1443              :         Size        len1,
    1444              :                     len2;
    1445              : 
    1446              :         /*
    1447              :          * Since we only care about equality or not-equality, we can avoid all
    1448              :          * the expense of strcoll() here, and just do bitwise comparison.  In
    1449              :          * fact, we don't even have to do a bitwise comparison if we can show
    1450              :          * the lengths of the strings are unequal; which might save us from
    1451              :          * having to detoast one or both values.
    1452              :          */
    1453      5313961 :         len1 = toast_raw_datum_size(arg1);
    1454      5313961 :         len2 = toast_raw_datum_size(arg2);
    1455      5313961 :         if (len1 != len2)
    1456      2699488 :             result = false;
    1457              :         else
    1458              :         {
    1459      2614473 :             text       *targ1 = DatumGetTextPP(arg1);
    1460      2614473 :             text       *targ2 = DatumGetTextPP(arg2);
    1461              : 
                                    /*
                                     * len1 == len2 here.  Subtracting VARHDRSZ gives the number of
                                     * payload bytes to compare (this presumes the raw size counts a
                                     * VARHDRSZ-byte header -- TODO confirm).
                                     */
    1462      2614473 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1463              :                              len1 - VARHDRSZ) == 0);
    1464              : 
    1465      2614473 :             PG_FREE_IF_COPY(targ1, 0);
    1466      2614473 :             PG_FREE_IF_COPY(targ2, 1);
    1467              :         }
    1468              :     }
    1469              :     else
    1470              :     {
                                /* Nondeterministic collation: need a collation-aware comparison */
    1471         6250 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1472         6250 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1473              : 
    1474         6250 :         result = (text_cmp(arg1, arg2, collid) == 0);
    1475              : 
    1476         6250 :         PG_FREE_IF_COPY(arg1, 0);
    1477         6250 :         PG_FREE_IF_COPY(arg2, 1);
    1478              :     }
    1479              : 
    1480      5320211 :     PG_RETURN_BOOL(result);
    1481              : }
    1482              : 
                        /*
                         * textne
                         *      fmgr-callable inequality test for two text values.
                         *
                         * Mirror image of texteq(): with a deterministic collation, differing
                         * raw sizes mean "not equal" immediately, otherwise the payloads are
                         * compared with memcmp(); with a nondeterministic collation the answer
                         * is text_cmp() != 0.
                         *
                         * Raises an error (through check_collation_set) if no valid collation
                         * was supplied with the call.
                         */
    1483              : Datum
    1484       207846 : textne(PG_FUNCTION_ARGS)
    1485              : {
    1486       207846 :     Oid         collid = PG_GET_COLLATION();
    1487              :     pg_locale_t mylocale;
    1488              :     bool        result;
    1489              : 
    1490       207846 :     check_collation_set(collid);
    1491              : 
    1492       207846 :     mylocale = pg_newlocale_from_collation(collid);
    1493              : 
    1494       207846 :     if (mylocale->deterministic)
    1495              :     {
    1496       207830 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1497       207830 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1498              :         Size        len1,
    1499              :                     len2;
    1500              : 
    1501              :         /* See comment in texteq() */
    1502       207830 :         len1 = toast_raw_datum_size(arg1);
    1503       207830 :         len2 = toast_raw_datum_size(arg2);
    1504       207830 :         if (len1 != len2)
    1505        11743 :             result = true;
    1506              :         else
    1507              :         {
    1508       196087 :             text       *targ1 = DatumGetTextPP(arg1);
    1509       196087 :             text       *targ2 = DatumGetTextPP(arg2);
    1510              : 
                                    /* Equal raw sizes: compare the payload bytes */
    1511       196087 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1512              :                              len1 - VARHDRSZ) != 0);
    1513              : 
    1514       196087 :             PG_FREE_IF_COPY(targ1, 0);
    1515       196087 :             PG_FREE_IF_COPY(targ2, 1);
    1516              :         }
    1517              :     }
    1518              :     else
    1519              :     {
                                /* Nondeterministic collation: need a collation-aware comparison */
    1520           16 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1521           16 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1522              : 
    1523           16 :         result = (text_cmp(arg1, arg2, collid) != 0);
    1524              : 
    1525           16 :         PG_FREE_IF_COPY(arg1, 0);
    1526           16 :         PG_FREE_IF_COPY(arg2, 1);
    1527              :     }
    1528              : 
    1529       207846 :     PG_RETURN_BOOL(result);
    1530              : }
    1531              : 
                        /*
                         * text_lt
                         *      fmgr-callable "less than" test for two text values.
                         *
                         * Returns true when text_cmp() reports a negative result under the
                         * collation of the call.  Argument copies are freed before returning.
                         */
    1532              : Datum
    1533       231689 : text_lt(PG_FUNCTION_ARGS)
    1534              : {
    1535       231689 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1536       231689 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1537              :     bool        result;
    1538              : 
    1539       231689 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
    1540              : 
    1541       231677 :     PG_FREE_IF_COPY(arg1, 0);
    1542       231677 :     PG_FREE_IF_COPY(arg2, 1);
    1543              : 
    1544       231677 :     PG_RETURN_BOOL(result);
    1545              : }
    1546              : 
                        /*
                         * text_le
                         *      fmgr-callable "less than or equal" test for two text values.
                         *
                         * Returns true when text_cmp() reports a result <= 0 under the
                         * collation of the call.  Argument copies are freed before returning.
                         */
    1547              : Datum
    1548       213625 : text_le(PG_FUNCTION_ARGS)
    1549              : {
    1550       213625 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1551       213625 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1552              :     bool        result;
    1553              : 
    1554       213625 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
    1555              : 
    1556       213625 :     PG_FREE_IF_COPY(arg1, 0);
    1557       213625 :     PG_FREE_IF_COPY(arg2, 1);
    1558              : 
    1559       213625 :     PG_RETURN_BOOL(result);
    1560              : }
    1561              : 
                        /*
                         * text_gt
                         *      fmgr-callable "greater than" test for two text values.
                         *
                         * Returns true when text_cmp() reports a positive result under the
                         * collation of the call.  Argument copies are freed before returning.
                         */
    1562              : Datum
    1563       216151 : text_gt(PG_FUNCTION_ARGS)
    1564              : {
    1565       216151 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1566       216151 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1567              :     bool        result;
    1568              : 
    1569       216151 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
    1570              : 
    1571       216151 :     PG_FREE_IF_COPY(arg1, 0);
    1572       216151 :     PG_FREE_IF_COPY(arg2, 1);
    1573              : 
    1574       216151 :     PG_RETURN_BOOL(result);
    1575              : }
    1576              : 
                        /*
                         * text_ge
                         *      fmgr-callable "greater than or equal" test for two text values.
                         *
                         * Returns true when text_cmp() reports a result >= 0 under the
                         * collation of the call.  Argument copies are freed before returning.
                         */
    1577              : Datum
    1578       116417 : text_ge(PG_FUNCTION_ARGS)
    1579              : {
    1580       116417 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1581       116417 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1582              :     bool        result;
    1583              : 
    1584       116417 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
    1585              : 
    1586       116417 :     PG_FREE_IF_COPY(arg1, 0);
    1587       116417 :     PG_FREE_IF_COPY(arg2, 1);
    1588              : 
    1589       116417 :     PG_RETURN_BOOL(result);
    1590              : }
    1591              : 
                        /*
                         * text_starts_with
                         *      fmgr-callable test: does the first argument begin with the second?
                         *
                         * Only deterministic collations are handled; for any other collation an
                         * ERRCODE_FEATURE_NOT_SUPPORTED error is raised.  An invalid collation
                         * is rejected by check_collation_set().
                         *
                         * The test is a bytewise memcmp() of the second argument's payload
                         * against a leading substring of the first argument, which is obtained
                         * with text_substring().
                         */
    1592              : Datum
    1593        25276 : text_starts_with(PG_FUNCTION_ARGS)
    1594              : {
    1595        25276 :     Datum       arg1 = PG_GETARG_DATUM(0);
    1596        25276 :     Datum       arg2 = PG_GETARG_DATUM(1);
    1597        25276 :     Oid         collid = PG_GET_COLLATION();
    1598              :     pg_locale_t mylocale;
    1599              :     bool        result;
    1600              :     Size        len1,
    1601              :                 len2;
    1602              : 
    1603        25276 :     check_collation_set(collid);
    1604              : 
    1605        25276 :     mylocale = pg_newlocale_from_collation(collid);
    1606              : 
    1607        25276 :     if (!mylocale->deterministic)
    1608            0 :         ereport(ERROR,
    1609              :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1610              :                  errmsg("nondeterministic collations are not supported for substring searches")));
    1611              : 
                            /* A prefix cannot have a larger raw size than the whole string */
    1612        25276 :     len1 = toast_raw_datum_size(arg1);
    1613        25276 :     len2 = toast_raw_datum_size(arg2);
    1614        25276 :     if (len2 > len1)
    1615            0 :         result = false;
    1616              :     else
    1617              :     {
                                /*
                                 * NOTE(review): len2 is a raw datum size, used here as the length
                                 * argument of text_substring().  Presumably that requests at least
                                 * as many bytes as the memcmp() below reads -- confirm against
                                 * text_substring().
                                 */
    1618        25276 :         text       *targ1 = text_substring(arg1, 1, len2, false);
    1619        25276 :         text       *targ2 = DatumGetTextPP(arg2);
    1620              : 
                                /* Compare exactly as many bytes as the candidate prefix holds */
    1621        25276 :         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1622              :                          VARSIZE_ANY_EXHDR(targ2)) == 0);
    1623              : 
    1624        25276 :         PG_FREE_IF_COPY(targ1, 0);
    1625        25276 :         PG_FREE_IF_COPY(targ2, 1);
    1626              :     }
    1627              : 
    1628        25276 :     PG_RETURN_BOOL(result);
    1629              : }
    1630              : 
                        /*
                         * bttextcmp
                         *      fmgr-callable three-way comparison of two text values.
                         *
                         * Returns the result of text_cmp() under the collation of the call, as
                         * an int32: negative, zero, or positive.  Argument copies are freed
                         * before returning.
                         */
    1631              : Datum
    1632      3762230 : bttextcmp(PG_FUNCTION_ARGS)
    1633              : {
    1634      3762230 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1635      3762230 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1636              :     int32       result;
    1637              : 
    1638      3762230 :     result = text_cmp(arg1, arg2, PG_GET_COLLATION());
    1639              : 
    1640      3762230 :     PG_FREE_IF_COPY(arg1, 0);
    1641      3762230 :     PG_FREE_IF_COPY(arg2, 1);
    1642              : 
    1643      3762230 :     PG_RETURN_INT32(result);
    1644              : }
    1645              : 
                        /*
                         * bttextsortsupport
                         *      Set up the SortSupport struct passed as argument 0 for type text.
                         *
                         * All the work is delegated to varstr_sortsupport(), called with TEXTOID
                         * and the collation found in ssup->ssup_collation.  Returns void.
                         */
    1646              : Datum
    1647        51031 : bttextsortsupport(PG_FUNCTION_ARGS)
    1648              : {
    1649        51031 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    1650        51031 :     Oid         collid = ssup->ssup_collation;
    1651              :     MemoryContext oldcontext;
    1652              : 
                            /*
                             * Run the setup with ssup->ssup_cxt as the current memory context,
                             * presumably so that whatever it allocates lives in that context;
                             * the previous context is restored afterwards.
                             */
    1653        51031 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    1654              : 
    1655              :     /* Use generic string SortSupport */
    1656        51031 :     varstr_sortsupport(ssup, TEXTOID, collid);
    1657              : 
    1658        51023 :     MemoryContextSwitchTo(oldcontext);
    1659              : 
    1660        51023 :     PG_RETURN_VOID();
    1661              : }
    1662              : 
    1663              : /*
    1664              :  * Generic sortsupport interface for character type's operator classes.
    1665              :  * Includes locale support, and support for BpChar semantics (i.e. removing
    1666              :  * trailing spaces before comparison).
    1667              :  *
    1668              :  * Relies on the assumption that text, VarChar, and BpChar all have the
    1669              :  * same representation.
    1670              :  */
    1671              : void
    1672        87236 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
    1673              : {
    1674        87236 :     bool        abbreviate = ssup->abbreviate;
    1675        87236 :     bool        collate_c = false;
    1676              :     VarStringSortSupport *sss;
    1677              :     pg_locale_t locale;
    1678              : 
    1679        87236 :     check_collation_set(collid);
    1680              : 
    1681        87228 :     locale = pg_newlocale_from_collation(collid);
    1682              : 
    1683              :     /*
    1684              :      * If possible, set ssup->comparator to a function which can be used to
    1685              :      * directly compare two datums.  If we can do this, we'll avoid the
    1686              :      * overhead of a trip through the fmgr layer for every comparison, which
    1687              :      * can be substantial.
    1688              :      *
    1689              :      * Most typically, we'll set the comparator to varlenafastcmp_locale,
    1690              :      * which uses strcoll() to perform comparisons.  We use that for the
    1691              :      * BpChar case too, but type NAME uses namefastcmp_locale. However, if
    1692              :      * LC_COLLATE = C, we can make things quite a bit faster with
    1693              :      * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
    1694              :      * memcmp() rather than strcoll().
    1695              :      */
    1696        87228 :     if (locale->collate_is_c)
    1697              :     {
    1698        59718 :         if (typid == BPCHAROID)
    1699          157 :             ssup->comparator = bpcharfastcmp_c;
    1700        59561 :         else if (typid == NAMEOID)
    1701              :         {
    1702        35527 :             ssup->comparator = namefastcmp_c;
    1703              :             /* Not supporting abbreviation with type NAME, for now */
    1704        35527 :             abbreviate = false;
    1705              :         }
    1706              :         else
    1707        24034 :             ssup->comparator = varstrfastcmp_c;
    1708              : 
    1709        59718 :         collate_c = true;
    1710              :     }
    1711              :     else
    1712              :     {
    1713              :         /*
    1714              :          * We use varlenafastcmp_locale except for type NAME.
    1715              :          */
    1716        27510 :         if (typid == NAMEOID)
    1717              :         {
    1718            0 :             ssup->comparator = namefastcmp_locale;
    1719              :             /* Not supporting abbreviation with type NAME, for now */
    1720            0 :             abbreviate = false;
    1721              :         }
    1722              :         else
    1723        27510 :             ssup->comparator = varlenafastcmp_locale;
    1724              : 
    1725              :         /*
    1726              :          * Unfortunately, it seems that abbreviation for non-C collations is
    1727              :          * broken on many common platforms; see pg_strxfrm_enabled().
    1728              :          *
    1729              :          * Even apart from the risk of broken locales, it's possible that
    1730              :          * there are platforms where the use of abbreviated keys should be
    1731              :          * disabled at compile time.  For example, macOS's strxfrm()
    1732              :          * implementation is known to not effectively concentrate a
    1733              :          * significant amount of entropy from the original string in earlier
    1734              :          * transformed blobs.  It's possible that other supported platforms
    1735              :          * are similarly encumbered.  So, if we ever get past disabling this
    1736              :          * categorically, we may still want or need to disable it for
    1737              :          * particular platforms.
    1738              :          */
    1739        27510 :         if (!pg_strxfrm_enabled(locale))
    1740        26896 :             abbreviate = false;
    1741              :     }
    1742              : 
    1743              :     /*
    1744              :      * If we're using abbreviated keys, or if we're using a locale-aware
    1745              :      * comparison, we need to initialize a VarStringSortSupport object. Both
    1746              :      * cases will make use of the temporary buffers we initialize here for
    1747              :      * scratch space (and to detect requirement for BpChar semantics from
    1748              :      * caller), and the abbreviation case requires additional state.
    1749              :      */
    1750        87228 :     if (abbreviate || !collate_c)
    1751              :     {
    1752        40494 :         sss = palloc_object(VarStringSortSupport);
    1753        40494 :         sss->buf1 = palloc(TEXTBUFLEN);
    1754        40494 :         sss->buflen1 = TEXTBUFLEN;
    1755        40494 :         sss->buf2 = palloc(TEXTBUFLEN);
    1756        40494 :         sss->buflen2 = TEXTBUFLEN;
    1757              :         /* Start with invalid values */
    1758        40494 :         sss->last_len1 = -1;
    1759        40494 :         sss->last_len2 = -1;
    1760              :         /* Initialize */
    1761        40494 :         sss->last_returned = 0;
    1762        40494 :         if (collate_c)
    1763        12984 :             sss->locale = NULL;
    1764              :         else
    1765        27510 :             sss->locale = locale;
    1766              : 
    1767              :         /*
    1768              :          * To avoid somehow confusing a strxfrm() blob and an original string,
    1769              :          * constantly keep track of the variety of data that buf1 and buf2
    1770              :          * currently contain.
    1771              :          *
    1772              :          * Comparisons may be interleaved with conversion calls.  Frequently,
    1773              :          * conversions and comparisons are batched into two distinct phases,
    1774              :          * but the correctness of caching cannot hinge upon this.  For
    1775              :          * comparison caching, buffer state is only trusted if cache_blob is
    1776              :          * found set to false, whereas strxfrm() caching only trusts the state
    1777              :          * when cache_blob is found set to true.
    1778              :          *
    1779              :          * Arbitrarily initialize cache_blob to true.
    1780              :          */
    1781        40494 :         sss->cache_blob = true;
    1782        40494 :         sss->collate_c = collate_c;
    1783        40494 :         sss->typid = typid;
    1784        40494 :         ssup->ssup_extra = sss;
    1785              : 
    1786              :         /*
    1787              :          * If possible, plan to use the abbreviated keys optimization.  The
    1788              :          * core code may switch back to authoritative comparator should
    1789              :          * abbreviation be aborted.
    1790              :          */
    1791        40494 :         if (abbreviate)
    1792              :         {
    1793        13438 :             sss->prop_card = 0.20;
    1794        13438 :             initHyperLogLog(&sss->abbr_card, 10);
    1795        13438 :             initHyperLogLog(&sss->full_card, 10);
    1796        13438 :             ssup->abbrev_full_comparator = ssup->comparator;
    1797        13438 :             ssup->comparator = ssup_datum_unsigned_cmp;
    1798        13438 :             ssup->abbrev_converter = varstr_abbrev_convert;
    1799        13438 :             ssup->abbrev_abort = varstr_abbrev_abort;
    1800              :         }
    1801              :     }
    1802        87228 : }
    1803              : 
    1804              : /*
    1805              :  * sortsupport comparison func (for C locale case)
                      :  *
                      :  * Orders two VarString datums by raw bytes: memcmp() over the shorter
                      :  * payload length, and if that prefix is equal the shorter value sorts
                      :  * first.  Returns the memcmp() result, -1/1 from the length tie-break,
                      :  * or 0 when both bytes and lengths match.  ssup is not consulted.
                      :  *
                      :  * If DatumGetVarStringPP() returned a pointer other than the input datum
                      :  * (presumably a detoasted copy), that copy is pfree'd before returning.
    1806              :  */
    1807              : static int
    1808     25068182 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
    1809              : {
    1810     25068182 :     VarString  *arg1 = DatumGetVarStringPP(x);
    1811     25068182 :     VarString  *arg2 = DatumGetVarStringPP(y);
    1812              :     char       *a1p,
    1813              :                *a2p;
    1814              :     int         len1,
    1815              :                 len2,
    1816              :                 result;
    1817              : 
    1818     25068182 :     a1p = VARDATA_ANY(arg1);
    1819     25068182 :     a2p = VARDATA_ANY(arg2);
    1820              : 
    1821     25068182 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1822     25068182 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1823              : 
    1824     25068182 :     result = memcmp(a1p, a2p, Min(len1, len2));
    1825     25068182 :     if ((result == 0) && (len1 != len2))
    1826       756438 :         result = (len1 < len2) ? -1 : 1;
    1827              : 
    1828              :     /* We can't afford to leak memory here. */
    1829     25068182 :     if (PointerGetDatum(arg1) != x)
    1830            0 :         pfree(arg1);
    1831     25068182 :     if (PointerGetDatum(arg2) != y)
    1832            0 :         pfree(arg2);
    1833              : 
    1834     25068182 :     return result;
    1835              : }
    1836              : 
    1837              : /*
    1838              :  * sortsupport comparison func (for BpChar C locale case)
    1839              :  *
    1840              :  * BpChar outsources its sortsupport to this module.  Specialization for the
    1841              :  * varstr_sortsupport BpChar case, modeled on
    1842              :  * internal_bpchar_pattern_compare().
                      :  *
                      :  * Same shape as varstrfastcmp_c(), except that both lengths are first
                      :  * passed through bpchartruelen(), so the bytes it excludes take no part
                      :  * in either the memcmp() or the length tie-break.  Returns the memcmp()
                      :  * result, or -1/1 when the common prefix is equal but the true lengths
                      :  * differ.  ssup is not consulted.
                      :  *
                      :  * Any pointer returned by DatumGetBpCharPP() that differs from the input
                      :  * datum is pfree'd before returning.
    1843              :  */
    1844              : static int
    1845        31160 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
    1846              : {
    1847        31160 :     BpChar     *arg1 = DatumGetBpCharPP(x);
    1848        31160 :     BpChar     *arg2 = DatumGetBpCharPP(y);
    1849              :     char       *a1p,
    1850              :                *a2p;
    1851              :     int         len1,
    1852              :                 len2,
    1853              :                 result;
    1854              : 
    1855        31160 :     a1p = VARDATA_ANY(arg1);
    1856        31160 :     a2p = VARDATA_ANY(arg2);
    1857              : 
    1858        31160 :     len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
    1859        31160 :     len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
    1860              : 
    1861        31160 :     result = memcmp(a1p, a2p, Min(len1, len2));
    1862        31160 :     if ((result == 0) && (len1 != len2))
    1863            2 :         result = (len1 < len2) ? -1 : 1;
    1864              : 
    1865              :     /* We can't afford to leak memory here. */
    1866        31160 :     if (PointerGetDatum(arg1) != x)
    1867            0 :         pfree(arg1);
    1868        31160 :     if (PointerGetDatum(arg2) != y)
    1869            0 :         pfree(arg2);
    1870              : 
    1871        31160 :     return result;
    1872              : }
    1873              : 
    1874              : /*
    1875              :  * sortsupport comparison func (for NAME C locale case)
                      :  *
                      :  * Compares the two names with strncmp(), looking at no more than
                      :  * NAMEDATALEN bytes of each, and returns that result unchanged.  Nothing
                      :  * is allocated or freed here, and ssup is not consulted.
    1876              :  */
    1877              : static int
    1878     23597805 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
    1879              : {
    1880     23597805 :     Name        arg1 = DatumGetName(x);
    1881     23597805 :     Name        arg2 = DatumGetName(y);
    1882              : 
    1883     23597805 :     return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
    1884              : }
    1885              : 
    1886              : /*
    1887              :  * sortsupport comparison func (for locale case with all varlena types)
                      :  *
                      :  * Thin wrapper: extracts the data pointer and payload length of each
                      :  * datum and hands them, together with ssup, to varstrfastcmp_locale(),
                      :  * whose result is returned as-is.
                      :  *
                      :  * Any pointer returned by DatumGetVarStringPP() that differs from the
                      :  * input datum is pfree'd after the comparison and before returning.
    1888              :  */
    1889              : static int
    1890     21825729 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
    1891              : {
    1892     21825729 :     VarString  *arg1 = DatumGetVarStringPP(x);
    1893     21825729 :     VarString  *arg2 = DatumGetVarStringPP(y);
    1894              :     char       *a1p,
    1895              :                *a2p;
    1896              :     int         len1,
    1897              :                 len2,
    1898              :                 result;
    1899              : 
    1900     21825729 :     a1p = VARDATA_ANY(arg1);
    1901     21825729 :     a2p = VARDATA_ANY(arg2);
    1902              : 
    1903     21825729 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1904     21825729 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1905              : 
    1906     21825729 :     result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
    1907              : 
    1908              :     /* We can't afford to leak memory here. */
    1909     21825729 :     if (PointerGetDatum(arg1) != x)
    1910           36 :         pfree(arg1);
    1911     21825729 :     if (PointerGetDatum(arg2) != y)
    1912           29 :         pfree(arg2);
    1913              : 
    1914     21825729 :     return result;
    1915              : }
    1916              : 
    1917              : /*
    1918              :  * sortsupport comparison func (for locale case with NAME type)
                      :  *
                      :  * Thin wrapper around varstrfastcmp_locale(): each name's length is
                      :  * taken with strlen() on its NameStr(), and the result of the shared
                      :  * locale comparator is returned unchanged.  Nothing is allocated or
                      :  * freed in this function itself.
    1919              :  */
    1920              : static int
    1921            0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
    1922              : {
    1923            0 :     Name        arg1 = DatumGetName(x);
    1924            0 :     Name        arg2 = DatumGetName(y);
    1925              : 
    1926            0 :     return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
    1927            0 :                                 NameStr(*arg2), strlen(NameStr(*arg2)),
    1928              :                                 ssup);
    1929              : }
    1930              : 
    1931              : /*
    1932              :  * sortsupport comparison func for locale cases
                      :  *
                      :  * a1p/len1 and a2p/len2 give the two byte strings with explicit lengths;
                      :  * this function makes its own NUL-terminated copies in sss->buf1 and
                      :  * sss->buf2 before calling pg_strcoll().  ssup->ssup_extra must point at
                      :  * the VarStringSortSupport state.
                      :  *
                      :  * Returns 0 immediately when the inputs have equal length and identical
                      :  * bytes.  Otherwise returns the pg_strcoll() result, falling back to
                      :  * strcmp() of the two buffers when pg_strcoll() reports 0 and the locale
                      :  * is flagged deterministic.  When sss->typid is BPCHAROID the lengths are
                      :  * first reduced with bpchartruelen().
                      :  *
                      :  * Side effects on *sss: buf1/buf2 may be enlarged with repalloc() (to at
                      :  * least len + 1, otherwise double the old size capped at MaxAllocSize);
                      :  * last_len1/last_len2 track what the buffers currently hold; after a real
                      :  * pg_strcoll() call, last_returned is updated and cache_blob is cleared.
                      :  * The cached last_returned is reused only when both arguments match the
                      :  * buffered strings and cache_blob is false.
    1933              :  */
    1934              : static int
    1935     21825729 : varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
    1936              : {
    1937     21825729 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    1938              :     int         result;
    1939              :     bool        arg1_match;
    1940              : 
    1941              :     /* Fast pre-check for equality, as discussed in varstr_cmp() */
    1942     21825729 :     if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
    1943              :     {
    1944              :         /*
    1945              :          * No change in buf1 or buf2 contents, so avoid changing last_len1 or
    1946              :          * last_len2.  Existing contents of buffers might still be used by
    1947              :          * next call.
    1948              :          *
    1949              :          * It's fine to allow the comparison of BpChar padding bytes here,
    1950              :          * even though that implies that the memcmp() will usually be
    1951              :          * performed for BpChar callers (though multibyte characters could
    1952              :          * still prevent that from occurring).  The memcmp() is still very
    1953              :          * cheap, and BpChar's funny semantics have us remove trailing spaces
    1954              :          * (not limited to padding), so we need make no distinction between
    1955              :          * padding space characters and "real" space characters.
    1956              :          */
    1957      6867601 :         return 0;
    1958              :     }
    1959              : 
    1960     14958128 :     if (sss->typid == BPCHAROID)
    1961              :     {
    1962              :         /* Get true number of bytes, ignoring trailing spaces */
    1963        18717 :         len1 = bpchartruelen(a1p, len1);
    1964        18717 :         len2 = bpchartruelen(a2p, len2);
    1965              :     }
    1966              : 
    1967     14958128 :     if (len1 >= sss->buflen1)
    1968              :     {
    1969            7 :         sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    1970            7 :         sss->buf1 = repalloc(sss->buf1, sss->buflen1);
    1971              :     }
    1972     14958128 :     if (len2 >= sss->buflen2)
    1973              :     {
    1974            4 :         sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
    1975            4 :         sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    1976              :     }
    1977              : 
    1978              :     /*
    1979              :      * We're likely to be asked to compare the same strings repeatedly, and
    1980              :      * memcmp() is so much cheaper than strcoll() that it pays to try to cache
    1981              :      * comparisons, even though in general there is no reason to think that
    1982              :      * that will work out (every string datum may be unique).  Caching does
    1983              :      * not slow things down measurably when it doesn't work out, and can speed
    1984              :      * things up by rather a lot when it does.  In part, this is because the
    1985              :      * memcmp() compares data from cachelines that are needed in L1 cache even
    1986              :      * when the last comparison's result cannot be reused.
    1987              :      */
    1988     14958128 :     arg1_match = true;
    1989     14958128 :     if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
    1990              :     {
    1991     13602509 :         arg1_match = false;
    1992     13602509 :         memcpy(sss->buf1, a1p, len1);
    1993     13602509 :         sss->buf1[len1] = '\0';
    1994     13602509 :         sss->last_len1 = len1;
    1995              :     }
    1996              : 
    1997              :     /*
    1998              :      * If we're comparing the same two strings as last time, we can return the
    1999              :      * same answer without calling strcoll() again.  This is more likely than
    2000              :      * it seems (at least with moderate to low cardinality sets), because
    2001              :      * quicksort compares the same pivot against many values.
    2002              :      */
    2003     14958128 :     if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
    2004              :     {
    2005      2284151 :         memcpy(sss->buf2, a2p, len2);
    2006      2284151 :         sss->buf2[len2] = '\0';
    2007      2284151 :         sss->last_len2 = len2;
    2008              :     }
    2009     12673977 :     else if (arg1_match && !sss->cache_blob)
    2010              :     {
    2011              :         /* Use result cached following last actual strcoll() call */
    2012      1120552 :         return sss->last_returned;
    2013              :     }
    2014              : 
    2015     13837576 :     result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
    2016              : 
    2017              :     /* Break tie if necessary. */
    2018     13837576 :     if (result == 0 && sss->locale->deterministic)
    2019            0 :         result = strcmp(sss->buf1, sss->buf2);
    2020              : 
    2021              :     /* Cache result, perhaps saving an expensive strcoll() call next time */
    2022     13837576 :     sss->cache_blob = false;
    2023     13837576 :     sss->last_returned = result;
    2024     13837576 :     return result;
    2025              : }
    2026              : 
    2027              : /*
    2028              :  * Conversion routine for sortsupport.  Converts original to abbreviated key
    2029              :  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
    2030              :  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
    2031              :  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
    2032              :  * locale is used just memcpy() from original instead.
                      :  *
                      :  * original is the full datum; ssup->ssup_extra must point at the
                      :  * VarStringSortSupport state.  The result starts out zero-filled, then
                      :  * receives at most sizeof(Datum) leading bytes -- taken from the raw data
                      :  * when sss->collate_c is set, otherwise from the transformed blob held in
                      :  * sss->buf2 -- and is finally passed through DatumBigEndianToNative().
                      :  * When sss->typid is BPCHAROID the input length is first reduced with
                      :  * bpchartruelen().
                      :  *
                      :  * Side effects on *sss: on the non-C path buf1 holds a NUL-terminated
                      :  * copy of the input and buf2 the transformed blob, with last_len1 and
                      :  * last_len2 updated to match (buffers are grown with repalloc() as
                      :  * needed).  A hash of the original (first PG_CACHE_LINE_SIZE bytes, with
                      :  * the length mixed in for longer inputs) is added to full_card, a hash of
                      :  * the abbreviated key is added to abbr_card, and cache_blob is set to
                      :  * true.  The blob-reuse path jumps straight to "done" and so skips the
                      :  * hashing and the cache_blob assignment.
                      :  *
                      :  * If DatumGetVarStringPP() returned a pointer other than the input
                      :  * datum, that copy is pfree'd before returning.
    2033              :  */
    2034              : static Datum
    2035       500042 : varstr_abbrev_convert(Datum original, SortSupport ssup)
    2036              : {
    2037       500042 :     const size_t max_prefix_bytes = sizeof(Datum);
    2038       500042 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2039       500042 :     VarString  *authoritative = DatumGetVarStringPP(original);
    2040       500042 :     char       *authoritative_data = VARDATA_ANY(authoritative);
    2041              : 
    2042              :     /* working state */
    2043              :     Datum       res;
    2044              :     char       *pres;
    2045              :     int         len;
    2046              :     uint32      hash;
    2047              : 
    2048       500042 :     pres = (char *) &res;
    2049              :     /* memset(), so any non-overwritten bytes are NUL */
    2050       500042 :     memset(pres, 0, max_prefix_bytes);
    2051       500042 :     len = VARSIZE_ANY_EXHDR(authoritative);
    2052              : 
    2053              :     /* Get number of bytes, ignoring trailing spaces */
    2054       500042 :     if (sss->typid == BPCHAROID)
    2055          540 :         len = bpchartruelen(authoritative_data, len);
    2056              : 
    2057              :     /*
    2058              :      * If we're using the C collation, use memcpy(), rather than strxfrm(), to
    2059              :      * abbreviate keys.  The full comparator for the C locale is also
    2060              :      * memcmp().  This should be faster than strxfrm().
    2061              :      */
    2062       500042 :     if (sss->collate_c)
    2063       498584 :         memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
    2064              :     else
    2065              :     {
    2066              :         Size        bsize;
    2067              : 
    2068              :         /*
    2069              :          * We're not using the C collation, so fall back on strxfrm or ICU
    2070              :          * analogs.
    2071              :          */
    2072              : 
    2073              :         /* By convention, we use buffer 1 to store and NUL-terminate */
    2074         1458 :         if (len >= sss->buflen1)
    2075              :         {
    2076            0 :             sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    2077            0 :             sss->buf1 = repalloc(sss->buf1, sss->buflen1);
    2078              :         }
    2079              : 
    2080              :         /* Might be able to reuse strxfrm() blob from last call */
    2081         1458 :         if (sss->last_len1 == len && sss->cache_blob &&
    2082          790 :             memcmp(sss->buf1, authoritative_data, len) == 0)
    2083              :         {
    2084          152 :             memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
    2085              :             /* No change affecting cardinality, so no hashing required */
    2086          152 :             goto done;
    2087              :         }
    2088              : 
    2089         1306 :         memcpy(sss->buf1, authoritative_data, len);
    2090              : 
    2091              :         /*
    2092              :          * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
    2093              :          */
    2094         1306 :         sss->buf1[len] = '\0';
    2095         1306 :         sss->last_len1 = len;
    2096              : 
    2097         1306 :         if (pg_strxfrm_prefix_enabled(sss->locale))
    2098              :         {
    2099         1306 :             if (sss->buflen2 < max_prefix_bytes)
    2100              :             {
    2101            0 :                 sss->buflen2 = Max(max_prefix_bytes,
    2102              :                                    Min(sss->buflen2 * 2, MaxAllocSize));
    2103            0 :                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    2104              :             }
    2105              : 
    2106         1306 :             bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
    2107              :                                       max_prefix_bytes, sss->locale);
    2108         1306 :             sss->last_len2 = bsize;
    2109              :         }
    2110              :         else
    2111              :         {
    2112              :             /*
    2113              :              * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
    2114              :              * again.  The pg_strxfrm() function leaves the result buffer
    2115              :              * content undefined if the result did not fit, so we need to
    2116              :              * retry until everything fits, even though we only need the first
    2117              :              * few bytes in the end.
    2118              :              */
    2119              :             for (;;)
    2120              :             {
    2121            0 :                 bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
    2122              :                                    sss->locale);
    2123              : 
    2124            0 :                 sss->last_len2 = bsize;
    2125            0 :                 if (bsize < sss->buflen2)
    2126            0 :                     break;
    2127              : 
    2128              :                 /*
    2129              :                  * Grow buffer and retry.
    2130              :                  */
    2131            0 :                 sss->buflen2 = Max(bsize + 1,
    2132              :                                    Min(sss->buflen2 * 2, MaxAllocSize));
    2133            0 :                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    2134              :             }
    2135              :         }
    2136              : 
    2137              :         /*
    2138              :          * Every Datum byte is always compared.  This is safe because the
    2139              :          * strxfrm() blob is itself NUL terminated, leaving no danger of
    2140              :          * misinterpreting any NUL bytes not intended to be interpreted as
    2141              :          * logically representing termination.
    2142              :          */
    2143         1306 :         memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
    2144              :     }
    2145              : 
    2146              :     /*
    2147              :      * Maintain approximate cardinality of both abbreviated keys and original,
    2148              :      * authoritative keys using HyperLogLog.  Used as cheap insurance against
    2149              :      * the worst case, where we do many string transformations for no saving
    2150              :      * in full strcoll()-based comparisons.  These statistics are used by
    2151              :      * varstr_abbrev_abort().
    2152              :      *
    2153              :      * First, Hash key proper, or a significant fraction of it.  Mix in length
    2154              :      * in order to compensate for cases where differences are past
    2155              :      * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
    2156              :      */
    2157       499890 :     hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
    2158              :                                    Min(len, PG_CACHE_LINE_SIZE)));
    2159              : 
    2160       499890 :     if (len > PG_CACHE_LINE_SIZE)
    2161           97 :         hash ^= DatumGetUInt32(hash_uint32((uint32) len));
    2162              : 
    2163       499890 :     addHyperLogLog(&sss->full_card, hash);
    2164              : 
    2165              :     /* Hash abbreviated key */
    2166              :     {
    2167              :         uint32      tmp;
    2168              : 
    2169       499890 :         tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
    2170       499890 :         hash = DatumGetUInt32(hash_uint32(tmp));
    2171              :     }
    2172              : 
    2173       499890 :     addHyperLogLog(&sss->abbr_card, hash);
    2174              : 
    2175              :     /* Cache result, perhaps saving an expensive strxfrm() call next time */
    2176       499890 :     sss->cache_blob = true;
    2177       500042 : done:
    2178              : 
    2179              :     /*
    2180              :      * Byteswap on little-endian machines.
    2181              :      *
    2182              :      * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
    2183              :      * 3-way comparator) works correctly on all platforms.  If we didn't do
    2184              :      * this, the comparator would have to call memcmp() with a pair of
    2185              :      * pointers to the first byte of each abbreviated key, which is slower.
    2186              :      */
    2187       500042 :     res = DatumBigEndianToNative(res);
    2188              : 
    2189              :     /* Don't leak memory here */
    2190       500042 :     if (PointerGetDatum(authoritative) != original)
    2191            1 :         pfree(authoritative);
    2192              : 
    2193       500042 :     return res;
    2194              : }
    2195              : 
    2196              : /*
    2197              :  * Callback for estimating effectiveness of abbreviated key optimization, using
    2198              :  * heuristic rules.  Returns value indicating if the abbreviation optimization
    2199              :  * should be aborted, based on its projected effectiveness.
    2200              :  */
    2201              : static bool
    2202         1441 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
    2203              : {
    2204         1441 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2205              :     double      abbrev_distinct,
    2206              :                 key_distinct;
    2207              : 
    2208              :     Assert(ssup->abbreviate);
    2209              : 
    2210              :     /* Have a little patience */
    2211         1441 :     if (memtupcount < 100)
    2212          833 :         return false;
    2213              : 
    2214          608 :     abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
    2215          608 :     key_distinct = estimateHyperLogLog(&sss->full_card);
    2216              : 
    2217              :     /*
    2218              :      * Clamp cardinality estimates to at least one distinct value.  While
    2219              :      * NULLs are generally disregarded, if only NULL values were seen so far,
    2220              :      * that might misrepresent costs if we failed to clamp.
    2221              :      */
    2222          608 :     if (abbrev_distinct < 1.0)
    2223            0 :         abbrev_distinct = 1.0;
    2224              : 
    2225          608 :     if (key_distinct < 1.0)
    2226            0 :         key_distinct = 1.0;
    2227              : 
    2228              :     /*
    2229              :      * In the worst case all abbreviated keys are identical, while at the same
    2230              :      * time there are differences within full key strings not captured in
    2231              :      * abbreviations.
    2232              :      */
    2233          608 :     if (trace_sort)
    2234              :     {
    2235            0 :         double      norm_abbrev_card = abbrev_distinct / (double) memtupcount;
    2236              : 
    2237            0 :         elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
    2238              :              "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
    2239              :              memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
    2240              :              sss->prop_card);
    2241              :     }
    2242              : 
    2243              :     /*
    2244              :      * If the number of distinct abbreviated keys approximately matches the
    2245              :      * number of distinct authoritative original keys, that's reason enough to
    2246              :      * proceed.  We can win even with a very low cardinality set if most
    2247              :      * tie-breakers only memcmp().  This is by far the most important
    2248              :      * consideration.
    2249              :      *
    2250              :      * While comparisons that are resolved at the abbreviated key level are
    2251              :      * considerably cheaper than tie-breakers resolved with memcmp(), both of
    2252              :      * those two outcomes are so much cheaper than a full strcoll() once
    2253              :      * sorting is underway that it doesn't seem worth it to weigh abbreviated
    2254              :      * cardinality against the overall size of the set in order to more
    2255              :      * accurately model costs.  Assume that an abbreviated comparison, and an
    2256              :      * abbreviated comparison with a cheap memcmp()-based authoritative
    2257              :      * resolution are equivalent.
    2258              :      */
    2259          608 :     if (abbrev_distinct > key_distinct * sss->prop_card)
    2260              :     {
    2261              :         /*
    2262              :          * When we have exceeded 10,000 tuples, decay required cardinality
    2263              :          * aggressively for next call.
    2264              :          *
    2265              :          * This is useful because the number of comparisons required on
    2266              :          * average increases at a linearithmic rate, and at roughly 10,000
    2267              :          * tuples that factor will start to dominate over the linear costs of
    2268              :          * string transformation (this is a conservative estimate).  The decay
    2269              :          * rate is chosen to be a little less aggressive than halving -- which
    2270              :          * (since we're called at points at which memtupcount has doubled)
    2271              :          * would never see the cost model actually abort past the first call
    2272              :          * following a decay.  This decay rate is mostly a precaution against
    2273              :          * a sudden, violent swing in how well abbreviated cardinality tracks
    2274              :          * full key cardinality.  The decay also serves to prevent a marginal
    2275              :          * case from being aborted too late, when too much has already been
    2276              :          * invested in string transformation.
    2277              :          *
    2278              :          * It's possible for sets of several million distinct strings with
    2279              :          * mere tens of thousands of distinct abbreviated keys to still
    2280              :          * benefit very significantly.  This will generally occur provided
    2281              :          * each abbreviated key is a proxy for a roughly uniform number of the
    2282              :          * set's full keys. If it isn't so, we hope to catch that early and
    2283              :          * abort.  If it isn't caught early, by the time the problem is
    2284              :          * apparent it's probably not worth aborting.
    2285              :          */
    2286          608 :         if (memtupcount > 10000)
    2287            2 :             sss->prop_card *= 0.65;
    2288              : 
    2289          608 :         return false;
    2290              :     }
    2291              : 
    2292              :     /*
    2293              :      * Abort abbreviation strategy.
    2294              :      *
    2295              :      * The worst case, where all abbreviated keys are identical while all
    2296              :      * original strings differ, will typically only see a regression of about
    2297              :      * 10% in execution time for small to medium sized lists of strings.
    2298              :      * Whereas on modern CPUs where cache stalls are the dominant cost, we can
    2299              :      * often expect very large improvements, particularly with sets of strings
    2300              :      * of moderately high to high abbreviated cardinality.  There is little to
    2301              :      * lose but much to gain, which our strategy reflects.
    2302              :      */
    2303            0 :     if (trace_sort)
    2304            0 :         elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
    2305              :              "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
    2306              :              memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
    2307              : 
    2308            0 :     return true;
    2309              : }
    2310              : 
    2311              : /*
    2312              :  * Generic equalimage support function for character types' operator classes.
    2313              :  * Returns the collation's "deterministic" flag, which disables the use of deduplication with nondeterministic collations.
    2314              :  */
    2315              : Datum
    2316         5618 : btvarstrequalimage(PG_FUNCTION_ARGS)
    2317              : {
    2318              : #ifdef NOT_USED
    2319              :     Oid         opcintype = PG_GETARG_OID(0);   /* fetched only when NOT_USED is defined */
    2320              : #endif
    2321         5618 :     Oid         collid = PG_GET_COLLATION();
    2322              :     pg_locale_t locale;
    2323              : 
    2324         5618 :     check_collation_set(collid);    /* NOTE(review): presumably errors out when no collation was resolved -- confirm */
    2325              : 
    2326         5618 :     locale = pg_newlocale_from_collation(collid);
    2327              : 
    2328         5618 :     PG_RETURN_BOOL(locale->deterministic);  /* the answer is exactly the locale's deterministic flag */
    2329              : }
    2330              : /* text_larger: return arg1 if text_cmp() ranks it above arg2 under the call's collation, otherwise arg2. */
    2331              : Datum
    2332       145885 : text_larger(PG_FUNCTION_ARGS)
    2333              : {
    2334       145885 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2335       145885 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2336              :     text       *result;
    2337              : 
    2338       145885 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);    /* a tie yields arg2 */
    2339              : 
    2340       145885 :     PG_RETURN_TEXT_P(result);   /* result is one of the two argument pointers fetched above */
    2341              : }
    2342              : /* text_smaller: return arg1 if text_cmp() ranks it below arg2 under the call's collation, otherwise arg2. */
    2343              : Datum
    2344        50562 : text_smaller(PG_FUNCTION_ARGS)
    2345              : {
    2346        50562 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2347        50562 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2348              :     text       *result;
    2349              : 
    2350        50562 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);    /* a tie yields arg2 */
    2351              : 
    2352        50562 :     PG_RETURN_TEXT_P(result);   /* result is one of the two argument pointers fetched above */
    2353              : }
    2354              : 
    2355              : 
    2356              : /*
    2357              :  * Cross-type comparison functions for types text and name.
    2358              :  */
    2359              : /* nameeqtext: equality of a name (argument 0) and a text value (argument 1) under the call's collation. */
    2360              : Datum
    2361       145143 : nameeqtext(PG_FUNCTION_ARGS)
    2362              : {
    2363       145143 :     Name        arg1 = PG_GETARG_NAME(0);
    2364       145143 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2365       145143 :     size_t      len1 = strlen(NameStr(*arg1));  /* name length, up to its NUL terminator */
    2366       145143 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2367       145143 :     Oid         collid = PG_GET_COLLATION();
    2368              :     bool        result;
    2369              : 
    2370       145143 :     check_collation_set(collid);
    2371              : 
    2372       145143 :     if (collid == C_COLLATION_OID)  /* "C" collation: equal iff same length and same bytes */
    2373       173792 :         result = (len1 == len2 &&
    2374        83317 :                   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2375              :     else    /* any other collation: equal iff varstr_cmp() returns 0 */
    2376        54668 :         result = (varstr_cmp(NameStr(*arg1), len1,
    2377        54668 :                              VARDATA_ANY(arg2), len2,
    2378              :                              collid) == 0);
    2379              : 
    2380       145143 :     PG_FREE_IF_COPY(arg2, 1);
    2381              : 
    2382       145143 :     PG_RETURN_BOOL(result);
    2383              : }
    2384              : /* texteqname: equality of a text value (argument 0) and a name (argument 1) under the call's collation. */
    2385              : Datum
    2386         5731 : texteqname(PG_FUNCTION_ARGS)
    2387              : {
    2388         5731 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2389         5731 :     Name        arg2 = PG_GETARG_NAME(1);
    2390         5731 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2391         5731 :     size_t      len2 = strlen(NameStr(*arg2));  /* name length, up to its NUL terminator */
    2392         5731 :     Oid         collid = PG_GET_COLLATION();
    2393              :     bool        result;
    2394              : 
    2395         5731 :     check_collation_set(collid);
    2396              : 
    2397         5731 :     if (collid == C_COLLATION_OID)  /* "C" collation: equal iff same length and same bytes */
    2398          378 :         result = (len1 == len2 &&
    2399          121 :                   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2400              :     else    /* any other collation: equal iff varstr_cmp() returns 0 */
    2401         5474 :         result = (varstr_cmp(VARDATA_ANY(arg1), len1,
    2402         5474 :                              NameStr(*arg2), len2,
    2403              :                              collid) == 0);
    2404              : 
    2405         5731 :     PG_FREE_IF_COPY(arg1, 0);
    2406              : 
    2407         5731 :     PG_RETURN_BOOL(result);
    2408              : }
    2409              : /* namenetext: inequality of a name (argument 0) and a text value (argument 1); the negation of the equality test. */
    2410              : Datum
    2411           12 : namenetext(PG_FUNCTION_ARGS)
    2412              : {
    2413           12 :     Name        arg1 = PG_GETARG_NAME(0);
    2414           12 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2415           12 :     size_t      len1 = strlen(NameStr(*arg1));  /* name length, up to its NUL terminator */
    2416           12 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2417           12 :     Oid         collid = PG_GET_COLLATION();
    2418              :     bool        result;
    2419              : 
    2420           12 :     check_collation_set(collid);
    2421              : 
    2422           12 :     if (collid == C_COLLATION_OID)  /* "C" collation: unequal unless same length and same bytes */
    2423            0 :         result = !(len1 == len2 &&
    2424            0 :                    memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2425              :     else    /* any other collation: unequal iff varstr_cmp() returns nonzero */
    2426           12 :         result = !(varstr_cmp(NameStr(*arg1), len1,
    2427           12 :                               VARDATA_ANY(arg2), len2,
    2428              :                               collid) == 0);
    2429              : 
    2430           12 :     PG_FREE_IF_COPY(arg2, 1);
    2431              : 
    2432           12 :     PG_RETURN_BOOL(result);
    2433              : }
    2434              : /* textnename: inequality of a text value (argument 0) and a name (argument 1); the negation of the equality test. */
    2435              : Datum
    2436           12 : textnename(PG_FUNCTION_ARGS)
    2437              : {
    2438           12 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2439           12 :     Name        arg2 = PG_GETARG_NAME(1);
    2440           12 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2441           12 :     size_t      len2 = strlen(NameStr(*arg2));  /* name length, up to its NUL terminator */
    2442           12 :     Oid         collid = PG_GET_COLLATION();
    2443              :     bool        result;
    2444              : 
    2445           12 :     check_collation_set(collid);
    2446              : 
    2447           12 :     if (collid == C_COLLATION_OID)  /* "C" collation: unequal unless same length and same bytes */
    2448            0 :         result = !(len1 == len2 &&
    2449            0 :                    memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2450              :     else    /* any other collation: unequal iff varstr_cmp() returns nonzero */
    2451           12 :         result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
    2452           12 :                               NameStr(*arg2), len2,
    2453              :                               collid) == 0);
    2454              : 
    2455           12 :     PG_FREE_IF_COPY(arg1, 0);
    2456              : 
    2457           12 :     PG_RETURN_BOOL(result);
    2458              : }
    2459              : /* btnametextcmp: three-way comparison of a name (argument 0) with a text value (argument 1) via varstr_cmp(). */
    2460              : Datum
    2461        94006 : btnametextcmp(PG_FUNCTION_ARGS)
    2462              : {
    2463        94006 :     Name        arg1 = PG_GETARG_NAME(0);
    2464        94006 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2465              :     int32       result;
    2466              : 
    2467        94006 :     result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
    2468        94006 :                         VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
    2469              :                         PG_GET_COLLATION());
    2470              : 
    2471        94006 :     PG_FREE_IF_COPY(arg2, 1);
    2472              : 
    2473        94006 :     PG_RETURN_INT32(result);    /* varstr_cmp()'s value is passed through unchanged */
    2474              : }
    2475              : /* bttextnamecmp: three-way comparison of a text value (argument 0) with a name (argument 1) via varstr_cmp(). */
    2476              : Datum
    2477           22 : bttextnamecmp(PG_FUNCTION_ARGS)
    2478              : {
    2479           22 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2480           22 :     Name        arg2 = PG_GETARG_NAME(1);
    2481              :     int32       result;
    2482              : 
    2483           22 :     result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
    2484           22 :                         NameStr(*arg2), strlen(NameStr(*arg2)),
    2485              :                         PG_GET_COLLATION());
    2486              : 
    2487           22 :     PG_FREE_IF_COPY(arg1, 0);
    2488              : 
    2489           22 :     PG_RETURN_INT32(result);    /* varstr_cmp()'s value is passed through unchanged */
    2490              : }
    2491              : /* CmpCall: invoke cmpfunc with this call's collation and its two argument datums; yields the int32 result. */
    2492              : #define CmpCall(cmpfunc) \
    2493              :     DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
    2494              :                                           PG_GET_COLLATION(), \
    2495              :                                           PG_GETARG_DATUM(0), \
    2496              :                                           PG_GETARG_DATUM(1)))
    2497              : /* namelttext: name < text, true when btnametextcmp() yields a negative value. */
    2498              : Datum
    2499        45115 : namelttext(PG_FUNCTION_ARGS)
    2500              : {
    2501        45115 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
    2502              : }
    2503              : /* nameletext: name <= text, true when btnametextcmp() yields zero or a negative value. */
    2504              : Datum
    2505            0 : nameletext(PG_FUNCTION_ARGS)
    2506              : {
    2507            0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
    2508              : }
    2509              : /* namegttext: name > text, true when btnametextcmp() yields a positive value. */
    2510              : Datum
    2511            0 : namegttext(PG_FUNCTION_ARGS)
    2512              : {
    2513            0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
    2514              : }
    2515              : /* namegetext: name >= text, true when btnametextcmp() yields zero or a positive value. */
    2516              : Datum
    2517        39960 : namegetext(PG_FUNCTION_ARGS)
    2518              : {
    2519        39960 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
    2520              : }
    2521              : /* textltname: text < name, true when bttextnamecmp() yields a negative value. */
    2522              : Datum
    2523            0 : textltname(PG_FUNCTION_ARGS)
    2524              : {
    2525            0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
    2526              : }
    2527              : /* textlename: text <= name, true when bttextnamecmp() yields zero or a negative value. */
    2528              : Datum
    2529            0 : textlename(PG_FUNCTION_ARGS)
    2530              : {
    2531            0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
    2532              : }
    2533              : /* textgtname: text > name, true when bttextnamecmp() yields a positive value. */
    2534              : Datum
    2535            0 : textgtname(PG_FUNCTION_ARGS)
    2536              : {
    2537            0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
    2538              : }
    2539              : /* textgename: text >= name, true when bttextnamecmp() yields zero or a positive value. */
    2540              : Datum
    2541            0 : textgename(PG_FUNCTION_ARGS)
    2542              : {
    2543            0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
    2544              : }
    2545              : 
    2546              : #undef CmpCall
    2547              : 
    2548              : 
    2549              : /*
    2550              :  * The following operators support character-by-character comparison
    2551              :  * of text datums, to allow building indexes suitable for LIKE clauses.
    2552              :  * Note that the regular texteq/textne comparison operators, and regular
    2553              :  * support functions 1 and 2 with "C" collation are assumed to be
    2554              :  * compatible with these!
    2555              :  */
    2556              : /* Bytewise three-way comparison: memcmp() over the common length, then the shorter input sorts first. */
    2557              : static int
    2558       107164 : internal_text_pattern_compare(text *arg1, text *arg2)
    2559              : {
    2560              :     int         result;
    2561              :     int         len1,
    2562              :                 len2;
    2563              : 
    2564       107164 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    2565       107164 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    2566              : 
    2567       107164 :     result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    2568       107164 :     if (result != 0)
    2569       107076 :         return result;  /* raw memcmp() value, so only its sign is meaningful */
    2570           88 :     else if (len1 < len2)
    2571            0 :         return -1;  /* arg1 is a proper prefix of arg2 */
    2572           88 :     else if (len1 > len2)
    2573           56 :         return 1;   /* arg2 is a proper prefix of arg1 */
    2574              :     else
    2575           32 :         return 0;   /* same length and same bytes */
    2576              : }
    2577              : 
    2578              : /* text_pattern_lt: true when argument 0 sorts before argument 1 in internal_text_pattern_compare() order. */
    2579              : Datum
    2580        32112 : text_pattern_lt(PG_FUNCTION_ARGS)
    2581              : {
    2582        32112 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2583        32112 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2584              :     int         result;
    2585              : 
    2586        32112 :     result = internal_text_pattern_compare(arg1, arg2);
    2587              : 
    2588        32112 :     PG_FREE_IF_COPY(arg1, 0);
    2589        32112 :     PG_FREE_IF_COPY(arg2, 1);
    2590              : 
    2591        32112 :     PG_RETURN_BOOL(result < 0);
    2592              : }
    2593              : 
    2594              : /* text_pattern_le: true when argument 0 sorts before or equal to argument 1 in internal_text_pattern_compare() order. */
    2595              : Datum
    2596        25006 : text_pattern_le(PG_FUNCTION_ARGS)
    2597              : {
    2598        25006 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2599        25006 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2600              :     int         result;
    2601              : 
    2602        25006 :     result = internal_text_pattern_compare(arg1, arg2);
    2603              : 
    2604        25006 :     PG_FREE_IF_COPY(arg1, 0);
    2605        25006 :     PG_FREE_IF_COPY(arg2, 1);
    2606              : 
    2607        25006 :     PG_RETURN_BOOL(result <= 0);
    2608              : }
    2609              : 
    2610              : /* text_pattern_ge: true when argument 0 sorts after or equal to argument 1 in internal_text_pattern_compare() order. */
    2611              : Datum
    2612        25022 : text_pattern_ge(PG_FUNCTION_ARGS)
    2613              : {
    2614        25022 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2615        25022 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2616              :     int         result;
    2617              : 
    2618        25022 :     result = internal_text_pattern_compare(arg1, arg2);
    2619              : 
    2620        25022 :     PG_FREE_IF_COPY(arg1, 0);
    2621        25022 :     PG_FREE_IF_COPY(arg2, 1);
    2622              : 
    2623        25022 :     PG_RETURN_BOOL(result >= 0);
    2624              : }
    2625              : 
    2626              : /* text_pattern_gt: true when argument 0 sorts after argument 1 in internal_text_pattern_compare() order. */
    2627              : Datum
    2628        25006 : text_pattern_gt(PG_FUNCTION_ARGS)
    2629              : {
    2630        25006 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2631        25006 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2632              :     int         result;
    2633              : 
    2634        25006 :     result = internal_text_pattern_compare(arg1, arg2);
    2635              : 
    2636        25006 :     PG_FREE_IF_COPY(arg1, 0);
    2637        25006 :     PG_FREE_IF_COPY(arg2, 1);
    2638              : 
    2639        25006 :     PG_RETURN_BOOL(result > 0);
    2640              : }
    2641              : 
    2642              : /* bttext_pattern_cmp: three-way comparison of two text values; returns internal_text_pattern_compare()'s result as int32. */
    2643              : Datum
    2644           18 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
    2645              : {
    2646           18 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2647           18 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2648              :     int         result;
    2649              : 
    2650           18 :     result = internal_text_pattern_compare(arg1, arg2);
    2651              : 
    2652           18 :     PG_FREE_IF_COPY(arg1, 0);
    2653           18 :     PG_FREE_IF_COPY(arg2, 1);
    2654              : 
    2655           18 :     PG_RETURN_INT32(result);
    2656              : }
    2657              : 
    2658              : /* bttext_pattern_sortsupport: set up the SortSupport passed as argument 0 via varstr_sortsupport(), forcing "C" collation. */
    2659              : Datum
    2660           77 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
    2661              : {
    2662           77 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    2663              :     MemoryContext oldcontext;
    2664              : 
    2665           77 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);   /* do the setup with ssup->ssup_cxt as current context */
    2666              : 
    2667              :     /* Use generic string SortSupport, forcing "C" collation */
    2668           77 :     varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
    2669              : 
    2670           77 :     MemoryContextSwitchTo(oldcontext);  /* switch back to the context that was current on entry */
    2671              : 
    2672           77 :     PG_RETURN_VOID();
    2673              : }
    2674              : 
    2675              : 
    2676              : /*
    2677              :  * text_name()
    2678              :  * Converts a text value to a Name; input of NAMEDATALEN bytes or more is clipped via pg_mbcliplen().
    2679              :  */
    2680              : Datum
    2681        17867 : text_name(PG_FUNCTION_ARGS)
    2682              : {
    2683        17867 :     text       *s = PG_GETARG_TEXT_PP(0);
    2684              :     Name        result;
    2685              :     int         len;
    2686              : 
    2687        17867 :     len = VARSIZE_ANY_EXHDR(s);
    2688              : 
    2689              :     /* Truncate oversize input (NOTE(review): presumably pg_mbcliplen clips on a character boundary -- confirm) */
    2690        17867 :     if (len >= NAMEDATALEN)
    2691            5 :         len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
    2692              : 
    2693              :     /* We use palloc0 here to ensure result is zero-padded */
    2694        17867 :     result = (Name) palloc0(NAMEDATALEN);
    2695        17867 :     memcpy(NameStr(*result), VARDATA_ANY(s), len);  /* bytes beyond len keep palloc0's zeros */
    2696              : 
    2697        17867 :     PG_RETURN_NAME(result);
    2698              : }
    2699              : 
    2700              : /*
    2701              :  * name_text()
    2702              :  * Converts a Name to a text value, built by cstring_to_text() from the name's C string.
    2703              :  */
    2704              : Datum
    2705       426045 : name_text(PG_FUNCTION_ARGS)
    2706              : {
    2707       426045 :     Name        s = PG_GETARG_NAME(0);
    2708              : 
    2709       426045 :     PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
    2710              : }
    2711              : 
    2712              : 
    2713              : /*
    2714              :  * textToQualifiedNameList - convert a text object to a list of names
    2715              :  *
    2716              :  * This implements the input parsing needed by nextval() and other
    2717              :  * functions that take a text parameter representing a qualified name.
    2718              :  * We split the name at dots, downcase if not double-quoted, and
    2719              :  * truncate names if they're too long.  Bad syntax or an empty list raises ERRCODE_INVALID_NAME.
    2720              :  */
    2721              : List *
    2722         3005 : textToQualifiedNameList(text *textval)
    2723              : {
    2724              :     char       *rawname;
    2725         3005 :     List       *result = NIL;
    2726              :     List       *namelist;
    2727              :     ListCell   *l;
    2728              : 
    2729              :     /* Convert to C string (handles possible detoasting). */
    2730              :     /* Note we rely on being able to modify rawname below. */
    2731         3005 :     rawname = text_to_cstring(textval);
    2732              : 
    2733         3005 :     if (!SplitIdentifierString(rawname, '.', &namelist))    /* syntax error in the name */
    2734            0 :         ereport(ERROR,
    2735              :                 (errcode(ERRCODE_INVALID_NAME),
    2736              :                  errmsg("invalid name syntax")));
    2737              : 
    2738         3005 :     if (namelist == NIL)    /* empty input: accepted by SplitIdentifierString, rejected here */
    2739            0 :         ereport(ERROR,
    2740              :                 (errcode(ERRCODE_INVALID_NAME),
    2741              :                  errmsg("invalid name syntax")));
    2742              : 
    2743         6103 :     foreach(l, namelist)
    2744              :     {
    2745         3098 :         char       *curname = (char *) lfirst(l);   /* points into rawname */
    2746              : 
    2747         3098 :         result = lappend(result, makeString(pstrdup(curname)));   /* copy, because rawname is freed below */
    2748              :     }
    2749              : 
    2750         3005 :     pfree(rawname);
    2751         3005 :     list_free(namelist);
    2752              : 
    2753         3005 :     return result;
    2754              : }
    2755              : 
    2756              : /*
    2757              :  * scan_quoted_identifier - In-place scanner for quoted identifiers.
    2758              :  *
    2759              :  * *nextp should point to the opening double-quote character, and will be
    2760              :  * updated to point just past the end.  *endp is set to the position of
    2761              :  * the closing quote; no terminator is written here.  The return value is the identifier, or NULL
    2762              :  * (with *endp left NULL) if the matching close-quote cannot be found.
    2763              :  *
    2764              :  * If we find two consecutive double quote characters, that doesn't end the
    2765              :  * identifier: instead, we collapse them into a double quote and include them
    2766              :  * in the resulting token. Note that this requires overwriting the rest of the
    2767              :  * string in place, including the portion beyond the final value of *nextp.
    2768              :  */
    2769              : char *
    2770        25199 : scan_quoted_identifier(char **endp, char **nextp)
    2771              : {
    2772        25199 :     char       *token = *nextp + 1;   /* content starts just after the opening quote */
    2773              : 
    2774              :     for (;;)
    2775              :     {
    2776        25209 :         *endp = strchr(*nextp + 1, '"');
    2777        25204 :         if (*endp == NULL)
    2778            2 :             return NULL;        /* mismatched quotes */
    2779        25202 :         if ((*endp)[1] != '"')
    2780        25197 :             break;              /* found end of quoted identifier */
    2781              :         /* Collapse adjacent quotes into one quote, and look again */
    2782            5 :         memmove(*endp, *endp + 1, strlen(*endp));   /* strlen(*endp) bytes from *endp + 1 include the trailing NUL */
    2783            5 :         *nextp = *endp;     /* the next strchr() starts just past the quote we kept */
    2784              :     }
    2785              :     /* *endp now points at the terminating quote */
    2786        25197 :     *nextp = *endp + 1;
    2787              : 
    2788        25197 :     return token;
    2789              : }
    2790              : 
    2791              : /*
    2792              :  * scan_identifier - In-place scanner for quoted or unquoted identifiers.
    2793              :  *
    2794              :  * On success, *endp is set to the position where the caller should write '\0'
    2795              :  * to null-terminate the token, and *nextp is advanced past the token (and past
    2796              :  * the closing quote, if any).  The return value is the token content, or NULL
    2797              :  * if there is a syntax error (mismatched quotes or empty unquoted token); *endp must not be used then.
    2798              :  *
    2799              :  * Unquoted identifiers are terminated by whitespace or the first occurrence
    2800              :  * of the separator character. Additionally, if downcase_unquoted = true,
    2801              :  * unquoted identifiers are downcased in place. See scan_quoted_identifier for
    2802              :  * an additional way in which we modify the string in place.
    2803              :  */
    2804              : char *
    2805       370360 : scan_identifier(char **endp, char **nextp, char separator, bool downcase_unquoted)
    2806              : {
    2807              :     char       *token;
    2808              : 
    2809       370360 :     if (**nextp == '"')
    2810        25199 :         return scan_quoted_identifier(endp, nextp);   /* quoted tokens are never downcased here */
    2811              : 
    2812              :     /* Unquoted identifier --- extends to separator or whitespace */
    2813       345161 :     token = *nextp;
    2814              : 
    2815      3165329 :     while (**nextp && **nextp != separator && !scanner_isspace(**nextp))
    2816      2820168 :         (*nextp)++;
    2817              : 
    2818       345161 :     if (*nextp == token)
    2819            2 :         return NULL;            /* empty token; *endp has not been set */
    2820              : 
    2821       345159 :     *endp = *nextp;     /* the terminating character itself is left in place for the caller */
    2822              : 
    2823       345159 :     if (downcase_unquoted)
    2824              :     {
    2825              :         /*
    2826              :          * Downcase the identifier, using same code as main lexer does.
    2827              :          *
    2828              :          * XXX because we want to overwrite the input in-place, we cannot
    2829              :          * support a downcasing transformation that increases the string
    2830              :          * length.  This is not a problem given the current implementation of
    2831              :          * downcase_truncate_identifier, but we'll probably have to do
    2832              :          * something about this someday.
    2833              :          */
    2834       343310 :         int         len = *endp - token;
    2835       343310 :         char       *downname = downcase_truncate_identifier(token, len, false);
    2836              : 
    2837              :         Assert(strlen(downname) <= len);
    2838       343310 :         strncpy(token, downname, len);  /* strncpy is required here: it writes exactly len bytes, so the byte at *endp is untouched */
    2839       343310 :         pfree(downname);
    2840              :     }
    2841              : 
    2842       345159 :     return token;
    2843              : }
    2844              : 
    2845              : 
    2846              : /*
    2847              :  * SplitIdentifierString --- parse a string containing identifiers
    2848              :  *
    2849              :  * This is the guts of textToQualifiedNameList, and is exported for use in
    2850              :  * other situations such as parsing GUC variables.  In the GUC case, it's
    2851              :  * important to avoid memory leaks, so the API is designed to minimize the
    2852              :  * amount of stuff that needs to be allocated and freed.
    2853              :  *
    2854              :  * Inputs:
    2855              :  *  rawstring: the input string; must be overwritable!  On return, it's
    2856              :  *             been modified to contain the separated identifiers.
    2857              :  *  separator: the separator punctuation expected between identifiers
    2858              :  *             (typically '.' or ',').  Whitespace may also appear around
    2859              :  *             identifiers.
    2860              :  * Outputs:
    2861              :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    2862              :  *            rawstring.  Caller should list_free() this even on error return.
    2863              :  *
    2864              :  * Returns true if okay, false if there is a syntax error in the string.
    2865              :  *
    2866              :  * Note that an empty string is considered okay here, though not in
    2867              :  * textToQualifiedNameList.
    2868              :  */
    2869              : bool
    2870       215893 : SplitIdentifierString(char *rawstring, char separator,
    2871              :                       List **namelist)
    2872              : {
    2873       215893 :     char       *nextp = rawstring;
    2874       215893 :     bool        done = false;
    2875              : 
    2876       215893 :     *namelist = NIL;
    2877              : 
    2878       215897 :     while (scanner_isspace(*nextp))
    2879            4 :         nextp++;                /* skip leading whitespace */
    2880              : 
    2881       215893 :     if (*nextp == '\0')
    2882        17772 :         return true;            /* empty string represents empty list */
    2883              : 
    2884              :     /* At the top of the loop, we are at start of a new identifier. */
    2885              :     do
    2886              :     {
    2887              :         char       *curname;
    2888              :         char       *endp;
    2889              : 
    2890       368445 :         curname = scan_identifier(&endp, &nextp, separator, true);
    2891       368445 :         if (curname == NULL)
    2892            1 :             return false;       /* mismatched quotes or empty name (including one after a trailing separator) */
    2893              : 
    2894       368446 :         while (scanner_isspace(*nextp))
    2895            1 :             nextp++;            /* skip trailing whitespace */
    2896              : 
    2897       368445 :         if (*nextp == separator)
    2898              :         {
    2899       170324 :             nextp++;
    2900       325943 :             while (scanner_isspace(*nextp))
    2901       155619 :                 nextp++;        /* skip leading whitespace for next */
    2902              :             /* we expect another name, so done remains false */
    2903              :         }
    2904       198121 :         else if (*nextp == '\0')
    2905       198120 :             done = true;
    2906              :         else
    2907            1 :             return false;       /* invalid syntax; names appended so far stay in *namelist */
    2908              : 
    2909              :         /* Now safe to overwrite separator with a null (deferred until the character after the token was examined) */
    2910       368444 :         *endp = '\0';
    2911              : 
    2912              :         /* Truncate name if it's overlength */
    2913       368444 :         truncate_identifier(curname, strlen(curname), false);
    2914              : 
    2915              :         /*
    2916              :          * Finished isolating current name --- add it to list (the pointer into rawstring, not a copy)
    2917              :          */
    2918       368444 :         *namelist = lappend(*namelist, curname);
    2919              : 
    2920              :         /* Loop back if we didn't reach end of string */
    2921       368444 :     } while (!done);
    2922              : 
    2923       198120 :     return true;
    2924              : }
    2925              : 
    2926              : 
    2927              : /*
    2928              :  * SplitDirectoriesString --- parse a string containing file/directory names
    2929              :  *
    2930              :  * This works fine on file names too; the function name is historical.
    2931              :  *
    2932              :  * This is similar to SplitIdentifierString, except that the parsing
    2933              :  * rules are meant to handle pathnames instead of identifiers: there is
    2934              :  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
    2935              :  * and we apply canonicalize_path() to each extracted string.  Because of the
    2936              :  * last, the returned strings are separately palloc'd rather than being
    2937              :  * pointers into rawstring --- but we still scribble on rawstring.
    2938              :  *
    2939              :  * Inputs:
    2940              :  *  rawstring: the input string; must be modifiable!
    2941              :  *  separator: the separator punctuation expected between directories
    2942              :  *             (typically ',' or ';').  Whitespace may also appear around
    2943              :  *             directories.
    2944              :  * Outputs:
    2945              :  *  namelist: filled with a palloc'd list of directory names.
    2946              :  *            Caller should list_free_deep() this even on error return.
    2947              :  *
    2948              :  * Returns true if okay, false if there is a syntax error in the string.
    2949              :  *
    2950              :  * Note that an empty string is considered okay here.
    2951              :  */
    2952              : bool
    2953         1058 : SplitDirectoriesString(char *rawstring, char separator,
    2954              :                        List **namelist)
    2955              : {
    2956         1058 :     char       *nextp = rawstring;
    2957         1058 :     bool        done = false;
    2958              : 
    2959         1058 :     *namelist = NIL;
    2960              : 
    2961         1058 :     while (scanner_isspace(*nextp))
    2962            0 :         nextp++;                /* skip leading whitespace */
    2963              : 
    2964         1058 :     if (*nextp == '\0')
    2965            1 :         return true;            /* empty string represents empty list */
    2966              : 
    2967              :     /* At the top of the loop, we are at start of a new directory. */
    2968              :     do
    2969              :     {
    2970              :         char       *curname;
    2971              :         char       *endp;
    2972              : 
    2973         1080 :         if (*nextp == '"')
    2974              :         {
    2975              :             /* Quoted name --- collapse quote-quote pairs */
    2976            0 :             curname = scan_quoted_identifier(&endp, &nextp);
    2977            0 :             if (curname == NULL)
    2978            0 :                 return false;   /* mismatched quotes */
    2979              :         }
    2980              :         else
    2981              :         {
    2982              :             /* Unquoted name --- extends to separator or end of string */
    2983         1080 :             curname = endp = nextp;
    2984        17965 :             while (*nextp && *nextp != separator)
    2985              :             {
    2986              :                 /* trailing whitespace should not be included in name */
    2987        16885 :                 if (!scanner_isspace(*nextp))
    2988        16885 :                     endp = nextp + 1;
    2989        16885 :                 nextp++;
    2990              :             }
    2991         1080 :             if (curname == endp)
    2992            0 :                 return false;   /* empty unquoted name not allowed */
    2993              :         }
    2994              : 
    2995         1080 :         while (scanner_isspace(*nextp))
    2996            0 :             nextp++;            /* skip trailing whitespace */
    2997              : 
    2998         1080 :         if (*nextp == separator)
    2999              :         {
    3000           23 :             nextp++;
    3001           30 :             while (scanner_isspace(*nextp))
    3002            7 :                 nextp++;        /* skip leading whitespace for next */
    3003              :             /* we expect another name, so done remains false */
    3004              :         }
    3005         1057 :         else if (*nextp == '\0')
    3006         1057 :             done = true;
    3007              :         else
    3008            0 :             return false;       /* invalid syntax */
    3009              : 
    3010              :         /* Now safe to overwrite separator with a null */
    3011         1080 :         *endp = '\0';
    3012              : 
    3013              :         /* Truncate path if it's overlength */
    3014         1080 :         if (strlen(curname) >= MAXPGPATH)
    3015            0 :             curname[MAXPGPATH - 1] = '\0';
    3016              : 
    3017              :         /*
    3018              :          * Finished isolating current name --- add it to list
    3019              :          */
    3020         1080 :         curname = pstrdup(curname);
    3021         1080 :         canonicalize_path(curname);
    3022         1080 :         *namelist = lappend(*namelist, curname);
    3023              : 
    3024              :         /* Loop back if we didn't reach end of string */
    3025         1080 :     } while (!done);
    3026              : 
    3027         1057 :     return true;
    3028              : }
    3029              : 
    3030              : 
    3031              : /*
    3032              :  * SplitGUCList --- parse a string containing identifiers or file names
    3033              :  *
    3034              :  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
    3035              :  * presuming whether the elements will be taken as identifiers or file names.
    3036              :  * We assume the input has already been through flatten_set_variable_args(),
    3037              :  * so that we need never downcase (if appropriate, that was done already).
    3038              :  * Nor do we ever truncate, since we don't know the correct max length.
    3039              :  * We disallow embedded whitespace for simplicity (it shouldn't matter,
    3040              :  * because any embedded whitespace should have led to double-quoting).
    3041              :  * Otherwise the API is identical to SplitIdentifierString.
    3042              :  *
    3043              :  * XXX it's annoying to have so many copies of this string-splitting logic.
    3044              :  * However, it's not clear that having one function with a bunch of option
    3045              :  * flags would be much better.
    3046              :  *
    3047              :  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
    3048              :  * Be sure to update that if you have to change this.
    3049              :  *
    3050              :  * Inputs:
    3051              :  *  rawstring: the input string; must be overwritable!  On return, it's
    3052              :  *             been modified to contain the separated identifiers.
    3053              :  *  separator: the separator punctuation expected between identifiers
    3054              :  *             (typically '.' or ',').  Whitespace may also appear around
    3055              :  *             identifiers.
    3056              :  * Outputs:
    3057              :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    3058              :  *            rawstring.  Caller should list_free() this even on error return.
    3059              :  *
    3060              :  * Returns true if okay, false if there is a syntax error in the string.
    3061              :  */
    3062              : bool
    3063         4043 : SplitGUCList(char *rawstring, char separator,
    3064              :              List **namelist)
    3065              : {
    3066         4043 :     char       *nextp = rawstring;
    3067         4043 :     bool        done = false;
    3068              : 
    3069         4043 :     *namelist = NIL;
    3070              : 
    3071         4043 :     while (scanner_isspace(*nextp))
    3072            0 :         nextp++;                /* skip leading whitespace */
    3073              : 
    3074         4043 :     if (*nextp == '\0')
    3075         2259 :         return true;            /* empty string represents empty list */
    3076              : 
    3077              :     /* At the top of the loop, we are at start of a new identifier. */
    3078              :     do
    3079              :     {
    3080              :         char       *curname;
    3081              :         char       *endp;
    3082              : 
    3083         1865 :         curname = scan_identifier(&endp, &nextp, separator, false);
    3084         1865 :         if (curname == NULL)
    3085            0 :             return false;       /* mismatched quotes or empty name */
    3086              : 
    3087         1865 :         while (scanner_isspace(*nextp))
    3088            0 :             nextp++;            /* skip trailing whitespace */
    3089              : 
    3090         1865 :         if (*nextp == separator)
    3091              :         {
    3092           81 :             nextp++;
    3093          158 :             while (scanner_isspace(*nextp))
    3094           77 :                 nextp++;        /* skip leading whitespace for next */
    3095              :             /* we expect another name, so done remains false */
    3096              :         }
    3097         1784 :         else if (*nextp == '\0')
    3098         1784 :             done = true;
    3099              :         else
    3100            0 :             return false;       /* invalid syntax */
    3101              : 
    3102              :         /* Now safe to overwrite separator with a null */
    3103         1865 :         *endp = '\0';
    3104              : 
    3105              :         /*
    3106              :          * Finished isolating current name --- add it to list
    3107              :          */
    3108         1865 :         *namelist = lappend(*namelist, curname);
    3109              : 
    3110              :         /* Loop back if we didn't reach end of string */
    3111         1865 :     } while (!done);
    3112              : 
    3113         1784 :     return true;
    3114              : }
    3115              : 
    3116              : /*
    3117              :  * appendStringInfoText
    3118              :  *
    3119              :  * Append a text to str.
    3120              :  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
    3121              :  */
    3122              : static void
    3123      1233165 : appendStringInfoText(StringInfo str, const text *t)
    3124              : {
    3125      1233165 :     appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
    3126      1233165 : }
    3127              : 
    3128              : /*
    3129              :  * replace_text
    3130              :  * replace all occurrences of 'old_sub_str' in 'orig_str'
    3131              :  * with 'new_sub_str' to form 'new_str'
    3132              :  *
    3133              :  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
    3134              :  * otherwise returns 'new_str'
    3135              :  */
    3136              : Datum
    3137          913 : replace_text(PG_FUNCTION_ARGS)
    3138              : {
    3139          913 :     text       *src_text = PG_GETARG_TEXT_PP(0);
    3140          913 :     text       *from_sub_text = PG_GETARG_TEXT_PP(1);
    3141          913 :     text       *to_sub_text = PG_GETARG_TEXT_PP(2);
    3142              :     int         src_text_len;
    3143              :     int         from_sub_text_len;
    3144              :     TextPositionState state;
    3145              :     text       *ret_text;
    3146              :     int         chunk_len;
    3147              :     char       *curr_ptr;
    3148              :     char       *start_ptr;
    3149              :     StringInfoData str;
    3150              :     bool        found;
    3151              : 
    3152          913 :     src_text_len = VARSIZE_ANY_EXHDR(src_text);
    3153          913 :     from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
    3154              : 
    3155              :     /* Return unmodified source string if empty source or pattern */
    3156          913 :     if (src_text_len < 1 || from_sub_text_len < 1)
    3157              :     {
    3158            0 :         PG_RETURN_TEXT_P(src_text);
    3159              :     }
    3160              : 
    3161          913 :     text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
    3162              : 
    3163          913 :     found = text_position_next(&state);
    3164              : 
    3165              :     /* When the from_sub_text is not found, there is nothing to do. */
    3166          913 :     if (!found)
    3167              :     {
    3168          188 :         text_position_cleanup(&state);
    3169          188 :         PG_RETURN_TEXT_P(src_text);
    3170              :     }
    3171          725 :     curr_ptr = text_position_get_match_ptr(&state);
    3172          725 :     start_ptr = VARDATA_ANY(src_text);
    3173              : 
    3174          725 :     initStringInfo(&str);
    3175              : 
    3176              :     do
    3177              :     {
    3178         3894 :         CHECK_FOR_INTERRUPTS();
    3179              : 
    3180              :         /* copy the data skipped over by last text_position_next() */
    3181         3894 :         chunk_len = curr_ptr - start_ptr;
    3182         3894 :         appendBinaryStringInfo(&str, start_ptr, chunk_len);
    3183              : 
    3184         3894 :         appendStringInfoText(&str, to_sub_text);
    3185              : 
    3186         3894 :         start_ptr = curr_ptr + state.last_match_len;
    3187              : 
    3188         3894 :         found = text_position_next(&state);
    3189         3894 :         if (found)
    3190         3169 :             curr_ptr = text_position_get_match_ptr(&state);
    3191              :     }
    3192         3894 :     while (found);
    3193              : 
    3194              :     /* copy trailing data */
    3195          725 :     chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
    3196          725 :     appendBinaryStringInfo(&str, start_ptr, chunk_len);
    3197              : 
    3198          725 :     text_position_cleanup(&state);
    3199              : 
    3200          725 :     ret_text = cstring_to_text_with_len(str.data, str.len);
    3201          725 :     pfree(str.data);
    3202              : 
    3203          725 :     PG_RETURN_TEXT_P(ret_text);
    3204              : }
    3205              : 
    3206              : /*
    3207              :  * check_replace_text_has_escape
    3208              :  *
    3209              :  * Returns 0 if text contains no backslashes that need processing.
    3210              :  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
    3211              :  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
    3212              :  */
    3213              : static int
    3214        12298 : check_replace_text_has_escape(const text *replace_text)
    3215              : {
    3216        12298 :     int         result = 0;
    3217        12298 :     const char *p = VARDATA_ANY(replace_text);
    3218        12298 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    3219              : 
    3220        24624 :     while (p < p_end)
    3221              :     {
    3222              :         /* Find next escape char, if any. */
    3223        11693 :         p = memchr(p, '\\', p_end - p);
    3224        11693 :         if (p == NULL)
    3225        11145 :             break;
    3226          548 :         p++;
    3227              :         /* Note: a backslash at the end doesn't require extra processing. */
    3228          548 :         if (p < p_end)
    3229              :         {
    3230          548 :             if (*p >= '1' && *p <= '9')
    3231          520 :                 return 2;       /* Found a submatch specifier, so done */
    3232           28 :             result = 1;         /* Found some other sequence, keep looking */
    3233           28 :             p++;
    3234              :         }
    3235              :     }
    3236        11778 :     return result;
    3237              : }
    3238              : 
    3239              : /*
    3240              :  * appendStringInfoRegexpSubstr
    3241              :  *
    3242              :  * Append replace_text to str, substituting regexp back references for
    3243              :  * \n escapes.  start_ptr is the start of the match in the source string,
    3244              :  * at logical character position data_pos.
    3245              :  */
    3246              : static void
    3247          170 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
    3248              :                              regmatch_t *pmatch,
    3249              :                              char *start_ptr, int data_pos)
    3250              : {
    3251          170 :     const char *p = VARDATA_ANY(replace_text);
    3252          170 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    3253              : 
    3254          404 :     while (p < p_end)
    3255              :     {
    3256          361 :         const char *chunk_start = p;
    3257              :         int         so;
    3258              :         int         eo;
    3259              : 
    3260              :         /* Find next escape char, if any. */
    3261          361 :         p = memchr(p, '\\', p_end - p);
    3262          361 :         if (p == NULL)
    3263          122 :             p = p_end;
    3264              : 
    3265              :         /* Copy the text we just scanned over, if any. */
    3266          361 :         if (p > chunk_start)
    3267          225 :             appendBinaryStringInfo(str, chunk_start, p - chunk_start);
    3268              : 
    3269              :         /* Done if at end of string, else advance over escape char. */
    3270          361 :         if (p >= p_end)
    3271          122 :             break;
    3272          239 :         p++;
    3273              : 
    3274          239 :         if (p >= p_end)
    3275              :         {
    3276              :             /* Escape at very end of input.  Treat same as unexpected char */
    3277            5 :             appendStringInfoChar(str, '\\');
    3278            5 :             break;
    3279              :         }
    3280              : 
    3281          234 :         if (*p >= '1' && *p <= '9')
    3282          190 :         {
    3283              :             /* Use the back reference of regexp. */
    3284          190 :             int         idx = *p - '0';
    3285              : 
    3286          190 :             so = pmatch[idx].rm_so;
    3287          190 :             eo = pmatch[idx].rm_eo;
    3288          190 :             p++;
    3289              :         }
    3290           44 :         else if (*p == '&')
    3291              :         {
    3292              :             /* Use the entire matched string. */
    3293           15 :             so = pmatch[0].rm_so;
    3294           15 :             eo = pmatch[0].rm_eo;
    3295           15 :             p++;
    3296              :         }
    3297           29 :         else if (*p == '\\')
    3298              :         {
    3299              :             /* \\ means transfer one \ to output. */
    3300           24 :             appendStringInfoChar(str, '\\');
    3301           24 :             p++;
    3302           24 :             continue;
    3303              :         }
    3304              :         else
    3305              :         {
    3306              :             /*
    3307              :              * If escape char is not followed by any expected char, just treat
    3308              :              * it as ordinary data to copy.  (XXX would it be better to throw
    3309              :              * an error?)
    3310              :              */
    3311            5 :             appendStringInfoChar(str, '\\');
    3312            5 :             continue;
    3313              :         }
    3314              : 
    3315          205 :         if (so >= 0 && eo >= 0)
    3316              :         {
    3317              :             /*
    3318              :              * Copy the text that is back reference of regexp.  Note so and eo
    3319              :              * are counted in characters not bytes.
    3320              :              */
    3321              :             char       *chunk_start;
    3322              :             int         chunk_len;
    3323              : 
    3324              :             Assert(so >= data_pos);
    3325          205 :             chunk_start = start_ptr;
    3326          205 :             chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
    3327          205 :             chunk_len = charlen_to_bytelen(chunk_start, eo - so);
    3328          205 :             appendBinaryStringInfo(str, chunk_start, chunk_len);
    3329              :         }
    3330              :     }
    3331          170 : }
    3332              : 
    3333              : /*
    3334              :  * replace_text_regexp
    3335              :  *
    3336              :  * replace substring(s) in src_text that match pattern with replace_text.
    3337              :  * The replace_text can contain backslash markers to substitute
    3338              :  * (parts of) the matched text.
    3339              :  *
    3340              :  * cflags: regexp compile flags.
    3341              :  * collation: collation to use.
    3342              :  * search_start: the character (not byte) offset in src_text at which to
    3343              :  * begin searching.
    3344              :  * n: if 0, replace all matches; if > 0, replace only the N'th match.
    3345              :  */
    3346              : text *
    3347        12298 : replace_text_regexp(text *src_text, text *pattern_text,
    3348              :                     text *replace_text,
    3349              :                     int cflags, Oid collation,
    3350              :                     int search_start, int n)
    3351              : {
    3352              :     text       *ret_text;
    3353              :     regex_t    *re;
    3354        12298 :     int         src_text_len = VARSIZE_ANY_EXHDR(src_text);
    3355        12298 :     int         nmatches = 0;
    3356              :     StringInfoData buf;
    3357              :     regmatch_t  pmatch[10];     /* main match, plus \1 to \9 */
    3358        12298 :     int         nmatch = lengthof(pmatch);
    3359              :     pg_wchar   *data;
    3360              :     size_t      data_len;
    3361              :     int         data_pos;
    3362              :     char       *start_ptr;
    3363              :     int         escape_status;
    3364              : 
    3365        12298 :     initStringInfo(&buf);
    3366              : 
    3367              :     /* Convert data string to wide characters. */
    3368        12298 :     data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
    3369        12298 :     data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
    3370              : 
    3371              :     /* Check whether replace_text has escapes, especially regexp submatches. */
    3372        12298 :     escape_status = check_replace_text_has_escape(replace_text);
    3373              : 
    3374              :     /* If no regexp submatches, we can use REG_NOSUB. */
    3375        12298 :     if (escape_status < 2)
    3376              :     {
    3377        11778 :         cflags |= REG_NOSUB;
    3378              :         /* Also tell pg_regexec we only want the whole-match location. */
    3379        11778 :         nmatch = 1;
    3380              :     }
    3381              : 
    3382              :     /* Prepare the regexp. */
    3383        12298 :     re = RE_compile_and_cache(pattern_text, cflags, collation);
    3384              : 
    3385              :     /* start_ptr points to the data_pos'th character of src_text */
    3386        12298 :     start_ptr = (char *) VARDATA_ANY(src_text);
    3387        12298 :     data_pos = 0;
    3388              : 
    3389        16547 :     while (search_start <= data_len)
    3390              :     {
    3391              :         int         regexec_result;
    3392              : 
    3393        16542 :         CHECK_FOR_INTERRUPTS();
    3394              : 
    3395        16542 :         regexec_result = pg_regexec(re,
    3396              :                                     data,
    3397              :                                     data_len,
    3398              :                                     search_start,
    3399              :                                     NULL,   /* no details */
    3400              :                                     nmatch,
    3401              :                                     pmatch,
    3402              :                                     0);
    3403              : 
    3404        16542 :         if (regexec_result == REG_NOMATCH)
    3405        10849 :             break;
    3406              : 
    3407         5693 :         if (regexec_result != REG_OKAY)
    3408              :         {
    3409              :             char        errMsg[100];
    3410              : 
    3411            0 :             pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
    3412            0 :             ereport(ERROR,
    3413              :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    3414              :                      errmsg("regular expression failed: %s", errMsg)));
    3415              :         }
    3416              : 
    3417              :         /*
    3418              :          * Count matches, and decide whether to replace this match.
    3419              :          */
    3420         5693 :         nmatches++;
    3421         5693 :         if (n > 0 && nmatches != n)
    3422              :         {
    3423              :             /*
    3424              :              * No, so advance search_start, but not start_ptr/data_pos. (Thus,
    3425              :              * we treat the matched text as if it weren't matched, and copy it
    3426              :              * to the output later.)
    3427              :              */
    3428           50 :             search_start = pmatch[0].rm_eo;
    3429           50 :             if (pmatch[0].rm_so == pmatch[0].rm_eo)
    3430            0 :                 search_start++;
    3431           50 :             continue;
    3432              :         }
    3433              : 
    3434              :         /*
    3435              :          * Copy the text to the left of the match position.  Note we are given
    3436              :          * character not byte indexes.
    3437              :          */
    3438         5643 :         if (pmatch[0].rm_so - data_pos > 0)
    3439              :         {
    3440              :             int         chunk_len;
    3441              : 
    3442         5519 :             chunk_len = charlen_to_bytelen(start_ptr,
    3443         5519 :                                            pmatch[0].rm_so - data_pos);
    3444         5519 :             appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    3445              : 
    3446              :             /*
    3447              :              * Advance start_ptr over that text, to avoid multiple rescans of
    3448              :              * it if the replace_text contains multiple back-references.
    3449              :              */
    3450         5519 :             start_ptr += chunk_len;
    3451         5519 :             data_pos = pmatch[0].rm_so;
    3452              :         }
    3453              : 
    3454              :         /*
    3455              :          * Copy the replace_text, processing escapes if any are present.
    3456              :          */
    3457         5643 :         if (escape_status > 0)
    3458          170 :             appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
    3459              :                                          start_ptr, data_pos);
    3460              :         else
    3461         5473 :             appendStringInfoText(&buf, replace_text);
    3462              : 
    3463              :         /* Advance start_ptr and data_pos over the matched text. */
    3464        11286 :         start_ptr += charlen_to_bytelen(start_ptr,
    3465         5643 :                                         pmatch[0].rm_eo - data_pos);
    3466         5643 :         data_pos = pmatch[0].rm_eo;
    3467              : 
    3468              :         /*
    3469              :          * If we only want to replace one occurrence, we're done.
    3470              :          */
    3471         5643 :         if (n > 0)
    3472         1444 :             break;
    3473              : 
    3474              :         /*
    3475              :          * Advance search position.  Normally we start the next search at the
    3476              :          * end of the previous match; but if the match was of zero length, we
    3477              :          * have to advance by one character, or we'd just find the same match
    3478              :          * again.
    3479              :          */
    3480         4199 :         search_start = data_pos;
    3481         4199 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    3482           10 :             search_start++;
    3483              :     }
    3484              : 
    3485              :     /*
    3486              :      * Copy the text to the right of the last match.
    3487              :      */
    3488        12298 :     if (data_pos < data_len)
    3489              :     {
    3490              :         int         chunk_len;
    3491              : 
    3492        11671 :         chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
    3493        11671 :         appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    3494              :     }
    3495              : 
    3496        12298 :     ret_text = cstring_to_text_with_len(buf.data, buf.len);
    3497        12298 :     pfree(buf.data);
    3498        12298 :     pfree(data);
    3499              : 
    3500        12298 :     return ret_text;
    3501              : }
    3502              : 
    3503              : /*
    3504              :  * split_part
    3505              :  * parse input string based on provided field separator
    3506              :  * return N'th item (1 based, negative counts from end)
    3507              :  */
    3508              : Datum
    3509          124 : split_part(PG_FUNCTION_ARGS)
    3510              : {
    3511          124 :     text       *inputstring = PG_GETARG_TEXT_PP(0);
    3512          124 :     text       *fldsep = PG_GETARG_TEXT_PP(1);
    3513          124 :     int         fldnum = PG_GETARG_INT32(2);
    3514              :     int         inputstring_len;
    3515              :     int         fldsep_len;
    3516              :     TextPositionState state;
    3517              :     char       *start_ptr;
    3518              :     char       *end_ptr;
    3519              :     text       *result_text;
    3520              :     bool        found;
    3521              : 
    3522              :     /* field number is 1 based */
    3523          124 :     if (fldnum == 0)
    3524            4 :         ereport(ERROR,
    3525              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    3526              :                  errmsg("field position must not be zero")));
    3527              : 
    3528          120 :     inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    3529          120 :     fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    3530              : 
    3531              :     /* return empty string for empty input string */
    3532          120 :     if (inputstring_len < 1)
    3533           10 :         PG_RETURN_TEXT_P(cstring_to_text(""));
    3534              : 
    3535              :     /* handle empty field separator */
    3536          110 :     if (fldsep_len < 1)
    3537              :     {
    3538              :         /* if first or last field, return input string, else empty string */
    3539           20 :         if (fldnum == 1 || fldnum == -1)
    3540           10 :             PG_RETURN_TEXT_P(inputstring);
    3541              :         else
    3542           10 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    3543              :     }
    3544              : 
    3545              :     /* find the first field separator */
    3546           90 :     text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
    3547              : 
    3548           90 :     found = text_position_next(&state);
    3549              : 
    3550              :     /* special case if fldsep not found at all */
    3551           90 :     if (!found)
    3552              :     {
    3553           18 :         text_position_cleanup(&state);
    3554              :         /* if first or last field, return input string, else empty string */
    3555           18 :         if (fldnum == 1 || fldnum == -1)
    3556            9 :             PG_RETURN_TEXT_P(inputstring);
    3557              :         else
    3558            9 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    3559              :     }
    3560              : 
    3561              :     /*
    3562              :      * take care of a negative field number (i.e. count from the right) by
    3563              :      * converting to a positive field number; we need total number of fields
    3564              :      */
    3565           72 :     if (fldnum < 0)
    3566              :     {
    3567              :         /* we found a fldsep, so there are at least two fields */
    3568           36 :         int         numfields = 2;
    3569              : 
    3570           54 :         while (text_position_next(&state))
    3571           18 :             numfields++;
    3572              : 
    3573              :         /* special case of last field does not require an extra pass */
    3574           36 :         if (fldnum == -1)
    3575              :         {
    3576           17 :             start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
    3577           17 :             end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
    3578           17 :             text_position_cleanup(&state);
    3579           17 :             PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
    3580              :                                                       end_ptr - start_ptr));
    3581              :         }
    3582              : 
    3583              :         /* else, convert fldnum to positive notation */
    3584           19 :         fldnum += numfields + 1;
    3585              : 
    3586              :         /* if nonexistent field, return empty string */
    3587           19 :         if (fldnum <= 0)
    3588              :         {
    3589            5 :             text_position_cleanup(&state);
    3590            5 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    3591              :         }
    3592              : 
    3593              :         /* reset to pointing at first match, but now with positive fldnum */
    3594           14 :         text_position_reset(&state);
    3595           14 :         found = text_position_next(&state);
    3596              :         Assert(found);
    3597              :     }
    3598              : 
    3599              :     /* identify bounds of first field */
    3600           50 :     start_ptr = VARDATA_ANY(inputstring);
    3601           50 :     end_ptr = text_position_get_match_ptr(&state);
    3602              : 
    3603           99 :     while (found && --fldnum > 0)
    3604              :     {
    3605              :         /* identify bounds of next field */
    3606           49 :         start_ptr = end_ptr + state.last_match_len;
    3607           49 :         found = text_position_next(&state);
    3608           49 :         if (found)
    3609           27 :             end_ptr = text_position_get_match_ptr(&state);
    3610              :     }
    3611              : 
    3612           50 :     text_position_cleanup(&state);
    3613              : 
    3614           50 :     if (fldnum > 0)
    3615              :     {
    3616              :         /* N'th field separator not found */
    3617              :         /* if last field requested, return it, else empty string */
    3618           22 :         if (fldnum == 1)
    3619              :         {
    3620           17 :             int         last_len = start_ptr - VARDATA_ANY(inputstring);
    3621              : 
    3622           17 :             result_text = cstring_to_text_with_len(start_ptr,
    3623              :                                                    inputstring_len - last_len);
    3624              :         }
    3625              :         else
    3626            5 :             result_text = cstring_to_text("");
    3627              :     }
    3628              :     else
    3629              :     {
    3630              :         /* non-last field requested */
    3631           28 :         result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
    3632              :     }
    3633              : 
    3634           50 :     PG_RETURN_TEXT_P(result_text);
    3635              : }
    3636              : 
    3637              : /*
    3638              :  * Convenience function to return true when two text params are equal.
    3639              :  */
    3640              : static bool
    3641          306 : text_isequal(text *txt1, text *txt2, Oid collid)
    3642              : {
    3643          306 :     return DatumGetBool(DirectFunctionCall2Coll(texteq,
    3644              :                                                 collid,
    3645              :                                                 PointerGetDatum(txt1),
    3646              :                                                 PointerGetDatum(txt2)));
    3647              : }
    3648              : 
    3649              : /*
    3650              :  * text_to_array
    3651              :  * parse input string and return text array of elements,
    3652              :  * based on provided field separator
    3653              :  */
    3654              : Datum
    3655          136 : text_to_array(PG_FUNCTION_ARGS)
    3656              : {
    3657              :     SplitTextOutputData tstate;
    3658              : 
    3659              :     /* For array output, tstate should start as all zeroes */
    3660          136 :     memset(&tstate, 0, sizeof(tstate));
    3661              : 
    3662          136 :     if (!split_text(fcinfo, &tstate))
    3663            5 :         PG_RETURN_NULL();
    3664              : 
    3665          131 :     if (tstate.astate == NULL)
    3666            5 :         PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
    3667              : 
    3668          126 :     PG_RETURN_DATUM(makeArrayResult(tstate.astate,
    3669              :                                     CurrentMemoryContext));
    3670              : }
    3671              : 
    3672              : /*
    3673              :  * text_to_array_null
    3674              :  * parse input string and return text array of elements,
    3675              :  * based on provided field separator and null string
    3676              :  *
    3677              :  * This is a separate entry point only to prevent the regression tests from
    3678              :  * complaining about different argument sets for the same internal function.
    3679              :  */
    3680              : Datum
    3681           50 : text_to_array_null(PG_FUNCTION_ARGS)
    3682              : {
    3683           50 :     return text_to_array(fcinfo);
    3684              : }
    3685              : 
    3686              : /*
    3687              :  * text_to_table
    3688              :  * parse input string and return table of elements,
    3689              :  * based on provided field separator
    3690              :  */
    3691              : Datum
    3692           56 : text_to_table(PG_FUNCTION_ARGS)
    3693              : {
    3694           56 :     ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
    3695              :     SplitTextOutputData tstate;
    3696              : 
    3697           56 :     tstate.astate = NULL;
    3698           56 :     InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
    3699           56 :     tstate.tupstore = rsi->setResult;
    3700           56 :     tstate.tupdesc = rsi->setDesc;
    3701              : 
    3702           56 :     (void) split_text(fcinfo, &tstate);
    3703              : 
    3704           56 :     return (Datum) 0;
    3705              : }
    3706              : 
    3707              : /*
    3708              :  * text_to_table_null
    3709              :  * parse input string and return table of elements,
    3710              :  * based on provided field separator and null string
    3711              :  *
    3712              :  * This is a separate entry point only to prevent the regression tests from
    3713              :  * complaining about different argument sets for the same internal function.
    3714              :  */
    3715              : Datum
    3716           16 : text_to_table_null(PG_FUNCTION_ARGS)
    3717              : {
    3718           16 :     return text_to_table(fcinfo);
    3719              : }
    3720              : 
    3721              : /*
    3722              :  * Common code for text_to_array, text_to_array_null, text_to_table
    3723              :  * and text_to_table_null functions.
    3724              :  *
    3725              :  * These are not strict so we have to test for null inputs explicitly.
    3726              :  * Returns false if result is to be null, else returns true.
    3727              :  *
    3728              :  * Note that if the result is valid but empty (zero elements), we return
    3729              :  * without changing *tstate --- caller must handle that case, too.
    3730              :  */
    3731              : static bool
    3732          192 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
    3733              : {
    3734              :     text       *inputstring;
    3735              :     text       *fldsep;
    3736              :     text       *null_string;
    3737          192 :     Oid         collation = PG_GET_COLLATION();
    3738              :     int         inputstring_len;
    3739              :     int         fldsep_len;
    3740              :     char       *start_ptr;
    3741              :     text       *result_text;
    3742              : 
    3743              :     /* when input string is NULL, then result is NULL too */
    3744          192 :     if (PG_ARGISNULL(0))
    3745            9 :         return false;
    3746              : 
    3747          183 :     inputstring = PG_GETARG_TEXT_PP(0);
    3748              : 
    3749              :     /* fldsep can be NULL */
    3750          183 :     if (!PG_ARGISNULL(1))
    3751          159 :         fldsep = PG_GETARG_TEXT_PP(1);
    3752              :     else
    3753           24 :         fldsep = NULL;
    3754              : 
    3755              :     /* null_string can be NULL or omitted */
    3756          183 :     if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
    3757           66 :         null_string = PG_GETARG_TEXT_PP(2);
    3758              :     else
    3759          117 :         null_string = NULL;
    3760              : 
    3761          183 :     if (fldsep != NULL)
    3762              :     {
    3763              :         /*
    3764              :          * Normal case with non-null fldsep.  Use the text_position machinery
    3765              :          * to search for occurrences of fldsep.
    3766              :          */
    3767              :         TextPositionState state;
    3768              : 
    3769          159 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    3770          159 :         fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    3771              : 
    3772              :         /* return empty set for empty input string */
    3773          159 :         if (inputstring_len < 1)
    3774           46 :             return true;
    3775              : 
    3776              :         /* empty field separator: return input string as a one-element set */
    3777          150 :         if (fldsep_len < 1)
    3778              :         {
    3779           37 :             split_text_accum_result(tstate, inputstring,
    3780              :                                     null_string, collation);
    3781           37 :             return true;
    3782              :         }
    3783              : 
    3784          113 :         text_position_setup(inputstring, fldsep, collation, &state);
    3785              : 
    3786          113 :         start_ptr = VARDATA_ANY(inputstring);
    3787              : 
    3788              :         for (;;)
    3789          371 :         {
    3790              :             bool        found;
    3791              :             char       *end_ptr;
    3792              :             int         chunk_len;
    3793              : 
    3794          484 :             CHECK_FOR_INTERRUPTS();
    3795              : 
    3796          484 :             found = text_position_next(&state);
    3797          484 :             if (!found)
    3798              :             {
    3799              :                 /* fetch last field */
    3800          113 :                 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
    3801          113 :                 end_ptr = NULL; /* not used, but some compilers complain */
    3802              :             }
    3803              :             else
    3804              :             {
    3805              :                 /* fetch non-last field */
    3806          371 :                 end_ptr = text_position_get_match_ptr(&state);
    3807          371 :                 chunk_len = end_ptr - start_ptr;
    3808              :             }
    3809              : 
    3810              :             /* build a temp text datum to pass to split_text_accum_result */
    3811          484 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    3812              : 
    3813              :             /* stash away this field */
    3814          484 :             split_text_accum_result(tstate, result_text,
    3815              :                                     null_string, collation);
    3816              : 
    3817          484 :             pfree(result_text);
    3818              : 
    3819          484 :             if (!found)
    3820          113 :                 break;
    3821              : 
    3822          371 :             start_ptr = end_ptr + state.last_match_len;
    3823              :         }
    3824              : 
    3825          113 :         text_position_cleanup(&state);
    3826              :     }
    3827              :     else
    3828              :     {
    3829              :         const char *end_ptr;
    3830              : 
    3831              :         /*
    3832              :          * When fldsep is NULL, each character in the input string becomes a
    3833              :          * separate element in the result set.  The separator is effectively
    3834              :          * the space between characters.
    3835              :          */
    3836           24 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    3837              : 
    3838           24 :         start_ptr = VARDATA_ANY(inputstring);
    3839           24 :         end_ptr = start_ptr + inputstring_len;
    3840              : 
    3841          204 :         while (inputstring_len > 0)
    3842              :         {
    3843          180 :             int         chunk_len = pg_mblen_range(start_ptr, end_ptr);
    3844              : 
    3845          180 :             CHECK_FOR_INTERRUPTS();
    3846              : 
    3847              :             /* build a temp text datum to pass to split_text_accum_result */
    3848          180 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    3849              : 
    3850              :             /* stash away this field */
    3851          180 :             split_text_accum_result(tstate, result_text,
    3852              :                                     null_string, collation);
    3853              : 
    3854          180 :             pfree(result_text);
    3855              : 
    3856          180 :             start_ptr += chunk_len;
    3857          180 :             inputstring_len -= chunk_len;
    3858              :         }
    3859              :     }
    3860              : 
    3861          137 :     return true;
    3862              : }
    3863              : 
    3864              : /*
    3865              :  * Add text item to result set (table or array).
    3866              :  *
    3867              :  * This is also responsible for checking to see if the item matches
    3868              :  * the null_string, in which case we should emit NULL instead.
    3869              :  */
    3870              : static void
    3871          701 : split_text_accum_result(SplitTextOutputData *tstate,
    3872              :                         text *field_value,
    3873              :                         text *null_string,
    3874              :                         Oid collation)
    3875              : {
    3876          701 :     bool        is_null = false;
    3877              : 
    3878          701 :     if (null_string && text_isequal(field_value, null_string, collation))
    3879           56 :         is_null = true;
    3880              : 
    3881          701 :     if (tstate->tupstore)
    3882              :     {
    3883              :         Datum       values[1];
    3884              :         bool        nulls[1];
    3885              : 
    3886          152 :         values[0] = PointerGetDatum(field_value);
    3887          152 :         nulls[0] = is_null;
    3888              : 
    3889          152 :         tuplestore_putvalues(tstate->tupstore,
    3890              :                              tstate->tupdesc,
    3891              :                              values,
    3892              :                              nulls);
    3893              :     }
    3894              :     else
    3895              :     {
    3896          549 :         tstate->astate = accumArrayResult(tstate->astate,
    3897              :                                           PointerGetDatum(field_value),
    3898              :                                           is_null,
    3899              :                                           TEXTOID,
    3900              :                                           CurrentMemoryContext);
    3901              :     }
    3902          701 : }
    3903              : 
    3904              : /*
    3905              :  * array_to_text
    3906              :  * concatenate Cstring representation of input array elements
    3907              :  * using provided field separator
    3908              :  */
    3909              : Datum
    3910        44621 : array_to_text(PG_FUNCTION_ARGS)
    3911              : {
    3912        44621 :     ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
    3913        44621 :     char       *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    3914              : 
    3915        44621 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
    3916              : }
    3917              : 
    3918              : /*
    3919              :  * array_to_text_null
    3920              :  * concatenate Cstring representation of input array elements
    3921              :  * using provided field separator and null string
    3922              :  *
    3923              :  * This version is not strict so we have to test for null inputs explicitly.
    3924              :  */
    3925              : Datum
    3926            8 : array_to_text_null(PG_FUNCTION_ARGS)
    3927              : {
    3928              :     ArrayType  *v;
    3929              :     char       *fldsep;
    3930              :     char       *null_string;
    3931              : 
    3932              :     /* returns NULL when first or second parameter is NULL */
    3933            8 :     if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
    3934            0 :         PG_RETURN_NULL();
    3935              : 
    3936            8 :     v = PG_GETARG_ARRAYTYPE_P(0);
    3937            8 :     fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    3938              : 
    3939              :     /* NULL null string is passed through as a null pointer */
    3940            8 :     if (!PG_ARGISNULL(2))
    3941            4 :         null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
    3942              :     else
    3943            4 :         null_string = NULL;
    3944              : 
    3945            8 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
    3946              : }
    3947              : 
    3948              : /*
    3949              :  * common code for array_to_text and array_to_text_null functions
    3950              :  */
    3951              : static text *
    3952        44641 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
    3953              :                        const char *fldsep, const char *null_string)
    3954              : {
    3955              :     text       *result;
    3956              :     int         nitems,
    3957              :                *dims,
    3958              :                 ndims;
    3959              :     Oid         element_type;
    3960              :     int         typlen;
    3961              :     bool        typbyval;
    3962              :     char        typalign;
    3963              :     uint8       typalignby;
    3964              :     StringInfoData buf;
    3965        44641 :     bool        printed = false;
    3966              :     char       *p;
    3967              :     uint8      *bitmap;
    3968              :     int         bitmask;
    3969              :     int         i;
    3970              :     ArrayMetaState *my_extra;
    3971              : 
    3972        44641 :     ndims = ARR_NDIM(v);
    3973        44641 :     dims = ARR_DIMS(v);
    3974        44641 :     nitems = ArrayGetNItems(ndims, dims);
    3975              : 
    3976              :     /* if there are no elements, return an empty string */
    3977        44641 :     if (nitems == 0)
    3978        27944 :         return cstring_to_text_with_len("", 0);
    3979              : 
    3980        16697 :     element_type = ARR_ELEMTYPE(v);
    3981        16697 :     initStringInfo(&buf);
    3982              : 
    3983              :     /*
    3984              :      * We arrange to look up info about element type, including its output
    3985              :      * conversion proc, only once per series of calls, assuming the element
    3986              :      * type doesn't change underneath us.
    3987              :      */
    3988        16697 :     my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    3989        16697 :     if (my_extra == NULL)
    3990              :     {
    3991          879 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    3992              :                                                       sizeof(ArrayMetaState));
    3993          879 :         my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    3994          879 :         my_extra->element_type = ~element_type;
    3995              :     }
    3996              : 
    3997        16697 :     if (my_extra->element_type != element_type)
    3998              :     {
    3999              :         /*
    4000              :          * Get info about element type, including its output conversion proc
    4001              :          */
    4002          879 :         get_type_io_data(element_type, IOFunc_output,
    4003              :                          &my_extra->typlen, &my_extra->typbyval,
    4004              :                          &my_extra->typalign, &my_extra->typdelim,
    4005              :                          &my_extra->typioparam, &my_extra->typiofunc);
    4006          879 :         fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
    4007          879 :                       fcinfo->flinfo->fn_mcxt);
    4008          879 :         my_extra->element_type = element_type;
    4009              :     }
    4010        16697 :     typlen = my_extra->typlen;
    4011        16697 :     typbyval = my_extra->typbyval;
    4012        16697 :     typalign = my_extra->typalign;
    4013        16697 :     typalignby = typalign_to_alignby(typalign);
    4014              : 
    4015        16697 :     p = ARR_DATA_PTR(v);
    4016        16697 :     bitmap = ARR_NULLBITMAP(v);
    4017        16697 :     bitmask = 1;
    4018              : 
    4019        56511 :     for (i = 0; i < nitems; i++)
    4020              :     {
    4021              :         Datum       itemvalue;
    4022              :         char       *value;
    4023              : 
    4024              :         /* Get source element, checking for NULL */
    4025        39814 :         if (bitmap && (*bitmap & bitmask) == 0)
    4026              :         {
    4027              :             /* if null_string is NULL, we just ignore null elements */
    4028           12 :             if (null_string != NULL)
    4029              :             {
    4030            4 :                 if (printed)
    4031            4 :                     appendStringInfo(&buf, "%s%s", fldsep, null_string);
    4032              :                 else
    4033            0 :                     appendStringInfoString(&buf, null_string);
    4034            4 :                 printed = true;
    4035              :             }
    4036              :         }
    4037              :         else
    4038              :         {
    4039        39802 :             itemvalue = fetch_att(p, typbyval, typlen);
    4040              : 
    4041        39802 :             value = OutputFunctionCall(&my_extra->proc, itemvalue);
    4042              : 
    4043        39802 :             if (printed)
    4044        23105 :                 appendStringInfo(&buf, "%s%s", fldsep, value);
    4045              :             else
    4046        16697 :                 appendStringInfoString(&buf, value);
    4047        39802 :             printed = true;
    4048              : 
    4049        39802 :             p = att_addlength_pointer(p, typlen, p);
    4050        39802 :             p = (char *) att_nominal_alignby(p, typalignby);
    4051              :         }
    4052              : 
    4053              :         /* advance bitmap pointer if any */
    4054        39814 :         if (bitmap)
    4055              :         {
    4056           72 :             bitmask <<= 1;
    4057           72 :             if (bitmask == 0x100)
    4058              :             {
    4059            0 :                 bitmap++;
    4060            0 :                 bitmask = 1;
    4061              :             }
    4062              :         }
    4063              :     }
    4064              : 
    4065        16697 :     result = cstring_to_text_with_len(buf.data, buf.len);
    4066        16697 :     pfree(buf.data);
    4067              : 
    4068        16697 :     return result;
    4069              : }
    4070              : 
    4071              : /*
    4072              :  * Workhorse for to_bin, to_oct, and to_hex.  Note that base must be > 1 and <=
    4073              :  * 16.
    4074              :  */
    4075              : static inline text *
    4076        20389 : convert_to_base(uint64 value, int base)
    4077              : {
    4078        20389 :     const char *digits = "0123456789abcdef";
    4079              : 
    4080              :     /* We size the buffer for to_bin's longest possible return value. */
    4081              :     char        buf[sizeof(uint64) * BITS_PER_BYTE];
    4082        20389 :     char       *const end = buf + sizeof(buf);
    4083        20389 :     char       *ptr = end;
    4084              : 
    4085              :     Assert(base > 1);
    4086              :     Assert(base <= 16);
    4087              : 
    4088              :     do
    4089              :     {
    4090        40307 :         *--ptr = digits[value % base];
    4091        40307 :         value /= base;
    4092        40307 :     } while (ptr > buf && value);
    4093              : 
    4094        20389 :     return cstring_to_text_with_len(ptr, end - ptr);
    4095              : }
    4096              : 
    4097              : /*
    4098              :  * Convert an integer to a string containing a base-2 (binary) representation
    4099              :  * of the number.
    4100              :  */
    4101              : Datum
    4102           10 : to_bin32(PG_FUNCTION_ARGS)
    4103              : {
    4104           10 :     uint64      value = (uint32) PG_GETARG_INT32(0);
    4105              : 
    4106           10 :     PG_RETURN_TEXT_P(convert_to_base(value, 2));
    4107              : }
    4108              : Datum
    4109           10 : to_bin64(PG_FUNCTION_ARGS)
    4110              : {
    4111           10 :     uint64      value = (uint64) PG_GETARG_INT64(0);
    4112              : 
    4113           10 :     PG_RETURN_TEXT_P(convert_to_base(value, 2));
    4114              : }
    4115              : 
    4116              : /*
    4117              :  * Convert an integer to a string containing a base-8 (oct) representation of
    4118              :  * the number.
    4119              :  */
    4120              : Datum
    4121           10 : to_oct32(PG_FUNCTION_ARGS)
    4122              : {
    4123           10 :     uint64      value = (uint32) PG_GETARG_INT32(0);
    4124              : 
    4125           10 :     PG_RETURN_TEXT_P(convert_to_base(value, 8));
    4126              : }
    4127              : Datum
    4128           10 : to_oct64(PG_FUNCTION_ARGS)
    4129              : {
    4130           10 :     uint64      value = (uint64) PG_GETARG_INT64(0);
    4131              : 
    4132           10 :     PG_RETURN_TEXT_P(convert_to_base(value, 8));
    4133              : }
    4134              : 
    4135              : /*
    4136              :  * Convert an integer to a string containing a base-16 (hex) representation of
    4137              :  * the number.
    4138              :  */
    4139              : Datum
    4140        20339 : to_hex32(PG_FUNCTION_ARGS)
    4141              : {
    4142        20339 :     uint64      value = (uint32) PG_GETARG_INT32(0);
    4143              : 
    4144        20339 :     PG_RETURN_TEXT_P(convert_to_base(value, 16));
    4145              : }
    4146              : Datum
    4147           10 : to_hex64(PG_FUNCTION_ARGS)
    4148              : {
    4149           10 :     uint64      value = (uint64) PG_GETARG_INT64(0);
    4150              : 
    4151           10 :     PG_RETURN_TEXT_P(convert_to_base(value, 16));
    4152              : }
    4153              : 
    4154              : /*
    4155              :  * Return the size of a datum, possibly compressed
    4156              :  *
    4157              :  * Works on any data type
    4158              :  */
    4159              : Datum
    4160           71 : pg_column_size(PG_FUNCTION_ARGS)
    4161              : {
    4162           71 :     Datum       value = PG_GETARG_DATUM(0);
    4163              :     int32       result;
    4164              :     int         typlen;
    4165              : 
    4166              :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4167           71 :     if (fcinfo->flinfo->fn_extra == NULL)
    4168              :     {
    4169              :         /* Lookup the datatype of the supplied argument */
    4170           71 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4171              : 
    4172           71 :         typlen = get_typlen(argtypeid);
    4173           71 :         if (typlen == 0)        /* should not happen */
    4174            0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4175              : 
    4176           71 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4177              :                                                       sizeof(int));
    4178           71 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4179              :     }
    4180              :     else
    4181            0 :         typlen = *((int *) fcinfo->flinfo->fn_extra);
    4182              : 
    4183           71 :     if (typlen == -1)
    4184              :     {
    4185              :         /* varlena type, possibly toasted */
    4186           71 :         result = toast_datum_size(value);
    4187              :     }
    4188            0 :     else if (typlen == -2)
    4189              :     {
    4190              :         /* cstring */
    4191            0 :         result = strlen(DatumGetCString(value)) + 1;
    4192              :     }
    4193              :     else
    4194              :     {
    4195              :         /* ordinary fixed-width type */
    4196            0 :         result = typlen;
    4197              :     }
    4198              : 
    4199           71 :     PG_RETURN_INT32(result);
    4200              : }
    4201              : 
    4202              : /*
    4203              :  * Return the compression method stored in the compressed attribute.  Return
    4204              :  * NULL for non varlena type or uncompressed data.
    4205              :  */
    4206              : Datum
    4207          128 : pg_column_compression(PG_FUNCTION_ARGS)
    4208              : {
    4209              :     int         typlen;
    4210              :     char       *result;
    4211              :     ToastCompressionId cmid;
    4212              : 
    4213              :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4214          128 :     if (fcinfo->flinfo->fn_extra == NULL)
    4215              :     {
    4216              :         /* Lookup the datatype of the supplied argument */
    4217          104 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4218              : 
    4219          104 :         typlen = get_typlen(argtypeid);
    4220          104 :         if (typlen == 0)        /* should not happen */
    4221            0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4222              : 
    4223          104 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4224              :                                                       sizeof(int));
    4225          104 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4226              :     }
    4227              :     else
    4228           24 :         typlen = *((int *) fcinfo->flinfo->fn_extra);
    4229              : 
    4230          128 :     if (typlen != -1)
    4231            0 :         PG_RETURN_NULL();
    4232              : 
    4233              :     /* get the compression method id stored in the compressed varlena */
    4234          128 :     cmid = toast_get_compression_id((varlena *)
    4235          128 :                                     DatumGetPointer(PG_GETARG_DATUM(0)));
    4236          128 :     if (cmid == TOAST_INVALID_COMPRESSION_ID)
    4237           28 :         PG_RETURN_NULL();
    4238              : 
    4239              :     /* convert compression method id to compression method name */
    4240          100 :     switch (cmid)
    4241              :     {
    4242           56 :         case TOAST_PGLZ_COMPRESSION_ID:
    4243           56 :             result = "pglz";
    4244           56 :             break;
    4245           44 :         case TOAST_LZ4_COMPRESSION_ID:
    4246           44 :             result = "lz4";
    4247           44 :             break;
    4248            0 :         default:
    4249            0 :             elog(ERROR, "invalid compression method id %d", cmid);
    4250              :     }
    4251              : 
    4252          100 :     PG_RETURN_TEXT_P(cstring_to_text(result));
    4253              : }
    4254              : 
    4255              : /*
    4256              :  * Return the chunk_id of the on-disk TOASTed value.  Return NULL if the value
    4257              :  * is un-TOASTed or not on-disk.
    4258              :  */
    4259              : Datum
    4260          114 : pg_column_toast_chunk_id(PG_FUNCTION_ARGS)
    4261              : {
    4262              :     int         typlen;
    4263              :     varlena    *attr;
    4264              :     varatt_external toast_pointer;
    4265              : 
    4266              :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4267          114 :     if (fcinfo->flinfo->fn_extra == NULL)
    4268              :     {
    4269              :         /* Lookup the datatype of the supplied argument */
    4270           30 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4271              : 
    4272           30 :         typlen = get_typlen(argtypeid);
    4273           30 :         if (typlen == 0)        /* should not happen */
    4274            0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4275              : 
    4276           30 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4277              :                                                       sizeof(int));
    4278           30 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4279              :     }
    4280              :     else
    4281           84 :         typlen = *((int *) fcinfo->flinfo->fn_extra);
    4282              : 
    4283          114 :     if (typlen != -1)
    4284            0 :         PG_RETURN_NULL();
    4285              : 
    4286          114 :     attr = (varlena *) DatumGetPointer(PG_GETARG_DATUM(0));
    4287              : 
    4288          114 :     if (!VARATT_IS_EXTERNAL_ONDISK(attr))
    4289           40 :         PG_RETURN_NULL();
    4290              : 
    4291           74 :     VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
    4292              : 
    4293           74 :     PG_RETURN_OID(toast_pointer.va_valueid);
    4294              : }
    4295              : 
    4296              : /*
    4297              :  * string_agg - Concatenates values and returns string.
    4298              :  *
    4299              :  * Syntax: string_agg(value text, delimiter text) RETURNS text
    4300              :  *
    4301              :  * Note: Any NULL values are ignored. The first-call delimiter isn't
    4302              :  * actually used at all, and on subsequent calls the delimiter precedes
    4303              :  * the associated value.
    4304              :  */
    4305              : 
    4306              : /* subroutine to initialize state */
    4307              : static StringInfo
    4308         1631 : makeStringAggState(FunctionCallInfo fcinfo)
    4309              : {
    4310              :     StringInfo  state;
    4311              :     MemoryContext aggcontext;
    4312              :     MemoryContext oldcontext;
    4313              : 
    4314         1631 :     if (!AggCheckCallContext(fcinfo, &aggcontext))
    4315              :     {
    4316              :         /* cannot be called directly because of internal-type argument */
    4317            0 :         elog(ERROR, "string_agg_transfn called in non-aggregate context");
    4318              :     }
    4319              : 
    4320              :     /*
    4321              :      * Create state in aggregate context.  It'll stay there across subsequent
    4322              :      * calls.
    4323              :      */
    4324         1631 :     oldcontext = MemoryContextSwitchTo(aggcontext);
    4325         1631 :     state = makeStringInfo();
    4326         1631 :     MemoryContextSwitchTo(oldcontext);
    4327              : 
    4328         1631 :     return state;
    4329              : }
    4330              : 
    4331              : Datum
    4332       621931 : string_agg_transfn(PG_FUNCTION_ARGS)
    4333              : {
    4334              :     StringInfo  state;
    4335              : 
    4336       621931 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    4337              : 
    4338              :     /* Append the value unless null, preceding it with the delimiter. */
    4339       621931 :     if (!PG_ARGISNULL(1))
    4340              :     {
    4341       611899 :         text       *value = PG_GETARG_TEXT_PP(1);
    4342       611899 :         bool        isfirst = false;
    4343              : 
    4344              :         /*
    4345              :          * You might think we can just throw away the first delimiter, however
    4346              :          * we must keep it as we may be a parallel worker doing partial
    4347              :          * aggregation building a state to send to the main process.  We need
    4348              :          * to keep the delimiter of every aggregation so that the combine
    4349              :          * function can properly join up the strings of two separately
    4350              :          * partially aggregated results.  The first delimiter is only stripped
    4351              :          * off in the final function.  To know how much to strip off the front
    4352              :          * of the string, we store the length of the first delimiter in the
    4353              :          * StringInfo's cursor field, which we don't otherwise need here.
    4354              :          */
    4355       611899 :         if (state == NULL)
    4356              :         {
    4357         1411 :             state = makeStringAggState(fcinfo);
    4358         1411 :             isfirst = true;
    4359              :         }
    4360              : 
    4361       611899 :         if (!PG_ARGISNULL(2))
    4362              :         {
    4363       611899 :             text       *delim = PG_GETARG_TEXT_PP(2);
    4364              : 
    4365       611899 :             appendStringInfoText(state, delim);
    4366       611899 :             if (isfirst)
    4367         1411 :                 state->cursor = VARSIZE_ANY_EXHDR(delim);
    4368              :         }
    4369              : 
    4370       611899 :         appendStringInfoText(state, value);
    4371              :     }
    4372              : 
    4373              :     /*
    4374              :      * The transition type for string_agg() is declared to be "internal",
    4375              :      * which is a pass-by-value type the same size as a pointer.
    4376              :      */
    4377       621931 :     if (state)
    4378       621873 :         PG_RETURN_POINTER(state);
    4379           58 :     PG_RETURN_NULL();
    4380              : }
    4381              : 
    4382              : /*
    4383              :  * string_agg_combine
    4384              :  *      Aggregate combine function for string_agg(text) and string_agg(bytea)
    4385              :  */
    4386              : Datum
    4387          140 : string_agg_combine(PG_FUNCTION_ARGS)
    4388              : {
    4389              :     StringInfo  state1;
    4390              :     StringInfo  state2;
    4391              :     MemoryContext agg_context;
    4392              : 
    4393          140 :     if (!AggCheckCallContext(fcinfo, &agg_context))
    4394            0 :         elog(ERROR, "aggregate function called in non-aggregate context");
    4395              : 
    4396          140 :     state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    4397          140 :     state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
    4398              : 
    4399          140 :     if (state2 == NULL)
    4400              :     {
    4401              :         /*
    4402              :          * NULL state2 is easy, just return state1, which we know is already
    4403              :          * in the agg_context
    4404              :          */
    4405            0 :         if (state1 == NULL)
    4406            0 :             PG_RETURN_NULL();
    4407            0 :         PG_RETURN_POINTER(state1);
    4408              :     }
    4409              : 
    4410          140 :     if (state1 == NULL)
    4411              :     {
    4412              :         /* We must copy state2's data into the agg_context */
    4413              :         MemoryContext old_context;
    4414              : 
    4415           80 :         old_context = MemoryContextSwitchTo(agg_context);
    4416           80 :         state1 = makeStringAggState(fcinfo);
    4417           80 :         appendBinaryStringInfo(state1, state2->data, state2->len);
    4418           80 :         state1->cursor = state2->cursor;
    4419           80 :         MemoryContextSwitchTo(old_context);
    4420              :     }
    4421           60 :     else if (state2->len > 0)
    4422              :     {
    4423              :         /* Combine ... state1->cursor does not change in this case */
    4424           60 :         appendBinaryStringInfo(state1, state2->data, state2->len);
    4425              :     }
    4426              : 
    4427          140 :     PG_RETURN_POINTER(state1);
    4428              : }
    4429              : 
    4430              : /*
    4431              :  * string_agg_serialize
    4432              :  *      Aggregate serialize function for string_agg(text) and string_agg(bytea)
    4433              :  *
    4434              :  * This is strict, so we need not handle NULL input
    4435              :  */
    4436              : Datum
    4437          140 : string_agg_serialize(PG_FUNCTION_ARGS)
    4438              : {
    4439              :     StringInfo  state;
    4440              :     StringInfoData buf;
    4441              :     bytea      *result;
    4442              : 
    4443              :     /* cannot be called directly because of internal-type argument */
    4444              :     Assert(AggCheckCallContext(fcinfo, NULL));
    4445              : 
    4446          140 :     state = (StringInfo) PG_GETARG_POINTER(0);
    4447              : 
    4448          140 :     pq_begintypsend(&buf);
    4449              : 
    4450              :     /* cursor */
    4451          140 :     pq_sendint(&buf, state->cursor, 4);
    4452              : 
    4453              :     /* data */
    4454          140 :     pq_sendbytes(&buf, state->data, state->len);
    4455              : 
    4456          140 :     result = pq_endtypsend(&buf);
    4457              : 
    4458          140 :     PG_RETURN_BYTEA_P(result);
    4459              : }
    4460              : 
    4461              : /*
    4462              :  * string_agg_deserialize
    4463              :  *      Aggregate deserial function for string_agg(text) and string_agg(bytea)
    4464              :  *
    4465              :  * This is strict, so we need not handle NULL input
    4466              :  */
    4467              : Datum
    4468          140 : string_agg_deserialize(PG_FUNCTION_ARGS)
    4469              : {
    4470              :     bytea      *sstate;
    4471              :     StringInfo  result;
    4472              :     StringInfoData buf;
    4473              :     char       *data;
    4474              :     int         datalen;
    4475              : 
    4476              :     /* cannot be called directly because of internal-type argument */
    4477              :     Assert(AggCheckCallContext(fcinfo, NULL));
    4478              : 
    4479          140 :     sstate = PG_GETARG_BYTEA_PP(0);
    4480              : 
    4481              :     /*
    4482              :      * Initialize a StringInfo so that we can "receive" it using the standard
    4483              :      * recv-function infrastructure.
    4484              :      */
    4485          140 :     initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
    4486          140 :                            VARSIZE_ANY_EXHDR(sstate));
    4487              : 
    4488          140 :     result = makeStringAggState(fcinfo);
    4489              : 
    4490              :     /* cursor */
    4491          140 :     result->cursor = pq_getmsgint(&buf, 4);
    4492              : 
    4493              :     /* data */
    4494          140 :     datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
    4495          140 :     data = (char *) pq_getmsgbytes(&buf, datalen);
    4496          140 :     appendBinaryStringInfo(result, data, datalen);
    4497              : 
    4498          140 :     pq_getmsgend(&buf);
    4499              : 
    4500          140 :     PG_RETURN_POINTER(result);
    4501              : }
    4502              : 
    4503              : Datum
    4504         1437 : string_agg_finalfn(PG_FUNCTION_ARGS)
    4505              : {
    4506              :     StringInfo  state;
    4507              : 
    4508              :     /* cannot be called directly because of internal-type argument */
    4509              :     Assert(AggCheckCallContext(fcinfo, NULL));
    4510              : 
    4511         1437 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    4512              : 
    4513         1437 :     if (state != NULL)
    4514              :     {
    4515              :         /* As per comment in transfn, strip data before the cursor position */
    4516         1381 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
    4517              :                                                   state->len - state->cursor));
    4518              :     }
    4519              :     else
    4520           56 :         PG_RETURN_NULL();
    4521              : }
    4522              : 
    4523              : /*
    4524              :  * Prepare cache with fmgr info for the output functions of the datatypes of
    4525              :  * the arguments of a concat-like function, beginning with argument "argidx".
    4526              :  * (Arguments before that will have corresponding slots in the resulting
    4527              :  * FmgrInfo array, but we don't fill those slots.)
    4528              :  */
    4529              : static FmgrInfo *
    4530           90 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
    4531              : {
    4532              :     FmgrInfo   *foutcache;
    4533              :     int         i;
    4534              : 
    4535              :     /* We keep the info in fn_mcxt so it survives across calls */
    4536           90 :     foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4537           90 :                                                 PG_NARGS() * sizeof(FmgrInfo));
    4538              : 
    4539          328 :     for (i = argidx; i < PG_NARGS(); i++)
    4540              :     {
    4541              :         Oid         valtype;
    4542              :         Oid         typOutput;
    4543              :         bool        typIsVarlena;
    4544              : 
    4545          238 :         valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
    4546          238 :         if (!OidIsValid(valtype))
    4547            0 :             elog(ERROR, "could not determine data type of concat() input");
    4548              : 
    4549          238 :         getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
    4550          238 :         fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
    4551              :     }
    4552              : 
    4553           90 :     fcinfo->flinfo->fn_extra = foutcache;
    4554              : 
    4555           90 :     return foutcache;
    4556              : }
    4557              : 
    4558              : /*
    4559              :  * Implementation of both concat() and concat_ws().
    4560              :  *
    4561              :  * sepstr is the separator string to place between values.
    4562              :  * argidx identifies the first argument to concatenate (counting from zero);
    4563              :  * note that this must be constant across any one series of calls.
    4564              :  *
    4565              :  * Returns NULL if result should be NULL, else text value.
    4566              :  */
    4567              : static text *
    4568          195 : concat_internal(const char *sepstr, int argidx,
    4569              :                 FunctionCallInfo fcinfo)
    4570              : {
    4571              :     text       *result;
    4572              :     StringInfoData str;
    4573              :     FmgrInfo   *foutcache;
    4574          195 :     bool        first_arg = true;
    4575              :     int         i;
    4576              : 
    4577              :     /*
    4578              :      * concat(VARIADIC some-array) is essentially equivalent to
    4579              :      * array_to_text(), ie concat the array elements with the given separator.
    4580              :      * So we just pass the case off to that code.
    4581              :      */
    4582          195 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    4583              :     {
    4584              :         ArrayType  *arr;
    4585              : 
    4586              :         /* Should have just the one argument */
    4587              :         Assert(argidx == PG_NARGS() - 1);
    4588              : 
    4589              :         /* concat(VARIADIC NULL) is defined as NULL */
    4590           20 :         if (PG_ARGISNULL(argidx))
    4591            8 :             return NULL;
    4592              : 
    4593              :         /*
    4594              :          * Non-null argument had better be an array.  We assume that any call
    4595              :          * context that could let get_fn_expr_variadic return true will have
    4596              :          * checked that a VARIADIC-labeled parameter actually is an array.  So
    4597              :          * it should be okay to just Assert that it's an array rather than
    4598              :          * doing a full-fledged error check.
    4599              :          */
    4600              :         Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
    4601              : 
    4602              :         /* OK, safe to fetch the array value */
    4603           12 :         arr = PG_GETARG_ARRAYTYPE_P(argidx);
    4604              : 
    4605              :         /*
    4606              :          * And serialize the array.  We tell array_to_text to ignore null
    4607              :          * elements, which matches the behavior of the loop below.
    4608              :          */
    4609           12 :         return array_to_text_internal(fcinfo, arr, sepstr, NULL);
    4610              :     }
    4611              : 
    4612              :     /* Normal case without explicit VARIADIC marker */
    4613          175 :     initStringInfo(&str);
    4614              : 
    4615              :     /* Get output function info, building it if first time through */
    4616          175 :     foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
    4617          175 :     if (foutcache == NULL)
    4618           90 :         foutcache = build_concat_foutcache(fcinfo, argidx);
    4619              : 
    4620          608 :     for (i = argidx; i < PG_NARGS(); i++)
    4621              :     {
    4622          433 :         if (!PG_ARGISNULL(i))
    4623              :         {
    4624          381 :             Datum       value = PG_GETARG_DATUM(i);
    4625              : 
    4626              :             /* add separator if appropriate */
    4627          381 :             if (first_arg)
    4628          171 :                 first_arg = false;
    4629              :             else
    4630          210 :                 appendStringInfoString(&str, sepstr);
    4631              : 
    4632              :             /* call the appropriate type output function, append the result */
    4633          381 :             appendStringInfoString(&str,
    4634          381 :                                    OutputFunctionCall(&foutcache[i], value));
    4635              :         }
    4636              :     }
    4637              : 
    4638          175 :     result = cstring_to_text_with_len(str.data, str.len);
    4639          175 :     pfree(str.data);
    4640              : 
    4641          175 :     return result;
    4642              : }
    4643              : 
    4644              : /*
    4645              :  * Concatenate all arguments. NULL arguments are ignored.
    4646              :  */
    4647              : Datum
    4648          143 : text_concat(PG_FUNCTION_ARGS)
    4649              : {
    4650              :     text       *result;
    4651              : 
    4652          143 :     result = concat_internal("", 0, fcinfo);
    4653          143 :     if (result == NULL)
    4654            4 :         PG_RETURN_NULL();
    4655          139 :     PG_RETURN_TEXT_P(result);
    4656              : }
    4657              : 
    4658              : /*
    4659              :  * Concatenate all but first argument value with separators. The first
    4660              :  * parameter is used as the separator. NULL arguments are ignored.
    4661              :  */
    4662              : Datum
    4663           56 : text_concat_ws(PG_FUNCTION_ARGS)
    4664              : {
    4665              :     char       *sep;
    4666              :     text       *result;
    4667              : 
    4668              :     /* return NULL when separator is NULL */
    4669           56 :     if (PG_ARGISNULL(0))
    4670            4 :         PG_RETURN_NULL();
    4671           52 :     sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
    4672              : 
    4673           52 :     result = concat_internal(sep, 1, fcinfo);
    4674           52 :     if (result == NULL)
    4675            4 :         PG_RETURN_NULL();
    4676           48 :     PG_RETURN_TEXT_P(result);
    4677              : }
    4678              : 
    4679              : /*
    4680              :  * Return first n characters in the string. When n is negative,
    4681              :  * return all but last |n| characters.
    4682              :  */
    4683              : Datum
    4684         1432 : text_left(PG_FUNCTION_ARGS)
    4685              : {
    4686         1432 :     int         n = PG_GETARG_INT32(1);
    4687              : 
    4688         1432 :     if (n < 0)
    4689              :     {
    4690           20 :         text       *str = PG_GETARG_TEXT_PP(0);
    4691           20 :         const char *p = VARDATA_ANY(str);
    4692           20 :         int         len = VARSIZE_ANY_EXHDR(str);
    4693              :         int         rlen;
    4694              : 
    4695           20 :         n = pg_mbstrlen_with_len(p, len) + n;
    4696           20 :         rlen = pg_mbcharcliplen(p, len, n);
    4697           20 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
    4698              :     }
    4699              :     else
    4700         1412 :         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
    4701              : }
    4702              : 
    4703              : /*
    4704              :  * Return last n characters in the string. When n is negative,
    4705              :  * return all but first |n| characters (n = INT_MIN therefore drops every character).
    4706              :  */
    4707              : Datum
    4708           44 : text_right(PG_FUNCTION_ARGS)
    4709              : {
    4710           44 :     text       *str = PG_GETARG_TEXT_PP(0);
    4711           44 :     const char *p = VARDATA_ANY(str);
    4712           44 :     int         len = VARSIZE_ANY_EXHDR(str);
    4713           44 :     int         n = PG_GETARG_INT32(1);
    4714              :     int         off;            /* byte offset at which the result starts */
    4715              : 
    4716           44 :     if (n < 0)
    4717           20 :         n = (n == INT_MIN) ? INT_MAX : -n;  /* -INT_MIN is not representable; clamp so everything is skipped */
    4718              :     else
    4719           24 :         n = pg_mbstrlen_with_len(p, len) - n;   /* number of leading characters to skip */
    4720           44 :     off = pg_mbcharcliplen(p, len, n);
    4721              : 
    4722           44 :     PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
    4723              : }
    4724              : 
    4725              : /*
    4726              :  * Return reversed string: characters come out in reverse order, each multibyte character kept intact
    4727              :  */
    4728              : Datum
    4729           29 : text_reverse(PG_FUNCTION_ARGS)
    4730              : {
    4731           29 :     text       *str = PG_GETARG_TEXT_PP(0);
    4732           29 :     const char *p = VARDATA_ANY(str);
    4733           29 :     int         len = VARSIZE_ANY_EXHDR(str);
    4734           29 :     const char *endp = p + len;
    4735              :     text       *result;
    4736              :     char       *dst;
    4737              : 
    4738           29 :     result = palloc(len + VARHDRSZ);
    4739           29 :     dst = (char *) VARDATA(result) + len;   /* start just past the output data; it is filled back to front */
    4740           29 :     SET_VARSIZE(result, len + VARHDRSZ);
    4741              : 
    4742           29 :     if (pg_database_encoding_max_length() > 1)
    4743              :     {
    4744              :         /* multibyte version */
    4745          222 :         while (p < endp)
    4746              :         {
    4747              :             int         sz;
    4748              : 
    4749          197 :             sz = pg_mblen_range(p, endp);   /* byte length of the character at p (presumably bounded by endp) */
    4750          193 :             dst -= sz;
    4751          193 :             memcpy(dst, p, sz);     /* bytes inside one character keep their order */
    4752          193 :             p += sz;
    4753              :         }
    4754              :     }
    4755              :     else
    4756              :     {
    4757              :         /* single byte version */
    4758            0 :         while (p < endp)
    4759            0 :             *(--dst) = *p++;
    4760              :     }
    4761              : 
    4762           25 :     PG_RETURN_TEXT_P(result);
    4763              : }
    4764              : 
    4765              : 
    4766              : /*
    4767              :  * Support macros for text_format(): a flag bit and a bounds-checked pointer advance
    4768              :  */
    4769              : #define TEXT_FORMAT_FLAG_MINUS  0x0001  /* set when a '-' flag was parsed; requests left-justified output */
    4770              : /* Advance ptr by one byte; raise an ERROR if that reaches end_ptr (specifier cut short). */
    4771              : #define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
    4772              :     do { \
    4773              :         if (++(ptr) >= (end_ptr)) \
    4774              :             ereport(ERROR, \
    4775              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
    4776              :                      errmsg("unterminated format() type specifier"), \
    4777              :                      errhint("For a single \"%%\" use \"%%%%\"."))); \
    4778              :     } while (0)
    4779              : 
    4780              : /*
    4781              :  * Returns a formatted string: expands %s, %I, %L and %% in the format string (argument 0); NULL format gives NULL
    4782              :  */
    4783              : Datum
    4784        22149 : text_format(PG_FUNCTION_ARGS)
    4785              : {
    4786              :     text       *fmt;
    4787              :     StringInfoData str;         /* output accumulated so far */
    4788              :     const char *cp;             /* current scan position in the format string */
    4789              :     const char *start_ptr;
    4790              :     const char *end_ptr;        /* one past the last byte of the format string */
    4791              :     text       *result;
    4792              :     int         arg;
    4793              :     bool        funcvariadic;   /* true if values come from a VARIADIC array */
    4794              :     int         nargs;          /* argument count, including the format string itself */
    4795        22149 :     Datum      *elements = NULL;    /* VARIADIC array contents, once deconstructed */
    4796        22149 :     bool       *nulls = NULL;
    4797        22149 :     Oid         element_type = InvalidOid;
    4798        22149 :     Oid         prev_type = InvalidOid; /* type whose output function is cached in typoutputfinfo */
    4799        22149 :     Oid         prev_width_type = InvalidOid;   /* same, for typoutputinfo_width */
    4800              :     FmgrInfo    typoutputfinfo;
    4801              :     FmgrInfo    typoutputinfo_width;
    4802              : 
    4803              :     /* When format string is null, immediately return null */
    4804        22149 :     if (PG_ARGISNULL(0))
    4805            4 :         PG_RETURN_NULL();
    4806              : 
    4807              :     /* If argument is marked VARIADIC, expand array into elements */
    4808        22145 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    4809              :     {
    4810              :         ArrayType  *arr;
    4811              :         int16       elmlen;
    4812              :         bool        elmbyval;
    4813              :         char        elmalign;
    4814              :         int         nitems;
    4815              : 
    4816              :         /* Should have just the one argument */
    4817              :         Assert(PG_NARGS() == 2);
    4818              : 
    4819              :         /* If argument is NULL, we treat it as zero-length array */
    4820           32 :         if (PG_ARGISNULL(1))
    4821            4 :             nitems = 0;
    4822              :         else
    4823              :         {
    4824              :             /*
    4825              :              * Non-null argument had better be an array.  We assume that any
    4826              :              * call context that could let get_fn_expr_variadic return true
    4827              :              * will have checked that a VARIADIC-labeled parameter actually is
    4828              :              * an array.  So it should be okay to just Assert that it's an
    4829              :              * array rather than doing a full-fledged error check.
    4830              :              */
    4831              :             Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
    4832              : 
    4833              :             /* OK, safe to fetch the array value */
    4834           28 :             arr = PG_GETARG_ARRAYTYPE_P(1);
    4835              : 
    4836              :             /* Get info about array element type */
    4837           28 :             element_type = ARR_ELEMTYPE(arr);
    4838           28 :             get_typlenbyvalalign(element_type,
    4839              :                                  &elmlen, &elmbyval, &elmalign);
    4840              : 
    4841              :             /* Extract all array elements */
    4842           28 :             deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
    4843              :                               &elements, &nulls, &nitems);
    4844              :         }
    4845              : 
    4846           32 :         nargs = nitems + 1;     /* +1 so positions line up with the non-variadic case (slot 0 = format) */
    4847           32 :         funcvariadic = true;
    4848              :     }
    4849              :     else
    4850              :     {
    4851              :         /* Non-variadic case, we'll process the arguments individually */
    4852        22113 :         nargs = PG_NARGS();
    4853        22113 :         funcvariadic = false;
    4854              :     }
    4855              : 
    4856              :     /* Setup for main loop. */
    4857        22145 :     fmt = PG_GETARG_TEXT_PP(0);
    4858        22145 :     start_ptr = VARDATA_ANY(fmt);
    4859        22145 :     end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
    4860        22145 :     initStringInfo(&str);
    4861        22145 :     arg = 1;                    /* next argument position to print */
    4862              : 
    4863              :     /* Scan format string, looking for conversion specifiers. */
    4864       705222 :     for (cp = start_ptr; cp < end_ptr; cp++)
    4865              :     {
    4866              :         int         argpos;
    4867              :         int         widthpos;
    4868              :         int         flags;
    4869              :         int         width;
    4870              :         Datum       value;
    4871              :         bool        isNull;
    4872              :         Oid         typid;
    4873              : 
    4874              :         /*
    4875              :          * If it's not the start of a conversion specifier, just copy it to
    4876              :          * the output buffer.
    4877              :          */
    4878       683117 :         if (*cp != '%')
    4879              :         {
    4880       637451 :             appendStringInfoCharMacro(&str, *cp);
    4881       637463 :             continue;
    4882              :         }
    4883              : 
    4884        45666 :         ADVANCE_PARSE_POINTER(cp, end_ptr);     /* step past '%'; ERROR if nothing follows it */
    4885              : 
    4886              :         /* Easy case: %% outputs a single % */
    4887        45666 :         if (*cp == '%')
    4888              :         {
    4889           12 :             appendStringInfoCharMacro(&str, *cp);
    4890           12 :             continue;
    4891              :         }
    4892              : 
    4893              :         /* Parse the optional portions of the format specifier */
    4894        45654 :         cp = text_format_parse_format(cp, end_ptr,
    4895              :                                       &argpos, &widthpos,
    4896              :                                       &flags, &width);
    4897              : 
    4898              :         /*
    4899              :          * Next we should see the main conversion specifier.  Whether or not
    4900              :          * an argument position was present, it's known that at least one
    4901              :          * character remains in the string at this point.  Experience suggests
    4902              :          * that it's worth checking that that character is one of the expected
    4903              :          * ones before we try to fetch arguments, so as to produce the least
    4904              :          * confusing response to a mis-formatted specifier.
    4905              :          */
    4906        45638 :         if (strchr("sIL", *cp) == NULL)
    4907            4 :             ereport(ERROR,
    4908              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4909              :                      errmsg("unrecognized format() type specifier \"%.*s\"",
    4910              :                             pg_mblen_range(cp, end_ptr), cp),
    4911              :                      errhint("For a single \"%%\" use \"%%%%\".")));
    4912              : 
    4913              :         /* If indirect width was specified, get its value */
    4914        45634 :         if (widthpos >= 0)
    4915              :         {
    4916              :             /* Collect the specified or next argument position */
    4917           28 :             if (widthpos > 0)
    4918           24 :                 arg = widthpos;
    4919           28 :             if (arg >= nargs)
    4920            0 :                 ereport(ERROR,
    4921              :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4922              :                          errmsg("too few arguments for format()")));
    4923              : 
    4924              :             /* Get the value and type of the selected argument */
    4925           28 :             if (!funcvariadic)
    4926              :             {
    4927           28 :                 value = PG_GETARG_DATUM(arg);
    4928           28 :                 isNull = PG_ARGISNULL(arg);
    4929           28 :                 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    4930              :             }
    4931              :             else
    4932              :             {
    4933            0 :                 value = elements[arg - 1];      /* array slots are 0-based, positions start at 1 */
    4934            0 :                 isNull = nulls[arg - 1];
    4935            0 :                 typid = element_type;
    4936              :             }
    4937           28 :             if (!OidIsValid(typid))
    4938            0 :                 elog(ERROR, "could not determine data type of format() input");
    4939              : 
    4940           28 :             arg++;              /* the value to print defaults to the argument after the width */
    4941              : 
    4942              :             /* We can treat NULL width the same as zero */
    4943           28 :             if (isNull)
    4944            4 :                 width = 0;
    4945           24 :             else if (typid == INT4OID)
    4946           24 :                 width = DatumGetInt32(value);
    4947            0 :             else if (typid == INT2OID)
    4948            0 :                 width = DatumGetInt16(value);
    4949              :             else
    4950              :             {
    4951              :                 /* For less-usual datatypes, convert to text then to int */
    4952              :                 char       *str;    /* NB: shadows the function-level StringInfoData "str" inside this block */
    4953              : 
    4954            0 :                 if (typid != prev_width_type)
    4955              :                 {
    4956              :                     Oid         typoutputfunc;
    4957              :                     bool        typIsVarlena;
    4958              : 
    4959            0 :                     getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    4960            0 :                     fmgr_info(typoutputfunc, &typoutputinfo_width);
    4961            0 :                     prev_width_type = typid;
    4962              :                 }
    4963              : 
    4964            0 :                 str = OutputFunctionCall(&typoutputinfo_width, value);
    4965              : 
    4966              :                 /* pg_strtoint32 will complain about bad data or overflow */
    4967            0 :                 width = pg_strtoint32(str);
    4968              : 
    4969            0 :                 pfree(str);
    4970              :             }
    4971              :         }
    4972              : 
    4973              :         /* Collect the specified or next argument position */
    4974        45634 :         if (argpos > 0)
    4975           88 :             arg = argpos;
    4976        45634 :         if (arg >= nargs)
    4977           16 :             ereport(ERROR,
    4978              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4979              :                      errmsg("too few arguments for format()")));
    4980              : 
    4981              :         /* Get the value and type of the selected argument */
    4982        45618 :         if (!funcvariadic)
    4983              :         {
    4984        44770 :             value = PG_GETARG_DATUM(arg);
    4985        44770 :             isNull = PG_ARGISNULL(arg);
    4986        44770 :             typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    4987              :         }
    4988              :         else
    4989              :         {
    4990          848 :             value = elements[arg - 1];          /* array slots are 0-based, positions start at 1 */
    4991          848 :             isNull = nulls[arg - 1];
    4992          848 :             typid = element_type;
    4993              :         }
    4994        45618 :         if (!OidIsValid(typid))
    4995            0 :             elog(ERROR, "could not determine data type of format() input");
    4996              : 
    4997        45618 :         arg++;                  /* a later specifier without a position takes the following argument */
    4998              : 
    4999              :         /*
    5000              :          * Get the appropriate typOutput function, reusing previous one if
    5001              :          * same type as previous argument.  That's particularly useful in the
    5002              :          * variadic-array case, but often saves work even for ordinary calls.
    5003              :          */
    5004        45618 :         if (typid != prev_type)
    5005              :         {
    5006              :             Oid         typoutputfunc;
    5007              :             bool        typIsVarlena;
    5008              : 
    5009        23495 :             getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    5010        23495 :             fmgr_info(typoutputfunc, &typoutputfinfo);
    5011        23495 :             prev_type = typid;
    5012              :         }
    5013              : 
    5014              :         /*
    5015              :          * And now we can format the value.
    5016              :          */
    5017        45618 :         switch (*cp)
    5018              :         {
    5019        45618 :             case 's':
    5020              :             case 'I':
    5021              :             case 'L':
    5022        45618 :                 text_format_string_conversion(&str, *cp, &typoutputfinfo,
    5023              :                                               value, isNull,
    5024              :                                               flags, width);
    5025        45614 :                 break;
    5026            0 :             default:
    5027              :                 /* should not get here, because of previous check */
    5028            0 :                 ereport(ERROR,
    5029              :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5030              :                          errmsg("unrecognized format() type specifier \"%.*s\"",
    5031              :                                 pg_mblen_range(cp, end_ptr), cp),
    5032              :                          errhint("For a single \"%%\" use \"%%%%\".")));
    5033              :                 break;
    5034              :         }
    5035              :     }
    5036              : 
    5037              :     /* Don't need deconstruct_array results anymore. */
    5038        22105 :     if (elements != NULL)
    5039           28 :         pfree(elements);
    5040        22105 :     if (nulls != NULL)
    5041           28 :         pfree(nulls);
    5042              : 
    5043              :     /* Generate results. */
    5044        22105 :     result = cstring_to_text_with_len(str.data, str.len);
    5045        22105 :     pfree(str.data);
    5046              : 
    5047        22105 :     PG_RETURN_TEXT_P(result);
    5048              : }
    5049              : 
    5050              : /*
    5051              :  * Parse contiguous digits as a decimal number.
    5052              :  *
    5053              :  * Returns true if some digits could be parsed.
    5054              :  * The value is returned into *value, and *ptr is advanced to the next
    5055              :  * character to be parsed.  An ERROR is raised if the number overflows int32.
    5056              :  *
    5057              :  * Note parsing invariant: at least one character is known available before
    5058              :  * string end (end_ptr) at entry, and this is still true at exit.
    5059              :  */
    5060              : static bool
    5061        91284 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
    5062              : {
    5063        91284 :     bool        found = false;
    5064        91284 :     const char *cp = *ptr;
    5065        91284 :     int         val = 0;
    5066              : 
    5067        91492 :     while (*cp >= '0' && *cp <= '9')
    5068              :     {
    5069          212 :         int8        digit = (*cp - '0');
    5070              : 
    5071          212 :         if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
    5072          212 :             unlikely(pg_add_s32_overflow(val, digit, &val)))
    5073            0 :             ereport(ERROR,
    5074              :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    5075              :                      errmsg("number is out of range")));
    5076          212 :         ADVANCE_PARSE_POINTER(cp, end_ptr);     /* ERROR if the digits run to the end of the string */
    5077          208 :         found = true;
    5078              :     }
    5079              : 
    5080        91280 :     *ptr = cp;                  /* written even when no digit was found (cp unchanged, val 0) */
    5081        91280 :     *value = val;
    5082              : 
    5083        91280 :     return found;
    5084              : }
    5085              : 
    5086              : /*
    5087              :  * Parse a format specifier (generally following the SUS printf spec).
    5088              :  *
    5089              :  * We have already advanced over the initial '%', and we are looking for
    5090              :  * [argpos][flags][width]type (but the type character is not consumed here).
    5091              :  *
    5092              :  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
    5093              :  * Output parameters:
    5094              :  *  argpos: argument position for value to be printed.  -1 means unspecified.
    5095              :  *  widthpos: argument position for width.  Zero means the argument position
    5096              :  *          was unspecified (ie, take the next arg) and -1 means no width
    5097              :  *          argument (width was omitted or specified as a constant).
    5098              :  *  flags: bitmask of flags (currently only TEXT_FORMAT_FLAG_MINUS).
    5099              :  *  width: directly-specified width value.  Zero means the width was omitted
    5100              :  *          (note it's not necessary to distinguish this case from an explicit
    5101              :  *          zero width value).
    5102              :  *
    5103              :  * The function result is the next character position to be parsed, ie, the
    5104              :  * location where the type character is/should be.
    5105              :  *
    5106              :  * Note parsing invariant: at least one character is known available before
    5107              :  * string end (end_ptr) at entry, and this is still true at exit.
    5108              :  */
    5109              : static const char *
    5110        45654 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
    5111              :                          int *argpos, int *widthpos,
    5112              :                          int *flags, int *width)
    5113              : {
    5114        45654 :     const char *cp = start_ptr;
    5115              :     int         n;
    5116              : 
    5117              :     /* set defaults for output parameters */
    5118        45654 :     *argpos = -1;
    5119        45654 :     *widthpos = -1;
    5120        45654 :     *flags = 0;
    5121        45654 :     *width = 0;
    5122              : 
    5123              :     /* try to identify first number */
    5124        45654 :     if (text_format_parse_digits(&cp, end_ptr, &n))
    5125              :     {
    5126          116 :         if (*cp != '$')
    5127              :         {
    5128              :             /* Must be just a width and a type, so we're done */
    5129           16 :             *width = n;
    5130           16 :             return cp;
    5131              :         }
    5132              :         /* The number was argument position */
    5133          100 :         *argpos = n;
    5134              :         /* Explicit 0 for argument index is immediately refused */
    5135          100 :         if (n == 0)
    5136            4 :             ereport(ERROR,
    5137              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5138              :                      errmsg("format specifies argument 0, but arguments are numbered from 1")));
    5139           96 :         ADVANCE_PARSE_POINTER(cp, end_ptr);     /* step past '$' */
    5140              :     }
    5141              : 
    5142              :     /* Handle flags (only minus is supported now) */
    5143        45650 :     while (*cp == '-')          /* a repeated '-' is accepted and has no further effect */
    5144              :     {
    5145           20 :         *flags |= TEXT_FORMAT_FLAG_MINUS;
    5146           20 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5147              :     }
    5148              : 
    5149        45630 :     if (*cp == '*')
    5150              :     {
    5151              :         /* Handle indirect width */
    5152           32 :         ADVANCE_PARSE_POINTER(cp, end_ptr);     /* step past '*' */
    5153           32 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    5154              :         {
    5155              :             /* number in this position must be closed by $ */
    5156           28 :             if (*cp != '$')
    5157            0 :                 ereport(ERROR,
    5158              :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5159              :                          errmsg("width argument position must be ended by \"$\"")));
    5160              :             /* The number was width argument position */
    5161           28 :             *widthpos = n;
    5162              :             /* Explicit 0 for argument index is immediately refused */
    5163           28 :             if (n == 0)
    5164            4 :                 ereport(ERROR,
    5165              :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5166              :                          errmsg("format specifies argument 0, but arguments are numbered from 1")));
    5167           24 :             ADVANCE_PARSE_POINTER(cp, end_ptr); /* step past '$' */
    5168              :         }
    5169              :         else
    5170            4 :             *widthpos = 0;      /* width's argument position is unspecified */
    5171              :     }
    5172              :     else
    5173              :     {
    5174              :         /* Check for direct width specification */
    5175        45598 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    5176           20 :             *width = n;
    5177              :     }
    5178              : 
    5179              :     /* cp should now be pointing at type character */
    5180        45622 :     return cp;
    5181              : }
    5182              : 
    5183              : /*
    5184              :  * Format a %s, %I, or %L conversion of one argument value and append the result to buf
    5185              :  */
    5186              : static void
    5187        45618 : text_format_string_conversion(StringInfo buf, char conversion,
    5188              :                               FmgrInfo *typOutputInfo,
    5189              :                               Datum value, bool isNull,
    5190              :                               int flags, int width)
    5191              : {
    5192              :     char       *str;
    5193              : 
    5194              :     /* Handle NULL arguments before trying to stringify the value. */
    5195        45618 :     if (isNull)
    5196              :     {
    5197          228 :         if (conversion == 's')
    5198          180 :             text_format_append_string(buf, "", flags, width);   /* empty text, still padded to width */
    5199           48 :         else if (conversion == 'L')
    5200           44 :             text_format_append_string(buf, "NULL", flags, width);   /* unquoted NULL keyword */
    5201            4 :         else if (conversion == 'I')
    5202            4 :             ereport(ERROR,
    5203              :                     (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
    5204              :                      errmsg("null values cannot be formatted as an SQL identifier")));
    5205          224 :         return;
    5206              :     }
    5207              : 
    5208              :     /* Stringify. */
    5209        45390 :     str = OutputFunctionCall(typOutputInfo, value);
    5210              : 
    5211              :     /* Escape. */
    5212        45390 :     if (conversion == 'I')
    5213              :     {
    5214              :         /* quote_identifier may or may not allocate a new string, so its result is not pfree'd here. */
    5215         3264 :         text_format_append_string(buf, quote_identifier(str), flags, width);
    5216              :     }
    5217        42126 :     else if (conversion == 'L')
    5218              :     {
    5219         2168 :         char       *qstr = quote_literal_cstr(str);
    5220              : 
    5221         2168 :         text_format_append_string(buf, qstr, flags, width);
    5222              :         /* quote_literal_cstr() always allocates a new string */
    5223         2168 :         pfree(qstr);
    5224              :     }
    5225              :     else
    5226        39958 :         text_format_append_string(buf, str, flags, width);  /* %s: output text used as-is */
    5227              : 
    5228              :     /* Cleanup. */
    5229        45390 :     pfree(str);
    5230              : }
    5231              : 
    5232              : /*
    5233              :  * Append str to buf, space-padding up to width as directed by flags/width; str is never truncated
    5234              :  */
    5235              : static void
    5236        45614 : text_format_append_string(StringInfo buf, const char *str,
    5237              :                           int flags, int width)
    5238              : {
    5239        45614 :     bool        align_to_left = false;
    5240              :     int         len;
    5241              : 
    5242              :     /* fast path for typical easy case */
    5243        45614 :     if (width == 0)
    5244              :     {
    5245        45558 :         appendStringInfoString(buf, str);
    5246        45558 :         return;
    5247              :     }
    5248              : 
    5249           56 :     if (width < 0)
    5250              :     {
    5251              :         /* Negative width: implicit '-' flag, then take absolute value */
    5252            4 :         align_to_left = true;
    5253              :         /* -INT_MIN is undefined */
    5254            4 :         if (width <= INT_MIN)
    5255            0 :             ereport(ERROR,
    5256              :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    5257              :                      errmsg("number is out of range")));
    5258            4 :         width = -width;
    5259              :     }
    5260           52 :     else if (flags & TEXT_FORMAT_FLAG_MINUS)
    5261           16 :         align_to_left = true;
    5262              : 
    5263           56 :     len = pg_mbstrlen(str);     /* presumably a character count, so width is compared in characters */
    5264           56 :     if (align_to_left)
    5265              :     {
    5266              :         /* left justify */
    5267           20 :         appendStringInfoString(buf, str);
    5268           20 :         if (len < width)
    5269           20 :             appendStringInfoSpaces(buf, width - len);
    5270              :     }
    5271              :     else
    5272              :     {
    5273              :         /* right justify */
    5274           36 :         if (len < width)
    5275           36 :             appendStringInfoSpaces(buf, width - len);
    5276           36 :         appendStringInfoString(buf, str);
    5277              :     }
    5278              : }
    5279              : 
    5280              : /*
    5281              :  * text_format_nv - nonvariadic wrapper for text_format function.
    5282              :  *
    5283              :  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
    5284              :  * which checks that all built-in functions that share the implementing C
    5285              :  * function take the same number of arguments.
    5286              :  */
    5287              : Datum
    5288         1910 : text_format_nv(PG_FUNCTION_ARGS)
    5289              : {
    5290         1910 :     return text_format(fcinfo);     /* same call info passed straight through; no work done here */
    5291              : }
    5292              : 
    5293              : /*
    5294              :  * Helper function for Levenshtein distance functions. Faster than memcmp(),
    5295              :  * for this use case.
    5296              :  */
    5297              : static inline bool
    5298            0 : rest_of_char_same(const char *s1, const char *s2, int len)
    5299              : {
    5300            0 :     while (len > 0)
    5301              :     {
    5302            0 :         len--;
    5303            0 :         if (s1[len] != s2[len])
    5304            0 :             return false;
    5305              :     }
    5306            0 :     return true;
    5307              : }
    5308              : 
    5309              : /* Expand each Levenshtein distance variant */
    5310              : #include "levenshtein.c"
    5311              : #define LEVENSHTEIN_LESS_EQUAL
    5312              : #include "levenshtein.c"
    5313              : 
    5314              : 
    5315              : /*
    5316              :  * The following *ClosestMatch() functions can be used to determine whether a
    5317              :  * user-provided string resembles any known valid values, which is useful for
    5318              :  * providing hints in log messages, among other things.  Use these functions
    5319              :  * like so:
    5320              :  *
    5321              :  *      initClosestMatch(&state, source_string, max_distance);
    5322              :  *
    5323              :  *      for (int i = 0; i < num_valid_strings; i++)
    5324              :  *          updateClosestMatch(&state, valid_strings[i]);
    5325              :  *
    5326              :  *      closestMatch = getClosestMatch(&state);
    5327              :  */
    5328              : 
    5329              : /*
    5330              :  * Initialize the given state with the source string and maximum Levenshtein
    5331              :  * distance to consider.
    5332              :  */
    5333              : void
    5334           45 : initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
    5335              : {
    5336              :     Assert(state);
    5337              :     Assert(max_d >= 0);
    5338              : 
    5339           45 :     state->source = source;
    5340           45 :     state->min_d = -1;
    5341           45 :     state->max_d = max_d;
    5342           45 :     state->match = NULL;
    5343           45 : }
    5344              : 
    5345              : /*
    5346              :  * If the candidate string is a closer match than the current one saved (or
    5347              :  * there is no match saved), save it as the closest match.
    5348              :  *
    5349              :  * If the source or candidate string is NULL, empty, or too long, this function
    5350              :  * takes no action.  Likewise, if the Levenshtein distance exceeds the maximum
    5351              :  * allowed or more than half the characters are different, no action is taken.
    5352              :  */
    5353              : void
    5354          435 : updateClosestMatch(ClosestMatchState *state, const char *candidate)
    5355              : {
    5356              :     int         dist;
    5357              : 
    5358              :     Assert(state);
    5359              : 
    5360          435 :     if (state->source == NULL || state->source[0] == '\0' ||
    5361          435 :         candidate == NULL || candidate[0] == '\0')
    5362            0 :         return;
    5363              : 
    5364              :     /*
    5365              :      * To avoid ERROR-ing, we check the lengths here instead of setting
    5366              :      * 'trusted' to false in the call to varstr_levenshtein_less_equal().
    5367              :      */
    5368          435 :     if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
    5369          435 :         strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
    5370            0 :         return;
    5371              : 
    5372          435 :     dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
    5373          435 :                                          candidate, strlen(candidate), 1, 1, 1,
    5374              :                                          state->max_d, true);
    5375          435 :     if (dist <= state->max_d &&
    5376           39 :         dist <= strlen(state->source) / 2 &&
    5377            9 :         (state->min_d == -1 || dist < state->min_d))
    5378              :     {
    5379            9 :         state->min_d = dist;
    5380            9 :         state->match = candidate;
    5381              :     }
    5382              : }
    5383              : 
    5384              : /*
    5385              :  * Return the closest match.  If no suitable candidates were provided via
    5386              :  * updateClosestMatch(), return NULL.
    5387              :  */
    5388              : const char *
    5389           45 : getClosestMatch(ClosestMatchState *state)
    5390              : {
    5391              :     Assert(state);
    5392              : 
    5393           45 :     return state->match;
    5394              : }
    5395              : 
    5396              : 
    5397              : /*
    5398              :  * Unicode support
    5399              :  */
    5400              : 
    5401              : static UnicodeNormalizationForm
    5402          149 : unicode_norm_form_from_string(const char *formstr)
    5403              : {
    5404          149 :     UnicodeNormalizationForm form = -1;
    5405              : 
    5406              :     /*
    5407              :      * Might as well check this while we're here.
    5408              :      */
    5409          149 :     if (GetDatabaseEncoding() != PG_UTF8)
    5410            0 :         ereport(ERROR,
    5411              :                 (errcode(ERRCODE_SYNTAX_ERROR),
    5412              :                  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
    5413              : 
    5414          149 :     if (pg_strcasecmp(formstr, "NFC") == 0)
    5415           50 :         form = UNICODE_NFC;
    5416           99 :     else if (pg_strcasecmp(formstr, "NFD") == 0)
    5417           41 :         form = UNICODE_NFD;
    5418           58 :     else if (pg_strcasecmp(formstr, "NFKC") == 0)
    5419           25 :         form = UNICODE_NFKC;
    5420           33 :     else if (pg_strcasecmp(formstr, "NFKD") == 0)
    5421           25 :         form = UNICODE_NFKD;
    5422              :     else
    5423            8 :         ereport(ERROR,
    5424              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5425              :                  errmsg("invalid normalization form: %s", formstr)));
    5426              : 
    5427          141 :     return form;
    5428              : }
    5429              : 
    5430              : /*
    5431              :  * Returns version of Unicode used by Postgres in "major.minor" format (the
    5432              :  * same format as the Unicode version reported by ICU). The third component
    5433              :  * ("update version") never involves additions to the character repertoire and
    5434              :  * is unimportant for most purposes.
    5435              :  *
    5436              :  * See: https://unicode.org/versions/
    5437              :  */
    5438              : Datum
    5439           20 : unicode_version(PG_FUNCTION_ARGS)
    5440              : {
    5441           20 :     PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
    5442              : }
    5443              : 
    5444              : /*
    5445              :  * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
    5446              :  */
    5447              : Datum
    5448            1 : icu_unicode_version(PG_FUNCTION_ARGS)
    5449              : {
    5450            1 :     const char *version = pg_icu_unicode_version();
    5451              : 
    5452            1 :     if (version)
    5453            1 :         PG_RETURN_TEXT_P(cstring_to_text(version));
    5454              :     else
    5455            0 :         PG_RETURN_NULL();
    5456              : }
    5457              : 
    5458              : /*
    5459              :  * Check whether the string contains only assigned Unicode code
    5460              :  * points. Requires that the database encoding is UTF-8.
    5461              :  */
    5462              : Datum
    5463           10 : unicode_assigned(PG_FUNCTION_ARGS)
    5464              : {
    5465           10 :     text       *input = PG_GETARG_TEXT_PP(0);
    5466              :     unsigned char *p;
    5467              :     int         size;
    5468              : 
    5469           10 :     if (GetDatabaseEncoding() != PG_UTF8)
    5470            0 :         ereport(ERROR,
    5471              :                 (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
    5472              : 
    5473              :     /* convert to char32_t */
    5474           10 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    5475           10 :     p = (unsigned char *) VARDATA_ANY(input);
    5476           40 :     for (int i = 0; i < size; i++)
    5477              :     {
    5478           35 :         char32_t    uchar = utf8_to_unicode(p);
    5479           35 :         int         category = unicode_category(uchar);
    5480              : 
    5481           35 :         if (category == PG_U_UNASSIGNED)
    5482            5 :             PG_RETURN_BOOL(false);
    5483              : 
    5484           30 :         p += pg_utf_mblen(p);
    5485              :     }
    5486              : 
    5487            5 :     PG_RETURN_BOOL(true);
    5488              : }
    5489              : 
    5490              : Datum
    5491           55 : unicode_normalize_func(PG_FUNCTION_ARGS)
    5492              : {
    5493           55 :     text       *input = PG_GETARG_TEXT_PP(0);
    5494           55 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    5495              :     UnicodeNormalizationForm form;
    5496              :     size_t      size;
    5497              :     char32_t   *input_chars;
    5498              :     char32_t   *output_chars;
    5499              :     unsigned char *p;
    5500              :     text       *result;
    5501              :     size_t      i;
    5502              : 
    5503           55 :     form = unicode_norm_form_from_string(formstr);
    5504              : 
    5505              :     /* convert to char32_t */
    5506           51 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    5507           51 :     input_chars = palloc_array(char32_t, size + 1);
    5508           51 :     p = (unsigned char *) VARDATA_ANY(input);
    5509          220 :     for (i = 0; i < size; i++)
    5510              :     {
    5511          169 :         input_chars[i] = utf8_to_unicode(p);
    5512          169 :         p += pg_utf_mblen(p);
    5513              :     }
    5514           51 :     input_chars[i] = (char32_t) '\0';
    5515              :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    5516              : 
    5517              :     /* action */
    5518           51 :     output_chars = unicode_normalize(form, input_chars);
    5519              : 
    5520              :     /* convert back to UTF-8 string */
    5521           51 :     size = 0;
    5522          231 :     for (char32_t *wp = output_chars; *wp; wp++)
    5523              :     {
    5524              :         unsigned char buf[4];
    5525              : 
    5526          180 :         unicode_to_utf8(*wp, buf);
    5527          180 :         size += pg_utf_mblen(buf);
    5528              :     }
    5529              : 
    5530           51 :     result = palloc(size + VARHDRSZ);
    5531           51 :     SET_VARSIZE(result, size + VARHDRSZ);
    5532              : 
    5533           51 :     p = (unsigned char *) VARDATA_ANY(result);
    5534          231 :     for (char32_t *wp = output_chars; *wp; wp++)
    5535              :     {
    5536          180 :         unicode_to_utf8(*wp, p);
    5537          180 :         p += pg_utf_mblen(p);
    5538              :     }
    5539              :     Assert((char *) p == (char *) result + size + VARHDRSZ);
    5540              : 
    5541           51 :     PG_RETURN_TEXT_P(result);
    5542              : }
    5543              : 
    5544              : /*
    5545              :  * Check whether the string is in the specified Unicode normalization form.
    5546              :  *
    5547              :  * This is done by converting the string to the specified normal form and then
    5548              :  * comparing that to the original string.  To speed that up, we also apply the
    5549              :  * "quick check" algorithm specified in UAX #15, which can give a yes or no
    5550              :  * answer for many strings by just scanning the string once.
    5551              :  *
    5552              :  * This function should generally be optimized for the case where the string
    5553              :  * is in fact normalized.  In that case, we'll end up looking at the entire
    5554              :  * string, so it's probably not worth doing any incremental conversion etc.
    5555              :  */
    5556              : Datum
    5557           94 : unicode_is_normalized(PG_FUNCTION_ARGS)
    5558              : {
    5559           94 :     text       *input = PG_GETARG_TEXT_PP(0);
    5560           94 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    5561              :     UnicodeNormalizationForm form;
    5562              :     size_t      size;
    5563              :     char32_t   *input_chars;
    5564              :     char32_t   *output_chars;
    5565              :     unsigned char *p;
    5566              :     size_t      i;
    5567              :     UnicodeNormalizationQC quickcheck;
    5568              :     size_t      output_size;
    5569              :     bool        result;
    5570              : 
    5571           94 :     form = unicode_norm_form_from_string(formstr);
    5572              : 
    5573              :     /* convert to char32_t */
    5574           90 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    5575           90 :     input_chars = palloc_array(char32_t, size + 1);
    5576           90 :     p = (unsigned char *) VARDATA_ANY(input);
    5577          344 :     for (i = 0; i < size; i++)
    5578              :     {
    5579          254 :         input_chars[i] = utf8_to_unicode(p);
    5580          254 :         p += pg_utf_mblen(p);
    5581              :     }
    5582           90 :     input_chars[i] = (char32_t) '\0';
    5583              :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    5584              : 
    5585              :     /* quick check (see UAX #15) */
    5586           90 :     quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
    5587           90 :     if (quickcheck == UNICODE_NORM_QC_YES)
    5588           30 :         PG_RETURN_BOOL(true);
    5589           60 :     else if (quickcheck == UNICODE_NORM_QC_NO)
    5590            8 :         PG_RETURN_BOOL(false);
    5591              : 
    5592              :     /* normalize and compare with original */
    5593           52 :     output_chars = unicode_normalize(form, input_chars);
    5594              : 
    5595           52 :     output_size = 0;
    5596          216 :     for (char32_t *wp = output_chars; *wp; wp++)
    5597          164 :         output_size++;
    5598              : 
    5599           76 :     result = (size == output_size) &&
    5600           24 :         (memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
    5601              : 
    5602           52 :     PG_RETURN_BOOL(result);
    5603              : }
    5604              : 
    5605              : /*
    5606              :  * Check if first n chars are hexadecimal digits
    5607              :  */
    5608              : static bool
    5609          111 : isxdigits_n(const char *instr, size_t n)
    5610              : {
    5611          469 :     for (size_t i = 0; i < n; i++)
    5612          405 :         if (!isxdigit((unsigned char) instr[i]))
    5613           47 :             return false;
    5614              : 
    5615           64 :     return true;
    5616              : }
    5617              : 
    5618              : static unsigned int
    5619          358 : hexval(unsigned char c)
    5620              : {
    5621          358 :     if (c >= '0' && c <= '9')
    5622          278 :         return c - '0';
    5623           80 :     if (c >= 'a' && c <= 'f')
    5624           40 :         return c - 'a' + 0xA;
    5625           40 :     if (c >= 'A' && c <= 'F')
    5626           40 :         return c - 'A' + 0xA;
    5627            0 :     elog(ERROR, "invalid hexadecimal digit");
    5628              :     return 0;                   /* not reached */
    5629              : }
    5630              : 
    5631              : /*
    5632              :  * Translate string with hexadecimal digits to number
    5633              :  */
    5634              : static unsigned int
    5635           64 : hexval_n(const char *instr, size_t n)
    5636              : {
    5637           64 :     unsigned int result = 0;
    5638              : 
    5639          422 :     for (size_t i = 0; i < n; i++)
    5640          358 :         result += hexval(instr[i]) << (4 * (n - i - 1));
    5641              : 
    5642           64 :     return result;
    5643              : }
    5644              : 
    5645              : /*
    5646              :  * Replaces Unicode escape sequences by Unicode characters
    5647              :  */
    5648              : Datum
    5649           47 : unistr(PG_FUNCTION_ARGS)
    5650              : {
    5651           47 :     text       *input_text = PG_GETARG_TEXT_PP(0);
    5652              :     char       *instr;
    5653              :     int         len;
    5654              :     StringInfoData str;
    5655              :     text       *result;
    5656           47 :     char16_t    pair_first = 0;
    5657              :     char        cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
    5658              : 
    5659           47 :     instr = VARDATA_ANY(input_text);
    5660           47 :     len = VARSIZE_ANY_EXHDR(input_text);
    5661              : 
    5662           47 :     initStringInfo(&str);
    5663              : 
    5664          356 :     while (len > 0)
    5665              :     {
    5666          337 :         if (instr[0] == '\\')
    5667              :         {
    5668           73 :             if (len >= 2 &&
    5669           73 :                 instr[1] == '\\')
    5670              :             {
    5671            5 :                 if (pair_first)
    5672            0 :                     goto invalid_pair;
    5673            5 :                 appendStringInfoChar(&str, '\\');
    5674            5 :                 instr += 2;
    5675            5 :                 len -= 2;
    5676              :             }
    5677           68 :             else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
    5678           47 :                      (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
    5679           22 :             {
    5680              :                 char32_t    unicode;
    5681           30 :                 int         offset = instr[1] == 'u' ? 2 : 1;
    5682              : 
    5683           30 :                 unicode = hexval_n(instr + offset, 4);
    5684              : 
    5685           30 :                 if (!is_valid_unicode_codepoint(unicode))
    5686            0 :                     ereport(ERROR,
    5687              :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5688              :                             errmsg("invalid Unicode code point: %04X", unicode));
    5689              : 
    5690           30 :                 if (pair_first)
    5691              :                 {
    5692            8 :                     if (is_utf16_surrogate_second(unicode))
    5693              :                     {
    5694            0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    5695            0 :                         pair_first = 0;
    5696              :                     }
    5697              :                     else
    5698            8 :                         goto invalid_pair;
    5699              :                 }
    5700           22 :                 else if (is_utf16_surrogate_second(unicode))
    5701            0 :                     goto invalid_pair;
    5702              : 
    5703           22 :                 if (is_utf16_surrogate_first(unicode))
    5704           12 :                     pair_first = unicode;
    5705              :                 else
    5706              :                 {
    5707           10 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    5708           10 :                     appendStringInfoString(&str, cbuf);
    5709              :                 }
    5710              : 
    5711           22 :                 instr += 4 + offset;
    5712           22 :                 len -= 4 + offset;
    5713              :             }
    5714           38 :             else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
    5715            9 :             {
    5716              :                 char32_t    unicode;
    5717              : 
    5718           17 :                 unicode = hexval_n(instr + 2, 6);
    5719              : 
    5720           17 :                 if (!is_valid_unicode_codepoint(unicode))
    5721            4 :                     ereport(ERROR,
    5722              :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5723              :                             errmsg("invalid Unicode code point: %04X", unicode));
    5724              : 
    5725           13 :                 if (pair_first)
    5726              :                 {
    5727            4 :                     if (is_utf16_surrogate_second(unicode))
    5728              :                     {
    5729            0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    5730            0 :                         pair_first = 0;
    5731              :                     }
    5732              :                     else
    5733            4 :                         goto invalid_pair;
    5734              :                 }
    5735            9 :                 else if (is_utf16_surrogate_second(unicode))
    5736            0 :                     goto invalid_pair;
    5737              : 
    5738            9 :                 if (is_utf16_surrogate_first(unicode))
    5739            4 :                     pair_first = unicode;
    5740              :                 else
    5741              :                 {
    5742            5 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    5743            5 :                     appendStringInfoString(&str, cbuf);
    5744              :                 }
    5745              : 
    5746            9 :                 instr += 8;
    5747            9 :                 len -= 8;
    5748              :             }
    5749           21 :             else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
    5750            9 :             {
    5751              :                 char32_t    unicode;
    5752              : 
    5753           17 :                 unicode = hexval_n(instr + 2, 8);
    5754              : 
    5755           17 :                 if (!is_valid_unicode_codepoint(unicode))
    5756            4 :                     ereport(ERROR,
    5757              :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5758              :                             errmsg("invalid Unicode code point: %04X", unicode));
    5759              : 
    5760           13 :                 if (pair_first)
    5761              :                 {
    5762            4 :                     if (is_utf16_surrogate_second(unicode))
    5763              :                     {
    5764            0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    5765            0 :                         pair_first = 0;
    5766              :                     }
    5767              :                     else
    5768            4 :                         goto invalid_pair;
    5769              :                 }
    5770            9 :                 else if (is_utf16_surrogate_second(unicode))
    5771            0 :                     goto invalid_pair;
    5772              : 
    5773            9 :                 if (is_utf16_surrogate_first(unicode))
    5774            4 :                     pair_first = unicode;
    5775              :                 else
    5776              :                 {
    5777            5 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    5778            5 :                     appendStringInfoString(&str, cbuf);
    5779              :                 }
    5780              : 
    5781            9 :                 instr += 10;
    5782            9 :                 len -= 10;
    5783              :             }
    5784              :             else
    5785            4 :                 ereport(ERROR,
    5786              :                         (errcode(ERRCODE_SYNTAX_ERROR),
    5787              :                          errmsg("invalid Unicode escape"),
    5788              :                          errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
    5789              :         }
    5790              :         else
    5791              :         {
    5792          264 :             if (pair_first)
    5793            0 :                 goto invalid_pair;
    5794              : 
    5795          264 :             appendStringInfoChar(&str, *instr++);
    5796          264 :             len--;
    5797              :         }
    5798              :     }
    5799              : 
    5800              :     /* unfinished surrogate pair? */
    5801           19 :     if (pair_first)
    5802            4 :         goto invalid_pair;
    5803              : 
    5804           15 :     result = cstring_to_text_with_len(str.data, str.len);
    5805           15 :     pfree(str.data);
    5806              : 
    5807           15 :     PG_RETURN_TEXT_P(result);
    5808              : 
    5809           20 : invalid_pair:
    5810           20 :     ereport(ERROR,
    5811              :             (errcode(ERRCODE_SYNTAX_ERROR),
    5812              :              errmsg("invalid Unicode surrogate pair")));
    5813              :     PG_RETURN_NULL();           /* keep compiler quiet */
    5814              : }
        

Generated by: LCOV version 2.0-1