LCOV - code coverage report
Current view: top level - src/backend/utils/adt - varlena.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 1708 1891 90.3 %
Date: 2025-12-13 06:17:57 Functions: 132 143 92.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * varlena.c
       4             :  *    Functions for the variable-length built-in types.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/adt/varlena.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #include "postgres.h"
      16             : 
      17             : #include <ctype.h>
      18             : #include <limits.h>
      19             : 
      20             : #include "access/detoast.h"
      21             : #include "access/toast_compression.h"
      22             : #include "catalog/pg_collation.h"
      23             : #include "catalog/pg_type.h"
      24             : #include "common/hashfn.h"
      25             : #include "common/int.h"
      26             : #include "common/unicode_category.h"
      27             : #include "common/unicode_norm.h"
      28             : #include "common/unicode_version.h"
      29             : #include "funcapi.h"
      30             : #include "lib/hyperloglog.h"
      31             : #include "libpq/pqformat.h"
      32             : #include "miscadmin.h"
      33             : #include "nodes/execnodes.h"
      34             : #include "parser/scansup.h"
      35             : #include "port/pg_bswap.h"
      36             : #include "regex/regex.h"
      37             : #include "utils/builtins.h"
      38             : #include "utils/guc.h"
      39             : #include "utils/lsyscache.h"
      40             : #include "utils/memutils.h"
      41             : #include "utils/pg_locale.h"
      42             : #include "utils/sortsupport.h"
      43             : #include "utils/varlena.h"
      44             : 
      45             : typedef struct varlena VarString;
      46             : 
      47             : /*
      48             :  * State for text_position_* functions: the haystack/needle strings and their byte lengths, the Boyer-Moore-Horspool skip table, the last match found, and a cached byte-to-character conversion point.
      49             :  */
      50             : typedef struct
      51             : {
      52             :     pg_locale_t locale;         /* collation used for substring matching */
      53             :     bool        is_multibyte_char_in_char;  /* need to check char boundaries? */
      54             :     bool        greedy;         /* find longest possible substring? */
      55             : 
      56             :     char       *str1;           /* haystack string */
      57             :     char       *str2;           /* needle string */
      58             :     int         len1;           /* string lengths in bytes */
      59             :     int         len2;           /* (len1 and len2 are both byte counts) */
      60             : 
      61             :     /* Skip table for Boyer-Moore-Horspool search algorithm: */
      62             :     int         skiptablemask;  /* mask for ANDing with skiptable subscripts */
      63             :     int         skiptable[256]; /* skip distance for given mismatched char */
      64             : 
      65             :     /*
      66             :      * Note that with nondeterministic collations, the length of the last
      67             :      * match is not necessarily equal to the length of the "needle" passed in.
      68             :      */
      69             :     char       *last_match;     /* pointer to last match in 'str1' */
      70             :     int         last_match_len; /* length of last match */
      71             :     int         last_match_len_tmp; /* same but for internal use */
      72             : 
      73             :     /*
      74             :      * Sometimes we need to convert the byte position of a match to a
      75             :      * character position.  These store the last position that was converted,
      76             :      * so that on the next call, we can continue from that point, rather than
      77             :      * count characters from the very beginning.
      78             :      */
      79             :     char       *refpoint;       /* pointer within original haystack string */
      80             :     int         refpos;         /* 0-based character offset of the same point */
      81             : } TextPositionState;
      82             : /* Sort-support working state: two reusable buffers, a cached last comparison result, and abbreviated-key cardinality tracking. */
      83             : typedef struct
      84             : {
      85             :     char       *buf1;           /* 1st string, or abbreviation original string
      86             :                                  * buf */
      87             :     char       *buf2;           /* 2nd string, or abbreviation strxfrm() buf */
      88             :     int         buflen1;        /* Allocated length of buf1 */
      89             :     int         buflen2;        /* Allocated length of buf2 */
      90             :     int         last_len1;      /* Length of last buf1 string/strxfrm() input */
      91             :     int         last_len2;      /* Length of last buf2 string/strxfrm() blob */
      92             :     int         last_returned;  /* Last comparison result (cache) */
      93             :     bool        cache_blob;     /* Does buf2 contain strxfrm() blob, etc? */
      94             :     bool        collate_c;      /* NOTE(review): presumably set when the collation is "C" -- confirm */
      95             :     Oid         typid;          /* Actual datatype (text/bpchar/bytea/name) */
      96             :     hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
      97             :     hyperLogLogState full_card; /* Full key cardinality state */
      98             :     double      prop_card;      /* Required cardinality proportion */
      99             :     pg_locale_t locale;         /* NOTE(review): presumably the locale used for comparisons -- confirm */
     100             : } VarStringSortSupport;
     101             : 
     102             : /*
     103             :  * Output data for split_text(): we output either to an array or a table.
     104             :  * tupstore and tupdesc must be set up in advance to output to a table.
     105             :  */
     106             : typedef struct
     107             : {
     108             :     ArrayBuildState *astate;    /* accumulator for the array form of the output */
     109             :     Tuplestorestate *tupstore;  /* table output: caller-prepared tuplestore */
     110             :     TupleDesc   tupdesc;        /* table output: caller-prepared tuple descriptor */
     111             : } SplitTextOutputData;
     112             : 
     113             : /*
     114             :  * This should be large enough that most strings will fit, but small enough
     115             :  * that we feel comfortable putting it on the stack
     116             :  */
     117             : #define TEXTBUFLEN      1024
     118             : /* Convert a Datum to a VarString pointer via PG_DETOAST_DATUM (P) or PG_DETOAST_DATUM_PACKED (PP). */
     119             : #define DatumGetVarStringP(X)       ((VarString *) PG_DETOAST_DATUM(X))
     120             : #define DatumGetVarStringPP(X)      ((VarString *) PG_DETOAST_DATUM_PACKED(X))
     121             : /* Prototypes for file-local (static) helper functions. */
     122             : static int  varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
     123             : static int  bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
     124             : static int  namefastcmp_c(Datum x, Datum y, SortSupport ssup);
     125             : static int  varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
     126             : static int  namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
     127             : static int  varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
     128             : static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
     129             : static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
     130             : static int32 text_length(Datum str);
     131             : static text *text_catenate(text *t1, text *t2);
     132             : static text *text_substring(Datum str,
     133             :                             int32 start,
     134             :                             int32 length,
     135             :                             bool length_not_specified);
     136             : static text *text_overlay(text *t1, text *t2, int sp, int sl);
     137             : static int  text_position(text *t1, text *t2, Oid collid);
     138             : static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
     139             : static bool text_position_next(TextPositionState *state);
     140             : static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
     141             : static char *text_position_get_match_ptr(TextPositionState *state);
     142             : static int  text_position_get_match_pos(TextPositionState *state);
     143             : static void text_position_cleanup(TextPositionState *state);
     144             : static void check_collation_set(Oid collid);
     145             : static int  text_cmp(text *arg1, text *arg2, Oid collid);
     146             : static void appendStringInfoText(StringInfo str, const text *t);
     147             : static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
     148             : static void split_text_accum_result(SplitTextOutputData *tstate,
     149             :                                     text *field_value,
     150             :                                     text *null_string,
     151             :                                     Oid collation);
     152             : static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
     153             :                                     const char *fldsep, const char *null_string);
     154             : static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
     155             : static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
     156             :                                      int *value);
     157             : static const char *text_format_parse_format(const char *start_ptr,
     158             :                                             const char *end_ptr,
     159             :                                             int *argpos, int *widthpos,
     160             :                                             int *flags, int *width);
     161             : static void text_format_string_conversion(StringInfo buf, char conversion,
     162             :                                           FmgrInfo *typOutputInfo,
     163             :                                           Datum value, bool isNull,
     164             :                                           int flags, int width);
     165             : static void text_format_append_string(StringInfo buf, const char *str,
     166             :                                       int flags, int width);
     167             : 
     168             : 
     169             : /*****************************************************************************
     170             :  *   CONVERSION ROUTINES EXPORTED FOR USE BY C CODE                          *
     171             :  *****************************************************************************/
     172             : 
     173             : /*
     174             :  * cstring_to_text
     175             :  *
     176             :  * Create a text value from a null-terminated C string.  The length is
     177             :  * measured with strlen(s) and the work is done by cstring_to_text_with_len().
     178             :  * The new text value is freshly palloc'd with a full-size VARHDR.
     179             :  */
     180             : text *
     181    25725728 : cstring_to_text(const char *s)
     182             : {
     183    25725728 :     return cstring_to_text_with_len(s, strlen(s));
     184             : }
     185             : 
     186             : /*
     187             :  * cstring_to_text_with_len
     188             :  *
     189             :  * Same as cstring_to_text except the caller specifies the string length;
     190             :  * the string need not be null-terminated.  Exactly len bytes are copied from s; len itself is not validated here.
     191             :  */
     192             : text *
     193    28450346 : cstring_to_text_with_len(const char *s, int len)
     194             : {
     195    28450346 :     text       *result = (text *) palloc(len + VARHDRSZ);
     196             :     /* record the total size (header plus payload), then copy the payload */
     197    28450346 :     SET_VARSIZE(result, len + VARHDRSZ);
     198    28450346 :     memcpy(VARDATA(result), s, len);
     199             : 
     200    28450346 :     return result;
     201             : }
     202             : 
     203             : /*
     204             :  * text_to_cstring
     205             :  *
     206             :  * Create a palloc'd, null-terminated C string from a text value.
     207             :  *
     208             :  * We support being passed a compressed or toasted text value.
     209             :  * This is a bit bogus since such values shouldn't really be referred to as
     210             :  * "text *", but it seems useful for robustness.  If we didn't handle that
     211             :  * case here, we'd need another routine that did, anyway.
     212             :  */
     213             : char *
     214    18455976 : text_to_cstring(const text *t)
     215             : {
     216             :     /* must cast away the const, unfortunately */
     217    18455976 :     text       *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
     218    18455976 :     int         len = VARSIZE_ANY_EXHDR(tunpacked);
     219             :     char       *result;
     220             :     /* one extra byte beyond the payload holds the terminating '\0' */
     221    18455976 :     result = (char *) palloc(len + 1);
     222    18455976 :     memcpy(result, VARDATA_ANY(tunpacked), len);
     223    18455976 :     result[len] = '\0';
     224             :     /* detoasting handed back a different pointer than the input: free it */
     225    18455976 :     if (tunpacked != t)
     226       45226 :         pfree(tunpacked);
     227             : 
     228    18455976 :     return result;
     229             : }
     230             : 
     231             : /*
     232             :  * text_to_cstring_buffer
     233             :  *
     234             :  * Copy a text value into a caller-supplied buffer of size dst_len.
     235             :  *
     236             :  * The text string is truncated if necessary to fit.  The result is
     237             :  * guaranteed null-terminated (unless dst_len == 0, in which case dst is not written at all).
     238             :  *
     239             :  * We support being passed a compressed or toasted text value.
     240             :  * This is a bit bogus since such values shouldn't really be referred to as
     241             :  * "text *", but it seems useful for robustness.  If we didn't handle that
     242             :  * case here, we'd need another routine that did, anyway.
     243             :  */
     244             : void
     245        1006 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
     246             : {
     247             :     /* must cast away the const, unfortunately */
     248        1006 :     text       *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
     249        1006 :     size_t      src_len = VARSIZE_ANY_EXHDR(srcunpacked);
     250             : 
     251        1006 :     if (dst_len > 0)
     252             :     {
     253        1006 :         dst_len--;              /* reserve one byte for the terminator */
     254        1006 :         if (dst_len >= src_len)
     255        1006 :             dst_len = src_len;  /* everything fits: copy the whole payload */
     256             :         else                    /* ensure truncation is encoding-safe */
     257           0 :             dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
     258        1006 :         memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
     259        1006 :         dst[dst_len] = '\0';
     260             :     }
     261             :     /* detoasting handed back a different pointer than the input: free it */
     262        1006 :     if (srcunpacked != src)
     263           0 :         pfree(srcunpacked);
     264        1006 : }
     265             : 
     266             : 
     267             : /*****************************************************************************
     268             :  *   USER I/O ROUTINES                                                       *
     269             :  *****************************************************************************/
     270             : 
     271             : /*
     272             :  *      textin          - converts cstring to internal representation (argument 0 is passed to cstring_to_text())
     273             :  */
     274             : Datum
     275    22440464 : textin(PG_FUNCTION_ARGS)
     276             : {
     277    22440464 :     char       *inputText = PG_GETARG_CSTRING(0);
     278             : 
     279    22440464 :     PG_RETURN_TEXT_P(cstring_to_text(inputText));
     280             : }
     281             : 
     282             : /*
     283             :  *      textout         - converts internal representation to cstring (argument 0 goes through TextDatumGetCString())
     284             :  */
     285             : Datum
     286     8238806 : textout(PG_FUNCTION_ARGS)
     287             : {
     288     8238806 :     Datum       txt = PG_GETARG_DATUM(0);
     289             : 
     290     8238806 :     PG_RETURN_CSTRING(TextDatumGetCString(txt));
     291             : }
     292             : 
     293             : /*
     294             :  *      textrecv            - converts external binary format to text
     295             :  */
     296             : Datum
     297          48 : textrecv(PG_FUNCTION_ARGS)
     298             : {
     299          48 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     300             :     text       *result;
     301             :     char       *str;
     302             :     int         nbytes;
     303             :     /* request everything from the cursor to the end of the message */
     304          48 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     305             :     /* copy nbytes into a new text value, then drop the intermediate string */
     306          48 :     result = cstring_to_text_with_len(str, nbytes);
     307          48 :     pfree(str);
     308          48 :     PG_RETURN_TEXT_P(result);
     309             : }
     310             : 
     311             : /*
     312             :  *      textsend            - converts text to binary format (the payload bytes are handed to pq_sendtext())
     313             :  */
     314             : Datum
     315        4724 : textsend(PG_FUNCTION_ARGS)
     316             : {
     317        4724 :     text       *t = PG_GETARG_TEXT_PP(0);
     318             :     StringInfoData buf;
     319             :     /* build the outgoing message in buf and return it as a bytea */
     320        4724 :     pq_begintypsend(&buf);
     321        4724 :     pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
     322        4724 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     323             : }
     324             : 
     325             : 
     326             : /*
     327             :  *      unknownin           - converts cstring to internal representation (returns a pstrdup'd copy of the input)
     328             :  */
     329             : Datum
     330           0 : unknownin(PG_FUNCTION_ARGS)
     331             : {
     332           0 :     char       *str = PG_GETARG_CSTRING(0);
     333             : 
     334             :     /* representation is same as cstring */
     335           0 :     PG_RETURN_CSTRING(pstrdup(str));
     336             : }
     337             : 
     338             : /*
     339             :  *      unknownout          - converts internal representation to cstring (returns a pstrdup'd copy of the input)
     340             :  */
     341             : Datum
     342         940 : unknownout(PG_FUNCTION_ARGS)
     343             : {
     344             :     /* representation is same as cstring */
     345         940 :     char       *str = PG_GETARG_CSTRING(0);
     346             : 
     347         940 :     PG_RETURN_CSTRING(pstrdup(str));
     348             : }
     349             : 
     350             : /*
     351             :  *      unknownrecv         - converts external binary format to unknown
     352             :  */
     353             : Datum
     354           0 : unknownrecv(PG_FUNCTION_ARGS)
     355             : {
     356           0 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     357             :     char       *str;
     358             :     int         nbytes;         /* filled in by pq_getmsgtext(); not used afterwards */
     359             :     /* request everything from the cursor to the end of the message */
     360           0 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     361             :     /* representation is same as cstring */
     362           0 :     PG_RETURN_CSTRING(str);
     363             : }
     364             : 
     365             : /*
     366             :  *      unknownsend         - converts unknown to binary format (strlen(str) bytes are handed to pq_sendtext())
     367             :  */
     368             : Datum
     369           0 : unknownsend(PG_FUNCTION_ARGS)
     370             : {
     371             :     /* representation is same as cstring */
     372           0 :     char       *str = PG_GETARG_CSTRING(0);
     373             :     StringInfoData buf;
     374             :     /* build the outgoing message in buf and return it as a bytea */
     375           0 :     pq_begintypsend(&buf);
     376           0 :     pq_sendtext(&buf, str, strlen(str));
     377           0 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     378             : }
     379             : 
     380             : 
     381             : /* ========== PUBLIC ROUTINES ========== */
     382             : 
     383             : /*
     384             :  * textlen -
     385             :  *    returns the logical length of a text*
     386             :  *     (which is less than the VARSIZE of the text*); the work is done by text_length()
     387             :  */
     388             : Datum
     389      430826 : textlen(PG_FUNCTION_ARGS)
     390             : {
     391      430826 :     Datum       str = PG_GETARG_DATUM(0);
     392             : 
     393             :     /* try to avoid decompressing argument */
     394      430826 :     PG_RETURN_INT32(text_length(str));
     395             : }
     396             : 
     397             : /*
     398             :  * text_length -
     399             :  *  Does the real work for textlen()
     400             :  *
     401             :  *  This is broken out so it can be called directly by other string processing
     402             :  *  functions.  Note that the argument is passed as a Datum, to indicate that
     403             :  *  it may still be in compressed form.  We can avoid decompressing it at all
     404             :  *  in some cases (namely when pg_database_encoding_max_length() is 1).
     405             :  */
     406             : static int32
     407      430838 : text_length(Datum str)
     408             : {
     409             :     /* fastpath when max encoding length is one */
     410      430838 :     if (pg_database_encoding_max_length() == 1)
     411          20 :         return (toast_raw_datum_size(str) - VARHDRSZ);
     412             :     else
     413             :     {
     414      430818 :         text       *t = DatumGetTextPP(str);
     415             :         /* otherwise the length is computed over the payload by pg_mbstrlen_with_len() */
     416      430818 :         return (pg_mbstrlen_with_len(VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t)));
     417             :     }
     418             : }
     419             : 
     420             : /*
     421             :  * textoctetlen -
     422             :  *    returns the physical length of a text*, computed as toast_raw_datum_size() minus VARHDRSZ
     423             :  *     (which is less than the VARSIZE of the text*)
     424             :  */
     425             : Datum
     426          70 : textoctetlen(PG_FUNCTION_ARGS)
     427             : {
     428          70 :     Datum       str = PG_GETARG_DATUM(0);
     429             : 
     430             :     /* We need not detoast the input at all */
     431          70 :     PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
     432             : }
     433             : 
     434             : /*
     435             :  * textcat -
     436             :  *    takes two text* and returns a text* that is the concatenation of
     437             :  *    the two.
     438             :  *
     439             :  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
     440             :  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
     441             :  * Allocate space for output in all cases.
     442             :  * XXX - thomas 1997-07-10
     443             :  */
     444             : Datum
     445     1953706 : textcat(PG_FUNCTION_ARGS)
     446             : {
     447     1953706 :     text       *t1 = PG_GETARG_TEXT_PP(0);
     448     1953706 :     text       *t2 = PG_GETARG_TEXT_PP(1);
     449             :     /* the concatenation itself is done by text_catenate() */
     450     1953706 :     PG_RETURN_TEXT_P(text_catenate(t1, t2));
     451             : }
     452             : 
     453             : /*
     454             :  * text_catenate
     455             :  *  Guts of textcat(), broken out so it can be used by other functions
     456             :  *  The result is freshly palloc'd: VARHDRSZ plus the bytes of t1 followed by the bytes of t2.
     457             :  * Arguments can be in short-header form, but not compressed or out-of-line
     458             :  */
     459             : static text *
     460     1953786 : text_catenate(text *t1, text *t2)
     461             : {
     462             :     text       *result;
     463             :     int         len1,
     464             :                 len2,
     465             :                 len;
     466             :     char       *ptr;
     467             :     /* payload sizes of the two inputs, headers excluded */
     468     1953786 :     len1 = VARSIZE_ANY_EXHDR(t1);
     469     1953786 :     len2 = VARSIZE_ANY_EXHDR(t2);
     470             : 
     471             :     /* paranoia ... probably should throw error instead? */
     472     1953786 :     if (len1 < 0)
     473           0 :         len1 = 0;
     474     1953786 :     if (len2 < 0)
     475           0 :         len2 = 0;
     476             : 
     477     1953786 :     len = len1 + len2 + VARHDRSZ;
     478     1953786 :     result = (text *) palloc(len);
     479             : 
     480             :     /* Set size of result string... */
     481     1953786 :     SET_VARSIZE(result, len);
     482             : 
     483             :     /* Fill data field of result string... */
     484     1953786 :     ptr = VARDATA(result);
     485     1953786 :     if (len1 > 0)
     486     1952962 :         memcpy(ptr, VARDATA_ANY(t1), len1);
     487     1953786 :     if (len2 > 0)
     488     1953576 :         memcpy(ptr + len1, VARDATA_ANY(t2), len2);  /* t2's bytes go right after t1's */
     489             : 
     490     1953786 :     return result;
     491             : }
     492             : 
     493             : /*
     494             :  * charlen_to_bytelen()
     495             :  *  Compute the number of bytes occupied by n characters starting at *p
     496             :  *
     497             :  * It is caller's responsibility that there actually are n characters;
     498             :  * the string need not be null-terminated.  For n <= 0 the multibyte path returns 0, while the single-byte path returns n unchanged.
     499             :  */
     500             : static int
     501       17358 : charlen_to_bytelen(const char *p, int n)
     502             : {
     503       17358 :     if (pg_database_encoding_max_length() == 1)
     504             :     {
     505             :         /* Optimization for single-byte encodings */
     506         180 :         return n;
     507             :     }
     508             :     else
     509             :     {
     510             :         const char *s;
     511             :         /* step over n characters, advancing by pg_mblen() each time */
     512     6073646 :         for (s = p; n > 0; n--)
     513     6056468 :             s += pg_mblen(s);
     514             :         /* distance walked from p is the byte length */
     515       17178 :         return s - p;
     516             :     }
     517             : }
     518             : 
     519             : /*
     520             :  * text_substr()
     521             :  * Return a substring starting at the specified position.
     522             :  * - thomas 1997-12-31
     523             :  *
     524             :  * Input:
     525             :  *  - string
     526             :  *  - starting position (is one-based)
     527             :  *  - string length
     528             :  *
     529             :  * If the starting position is zero or less, then return from the start of the string
     530             :  *  adjusting the length to be consistent with the "negative start" per SQL.
     531             :  * NOTE(review): a negative length is rejected with an error in text_substring() (at least on its single-byte path); the remaining string is returned only when no length is given, as in text_substr_no_len().
     532             :  * All of the work is done by text_substring(), called here with length_not_specified = false.
     533             :  * Added multibyte support.
     534             :  * - Tatsuo Ishii 1998-4-21
     535             :  * Changed behavior if starting position is less than one to conform to SQL behavior.
     536             :  * Formerly returned the entire string; now returns a portion.
     537             :  * - Thomas Lockhart 1998-12-10
     538             :  * Now uses faster TOAST-slicing interface
     539             :  * - John Gray 2002-02-22
     540             :  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
     541             :  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
     542             :  * error; if E < 1, return '', not entire string). Fixed MB related bug when
     543             :  * S > LC and < LC + 4 sometimes garbage characters are returned.
     544             :  * - Joe Conway 2002-08-10
     545             :  */
     546             : Datum
     547      660030 : text_substr(PG_FUNCTION_ARGS)
     548             : {
     549      660030 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     550             :                                     PG_GETARG_INT32(1),
     551             :                                     PG_GETARG_INT32(2),
     552             :                                     false));
     553             : }
     554             : 
     555             : /*
     556             :  * text_substr_no_len -
     557             :  *    Wrapper to avoid opr_sanity failure due to
     558             :  *    one function accepting a different number of args; calls text_substring() with length -1 and length_not_specified = true.
     559             :  */
     560             : Datum
     561          36 : text_substr_no_len(PG_FUNCTION_ARGS)
     562             : {
     563          36 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     564             :                                     PG_GETARG_INT32(1),
     565             :                                     -1, true));
     566             : }
     567             : 
     568             : /*
     569             :  * text_substring -
     570             :  *  Does the real work for text_substr() and text_substr_no_len()
     571             :  *
     572             :  *  This is broken out so it can be called directly by other string processing
     573             :  *  functions.  Note that the argument is passed as a Datum, to indicate that
     574             :  *  it may still be in compressed/toasted form.  We can avoid detoasting all
     575             :  *  of it in some cases.
     576             :  *
                        *  Arguments:
                        *      str - source string, as a Datum
                        *      start - one-based start position, counted in characters; values
                        *              below 1 are accepted and clamped to 1 for fetching, but
                        *              the unclamped value is still used to compute the end
                        *              position start + length
                        *      length - requested number of characters; only examined when
                        *              length_not_specified is false
                        *      length_not_specified - true means "through end of string"
                        *
                        *  Errors:
                        *      ERRCODE_SUBSTRING_ERROR if a length was specified and is negative;
                        *      elog(ERROR) if the database encoding's max length is below 1.
                        *
     577             :  *  The result is always a freshly palloc'd datum.
     578             :  */
     579             : static text *
     580      700178 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
     581             : {
     582      700178 :     int32       eml = pg_database_encoding_max_length();
     583      700178 :     int32       S = start;      /* start position */
     584             :     int32       S1;             /* adjusted start position */
     585             :     int32       L1;             /* adjusted substring length */
     586             :     int32       E;              /* end position */
     587             : 
     588             :     /*
     589             :      * SQL99 says S can be zero or negative (which we don't document), but we
     590             :      * still must fetch from the start of the string.
     591             :      * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org
     592             :      */
     593      700178 :     S1 = Max(S, 1);
     594             : 
     595             :     /* life is easy if the encoding max length is 1 */
     596      700178 :     if (eml == 1)
     597             :     {
     598          22 :         if (length_not_specified)   /* special case - get length to end of
     599             :                                      * string */
     600           0 :             L1 = -1;
     601          22 :         else if (length < 0)
     602             :         {
     603             :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     604           0 :             ereport(ERROR,
     605             :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     606             :                      errmsg("negative substring length not allowed")));
     607             :             L1 = -1;            /* silence stupider compilers */
     608             :         }
     609          22 :         else if (pg_add_s32_overflow(S, length, &E))
     610             :         {
     611             :             /*
     612             :              * L could be large enough for S + L to overflow, in which case
     613             :              * the substring must run to end of string.
     614             :              */
     615           0 :             L1 = -1;
     616             :         }
     617             :         else
     618             :         {
     619             :             /*
     620             :              * A zero or negative value for the end position can happen if the
     621             :              * start was negative or one. SQL99 says to return a zero-length
     622             :              * string.
                                    *
                                    * Otherwise E = S + length is the exclusive, one-based end
                                    * position, so the number of characters to fetch from the
                                    * clamped start is E - S1.
     623             :              */
     624          22 :             if (E < 1)
     625           0 :                 return cstring_to_text("");
     626             : 
     627          22 :             L1 = E - S1;
     628             :         }
     629             : 
     630             :         /*
     631             :          * If the start position is past the end of the string, SQL99 says to
     632             :          * return a zero-length string -- DatumGetTextPSlice() will do that
     633             :          * for us.  We need only convert S1 to zero-based starting position.
     634             :          */
     635          22 :         return DatumGetTextPSlice(str, S1 - 1, L1);
     636             :     }
     637      700156 :     else if (eml > 1)
     638             :     {
     639             :         /*
     640             :          * When encoding max length is > 1, we can't get LC without
     641             :          * detoasting, so we'll grab a conservatively large slice now and go
     642             :          * back later to do the right thing
     643             :          */
     644             :         int32       slice_start;
     645             :         int32       slice_size;
     646             :         int32       slice_strlen;
     647             :         text       *slice;
     648             :         int32       E1;
     649             :         int32       i;
     650             :         char       *p;
     651             :         char       *s;
     652             :         text       *ret;
     653             : 
     654             :         /*
     655             :          * We need to start at position zero because there is no way to know
     656             :          * in advance which byte offset corresponds to the supplied start
     657             :          * position.
     658             :          */
     659      700156 :         slice_start = 0;
     660             : 
     661      700156 :         if (length_not_specified)   /* special case - get length to end of
     662             :                                      * string */
     663          76 :             slice_size = L1 = -1;
     664      700080 :         else if (length < 0)
     665             :         {
     666             :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     667          12 :             ereport(ERROR,
     668             :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     669             :                      errmsg("negative substring length not allowed")));
     670             :             slice_size = L1 = -1;   /* silence stupider compilers */
     671             :         }
     672      700068 :         else if (pg_add_s32_overflow(S, length, &E))
     673             :         {
     674             :             /*
     675             :              * L could be large enough for S + L to overflow, in which case
     676             :              * the substring must run to end of string.
     677             :              */
     678           6 :             slice_size = L1 = -1;
     679             :         }
     680             :         else
     681             :         {
     682             :             /*
     683             :              * A zero or negative value for the end position can happen if the
     684             :              * start was negative or one. SQL99 says to return a zero-length
     685             :              * string.
     686             :              */
     687      700062 :             if (E < 1)
     688           0 :                 return cstring_to_text("");
     689             : 
     690             :             /*
     691             :              * if E is past the end of the string, the tuple toaster will
     692             :              * truncate the length for us
     693             :              */
     694      700062 :             L1 = E - S1;
     695             : 
     696             :             /*
     697             :              * Total slice size in bytes can't be any longer than the start
     698             :              * position plus substring length times the encoding max length.
     699             :              * If that overflows, we can just use -1.
     700             :              */
     701      700062 :             if (pg_mul_s32_overflow(E, eml, &slice_size))
     702           6 :                 slice_size = -1;
     703             :         }
     704             : 
     705             :         /*
     706             :          * If we're working with an untoasted source, no need to do an extra
     707             :          * copying step.
                                *
                                * In the untoasted case "slice" simply aliases the caller's datum,
                                * which is why every exit path below compares slice against str
                                * before pfree'ing it.
     708             :          */
     709     1400234 :         if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
     710      700090 :             VARATT_IS_EXTERNAL(DatumGetPointer(str)))
     711         354 :             slice = DatumGetTextPSlice(str, slice_start, slice_size);
     712             :         else
     713      699790 :             slice = (text *) DatumGetPointer(str);
     714             : 
     715             :         /* see if we got back an empty string */
     716      700144 :         if (VARSIZE_ANY_EXHDR(slice) == 0)
     717             :         {
     718           0 :             if (slice != (text *) DatumGetPointer(str))
     719           0 :                 pfree(slice);
     720           0 :             return cstring_to_text("");
     721             :         }
     722             : 
     723             :         /* Now we can get the actual length of the slice in MB characters */
     724      700144 :         slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
     725      700144 :                                             VARSIZE_ANY_EXHDR(slice));
     726             : 
     727             :         /*
     728             :          * Check that the start position wasn't > slice_strlen. If so, SQL99
     729             :          * says to return a zero-length string.
     730             :          */
     731      700144 :         if (S1 > slice_strlen)
     732             :         {
     733          22 :             if (slice != (text *) DatumGetPointer(str))
     734           0 :                 pfree(slice);
     735          22 :             return cstring_to_text("");
     736             :         }
     737             : 
     738             :         /*
     739             :          * Adjust L1 and E1 now that we know the slice string length. Again
     740             :          * remember that S1 is one based, and slice_start is zero based.
                                *
                                * E1 is the exclusive, one-based end position of the result; it is
                                * capped at one past the last character present in the slice.
     741             :          */
     742      700122 :         if (L1 > -1)
     743      700062 :             E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
     744             :         else
     745          60 :             E1 = slice_start + 1 + slice_strlen;
     746             : 
     747             :         /*
     748             :          * Find the start position in the slice; remember S1 is not zero based
     749             :          */
     750      700122 :         p = VARDATA_ANY(slice);
     751     6714616 :         for (i = 0; i < S1 - 1; i++)
     752     6014494 :             p += pg_mblen(p);
     753             : 
     754             :         /* hang onto a pointer to our start position */
     755      700122 :         s = p;
     756             : 
     757             :         /*
     758             :          * Count the actual bytes used by the substring of the requested
     759             :          * length.
     760             :          */
     761     9950426 :         for (i = S1; i < E1; i++)
     762     9250304 :             p += pg_mblen(p);
     763             : 
                               /* Copy the bytes in [s, p) into a newly allocated result. */
     764      700122 :         ret = (text *) palloc(VARHDRSZ + (p - s));
     765      700122 :         SET_VARSIZE(ret, VARHDRSZ + (p - s));
     766      700122 :         memcpy(VARDATA(ret), s, (p - s));
     767             : 
     768      700122 :         if (slice != (text *) DatumGetPointer(str))
     769         354 :             pfree(slice);
     770             : 
     771      700122 :         return ret;
     772             :     }
     773             :     else
     774           0 :         elog(ERROR, "invalid backend encoding: encoding max length < 1");
     775             : 
     776             :     /* not reached: suppress compiler warning */
     777             :     return NULL;
     778             : }
     779             : 
     780             : /*
     781             :  * textoverlay
     782             :  *  Replace specified substring of first string with second
     783             :  *
     784             :  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
     785             :  * This code is a direct implementation of what the standard says.
                        *
                        * Four-argument form: argument 0 is the string to modify, argument 1 is
                        * the replacement, argument 2 is the start position and argument 3 is
                        * the length of the span being replaced.  All of the work is delegated
                        * to text_overlay().
     786             :  */
     787             : Datum
     788          28 : textoverlay(PG_FUNCTION_ARGS)
     789             : {
     790          28 :     text       *t1 = PG_GETARG_TEXT_PP(0);
     791          28 :     text       *t2 = PG_GETARG_TEXT_PP(1);
     792          28 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
     793          28 :     int         sl = PG_GETARG_INT32(3);    /* substring length */
     794             : 
     795          28 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
     796             : }
     797             : 
                       /*
                        * textoverlay_no_len
                        *  Three-argument variant of textoverlay(): no span length is supplied,
                        *  so the length of the replaced span is taken from text_length() of the
                        *  replacement string t2.  The work is then delegated to text_overlay().
                        */
     798             : Datum
     799          12 : textoverlay_no_len(PG_FUNCTION_ARGS)
     800             : {
     801          12 :     text       *t1 = PG_GETARG_TEXT_PP(0);
     802          12 :     text       *t2 = PG_GETARG_TEXT_PP(1);
     803          12 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
     804             :     int         sl;
     805             : 
     806          12 :     sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
     807          12 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
     808             : }
     809             : 
                       /*
                        * text_overlay -
                        *  Does the real work for textoverlay() and textoverlay_no_len()
                        *
                        * Inputs:
                        *      t1 - string to be modified
                        *      t2 - replacement string
                        *      sp - one-based start position of the span to replace
                        *      sl - length of the span to replace
                        * Result:
                        *      text_substring(t1, 1, sp - 1), then t2, then the remainder of t1
                        *      starting at position sp + sl, joined with text_catenate().
                        * Errors:
                        *      ERRCODE_SUBSTRING_ERROR if sp <= 0;
                        *      ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE if sp + sl overflows int32.
                        *
                        * NOTE(review): a negative sl is not rejected here; only the overflow of
                        * sp + sl is checked before the value is used as a start position.
                        */
     810             : static text *
     811          40 : text_overlay(text *t1, text *t2, int sp, int sl)
     812             : {
     813             :     text       *result;
     814             :     text       *s1;
     815             :     text       *s2;
     816             :     int         sp_pl_sl;
     817             : 
     818             :     /*
     819             :      * Check for possible integer-overflow cases.  For negative sp, throw a
     820             :      * "substring length" error because that's what should be expected
     821             :      * according to the spec's definition of OVERLAY().
     822             :      */
     823          40 :     if (sp <= 0)
     824           0 :         ereport(ERROR,
     825             :                 (errcode(ERRCODE_SUBSTRING_ERROR),
     826             :                  errmsg("negative substring length not allowed")));
     827          40 :     if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
     828           0 :         ereport(ERROR,
     829             :                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
     830             :                  errmsg("integer out of range")));
     831             : 
     832          40 :     s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);     /* part of t1 before the span */
     833          40 :     s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);   /* part of t1 after the span */
     834          40 :     result = text_catenate(s1, t2);
     835          40 :     result = text_catenate(result, s2);
     836             : 
     837          40 :     return result;
     838             : }
     839             : 
     840             : /*
     841             :  * textpos -
     842             :  *    Return the position of the specified substring.
     843             :  *    Implements the SQL POSITION() function.
     844             :  *    Ref: A Guide To The SQL Standard, Date & Darwen, 1997
     845             :  * - thomas 1997-07-27
                        *
                        *    Argument 0 is the string to search and argument 1 is the substring
                        *    to look for.  The call's collation (PG_GET_COLLATION()) is passed
                        *    through to text_position(), whose int result is returned as int32.
     846             :  */
     847             : Datum
     848         130 : textpos(PG_FUNCTION_ARGS)
     849             : {
     850         130 :     text       *str = PG_GETARG_TEXT_PP(0);
     851         130 :     text       *search_str = PG_GETARG_TEXT_PP(1);
     852             : 
     853         130 :     PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
     854             : }
     855             : 
     856             : /*
     857             :  * text_position -
     858             :  *  Does the real work for textpos()
     859             :  *
     860             :  * Inputs:
     861             :  *      t1 - string to be searched
     862             :  *      t2 - pattern to match within t1
                        *      collid - collation to search under; validated with
                        *               check_collation_set() before anything else happens
     863             :  * Result:
     864             :  *      Character index of the first matched char, starting from 1,
     865             :  *      or 0 if no match.
     866             :  *
     867             :  *  This is broken out so it can be called directly by other string processing
     868             :  *  functions.
     869             :  */
     870             : static int
     871         130 : text_position(text *t1, text *t2, Oid collid)
     872             : {
     873             :     TextPositionState state;
     874             :     int         result;
     875             : 
     876         130 :     check_collation_set(collid);
     877             : 
     878             :     /* Empty needle always matches at position 1 */
     879         130 :     if (VARSIZE_ANY_EXHDR(t2) < 1)
     880          12 :         return 1;
     881             : 
                           /*
                            * The length shortcut below is only taken for deterministic collations:
                            * under a nondeterministic collation the matched substring can have a
                            * different length than the needle, so a shorter haystack does not by
                            * itself rule out a match.
                            */
     882             :     /* Otherwise, can't match if haystack is shorter than needle */
     883         118 :     if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
     884          22 :         pg_newlocale_from_collation(collid)->deterministic)
     885          22 :         return 0;
     886             : 
     887          96 :     text_position_setup(t1, t2, collid, &state);
     888             :     /* don't need greedy mode here */
     889          96 :     state.greedy = false;
     890             : 
     891          96 :     if (!text_position_next(&state))
     892          24 :         result = 0;
     893             :     else
     894          72 :         result = text_position_get_match_pos(&state);
     895          96 :     text_position_cleanup(&state);  /* run on both the match and no-match paths */
     896          96 :     return result;
     897             : }
     898             : 
     899             : 
     900             : /*
     901             :  * text_position_setup, text_position_next, text_position_cleanup -
     902             :  *  Component steps of text_position()
     903             :  *
     904             :  * These are broken out so that a string can be efficiently searched for
     905             :  * multiple occurrences of the same pattern.  text_position_next may be
     906             :  * called multiple times, and it advances to the next match on each call.
     907             :  * text_position_get_match_ptr() and text_position_get_match_pos() return
     908             :  * a pointer or 1-based character position of the last match, respectively.
     909             :  *
     910             :  * The "state" variable is normally just a local variable in the caller.
     911             :  *
     912             :  * NOTE: text_position_next skips over the matched portion.  For example,
     913             :  * searching for "xx" in "xxx" returns only one match, not two.
                        *
                        * text_position_setup() itself fills in *state for searching haystack t1
                        * for needle t2 under collation collid.  It records the locale, the data
                        * pointers and sizes of both strings (taken with VARDATA_ANY and
                        * VARSIZE_ANY_EXHDR), clears last_match, resets refpoint/refpos to the
                        * start of the haystack, and turns greedy mode on.  The needle must be
                        * nonempty (asserted).  The skip table and skiptablemask are filled in
                        * only when the Boyer-Moore-Horspool condition further down holds; in
                        * all other cases this function leaves them untouched.
     914             :  */
     915             : 
     916             : static void
     917        1920 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
     918             : {
     919        1920 :     int         len1 = VARSIZE_ANY_EXHDR(t1);
     920        1920 :     int         len2 = VARSIZE_ANY_EXHDR(t2);
     921             : 
     922        1920 :     check_collation_set(collid);
     923             : 
     924        1920 :     state->locale = pg_newlocale_from_collation(collid);
     925             : 
     926             :     /*
     927             :      * Most callers need greedy mode, but some might want to unset this to
     928             :      * optimize.
     929             :      */
     930        1920 :     state->greedy = true;
     931             : 
     932             :     Assert(len2 > 0);
     933             : 
     934             :     /*
     935             :      * Even with a multi-byte encoding, we perform the search using the raw
     936             :      * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
     937             :      * because in UTF-8 the byte sequence of one character cannot contain
     938             :      * another character.  For other multi-byte encodings, we do the search
     939             :      * initially as a simple byte search, ignoring multibyte issues, but
     940             :      * verify afterwards that the match we found is at a character boundary,
     941             :      * and continue the search if it was a false match.
     942             :      */
     943        1920 :     if (pg_database_encoding_max_length() == 1)
     944         108 :         state->is_multibyte_char_in_char = false;
     945        1812 :     else if (GetDatabaseEncoding() == PG_UTF8)
     946        1812 :         state->is_multibyte_char_in_char = false;
     947             :     else
     948           0 :         state->is_multibyte_char_in_char = true;
     949             : 
     950        1920 :     state->str1 = VARDATA_ANY(t1);
     951        1920 :     state->str2 = VARDATA_ANY(t2);
     952        1920 :     state->len1 = len1;
     953        1920 :     state->len2 = len2;
     954        1920 :     state->last_match = NULL;
     955        1920 :     state->refpoint = state->str1;
     956        1920 :     state->refpos = 0;
     957             : 
     958             :     /*
     959             :      * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
     960             :      * notes we use the terminology that the "haystack" is the string to be
     961             :      * searched (t1) and the "needle" is the pattern being sought (t2).
     962             :      *
     963             :      * If the needle is empty or bigger than the haystack then there is no
     964             :      * point in wasting cycles initializing the table.  We also choose not to
     965             :      * use B-M-H for needles of length 1, since the skip table can't possibly
     966             :      * save anything in that case.
     967             :      *
     968             :      * (With nondeterministic collations, the search is already
     969             :      * multibyte-aware, so we don't need this.)
     970             :      */
     971        1920 :     if (len1 >= len2 && len2 > 1 && state->locale->deterministic)
     972             :     {
     973        1586 :         int         searchlength = len1 - len2;
     974             :         int         skiptablemask;
     975             :         int         last;
     976             :         int         i;
     977        1586 :         const char *str2 = state->str2;
     978             : 
     979             :         /*
     980             :          * First we must determine how much of the skip table to use.  The
     981             :          * declaration of TextPositionState allows up to 256 elements, but for
     982             :          * short search problems we don't really want to have to initialize so
     983             :          * many elements --- it would take too long in comparison to the
     984             :          * actual search time.  So we choose a useful skip table size based on
     985             :          * the haystack length minus the needle length.  The closer the needle
     986             :          * length is to the haystack length the less useful skipping becomes.
     987             :          *
     988             :          * Note: since we use bit-masking to select table elements, the skip
     989             :          * table size MUST be a power of 2, and so the mask must be 2^N-1.
     990             :          */
     991        1586 :         if (searchlength < 16)
     992         114 :             skiptablemask = 3;
     993        1472 :         else if (searchlength < 64)
     994          16 :             skiptablemask = 7;
     995        1456 :         else if (searchlength < 128)
     996          14 :             skiptablemask = 15;
     997        1442 :         else if (searchlength < 512)
     998         306 :             skiptablemask = 31;
     999        1136 :         else if (searchlength < 2048)
    1000         826 :             skiptablemask = 63;
    1001         310 :         else if (searchlength < 4096)
    1002         216 :             skiptablemask = 127;
    1003             :         else
    1004          94 :             skiptablemask = 255;
    1005        1586 :         state->skiptablemask = skiptablemask;
    1006             : 
    1007             :         /*
    1008             :          * Initialize the skip table.  We set all elements to the needle
    1009             :          * length, since this is the correct skip distance for any character
    1010             :          * not found in the needle.
                                *
                                * Only entries 0 .. skiptablemask are written; those are the only
                                * ones the masked index below can select.
    1011             :          */
    1012      116762 :         for (i = 0; i <= skiptablemask; i++)
    1013      115176 :             state->skiptable[i] = len2;
    1014             : 
    1015             :         /*
    1016             :          * Now examine the needle.  For each character except the last one,
    1017             :          * set the corresponding table element to the appropriate skip
    1018             :          * distance.  Note that when two characters share the same skip table
    1019             :          * entry, the one later in the needle must determine the skip
    1020             :          * distance.
                                *
                                * Walking the needle front to back gives exactly that: a later
                                * byte overwrites the entry with its own, smaller distance.  The
                                * cast to unsigned char keeps the masked index non-negative.
    1021             :          */
    1022        1586 :         last = len2 - 1;
    1023             : 
    1024       20854 :         for (i = 0; i < last; i++)
    1025       19268 :             state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
    1026             :     }
    1027        1920 : }
    1028             : 
    1029             : /*
    1030             :  * Advance to the next match, starting from the end of the previous match
    1031             :  * (or the beginning of the string, on first call).  Returns true if a match
    1032             :  * is found.
    1033             :  *
    1034             :  * Note that this refuses to match an empty-string needle.  Most callers
    1035             :  * will have handled that case specially and we'll never see it here.
                        *
                        * On success, state->last_match points at the match and
                        * state->last_match_len holds its length in bytes.  On failure both are
                        * left as they were.  When the boundary check below runs,
                        * state->refpoint and state->refpos are advanced together, one character
                        * at a time, up to the accepted match.
    1036             :  */
    1037             : static bool
    1038        9700 : text_position_next(TextPositionState *state)
    1039             : {
    1040        9700 :     int         needle_len = state->len2;
    1041             :     char       *start_ptr;
    1042             :     char       *matchptr;
    1043             : 
    1044        9700 :     if (needle_len <= 0)
    1045           0 :         return false;           /* result for empty pattern */
    1046             : 
    1047             :     /* Start from the point right after the previous match. */
    1048        9700 :     if (state->last_match)
    1049        7768 :         start_ptr = state->last_match + state->last_match_len;
    1050             :     else
    1051        1932 :         start_ptr = state->str1;
    1052             : 
    1053        9700 : retry:
    1054        9700 :     matchptr = text_position_next_internal(start_ptr, state);
    1055             : 
    1056        9700 :     if (!matchptr)
    1057        1836 :         return false;
    1058             : 
    1059             :     /*
    1060             :      * Found a match for the byte sequence.  If this is a multibyte encoding,
    1061             :      * where one character's byte sequence can appear inside a longer
    1062             :      * multi-byte character, we need to verify that the match was at a
    1063             :      * character boundary, not in the middle of a multi-byte character.
                            *
                            * The check is skipped for nondeterministic collations, whose search
                            * path in text_position_next_internal() already advances with
                            * pg_mblen() and so only tests character-aligned positions.
    1064             :      */
    1065        7864 :     if (state->is_multibyte_char_in_char && state->locale->deterministic)
    1066             :     {
    1067             :         /* Walk one character at a time, until we reach the match. */
    1068             : 
    1069             :         /* the search should never move backwards. */
    1070             :         Assert(state->refpoint <= matchptr);
    1071             : 
    1072           0 :         while (state->refpoint < matchptr)
    1073             :         {
    1074             :             /* step to next character. */
    1075           0 :             state->refpoint += pg_mblen(state->refpoint);
    1076           0 :             state->refpos++;
    1077             : 
    1078             :             /*
    1079             :              * If we stepped over the match's start position, then it was a
    1080             :              * false positive, where the byte sequence appeared in the middle
    1081             :              * of a multi-byte character.  Skip it, and continue the search at
    1082             :              * the next character boundary.
    1083             :              */
    1084           0 :             if (state->refpoint > matchptr)
    1085             :             {
    1086           0 :                 start_ptr = state->refpoint;
    1087           0 :                 goto retry;
    1088             :             }
    1089             :         }
    1090             :     }
    1091             : 
                           /*
                            * Accept the match.  last_match_len_tmp was filled in by
                            * text_position_next_internal() and is only copied into last_match_len
                            * here, once the match has passed the boundary check above.
                            */
    1092        7864 :     state->last_match = matchptr;
    1093        7864 :     state->last_match_len = state->last_match_len_tmp;
    1094        7864 :     return true;
    1095             : }
    1096             : 
    1097             : /*
    1098             :  * Subroutine of text_position_next().  This searches for the raw byte
    1099             :  * sequence, ignoring any multi-byte encoding issues.  Returns the first
    1100             :  * match starting at 'start_ptr', or NULL if no match is found.
    1101             :  */
    1102             : static char *
    1103        9700 : text_position_next_internal(char *start_ptr, TextPositionState *state)
    1104             : {
    1105        9700 :     int         haystack_len = state->len1;
    1106        9700 :     int         needle_len = state->len2;
    1107        9700 :     int         skiptablemask = state->skiptablemask;
    1108        9700 :     const char *haystack = state->str1;
    1109        9700 :     const char *needle = state->str2;
    1110        9700 :     const char *haystack_end = &haystack[haystack_len];
    1111             :     const char *hptr;
    1112             : 
    1113             :     Assert(start_ptr >= haystack && start_ptr <= haystack_end);
    1114             :     Assert(needle_len > 0);
    1115             : 
    1116        9700 :     state->last_match_len_tmp = needle_len;
    1117             : 
    1118        9700 :     if (!state->locale->deterministic)
    1119             :     {
    1120             :         /*
    1121             :          * With a nondeterministic collation, we have to use an unoptimized
    1122             :          * route.  We walk through the haystack and see if at each position
    1123             :          * there is a substring of the remaining string that is equal to the
    1124             :          * needle under the given collation.
    1125             :          *
    1126             :          * Note, the found substring could have a different length than the
    1127             :          * needle.  Callers that want to skip over the found string need to
    1128             :          * read the length of the found substring from last_match_len rather
    1129             :          * than just using the length of their needle.
    1130             :          *
    1131             :          * Most callers will require "greedy" semantics, meaning that we need
    1132             :          * to find the longest such substring, not the shortest.  For callers
    1133             :          * that don't need greedy semantics, we can finish on the first match.
    1134             :          *
    1135             :          * This loop depends on the assumption that the needle is nonempty and
    1136             :          * any matching substring must also be nonempty.  (Even if the
    1137             :          * collation would accept an empty match, returning one would send
    1138             :          * callers that search for successive matches into an infinite loop.)
    1139             :          */
    1140         252 :         const char *result_hptr = NULL;
    1141             : 
    1142         252 :         hptr = start_ptr;
    1143         678 :         while (hptr < haystack_end)
    1144             :         {
    1145             :             const char *test_end;
    1146             : 
    1147             :             /*
    1148             :              * First check the common case that there is a match in the
    1149             :              * haystack of exactly the length of the needle.
    1150             :              */
    1151         564 :             if (!state->greedy &&
    1152         108 :                 haystack_end - hptr >= needle_len &&
    1153          54 :                 pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
    1154          12 :                 return (char *) hptr;
    1155             : 
    1156             :             /*
    1157             :              * Else check if any of the non-empty substrings starting at hptr
    1158             :              * compare equal to the needle.
    1159             :              */
    1160         552 :             test_end = hptr;
    1161             :             do
    1162             :             {
    1163        2154 :                 test_end += pg_mblen(test_end);
    1164        2154 :                 if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
    1165             :                 {
    1166         138 :                     state->last_match_len_tmp = (test_end - hptr);
    1167         138 :                     result_hptr = hptr;
    1168         138 :                     if (!state->greedy)
    1169           0 :                         break;
    1170             :                 }
    1171        2154 :             } while (test_end < haystack_end);
    1172             : 
    1173         552 :             if (result_hptr)
    1174         126 :                 break;
    1175             : 
    1176         426 :             hptr += pg_mblen(hptr);
    1177             :         }
    1178             : 
    1179         240 :         return (char *) result_hptr;
    1180             :     }
    1181        9448 :     else if (needle_len == 1)
    1182             :     {
    1183             :         /* No point in using B-M-H for a one-character needle */
    1184         760 :         char        nchar = *needle;
    1185             : 
    1186         760 :         hptr = start_ptr;
    1187        5878 :         while (hptr < haystack_end)
    1188             :         {
    1189        5712 :             if (*hptr == nchar)
    1190         594 :                 return (char *) hptr;
    1191        5118 :             hptr++;
    1192             :         }
    1193             :     }
    1194             :     else
    1195             :     {
    1196        8688 :         const char *needle_last = &needle[needle_len - 1];
    1197             : 
    1198             :         /* Start at start_ptr plus the length of the needle, minus one */
    1199        8688 :         hptr = start_ptr + needle_len - 1;
    1200      222604 :         while (hptr < haystack_end)
    1201             :         {
    1202             :             /* Match the needle scanning *backward* */
    1203             :             const char *nptr;
    1204             :             const char *p;
    1205             : 
    1206      221048 :             nptr = needle_last;
    1207      221048 :             p = hptr;
    1208      326830 :             while (*nptr == *p)
    1209             :             {
    1210             :                 /* Matched it all?  If so, return pointer to start of match */
    1211      112914 :                 if (nptr == needle)
    1212        7132 :                     return (char *) p;
    1213      105782 :                 nptr--, p--;
    1214             :             }
    1215             : 
    1216             :             /*
    1217             :              * No match, so use the haystack char at hptr to decide how far to
    1218             :              * advance.  If the needle had any occurrence of that character
    1219             :              * (or more precisely, one sharing the same skiptable entry)
    1220             :              * before its last character, then we advance far enough to align
    1221             :              * the last such needle character with that haystack position.
    1222             :              * Otherwise we can advance by the whole needle length.
    1223             :              */
    1224      213916 :             hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
    1225             :         }
    1226             :     }
    1227             : 
    1228        1722 :     return 0;                   /* not found */
    1229             : }
    1230             : 
    1231             : /*
    1232             :  * Return a pointer to the current match (the stored state->last_match).
    1233             :  * (text_position_reset() sets that field back to NULL.)
    1234             :  * The returned pointer points into the original haystack string.
    1235             :  */
    1236             : static char *
    1237        7762 : text_position_get_match_ptr(TextPositionState *state)
    1238             : {
    1239        7762 :     return state->last_match;
    1240             : }
    1241             : 
    1242             : /*
    1243             :  * Return the offset of the current match.
    1244             :  * As a side effect, state->refpoint and state->refpos are moved up to the match.
    1245             :  * The offset is in characters, 1-based.
    1246             :  */
    1247             : static int
    1248          72 : text_position_get_match_pos(TextPositionState *state)
    1249             : {
    1250             :     /* Convert the byte position to char position, counting on from refpoint. */
    1251         144 :     state->refpos += pg_mbstrlen_with_len(state->refpoint,
    1252          72 :                                           state->last_match - state->refpoint);
    1253          72 :     state->refpoint = state->last_match;    /* next call counts from here */
    1254          72 :     return state->refpos + 1;               /* refpos is 0-based */
    1255             : }
    1256             : 
    1257             : /*
    1258             :  * Reset search state to the initial state installed by text_position_setup.
    1259             :  * Only last_match, refpoint and refpos are touched here.
    1260             :  * The next call to text_position_next will search from the beginning
    1261             :  * of the string.
    1262             :  */
    1263             : static void
    1264          12 : text_position_reset(TextPositionState *state)
    1265             : {
    1266          12 :     state->last_match = NULL;           /* no current match */
    1267          12 :     state->refpoint = state->str1;      /* char-counting reference: start of str1 */
    1268          12 :     state->refpos = 0;                  /* ... which is character offset 0 */
    1269          12 : }
    1270             : /* Cleanup hook for a TextPositionState.  Currently does nothing; the argument is unused. */
    1271             : static void
    1272        1920 : text_position_cleanup(TextPositionState *state)
    1273             : {
    1274             :     /* no cleanup needed */
    1275        1920 : }
    1276             : 
    1277             : /* Raise an ERRCODE_INDETERMINATE_COLLATION error unless collid is a valid OID. */
    1278             : static void
    1279    17138188 : check_collation_set(Oid collid)
    1280             : {
    1281    17138188 :     if (!OidIsValid(collid))
    1282             :     {
    1283             :         /*
    1284             :          * This typically means that the parser could not resolve a conflict
    1285             :          * of implicit collations, so report it that way.
    1286             :          */
    1287          30 :         ereport(ERROR,
    1288             :                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
    1289             :                  errmsg("could not determine which collation to use for string comparison"),
    1290             :                  errhint("Use the COLLATE clause to set the collation explicitly.")));
    1291             :     }
    1292    17138158 : }
    1293             : 
    1294             : /*
    1295             :  * varstr_cmp()
    1296             :  *
    1297             :  * Comparison function for text strings with given lengths, using the
    1298             :  * appropriate locale. Returns an integer less than, equal to, or greater than
    1299             :  * zero, indicating whether arg1 is less than, equal to, or greater than arg2.
    1300             :  *
    1301             :  * Note: many functions that depend on this are marked leakproof; therefore,
    1302             :  * avoid reporting the actual contents of the input when throwing errors.
    1303             :  * All errors herein should be things that can't happen except on corrupt
    1304             :  * data, anyway; otherwise we will have trouble with indexing strings that
    1305             :  * would cause them.
    1306             :  */
    1307             : int
    1308     9643626 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
    1309             : {
    1310             :     int         result;
    1311             :     pg_locale_t mylocale;
    1312             : 
    1313     9643626 :     check_collation_set(collid);
    1314             : 
    1315     9643608 :     mylocale = pg_newlocale_from_collation(collid);
    1316             : 
    1317     9643608 :     if (mylocale->collate_is_c)
    1318             :     {
    1319     3646788 :         result = memcmp(arg1, arg2, Min(len1, len2));
    1320     3646788 :         if ((result == 0) && (len1 != len2))
    1321      133284 :             result = (len1 < len2) ? -1 : 1;    /* common prefix equal: shorter string sorts first */
    1322             :     }
    1323             :     else
    1324             :     {
    1325             :         /*
    1326             :          * memcmp() can't tell us which of two unequal strings sorts first,
    1327             :          * but it's a cheap way to tell if they're equal.  Testing shows that
    1328             :          * memcmp() followed by strcoll() is only trivially slower than
    1329             :          * strcoll() by itself, so we don't lose much if this doesn't work out
    1330             :          * very often, and if it does - for example, because there are many
    1331             :          * equal strings in the input - then we win big by avoiding expensive
    1332             :          * collation-aware comparisons.
    1333             :          */
    1334     5996820 :         if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
    1335     1555576 :             return 0;
    1336             : 
    1337     4441244 :         result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
    1338             : 
    1339             :         /* Break tie bytewise if necessary, so that a deterministic collation yields 0 only via the memcmp() fast path above. */
    1340     4441244 :         if (result == 0 && mylocale->deterministic)
    1341             :         {
    1342           0 :             result = memcmp(arg1, arg2, Min(len1, len2));
    1343           0 :             if ((result == 0) && (len1 != len2))
    1344           0 :                 result = (len1 < len2) ? -1 : 1;
    1345             :         }
    1346             :     }
    1347             : 
    1348     8088032 :     return result;
    1349             : }
    1350             : 
    1351             : /* text_cmp()
    1352             :  * Internal comparison function for text strings.
    1353             :  * Returns <0, 0 or >0: the result of varstr_cmp(), which is not limited to -1/0/1.
    1354             :  */
    1355             : static int
    1356     7852654 : text_cmp(text *arg1, text *arg2, Oid collid)
    1357             : {
    1358             :     char       *a1p,
    1359             :                *a2p;
    1360             :     int         len1,
    1361             :                 len2;
    1362             : 
    1363     7852654 :     a1p = VARDATA_ANY(arg1);
    1364     7852654 :     a2p = VARDATA_ANY(arg2);
    1365             : 
    1366     7852654 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1367     7852654 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1368             : 
    1369     7852654 :     return varstr_cmp(a1p, len1, a2p, len2, collid);
    1370             : }
    1371             : 
    1372             : /*
    1373             :  * Comparison functions for text strings.
    1374             :  *
    1375             :  * Note: btree indexes need these routines not to leak memory; therefore,
    1376             :  * be careful to free working copies of toasted datums.  Most places don't
    1377             :  * need to be so careful.
    1378             :  */
    1379             : 
    1380             : Datum
    1381     6701884 : texteq(PG_FUNCTION_ARGS)
    1382             : {
    1383     6701884 :     Oid         collid = PG_GET_COLLATION();
    1384     6701884 :     pg_locale_t mylocale = 0;
    1385             :     bool        result;
    1386             : 
    1387     6701884 :     check_collation_set(collid);
    1388             : 
    1389     6701884 :     mylocale = pg_newlocale_from_collation(collid);
    1390             : 
    1391     6701884 :     if (mylocale->deterministic)
    1392             :     {
    1393     6697524 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1394     6697524 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1395             :         Size        len1,
    1396             :                     len2;
    1397             : 
    1398             :         /*
    1399             :          * Since we only care about equality or not-equality, we can avoid all
    1400             :          * the expense of strcoll() here, and just do bitwise comparison.  In
    1401             :          * fact, we don't even have to do a bitwise comparison if we can show
    1402             :          * the lengths of the strings are unequal; which might save us from
    1403             :          * having to detoast one or both values.
    1404             :          */
    1405     6697524 :         len1 = toast_raw_datum_size(arg1);
    1406     6697524 :         len2 = toast_raw_datum_size(arg2);
    1407     6697524 :         if (len1 != len2)
    1408     3190606 :             result = false;
    1409             :         else
    1410             :         {
    1411     3506918 :             text       *targ1 = DatumGetTextPP(arg1);
    1412     3506918 :             text       *targ2 = DatumGetTextPP(arg2);
    1413             : 
    1414     3506918 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1415             :                              len1 - VARHDRSZ) == 0);    /* len1 is the raw datum size, header included */
    1416             : 
    1417     3506918 :             PG_FREE_IF_COPY(targ1, 0);
    1418     3506918 :             PG_FREE_IF_COPY(targ2, 1);
    1419             :         }
    1420             :     }
    1421             :     else                        /* nondeterministic collation: needs a real collation-aware comparison */
    1422             :     {
    1423        4360 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1424        4360 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1425             : 
    1426        4360 :         result = (text_cmp(arg1, arg2, collid) == 0);
    1427             : 
    1428        4360 :         PG_FREE_IF_COPY(arg1, 0);
    1429        4360 :         PG_FREE_IF_COPY(arg2, 1);
    1430             :     }
    1431             : 
    1432     6701884 :     PG_RETURN_BOOL(result);
    1433             : }
    1434             : 
    1435             : Datum
    1436      407948 : textne(PG_FUNCTION_ARGS)
    1437             : {
    1438      407948 :     Oid         collid = PG_GET_COLLATION();
    1439             :     pg_locale_t mylocale;
    1440             :     bool        result;
    1441             : 
    1442      407948 :     check_collation_set(collid);
    1443             : 
    1444      407948 :     mylocale = pg_newlocale_from_collation(collid);
    1445             : 
    1446      407948 :     if (mylocale->deterministic)
    1447             :     {
    1448      407924 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1449      407924 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1450             :         Size        len1,
    1451             :                     len2;
    1452             : 
    1453             :         /* See comment in texteq(): differing raw sizes settle it without detoasting */
    1454      407924 :         len1 = toast_raw_datum_size(arg1);
    1455      407924 :         len2 = toast_raw_datum_size(arg2);
    1456      407924 :         if (len1 != len2)
    1457       22314 :             result = true;
    1458             :         else
    1459             :         {
    1460      385610 :             text       *targ1 = DatumGetTextPP(arg1);
    1461      385610 :             text       *targ2 = DatumGetTextPP(arg2);
    1462             : 
    1463      385610 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1464             :                              len1 - VARHDRSZ) != 0);    /* len1 is the raw datum size, header included */
    1465             : 
    1466      385610 :             PG_FREE_IF_COPY(targ1, 0);
    1467      385610 :             PG_FREE_IF_COPY(targ2, 1);
    1468             :         }
    1469             :     }
    1470             :     else                        /* nondeterministic collation: needs a real collation-aware comparison */
    1471             :     {
    1472          24 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1473          24 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1474             : 
    1475          24 :         result = (text_cmp(arg1, arg2, collid) != 0);
    1476             : 
    1477          24 :         PG_FREE_IF_COPY(arg1, 0);
    1478          24 :         PG_FREE_IF_COPY(arg2, 1);
    1479             :     }
    1480             : 
    1481      407948 :     PG_RETURN_BOOL(result);
    1482             : }
    1483             : /* text_lt: true iff text_cmp() of the two arguments under PG_GET_COLLATION() is < 0. */
    1484             : Datum
    1485      209294 : text_lt(PG_FUNCTION_ARGS)
    1486             : {
    1487      209294 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1488      209294 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1489             :     bool        result;
    1490             : 
    1491      209294 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
    1492             : 
    1493      209276 :     PG_FREE_IF_COPY(arg1, 0);
    1494      209276 :     PG_FREE_IF_COPY(arg2, 1);
    1495             : 
    1496      209276 :     PG_RETURN_BOOL(result);
    1497             : }
    1498             : /* text_le: true iff text_cmp() of the two arguments under PG_GET_COLLATION() is <= 0. */
    1499             : Datum
    1500      317746 : text_le(PG_FUNCTION_ARGS)
    1501             : {
    1502      317746 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1503      317746 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1504             :     bool        result;
    1505             : 
    1506      317746 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
    1507             : 
    1508      317746 :     PG_FREE_IF_COPY(arg1, 0);
    1509      317746 :     PG_FREE_IF_COPY(arg2, 1);
    1510             : 
    1511      317746 :     PG_RETURN_BOOL(result);
    1512             : }
    1513             : /* text_gt: true iff text_cmp() of the two arguments under PG_GET_COLLATION() is > 0. */
    1514             : Datum
    1515      196076 : text_gt(PG_FUNCTION_ARGS)
    1516             : {
    1517      196076 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1518      196076 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1519             :     bool        result;
    1520             : 
    1521      196076 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
    1522             : 
    1523      196076 :     PG_FREE_IF_COPY(arg1, 0);
    1524      196076 :     PG_FREE_IF_COPY(arg2, 1);
    1525             : 
    1526      196076 :     PG_RETURN_BOOL(result);
    1527             : }
    1528             : /* text_ge: true iff text_cmp() of the two arguments under PG_GET_COLLATION() is >= 0. */
    1529             : Datum
    1530      175720 : text_ge(PG_FUNCTION_ARGS)
    1531             : {
    1532      175720 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1533      175720 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1534             :     bool        result;
    1535             : 
    1536      175720 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
    1537             : 
    1538      175720 :     PG_FREE_IF_COPY(arg1, 0);
    1539      175720 :     PG_FREE_IF_COPY(arg2, 1);
    1540             : 
    1541      175720 :     PG_RETURN_BOOL(result);
    1542             : }
    1543             : /* text_starts_with: true iff arg2's bytes are a prefix of arg1's; errors out for nondeterministic collations. */
    1544             : Datum
    1545       37914 : text_starts_with(PG_FUNCTION_ARGS)
    1546             : {
    1547       37914 :     Datum       arg1 = PG_GETARG_DATUM(0);
    1548       37914 :     Datum       arg2 = PG_GETARG_DATUM(1);
    1549       37914 :     Oid         collid = PG_GET_COLLATION();
    1550             :     pg_locale_t mylocale;
    1551             :     bool        result;
    1552             :     Size        len1,
    1553             :                 len2;
    1554             : 
    1555       37914 :     check_collation_set(collid);
    1556             : 
    1557       37914 :     mylocale = pg_newlocale_from_collation(collid);
    1558             : 
    1559       37914 :     if (!mylocale->deterministic)
    1560           0 :         ereport(ERROR,
    1561             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1562             :                  errmsg("nondeterministic collations are not supported for substring searches")));
    1563             : 
    1564       37914 :     len1 = toast_raw_datum_size(arg1);
    1565       37914 :     len2 = toast_raw_datum_size(arg2);
    1566       37914 :     if (len2 > len1)            /* prefix is longer than the string: cannot match */
    1567           0 :         result = false;
    1568             :     else
    1569             :     {
    1570       37914 :         text       *targ1 = text_substring(arg1, 1, len2, false);   /* NOTE(review): len2 is a raw size, used here as a generous length bound -- confirm */
    1571       37914 :         text       *targ2 = DatumGetTextPP(arg2);
    1572             : 
    1573       37914 :         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1574             :                          VARSIZE_ANY_EXHDR(targ2)) == 0);   /* compare only targ2's payload bytes */
    1575             : 
    1576       37914 :         PG_FREE_IF_COPY(targ1, 0);
    1577       37914 :         PG_FREE_IF_COPY(targ2, 1);
    1578             :     }
    1579             : 
    1580       37914 :     PG_RETURN_BOOL(result);
    1581             : }
    1582             : /* bttextcmp: returns text_cmp() of the two arguments under PG_GET_COLLATION(), as an int32. */
    1583             : Datum
    1584     6633798 : bttextcmp(PG_FUNCTION_ARGS)
    1585             : {
    1586     6633798 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1587     6633798 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1588             :     int32       result;
    1589             : 
    1590     6633798 :     result = text_cmp(arg1, arg2, PG_GET_COLLATION());
    1591             : 
    1592     6633798 :     PG_FREE_IF_COPY(arg1, 0);
    1593     6633798 :     PG_FREE_IF_COPY(arg2, 1);
    1594             : 
    1595     6633798 :     PG_RETURN_INT32(result);
    1596             : }
    1597             : /* bttextsortsupport: set up *ssup for TEXTOID via varstr_sortsupport(), using ssup->ssup_collation. */
    1598             : Datum
    1599       87330 : bttextsortsupport(PG_FUNCTION_ARGS)
    1600             : {
    1601       87330 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    1602       87330 :     Oid         collid = ssup->ssup_collation;
    1603             :     MemoryContext oldcontext;
    1604             : 
    1605       87330 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    1606             : 
    1607             :     /* Use generic string SortSupport; it runs (and pallocs its state) with ssup->ssup_cxt current */
    1608       87330 :     varstr_sortsupport(ssup, TEXTOID, collid);
    1609             : 
    1610       87318 :     MemoryContextSwitchTo(oldcontext);
    1611             : 
    1612       87318 :     PG_RETURN_VOID();
    1613             : }
    1614             : 
    1615             : /*
    1616             :  * Generic sortsupport interface for character type's operator classes.
    1617             :  * Includes locale support, and support for BpChar semantics (i.e. removing
    1618             :  * trailing spaces before comparison).
    1619             :  *
    1620             :  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
    1621             :  * same representation.  Callers that always use the C collation (e.g.
    1622             :  * non-collatable type callers like bytea) may have NUL bytes in their strings;
    1623             :  * this will not work with any other collation, though.
    1624             :  */
    1625             : void
    1626      137980 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
    1627             : {
    1628      137980 :     bool        abbreviate = ssup->abbreviate;
    1629      137980 :     bool        collate_c = false;
    1630             :     VarStringSortSupport *sss;
    1631             :     pg_locale_t locale;
    1632             : 
    1633      137980 :     check_collation_set(collid);
    1634             : 
    1635      137968 :     locale = pg_newlocale_from_collation(collid);
    1636             : 
    1637             :     /*
    1638             :      * If possible, set ssup->comparator to a function which can be used to
    1639             :      * directly compare two datums.  If we can do this, we'll avoid the
    1640             :      * overhead of a trip through the fmgr layer for every comparison, which
    1641             :      * can be substantial.
    1642             :      *
    1643             :      * Most typically, we'll set the comparator to varlenafastcmp_locale,
    1644             :      * which uses strcoll() to perform comparisons.  We use that for the
    1645             :      * BpChar case too, but type NAME uses namefastcmp_locale. However, if
    1646             :      * the collation is C (locale->collate_is_c), we can make things faster with
    1647             :      * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
    1648             :      * memcmp() rather than strcoll().
    1649             :      */
    1650      137968 :     if (locale->collate_is_c)
    1651             :     {
    1652       92178 :         if (typid == BPCHAROID)
    1653         308 :             ssup->comparator = bpcharfastcmp_c;
    1654       91870 :         else if (typid == NAMEOID)
    1655             :         {
    1656       49570 :             ssup->comparator = namefastcmp_c;
    1657             :             /* Not supporting abbreviation with type NAME, for now */
    1658       49570 :             abbreviate = false;
    1659             :         }
    1660             :         else
    1661       42300 :             ssup->comparator = varstrfastcmp_c;
    1662             : 
    1663       92178 :         collate_c = true;
    1664             :     }
    1665             :     else
    1666             :     {
    1667             :         /*
    1668             :          * We use varlenafastcmp_locale except for type NAME.
    1669             :          */
    1670       45790 :         if (typid == NAMEOID)
    1671             :         {
    1672           0 :             ssup->comparator = namefastcmp_locale;
    1673             :             /* Not supporting abbreviation with type NAME, for now */
    1674           0 :             abbreviate = false;
    1675             :         }
    1676             :         else
    1677       45790 :             ssup->comparator = varlenafastcmp_locale;
    1678             : 
    1679             :         /*
    1680             :          * Unfortunately, it seems that abbreviation for non-C collations is
    1681             :          * broken on many common platforms; see pg_strxfrm_enabled().
    1682             :          *
    1683             :          * Even apart from the risk of broken locales, it's possible that
    1684             :          * there are platforms where the use of abbreviated keys should be
    1685             :          * disabled at compile time.  For example, macOS's strxfrm()
    1686             :          * implementation is known to not effectively concentrate a
    1687             :          * significant amount of entropy from the original string in earlier
    1688             :          * transformed blobs.  It's possible that other supported platforms
    1689             :          * are similarly encumbered.  So, if we ever get past disabling this
    1690             :          * categorically, we may still want or need to disable it for
    1691             :          * particular platforms.
    1692             :          */
    1693       45790 :         if (!pg_strxfrm_enabled(locale))
    1694       44994 :             abbreviate = false;
    1695             :     }
    1696             : 
    1697             :     /*
    1698             :      * If we're using abbreviated keys, or if we're using a locale-aware
    1699             :      * comparison, we need to initialize a VarStringSortSupport object. Both
    1700             :      * cases will make use of the temporary buffers we initialize here for
    1701             :      * scratch space (and to detect requirement for BpChar semantics from
    1702             :      * caller), and the abbreviation case requires additional state.
    1703             :      */
    1704      137968 :     if (abbreviate || !collate_c)
    1705             :     {
    1706       69606 :         sss = palloc_object(VarStringSortSupport);
    1707       69606 :         sss->buf1 = palloc(TEXTBUFLEN);
    1708       69606 :         sss->buflen1 = TEXTBUFLEN;
    1709       69606 :         sss->buf2 = palloc(TEXTBUFLEN);
    1710       69606 :         sss->buflen2 = TEXTBUFLEN;
    1711             :         /* Start with invalid values */
    1712       69606 :         sss->last_len1 = -1;
    1713       69606 :         sss->last_len2 = -1;
    1714             :         /* Initialize; presumably the cached result of the last comparison -- TODO confirm */
    1715       69606 :         sss->last_returned = 0;
    1716       69606 :         if (collate_c)
    1717       23816 :             sss->locale = NULL;
    1718             :         else
    1719       45790 :             sss->locale = locale;
    1720             : 
    1721             :         /*
    1722             :          * To avoid somehow confusing a strxfrm() blob and an original string,
    1723             :          * constantly keep track of the variety of data that buf1 and buf2
    1724             :          * currently contain.
    1725             :          *
    1726             :          * Comparisons may be interleaved with conversion calls.  Frequently,
    1727             :          * conversions and comparisons are batched into two distinct phases,
    1728             :          * but the correctness of caching cannot hinge upon this.  For
    1729             :          * comparison caching, buffer state is only trusted if cache_blob is
    1730             :          * found set to false, whereas strxfrm() caching only trusts the state
    1731             :          * when cache_blob is found set to true.
    1732             :          *
    1733             :          * Arbitrarily initialize cache_blob to true.
    1734             :          */
    1735       69606 :         sss->cache_blob = true;
    1736       69606 :         sss->collate_c = collate_c;
    1737       69606 :         sss->typid = typid;
    1738       69606 :         ssup->ssup_extra = sss;
    1739             : 
    1740             :         /*
    1741             :          * If possible, plan to use the abbreviated keys optimization.  The
    1742             :          * core code may switch back to authoritative comparator should
    1743             :          * abbreviation be aborted.
    1744             :          */
    1745       69606 :         if (abbreviate)
    1746             :         {
    1747       24414 :             sss->prop_card = 0.20;
    1748       24414 :             initHyperLogLog(&sss->abbr_card, 10);
    1749       24414 :             initHyperLogLog(&sss->full_card, 10);
    1750       24414 :             ssup->abbrev_full_comparator = ssup->comparator;    /* keep the authoritative comparator chosen above */
    1751       24414 :             ssup->comparator = ssup_datum_unsigned_cmp;         /* used on the abbreviated keys instead */
    1752       24414 :             ssup->abbrev_converter = varstr_abbrev_convert;
    1753       24414 :             ssup->abbrev_abort = varstr_abbrev_abort;
    1754             :         }
    1755             :     }
    1756      137968 : }
    1757             : 
    1758             : /*
    1759             :  * sortsupport comparison func (for C locale case): bytewise order, shorter string first on a prefix tie
    1760             :  */
    1761             : static int
    1762    45406186 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
    1763             : {
    1764    45406186 :     VarString  *arg1 = DatumGetVarStringPP(x);
    1765    45406186 :     VarString  *arg2 = DatumGetVarStringPP(y);
    1766             :     char       *a1p,
    1767             :                *a2p;
    1768             :     int         len1,
    1769             :                 len2,
    1770             :                 result;
    1771             : 
    1772    45406186 :     a1p = VARDATA_ANY(arg1);
    1773    45406186 :     a2p = VARDATA_ANY(arg2);
    1774             : 
    1775    45406186 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1776    45406186 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1777             : 
    1778    45406186 :     result = memcmp(a1p, a2p, Min(len1, len2));
    1779    45406186 :     if ((result == 0) && (len1 != len2))
    1780     1209872 :         result = (len1 < len2) ? -1 : 1;
    1781             : 
    1782             :     /* We can't afford to leak memory here: pfree whatever is not the caller's original datum. */
    1783    45406186 :     if (PointerGetDatum(arg1) != x)
    1784           0 :         pfree(arg1);
    1785    45406186 :     if (PointerGetDatum(arg2) != y)
    1786           0 :         pfree(arg2);
    1787             : 
    1788    45406186 :     return result;
    1789             : }
    1790             : 
    1791             : /*
    1792             :  * sortsupport comparison func (for BpChar C locale case)
    1793             :  *
    1794             :  * BpChar outsources its sortsupport to this module.  Specialization for the
    1795             :  * varstr_sortsupport BpChar case, modeled on internal_bpchar_pattern_compare():
    1796             :  * lengths are taken from bpchartruelen(), then the comparison is plain memcmp().
    1797             :  */
    1798             : static int
    1799       62420 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
    1800             : {
    1801       62420 :     BpChar     *arg1 = DatumGetBpCharPP(x);
    1802       62420 :     BpChar     *arg2 = DatumGetBpCharPP(y);
    1803             :     char       *a1p,
    1804             :                *a2p;
    1805             :     int         len1,
    1806             :                 len2,
    1807             :                 result;
    1808             : 
    1809       62420 :     a1p = VARDATA_ANY(arg1);
    1810       62420 :     a2p = VARDATA_ANY(arg2);
    1811             : 
    1812       62420 :     len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
    1813       62420 :     len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
    1814             : 
    1815       62420 :     result = memcmp(a1p, a2p, Min(len1, len2));
    1816       62420 :     if ((result == 0) && (len1 != len2))
    1817           4 :         result = (len1 < len2) ? -1 : 1;
    1818             : 
    1819             :     /* We can't afford to leak memory here: pfree whatever is not the caller's original datum. */
    1820       62420 :     if (PointerGetDatum(arg1) != x)
    1821           0 :         pfree(arg1);
    1822       62420 :     if (PointerGetDatum(arg2) != y)
    1823           0 :         pfree(arg2);
    1824             : 
    1825       62420 :     return result;
    1826             : }
    1827             : 
    1828             : /*
    1829             :  * sortsupport comparison func (for NAME C locale case): strncmp() bounded by NAMEDATALEN
    1830             :  */
    1831             : static int
    1832    41163888 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
    1833             : {
    1834    41163888 :     Name        arg1 = DatumGetName(x);
    1835    41163888 :     Name        arg2 = DatumGetName(y);
    1836             : 
    1837    41163888 :     return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
    1838             : }
    1839             : 
    1840             : /*
    1841             :  * sortsupport comparison func (for locale case with all varlena types); wraps varstrfastcmp_locale()
    1842             :  */
    1843             : static int
    1844    36103326 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
    1845             : {
    1846    36103326 :     VarString  *arg1 = DatumGetVarStringPP(x);
    1847    36103326 :     VarString  *arg2 = DatumGetVarStringPP(y);
    1848             :     char       *a1p,
    1849             :                *a2p;
    1850             :     int         len1,
    1851             :                 len2,
    1852             :                 result;
    1853             : 
    1854    36103326 :     a1p = VARDATA_ANY(arg1);
    1855    36103326 :     a2p = VARDATA_ANY(arg2);
    1856             : 
    1857    36103326 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1858    36103326 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1859             : 
    1860    36103326 :     result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
    1861             : 
    1862             :     /* We can't afford to leak memory here: pfree whatever is not the caller's original datum. */
    1863    36103326 :     if (PointerGetDatum(arg1) != x)
    1864           0 :         pfree(arg1);
    1865    36103326 :     if (PointerGetDatum(arg2) != y)
    1866           0 :         pfree(arg2);
    1867             : 
    1868    36103326 :     return result;
    1869             : }
    1870             : 
    1871             : /*
    1872             :  * sortsupport comparison func (for locale case with NAME type); lengths are taken with strlen()
    1873             :  */
    1874             : static int
    1875           0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
    1876             : {
    1877           0 :     Name        arg1 = DatumGetName(x);
    1878           0 :     Name        arg2 = DatumGetName(y);
    1879             : 
    1880           0 :     return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
    1881           0 :                                 NameStr(*arg2), strlen(NameStr(*arg2)),
    1882             :                                 ssup);
    1883             : }
    1884             : 
    1885             : /*
    1886             :  * sortsupport comparison func for locale cases; works on NUL-terminated copies kept in sss->buf1/buf2
    1887             :  */
    1888             : static int
    1889    36103326 : varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
    1890             : {
    1891    36103326 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    1892             :     int         result;
    1893             :     bool        arg1_match;
    1894             : 
    1895             :     /* Fast pre-check for equality, as discussed in varstr_cmp() */
    1896    36103326 :     if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
    1897             :     {
    1898             :         /*
    1899             :          * No change in buf1 or buf2 contents, so avoid changing last_len1 or
    1900             :          * last_len2.  Existing contents of buffers might still be used by
    1901             :          * next call.
    1902             :          *
    1903             :          * It's fine to allow the comparison of BpChar padding bytes here,
    1904             :          * even though that implies that the memcmp() will usually be
    1905             :          * performed for BpChar callers (though multibyte characters could
    1906             :          * still prevent that from occurring).  The memcmp() is still very
    1907             :          * cheap, and BpChar's funny semantics have us remove trailing spaces
    1908             :          * (not limited to padding), so we need not distinguish between
    1909             :          * padding space characters and "real" space characters.
    1910             :          */
    1911     9298308 :         return 0;
    1912             :     }
    1913             : 
    1914    26805018 :     if (sss->typid == BPCHAROID)
    1915             :     {
    1916             :         /* Get true number of bytes, ignoring trailing spaces */
    1917       34546 :         len1 = bpchartruelen(a1p, len1);
    1918       34546 :         len2 = bpchartruelen(a2p, len2);
    1919             :     }
    1920             :     /* Grow either buffer if needed; ">=" keeps a byte free for the terminating NUL. */
    1921    26805018 :     if (len1 >= sss->buflen1)
    1922             :     {
    1923          10 :         sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    1924          10 :         sss->buf1 = repalloc(sss->buf1, sss->buflen1);
    1925             :     }
    1926    26805018 :     if (len2 >= sss->buflen2)
    1927             :     {
    1928           6 :         sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
    1929           6 :         sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    1930             :     }
    1931             : 
    1932             :     /*
    1933             :      * We're likely to be asked to compare the same strings repeatedly, and
    1934             :      * memcmp() is so much cheaper than strcoll() that it pays to try to cache
    1935             :      * comparisons, even though in general there is no reason to think that
    1936             :      * that will work out (every string datum may be unique).  Caching does
    1937             :      * not slow things down measurably when it doesn't work out, and can speed
    1938             :      * things up by rather a lot when it does.  In part, this is because the
    1939             :      * memcmp() compares data from cachelines that are needed in L1 cache even
    1940             :      * when the last comparison's result cannot be reused.
    1941             :      */
    1942    26805018 :     arg1_match = true;
    1943    26805018 :     if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
    1944             :     {
    1945    24797978 :         arg1_match = false;
    1946    24797978 :         memcpy(sss->buf1, a1p, len1);
    1947    24797978 :         sss->buf1[len1] = '\0';
    1948    24797978 :         sss->last_len1 = len1;
    1949             :     }
    1950             : 
    1951             :     /*
    1952             :      * If we're comparing the same two strings as last time, we can return the
    1953             :      * same answer without calling strcoll() again.  This is more likely than
    1954             :      * it seems (at least with moderate to low cardinality sets), because
    1955             :      * quicksort compares the same pivot against many values.
    1956             :      */
    1957    26805018 :     if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
    1958             :     {
    1959     4079656 :         memcpy(sss->buf2, a2p, len2);
    1960     4079656 :         sss->buf2[len2] = '\0';
    1961     4079656 :         sss->last_len2 = len2;
    1962             :     }
    1963    22725362 :     else if (arg1_match && !sss->cache_blob)
    1964             :     {
    1965             :         /* Use result cached following last actual strcoll() call */
    1966     1580796 :         return sss->last_returned;
    1967             :     }
    1968             : 
    1969    25224222 :     result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
    1970             : 
    1971             :     /* Break tie if necessary: a deterministic locale falls back to strcmp(). */
    1972    25224222 :     if (result == 0 && sss->locale->deterministic)
    1973           0 :         result = strcmp(sss->buf1, sss->buf2);
    1974             : 
    1975             :     /* Cache result, perhaps saving an expensive strcoll() call next time */
    1976    25224222 :     sss->cache_blob = false;
    1977    25224222 :     sss->last_returned = result;
    1978    25224222 :     return result;
    1979             : }
    1980             : 
    1981             : /*
    1982             :  * Conversion routine for sortsupport.  Converts original to abbreviated key
    1983             :  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
    1984             :  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
    1985             :  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
    1986             :  * locale is used, or in case of bytea, just memcpy() from original instead.
    1987             :  */
    1988             : static Datum
    1989      840244 : varstr_abbrev_convert(Datum original, SortSupport ssup)
    1990             : {
    1991      840244 :     const size_t max_prefix_bytes = sizeof(Datum);
    1992      840244 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    1993      840244 :     VarString  *authoritative = DatumGetVarStringPP(original);
    1994      840244 :     char       *authoritative_data = VARDATA_ANY(authoritative);
    1995             : 
    1996             :     /* working state */
    1997             :     Datum       res;
    1998             :     char       *pres;
    1999             :     int         len;
    2000             :     uint32      hash;
    2001             : 
    2002      840244 :     pres = (char *) &res;
    2003             :     /* memset(), so any non-overwritten bytes are NUL */
    2004      840244 :     memset(pres, 0, max_prefix_bytes);
    2005      840244 :     len = VARSIZE_ANY_EXHDR(authoritative);
    2006             : 
    2007             :     /* Get number of bytes, ignoring trailing spaces */
    2008      840244 :     if (sss->typid == BPCHAROID)
    2009        1010 :         len = bpchartruelen(authoritative_data, len);
    2010             : 
    2011             :     /*
    2012             :      * If we're using the C collation, use memcpy(), rather than strxfrm(), to
    2013             :      * abbreviate keys.  The full comparator for the C locale is always
    2014             :      * memcmp().  It would be incorrect to allow bytea callers (callers that
    2015             :      * always force the C collation -- bytea isn't a collatable type, but this
    2016             :      * approach is convenient) to use strxfrm().  This is because bytea
    2017             :      * strings may contain NUL bytes.  Besides, this should be faster, too.
    2018             :      *
    2019             :      * More generally, it's okay that bytea callers can have NUL bytes in
    2020             :      * strings because abbreviated cmp need not make a distinction between
    2021             :      * terminating NUL bytes, and NUL bytes representing actual NULs in the
    2022             :      * authoritative representation.  Hopefully a comparison at or past one
    2023             :      * abbreviated key's terminating NUL byte will resolve the comparison
    2024             :      * without consulting the authoritative representation; specifically, some
    2025             :      * later non-NUL byte in the longer string can resolve the comparison
    2026             :      * against a subsequent terminating NUL in the shorter string.  There will
    2027             :      * usually be what is effectively a "length-wise" resolution there and
    2028             :      * then.
    2029             :      *
    2030             :      * If that doesn't work out -- if all bytes in the longer string
    2031             :      * positioned at or past the offset of the smaller string's (first)
    2032             :      * terminating NUL are actually representative of NUL bytes in the
    2033             :      * authoritative binary string (perhaps with some *terminating* NUL bytes
    2034             :      * towards the end of the longer string iff it happens to still be small)
    2035             :      * -- then an authoritative tie-breaker will happen, and do the right
    2036             :      * thing: explicitly consider string length.
    2037             :      */
    2038      840244 :     if (sss->collate_c)
    2039      838408 :         memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
    2040             :     else
    2041             :     {
    2042             :         Size        bsize;
    2043             : 
    2044             :         /*
    2045             :          * We're not using the C collation, so fall back on strxfrm or ICU
    2046             :          * analogs.
    2047             :          */
    2048             : 
    2049             :         /* By convention, we use buffer 1 to store and NUL-terminate */
    2050        1836 :         if (len >= sss->buflen1)
    2051             :         {
    2052           0 :             sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    2053           0 :             sss->buf1 = repalloc(sss->buf1, sss->buflen1);
    2054             :         }
    2055             : 
    2056             :         /* Might be able to reuse strxfrm() blob from last call (still held in buf2) */
    2057        1836 :         if (sss->last_len1 == len && sss->cache_blob &&
    2058         918 :             memcmp(sss->buf1, authoritative_data, len) == 0)
    2059             :         {
    2060         168 :             memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
    2061             :             /* No change affecting cardinality, so no hashing required */
    2062         168 :             goto done;
    2063             :         }
    2064             : 
    2065        1668 :         memcpy(sss->buf1, authoritative_data, len);
    2066             : 
    2067             :         /*
    2068             :          * pg_strxfrm() and pg_strxfrm_prefix() expect NUL-terminated strings.
    2069             :          */
    2070        1668 :         sss->buf1[len] = '\0';
    2071        1668 :         sss->last_len1 = len;
    2072             : 
    2073        1668 :         if (pg_strxfrm_prefix_enabled(sss->locale))
    2074             :         {
    2075        1668 :             if (sss->buflen2 < max_prefix_bytes)
    2076             :             {
    2077           0 :                 sss->buflen2 = Max(max_prefix_bytes,
    2078             :                                    Min(sss->buflen2 * 2, MaxAllocSize));
    2079           0 :                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    2080             :             }
    2081             : 
    2082        1668 :             bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
    2083             :                                       max_prefix_bytes, sss->locale);
    2084        1668 :             sss->last_len2 = bsize;
    2085             :         }
    2086             :         else
    2087             :         {
    2088             :             /*
    2089             :              * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
    2090             :              * again.  The pg_strxfrm() function leaves the result buffer
    2091             :              * content undefined if the result did not fit, so we need to
    2092             :              * retry until everything fits, even though we only need the first
    2093             :              * few bytes in the end.
    2094             :              */
    2095             :             for (;;)
    2096             :             {
    2097           0 :                 bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
    2098             :                                    sss->locale);
    2099             : 
    2100           0 :                 sss->last_len2 = bsize;
    2101           0 :                 if (bsize < sss->buflen2)
    2102           0 :                     break;
    2103             : 
    2104             :                 /*
    2105             :                  * Grow buffer and retry.
    2106             :                  */
    2107           0 :                 sss->buflen2 = Max(bsize + 1,
    2108             :                                    Min(sss->buflen2 * 2, MaxAllocSize));
    2109           0 :                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    2110             :             }
    2111             :         }
    2112             : 
    2113             :         /*
    2114             :          * Every Datum byte is always compared.  This is safe because the
    2115             :          * strxfrm() blob is itself NUL terminated, leaving no danger of
    2116             :          * misinterpreting any NUL bytes not intended to be interpreted as
    2117             :          * logically representing termination.
    2118             :          *
    2119             :          * (Actually, even if there were NUL bytes in the blob it would be
    2120             :          * okay.  See remarks on bytea case above.)
    2121             :          */
    2122        1668 :         memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
    2123             :     }
    2124             : 
    2125             :     /*
    2126             :      * Maintain approximate cardinality of both abbreviated keys and original,
    2127             :      * authoritative keys using HyperLogLog.  Used as cheap insurance against
    2128             :      * the worst case, where we do many string transformations for no saving
    2129             :      * in full strcoll()-based comparisons.  These statistics are used by
    2130             :      * varstr_abbrev_abort().
    2131             :      *
    2132             :      * First, hash key proper, or a significant fraction of it.  Mix in length
    2133             :      * in order to compensate for cases where differences are past
    2134             :      * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
    2135             :      */
    2136      840076 :     hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
    2137             :                                    Min(len, PG_CACHE_LINE_SIZE)));
    2138             : 
    2139      840076 :     if (len > PG_CACHE_LINE_SIZE)
    2140         192 :         hash ^= DatumGetUInt32(hash_uint32((uint32) len));
    2141             : 
    2142      840076 :     addHyperLogLog(&sss->full_card, hash);
    2143             : 
    2144             :     /* Hash abbreviated key; the upper 32 bits are XORed in before hashing */
    2145             :     {
    2146             :         uint32      tmp;
    2147             : 
    2148      840076 :         tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
    2149      840076 :         hash = DatumGetUInt32(hash_uint32(tmp));
    2150             :     }
    2151             : 
    2152      840076 :     addHyperLogLog(&sss->abbr_card, hash);
    2153             : 
    2154             :     /* Cache result, perhaps saving an expensive strxfrm() call next time */
    2155      840076 :     sss->cache_blob = true;
    2156      840244 : done:
    2157             : 
    2158             :     /*
    2159             :      * Byteswap on little-endian machines.
    2160             :      *
    2161             :      * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
    2162             :      * 3-way comparator) works correctly on all platforms.  If we didn't do
    2163             :      * this, the comparator would have to call memcmp() with a pair of
    2164             :      * pointers to the first byte of each abbreviated key, which is slower.
    2165             :      */
    2166      840244 :     res = DatumBigEndianToNative(res);
    2167             : 
    2168             :     /* Don't leak memory here: pfree authoritative unless it is the caller's own datum */
    2169      840244 :     if (PointerGetDatum(authoritative) != original)
    2170           2 :         pfree(authoritative);
    2171             : 
    2172      840244 :     return res;
    2173             : }
    2174             : 
    2175             : /*
    2176             :  * Callback for estimating effectiveness of abbreviated key optimization, using
    2177             :  * heuristic rules.  Returns true if the abbreviation optimization should be
    2178             :  * aborted, based on its projected effectiveness; false to keep using it.
    2179             :  */
    2180             : static bool
    2181        2304 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
    2182             : {
    2183        2304 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2184             :     double      abbrev_distinct,
    2185             :                 key_distinct;
    2186             : 
    2187             :     Assert(ssup->abbreviate);
    2188             : 
    2189             :     /* Have a little patience: never abort before 100 tuples have been seen */
    2190        2304 :     if (memtupcount < 100)
    2191        1306 :         return false;
    2192             : 
    2193         998 :     abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
    2194         998 :     key_distinct = estimateHyperLogLog(&sss->full_card);
    2195             : 
    2196             :     /*
    2197             :      * Clamp cardinality estimates to at least one distinct value.  While
    2198             :      * NULLs are generally disregarded, if only NULL values were seen so far,
    2199             :      * that might misrepresent costs if we failed to clamp.
    2200             :      */
    2201         998 :     if (abbrev_distinct <= 1.0)
    2202           0 :         abbrev_distinct = 1.0;
    2203             : 
    2204         998 :     if (key_distinct <= 1.0)
    2205           0 :         key_distinct = 1.0;
    2206             : 
    2207             :     /*
    2208             :      * In the worst case all abbreviated keys are identical, while at the same
    2209             :      * time there are differences within full key strings not captured in
    2210             :      * abbreviations.
    2211             :      */
    2212         998 :     if (trace_sort)
    2213             :     {
    2214           0 :         double      norm_abbrev_card = abbrev_distinct / (double) memtupcount;
    2215             : 
    2216           0 :         elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
    2217             :              "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
    2218             :              memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
    2219             :              sss->prop_card);
    2220             :     }
    2221             : 
    2222             :     /*
    2223             :      * If the number of distinct abbreviated keys approximately matches the
    2224             :      * number of distinct authoritative original keys, that's reason enough to
    2225             :      * proceed.  We can win even with a very low cardinality set if most
    2226             :      * tie-breakers only memcmp().  This is by far the most important
    2227             :      * consideration.
    2228             :      *
    2229             :      * While comparisons that are resolved at the abbreviated key level are
    2230             :      * considerably cheaper than tie-breakers resolved with memcmp(), both of
    2231             :      * those two outcomes are so much cheaper than a full strcoll() once
    2232             :      * sorting is underway that it doesn't seem worth it to weigh abbreviated
    2233             :      * cardinality against the overall size of the set in order to more
    2234             :      * accurately model costs.  Assume that an abbreviated comparison, and an
    2235             :      * abbreviated comparison with a cheap memcmp()-based authoritative
    2236             :      * resolution are equivalent.
    2237             :      */
    2238         998 :     if (abbrev_distinct > key_distinct * sss->prop_card)
    2239             :     {
    2240             :         /*
    2241             :          * When we have exceeded 10,000 tuples, decay required cardinality
    2242             :          * aggressively for next call.
    2243             :          *
    2244             :          * This is useful because the number of comparisons required on
    2245             :          * average increases at a linearithmic rate, and at roughly 10,000
    2246             :          * tuples that factor will start to dominate over the linear costs of
    2247             :          * string transformation (this is a conservative estimate).  The decay
    2248             :          * rate is chosen to be a little less aggressive than halving -- which
    2249             :          * (since we're called at points at which memtupcount has doubled)
    2250             :          * would never see the cost model actually abort past the first call
    2251             :          * following a decay.  This decay rate is mostly a precaution against
    2252             :          * a sudden, violent swing in how well abbreviated cardinality tracks
    2253             :          * full key cardinality.  The decay also serves to prevent a marginal
    2254             :          * case from being aborted too late, when too much has already been
    2255             :          * invested in string transformation.
    2256             :          *
    2257             :          * It's possible for sets of several million distinct strings with
    2258             :          * mere tens of thousands of distinct abbreviated keys to still
    2259             :          * benefit very significantly.  This will generally occur provided
    2260             :          * each abbreviated key is a proxy for a roughly uniform number of the
    2261             :          * set's full keys. If it isn't so, we hope to catch that early and
    2262             :          * abort.  If it isn't caught early, by the time the problem is
    2263             :          * apparent it's probably not worth aborting.
    2264             :          */
    2265         998 :         if (memtupcount > 10000)
    2266           4 :             sss->prop_card *= 0.65;
    2267             : 
    2268         998 :         return false;
    2269             :     }
    2270             : 
    2271             :     /*
    2272             :      * Abort abbreviation strategy.
    2273             :      *
    2274             :      * The worst case, where all abbreviated keys are identical while all
    2275             :      * original strings differ will typically only see a regression of about
    2276             :      * 10% in execution time for small to medium sized lists of strings.
    2277             :      * Whereas on modern CPUs where cache stalls are the dominant cost, we can
    2278             :      * often expect very large improvements, particularly with sets of strings
    2279             :      * of moderately high to high abbreviated cardinality.  There is little to
    2280             :      * lose but much to gain, which our strategy reflects.
    2281             :      */
    2282           0 :     if (trace_sort)
    2283           0 :         elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
    2284             :              "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
    2285             :              memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
    2286             : 
    2287           0 :     return true;
    2288             : }
    2289             : 
    2290             : /*
    2291             :  * Generic equalimage support function for character type's operator classes.
    2292             :  * Returns the collation's "deterministic" flag, which disables the use of deduplication with nondeterministic collations.
    2293             :  */
    2294             : Datum
    2295        8968 : btvarstrequalimage(PG_FUNCTION_ARGS)
    2296             : {
    2297             :     /* Oid      opcintype = PG_GETARG_OID(0); */
    2298        8968 :     Oid         collid = PG_GET_COLLATION();
    2299             :     pg_locale_t locale;
    2300             : 
    2301        8968 :     check_collation_set(collid);
    2302             : 
    2303        8968 :     locale = pg_newlocale_from_collation(collid);
    2304             : 
    2305        8968 :     PG_RETURN_BOOL(locale->deterministic);
    2306             : }
    2307             : 
    2308             : Datum
    2309      229560 : text_larger(PG_FUNCTION_ARGS)
    2310             : {
    2311      229560 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2312      229560 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2313             :     text       *result;
    2314             : 
    2315      229560 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
    2316             : 
    2317      229560 :     PG_RETURN_TEXT_P(result);
    2318             : }
    2319             : 
    2320             : Datum
    2321       86076 : text_smaller(PG_FUNCTION_ARGS)
    2322             : {
    2323       86076 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2324       86076 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2325             :     text       *result;
    2326             : 
    2327       86076 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
    2328             : 
    2329       86076 :     PG_RETURN_TEXT_P(result);
    2330             : }
    2331             : 
    2332             : 
    2333             : /*
    2334             :  * Cross-type comparison functions for types text and name.
    2335             :  */
    2336             : 
    2337             : Datum
    2338      189914 : nameeqtext(PG_FUNCTION_ARGS)
    2339             : {
    2340      189914 :     Name        arg1 = PG_GETARG_NAME(0);
    2341      189914 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2342      189914 :     size_t      len1 = strlen(NameStr(*arg1));
    2343      189914 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2344      189914 :     Oid         collid = PG_GET_COLLATION();
    2345             :     bool        result;
    2346             : 
    2347      189914 :     check_collation_set(collid);
    2348             : 
    2349      189914 :     if (collid == C_COLLATION_OID)
    2350      255794 :         result = (len1 == len2 &&
    2351      124084 :                   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2352             :     else
    2353       58204 :         result = (varstr_cmp(NameStr(*arg1), len1,
    2354       58204 :                              VARDATA_ANY(arg2), len2,
    2355             :                              collid) == 0);
    2356             : 
    2357      189914 :     PG_FREE_IF_COPY(arg2, 1);
    2358             : 
    2359      189914 :     PG_RETURN_BOOL(result);
    2360             : }
    2361             : 
    2362             : Datum
    2363        7868 : texteqname(PG_FUNCTION_ARGS)
    2364             : {
    2365        7868 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2366        7868 :     Name        arg2 = PG_GETARG_NAME(1);
    2367        7868 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2368        7868 :     size_t      len2 = strlen(NameStr(*arg2));
    2369        7868 :     Oid         collid = PG_GET_COLLATION();
    2370             :     bool        result;
    2371             : 
    2372        7868 :     check_collation_set(collid);
    2373             : 
    2374        7868 :     if (collid == C_COLLATION_OID)
    2375         568 :         result = (len1 == len2 &&
    2376         182 :                   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2377             :     else
    2378        7482 :         result = (varstr_cmp(VARDATA_ANY(arg1), len1,
    2379        7482 :                              NameStr(*arg2), len2,
    2380             :                              collid) == 0);
    2381             : 
    2382        7868 :     PG_FREE_IF_COPY(arg1, 0);
    2383             : 
    2384        7868 :     PG_RETURN_BOOL(result);
    2385             : }
    2386             : 
    2387             : Datum
    2388          18 : namenetext(PG_FUNCTION_ARGS)
    2389             : {
    2390          18 :     Name        arg1 = PG_GETARG_NAME(0);
    2391          18 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2392          18 :     size_t      len1 = strlen(NameStr(*arg1));
    2393          18 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2394          18 :     Oid         collid = PG_GET_COLLATION();
    2395             :     bool        result;
    2396             : 
    2397          18 :     check_collation_set(collid);
    2398             : 
    2399          18 :     if (collid == C_COLLATION_OID)
    2400           0 :         result = !(len1 == len2 &&
    2401           0 :                    memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2402             :     else
    2403          18 :         result = !(varstr_cmp(NameStr(*arg1), len1,
    2404          18 :                               VARDATA_ANY(arg2), len2,
    2405             :                               collid) == 0);
    2406             : 
    2407          18 :     PG_FREE_IF_COPY(arg2, 1);
    2408             : 
    2409          18 :     PG_RETURN_BOOL(result);
    2410             : }
    2411             : 
    2412             : Datum
    2413          18 : textnename(PG_FUNCTION_ARGS)
    2414             : {
    2415          18 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2416          18 :     Name        arg2 = PG_GETARG_NAME(1);
    2417          18 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2418          18 :     size_t      len2 = strlen(NameStr(*arg2));
    2419          18 :     Oid         collid = PG_GET_COLLATION();
    2420             :     bool        result;
    2421             : 
    2422          18 :     check_collation_set(collid);
    2423             : 
    2424          18 :     if (collid == C_COLLATION_OID)
    2425           0 :         result = !(len1 == len2 &&
    2426           0 :                    memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2427             :     else
    2428          18 :         result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
    2429          18 :                               NameStr(*arg2), len2,
    2430             :                               collid) == 0);
    2431             : 
    2432          18 :     PG_FREE_IF_COPY(arg1, 0);
    2433             : 
    2434          18 :     PG_RETURN_BOOL(result);
    2435             : }
    2436             : 
    2437             : Datum
    2438      122424 : btnametextcmp(PG_FUNCTION_ARGS)
    2439             : {
    2440      122424 :     Name        arg1 = PG_GETARG_NAME(0);
    2441      122424 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2442             :     int32       result;
    2443             : 
    2444      122424 :     result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
    2445      122424 :                         VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
    2446             :                         PG_GET_COLLATION());
    2447             : 
    2448      122424 :     PG_FREE_IF_COPY(arg2, 1);
    2449             : 
    2450      122424 :     PG_RETURN_INT32(result);
    2451             : }
    2452             : 
    2453             : Datum
    2454          44 : bttextnamecmp(PG_FUNCTION_ARGS)
    2455             : {
    2456          44 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2457          44 :     Name        arg2 = PG_GETARG_NAME(1);
    2458             :     int32       result;
    2459             : 
    2460          44 :     result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
    2461          44 :                         NameStr(*arg2), strlen(NameStr(*arg2)),
    2462             :                         PG_GET_COLLATION());
    2463             : 
    2464          44 :     PG_FREE_IF_COPY(arg1, 0);
    2465             : 
    2466          44 :     PG_RETURN_INT32(result);
    2467             : }
    2468             : 
    2469             : #define CmpCall(cmpfunc) \
    2470             :     DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
    2471             :                                           PG_GET_COLLATION(), \
    2472             :                                           PG_GETARG_DATUM(0), \
    2473             :                                           PG_GETARG_DATUM(1)))
    2474             : 
    2475             : Datum
    2476       58384 : namelttext(PG_FUNCTION_ARGS)
    2477             : {
    2478       58384 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
    2479             : }
    2480             : 
    2481             : Datum
    2482           0 : nameletext(PG_FUNCTION_ARGS)
    2483             : {
    2484           0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
    2485             : }
    2486             : 
    2487             : Datum
    2488           0 : namegttext(PG_FUNCTION_ARGS)
    2489             : {
    2490           0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
    2491             : }
    2492             : 
    2493             : Datum
    2494       51826 : namegetext(PG_FUNCTION_ARGS)
    2495             : {
    2496       51826 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
    2497             : }
    2498             : 
    2499             : Datum
    2500           0 : textltname(PG_FUNCTION_ARGS)
    2501             : {
    2502           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
    2503             : }
    2504             : 
    2505             : Datum
    2506           0 : textlename(PG_FUNCTION_ARGS)
    2507             : {
    2508           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
    2509             : }
    2510             : 
    2511             : Datum
    2512           0 : textgtname(PG_FUNCTION_ARGS)
    2513             : {
    2514           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
    2515             : }
    2516             : 
    2517             : Datum
    2518           0 : textgename(PG_FUNCTION_ARGS)
    2519             : {
    2520           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
    2521             : }
    2522             : 
    2523             : #undef CmpCall
    2524             : 
    2525             : 
    2526             : /*
    2527             :  * The following operators support character-by-character comparison
    2528             :  * of text datums, to allow building indexes suitable for LIKE clauses.
    2529             :  * Note that the regular texteq/textne comparison operators, and regular
    2530             :  * support functions 1 and 2 with "C" collation are assumed to be
    2531             :  * compatible with these!
    2532             :  */
    2533             : 
    2534             : static int
    2535      160444 : internal_text_pattern_compare(text *arg1, text *arg2)
    2536             : {
    2537             :     int         result;
    2538             :     int         len1,
    2539             :                 len2;
    2540             : 
    2541      160444 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    2542      160444 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    2543             : 
    2544      160444 :     result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    2545      160444 :     if (result != 0)
    2546      160312 :         return result;
    2547         132 :     else if (len1 < len2)
    2548           0 :         return -1;
    2549         132 :     else if (len1 > len2)
    2550          84 :         return 1;
    2551             :     else
    2552          48 :         return 0;
    2553             : }
    2554             : 
    2555             : 
    2556             : Datum
    2557       47866 : text_pattern_lt(PG_FUNCTION_ARGS)
    2558             : {
    2559       47866 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2560       47866 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2561             :     int         result;
    2562             : 
    2563       47866 :     result = internal_text_pattern_compare(arg1, arg2);
    2564             : 
    2565       47866 :     PG_FREE_IF_COPY(arg1, 0);
    2566       47866 :     PG_FREE_IF_COPY(arg2, 1);
    2567             : 
    2568       47866 :     PG_RETURN_BOOL(result < 0);
    2569             : }
    2570             : 
    2571             : 
    2572             : Datum
    2573       37510 : text_pattern_le(PG_FUNCTION_ARGS)
    2574             : {
    2575       37510 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2576       37510 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2577             :     int         result;
    2578             : 
    2579       37510 :     result = internal_text_pattern_compare(arg1, arg2);
    2580             : 
    2581       37510 :     PG_FREE_IF_COPY(arg1, 0);
    2582       37510 :     PG_FREE_IF_COPY(arg2, 1);
    2583             : 
    2584       37510 :     PG_RETURN_BOOL(result <= 0);
    2585             : }
    2586             : 
    2587             : 
    2588             : Datum
    2589       37534 : text_pattern_ge(PG_FUNCTION_ARGS)
    2590             : {
    2591       37534 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2592       37534 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2593             :     int         result;
    2594             : 
    2595       37534 :     result = internal_text_pattern_compare(arg1, arg2);
    2596             : 
    2597       37534 :     PG_FREE_IF_COPY(arg1, 0);
    2598       37534 :     PG_FREE_IF_COPY(arg2, 1);
    2599             : 
    2600       37534 :     PG_RETURN_BOOL(result >= 0);
    2601             : }
    2602             : 
    2603             : 
    2604             : Datum
    2605       37510 : text_pattern_gt(PG_FUNCTION_ARGS)
    2606             : {
    2607       37510 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2608       37510 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2609             :     int         result;
    2610             : 
    2611       37510 :     result = internal_text_pattern_compare(arg1, arg2);
    2612             : 
    2613       37510 :     PG_FREE_IF_COPY(arg1, 0);
    2614       37510 :     PG_FREE_IF_COPY(arg2, 1);
    2615             : 
    2616       37510 :     PG_RETURN_BOOL(result > 0);
    2617             : }
    2618             : 
    2619             : 
    2620             : Datum
    2621          24 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
    2622             : {
    2623          24 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2624          24 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2625             :     int         result;
    2626             : 
    2627          24 :     result = internal_text_pattern_compare(arg1, arg2);
    2628             : 
    2629          24 :     PG_FREE_IF_COPY(arg1, 0);
    2630          24 :     PG_FREE_IF_COPY(arg2, 1);
    2631             : 
    2632          24 :     PG_RETURN_INT32(result);
    2633             : }
    2634             : 
    2635             : 
    2636             : Datum
    2637         116 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
    2638             : {
    2639         116 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    2640             :     MemoryContext oldcontext;
    2641             : 
    2642         116 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    2643             : 
    2644             :     /* Use generic string SortSupport, forcing "C" collation */
    2645         116 :     varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
    2646             : 
    2647         116 :     MemoryContextSwitchTo(oldcontext);
    2648             : 
    2649         116 :     PG_RETURN_VOID();
    2650             : }
    2651             : 
    2652             : 
    2653             : /* text_name()
    2654             :  * Converts a text type to a Name type.
    2655             :  */
    2656             : Datum
    2657       30870 : text_name(PG_FUNCTION_ARGS)
    2658             : {
    2659       30870 :     text       *s = PG_GETARG_TEXT_PP(0);
    2660             :     Name        result;
    2661             :     int         len;
    2662             : 
    2663       30870 :     len = VARSIZE_ANY_EXHDR(s);
    2664             : 
    2665             :     /* Truncate oversize input */
    2666       30870 :     if (len >= NAMEDATALEN)
    2667           6 :         len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
    2668             : 
    2669             :     /* We use palloc0 here to ensure result is zero-padded */
    2670       30870 :     result = (Name) palloc0(NAMEDATALEN);
    2671       30870 :     memcpy(NameStr(*result), VARDATA_ANY(s), len);
    2672             : 
    2673       30870 :     PG_RETURN_NAME(result);
    2674             : }
    2675             : 
    2676             : /* name_text()
    2677             :  * Converts a Name type to a text type.
    2678             :  */
    2679             : Datum
    2680      654648 : name_text(PG_FUNCTION_ARGS)
    2681             : {
    2682      654648 :     Name        s = PG_GETARG_NAME(0);
    2683             : 
    2684      654648 :     PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
    2685             : }
    2686             : 
    2687             : 
    2688             : /*
    2689             :  * textToQualifiedNameList - convert a text object to list of names
    2690             :  *
    2691             :  * This implements the input parsing needed by nextval() and other
    2692             :  * functions that take a text parameter representing a qualified name.
    2693             :  * We split the name at dots, downcase if not double-quoted, and
    2694             :  * truncate names if they're too long.
    2695             :  */
    2696             : List *
    2697        5430 : textToQualifiedNameList(text *textval)
    2698             : {
    2699             :     char       *rawname;
    2700        5430 :     List       *result = NIL;
    2701             :     List       *namelist;
    2702             :     ListCell   *l;
    2703             : 
    2704             :     /* Convert to C string (handles possible detoasting). */
    2705             :     /* Note we rely on being able to modify rawname below. */
    2706        5430 :     rawname = text_to_cstring(textval);
    2707             : 
    2708        5430 :     if (!SplitIdentifierString(rawname, '.', &namelist))
    2709           0 :         ereport(ERROR,
    2710             :                 (errcode(ERRCODE_INVALID_NAME),
    2711             :                  errmsg("invalid name syntax")));
    2712             : 
    2713        5430 :     if (namelist == NIL)
    2714           0 :         ereport(ERROR,
    2715             :                 (errcode(ERRCODE_INVALID_NAME),
    2716             :                  errmsg("invalid name syntax")));
    2717             : 
    2718       10976 :     foreach(l, namelist)
    2719             :     {
    2720        5546 :         char       *curname = (char *) lfirst(l);
    2721             : 
    2722        5546 :         result = lappend(result, makeString(pstrdup(curname)));
    2723             :     }
    2724             : 
    2725        5430 :     pfree(rawname);
    2726        5430 :     list_free(namelist);
    2727             : 
    2728        5430 :     return result;
    2729             : }
    2730             : 
    2731             : /*
    2732             :  * SplitIdentifierString --- parse a string containing identifiers
    2733             :  *
    2734             :  * This is the guts of textToQualifiedNameList, and is exported for use in
    2735             :  * other situations such as parsing GUC variables.  In the GUC case, it's
    2736             :  * important to avoid memory leaks, so the API is designed to minimize the
    2737             :  * amount of stuff that needs to be allocated and freed.
    2738             :  *
    2739             :  * Inputs:
    2740             :  *  rawstring: the input string; must be overwritable!  On return, it's
    2741             :  *             been modified to contain the separated identifiers.
    2742             :  *  separator: the separator punctuation expected between identifiers
    2743             :  *             (typically '.' or ',').  Whitespace may also appear around
    2744             :  *             identifiers.
    2745             :  * Outputs:
    2746             :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    2747             :  *            rawstring.  Caller should list_free() this even on error return.
    2748             :  *
    2749             :  * Returns true if okay, false if there is a syntax error in the string.
    2750             :  *
    2751             :  * Note that an empty string is considered okay here, though not in
    2752             :  * textToQualifiedNameList.
    2753             :  */
    2754             : bool
    2755      364358 : SplitIdentifierString(char *rawstring, char separator,
    2756             :                       List **namelist)
    2757             : {
    2758      364358 :     char       *nextp = rawstring;
    2759      364358 :     bool        done = false;
    2760             : 
    2761      364358 :     *namelist = NIL;
    2762             : 
    2763      364364 :     while (scanner_isspace(*nextp))
    2764           6 :         nextp++;                /* skip leading whitespace */
    2765             : 
    2766      364358 :     if (*nextp == '\0')
    2767       31644 :         return true;            /* empty string represents empty list */
    2768             : 
    2769             :     /* At the top of the loop, we are at start of a new identifier. */
    2770             :     do
    2771             :     {
    2772             :         char       *curname;
    2773             :         char       *endp;
    2774             : 
    2775      620630 :         if (*nextp == '"')
    2776             :         {
    2777             :             /* Quoted name --- collapse quote-quote pairs, no downcasing */
    2778       40794 :             curname = nextp + 1;
    2779             :             for (;;)
    2780             :             {
    2781       40798 :                 endp = strchr(nextp + 1, '"');
    2782       40796 :                 if (endp == NULL)
    2783           0 :                     return false;   /* mismatched quotes */
    2784       40796 :                 if (endp[1] != '"')
    2785       40794 :                     break;      /* found end of quoted name */
    2786             :                 /* Collapse adjacent quotes into one quote, and look again */
    2787           2 :                 memmove(endp, endp + 1, strlen(endp));
    2788           2 :                 nextp = endp;
    2789             :             }
    2790             :             /* endp now points at the terminating quote */
    2791       40794 :             nextp = endp + 1;
    2792             :         }
    2793             :         else
    2794             :         {
    2795             :             /* Unquoted name --- extends to separator or whitespace */
    2796             :             char       *downname;
    2797             :             int         len;
    2798             : 
    2799      579836 :             curname = nextp;
    2800     5304526 :             while (*nextp && *nextp != separator &&
    2801     4724692 :                    !scanner_isspace(*nextp))
    2802     4724690 :                 nextp++;
    2803      579836 :             endp = nextp;
    2804      579836 :             if (curname == nextp)
    2805           0 :                 return false;   /* empty unquoted name not allowed */
    2806             : 
    2807             :             /*
    2808             :              * Downcase the identifier, using same code as main lexer does.
    2809             :              *
    2810             :              * XXX because we want to overwrite the input in-place, we cannot
    2811             :              * support a downcasing transformation that increases the string
    2812             :              * length.  This is not a problem given the current implementation
    2813             :              * of downcase_truncate_identifier, but we'll probably have to do
    2814             :              * something about this someday.
    2815             :              */
    2816      579836 :             len = endp - curname;
    2817      579836 :             downname = downcase_truncate_identifier(curname, len, false);
    2818             :             Assert(strlen(downname) <= len);
    2819      579836 :             strncpy(curname, downname, len);    /* strncpy is required here */
    2820      579836 :             pfree(downname);
    2821             :         }
    2822             : 
    2823      620632 :         while (scanner_isspace(*nextp))
    2824           2 :             nextp++;            /* skip trailing whitespace */
    2825             : 
    2826      620630 :         if (*nextp == separator)
    2827             :         {
    2828      287916 :             nextp++;
    2829      550748 :             while (scanner_isspace(*nextp))
    2830      262832 :                 nextp++;        /* skip leading whitespace for next */
    2831             :             /* we expect another name, so done remains false */
    2832             :         }
    2833      332714 :         else if (*nextp == '\0')
    2834      332712 :             done = true;
    2835             :         else
    2836           2 :             return false;       /* invalid syntax */
    2837             : 
    2838             :         /* Now safe to overwrite separator with a null */
    2839      620628 :         *endp = '\0';
    2840             : 
    2841             :         /* Truncate name if it's overlength */
    2842      620628 :         truncate_identifier(curname, strlen(curname), false);
    2843             : 
    2844             :         /*
    2845             :          * Finished isolating current name --- add it to list
    2846             :          */
    2847      620628 :         *namelist = lappend(*namelist, curname);
    2848             : 
    2849             :         /* Loop back if we didn't reach end of string */
    2850      620628 :     } while (!done);
    2851             : 
    2852      332712 :     return true;
    2853             : }
    2854             : 
    2855             : 
    2856             : /*
    2857             :  * SplitDirectoriesString --- parse a string containing file/directory names
    2858             :  *
    2859             :  * This works fine on file names too; the function name is historical.
    2860             :  *
    2861             :  * This is similar to SplitIdentifierString, except that the parsing
    2862             :  * rules are meant to handle pathnames instead of identifiers: there is
    2863             :  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
    2864             :  * and we apply canonicalize_path() to each extracted string.  Because of the
    2865             :  * last, the returned strings are separately palloc'd rather than being
    2866             :  * pointers into rawstring --- but we still scribble on rawstring.
    2867             :  *
    2868             :  * Inputs:
    2869             :  *  rawstring: the input string; must be modifiable!
    2870             :  *  separator: the separator punctuation expected between directories
    2871             :  *             (typically ',' or ';').  Whitespace may also appear around
    2872             :  *             directories.
    2873             :  * Outputs:
    2874             :  *  namelist: filled with a palloc'd list of directory names.
    2875             :  *            Caller should list_free_deep() this even on error return.
    2876             :  *
    2877             :  * Returns true if okay, false if there is a syntax error in the string.
    2878             :  *
    2879             :  * Note that an empty string is considered okay here.
    2880             :  */
    2881             : bool
    2882        1856 : SplitDirectoriesString(char *rawstring, char separator,
    2883             :                        List **namelist)
    2884             : {
    2885        1856 :     char       *nextp = rawstring;
    2886        1856 :     bool        done = false;
    2887             : 
    2888        1856 :     *namelist = NIL;
    2889             : 
    2890        1856 :     while (scanner_isspace(*nextp))
    2891           0 :         nextp++;                /* skip leading whitespace */
    2892             : 
    2893        1856 :     if (*nextp == '\0')
    2894           2 :         return true;            /* empty string represents empty list */
    2895             : 
    2896             :     /* At the top of the loop, we are at start of a new directory. */
    2897             :     do
    2898             :     {
    2899             :         char       *curname;
    2900             :         char       *endp;
    2901             : 
    2902        1864 :         if (*nextp == '"')
    2903             :         {
    2904             :             /* Quoted name --- collapse quote-quote pairs */
    2905           0 :             curname = nextp + 1;
    2906             :             for (;;)
    2907             :             {
    2908           0 :                 endp = strchr(nextp + 1, '"');
    2909           0 :                 if (endp == NULL)
    2910           0 :                     return false;   /* mismatched quotes */
    2911           0 :                 if (endp[1] != '"')
    2912           0 :                     break;      /* found end of quoted name */
    2913             :                 /* Collapse adjacent quotes into one quote, and look again */
    2914           0 :                 memmove(endp, endp + 1, strlen(endp));
    2915           0 :                 nextp = endp;
    2916             :             }
    2917             :             /* endp now points at the terminating quote */
    2918           0 :             nextp = endp + 1;
    2919             :         }
    2920             :         else
    2921             :         {
    2922             :             /* Unquoted name --- extends to separator or end of string */
    2923        1864 :             curname = endp = nextp;
    2924       31230 :             while (*nextp && *nextp != separator)
    2925             :             {
    2926             :                 /* trailing whitespace should not be included in name */
    2927       29366 :                 if (!scanner_isspace(*nextp))
    2928       29366 :                     endp = nextp + 1;
    2929       29366 :                 nextp++;
    2930             :             }
    2931        1864 :             if (curname == endp)
    2932           0 :                 return false;   /* empty unquoted name not allowed */
    2933             :         }
    2934             : 
    2935        1864 :         while (scanner_isspace(*nextp))
    2936           0 :             nextp++;            /* skip trailing whitespace */
    2937             : 
    2938        1864 :         if (*nextp == separator)
    2939             :         {
    2940          10 :             nextp++;
    2941          16 :             while (scanner_isspace(*nextp))
    2942           6 :                 nextp++;        /* skip leading whitespace for next */
    2943             :             /* we expect another name, so done remains false */
    2944             :         }
    2945        1854 :         else if (*nextp == '\0')
    2946        1854 :             done = true;
    2947             :         else
    2948           0 :             return false;       /* invalid syntax */
    2949             : 
    2950             :         /* Now safe to overwrite separator with a null */
    2951        1864 :         *endp = '\0';
    2952             : 
    2953             :         /* Truncate path if it's overlength */
    2954        1864 :         if (strlen(curname) >= MAXPGPATH)
    2955           0 :             curname[MAXPGPATH - 1] = '\0';
    2956             : 
    2957             :         /*
    2958             :          * Finished isolating current name --- add it to list
    2959             :          */
    2960        1864 :         curname = pstrdup(curname);
    2961        1864 :         canonicalize_path(curname);
    2962        1864 :         *namelist = lappend(*namelist, curname);
    2963             : 
    2964             :         /* Loop back if we didn't reach end of string */
    2965        1864 :     } while (!done);
    2966             : 
    2967        1854 :     return true;
    2968             : }
    2969             : 
    2970             : 
    2971             : /*
    2972             :  * SplitGUCList --- parse a string containing identifiers or file names
    2973             :  *
    2974             :  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
    2975             :  * presuming whether the elements will be taken as identifiers or file names.
    2976             :  * We assume the input has already been through flatten_set_variable_args(),
    2977             :  * so that we need never downcase (if appropriate, that was done already).
    2978             :  * Nor do we ever truncate, since we don't know the correct max length.
    2979             :  * We disallow embedded whitespace in unquoted names for simplicity (it shouldn't
    2980             :  * matter, because any embedded whitespace should have led to double-quoting).
    2981             :  * Otherwise the API is identical to SplitIdentifierString.
    2982             :  *
    2983             :  * XXX it's annoying to have so many copies of this string-splitting logic.
    2984             :  * However, it's not clear that having one function with a bunch of option
    2985             :  * flags would be much better.
    2986             :  *
    2987             :  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
    2988             :  * Be sure to update that if you have to change this.
    2989             :  *
    2990             :  * Inputs:
    2991             :  *  rawstring: the input string; must be overwritable!  On return, it's
    2992             :  *             been modified to contain the separated identifiers.
    2993             :  *  separator: the separator punctuation expected between identifiers
    2994             :  *             (typically '.' or ',').  Whitespace may also appear around
    2995             :  *             identifiers.
    2996             :  * Outputs:
    2997             :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    2998             :  *            rawstring.  Caller should list_free() this even on error return.
    2999             :  *
    3000             :  * Returns true if okay, false on a syntax error (unterminated quote, empty unquoted name, or stray text following a name).
    3001             :  */
    3002             : bool
    3003        4096 : SplitGUCList(char *rawstring, char separator,
    3004             :              List **namelist)
    3005             : {
    3006        4096 :     char       *nextp = rawstring;
    3007        4096 :     bool        done = false;
    3008             : 
    3009        4096 :     *namelist = NIL;    /* set before any return, so caller can always list_free() it */
    3010             : 
    3011        4096 :     while (scanner_isspace(*nextp))
    3012           0 :         nextp++;                /* skip leading whitespace */
    3013             : 
    3014        4096 :     if (*nextp == '\0')
    3015        4020 :         return true;            /* empty string represents empty list */
    3016             : 
    3017             :     /* At the top of the loop, we are at start of a new identifier. */
    3018             :     do
    3019             :     {
    3020             :         char       *curname;
    3021             :         char       *endp;
    3022             : 
    3023         102 :         if (*nextp == '"')
    3024             :         {
    3025             :             /* Quoted name --- collapse quote-quote pairs */
    3026          24 :             curname = nextp + 1;    /* name begins just past the opening quote */
    3027             :             for (;;)
    3028             :             {
    3029          36 :                 endp = strchr(nextp + 1, '"');  /* next quote past the scan point */
    3030          30 :                 if (endp == NULL)
    3031           0 :                     return false;   /* mismatched quotes */
    3032          30 :                 if (endp[1] != '"')
    3033          24 :                     break;      /* found end of quoted name */
    3034             :                 /* Collapse adjacent quotes into one quote, and look again */
    3035           6 :                 memmove(endp, endp + 1, strlen(endp));  /* moves the tail, including its NUL, down one byte */
    3036           6 :                 nextp = endp;   /* endp holds the surviving quote; search resumes after it */
    3037             :             }
    3038             :             /* endp now points at the terminating quote */
    3039          24 :             nextp = endp + 1;
    3040             :         }
    3041             :         else
    3042             :         {
    3043             :             /* Unquoted name --- extends to separator or whitespace */
    3044          78 :             curname = nextp;
    3045         738 :             while (*nextp && *nextp != separator &&
    3046         660 :                    !scanner_isspace(*nextp))
    3047         660 :                 nextp++;
    3048          78 :             endp = nextp;   /* first byte past the name; overwritten with a null below */
    3049          78 :             if (curname == nextp)
    3050           0 :                 return false;   /* empty unquoted name not allowed */
    3051             :         }
    3052             : 
    3053         102 :         while (scanner_isspace(*nextp))
    3054           0 :             nextp++;            /* skip trailing whitespace */
    3055             : 
    3056         102 :         if (*nextp == separator)
    3057             :         {
    3058          26 :             nextp++;
    3059          44 :             while (scanner_isspace(*nextp))
    3060          18 :                 nextp++;        /* skip leading whitespace for next */
    3061             :             /* we expect another name, so done remains false */
    3062             :         }
    3063          76 :         else if (*nextp == '\0')
    3064          76 :             done = true;
    3065             :         else
    3066           0 :             return false;       /* invalid syntax */
    3067             : 
    3068             :         /* Now safe to overwrite the byte ending the name (closing quote, separator or whitespace) with a null */
    3069         102 :         *endp = '\0';   /* deferred until here: the *nextp tests above may read this same byte */
    3070             : 
    3071             :         /*
    3072             :          * Finished isolating current name --- add it to list
    3073             :          */
    3074         102 :         *namelist = lappend(*namelist, curname);    /* curname points into rawstring; no copy is made */
    3075             : 
    3076             :         /* Loop back if we didn't reach end of string */
    3077         102 :     } while (!done);
    3078             : 
    3079          76 :     return true;
    3080             : }
    3081             : 
    3082             : /*
    3083             :  * appendStringInfoText
    3084             :  *
    3085             :  * Append the data bytes of text value t to str.
    3086             :  * Like appendStringInfoString(str, text_to_cstring(t)) but faster: the bytes are appended directly by pointer and length.
    3087             :  */
    3088             : static void
    3089     2173152 : appendStringInfoText(StringInfo str, const text *t)
    3090             : {
    3091     2173152 :     appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
    3092     2173152 : }
    3093             : 
    3094             : /*
    3095             :  * replace_text
    3096             :  * replace all occurrences of from_sub_text (argument 1) in src_text
    3097             :  * (argument 0) with to_sub_text (argument 2), building a new text value
    3098             :  *
    3099             :  * returns src_text itself if src_text or from_sub_text is empty, or if
    3100             :  * from_sub_text does not occur in src_text; otherwise returns the new text
    3101             :  */
    3102             : Datum
    3103        1564 : replace_text(PG_FUNCTION_ARGS)
    3104             : {
    3105        1564 :     text       *src_text = PG_GETARG_TEXT_PP(0);
    3106        1564 :     text       *from_sub_text = PG_GETARG_TEXT_PP(1);
    3107        1564 :     text       *to_sub_text = PG_GETARG_TEXT_PP(2);
    3108             :     int         src_text_len;
    3109             :     int         from_sub_text_len;
    3110             :     TextPositionState state;
    3111             :     text       *ret_text;
    3112             :     int         chunk_len;
    3113             :     char       *curr_ptr;
    3114             :     char       *start_ptr;
    3115             :     StringInfoData str;
    3116             :     bool        found;
    3117             : 
    3118        1564 :     src_text_len = VARSIZE_ANY_EXHDR(src_text);
    3119        1564 :     from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
    3120             : 
    3121             :     /* Return unmodified source string if empty source or pattern */
    3122        1564 :     if (src_text_len < 1 || from_sub_text_len < 1)
    3123             :     {
    3124           0 :         PG_RETURN_TEXT_P(src_text);
    3125             :     }
    3126             : 
    3127        1564 :     text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
    3128             : 
    3129        1564 :     found = text_position_next(&state);
    3130             : 
    3131             :     /* When the from_sub_text is not found, there is nothing to do. */
    3132        1564 :     if (!found)
    3133             :     {
    3134         328 :         text_position_cleanup(&state);
    3135         328 :         PG_RETURN_TEXT_P(src_text);
    3136             :     }
    3137        1236 :     curr_ptr = text_position_get_match_ptr(&state);    /* address of the current match within src_text */
    3138        1236 :     start_ptr = VARDATA_ANY(src_text);  /* start of the source data not yet copied to str */
    3139             : 
    3140        1236 :     initStringInfo(&str);
    3141             : 
    3142             :     do
    3143             :     {
    3144        7154 :         CHECK_FOR_INTERRUPTS();
    3145             : 
    3146             :         /* copy the data skipped over by last text_position_next() */
    3147        7154 :         chunk_len = curr_ptr - start_ptr;
    3148        7154 :         appendBinaryStringInfo(&str, start_ptr, chunk_len);
    3149             : 
    3150        7154 :         appendStringInfoText(&str, to_sub_text);    /* emit the replacement in place of the match */
    3151             : 
    3152        7154 :         start_ptr = curr_ptr + state.last_match_len;    /* step over the matched bytes */
    3153             : 
    3154        7154 :         found = text_position_next(&state);
    3155        7154 :         if (found)
    3156        5918 :             curr_ptr = text_position_get_match_ptr(&state);
    3157             :     }
    3158        7154 :     while (found);
    3159             : 
    3160             :     /* copy trailing data */
    3161        1236 :     chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;   /* bytes from start_ptr to the end of src_text */
    3162        1236 :     appendBinaryStringInfo(&str, start_ptr, chunk_len);
    3163             : 
    3164        1236 :     text_position_cleanup(&state);
    3165             : 
    3166        1236 :     ret_text = cstring_to_text_with_len(str.data, str.len);
    3167        1236 :     pfree(str.data);    /* the working buffer is released once ret_text has been built from it */
    3168             : 
    3169        1236 :     PG_RETURN_TEXT_P(ret_text);
    3170             : }
    3171             : 
    3172             : /*
    3173             :  * check_replace_text_has_escape
    3174             :  *
    3175             :  * Returns 0 if text contains no backslashes that need processing (none at all, or only one as the very last byte).
    3176             :  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
    3177             :  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
    3178             :  */
    3179             : static int
    3180       18814 : check_replace_text_has_escape(const text *replace_text)
    3181             : {
    3182       18814 :     int         result = 0;
    3183       18814 :     const char *p = VARDATA_ANY(replace_text);
    3184       18814 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    3185             : 
    3186       37672 :     while (p < p_end)
    3187             :     {
    3188             :         /* Find next escape char, if any. */
    3189       17636 :         p = memchr(p, '\\', p_end - p);
    3190       17636 :         if (p == NULL)
    3191       16812 :             break;
    3192         824 :         p++;    /* step past the backslash to the character it escapes */
    3193             :         /* Note: a backslash at the end doesn't require extra processing. */
    3194         824 :         if (p < p_end)
    3195             :         {
    3196         824 :             if (*p >= '1' && *p <= '9')
    3197         780 :                 return 2;       /* Found a submatch specifier, so done */
    3198          44 :             result = 1;         /* Found some other sequence, keep looking */
    3199          44 :             p++;    /* skip the escaped char, so the 2nd \ of a \\ pair is not taken as a new escape */
    3200             :         }
    3201             :     }
    3202       18034 :     return result;
    3203             : }
    3204             : 
    3205             : /*
    3206             :  * appendStringInfoRegexpSubstr
    3207             :  *
    3208             :  * Append replace_text to str, substituting regexp back references for
    3209             :  * \n escapes: \1..\9 use pmatch[1..9], \& uses pmatch[0], \\ gives one backslash.  start_ptr is the start of the match in the source string,
    3210             :  * at logical character position data_pos; pmatch offsets are in characters.
    3211             :  */
    3212             : static void
    3213         236 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
    3214             :                              regmatch_t *pmatch,
    3215             :                              char *start_ptr, int data_pos)
    3216             : {
    3217         236 :     const char *p = VARDATA_ANY(replace_text);
    3218         236 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    3219             : 
    3220         574 :     while (p < p_end)
    3221             :     {
    3222         518 :         const char *chunk_start = p;
    3223             :         int         so;
    3224             :         int         eo;
    3225             : 
    3226             :         /* Find next escape char, if any. */
    3227         518 :         p = memchr(p, '\\', p_end - p);
    3228         518 :         if (p == NULL)
    3229         174 :             p = p_end;
    3230             : 
    3231             :         /* Copy the text we just scanned over, if any. */
    3232         518 :         if (p > chunk_start)
    3233         318 :             appendBinaryStringInfo(str, chunk_start, p - chunk_start);
    3234             : 
    3235             :         /* Done if at end of string, else advance over escape char. */
    3236         518 :         if (p >= p_end)
    3237         174 :             break;
    3238         344 :         p++;
    3239             : 
    3240         344 :         if (p >= p_end)
    3241             :         {
    3242             :             /* Escape at very end of input.  Treat same as unexpected char */
    3243           6 :             appendStringInfoChar(str, '\\');
    3244           6 :             break;
    3245             :         }
    3246             : 
    3247         338 :         if (*p >= '1' && *p <= '9')
    3248         278 :         {
    3249             :             /* Use the back reference of regexp. */
    3250         278 :             int         idx = *p - '0';    /* digit 1..9 selects pmatch[1..9] */
    3251             : 
    3252         278 :             so = pmatch[idx].rm_so;
    3253         278 :             eo = pmatch[idx].rm_eo;
    3254         278 :             p++;
    3255             :         }
    3256          60 :         else if (*p == '&')
    3257             :         {
    3258             :             /* Use the entire matched string. */
    3259          18 :             so = pmatch[0].rm_so;
    3260          18 :             eo = pmatch[0].rm_eo;
    3261          18 :             p++;
    3262             :         }
    3263          42 :         else if (*p == '\\')
    3264             :         {
    3265             :             /* \\ means transfer one \ to output. */
    3266          36 :             appendStringInfoChar(str, '\\');
    3267          36 :             p++;
    3268          36 :             continue;
    3269             :         }
    3270             :         else
    3271             :         {
    3272             :             /*
    3273             :              * If escape char is not followed by any expected char, just treat
    3274             :              * it as ordinary data to copy.  (XXX would it be better to throw
    3275             :              * an error?)
    3276             :              */
    3277           6 :             appendStringInfoChar(str, '\\');
    3278           6 :             continue;   /* p is not advanced: the following char is copied as plain text on the next pass */
    3279             :         }
    3280             : 
    3281         296 :         if (so >= 0 && eo >= 0) /* otherwise nothing is appended; presumably negative offsets mean the group did not participate -- TODO confirm */
    3282             :         {
    3283             :             /*
    3284             :              * Copy the text that is back reference of regexp.  Note so and eo
    3285             :              * are counted in characters not bytes.
    3286             :              */
    3287             :             char       *chunk_start;   /* note: shadows the outer chunk_start of this loop body */
    3288             :             int         chunk_len;
    3289             : 
    3290             :             Assert(so >= data_pos);
    3291         296 :             chunk_start = start_ptr;
    3292         296 :             chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);  /* advance (so - data_pos) characters to the reference's first byte */
    3293         296 :             chunk_len = charlen_to_bytelen(chunk_start, eo - so);   /* byte length of the (eo - so) referenced characters */
    3294         296 :             appendBinaryStringInfo(str, chunk_start, chunk_len);
    3295             :         }
    3296             :     }
    3297         236 : }
    3298             : 
    3299             : /*
    3300             :  * replace_text_regexp
    3301             :  *
    3302             :  * build and return a new text in which substring(s) of src_text that match
    3303             :  * pattern_text are replaced with replace_text.  The replace_text can contain
    3304             :  * backslash markers (\1 .. \9, \&, \\) to substitute (parts of) the matched text.
    3305             :  *
    3306             :  * cflags: regexp compile flags.
    3307             :  * collation: collation to use.
    3308             :  * search_start: the character (not byte) offset in src_text at which to
    3309             :  * begin searching.
    3310             :  * n: if 0, replace all matches; if > 0, replace only the N'th match.
    3311             :  */
    3312             : text *
    3313       18814 : replace_text_regexp(text *src_text, text *pattern_text,
    3314             :                     text *replace_text,
    3315             :                     int cflags, Oid collation,
    3316             :                     int search_start, int n)
    3317             : {
    3318             :     text       *ret_text;
    3319             :     regex_t    *re;
    3320       18814 :     int         src_text_len = VARSIZE_ANY_EXHDR(src_text);
    3321       18814 :     int         nmatches = 0;
    3322             :     StringInfoData buf;
    3323             :     regmatch_t  pmatch[10];     /* main match, plus \1 to \9 */
    3324       18814 :     int         nmatch = lengthof(pmatch);
    3325             :     pg_wchar   *data;
    3326             :     size_t      data_len;
    3327             :     int         data_pos;
    3328             :     char       *start_ptr;
    3329             :     int         escape_status;
    3330             : 
    3331       18814 :     initStringInfo(&buf);
    3332             : 
    3333             :     /* Convert data string to wide characters (room for one per source byte; the +1 is presumably for a terminator -- TODO confirm). */
    3334       18814 :     data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
    3335       18814 :     data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
    3336             : 
    3337             :     /* Check whether replace_text has escapes, especially regexp submatches. */
    3338       18814 :     escape_status = check_replace_text_has_escape(replace_text);
    3339             : 
    3340             :     /* If no regexp submatches, we can use REG_NOSUB. */
    3341       18814 :     if (escape_status < 2)
    3342             :     {
    3343       18034 :         cflags |= REG_NOSUB;
    3344             :         /* Also tell pg_regexec we only want the whole-match location. */
    3345       18034 :         nmatch = 1;
    3346             :     }
    3347             : 
    3348             :     /* Prepare the regexp. */
    3349       18814 :     re = RE_compile_and_cache(pattern_text, cflags, collation);
    3350             : 
    3351             :     /* start_ptr points to the data_pos'th character of src_text */
    3352       18814 :     start_ptr = (char *) VARDATA_ANY(src_text);
    3353       18814 :     data_pos = 0;
    3354             : 
    3355       25278 :     while (search_start <= data_len)    /* NOTE(review): int vs size_t comparison; relies on search_start being non-negative */
    3356             :     {
    3357             :         int         regexec_result;
    3358             : 
    3359       25272 :         CHECK_FOR_INTERRUPTS();
    3360             : 
    3361       25272 :         regexec_result = pg_regexec(re,
    3362             :                                     data,
    3363             :                                     data_len,
    3364             :                                     search_start,
    3365             :                                     NULL,   /* no details */
    3366             :                                     nmatch,
    3367             :                                     pmatch,
    3368             :                                     0);
    3369             : 
    3370       25272 :         if (regexec_result == REG_NOMATCH)
    3371       16742 :             break;
    3372             : 
    3373        8530 :         if (regexec_result != REG_OKAY)
    3374             :         {
    3375             :             char        errMsg[100];
    3376             : 
    3377           0 :             pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
    3378           0 :             ereport(ERROR,
    3379             :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    3380             :                      errmsg("regular expression failed: %s", errMsg)));
    3381             :         }
    3382             : 
    3383             :         /*
    3384             :          * Count matches, and decide whether to replace this match.
    3385             :          */
    3386        8530 :         nmatches++;
    3387        8530 :         if (n > 0 && nmatches != n)
    3388             :         {
    3389             :             /*
    3390             :              * No, so advance search_start, but not start_ptr/data_pos. (Thus,
    3391             :              * we treat the matched text as if it weren't matched, and copy it
    3392             :              * to the output later.)
    3393             :              */
    3394          60 :             search_start = pmatch[0].rm_eo;
    3395          60 :             if (pmatch[0].rm_so == pmatch[0].rm_eo)
    3396           0 :                 search_start++; /* zero-length match: step one character so it is not found again */
    3397          60 :             continue;
    3398             :         }
    3399             : 
    3400             :         /*
    3401             :          * Copy the text to the left of the match position.  Note we are given
    3402             :          * character not byte indexes.
    3403             :          */
    3404        8470 :         if (pmatch[0].rm_so - data_pos > 0)
    3405             :         {
    3406             :             int         chunk_len;
    3407             : 
    3408        8296 :             chunk_len = charlen_to_bytelen(start_ptr,
    3409        8296 :                                            pmatch[0].rm_so - data_pos);
    3410        8296 :             appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    3411             : 
    3412             :             /*
    3413             :              * Advance start_ptr over that text, to avoid multiple rescans of
    3414             :              * it if the replace_text contains multiple back-references.
    3415             :              */
    3416        8296 :             start_ptr += chunk_len;
    3417        8296 :             data_pos = pmatch[0].rm_so; /* keep the character index in step with start_ptr */
    3418             :         }
    3419             : 
    3420             :         /*
    3421             :          * Copy the replace_text, processing escapes if any are present.
    3422             :          */
    3423        8470 :         if (escape_status > 0)
    3424         236 :             appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
    3425             :                                          start_ptr, data_pos);
    3426             :         else
    3427        8234 :             appendStringInfoText(&buf, replace_text);
    3428             : 
    3429             :         /* Advance start_ptr and data_pos over the matched text. */
    3430       16940 :         start_ptr += charlen_to_bytelen(start_ptr,
    3431        8470 :                                         pmatch[0].rm_eo - data_pos);
    3432        8470 :         data_pos = pmatch[0].rm_eo;
    3433             : 
    3434             :         /*
    3435             :          * If we only want to replace one occurrence, we're done.
    3436             :          */
    3437        8470 :         if (n > 0)
    3438        2066 :             break;
    3439             : 
    3440             :         /*
    3441             :          * Advance search position.  Normally we start the next search at the
    3442             :          * end of the previous match; but if the match was of zero length, we
    3443             :          * have to advance by one character, or we'd just find the same match
    3444             :          * again.
    3445             :          */
    3446        6404 :         search_start = data_pos;
    3447        6404 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    3448          12 :             search_start++;
    3449             :     }
    3450             : 
    3451             :     /*
    3452             :      * Copy the text to the right of the last match.
    3453             :      */
    3454       18814 :     if (data_pos < data_len)
    3455             :     {
    3456             :         int         chunk_len;
    3457             : 
    3458       17938 :         chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;   /* bytes from start_ptr to the end of src_text */
    3459       17938 :         appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    3460             :     }
    3461             : 
    3462       18814 :     ret_text = cstring_to_text_with_len(buf.data, buf.len);
    3463       18814 :     pfree(buf.data);    /* working buffers are released once the result has been built */
    3464       18814 :     pfree(data);
    3465             : 
    3466       18814 :     return ret_text;
    3467             : }
    3468             : 
    3469             : /*
    3470             :  * split_part
    3471             :  * parse input string based on provided field separator
    3472             :  * return N'th item (1 based, negative counts from end)
    3473             :  */
    3474             : Datum
    3475         150 : split_part(PG_FUNCTION_ARGS)
    3476             : {
    3477         150 :     text       *inputstring = PG_GETARG_TEXT_PP(0);
    3478         150 :     text       *fldsep = PG_GETARG_TEXT_PP(1);
    3479         150 :     int         fldnum = PG_GETARG_INT32(2);
    3480             :     int         inputstring_len;
    3481             :     int         fldsep_len;
    3482             :     TextPositionState state;
    3483             :     char       *start_ptr;
    3484             :     char       *end_ptr;
    3485             :     text       *result_text;
    3486             :     bool        found;
    3487             : 
    3488             :     /* field number is 1 based */
    3489         150 :     if (fldnum == 0)
    3490           6 :         ereport(ERROR,
    3491             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    3492             :                  errmsg("field position must not be zero")));
    3493             : 
    3494         144 :     inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    3495         144 :     fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    3496             : 
    3497             :     /* return empty string for empty input string */
    3498         144 :     if (inputstring_len < 1)
    3499          12 :         PG_RETURN_TEXT_P(cstring_to_text(""));
    3500             : 
    3501             :     /* handle empty field separator */
    3502         132 :     if (fldsep_len < 1)
    3503             :     {
    3504             :         /* if first or last field, return input string, else empty string */
    3505          24 :         if (fldnum == 1 || fldnum == -1)
    3506          12 :             PG_RETURN_TEXT_P(inputstring);
    3507             :         else
    3508          12 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    3509             :     }
    3510             : 
    3511             :     /* find the first field separator */
    3512         108 :     text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
    3513             : 
    3514         108 :     found = text_position_next(&state);
    3515             : 
    3516             :     /* special case if fldsep not found at all */
    3517         108 :     if (!found)
    3518             :     {
    3519          24 :         text_position_cleanup(&state);
    3520             :         /* if first or last field, return input string, else empty string */
    3521          24 :         if (fldnum == 1 || fldnum == -1)
    3522          12 :             PG_RETURN_TEXT_P(inputstring);
    3523             :         else
    3524          12 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    3525             :     }
    3526             : 
    3527             :     /*
    3528             :      * take care of a negative field number (i.e. count from the right) by
    3529             :      * converting to a positive field number; we need total number of fields
    3530             :      */
    3531          84 :     if (fldnum < 0)
    3532             :     {
    3533             :         /* we found a fldsep, so there are at least two fields */
    3534          42 :         int         numfields = 2;
    3535             : 
    3536          54 :         while (text_position_next(&state))
    3537          12 :             numfields++;
    3538             : 
    3539             :         /* special case of last field does not require an extra pass */
    3540          42 :         if (fldnum == -1)
    3541             :         {
    3542          24 :             start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
    3543          24 :             end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
    3544          24 :             text_position_cleanup(&state);
    3545          24 :             PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
    3546             :                                                       end_ptr - start_ptr));
    3547             :         }
    3548             : 
    3549             :         /* else, convert fldnum to positive notation */
    3550          18 :         fldnum += numfields + 1;
    3551             : 
    3552             :         /* if nonexistent field, return empty string */
    3553          18 :         if (fldnum <= 0)
    3554             :         {
    3555           6 :             text_position_cleanup(&state);
    3556           6 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    3557             :         }
    3558             : 
    3559             :         /* reset to pointing at first match, but now with positive fldnum */
    3560          12 :         text_position_reset(&state);
    3561          12 :         found = text_position_next(&state);
    3562             :         Assert(found);
    3563             :     }
    3564             : 
    3565             :     /* identify bounds of first field */
    3566          54 :     start_ptr = VARDATA_ANY(inputstring);
    3567          54 :     end_ptr = text_position_get_match_ptr(&state);
    3568             : 
    3569         102 :     while (found && --fldnum > 0)
    3570             :     {
    3571             :         /* identify bounds of next field */
    3572          48 :         start_ptr = end_ptr + state.last_match_len;
    3573          48 :         found = text_position_next(&state);
    3574          48 :         if (found)
    3575          18 :             end_ptr = text_position_get_match_ptr(&state);
    3576             :     }
    3577             : 
    3578          54 :     text_position_cleanup(&state);
    3579             : 
    3580          54 :     if (fldnum > 0)
    3581             :     {
    3582             :         /* N'th field separator not found */
    3583             :         /* if last field requested, return it, else empty string */
    3584          30 :         if (fldnum == 1)
    3585             :         {
    3586          24 :             int         last_len = start_ptr - VARDATA_ANY(inputstring);
    3587             : 
    3588          24 :             result_text = cstring_to_text_with_len(start_ptr,
    3589             :                                                    inputstring_len - last_len);
    3590             :         }
    3591             :         else
    3592           6 :             result_text = cstring_to_text("");
    3593             :     }
    3594             :     else
    3595             :     {
    3596             :         /* non-last field requested */
    3597          24 :         result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
    3598             :     }
    3599             : 
    3600          54 :     PG_RETURN_TEXT_P(result_text);
    3601             : }
    3602             : 
    3603             : /*
    3604             :  * Convenience function to return true when two text params are equal.
    3605             :  */
    3606             : static bool
    3607         384 : text_isequal(text *txt1, text *txt2, Oid collid)
    3608             : {
    3609         384 :     return DatumGetBool(DirectFunctionCall2Coll(texteq,
    3610             :                                                 collid,
    3611             :                                                 PointerGetDatum(txt1),
    3612             :                                                 PointerGetDatum(txt2)));
    3613             : }
    3614             : 
    3615             : /*
    3616             :  * text_to_array
    3617             :  * parse input string and return text array of elements,
    3618             :  * based on provided field separator
    3619             :  */
    3620             : Datum
    3621         170 : text_to_array(PG_FUNCTION_ARGS)
    3622             : {
    3623             :     SplitTextOutputData tstate;
    3624             : 
    3625             :     /* For array output, tstate should start as all zeroes */
    3626         170 :     memset(&tstate, 0, sizeof(tstate));
    3627             : 
    3628         170 :     if (!split_text(fcinfo, &tstate))
    3629           6 :         PG_RETURN_NULL();
    3630             : 
    3631         164 :     if (tstate.astate == NULL)
    3632           6 :         PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
    3633             : 
    3634         158 :     PG_RETURN_DATUM(makeArrayResult(tstate.astate,
    3635             :                                     CurrentMemoryContext));
    3636             : }
    3637             : 
    3638             : /*
    3639             :  * text_to_array_null
    3640             :  * parse input string and return text array of elements,
    3641             :  * based on provided field separator and null string
    3642             :  *
    3643             :  * This is a separate entry point only to prevent the regression tests from
    3644             :  * complaining about different argument sets for the same internal function.
    3645             :  */
    3646             : Datum
    3647          60 : text_to_array_null(PG_FUNCTION_ARGS)
    3648             : {
    3649          60 :     return text_to_array(fcinfo);
    3650             : }
    3651             : 
    3652             : /*
    3653             :  * text_to_table
    3654             :  * parse input string and return table of elements,
    3655             :  * based on provided field separator
    3656             :  */
    3657             : Datum
    3658          84 : text_to_table(PG_FUNCTION_ARGS)
    3659             : {
    3660          84 :     ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
    3661             :     SplitTextOutputData tstate;
    3662             : 
    3663          84 :     tstate.astate = NULL;
    3664          84 :     InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
    3665          84 :     tstate.tupstore = rsi->setResult;
    3666          84 :     tstate.tupdesc = rsi->setDesc;
    3667             : 
    3668          84 :     (void) split_text(fcinfo, &tstate);
    3669             : 
    3670          84 :     return (Datum) 0;
    3671             : }
    3672             : 
    3673             : /*
    3674             :  * text_to_table_null
    3675             :  * parse input string and return table of elements,
    3676             :  * based on provided field separator and null string
    3677             :  *
    3678             :  * This is a separate entry point only to prevent the regression tests from
    3679             :  * complaining about different argument sets for the same internal function.
    3680             :  */
    3681             : Datum
    3682          24 : text_to_table_null(PG_FUNCTION_ARGS)
    3683             : {
    3684          24 :     return text_to_table(fcinfo);
    3685             : }
    3686             : 
    3687             : /*
    3688             :  * Common code for text_to_array, text_to_array_null, text_to_table
    3689             :  * and text_to_table_null functions.
    3690             :  *
    3691             :  * These are not strict so we have to test for null inputs explicitly.
    3692             :  * Returns false if result is to be null, else returns true.
    3693             :  *
    3694             :  * Note that if the result is valid but empty (zero elements), we return
    3695             :  * without changing *tstate --- caller must handle that case, too.
    3696             :  */
    3697             : static bool
    3698         254 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
    3699             : {
    3700             :     text       *inputstring;
    3701             :     text       *fldsep;
    3702             :     text       *null_string;
    3703         254 :     Oid         collation = PG_GET_COLLATION();
    3704             :     int         inputstring_len;
    3705             :     int         fldsep_len;
    3706             :     char       *start_ptr;
    3707             :     text       *result_text;
    3708             : 
    3709             :     /* when input string is NULL, then result is NULL too */
    3710         254 :     if (PG_ARGISNULL(0))
    3711          12 :         return false;
    3712             : 
    3713         242 :     inputstring = PG_GETARG_TEXT_PP(0);
    3714             : 
    3715             :     /* fldsep can be NULL */
    3716         242 :     if (!PG_ARGISNULL(1))
    3717         212 :         fldsep = PG_GETARG_TEXT_PP(1);
    3718             :     else
    3719          30 :         fldsep = NULL;
    3720             : 
    3721             :     /* null_string can be NULL or omitted */
    3722         242 :     if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
    3723          84 :         null_string = PG_GETARG_TEXT_PP(2);
    3724             :     else
    3725         158 :         null_string = NULL;
    3726             : 
    3727         242 :     if (fldsep != NULL)
    3728             :     {
    3729             :         /*
    3730             :          * Normal case with non-null fldsep.  Use the text_position machinery
    3731             :          * to search for occurrences of fldsep.
    3732             :          */
    3733             :         TextPositionState state;
    3734             : 
    3735         212 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    3736         212 :         fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    3737             : 
    3738             :         /* return empty set for empty input string */
    3739         212 :         if (inputstring_len < 1)
    3740          60 :             return true;
    3741             : 
    3742             :         /* empty field separator: return input string as a one-element set */
    3743         200 :         if (fldsep_len < 1)
    3744             :         {
    3745          48 :             split_text_accum_result(tstate, inputstring,
    3746             :                                     null_string, collation);
    3747          48 :             return true;
    3748             :         }
    3749             : 
    3750         152 :         text_position_setup(inputstring, fldsep, collation, &state);
    3751             : 
    3752         152 :         start_ptr = VARDATA_ANY(inputstring);
    3753             : 
    3754             :         for (;;)
    3755         512 :         {
    3756             :             bool        found;
    3757             :             char       *end_ptr;
    3758             :             int         chunk_len;
    3759             : 
    3760         664 :             CHECK_FOR_INTERRUPTS();
    3761             : 
    3762         664 :             found = text_position_next(&state);
    3763         664 :             if (!found)
    3764             :             {
    3765             :                 /* fetch last field */
    3766         152 :                 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
    3767         152 :                 end_ptr = NULL; /* not used, but some compilers complain */
    3768             :             }
    3769             :             else
    3770             :             {
    3771             :                 /* fetch non-last field */
    3772         512 :                 end_ptr = text_position_get_match_ptr(&state);
    3773         512 :                 chunk_len = end_ptr - start_ptr;
    3774             :             }
    3775             : 
    3776             :             /* build a temp text datum to pass to split_text_accum_result */
    3777         664 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    3778             : 
    3779             :             /* stash away this field */
    3780         664 :             split_text_accum_result(tstate, result_text,
    3781             :                                     null_string, collation);
    3782             : 
    3783         664 :             pfree(result_text);
    3784             : 
    3785         664 :             if (!found)
    3786         152 :                 break;
    3787             : 
    3788         512 :             start_ptr = end_ptr + state.last_match_len;
    3789             :         }
    3790             : 
    3791         152 :         text_position_cleanup(&state);
    3792             :     }
    3793             :     else
    3794             :     {
    3795             :         /*
    3796             :          * When fldsep is NULL, each character in the input string becomes a
    3797             :          * separate element in the result set.  The separator is effectively
    3798             :          * the space between characters.
    3799             :          */
    3800          30 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    3801             : 
    3802          30 :         start_ptr = VARDATA_ANY(inputstring);
    3803             : 
    3804         252 :         while (inputstring_len > 0)
    3805             :         {
    3806         222 :             int         chunk_len = pg_mblen(start_ptr);
    3807             : 
    3808         222 :             CHECK_FOR_INTERRUPTS();
    3809             : 
    3810             :             /* build a temp text datum to pass to split_text_accum_result */
    3811         222 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    3812             : 
    3813             :             /* stash away this field */
    3814         222 :             split_text_accum_result(tstate, result_text,
    3815             :                                     null_string, collation);
    3816             : 
    3817         222 :             pfree(result_text);
    3818             : 
    3819         222 :             start_ptr += chunk_len;
    3820         222 :             inputstring_len -= chunk_len;
    3821             :         }
    3822             :     }
    3823             : 
    3824         182 :     return true;
    3825             : }
    3826             : 
    3827             : /*
    3828             :  * Add text item to result set (table or array).
    3829             :  *
    3830             :  * This is also responsible for checking to see if the item matches
    3831             :  * the null_string, in which case we should emit NULL instead.
    3832             :  */
    3833             : static void
    3834         934 : split_text_accum_result(SplitTextOutputData *tstate,
    3835             :                         text *field_value,
    3836             :                         text *null_string,
    3837             :                         Oid collation)
    3838             : {
    3839         934 :     bool        is_null = false;
    3840             : 
    3841         934 :     if (null_string && text_isequal(field_value, null_string, collation))
    3842          72 :         is_null = true;
    3843             : 
    3844         934 :     if (tstate->tupstore)
    3845             :     {
    3846             :         Datum       values[1];
    3847             :         bool        nulls[1];
    3848             : 
    3849         228 :         values[0] = PointerGetDatum(field_value);
    3850         228 :         nulls[0] = is_null;
    3851             : 
    3852         228 :         tuplestore_putvalues(tstate->tupstore,
    3853             :                              tstate->tupdesc,
    3854             :                              values,
    3855             :                              nulls);
    3856             :     }
    3857             :     else
    3858             :     {
    3859         706 :         tstate->astate = accumArrayResult(tstate->astate,
    3860             :                                           PointerGetDatum(field_value),
    3861             :                                           is_null,
    3862             :                                           TEXTOID,
    3863             :                                           CurrentMemoryContext);
    3864             :     }
    3865         934 : }
    3866             : 
    3867             : /*
    3868             :  * array_to_text
    3869             :  * concatenate Cstring representation of input array elements
    3870             :  * using provided field separator
    3871             :  */
    3872             : Datum
    3873       76670 : array_to_text(PG_FUNCTION_ARGS)
    3874             : {
    3875       76670 :     ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
    3876       76670 :     char       *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    3877             : 
    3878       76670 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
    3879             : }
    3880             : 
    3881             : /*
    3882             :  * array_to_text_null
    3883             :  * concatenate Cstring representation of input array elements
    3884             :  * using provided field separator and null string
    3885             :  *
    3886             :  * This version is not strict so we have to test for null inputs explicitly.
    3887             :  */
    3888             : Datum
    3889          12 : array_to_text_null(PG_FUNCTION_ARGS)
    3890             : {
    3891             :     ArrayType  *v;
    3892             :     char       *fldsep;
    3893             :     char       *null_string;
    3894             : 
    3895             :     /* returns NULL when first or second parameter is NULL */
    3896          12 :     if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
    3897           0 :         PG_RETURN_NULL();
    3898             : 
    3899          12 :     v = PG_GETARG_ARRAYTYPE_P(0);
    3900          12 :     fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    3901             : 
    3902             :     /* NULL null string is passed through as a null pointer */
    3903          12 :     if (!PG_ARGISNULL(2))
    3904           6 :         null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
    3905             :     else
    3906           6 :         null_string = NULL;
    3907             : 
    3908          12 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
    3909             : }
    3910             : 
    3911             : /*
    3912             :  * common code for array_to_text and array_to_text_null functions
    3913             :  */
    3914             : static text *
    3915       76700 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
    3916             :                        const char *fldsep, const char *null_string)
    3917             : {
    3918             :     text       *result;
    3919             :     int         nitems,
    3920             :                *dims,
    3921             :                 ndims;
    3922             :     Oid         element_type;
    3923             :     int         typlen;
    3924             :     bool        typbyval;
    3925             :     char        typalign;
    3926             :     StringInfoData buf;
    3927       76700 :     bool        printed = false;
    3928             :     char       *p;
    3929             :     bits8      *bitmap;
    3930             :     int         bitmask;
    3931             :     int         i;
    3932             :     ArrayMetaState *my_extra;
    3933             : 
    3934       76700 :     ndims = ARR_NDIM(v);
    3935       76700 :     dims = ARR_DIMS(v);
    3936       76700 :     nitems = ArrayGetNItems(ndims, dims);
    3937             : 
    3938             :     /* if there are no elements, return an empty string */
    3939       76700 :     if (nitems == 0)
    3940       51640 :         return cstring_to_text_with_len("", 0);
    3941             : 
    3942       25060 :     element_type = ARR_ELEMTYPE(v);
    3943       25060 :     initStringInfo(&buf);
    3944             : 
    3945             :     /*
    3946             :      * We arrange to look up info about element type, including its output
    3947             :      * conversion proc, only once per series of calls, assuming the element
    3948             :      * type doesn't change underneath us.
    3949             :      */
    3950       25060 :     my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    3951       25060 :     if (my_extra == NULL)
    3952             :     {
    3953        1420 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    3954             :                                                       sizeof(ArrayMetaState));
    3955        1420 :         my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    3956        1420 :         my_extra->element_type = ~element_type;
    3957             :     }
    3958             : 
    3959       25060 :     if (my_extra->element_type != element_type)
    3960             :     {
    3961             :         /*
    3962             :          * Get info about element type, including its output conversion proc
    3963             :          */
    3964        1420 :         get_type_io_data(element_type, IOFunc_output,
    3965             :                          &my_extra->typlen, &my_extra->typbyval,
    3966             :                          &my_extra->typalign, &my_extra->typdelim,
    3967             :                          &my_extra->typioparam, &my_extra->typiofunc);
    3968        1420 :         fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
    3969        1420 :                       fcinfo->flinfo->fn_mcxt);
    3970        1420 :         my_extra->element_type = element_type;
    3971             :     }
    3972       25060 :     typlen = my_extra->typlen;
    3973       25060 :     typbyval = my_extra->typbyval;
    3974       25060 :     typalign = my_extra->typalign;
    3975             : 
    3976       25060 :     p = ARR_DATA_PTR(v);
    3977       25060 :     bitmap = ARR_NULLBITMAP(v);
    3978       25060 :     bitmask = 1;
    3979             : 
    3980       85262 :     for (i = 0; i < nitems; i++)
    3981             :     {
    3982             :         Datum       itemvalue;
    3983             :         char       *value;
    3984             : 
    3985             :         /* Get source element, checking for NULL */
    3986       60202 :         if (bitmap && (*bitmap & bitmask) == 0)
    3987             :         {
    3988             :             /* if null_string is NULL, we just ignore null elements */
    3989          18 :             if (null_string != NULL)
    3990             :             {
    3991           6 :                 if (printed)
    3992           6 :                     appendStringInfo(&buf, "%s%s", fldsep, null_string);
    3993             :                 else
    3994           0 :                     appendStringInfoString(&buf, null_string);
    3995           6 :                 printed = true;
    3996             :             }
    3997             :         }
    3998             :         else
    3999             :         {
    4000       60184 :             itemvalue = fetch_att(p, typbyval, typlen);
    4001             : 
    4002       60184 :             value = OutputFunctionCall(&my_extra->proc, itemvalue);
    4003             : 
    4004       60184 :             if (printed)
    4005       35124 :                 appendStringInfo(&buf, "%s%s", fldsep, value);
    4006             :             else
    4007       25060 :                 appendStringInfoString(&buf, value);
    4008       60184 :             printed = true;
    4009             : 
    4010       60184 :             p = att_addlength_pointer(p, typlen, p);
    4011       60184 :             p = (char *) att_align_nominal(p, typalign);
    4012             :         }
    4013             : 
    4014             :         /* advance bitmap pointer if any */
    4015       60202 :         if (bitmap)
    4016             :         {
    4017         108 :             bitmask <<= 1;
    4018         108 :             if (bitmask == 0x100)
    4019             :             {
    4020           0 :                 bitmap++;
    4021           0 :                 bitmask = 1;
    4022             :             }
    4023             :         }
    4024             :     }
    4025             : 
    4026       25060 :     result = cstring_to_text_with_len(buf.data, buf.len);
    4027       25060 :     pfree(buf.data);
    4028             : 
    4029       25060 :     return result;
    4030             : }
    4031             : 
    4032             : /*
    4033             :  * Workhorse for to_bin, to_oct, and to_hex.  Note that base must be > 1 and <=
    4034             :  * 16.
    4035             :  */
    4036             : static inline text *
    4037       38750 : convert_to_base(uint64 value, int base)
    4038             : {
    4039       38750 :     const char *digits = "0123456789abcdef";
    4040             : 
    4041             :     /* We size the buffer for to_bin's longest possible return value. */
    4042             :     char        buf[sizeof(uint64) * BITS_PER_BYTE];
    4043       38750 :     char       *const end = buf + sizeof(buf);
    4044       38750 :     char       *ptr = end;
    4045             : 
    4046             :     Assert(base > 1);
    4047             :     Assert(base <= 16);
    4048             : 
    4049             :     do
    4050             :     {
    4051       75970 :         *--ptr = digits[value % base];
    4052       75970 :         value /= base;
    4053       75970 :     } while (ptr > buf && value);
    4054             : 
    4055       38750 :     return cstring_to_text_with_len(ptr, end - ptr);
    4056             : }
    4057             : 
    4058             : /*
    4059             :  * Convert an integer to a string containing a base-2 (binary) representation
    4060             :  * of the number.
    4061             :  */
    4062             : Datum
    4063          12 : to_bin32(PG_FUNCTION_ARGS)
    4064             : {
    4065          12 :     uint64      value = (uint32) PG_GETARG_INT32(0);
    4066             : 
    4067          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 2));
    4068             : }
    4069             : Datum
    4070          12 : to_bin64(PG_FUNCTION_ARGS)
    4071             : {
    4072          12 :     uint64      value = (uint64) PG_GETARG_INT64(0);
    4073             : 
    4074          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 2));
    4075             : }
    4076             : 
    4077             : /*
    4078             :  * Convert an integer to a string containing a base-8 (oct) representation of
    4079             :  * the number.
    4080             :  */
    4081             : Datum
    4082          12 : to_oct32(PG_FUNCTION_ARGS)
    4083             : {
    4084          12 :     uint64      value = (uint32) PG_GETARG_INT32(0);
    4085             : 
    4086          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 8));
    4087             : }
    4088             : Datum
    4089          12 : to_oct64(PG_FUNCTION_ARGS)
    4090             : {
    4091          12 :     uint64      value = (uint64) PG_GETARG_INT64(0);
    4092             : 
    4093          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 8));
    4094             : }
    4095             : 
    4096             : /*
    4097             :  * Convert an integer to a string containing a base-16 (hex) representation of
    4098             :  * the number.
    4099             :  */
    4100             : Datum
    4101       38690 : to_hex32(PG_FUNCTION_ARGS)
    4102             : {
    4103       38690 :     uint64      value = (uint32) PG_GETARG_INT32(0);
    4104             : 
    4105       38690 :     PG_RETURN_TEXT_P(convert_to_base(value, 16));
    4106             : }
    4107             : Datum
    4108          12 : to_hex64(PG_FUNCTION_ARGS)
    4109             : {
    4110          12 :     uint64      value = (uint64) PG_GETARG_INT64(0);
    4111             : 
    4112          12 :     PG_RETURN_TEXT_P(convert_to_base(value, 16));
    4113             : }
    4114             : 
    4115             : /*
    4116             :  * Return the size of a datum, possibly compressed
    4117             :  *
    4118             :  * Works on any data type
    4119             :  */
    4120             : Datum
    4121         122 : pg_column_size(PG_FUNCTION_ARGS)
    4122             : {
    4123         122 :     Datum       value = PG_GETARG_DATUM(0);
    4124             :     int32       result;
    4125             :     int         typlen;
    4126             : 
    4127             :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4128         122 :     if (fcinfo->flinfo->fn_extra == NULL)
    4129             :     {
    4130             :         /* Lookup the datatype of the supplied argument */
    4131         122 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4132             : 
    4133         122 :         typlen = get_typlen(argtypeid);
    4134         122 :         if (typlen == 0)        /* should not happen */
    4135           0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4136             : 
    4137         122 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4138             :                                                       sizeof(int));
    4139         122 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4140             :     }
    4141             :     else
    4142           0 :         typlen = *((int *) fcinfo->flinfo->fn_extra);
    4143             : 
    4144         122 :     if (typlen == -1)
    4145             :     {
    4146             :         /* varlena type, possibly toasted */
    4147         122 :         result = toast_datum_size(value);
    4148             :     }
    4149           0 :     else if (typlen == -2)
    4150             :     {
    4151             :         /* cstring */
    4152           0 :         result = strlen(DatumGetCString(value)) + 1;
    4153             :     }
    4154             :     else
    4155             :     {
    4156             :         /* ordinary fixed-width type */
    4157           0 :         result = typlen;
    4158             :     }
    4159             : 
    4160         122 :     PG_RETURN_INT32(result);
    4161             : }
    4162             : 
    4163             : /*
    4164             :  * Return the name ("pglz" or "lz4") of the compression method stored in the
    4165             :  * compressed attribute.  Return NULL for non varlena type (typlen != -1) or uncompressed data.
    4166             :  */
    4167             : Datum
    4168         174 : pg_column_compression(PG_FUNCTION_ARGS)
    4169             : {
    4170             :     int         typlen;     /* typlen of argument 0's type, cached in fn_extra */
    4171             :     char       *result;     /* points at a string literal, never freed */
    4172             :     ToastCompressionId cmid;
    4173             : 
    4174             :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4175         174 :     if (fcinfo->flinfo->fn_extra == NULL)
    4176             :     {
    4177             :         /* Lookup the datatype of the supplied argument */
    4178         138 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4179             : 
    4180         138 :         typlen = get_typlen(argtypeid);
    4181         138 :         if (typlen == 0)        /* should not happen */
    4182           0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4183             : 
    4184         138 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4185             :                                                       sizeof(int));
    4186         138 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4187             :     }
    4188             :     else
    4189          36 :         typlen = *((int *) fcinfo->flinfo->fn_extra);    /* later calls: reuse the cached value */
    4190             : 
    4191         174 :     if (typlen != -1)       /* only varlena types (typlen -1) are inspected */
    4192           0 :         PG_RETURN_NULL();
    4193             : 
    4194             :     /* get the compression method id stored in the compressed varlena */
    4195         174 :     cmid = toast_get_compression_id((struct varlena *)
    4196         174 :                                     DatumGetPointer(PG_GETARG_DATUM(0)));
    4197         174 :     if (cmid == TOAST_INVALID_COMPRESSION_ID)
    4198          42 :         PG_RETURN_NULL();
    4199             : 
    4200             :     /* convert compression method id to compression method name */
    4201         132 :     switch (cmid)
    4202             :     {
    4203          66 :         case TOAST_PGLZ_COMPRESSION_ID:
    4204          66 :             result = "pglz";
    4205          66 :             break;
    4206          66 :         case TOAST_LZ4_COMPRESSION_ID:
    4207          66 :             result = "lz4";
    4208          66 :             break;
    4209           0 :         default:            /* any other id is reported as an error */
    4210           0 :             elog(ERROR, "invalid compression method id %d", cmid);
    4211             :     }
    4212             : 
    4213         132 :     PG_RETURN_TEXT_P(cstring_to_text(result));
    4214             : }
    4215             : 
    4216             : /*
    4217             :  * Return the chunk_id (the TOAST pointer's va_valueid) of the on-disk TOASTed value.
    4218             :  * Return NULL if the argument is not a varlena type, or the value is un-TOASTed or not on-disk.
    4219             :  */
    4220             : Datum
    4221          52 : pg_column_toast_chunk_id(PG_FUNCTION_ARGS)
    4222             : {
    4223             :     int         typlen;     /* typlen of argument 0's type, cached in fn_extra */
    4224             :     struct varlena *attr;
    4225             :     struct varatt_external toast_pointer;
    4226             : 
    4227             :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4228          52 :     if (fcinfo->flinfo->fn_extra == NULL)
    4229             :     {
    4230             :         /* Lookup the datatype of the supplied argument */
    4231          40 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4232             : 
    4233          40 :         typlen = get_typlen(argtypeid);
    4234          40 :         if (typlen == 0)        /* should not happen */
    4235           0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4236             : 
    4237          40 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4238             :                                                       sizeof(int));
    4239          40 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4240             :     }
    4241             :     else
    4242          12 :         typlen = *((int *) fcinfo->flinfo->fn_extra);    /* later calls: reuse the cached value */
    4243             : 
    4244          52 :     if (typlen != -1)       /* only varlena types (typlen -1) can be TOASTed */
    4245           0 :         PG_RETURN_NULL();
    4246             : 
    4247          52 :     attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0));    /* raw datum pointer, fetched without a detoasting GETARG macro */
    4248             : 
    4249          52 :     if (!VARATT_IS_EXTERNAL_ONDISK(attr))
    4250          12 :         PG_RETURN_NULL();
    4251             : 
    4252          40 :     VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);    /* populates toast_pointer from attr */
    4253             : 
    4254          40 :     PG_RETURN_OID(toast_pointer.va_valueid);
    4255             : }
    4256             : 
    4257             : /*
    4258             :  * string_agg - Concatenates values and returns string.
    4259             :  *
    4260             :  * Syntax: string_agg(value text, delimiter text) RETURNS text
    4261             :  *
    4262             :  * Note: Any NULL values are ignored. The first-call delimiter isn't
    4263             :  * actually used at all, and on subsequent calls the delimiter precedes
    4264             :  * the associated value.
    4265             :  */
    4266             : 
    4267             : /* subroutine to create an empty StringInfo state in the aggregate memory context; errors out if not called as an aggregate */
    4268             : static StringInfo
    4269        2366 : makeStringAggState(FunctionCallInfo fcinfo)
    4270             : {
    4271             :     StringInfo  state;
    4272             :     MemoryContext aggcontext;
    4273             :     MemoryContext oldcontext;
    4274             : 
    4275        2366 :     if (!AggCheckCallContext(fcinfo, &aggcontext))
    4276             :     {
    4277             :         /* cannot be called directly because of internal-type argument */
    4278           0 :         elog(ERROR, "string_agg_transfn called in non-aggregate context");
    4279             :     }
    4280             : 
    4281             :     /*
    4282             :      * Create state in aggregate context.  It'll stay there across subsequent
    4283             :      * calls.
    4284             :      */
    4285        2366 :     oldcontext = MemoryContextSwitchTo(aggcontext);
    4286        2366 :     state = makeStringInfo();
    4287        2366 :     MemoryContextSwitchTo(oldcontext);    /* restore the caller's memory context */
    4288             : 
    4289        2366 :     return state;
    4290             : }
    4291             : /* Transition function for string_agg: arg 0 = state (or NULL), arg 1 = value, arg 2 = delimiter */
    4292             : Datum
    4293     1093930 : string_agg_transfn(PG_FUNCTION_ARGS)
    4294             : {
    4295             :     StringInfo  state;
    4296             : 
    4297     1093930 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    4298             : 
    4299             :     /* Append the value unless null, preceding it with the delimiter. */
    4300     1093930 :     if (!PG_ARGISNULL(1))
    4301             :     {
    4302     1078882 :         text       *value = PG_GETARG_TEXT_PP(1);
    4303     1078882 :         bool        isfirst = false;    /* set only on the call that creates the state */
    4304             : 
    4305             :         /*
    4306             :          * You might think we can just throw away the first delimiter, however
    4307             :          * we must keep it as we may be a parallel worker doing partial
    4308             :          * aggregation building a state to send to the main process.  We need
    4309             :          * to keep the delimiter of every aggregation so that the combine
    4310             :          * function can properly join up the strings of two separately
    4311             :          * partially aggregated results.  The first delimiter is only stripped
    4312             :          * off in the final function.  To know how much to strip off the front
    4313             :          * of the string, we store the length of the first delimiter in the
    4314             :          * StringInfo's cursor field, which we don't otherwise need here.
    4315             :          */
    4316     1078882 :         if (state == NULL)
    4317             :         {
    4318        2046 :             state = makeStringAggState(fcinfo);
    4319        2046 :             isfirst = true;
    4320             :         }
    4321             : 
    4322     1078882 :         if (!PG_ARGISNULL(2))    /* a NULL delimiter appends nothing and leaves cursor untouched */
    4323             :         {
    4324     1078882 :             text       *delim = PG_GETARG_TEXT_PP(2);
    4325             : 
    4326     1078882 :             appendStringInfoText(state, delim);
    4327     1078882 :             if (isfirst)
    4328        2046 :                 state->cursor = VARSIZE_ANY_EXHDR(delim);    /* byte length of the leading delimiter */
    4329             :         }
    4330             : 
    4331     1078882 :         appendStringInfoText(state, value);
    4332             :     }
    4333             : 
    4334             :     /*
    4335             :      * The transition type for string_agg() is declared to be "internal",
    4336             :      * which is a pass-by-value type the same size as a pointer.
    4337             :      */
    4338     1093930 :     if (state)
    4339     1093840 :         PG_RETURN_POINTER(state);
    4340          90 :     PG_RETURN_NULL();    /* no state was passed in and none was created on this call */
    4341             : }
    4342             : 
    4343             : /*
    4344             :  * string_agg_combine
    4345             :  *      Aggregate combine function for string_agg(text) and string_agg(bytea): appends state2's bytes to state1
    4346             :  */
    4347             : Datum
    4348         200 : string_agg_combine(PG_FUNCTION_ARGS)
    4349             : {
    4350             :     StringInfo  state1;     /* arg 0: accumulated state, returned to the caller */
    4351             :     StringInfo  state2;     /* arg 1: state to merge in; only read here */
    4352             :     MemoryContext agg_context;
    4353             : 
    4354         200 :     if (!AggCheckCallContext(fcinfo, &agg_context))
    4355           0 :         elog(ERROR, "aggregate function called in non-aggregate context");
    4356             : 
    4357         200 :     state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    4358         200 :     state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
    4359             : 
    4360         200 :     if (state2 == NULL)
    4361             :     {
    4362             :         /*
    4363             :          * NULL state2 is easy, just return state1, which we know is already
    4364             :          * in the agg_context
    4365             :          */
    4366           0 :         if (state1 == NULL)
    4367           0 :             PG_RETURN_NULL();
    4368           0 :         PG_RETURN_POINTER(state1);
    4369             :     }
    4370             : 
    4371         200 :     if (state1 == NULL)
    4372             :     {
    4373             :         /* We must copy state2's data into the agg_context */
    4374             :         MemoryContext old_context;
    4375             : 
    4376         120 :         old_context = MemoryContextSwitchTo(agg_context);
    4377         120 :         state1 = makeStringAggState(fcinfo);
    4378         120 :         appendBinaryStringInfo(state1, state2->data, state2->len);
    4379         120 :         state1->cursor = state2->cursor;    /* carry over the first-delimiter length */
    4380         120 :         MemoryContextSwitchTo(old_context);
    4381             :     }
    4382          80 :     else if (state2->len > 0)    /* an empty state2 leaves state1 as it is */
    4383             :     {
    4384             :         /* Combine ... state1->cursor does not change in this case */
    4385          80 :         appendBinaryStringInfo(state1, state2->data, state2->len);
    4386             :     }
    4387             : 
    4388         200 :     PG_RETURN_POINTER(state1);
    4389             : }
    4390             : 
    4391             : /*
    4392             :  * string_agg_serialize
    4393             :  *      Aggregate serialize function for string_agg(text) and string_agg(bytea)
    4394             :  *
    4395             :  * This is strict, so we need not handle NULL input.  Output layout: 4-byte cursor, then the state's data bytes.
    4396             :  */
    4397             : Datum
    4398         200 : string_agg_serialize(PG_FUNCTION_ARGS)
    4399             : {
    4400             :     StringInfo  state;
    4401             :     StringInfoData buf;     /* scratch buffer the bytea result is built in */
    4402             :     bytea      *result;
    4403             : 
    4404             :     /* cannot be called directly because of internal-type argument */
    4405             :     Assert(AggCheckCallContext(fcinfo, NULL));
    4406             : 
    4407         200 :     state = (StringInfo) PG_GETARG_POINTER(0);
    4408             : 
    4409         200 :     pq_begintypsend(&buf);
    4410             : 
    4411             :     /* cursor */
    4412         200 :     pq_sendint(&buf, state->cursor, 4);
    4413             : 
    4414             :     /* data */
    4415         200 :     pq_sendbytes(&buf, state->data, state->len);
    4416             : 
    4417         200 :     result = pq_endtypsend(&buf);
    4418             : 
    4419         200 :     PG_RETURN_BYTEA_P(result);
    4420             : }
    4421             : 
    4422             : /*
    4423             :  * string_agg_deserialize
    4424             :  *      Aggregate deserial function for string_agg(text) and string_agg(bytea)
    4425             :  *
    4426             :  * This is strict, so we need not handle NULL input.  Input layout: 4-byte cursor, then the data bytes.
    4427             :  */
    4428             : Datum
    4429         200 : string_agg_deserialize(PG_FUNCTION_ARGS)
    4430             : {
    4431             :     bytea      *sstate;     /* serialized state, arg 0 */
    4432             :     StringInfo  result;     /* rebuilt state, made by makeStringAggState() */
    4433             :     StringInfoData buf;     /* read-only view over sstate's payload */
    4434             :     char       *data;
    4435             :     int         datalen;
    4436             : 
    4437             :     /* cannot be called directly because of internal-type argument */
    4438             :     Assert(AggCheckCallContext(fcinfo, NULL));
    4439             : 
    4440         200 :     sstate = PG_GETARG_BYTEA_PP(0);
    4441             : 
    4442             :     /*
    4443             :      * Initialize a StringInfo so that we can "receive" it using the standard
    4444             :      * recv-function infrastructure.
    4445             :      */
    4446         200 :     initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
    4447         200 :                            VARSIZE_ANY_EXHDR(sstate));
    4448             : 
    4449         200 :     result = makeStringAggState(fcinfo);
    4450             : 
    4451             :     /* cursor */
    4452         200 :     result->cursor = pq_getmsgint(&buf, 4);
    4453             : 
    4454             :     /* data */
    4455         200 :     datalen = VARSIZE_ANY_EXHDR(sstate) - 4;    /* payload size minus the 4-byte cursor just read */
    4456         200 :     data = (char *) pq_getmsgbytes(&buf, datalen);
    4457         200 :     appendBinaryStringInfo(result, data, datalen);
    4458             : 
    4459         200 :     pq_getmsgend(&buf);
    4460             : 
    4461         200 :     PG_RETURN_POINTER(result);
    4462             : }
    4463             : /* Final function for string_agg: return the accumulated bytes after the cursor as text, or NULL if there is no state */
    4464             : Datum
    4465        2090 : string_agg_finalfn(PG_FUNCTION_ARGS)
    4466             : {
    4467             :     StringInfo  state;
    4468             : 
    4469             :     /* cannot be called directly because of internal-type argument */
    4470             :     Assert(AggCheckCallContext(fcinfo, NULL));
    4471             : 
    4472        2090 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    4473             : 
    4474        2090 :     if (state != NULL)
    4475             :     {
    4476             :         /* As per comment in transfn, strip data before the cursor position */
    4477        2006 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
    4478             :                                                   state->len - state->cursor));
    4479             :     }
    4480             :     else
    4481          84 :         PG_RETURN_NULL();    /* no state means the result is NULL */
    4482             : }
    4483             : 
    4484             : /*
    4485             :  * Prepare cache with fmgr info for the output functions of the datatypes of
    4486             :  * the arguments of a concat-like function, beginning with argument "argidx".
    4487             :  * (Arguments before that will have corresponding slots in the resulting
    4488             :  * FmgrInfo array, but we don't fill those slots.)  The array is also saved in fn_extra.
    4489             :  */
    4490             : static FmgrInfo *
    4491         106 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
    4492             : {
    4493             :     FmgrInfo   *foutcache;
    4494             :     int         i;
    4495             : 
    4496             :     /* We keep the info in fn_mcxt so it survives across calls */
    4497         106 :     foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4498         106 :                                                 PG_NARGS() * sizeof(FmgrInfo));    /* one slot per argument, indexed by argument number */
    4499             : 
    4500         400 :     for (i = argidx; i < PG_NARGS(); i++)
    4501             :     {
    4502             :         Oid         valtype;
    4503             :         Oid         typOutput;
    4504             :         bool        typIsVarlena;    /* required output parameter; not used afterwards */
    4505             : 
    4506         294 :         valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
    4507         294 :         if (!OidIsValid(valtype))
    4508           0 :             elog(ERROR, "could not determine data type of concat() input");
    4509             : 
    4510         294 :         getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
    4511         294 :         fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
    4512             :     }
    4513             : 
    4514         106 :     fcinfo->flinfo->fn_extra = foutcache;    /* later calls find the cache here */
    4515             : 
    4516         106 :     return foutcache;
    4517             : }
    4518             : 
    4519             : /*
    4520             :  * Implementation of both concat() and concat_ws().
    4521             :  *
    4522             :  * sepstr is the separator string to place between values.
    4523             :  * argidx identifies the first argument to concatenate (counting from zero);
    4524             :  * note that this must be constant across any one series of calls.
    4525             :  *
    4526             :  * Returns NULL if result should be NULL, else text value.
    4527             :  */
    4528             : static text *
    4529         264 : concat_internal(const char *sepstr, int argidx,
    4530             :                 FunctionCallInfo fcinfo)
    4531             : {
    4532             :     text       *result;
    4533             :     StringInfoData str;     /* accumulates the concatenated output */
    4534             :     FmgrInfo   *foutcache;  /* per-argument output function info, cached in fn_extra */
    4535         264 :     bool        first_arg = true;    /* no separator before the first non-null value */
    4536             :     int         i;
    4537             : 
    4538             :     /*
    4539             :      * concat(VARIADIC some-array) is essentially equivalent to
    4540             :      * array_to_text(), ie concat the array elements with the given separator.
    4541             :      * So we just pass the case off to that code.
    4542             :      */
    4543         264 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    4544             :     {
    4545             :         ArrayType  *arr;
    4546             : 
    4547             :         /* Should have just the one argument */
    4548             :         Assert(argidx == PG_NARGS() - 1);
    4549             : 
    4550             :         /* concat(VARIADIC NULL) is defined as NULL */
    4551          30 :         if (PG_ARGISNULL(argidx))
    4552          12 :             return NULL;
    4553             : 
    4554             :         /*
    4555             :          * Non-null argument had better be an array.  We assume that any call
    4556             :          * context that could let get_fn_expr_variadic return true will have
    4557             :          * checked that a VARIADIC-labeled parameter actually is an array.  So
    4558             :          * it should be okay to just Assert that it's an array rather than
    4559             :          * doing a full-fledged error check.
    4560             :          */
    4561             :         Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
    4562             : 
    4563             :         /* OK, safe to fetch the array value */
    4564          18 :         arr = PG_GETARG_ARRAYTYPE_P(argidx);
    4565             : 
    4566             :         /*
    4567             :          * And serialize the array.  We tell array_to_text to ignore null
    4568             :          * elements, which matches the behavior of the loop below.
    4569             :          */
    4570          18 :         return array_to_text_internal(fcinfo, arr, sepstr, NULL);
    4571             :     }
    4572             : 
    4573             :     /* Normal case without explicit VARIADIC marker */
    4574         234 :     initStringInfo(&str);
    4575             : 
    4576             :     /* Get output function info, building it if first time through */
    4577         234 :     foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
    4578         234 :     if (foutcache == NULL)
    4579         106 :         foutcache = build_concat_foutcache(fcinfo, argidx);
    4580             : 
    4581         822 :     for (i = argidx; i < PG_NARGS(); i++)
    4582             :     {
    4583         588 :         if (!PG_ARGISNULL(i))    /* NULL arguments are skipped, separator included */
    4584             :         {
    4585         510 :             Datum       value = PG_GETARG_DATUM(i);
    4586             : 
    4587             :             /* add separator if appropriate */
    4588         510 :             if (first_arg)
    4589         228 :                 first_arg = false;
    4590             :             else
    4591         282 :                 appendStringInfoString(&str, sepstr);
    4592             : 
    4593             :             /* call the appropriate type output function, append the result */
    4594         510 :             appendStringInfoString(&str,
    4595         510 :                                    OutputFunctionCall(&foutcache[i], value));
    4596             :         }
    4597             :     }
    4598             : 
    4599         234 :     result = cstring_to_text_with_len(str.data, str.len);
    4600         234 :     pfree(str.data);    /* scratch buffer is released once result has been built */
    4601             : 
    4602         234 :     return result;
    4603             : }
    4604             : 
    4605             : /*
    4606             :  * Concatenate all arguments. NULL arguments are ignored.  Returns SQL NULL when concat_internal() returns NULL.
    4607             :  */
    4608             : Datum
    4609         186 : text_concat(PG_FUNCTION_ARGS)
    4610             : {
    4611             :     text       *result;
    4612             : 
    4613         186 :     result = concat_internal("", 0, fcinfo);    /* empty separator, start at argument 0 */
    4614         186 :     if (result == NULL)
    4615           6 :         PG_RETURN_NULL();
    4616         180 :     PG_RETURN_TEXT_P(result);
    4617             : }
    4618             : 
    4619             : /*
    4620             :  * Concatenate all but first argument value with separators. The first
    4621             :  * parameter is used as the separator. NULL arguments are ignored; a NULL separator yields NULL.
    4622             :  */
    4623             : Datum
    4624          84 : text_concat_ws(PG_FUNCTION_ARGS)
    4625             : {
    4626             :     char       *sep;        /* separator converted to a C string */
    4627             :     text       *result;
    4628             : 
    4629             :     /* return NULL when separator is NULL */
    4630          84 :     if (PG_ARGISNULL(0))
    4631           6 :         PG_RETURN_NULL();
    4632          78 :     sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
    4633             : 
    4634          78 :     result = concat_internal(sep, 1, fcinfo);    /* values start at argument 1 */
    4635          78 :     if (result == NULL)
    4636           6 :         PG_RETURN_NULL();
    4637          72 :     PG_RETURN_TEXT_P(result);
    4638             : }
    4639             : 
    4640             : /*
    4641             :  * Return first n characters in the string. When n is negative,
    4642             :  * return all but last |n| characters.  Lengths are counted in characters, not bytes.
    4643             :  */
    4644             : Datum
    4645        2148 : text_left(PG_FUNCTION_ARGS)
    4646             : {
    4647        2148 :     int         n = PG_GETARG_INT32(1);
    4648             : 
    4649        2148 :     if (n < 0)
    4650             :     {
    4651          30 :         text       *str = PG_GETARG_TEXT_PP(0);
    4652          30 :         const char *p = VARDATA_ANY(str);
    4653          30 :         int         len = VARSIZE_ANY_EXHDR(str);    /* payload length in bytes */
    4654             :         int         rlen;    /* byte length of the prefix to return */
    4655             : 
    4656          30 :         n = pg_mbstrlen_with_len(p, len) + n;    /* characters to keep; not positive when |n| >= string length (presumably clipped to an empty result, TODO confirm) */
    4657          30 :         rlen = pg_mbcharcliplen(p, len, n);
    4658          30 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
    4659             :     }
    4660             :     else
    4661        2118 :         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));    /* non-negative n is delegated to text_substring */
    4662             : }
    4663             : 
    4664             : /*
    4665             :  * Return last n characters in the string. When n is negative,
    4666             :  * return all but first |n| characters.  Lengths are counted in characters, not bytes.
    4667             :  */
    4668             : Datum
    4669          66 : text_right(PG_FUNCTION_ARGS)
    4670             : {
    4671          66 :     text       *str = PG_GETARG_TEXT_PP(0);
    4672          66 :     const char *p = VARDATA_ANY(str);
    4673          66 :     int         len = VARSIZE_ANY_EXHDR(str);    /* payload length in bytes */
    4674          66 :     int         n = PG_GETARG_INT32(1);
    4675             :     int         off;        /* byte offset at which the returned suffix starts */
    4676             : 
    4677          66 :     if (n < 0)
    4678          30 :         n = -n;             /* NOTE(review): negating INT_MIN overflows int -- confirm how that input should behave */
    4679             :     else
    4680          36 :         n = pg_mbstrlen_with_len(p, len) - n;    /* from here on, n is the number of leading characters to skip */
    4681          66 :     off = pg_mbcharcliplen(p, len, n);
    4682             : 
    4683          66 :     PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
    4684             : }
    4685             : 
    4686             : /*
    4687             :  * Return reversed string (characters are reversed; the bytes within a multibyte character keep their order)
    4688             :  */
    4689             : Datum
    4690           6 : text_reverse(PG_FUNCTION_ARGS)
    4691             : {
    4692           6 :     text       *str = PG_GETARG_TEXT_PP(0);
    4693           6 :     const char *p = VARDATA_ANY(str);
    4694           6 :     int         len = VARSIZE_ANY_EXHDR(str);    /* payload length in bytes */
    4695           6 :     const char *endp = p + len;
    4696             :     text       *result;
    4697             :     char       *dst;        /* write position; moves backwards through the result */
    4698             : 
    4699           6 :     result = palloc(len + VARHDRSZ);
    4700           6 :     dst = (char *) VARDATA(result) + len;    /* start just past the end of the result payload */
    4701           6 :     SET_VARSIZE(result, len + VARHDRSZ);
    4702             : 
    4703           6 :     if (pg_database_encoding_max_length() > 1)
    4704             :     {
    4705             :         /* multibyte version */
    4706          36 :         while (p < endp)
    4707             :         {
    4708             :             int         sz;
    4709             : 
    4710          30 :             sz = pg_mblen(p);
    4711          30 :             dst -= sz;
    4712          30 :             memcpy(dst, p, sz);    /* copy the whole character as one unit */
    4713          30 :             p += sz;
    4714             :         }
    4715             :     }
    4716             :     else
    4717             :     {
    4718             :         /* single byte version */
    4719           0 :         while (p < endp)
    4720           0 :             *(--dst) = *p++;
    4721             :     }
    4722             : 
    4723           6 :     PG_RETURN_TEXT_P(result);
    4724             : }
    4725             : 
    4726             : 
    4727             : /*
    4728             :  * Support macros for text_format()
    4729             :  */
    4730             : #define TEXT_FORMAT_FLAG_MINUS  0x0001  /* is minus flag present? */
    4731             : /* Pre-increment ptr in place; raise an error if it has then reached or passed end_ptr. */
    4732             : #define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
    4733             :     do { \
    4734             :         if (++(ptr) >= (end_ptr)) \
    4735             :             ereport(ERROR, \
    4736             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
    4737             :                      errmsg("unterminated format() type specifier"), \
    4738             :                      errhint("For a single \"%%\" use \"%%%%\"."))); \
    4739             :     } while (0)
    4740             : 
    4741             : /*
    4742             :  * Returns a formatted string
    4743             :  */
    4744             : Datum
    4745       33216 : text_format(PG_FUNCTION_ARGS)
    4746             : {
    4747             :     text       *fmt;
    4748             :     StringInfoData str;
    4749             :     const char *cp;
    4750             :     const char *start_ptr;
    4751             :     const char *end_ptr;
    4752             :     text       *result;
    4753             :     int         arg;
    4754             :     bool        funcvariadic;
    4755             :     int         nargs;
    4756       33216 :     Datum      *elements = NULL;
    4757       33216 :     bool       *nulls = NULL;
    4758       33216 :     Oid         element_type = InvalidOid;
    4759       33216 :     Oid         prev_type = InvalidOid;
    4760       33216 :     Oid         prev_width_type = InvalidOid;
    4761             :     FmgrInfo    typoutputfinfo;
    4762             :     FmgrInfo    typoutputinfo_width;
    4763             : 
    4764             :     /* When format string is null, immediately return null */
    4765       33216 :     if (PG_ARGISNULL(0))
    4766           6 :         PG_RETURN_NULL();
    4767             : 
    4768             :     /* If argument is marked VARIADIC, expand array into elements */
    4769       33210 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    4770             :     {
    4771             :         ArrayType  *arr;
    4772             :         int16       elmlen;
    4773             :         bool        elmbyval;
    4774             :         char        elmalign;
    4775             :         int         nitems;
    4776             : 
    4777             :         /* Should have just the one argument */
    4778             :         Assert(PG_NARGS() == 2);
    4779             : 
    4780             :         /* If argument is NULL, we treat it as zero-length array */
    4781          48 :         if (PG_ARGISNULL(1))
    4782           6 :             nitems = 0;
    4783             :         else
    4784             :         {
    4785             :             /*
    4786             :              * Non-null argument had better be an array.  We assume that any
    4787             :              * call context that could let get_fn_expr_variadic return true
    4788             :              * will have checked that a VARIADIC-labeled parameter actually is
    4789             :              * an array.  So it should be okay to just Assert that it's an
    4790             :              * array rather than doing a full-fledged error check.
    4791             :              */
    4792             :             Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
    4793             : 
    4794             :             /* OK, safe to fetch the array value */
    4795          42 :             arr = PG_GETARG_ARRAYTYPE_P(1);
    4796             : 
    4797             :             /* Get info about array element type */
    4798          42 :             element_type = ARR_ELEMTYPE(arr);
    4799          42 :             get_typlenbyvalalign(element_type,
    4800             :                                  &elmlen, &elmbyval, &elmalign);
    4801             : 
    4802             :             /* Extract all array elements */
    4803          42 :             deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
    4804             :                               &elements, &nulls, &nitems);
    4805             :         }
    4806             : 
    4807          48 :         nargs = nitems + 1;
    4808          48 :         funcvariadic = true;
    4809             :     }
    4810             :     else
    4811             :     {
    4812             :         /* Non-variadic case, we'll process the arguments individually */
    4813       33162 :         nargs = PG_NARGS();
    4814       33162 :         funcvariadic = false;
    4815             :     }
    4816             : 
    4817             :     /* Setup for main loop. */
    4818       33210 :     fmt = PG_GETARG_TEXT_PP(0);
    4819       33210 :     start_ptr = VARDATA_ANY(fmt);
    4820       33210 :     end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
    4821       33210 :     initStringInfo(&str);
    4822       33210 :     arg = 1;                    /* next argument position to print */
    4823             : 
    4824             :     /* Scan format string, looking for conversion specifiers. */
    4825     1013188 :     for (cp = start_ptr; cp < end_ptr; cp++)
    4826             :     {
    4827             :         int         argpos;
    4828             :         int         widthpos;
    4829             :         int         flags;
    4830             :         int         width;
    4831             :         Datum       value;
    4832             :         bool        isNull;
    4833             :         Oid         typid;
    4834             : 
    4835             :         /*
    4836             :          * If it's not the start of a conversion specifier, just copy it to
    4837             :          * the output buffer.
    4838             :          */
    4839      980038 :         if (*cp != '%')
    4840             :         {
    4841      914170 :             appendStringInfoCharMacro(&str, *cp);
    4842      914188 :             continue;
    4843             :         }
    4844             : 
    4845       65868 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    4846             : 
    4847             :         /* Easy case: %% outputs a single % */
    4848       65868 :         if (*cp == '%')
    4849             :         {
    4850          18 :             appendStringInfoCharMacro(&str, *cp);
    4851          18 :             continue;
    4852             :         }
    4853             : 
    4854             :         /* Parse the optional portions of the format specifier */
    4855       65850 :         cp = text_format_parse_format(cp, end_ptr,
    4856             :                                       &argpos, &widthpos,
    4857             :                                       &flags, &width);
    4858             : 
    4859             :         /*
    4860             :          * Next we should see the main conversion specifier.  Whether or not
    4861             :          * an argument position was present, it's known that at least one
    4862             :          * character remains in the string at this point.  Experience suggests
    4863             :          * that it's worth checking that that character is one of the expected
    4864             :          * ones before we try to fetch arguments, so as to produce the least
    4865             :          * confusing response to a mis-formatted specifier.
    4866             :          */
    4867       65826 :         if (strchr("sIL", *cp) == NULL)
    4868           6 :             ereport(ERROR,
    4869             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4870             :                      errmsg("unrecognized format() type specifier \"%.*s\"",
    4871             :                             pg_mblen(cp), cp),
    4872             :                      errhint("For a single \"%%\" use \"%%%%\".")));
    4873             : 
    4874             :         /* If indirect width was specified, get its value */
    4875       65820 :         if (widthpos >= 0)
    4876             :         {
    4877             :             /* Collect the specified or next argument position */
    4878          42 :             if (widthpos > 0)
    4879          36 :                 arg = widthpos;
    4880          42 :             if (arg >= nargs)
    4881           0 :                 ereport(ERROR,
    4882             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4883             :                          errmsg("too few arguments for format()")));
    4884             : 
    4885             :             /* Get the value and type of the selected argument */
    4886          42 :             if (!funcvariadic)
    4887             :             {
    4888          42 :                 value = PG_GETARG_DATUM(arg);
    4889          42 :                 isNull = PG_ARGISNULL(arg);
    4890          42 :                 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    4891             :             }
    4892             :             else
    4893             :             {
    4894           0 :                 value = elements[arg - 1];
    4895           0 :                 isNull = nulls[arg - 1];
    4896           0 :                 typid = element_type;
    4897             :             }
    4898          42 :             if (!OidIsValid(typid))
    4899           0 :                 elog(ERROR, "could not determine data type of format() input");
    4900             : 
    4901          42 :             arg++;
    4902             : 
    4903             :             /* We can treat NULL width the same as zero */
    4904          42 :             if (isNull)
    4905           6 :                 width = 0;
    4906          36 :             else if (typid == INT4OID)
    4907          36 :                 width = DatumGetInt32(value);
    4908           0 :             else if (typid == INT2OID)
    4909           0 :                 width = DatumGetInt16(value);
    4910             :             else
    4911             :             {
    4912             :                 /* For less-usual datatypes, convert to text then to int */
    4913             :                 char       *str;
    4914             : 
    4915           0 :                 if (typid != prev_width_type)
    4916             :                 {
    4917             :                     Oid         typoutputfunc;
    4918             :                     bool        typIsVarlena;
    4919             : 
    4920           0 :                     getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    4921           0 :                     fmgr_info(typoutputfunc, &typoutputinfo_width);
    4922           0 :                     prev_width_type = typid;
    4923             :                 }
    4924             : 
    4925           0 :                 str = OutputFunctionCall(&typoutputinfo_width, value);
    4926             : 
    4927             :                 /* pg_strtoint32 will complain about bad data or overflow */
    4928           0 :                 width = pg_strtoint32(str);
    4929             : 
    4930           0 :                 pfree(str);
    4931             :             }
    4932             :         }
    4933             : 
    4934             :         /* Collect the specified or next argument position */
    4935       65820 :         if (argpos > 0)
    4936         132 :             arg = argpos;
    4937       65820 :         if (arg >= nargs)
    4938          24 :             ereport(ERROR,
    4939             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4940             :                      errmsg("too few arguments for format()")));
    4941             : 
    4942             :         /* Get the value and type of the selected argument */
    4943       65796 :         if (!funcvariadic)
    4944             :         {
    4945       64524 :             value = PG_GETARG_DATUM(arg);
    4946       64524 :             isNull = PG_ARGISNULL(arg);
    4947       64524 :             typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    4948             :         }
    4949             :         else
    4950             :         {
    4951        1272 :             value = elements[arg - 1];
    4952        1272 :             isNull = nulls[arg - 1];
    4953        1272 :             typid = element_type;
    4954             :         }
    4955       65796 :         if (!OidIsValid(typid))
    4956           0 :             elog(ERROR, "could not determine data type of format() input");
    4957             : 
    4958       65796 :         arg++;
    4959             : 
    4960             :         /*
    4961             :          * Get the appropriate typOutput function, reusing previous one if
    4962             :          * same type as previous argument.  That's particularly useful in the
    4963             :          * variadic-array case, but often saves work even for ordinary calls.
    4964             :          */
    4965       65796 :         if (typid != prev_type)
    4966             :         {
    4967             :             Oid         typoutputfunc;
    4968             :             bool        typIsVarlena;
    4969             : 
    4970       34278 :             getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    4971       34278 :             fmgr_info(typoutputfunc, &typoutputfinfo);
    4972       34278 :             prev_type = typid;
    4973             :         }
    4974             : 
    4975             :         /*
    4976             :          * And now we can format the value.
    4977             :          */
    4978       65796 :         switch (*cp)
    4979             :         {
    4980       65796 :             case 's':
    4981             :             case 'I':
    4982             :             case 'L':
    4983       65796 :                 text_format_string_conversion(&str, *cp, &typoutputfinfo,
    4984             :                                               value, isNull,
    4985             :                                               flags, width);
    4986       65790 :                 break;
    4987           0 :             default:
    4988             :                 /* should not get here, because of previous check */
    4989           0 :                 ereport(ERROR,
    4990             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4991             :                          errmsg("unrecognized format() type specifier \"%.*s\"",
    4992             :                                 pg_mblen(cp), cp),
    4993             :                          errhint("For a single \"%%\" use \"%%%%\".")));
    4994             :                 break;
    4995             :         }
    4996             :     }
    4997             : 
    4998             :     /* Don't need deconstruct_array results anymore. */
    4999       33150 :     if (elements != NULL)
    5000          42 :         pfree(elements);
    5001       33150 :     if (nulls != NULL)
    5002          42 :         pfree(nulls);
    5003             : 
    5004             :     /* Generate results. */
    5005       33150 :     result = cstring_to_text_with_len(str.data, str.len);
    5006       33150 :     pfree(str.data);
    5007             : 
    5008       33150 :     PG_RETURN_TEXT_P(result);
    5009             : }
    5010             : 
    5011             : /*
    5012             :  * Parse contiguous ASCII digits ('0'..'9') as a decimal number.
    5013             :  *
    5014             :  * Returns true if some digits could be parsed, false if there were none.
    5015             :  * The value (zero if no digits were seen) is returned into *value, and *ptr
    5016             :  * is advanced to the next character to be parsed.  int overflow raises ERROR.
    5017             :  *
    5018             :  * Note parsing invariant: at least one character is known available before
    5019             :  * string end (end_ptr) at entry, and this is still true at exit.
    5020             :  */
    5021             : static bool
    5022      131664 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
    5023             : {
    5024      131664 :     bool        found = false;
    5025      131664 :     const char *cp = *ptr;
    5026      131664 :     int         val = 0;
    5027             : 
    5028      131976 :     while (*cp >= '0' && *cp <= '9')
    5029             :     {
    5030         318 :         int8        digit = (*cp - '0');    /* numeric value of this digit, 0..9 */
    5031             : 
    5032         318 :         if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
    5033         318 :             unlikely(pg_add_s32_overflow(val, digit, &val)))
    5034           0 :             ereport(ERROR,
    5035             :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    5036             :                      errmsg("number is out of range")));
    5037         318 :         ADVANCE_PARSE_POINTER(cp, end_ptr); /* NOTE(review): macro defined outside this chunk; presumably errors out if the string ends here -- confirm */
    5038         312 :         found = true;
    5039             :     }
    5040             : 
    5041      131658 :     *ptr = cp;                  /* hand back the first non-digit position */
    5042      131658 :     *value = val;
    5043             : 
    5044      131658 :     return found;
    5045             : }
    5046             : 
    5047             : /*
    5048             :  * Parse a format specifier (generally following the SUS printf spec).
    5049             :  *
    5050             :  * We have already advanced over the initial '%', and we are looking for
    5051             :  * [argpos][flags][width]type (but the type character is not consumed here).
    5052             :  *
    5053             :  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
    5054             :  * Output parameters:
    5055             :  *  argpos: argument position for value to be printed ("N$").  -1 means unspecified.
    5056             :  *  widthpos: argument position for width.  Zero means the argument position
    5057             :  *          was unspecified (ie, take the next arg) and -1 means no width
    5058             :  *          argument (width was omitted or specified as a constant).
    5059             :  *  flags: bitmask of flags (only TEXT_FORMAT_FLAG_MINUS is set here, for '-').
    5060             :  *  width: directly-specified width value.  Zero means the width was omitted
    5061             :  *          (note it's not necessary to distinguish this case from an explicit
    5062             :  *          zero width value).
    5063             :  *
    5064             :  * The function result is the next character position to be parsed, ie, the
    5065             :  * location where the type character is/should be.  An explicit position of 0, or a width position not closed by '$', raises ERROR.
    5066             :  *
    5067             :  * Note parsing invariant: at least one character is known available before
    5068             :  * string end (end_ptr) at entry, and this is still true at exit.
    5069             :  */
    5070             : static const char *
    5071       65850 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
    5072             :                          int *argpos, int *widthpos,
    5073             :                          int *flags, int *width)
    5074             : {
    5075       65850 :     const char *cp = start_ptr;
    5076             :     int         n;
    5077             : 
    5078             :     /* set defaults for output parameters (nothing specified yet) */
    5079       65850 :     *argpos = -1;
    5080       65850 :     *widthpos = -1;
    5081       65850 :     *flags = 0;
    5082       65850 :     *width = 0;
    5083             : 
    5084             :     /* try to identify first number: either an argument position ("N$") or a bare width */
    5085       65850 :     if (text_format_parse_digits(&cp, end_ptr, &n))
    5086             :     {
    5087         174 :         if (*cp != '$')
    5088             :         {
    5089             :             /* Must be just a width and a type, so we're done */
    5090          24 :             *width = n;
    5091          24 :             return cp;
    5092             :         }
    5093             :         /* The number was argument position */
    5094         150 :         *argpos = n;
    5095             :         /* Explicit 0 for argument index is immediately refused */
    5096         150 :         if (n == 0)
    5097           6 :             ereport(ERROR,
    5098             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5099             :                      errmsg("format specifies argument 0, but arguments are numbered from 1")));
    5100         144 :         ADVANCE_PARSE_POINTER(cp, end_ptr);    /* step over the '$' */
    5101             :     }
    5102             : 
    5103             :     /* Handle flags (only minus is supported now); repeated '-' characters are accepted */
    5104       65844 :     while (*cp == '-')
    5105             :     {
    5106          30 :         *flags |= TEXT_FORMAT_FLAG_MINUS;
    5107          30 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5108             :     }
    5109             : 
    5110       65814 :     if (*cp == '*')
    5111             :     {
    5112             :         /* Handle indirect width: the width is taken from an argument */
    5113          48 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5114          48 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    5115             :         {
    5116             :             /* number in this position must be closed by $ */
    5117          42 :             if (*cp != '$')
    5118           0 :                 ereport(ERROR,
    5119             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5120             :                          errmsg("width argument position must be ended by \"$\"")));
    5121             :             /* The number was width argument position */
    5122          42 :             *widthpos = n;
    5123             :             /* Explicit 0 for argument index is immediately refused */
    5124          42 :             if (n == 0)
    5125           6 :                 ereport(ERROR,
    5126             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5127             :                          errmsg("format specifies argument 0, but arguments are numbered from 1")));
    5128          36 :             ADVANCE_PARSE_POINTER(cp, end_ptr);    /* step over the '$' */
    5129             :         }
    5130             :         else
    5131           6 :             *widthpos = 0;      /* width's argument position is unspecified */
    5132             :     }
    5133             :     else
    5134             :     {
    5135             :         /* Check for direct width specification; if absent, *width keeps its default of zero */
    5136       65766 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    5137          30 :             *width = n;
    5138             :     }
    5139             : 
    5140             :     /* cp should now be pointing at type character */
    5141       65802 :     return cp;
    5142             : }
    5143             : 
    5144             : /*
    5145             :  * Format a %s, %I, or %L conversion of value and append it to buf, padded as directed by flags/width
    5146             :  */
    5147             : static void
    5148       65796 : text_format_string_conversion(StringInfo buf, char conversion,
    5149             :                               FmgrInfo *typOutputInfo,
    5150             :                               Datum value, bool isNull,
    5151             :                               int flags, int width)
    5152             : {
    5153             :     char       *str;
    5154             : 
    5155             :     /* Handle NULL arguments before trying to stringify the value: %s gives "", %L gives NULL, %I is an error. */
    5156       65796 :     if (isNull)
    5157             :     {
    5158         342 :         if (conversion == 's')
    5159         270 :             text_format_append_string(buf, "", flags, width);
    5160          72 :         else if (conversion == 'L')
    5161          66 :             text_format_append_string(buf, "NULL", flags, width);
    5162           6 :         else if (conversion == 'I')
    5163           6 :             ereport(ERROR,
    5164             :                     (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
    5165             :                      errmsg("null values cannot be formatted as an SQL identifier")));
    5166         336 :         return;                 /* nothing more to do for a NULL value */
    5167             :     }
    5168             : 
    5169             :     /* Stringify, using the output function the caller looked up for this type. */
    5170       65454 :     str = OutputFunctionCall(typOutputInfo, value);
    5171             : 
    5172             :     /* Escape: %I quotes as an identifier, %L as a literal, %s appends the text unchanged. */
    5173       65454 :     if (conversion == 'I')
    5174             :     {
    5175             :         /* quote_identifier may or may not allocate a new string, so its result is not pfree'd here. */
    5176        4906 :         text_format_append_string(buf, quote_identifier(str), flags, width);
    5177             :     }
    5178       60548 :     else if (conversion == 'L')
    5179             :     {
    5180        3234 :         char       *qstr = quote_literal_cstr(str);
    5181             : 
    5182        3234 :         text_format_append_string(buf, qstr, flags, width);
    5183             :         /* quote_literal_cstr() always allocates a new string */
    5184        3234 :         pfree(qstr);
    5185             :     }
    5186             :     else
    5187       57314 :         text_format_append_string(buf, str, flags, width);
    5188             : 
    5189             :     /* Cleanup: release the output function's result. */
    5190       65454 :     pfree(str);
    5191             : }
    5192             : 
    5193             : /*
    5194             :  * Append str to buf, padding with spaces as directed by flags/width; str is never truncated
    5195             :  */
    5196             : static void
    5197       65790 : text_format_append_string(StringInfo buf, const char *str,
    5198             :                           int flags, int width)
    5199             : {
    5200       65790 :     bool        align_to_left = false;
    5201             :     int         len;
    5202             : 
    5203             :     /* fast path for typical easy case: no width, so no padding and no length computation */
    5204       65790 :     if (width == 0)
    5205             :     {
    5206       65706 :         appendStringInfoString(buf, str);
    5207       65706 :         return;
    5208             :     }
    5209             : 
    5210          84 :     if (width < 0)
    5211             :     {
    5212             :         /* Negative width: implicit '-' flag, then take absolute value */
    5213           6 :         align_to_left = true;
    5214             :         /* -INT_MIN is undefined, so refuse it rather than negate it */
    5215           6 :         if (width <= INT_MIN)
    5216           0 :             ereport(ERROR,
    5217             :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    5218             :                      errmsg("number is out of range")));
    5219           6 :         width = -width;
    5220             :     }
    5221          78 :     else if (flags & TEXT_FORMAT_FLAG_MINUS)
    5222          24 :         align_to_left = true;
    5223             : 
    5224          84 :     len = pg_mbstrlen(str);     /* length as reported by pg_mbstrlen(), which is what width is compared against */
    5225          84 :     if (align_to_left)
    5226             :     {
    5227             :         /* left justify: string first, then any padding */
    5228          30 :         appendStringInfoString(buf, str);
    5229          30 :         if (len < width)
    5230          30 :             appendStringInfoSpaces(buf, width - len);
    5231             :     }
    5232             :     else
    5233             :     {
    5234             :         /* right justify: any padding first, then the string */
    5235          54 :         if (len < width)
    5236          54 :             appendStringInfoSpaces(buf, width - len);
    5237          54 :         appendStringInfoString(buf, str);
    5238             :     }
    5239             : }
    5240             : 
    5241             : /*
    5242             :  * text_format_nv - nonvariadic wrapper for text_format function.
    5243             :  *
    5244             :  * Note: this wrapper is necessary to pass the sanity check in opr_sanity,
    5245             :  * which checks that all built-in functions that share the implementing C
    5246             :  * function take the same number of arguments.  It only forwards fcinfo.
    5247             :  */
    5248             : Datum
    5249        3810 : text_format_nv(PG_FUNCTION_ARGS)
    5250             : {
    5251        3810 :     return text_format(fcinfo);
    5252             : }
    5253             : 
    5254             : /*
    5255             :  * Helper function for Levenshtein distance functions: returns true if the first len
    5256             :  * bytes of s1 and s2 are equal (true when len <= 0).  Faster than memcmp(), for this use case.
    5257             :  */
    5258             : static inline bool
    5259           0 : rest_of_char_same(const char *s1, const char *s2, int len)
    5260             : {
    5261           0 :     while (len > 0)
    5262             :     {
    5263           0 :         len--;                  /* compare from the last byte back towards the first */
    5264           0 :         if (s1[len] != s2[len])
    5265           0 :             return false;
    5266             :     }
    5267           0 :     return true;
    5268             : }
    5269             : 
    5270             : /* Expand each Levenshtein distance variant */
    5271             : #include "levenshtein.c"
    5272             : #define LEVENSHTEIN_LESS_EQUAL
    5273             : #include "levenshtein.c"
    5274             : 
    5275             : 
    5276             : /*
    5277             :  * The following *ClosestMatch() functions can be used to determine whether a
    5278             :  * user-provided string resembles any known valid values, which is useful for
    5279             :  * providing hints in log messages, among other things.  Use these functions
    5280             :  * like so:
    5281             :  *
    5282             :  *      initClosestMatch(&state, source_string, max_distance);
    5283             :  *
    5284             :  *      for (int i = 0; i < num_valid_strings; i++)
    5285             :  *          updateClosestMatch(&state, valid_strings[i]);
    5286             :  *
    5287             :  *      closestMatch = getClosestMatch(&state);
    5288             :  */
    5289             : 
    5290             : /*
    5291             :  * Initialize the given state with the source string and maximum Levenshtein
    5292             :  * distance to consider.  The source pointer is stored as-is, not copied.
    5293             :  */
    5294             : void
    5295          78 : initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
    5296             : {
    5297             :     Assert(state);
    5298             :     Assert(max_d >= 0);
    5299             : 
    5300          78 :     state->source = source;
    5301          78 :     state->min_d = -1;          /* -1 marks "no match saved yet"; updateClosestMatch() tests for it */
    5302          78 :     state->max_d = max_d;
    5303          78 :     state->match = NULL;        /* what getClosestMatch() returns until a candidate qualifies */
    5304          78 : }
    5305             : 
    5306             : /*
    5307             :  * If the candidate string is a closer match than the current one saved (or
    5308             :  * there is no match saved), save it as the closest match.
    5309             :  *
    5310             :  * If the source or candidate string is NULL, empty, or too long, this function
    5311             :  * takes no action.  Likewise, if the Levenshtein distance exceeds the maximum
    5312             :  * allowed or more than half the characters are different, no action is taken.
    5313             :  */
    5314             : void
    5315         804 : updateClosestMatch(ClosestMatchState *state, const char *candidate)
    5316             : {
    5317             :     int         dist;
    5318             : 
    5319             :     Assert(state);
    5320             : 
    5321         804 :     if (state->source == NULL || state->source[0] == '\0' ||
    5322         804 :         candidate == NULL || candidate[0] == '\0')
    5323           0 :         return;
    5324             : 
    5325             :     /*
    5326             :      * To avoid ERROR-ing, we check the lengths here instead of setting
    5327             :      * 'trusted' to false in the call to varstr_levenshtein_less_equal().
    5328             :      */
    5329         804 :     if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
    5330         804 :         strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
    5331           0 :         return;
    5332             : 
    5333         804 :     dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
    5334         804 :                                          candidate, strlen(candidate), 1, 1, 1,    /* NOTE(review): the three 1s are presumably the insert/delete/substitute costs -- confirm in levenshtein.c */
    5335             :                                          state->max_d, true);
    5336         804 :     if (dist <= state->max_d &&
    5337          62 :         dist <= strlen(state->source) / 2 &&    /* "half the characters" is measured in bytes, via strlen() */
    5338          14 :         (state->min_d == -1 || dist < state->min_d))    /* strictly closer, so on a tie the earlier candidate is kept */
    5339             :     {
    5340          14 :         state->min_d = dist;
    5341          14 :         state->match = candidate;   /* the pointer is saved, not a copy of the string */
    5342             :     }
    5343             : }
    5344             : 
    5345             : /*
    5346             :  * Return the closest match.  If no suitable candidates were provided via
    5347             :  * updateClosestMatch(), return NULL.  The result is the saved candidate pointer itself, not a copy.
    5348             :  */
    5349             : const char *
    5350          78 : getClosestMatch(ClosestMatchState *state)
    5351             : {
    5352             :     Assert(state);
    5353             : 
    5354          78 :     return state->match;
    5355             : }
    5356             : 
    5357             : 
    5358             : /*
    5359             :  * Unicode support
    5360             :  */
    5361             : 
    5362             : static UnicodeNormalizationForm    /* Translate a normalization form name (NFC, NFD, NFKC, NFKD) into its enum value; any other name raises ERROR */
    5363         210 : unicode_norm_form_from_string(const char *formstr)
    5364             : {
    5365         210 :     UnicodeNormalizationForm form = -1; /* placeholder; every path that reaches the return overwrites it */
    5366             : 
    5367             :     /*
    5368             :      * Might as well check this while we're here: fail unless the database encoding is UTF8.
    5369             :      */
    5370         210 :     if (GetDatabaseEncoding() != PG_UTF8)
    5371           0 :         ereport(ERROR,
    5372             :                 (errcode(ERRCODE_SYNTAX_ERROR),
    5373             :                  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
    5374             : 
    5375         210 :     if (pg_strcasecmp(formstr, "NFC") == 0) /* names are compared with pg_strcasecmp(), presumably ignoring case */
    5376          66 :         form = UNICODE_NFC;
    5377         144 :     else if (pg_strcasecmp(formstr, "NFD") == 0)
    5378          60 :         form = UNICODE_NFD;
    5379          84 :     else if (pg_strcasecmp(formstr, "NFKC") == 0)
    5380          36 :         form = UNICODE_NFKC;
    5381          48 :     else if (pg_strcasecmp(formstr, "NFKD") == 0)
    5382          36 :         form = UNICODE_NFKD;
    5383             :     else
    5384          12 :         ereport(ERROR,
    5385             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5386             :                  errmsg("invalid normalization form: %s", formstr)));
    5387             : 
    5388         198 :     return form;
    5389             : }
    5390             : 
    5391             : /*
    5392             :  * Returns version of Unicode used by Postgres in "major.minor" format (the
    5393             :  * same format as the Unicode version reported by ICU). The third component
    5394             :  * ("update version") never involves additions to the character repertoire and
    5395             :  * is unimportant for most purposes.
    5396             :  *
    5397             :  * See: https://unicode.org/versions/
    5398             :  */
    5399             : Datum
    5400          34 : unicode_version(PG_FUNCTION_ARGS)
    5401             : {
    5402          34 :     PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
    5403             : }
    5404             : 
    5405             : /*
    5406             :  * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
    5407             :  */
    5408             : Datum
    5409           2 : icu_unicode_version(PG_FUNCTION_ARGS)
    5410             : {
    5411             : #ifdef USE_ICU
    5412           2 :     PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
    5413             : #else
    5414             :     PG_RETURN_NULL();
    5415             : #endif
    5416             : }
    5417             : 
    5418             : /*
    5419             :  * Check whether the string contains only assigned Unicode code
    5420             :  * points. Requires that the database encoding is UTF-8.
    5421             :  */
    5422             : Datum
    5423          12 : unicode_assigned(PG_FUNCTION_ARGS)
    5424             : {
    5425          12 :     text       *input = PG_GETARG_TEXT_PP(0);
    5426             :     unsigned char *p;
    5427             :     int         size;
    5428             : 
    5429          12 :     if (GetDatabaseEncoding() != PG_UTF8)
    5430           0 :         ereport(ERROR,
    5431             :                 (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
    5432             : 
    5433             :     /* convert to char32_t */
    5434          12 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    5435          12 :     p = (unsigned char *) VARDATA_ANY(input);
    5436          48 :     for (int i = 0; i < size; i++)
    5437             :     {
    5438          42 :         char32_t    uchar = utf8_to_unicode(p);
    5439          42 :         int         category = unicode_category(uchar);
    5440             : 
    5441          42 :         if (category == PG_U_UNASSIGNED)
    5442           6 :             PG_RETURN_BOOL(false);
    5443             : 
    5444          36 :         p += pg_utf_mblen(p);
    5445             :     }
    5446             : 
    5447           6 :     PG_RETURN_BOOL(true);
    5448             : }
    5449             : 
    5450             : Datum
    5451          72 : unicode_normalize_func(PG_FUNCTION_ARGS)
    5452             : {
    5453          72 :     text       *input = PG_GETARG_TEXT_PP(0);
    5454          72 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    5455             :     UnicodeNormalizationForm form;
    5456             :     int         size;
    5457             :     char32_t   *input_chars;
    5458             :     char32_t   *output_chars;
    5459             :     unsigned char *p;
    5460             :     text       *result;
    5461             :     int         i;
    5462             : 
    5463          72 :     form = unicode_norm_form_from_string(formstr);
    5464             : 
    5465             :     /* convert to char32_t */
    5466          66 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    5467          66 :     input_chars = palloc((size + 1) * sizeof(char32_t));
    5468          66 :     p = (unsigned char *) VARDATA_ANY(input);
    5469         288 :     for (i = 0; i < size; i++)
    5470             :     {
    5471         222 :         input_chars[i] = utf8_to_unicode(p);
    5472         222 :         p += pg_utf_mblen(p);
    5473             :     }
    5474          66 :     input_chars[i] = (char32_t) '\0';
    5475             :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    5476             : 
    5477             :     /* action */
    5478          66 :     output_chars = unicode_normalize(form, input_chars);
    5479             : 
    5480             :     /* convert back to UTF-8 string */
    5481          66 :     size = 0;
    5482         306 :     for (char32_t *wp = output_chars; *wp; wp++)
    5483             :     {
    5484             :         unsigned char buf[4];
    5485             : 
    5486         240 :         unicode_to_utf8(*wp, buf);
    5487         240 :         size += pg_utf_mblen(buf);
    5488             :     }
    5489             : 
    5490          66 :     result = palloc(size + VARHDRSZ);
    5491          66 :     SET_VARSIZE(result, size + VARHDRSZ);
    5492             : 
    5493          66 :     p = (unsigned char *) VARDATA_ANY(result);
    5494         306 :     for (char32_t *wp = output_chars; *wp; wp++)
    5495             :     {
    5496         240 :         unicode_to_utf8(*wp, p);
    5497         240 :         p += pg_utf_mblen(p);
    5498             :     }
    5499             :     Assert((char *) p == (char *) result + size + VARHDRSZ);
    5500             : 
    5501          66 :     PG_RETURN_TEXT_P(result);
    5502             : }
    5503             : 
    5504             : /*
    5505             :  * Check whether the string is in the specified Unicode normalization form.
    5506             :  *
    5507             :  * This is done by converting the string to the specified normal form and then
    5508             :  * comparing that to the original string.  To speed that up, we also apply the
    5509             :  * "quick check" algorithm specified in UAX #15, which can give a yes or no
    5510             :  * answer for many strings by just scanning the string once.
    5511             :  *
    5512             :  * This function should generally be optimized for the case where the string
    5513             :  * is in fact normalized.  In that case, we'll end up looking at the entire
    5514             :  * string, so it's probably not worth doing any incremental conversion etc.
    5515             :  */
    5516             : Datum
    5517         138 : unicode_is_normalized(PG_FUNCTION_ARGS)
    5518             : {
    5519         138 :     text       *input = PG_GETARG_TEXT_PP(0);
    5520         138 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    5521             :     UnicodeNormalizationForm form;
    5522             :     int         size;
    5523             :     char32_t   *input_chars;
    5524             :     char32_t   *output_chars;
    5525             :     unsigned char *p;
    5526             :     int         i;
    5527             :     UnicodeNormalizationQC quickcheck;
    5528             :     int         output_size;
    5529             :     bool        result;
    5530             : 
    5531         138 :     form = unicode_norm_form_from_string(formstr);
    5532             : 
    5533             :     /* convert to char32_t */
    5534         132 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    5535         132 :     input_chars = palloc((size + 1) * sizeof(char32_t));
    5536         132 :     p = (unsigned char *) VARDATA_ANY(input);
    5537         504 :     for (i = 0; i < size; i++)
    5538             :     {
    5539         372 :         input_chars[i] = utf8_to_unicode(p);
    5540         372 :         p += pg_utf_mblen(p);
    5541             :     }
    5542         132 :     input_chars[i] = (char32_t) '\0';
    5543             :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    5544             : 
    5545             :     /* quick check (see UAX #15) */
    5546         132 :     quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
    5547         132 :     if (quickcheck == UNICODE_NORM_QC_YES)
    5548          42 :         PG_RETURN_BOOL(true);
    5549          90 :     else if (quickcheck == UNICODE_NORM_QC_NO)
    5550          12 :         PG_RETURN_BOOL(false);
    5551             : 
    5552             :     /* normalize and compare with original */
    5553          78 :     output_chars = unicode_normalize(form, input_chars);
    5554             : 
    5555          78 :     output_size = 0;
    5556         324 :     for (char32_t *wp = output_chars; *wp; wp++)
    5557         246 :         output_size++;
    5558             : 
    5559         114 :     result = (size == output_size) &&
    5560          36 :         (memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
    5561             : 
    5562          78 :     PG_RETURN_BOOL(result);
    5563             : }
    5564             : 
    5565             : /*
    5566             :  * Check if first n chars are hexadecimal digits
    5567             :  */
    5568             : static bool
    5569         156 : isxdigits_n(const char *instr, size_t n)
    5570             : {
    5571         660 :     for (size_t i = 0; i < n; i++)
    5572         570 :         if (!isxdigit((unsigned char) instr[i]))
    5573          66 :             return false;
    5574             : 
    5575          90 :     return true;
    5576             : }
    5577             : 
    5578             : static unsigned int
    5579         504 : hexval(unsigned char c)
    5580             : {
    5581         504 :     if (c >= '0' && c <= '9')
    5582         384 :         return c - '0';
    5583         120 :     if (c >= 'a' && c <= 'f')
    5584          60 :         return c - 'a' + 0xA;
    5585          60 :     if (c >= 'A' && c <= 'F')
    5586          60 :         return c - 'A' + 0xA;
    5587           0 :     elog(ERROR, "invalid hexadecimal digit");
    5588             :     return 0;                   /* not reached */
    5589             : }
    5590             : 
    5591             : /*
    5592             :  * Translate string with hexadecimal digits to number
    5593             :  */
    5594             : static unsigned int
    5595          90 : hexval_n(const char *instr, size_t n)
    5596             : {
    5597          90 :     unsigned int result = 0;
    5598             : 
    5599         594 :     for (size_t i = 0; i < n; i++)
    5600         504 :         result += hexval(instr[i]) << (4 * (n - i - 1));
    5601             : 
    5602          90 :     return result;
    5603             : }
    5604             : 
    5605             : /*
    5606             :  * Replaces Unicode escape sequences by Unicode characters
    5607             :  */
    5608             : Datum
    5609          66 : unistr(PG_FUNCTION_ARGS)
    5610             : {
    5611          66 :     text       *input_text = PG_GETARG_TEXT_PP(0);
    5612             :     char       *instr;
    5613             :     int         len;
    5614             :     StringInfoData str;
    5615             :     text       *result;
    5616          66 :     char16_t    pair_first = 0;
    5617             :     char        cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
    5618             : 
    5619          66 :     instr = VARDATA_ANY(input_text);
    5620          66 :     len = VARSIZE_ANY_EXHDR(input_text);
    5621             : 
    5622          66 :     initStringInfo(&str);
    5623             : 
    5624         510 :     while (len > 0)
    5625             :     {
    5626         486 :         if (instr[0] == '\\')
    5627             :         {
    5628         102 :             if (len >= 2 &&
    5629         102 :                 instr[1] == '\\')
    5630             :             {
    5631           6 :                 if (pair_first)
    5632           0 :                     goto invalid_pair;
    5633           6 :                 appendStringInfoChar(&str, '\\');
    5634           6 :                 instr += 2;
    5635           6 :                 len -= 2;
    5636             :             }
    5637          96 :             else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
    5638          66 :                      (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
    5639          30 :             {
    5640             :                 char32_t    unicode;
    5641          42 :                 int         offset = instr[1] == 'u' ? 2 : 1;
    5642             : 
    5643          42 :                 unicode = hexval_n(instr + offset, 4);
    5644             : 
    5645          42 :                 if (!is_valid_unicode_codepoint(unicode))
    5646           0 :                     ereport(ERROR,
    5647             :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5648             :                             errmsg("invalid Unicode code point: %04X", unicode));
    5649             : 
    5650          42 :                 if (pair_first)
    5651             :                 {
    5652          12 :                     if (is_utf16_surrogate_second(unicode))
    5653             :                     {
    5654           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    5655           0 :                         pair_first = 0;
    5656             :                     }
    5657             :                     else
    5658          12 :                         goto invalid_pair;
    5659             :                 }
    5660          30 :                 else if (is_utf16_surrogate_second(unicode))
    5661           0 :                     goto invalid_pair;
    5662             : 
    5663          30 :                 if (is_utf16_surrogate_first(unicode))
    5664          18 :                     pair_first = unicode;
    5665             :                 else
    5666             :                 {
    5667          12 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    5668          12 :                     appendStringInfoString(&str, cbuf);
    5669             :                 }
    5670             : 
    5671          30 :                 instr += 4 + offset;
    5672          30 :                 len -= 4 + offset;
    5673             :             }
    5674          54 :             else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
    5675          12 :             {
    5676             :                 char32_t    unicode;
    5677             : 
    5678          24 :                 unicode = hexval_n(instr + 2, 6);
    5679             : 
    5680          24 :                 if (!is_valid_unicode_codepoint(unicode))
    5681           6 :                     ereport(ERROR,
    5682             :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5683             :                             errmsg("invalid Unicode code point: %04X", unicode));
    5684             : 
    5685          18 :                 if (pair_first)
    5686             :                 {
    5687           6 :                     if (is_utf16_surrogate_second(unicode))
    5688             :                     {
    5689           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    5690           0 :                         pair_first = 0;
    5691             :                     }
    5692             :                     else
    5693           6 :                         goto invalid_pair;
    5694             :                 }
    5695          12 :                 else if (is_utf16_surrogate_second(unicode))
    5696           0 :                     goto invalid_pair;
    5697             : 
    5698          12 :                 if (is_utf16_surrogate_first(unicode))
    5699           6 :                     pair_first = unicode;
    5700             :                 else
    5701             :                 {
    5702           6 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    5703           6 :                     appendStringInfoString(&str, cbuf);
    5704             :                 }
    5705             : 
    5706          12 :                 instr += 8;
    5707          12 :                 len -= 8;
    5708             :             }
    5709          30 :             else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
    5710          12 :             {
    5711             :                 char32_t    unicode;
    5712             : 
    5713          24 :                 unicode = hexval_n(instr + 2, 8);
    5714             : 
    5715          24 :                 if (!is_valid_unicode_codepoint(unicode))
    5716           6 :                     ereport(ERROR,
    5717             :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5718             :                             errmsg("invalid Unicode code point: %04X", unicode));
    5719             : 
    5720          18 :                 if (pair_first)
    5721             :                 {
    5722           6 :                     if (is_utf16_surrogate_second(unicode))
    5723             :                     {
    5724           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    5725           0 :                         pair_first = 0;
    5726             :                     }
    5727             :                     else
    5728           6 :                         goto invalid_pair;
    5729             :                 }
    5730          12 :                 else if (is_utf16_surrogate_second(unicode))
    5731           0 :                     goto invalid_pair;
    5732             : 
    5733          12 :                 if (is_utf16_surrogate_first(unicode))
    5734           6 :                     pair_first = unicode;
    5735             :                 else
    5736             :                 {
    5737           6 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    5738           6 :                     appendStringInfoString(&str, cbuf);
    5739             :                 }
    5740             : 
    5741          12 :                 instr += 10;
    5742          12 :                 len -= 10;
    5743             :             }
    5744             :             else
    5745           6 :                 ereport(ERROR,
    5746             :                         (errcode(ERRCODE_SYNTAX_ERROR),
    5747             :                          errmsg("invalid Unicode escape"),
    5748             :                          errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
    5749             :         }
    5750             :         else
    5751             :         {
    5752         384 :             if (pair_first)
    5753           0 :                 goto invalid_pair;
    5754             : 
    5755         384 :             appendStringInfoChar(&str, *instr++);
    5756         384 :             len--;
    5757             :         }
    5758             :     }
    5759             : 
    5760             :     /* unfinished surrogate pair? */
    5761          24 :     if (pair_first)
    5762           6 :         goto invalid_pair;
    5763             : 
    5764          18 :     result = cstring_to_text_with_len(str.data, str.len);
    5765          18 :     pfree(str.data);
    5766             : 
    5767          18 :     PG_RETURN_TEXT_P(result);
    5768             : 
    5769          30 : invalid_pair:
    5770          30 :     ereport(ERROR,
    5771             :             (errcode(ERRCODE_SYNTAX_ERROR),
    5772             :              errmsg("invalid Unicode surrogate pair")));
    5773             :     PG_RETURN_NULL();           /* keep compiler quiet */
    5774             : }

Generated by: LCOV version 1.16