LCOV - code coverage report
Current view: top level - src/backend/utils/adt - varlena.c (source / functions) Hit Total Coverage
Test: PostgreSQL 14devel Lines: 1695 1994 85.0 %
Date: 2021-01-26 03:06:49 Functions: 136 152 89.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * varlena.c
       4             :  *    Functions for the variable-length built-in types.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/adt/varlena.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #include "postgres.h"
      16             : 
      17             : #include <ctype.h>
      18             : #include <limits.h>
      19             : 
      20             : #include "access/detoast.h"
      21             : #include "catalog/pg_collation.h"
      22             : #include "catalog/pg_type.h"
      23             : #include "common/hashfn.h"
      24             : #include "common/hex.h"
      25             : #include "common/int.h"
      26             : #include "common/unicode_norm.h"
      27             : #include "lib/hyperloglog.h"
      28             : #include "libpq/pqformat.h"
      29             : #include "miscadmin.h"
      30             : #include "nodes/execnodes.h"
      31             : #include "parser/scansup.h"
      32             : #include "port/pg_bswap.h"
      33             : #include "regex/regex.h"
      34             : #include "utils/builtins.h"
      35             : #include "utils/bytea.h"
      36             : #include "utils/lsyscache.h"
      37             : #include "utils/memutils.h"
      38             : #include "utils/pg_locale.h"
      39             : #include "utils/sortsupport.h"
      40             : #include "utils/varlena.h"
      41             : 
      42             : 
      43             : /* GUC variable: output format tested by byteaout(); initial value is hex */
      44             : int         bytea_output = BYTEA_OUTPUT_HEX;
      45             : 
      46             : typedef struct varlena unknown; /* target type of the *UnknownP* macros */
      47             : typedef struct varlena VarString;   /* target type of DatumGetVarStringP/PP */
      48             : 
      49             : /*
      50             :  * State for text_position_* functions; each of them takes a pointer to it.
      51             :  */
      52             : typedef struct
      53             : {
      54             :     bool        is_multibyte;   /* T if multibyte encoding */
      55             :     bool        is_multibyte_char_in_char;  /* need to check char boundaries? */
      56             : 
      57             :     char       *str1;           /* haystack string */
      58             :     char       *str2;           /* needle string */
      59             :     int         len1;           /* string lengths in bytes */
      60             :     int         len2;           /* (presumably len1 is for str1, len2 for str2 -- confirm) */
      61             : 
      62             :     /* Skip table for Boyer-Moore-Horspool search algorithm: */
      63             :     int         skiptablemask;  /* mask for ANDing with skiptable subscripts */
      64             :     int         skiptable[256]; /* skip distance for given mismatched char */
      65             : 
      66             :     char       *last_match;     /* pointer to last match in 'str1' */
      67             : 
      68             :     /*
      69             :      * Sometimes we need to convert the byte position of a match to a
      70             :      * character position.  These store the last position that was converted,
      71             :      * so that on the next call, we can continue from that point, rather than
      72             :      * count characters from the very beginning.
      73             :      */
      74             :     char       *refpoint;       /* pointer within original haystack string */
      75             :     int         refpos;         /* 0-based character offset of the same point */
      76             : } TextPositionState;
      77             : 
      78             : typedef struct          /* NOTE(review): presumably private state for the *fastcmp_* and varstr_abbrev_* routines -- confirm */
      79             : {
      80             :     char       *buf1;           /* 1st string, or abbreviation original string
      81             :                                  * buf */
      82             :     char       *buf2;           /* 2nd string, or abbreviation strxfrm() buf */
      83             :     int         buflen1;        /* presumably allocated size of buf1 -- confirm */
      84             :     int         buflen2;        /* presumably allocated size of buf2 -- confirm */
      85             :     int         last_len1;      /* Length of last buf1 string/strxfrm() input */
      86             :     int         last_len2;      /* Length of last buf2 string/strxfrm() blob */
      87             :     int         last_returned;  /* Last comparison result (cache) */
      88             :     bool        cache_blob;     /* Does buf2 contain strxfrm() blob, etc? */
      89             :     bool        collate_c;      /* presumably true when the C collation applies -- confirm */
      90             :     Oid         typid;          /* Actual datatype (text/bpchar/bytea/name) */
      91             :     hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
      92             :     hyperLogLogState full_card; /* Full key cardinality state */
      93             :     double      prop_card;      /* Required cardinality proportion */
      94             :     pg_locale_t locale;         /* NOTE(review): how this is filled in is not visible here */
      95             : } VarStringSortSupport;
      96             : 
      97             : /*
      98             :  * Output data for split_text(): we output either to an array or a table.
      99             :  * tupstore and tupdesc must be set up in advance to output to a table.
     100             :  */
     101             : typedef struct
     102             : {
     103             :     ArrayBuildState *astate;    /* accumulator for the array form of output */
     104             :     Tuplestorestate *tupstore;  /* destination for the table form of output */
     105             :     TupleDesc   tupdesc;        /* needed together with tupstore for table output */
     106             : } SplitTextOutputData;
     107             : 
     108             : /*
     109             :  * This should be large enough that most strings will fit, but small enough
     110             :  * that we feel comfortable putting it on the stack
     111             :  */
     112             : #define TEXTBUFLEN      1024
     113             : 
     114             : #define DatumGetUnknownP(X)         ((unknown *) PG_DETOAST_DATUM(X)) /* detoast X, cast to unknown pointer */
     115             : #define DatumGetUnknownPCopy(X)     ((unknown *) PG_DETOAST_DATUM_COPY(X)) /* same, via the _COPY detoast macro */
     116             : #define PG_GETARG_UNKNOWN_P(n)      DatumGetUnknownP(PG_GETARG_DATUM(n)) /* argument n through DatumGetUnknownP */
     117             : #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n)) /* argument n through DatumGetUnknownPCopy */
     118             : #define PG_RETURN_UNKNOWN_P(x)      PG_RETURN_POINTER(x) /* plain pointer return */
     119             : 
     120             : #define DatumGetVarStringP(X)       ((VarString *) PG_DETOAST_DATUM(X)) /* detoast X, cast to VarString pointer */
     121             : #define DatumGetVarStringPP(X)      ((VarString *) PG_DETOAST_DATUM_PACKED(X)) /* same, via the _PACKED detoast macro */
     122             : 
     123             : static int  varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
     124             : static int  bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
     125             : static int  namefastcmp_c(Datum x, Datum y, SortSupport ssup);
     126             : static int  varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
     127             : static int  namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
     128             : static int  varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
     129             : static int  varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
     130             : static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
     131             : static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
     132             : static int32 text_length(Datum str);
     133             : static text *text_catenate(text *t1, text *t2);
     134             : static text *text_substring(Datum str,
     135             :                             int32 start,
     136             :                             int32 length,
     137             :                             bool length_not_specified);
     138             : static text *text_overlay(text *t1, text *t2, int sp, int sl);
     139             : static int  text_position(text *t1, text *t2, Oid collid);
     140             : static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
     141             : static bool text_position_next(TextPositionState *state);
     142             : static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
     143             : static char *text_position_get_match_ptr(TextPositionState *state);
     144             : static int  text_position_get_match_pos(TextPositionState *state);
     145             : static void text_position_cleanup(TextPositionState *state);
     146             : static void check_collation_set(Oid collid);
     147             : static int  text_cmp(text *arg1, text *arg2, Oid collid);
     148             : static bytea *bytea_catenate(bytea *t1, bytea *t2);
     149             : static bytea *bytea_substring(Datum str,
     150             :                               int S,
     151             :                               int L,
     152             :                               bool length_not_specified);
     153             : static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
     154             : static void appendStringInfoText(StringInfo str, const text *t);
     155             : static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
     156             : static void split_text_accum_result(SplitTextOutputData *tstate,
     157             :                                     text *field_value,
     158             :                                     text *null_string,
     159             :                                     Oid collation);
     160             : static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
     161             :                                     const char *fldsep, const char *null_string);
     162             : static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
     163             : static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
     164             :                                      int *value);
     165             : static const char *text_format_parse_format(const char *start_ptr,
     166             :                                             const char *end_ptr,
     167             :                                             int *argpos, int *widthpos,
     168             :                                             int *flags, int *width);
     169             : static void text_format_string_conversion(StringInfo buf, char conversion,
     170             :                                           FmgrInfo *typOutputInfo,
     171             :                                           Datum value, bool isNull,
     172             :                                           int flags, int width);
     173             : static void text_format_append_string(StringInfo buf, const char *str,
     174             :                                       int flags, int width);
     175             : 
     176             : 
     177             : /*****************************************************************************
     178             :  *   CONVERSION ROUTINES EXPORTED FOR USE BY C CODE                          *
     179             :  *****************************************************************************/
     180             : 
     181             : /*
     182             :  * cstring_to_text
     183             :  *
     184             :  * Create a text value from a null-terminated C string.
     185             :  *
     186             :  * The new text value is freshly palloc'd with a full-size VARHDR.
     187             :  */
     188             : text *
     189     7416090 : cstring_to_text(const char *s)
     190             : {
     191     7416090 :     return cstring_to_text_with_len(s, strlen(s)); /* strlen() excludes the terminating NUL */
     192             : }
     193             : 
     194             : /*
     195             :  * cstring_to_text_with_len
     196             :  *
     197             :  * Same as cstring_to_text except the caller specifies the string length;
     198             :  * the string need not be null-terminated.
     199             :  */
     200             : text *
     201    11052248 : cstring_to_text_with_len(const char *s, int len)
     202             : {
     203    11052248 :     text       *result = (text *) palloc(len + VARHDRSZ); /* header plus len data bytes */
     204             : 
     205    11052248 :     SET_VARSIZE(result, len + VARHDRSZ); /* recorded size includes the header */
     206    11052248 :     memcpy(VARDATA(result), s, len); /* exactly len bytes; no terminator is stored */
     207             : 
     208    11052248 :     return result;
     209             : }
     210             : 
     211             : /*
     212             :  * text_to_cstring
     213             :  *
     214             :  * Create a palloc'd, null-terminated C string from a text value.
     215             :  *
     216             :  * We support being passed a compressed or toasted text value.
     217             :  * This is a bit bogus since such values shouldn't really be referred to as
     218             :  * "text *", but it seems useful for robustness.  If we didn't handle that
     219             :  * case here, we'd need another routine that did, anyway.
     220             :  */
     221             : char *
     222     5830260 : text_to_cstring(const text *t)
     223             : {
     224             :     /* must cast away the const, unfortunately */
     225     5830260 :     text       *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
     226     5830260 :     int         len = VARSIZE_ANY_EXHDR(tunpacked); /* data length used for the copy below */
     227             :     char       *result;
     228             : 
     229     5830260 :     result = (char *) palloc(len + 1); /* +1 for the terminating NUL */
     230     5830260 :     memcpy(result, VARDATA_ANY(tunpacked), len);
     231     5830260 :     result[len] = '\0';
     232             : 
     233     5830260 :     if (tunpacked != t) /* a different pointer came back, so release that temporary */
     234       74324 :         pfree(tunpacked);
     235             : 
     236     5830260 :     return result;
     237             : }
     238             : 
     239             : /*
     240             :  * text_to_cstring_buffer
     241             :  *
     242             :  * Copy a text value into a caller-supplied buffer of size dst_len.
     243             :  *
     244             :  * The text string is truncated if necessary to fit.  The result is
     245             :  * guaranteed null-terminated (unless dst_len == 0).
     246             :  *
     247             :  * We support being passed a compressed or toasted text value.
     248             :  * This is a bit bogus since such values shouldn't really be referred to as
     249             :  * "text *", but it seems useful for robustness.  If we didn't handle that
     250             :  * case here, we'd need another routine that did, anyway.
     251             :  */
     252             : void
     253         390 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
     254             : {
     255             :     /* must cast away the const, unfortunately */
     256         390 :     text       *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
     257         390 :     size_t      src_len = VARSIZE_ANY_EXHDR(srcunpacked);
     258             : 
     259         390 :     if (dst_len > 0) /* with a zero-size buffer nothing at all is written */
     260             :     {
     261         390 :         dst_len--; /* reserve one byte for the terminating NUL */
     262         390 :         if (dst_len >= src_len) /* whole value fits */
     263         390 :             dst_len = src_len;
     264             :         else                    /* ensure truncation is encoding-safe */
     265           0 :             dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
     266         390 :         memcpy(dst, VARDATA_ANY(srcunpacked), dst_len); /* dst_len is now the byte count to copy */
     267         390 :         dst[dst_len] = '\0';
     268             :     }
     269             : 
     270         390 :     if (srcunpacked != src) /* a different pointer came back, so release that temporary */
     271           0 :         pfree(srcunpacked);
     272         390 : }
     273             : 
     274             : 
     275             : /*****************************************************************************
     276             :  *   USER I/O ROUTINES                                                       *
     277             :  *****************************************************************************/
     278             : 
     279             : 
     280             : #define VAL(CH)         ((CH) - '0') /* digit character -> its numeric value */
     281             : #define DIG(VAL)        ((VAL) + '0') /* small numeric value -> digit character */
     282             : 
     283             : /*
     284             :  *      byteain         - converts from printable representation of byte array
     285             :  *
     286             :  *      Non-printable characters must be passed as '\nnn' (octal) and are
     287             :  *      converted to internal form.  '\' must be passed as '\\'.
     288             :  *      ereport(ERROR, ...) if bad form.
     289             :  *
     290             :  *      BUGS:
     291             :  *              The input is scanned twice.
     292             :  *              The error checking of input is minimal.
     293             :  */
     294             : Datum
     295       10956 : byteain(PG_FUNCTION_ARGS)
     296             : {
     297       10956 :     char       *inputText = PG_GETARG_CSTRING(0);
     298             :     char       *tp;             /* read cursor over inputText */
     299             :     char       *rp;             /* write cursor into result's data area */
     300             :     int         bc;             /* byte count; reused as a scratch value in pass 2 */
     301             :     bytea      *result;
     302             : 
     303             :     /* Recognize hex input */
     304       10956 :     if (inputText[0] == '\\' && inputText[1] == 'x')
     305             :     {
     306          96 :         size_t      len = strlen(inputText);
     307          96 :         uint64      dstlen = pg_hex_dec_len(len - 2); /* len - 2 skips the two-character prefix */
     308             : 
     309          96 :         bc = dstlen + VARHDRSZ; /* maximum possible length */
     310          96 :         result = palloc(bc);
     311             : 
     312          96 :         bc = pg_hex_decode(inputText + 2, len - 2, VARDATA(result), dstlen); /* NOTE(review): hit counts (96 here, 88 next) suggest this can exit via error -- confirm */
     313          88 :         SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
     314             : 
     315          88 :         PG_RETURN_BYTEA_P(result);
     316             :     }
     317             : 
     318             :     /* Else, it's the traditional escaped style */
     319      200316 :     for (bc = 0, tp = inputText; *tp != '\0'; bc++) /* pass 1: bc counts one per decoded byte */
     320             :     {
     321      189460 :         if (tp[0] != '\\')
     322      188718 :             tp++;
     323         742 :         else if ((tp[0] == '\\') &&
     324         742 :                  (tp[1] >= '0' && tp[1] <= '3') &&
     325         738 :                  (tp[2] >= '0' && tp[2] <= '7') &&
     326         738 :                  (tp[3] >= '0' && tp[3] <= '7'))
     327         738 :             tp += 4; /* backslash plus three octal digits make one byte */
     328           4 :         else if ((tp[0] == '\\') &&
     329           4 :                  (tp[1] == '\\'))
     330           0 :             tp += 2; /* doubled backslash makes one byte */
     331             :         else
     332             :         {
     333             :             /*
     334             :              * one backslash, not followed by another or ### valid octal
     335             :              */
     336           4 :             ereport(ERROR,
     337             :                     (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
     338             :                      errmsg("invalid input syntax for type %s", "bytea")));
     339             :         }
     340             :     }
     341             : 
     342       10856 :     bc += VARHDRSZ; /* total size: counted data bytes plus header */
     343             : 
     344       10856 :     result = (bytea *) palloc(bc);
     345       10856 :     SET_VARSIZE(result, bc);
     346             : 
     347       10856 :     tp = inputText;
     348       10856 :     rp = VARDATA(result);
     349      200304 :     while (*tp != '\0') /* pass 2: same three cases as pass 1, now storing bytes */
     350             :     {
     351      189448 :         if (tp[0] != '\\')
     352      188710 :             *rp++ = *tp++;
     353         738 :         else if ((tp[0] == '\\') &&
     354         738 :                  (tp[1] >= '0' && tp[1] <= '3') &&
     355         738 :                  (tp[2] >= '0' && tp[2] <= '7') &&
     356         738 :                  (tp[3] >= '0' && tp[3] <= '7'))
     357             :         {
     358         738 :             bc = VAL(tp[1]); /* bc is reused here to assemble the octal value */
     359         738 :             bc <<= 3;
     360         738 :             bc += VAL(tp[2]);
     361         738 :             bc <<= 3;
     362         738 :             *rp++ = bc + VAL(tp[3]);
     363             : 
     364         738 :             tp += 4;
     365             :         }
     366           0 :         else if ((tp[0] == '\\') &&
     367           0 :                  (tp[1] == '\\'))
     368             :         {
     369           0 :             *rp++ = '\\';
     370           0 :             tp += 2;
     371             :         }
     372             :         else
     373             :         {
     374             :             /*
     375             :              * We should never get here. The first pass should not allow it.
     376             :              */
     377           0 :             ereport(ERROR,
     378             :                     (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
     379             :                      errmsg("invalid input syntax for type %s", "bytea")));
     380             :         }
     381             :     }
     382             : 
     383       10856 :     PG_RETURN_BYTEA_P(result);
     384             : }
     385             : 
     386             : /*
     387             :  *      byteaout        - converts to printable representation of byte array
     388             :  *
     389             :  *      In the traditional escaped format, non-printable characters are
     390             :  *      printed as '\nnn' (octal) and '\' as '\\'.
     391             :  */
     392             : Datum
     393        4938 : byteaout(PG_FUNCTION_ARGS)
     394             : {
     395        4938 :     bytea      *vlena = PG_GETARG_BYTEA_PP(0);
     396             :     char       *result;
     397             :     char       *rp;             /* write cursor into result */
     398             : 
     399        4938 :     if (bytea_output == BYTEA_OUTPUT_HEX)
     400             :     {
     401        4708 :         uint64      dstlen = pg_hex_enc_len(VARSIZE_ANY_EXHDR(vlena));
     402             : 
     403             :         /* Print hex format */
     404        4708 :         rp = result = palloc(dstlen + 2 + 1); /* +2 for the two prefix characters, +1 for the NUL */
     405        4708 :         *rp++ = '\\';
     406        4708 :         *rp++ = 'x';
     407             : 
     408        4708 :         rp += pg_hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp,
     409             :                             dstlen); /* rp is advanced by the encoder's return value */
     410             :     }
     411         230 :     else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
     412             :     {
     413             :         /* Print traditional escaped format */
     414             :         char       *vp;
     415             :         uint64      len;
     416             :         int         i;
     417             : 
     418         230 :         len = 1;                /* empty string has 1 char */
     419         230 :         vp = VARDATA_ANY(vlena);
     420        2204 :         for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) /* pass 1: size the output */
     421             :         {
     422        1974 :             if (*vp == '\\')
     423           0 :                 len += 2; /* backslash is written doubled */
     424        1974 :             else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
     425         336 :                 len += 4; /* backslash plus three octal digits */
     426             :             else
     427        1638 :                 len++;
     428             :         }
     429             : 
     430             :         /*
     431             :          * In principle len can't overflow uint32 if the input fit in 1GB, but
     432             :          * for safety let's check rather than relying on palloc's internal
     433             :          * check.
     434             :          */
     435         230 :         if (len > MaxAllocSize)
     436           0 :             ereport(ERROR,
     437             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     438             :                      errmsg_internal("result of bytea output conversion is too large")));
     439         230 :         rp = result = (char *) palloc(len);
     440             : 
     441         230 :         vp = VARDATA_ANY(vlena);
     442        2204 :         for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) /* pass 2: same cases, now emitting */
     443             :         {
     444        1974 :             if (*vp == '\\')
     445             :             {
     446           0 :                 *rp++ = '\\';
     447           0 :                 *rp++ = '\\';
     448             :             }
     449        1974 :             else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
     450         336 :             {
     451             :                 int         val;    /* holds unprintable chars */
     452             : 
     453         336 :                 val = *vp;
     454         336 :                 rp[0] = '\\';
     455         336 :                 rp[3] = DIG(val & 07); /* digits are filled from the last position backwards */
     456         336 :                 val >>= 3;
     457         336 :                 rp[2] = DIG(val & 07);
     458         336 :                 val >>= 3;
     459         336 :                 rp[1] = DIG(val & 03); /* the mask keeps two bits, so the leading digit is 0-3 */
     460         336 :                 rp += 4;
     461             :             }
     462             :             else
     463        1638 :                 *rp++ = *vp;
     464             :         }
     465             :     }
     466             :     else
     467             :     {
     468           0 :         elog(ERROR, "unrecognized bytea_output setting: %d",
     469             :              bytea_output);
     470             :         rp = result = NULL;     /* keep compiler quiet */
     471             :     }
     472        4938 :     *rp = '\0'; /* terminate whichever format was written above */
     473        4938 :     PG_RETURN_CSTRING(result);
     474             : }
     475             : 
     476             : /*
     477             :  *      bytearecv           - converts external binary format to bytea
     478             :  */
     479             : Datum
     480         710 : bytearecv(PG_FUNCTION_ARGS)
     481             : {
     482         710 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     483             :     bytea      *result;
     484             :     int         nbytes;
     485             : 
     486         710 :     nbytes = buf->len - buf->cursor; /* everything from the cursor to the end of buf */
     487         710 :     result = (bytea *) palloc(nbytes + VARHDRSZ);
     488         710 :     SET_VARSIZE(result, nbytes + VARHDRSZ);
     489         710 :     pq_copymsgbytes(buf, VARDATA(result), nbytes); /* fills the data area just allocated */
     490         710 :     PG_RETURN_BYTEA_P(result);
     491             : }
     492             : 
     493             : /*
     494             :  *      byteasend           - converts bytea to binary format
     495             :  *
     496             :  * This is a special case: just copy the input...
     497             :  */
     498             : Datum
     499        4818 : byteasend(PG_FUNCTION_ARGS)
     500             : {
     501        4818 :     bytea      *vlena = PG_GETARG_BYTEA_P_COPY(0); /* fetched with the _COPY macro, matching the header comment */
     502             : 
     503        4818 :     PG_RETURN_BYTEA_P(vlena); /* returned as-is; no pq_* send buffer is built here */
     504             : }
     505             : 
     506             : Datum
     507       32760 : bytea_string_agg_transfn(PG_FUNCTION_ARGS) /* args: 0 = state (may be NULL), 1 = value, 2 = delimiter */
     508             : {
     509             :     StringInfo  state;
     510             : 
     511       32760 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
     512             : 
     513             :     /* Append the value unless null. */
     514       32760 :     if (!PG_ARGISNULL(1))
     515             :     {
     516       32760 :         bytea      *value = PG_GETARG_BYTEA_PP(1);
     517             : 
     518             :         /* On the first time through, we ignore the delimiter. */
     519       32760 :         if (state == NULL)
     520          18 :             state = makeStringAggState(fcinfo);
     521       32742 :         else if (!PG_ARGISNULL(2)) /* a NULL delimiter appends nothing */
     522             :         {
     523       32738 :             bytea      *delim = PG_GETARG_BYTEA_PP(2);
     524             : 
     525       32738 :             appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
     526             :         }
     527             : 
     528       32760 :         appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value)); /* then the value itself */
     529             :     }
     530             : 
     531             :     /*
     532             :      * The transition type for string_agg() is declared to be "internal",
     533             :      * which is a pass-by-value type the same size as a pointer.
     534             :      */
     535       32760 :     PG_RETURN_POINTER(state); /* still NULL if no non-null value has been seen */
     536             : }
     537             : 
     538             : Datum
     539          22 : bytea_string_agg_finalfn(PG_FUNCTION_ARGS) /* arg 0 = accumulated state, or NULL */
     540             : {
     541             :     StringInfo  state;
     542             : 
     543             :     /* cannot be called directly because of internal-type argument */
     544             :     Assert(AggCheckCallContext(fcinfo, NULL));
     545             : 
     546          22 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
     547             : 
     548          22 :     if (state != NULL)
     549             :     {
     550             :         bytea      *result;
     551             : 
     552          18 :         result = (bytea *) palloc(state->len + VARHDRSZ); /* header plus the accumulated bytes */
     553          18 :         SET_VARSIZE(result, state->len + VARHDRSZ);
     554          18 :         memcpy(VARDATA(result), state->data, state->len);
     555          18 :         PG_RETURN_BYTEA_P(result);
     556             :     }
     557             :     else
     558           4 :         PG_RETURN_NULL(); /* no state to convert, so the result is NULL */
     559             : }
     560             : 
     561             : /*
     562             :  *      textin          - converts "..." to internal representation
     563             :  */
     564             : Datum
     565     5767454 : textin(PG_FUNCTION_ARGS)
     566             : {
     567     5767454 :     char       *inputText = PG_GETARG_CSTRING(0);
     568             : 
     569     5767454 :     PG_RETURN_TEXT_P(cstring_to_text(inputText)); /* cstring_to_text() measures the input with strlen() */
     570             : }
     571             : 
     572             : /*
     573             :  *      textout         - converts internal representation to "..."
     574             :  */
     575             : Datum
     576     2367168 : textout(PG_FUNCTION_ARGS)
     577             : {
     578     2367168 :     Datum       txt = PG_GETARG_DATUM(0); /* taken as a raw Datum; no detoasting is done here */
     579             : 
     580     2367168 :     PG_RETURN_CSTRING(TextDatumGetCString(txt)); /* conversion macro is defined outside this file view */
     581             : }
     582             : 
     583             : /*
     584             :  *      textrecv            - converts external binary format to text
     585             :  */
     586             : Datum
     587       53366 : textrecv(PG_FUNCTION_ARGS)
     588             : {
     589       53366 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     590             :     text       *result;
     591             :     char       *str;
     592             :     int         nbytes;
     593             : 
     594       53366 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     595             : 
     596       53366 :     result = cstring_to_text_with_len(str, nbytes);
     597       53366 :     pfree(str);
     598       53366 :     PG_RETURN_TEXT_P(result);
     599             : }
     600             : 
     601             : /*
     602             :  *      textsend            - converts text to binary format
     603             :  */
     604             : Datum
     605       36050 : textsend(PG_FUNCTION_ARGS)
     606             : {
     607       36050 :     text       *t = PG_GETARG_TEXT_PP(0);
     608             :     StringInfoData buf;
     609             : 
     610       36050 :     pq_begintypsend(&buf);
     611       36050 :     pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
     612       36050 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     613             : }
     614             : 
     615             : 
     616             : /*
     617             :  *      unknownin           - converts "..." to internal representation
     618             :  */
     619             : Datum
     620           0 : unknownin(PG_FUNCTION_ARGS)
     621             : {
     622           0 :     char       *str = PG_GETARG_CSTRING(0);
     623             : 
     624             :     /* representation is same as cstring */
     625           0 :     PG_RETURN_CSTRING(pstrdup(str));
     626             : }
     627             : 
     628             : /*
     629             :  *      unknownout          - converts internal representation to "..."
     630             :  */
     631             : Datum
     632         318 : unknownout(PG_FUNCTION_ARGS)
     633             : {
     634             :     /* representation is same as cstring */
     635         318 :     char       *str = PG_GETARG_CSTRING(0);
     636             : 
     637         318 :     PG_RETURN_CSTRING(pstrdup(str));
     638             : }
     639             : 
     640             : /*
     641             :  *      unknownrecv         - converts external binary format to unknown
     642             :  */
     643             : Datum
     644           0 : unknownrecv(PG_FUNCTION_ARGS)
     645             : {
     646           0 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     647             :     char       *str;
     648             :     int         nbytes;
     649             : 
     650           0 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     651             :     /* representation is same as cstring */
     652           0 :     PG_RETURN_CSTRING(str);
     653             : }
     654             : 
     655             : /*
     656             :  *      unknownsend         - converts unknown to binary format
     657             :  */
     658             : Datum
     659           0 : unknownsend(PG_FUNCTION_ARGS)
     660             : {
     661             :     /* representation is same as cstring */
     662           0 :     char       *str = PG_GETARG_CSTRING(0);
     663             :     StringInfoData buf;
     664             : 
     665           0 :     pq_begintypsend(&buf);
     666           0 :     pq_sendtext(&buf, str, strlen(str));
     667           0 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     668             : }
     669             : 
     670             : 
     671             : /* ========== PUBLIC ROUTINES ========== */
     672             : 
     673             : /*
     674             :  * textlen -
     675             :  *    returns the logical length of a text*
     676             :  *     (which is less than the VARSIZE of the text*)
     677             :  */
     678             : Datum
     679      202646 : textlen(PG_FUNCTION_ARGS)
     680             : {
     681      202646 :     Datum       str = PG_GETARG_DATUM(0);
     682             : 
     683             :     /* try to avoid decompressing argument */
     684      202646 :     PG_RETURN_INT32(text_length(str));
     685             : }
     686             : 
     687             : /*
     688             :  * text_length -
     689             :  *  Does the real work for textlen()
     690             :  *
     691             :  *  This is broken out so it can be called directly by other string processing
     692             :  *  functions.  Note that the argument is passed as a Datum, to indicate that
     693             :  *  it may still be in compressed form.  We can avoid decompressing it at all
     694             :  *  in some cases.
     695             :  */
     696             : static int32
     697      202654 : text_length(Datum str)
     698             : {
     699             :     /* fastpath when max encoding length is one */
     700      202654 :     if (pg_database_encoding_max_length() == 1)
     701          24 :         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
     702             :     else
     703             :     {
     704      202630 :         text       *t = DatumGetTextPP(str);
     705             : 
     706      202630 :         PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
     707             :                                              VARSIZE_ANY_EXHDR(t)));
     708             :     }
     709             : }
     710             : 
     711             : /*
     712             :  * textoctetlen -
     713             :  *    returns the physical length of a text*
     714             :  *     (which is less than the VARSIZE of the text*)
     715             :  */
     716             : Datum
     717          58 : textoctetlen(PG_FUNCTION_ARGS)
     718             : {
     719          58 :     Datum       str = PG_GETARG_DATUM(0);
     720             : 
     721             :     /* We need not detoast the input at all */
     722          58 :     PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
     723             : }
     724             : 
     725             : /*
     726             :  * textcat -
     727             :  *    takes two text* and returns a text* that is the concatenation of
     728             :  *    the two.
     729             :  *
     730             :  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
     731             :  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
     732             :  * Allocate space for output in all cases.
     733             :  * XXX - thomas 1997-07-10
     734             :  */
     735             : Datum
     736     1794664 : textcat(PG_FUNCTION_ARGS)
     737             : {
     738     1794664 :     text       *t1 = PG_GETARG_TEXT_PP(0);
     739     1794664 :     text       *t2 = PG_GETARG_TEXT_PP(1);
     740             : 
     741     1794664 :     PG_RETURN_TEXT_P(text_catenate(t1, t2));
     742             : }
     743             : 
     744             : /*
     745             :  * text_catenate
     746             :  *  Guts of textcat(), broken out so it can be used by other functions
     747             :  *
     748             :  * Arguments can be in short-header form, but not compressed or out-of-line
     749             :  */
     750             : static text *
     751     1794728 : text_catenate(text *t1, text *t2)
     752             : {
     753             :     text       *result;
     754             :     int         len1,
     755             :                 len2,
     756             :                 len;
     757             :     char       *ptr;
     758             : 
     759     1794728 :     len1 = VARSIZE_ANY_EXHDR(t1);
     760     1794728 :     len2 = VARSIZE_ANY_EXHDR(t2);
     761             : 
     762             :     /* paranoia ... probably should throw error instead? */
     763     1794728 :     if (len1 < 0)
     764           0 :         len1 = 0;
     765     1794728 :     if (len2 < 0)
     766           0 :         len2 = 0;
     767             : 
     768     1794728 :     len = len1 + len2 + VARHDRSZ;
     769     1794728 :     result = (text *) palloc(len);
     770             : 
     771             :     /* Set size of result string... */
     772     1794728 :     SET_VARSIZE(result, len);
     773             : 
     774             :     /* Fill data field of result string... */
     775     1794728 :     ptr = VARDATA(result);
     776     1794728 :     if (len1 > 0)
     777     1791850 :         memcpy(ptr, VARDATA_ANY(t1), len1);
     778     1794728 :     if (len2 > 0)
     779     1794624 :         memcpy(ptr + len1, VARDATA_ANY(t2), len2);
     780             : 
     781     1794728 :     return result;
     782             : }
     783             : 
     784             : /*
     785             :  * charlen_to_bytelen()
     786             :  *  Compute the number of bytes occupied by n characters starting at *p
     787             :  *
     788             :  * It is caller's responsibility that there actually are n characters;
     789             :  * the string need not be null-terminated.
     790             :  */
     791             : static int
     792        5554 : charlen_to_bytelen(const char *p, int n)
     793             : {
     794        5554 :     if (pg_database_encoding_max_length() == 1)
     795             :     {
     796             :         /* Optimization for single-byte encodings */
     797           0 :         return n;
     798             :     }
     799             :     else
     800             :     {
     801             :         const char *s;
     802             : 
     803     5772768 :         for (s = p; n > 0; n--)
     804     5767214 :             s += pg_mblen(s);
     805             : 
     806        5554 :         return s - p;
     807             :     }
     808             : }
     809             : 
     810             : /*
     811             :  * text_substr()
     812             :  * Return a substring starting at the specified position.
     813             :  * - thomas 1997-12-31
     814             :  *
     815             :  * Input:
     816             :  *  - string
     817             :  *  - starting position (is one-based)
     818             :  *  - string length
     819             :  *
     820             :  * If the starting position is zero or less, then return from the start of the string
     821             :  *  adjusting the length to be consistent with the "negative start" per SQL.
     822             :  * If the length is less than zero, return the remaining string.
     823             :  *
     824             :  * Added multibyte support.
     825             :  * - Tatsuo Ishii 1998-4-21
     826             :  * Changed behavior if starting position is less than one to conform to SQL behavior.
     827             :  * Formerly returned the entire string; now returns a portion.
     828             :  * - Thomas Lockhart 1998-12-10
     829             :  * Now uses faster TOAST-slicing interface
     830             :  * - John Gray 2002-02-22
     831             :  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
     832             :  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
     833             :  * error; if E < 1, return '', not entire string). Fixed MB related bug when
     834             :  * S > LC and < LC + 4 sometimes garbage characters are returned.
     835             :  * - Joe Conway 2002-08-10
     836             :  */
     837             : Datum
     838       70068 : text_substr(PG_FUNCTION_ARGS)
     839             : {
     840       70068 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     841             :                                     PG_GETARG_INT32(1),
     842             :                                     PG_GETARG_INT32(2),
     843             :                                     false));
     844             : }
     845             : 
     846             : /*
     847             :  * text_substr_no_len -
     848             :  *    Wrapper to avoid opr_sanity failure due to
     849             :  *    one function accepting a different number of args.
     850             :  */
     851             : Datum
     852          26 : text_substr_no_len(PG_FUNCTION_ARGS)
     853             : {
     854          26 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     855             :                                     PG_GETARG_INT32(1),
     856             :                                     -1, true));
     857             : }
     858             : 
     859             : /*
     860             :  * text_substring -
     861             :  *  Does the real work for text_substr() and text_substr_no_len()
     862             :  *
     863             :  *  This is broken out so it can be called directly by other string processing
     864             :  *  functions.  Note that the argument is passed as a Datum, to indicate that
     865             :  *  it may still be in compressed/toasted form.  We can avoid detoasting all
     866             :  *  of it in some cases.
     867             :  *
     868             :  *  The result is always a freshly palloc'd datum.
     869             :  */
     870             : static text *
     871       96526 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
     872             : {
     873       96526 :     int32       eml = pg_database_encoding_max_length();
     874       96526 :     int32       S = start;      /* start position */
     875             :     int32       S1;             /* adjusted start position */
     876             :     int32       L1;             /* adjusted substring length */
     877             :     int32       E;              /* end position */
     878             : 
     879             :     /*
     880             :      * SQL99 says S can be zero or negative, but we still must fetch from the
     881             :      * start of the string.
     882             :      */
     883       96526 :     S1 = Max(S, 1);
     884             : 
     885             :     /* life is easy if the encoding max length is 1 */
     886       96526 :     if (eml == 1)
     887             :     {
     888          12 :         if (length_not_specified)   /* special case - get length to end of
     889             :                                      * string */
     890           0 :             L1 = -1;
     891          12 :         else if (length < 0)
     892             :         {
     893             :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     894           0 :             ereport(ERROR,
     895             :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     896             :                      errmsg("negative substring length not allowed")));
     897             :             L1 = -1;            /* silence stupider compilers */
     898             :         }
     899          12 :         else if (pg_add_s32_overflow(S, length, &E))
     900             :         {
     901             :             /*
     902             :              * L could be large enough for S + L to overflow, in which case
     903             :              * the substring must run to end of string.
     904             :              */
     905           0 :             L1 = -1;
     906             :         }
     907             :         else
     908             :         {
     909             :             /*
     910             :              * A zero or negative value for the end position can happen if the
     911             :              * start was negative or one. SQL99 says to return a zero-length
     912             :              * string.
     913             :              */
     914          12 :             if (E < 1)
     915           0 :                 return cstring_to_text("");
     916             : 
     917          12 :             L1 = E - S1;
     918             :         }
     919             : 
     920             :         /*
     921             :          * If the start position is past the end of the string, SQL99 says to
     922             :          * return a zero-length string -- DatumGetTextPSlice() will do that
     923             :          * for us.  We need only convert S1 to zero-based starting position.
     924             :          */
     925          12 :         return DatumGetTextPSlice(str, S1 - 1, L1);
     926             :     }
     927       96514 :     else if (eml > 1)
     928             :     {
     929             :         /*
     930             :          * When encoding max length is > 1, we can't get LC without
     931             :          * detoasting, so we'll grab a conservatively large slice now and go
     932             :          * back later to do the right thing
     933             :          */
     934             :         int32       slice_start;
     935             :         int32       slice_size;
     936             :         int32       slice_strlen;
     937             :         text       *slice;
     938             :         int32       E1;
     939             :         int32       i;
     940             :         char       *p;
     941             :         char       *s;
     942             :         text       *ret;
     943             : 
     944             :         /*
     945             :          * We need to start at position zero because there is no way to know
     946             :          * in advance which byte offset corresponds to the supplied start
     947             :          * position.
     948             :          */
     949       96514 :         slice_start = 0;
     950             : 
     951       96514 :         if (length_not_specified)   /* special case - get length to end of
     952             :                                      * string */
     953          58 :             slice_size = L1 = -1;
     954       96456 :         else if (length < 0)
     955             :         {
     956             :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     957           8 :             ereport(ERROR,
     958             :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     959             :                      errmsg("negative substring length not allowed")));
     960             :             slice_size = L1 = -1;   /* silence stupider compilers */
     961             :         }
     962       96448 :         else if (pg_add_s32_overflow(S, length, &E))
     963             :         {
     964             :             /*
     965             :              * L could be large enough for S + L to overflow, in which case
     966             :              * the substring must run to end of string.
     967             :              */
     968           4 :             slice_size = L1 = -1;
     969             :         }
     970             :         else
     971             :         {
     972             :             /*
     973             :              * A zero or negative value for the end position can happen if the
     974             :              * start was negative or one. SQL99 says to return a zero-length
     975             :              * string.
     976             :              */
     977       96444 :             if (E < 1)
     978           0 :                 return cstring_to_text("");
     979             : 
     980             :             /*
     981             :              * if E is past the end of the string, the tuple toaster will
     982             :              * truncate the length for us
     983             :              */
     984       96444 :             L1 = E - S1;
     985             : 
     986             :             /*
     987             :              * Total slice size in bytes can't be any longer than the start
     988             :              * position plus substring length times the encoding max length.
     989             :              * If that overflows, we can just use -1.
     990             :              */
     991       96444 :             if (pg_mul_s32_overflow(E, eml, &slice_size))
     992           4 :                 slice_size = -1;
     993             :         }
     994             : 
     995             :         /*
     996             :          * If we're working with an untoasted source, no need to do an extra
     997             :          * copying step.
     998             :          */
     999       96506 :         if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
    1000       96482 :             VARATT_IS_EXTERNAL(DatumGetPointer(str)))
    1001          68 :             slice = DatumGetTextPSlice(str, slice_start, slice_size);
    1002             :         else
    1003       96438 :             slice = (text *) DatumGetPointer(str);
    1004             : 
    1005             :         /* see if we got back an empty string */
    1006       96506 :         if (VARSIZE_ANY_EXHDR(slice) == 0)
    1007             :         {
    1008           0 :             if (slice != (text *) DatumGetPointer(str))
    1009           0 :                 pfree(slice);
    1010           0 :             return cstring_to_text("");
    1011             :         }
    1012             : 
    1013             :         /* Now we can get the actual length of the slice in MB characters */
    1014       96506 :         slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
    1015       96506 :                                             VARSIZE_ANY_EXHDR(slice));
    1016             : 
    1017             :         /*
    1018             :          * Check that the start position wasn't > slice_strlen. If so, SQL99
    1019             :          * says to return a zero-length string.
    1020             :          */
    1021       96506 :         if (S1 > slice_strlen)
    1022             :         {
    1023          20 :             if (slice != (text *) DatumGetPointer(str))
    1024           0 :                 pfree(slice);
    1025          20 :             return cstring_to_text("");
    1026             :         }
    1027             : 
    1028             :         /*
    1029             :          * Adjust L1 and E1 now that we know the slice string length. Again
    1030             :          * remember that S1 is one based, and slice_start is zero based.
    1031             :          */
    1032       96486 :         if (L1 > -1)
    1033       96444 :             E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
    1034             :         else
    1035          42 :             E1 = slice_start + 1 + slice_strlen;
    1036             : 
    1037             :         /*
    1038             :          * Find the start position in the slice; remember S1 is not zero based
    1039             :          */
    1040       96486 :         p = VARDATA_ANY(slice);
    1041     3296500 :         for (i = 0; i < S1 - 1; i++)
    1042     3200014 :             p += pg_mblen(p);
    1043             : 
    1044             :         /* hang onto a pointer to our start position */
    1045       96486 :         s = p;
    1046             : 
    1047             :         /*
    1048             :          * Count the actual bytes used by the substring of the requested
    1049             :          * length.
    1050             :          */
    1051     1650486 :         for (i = S1; i < E1; i++)
    1052     1554000 :             p += pg_mblen(p);
    1053             : 
    1054       96486 :         ret = (text *) palloc(VARHDRSZ + (p - s));
    1055       96486 :         SET_VARSIZE(ret, VARHDRSZ + (p - s));
    1056       96486 :         memcpy(VARDATA(ret), s, (p - s));
    1057             : 
    1058       96486 :         if (slice != (text *) DatumGetPointer(str))
    1059          68 :             pfree(slice);
    1060             : 
    1061       96486 :         return ret;
    1062             :     }
    1063             :     else
    1064           0 :         elog(ERROR, "invalid backend encoding: encoding max length < 1");
    1065             : 
    1066             :     /* not reached: suppress compiler warning */
    1067             :     return NULL;
    1068             : }
    1069             : 
    1070             : /*
    1071             :  * textoverlay
    1072             :  *  Replace specified substring of first string with second
    1073             :  *
    1074             :  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
    1075             :  * This code is a direct implementation of what the standard says.
    1076             :  */
    1077             : Datum
    1078          24 : textoverlay(PG_FUNCTION_ARGS)
    1079             : {
    1080          24 :     text       *t1 = PG_GETARG_TEXT_PP(0);
    1081          24 :     text       *t2 = PG_GETARG_TEXT_PP(1);
    1082          24 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
    1083          24 :     int         sl = PG_GETARG_INT32(3);    /* substring length */
    1084             : 
    1085          24 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
    1086             : }
    1087             : 
    1088             : Datum
    1089           8 : textoverlay_no_len(PG_FUNCTION_ARGS)
    1090             : {
    1091           8 :     text       *t1 = PG_GETARG_TEXT_PP(0);
    1092           8 :     text       *t2 = PG_GETARG_TEXT_PP(1);
    1093           8 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
    1094             :     int         sl;
    1095             : 
    1096           8 :     sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
    1097           8 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
    1098             : }
    1099             : 
    1100             : static text *
    1101          32 : text_overlay(text *t1, text *t2, int sp, int sl)
    1102             : {
    1103             :     text       *result;
    1104             :     text       *s1;
    1105             :     text       *s2;
    1106             :     int         sp_pl_sl;
    1107             : 
    1108             :     /*
    1109             :      * Check for possible integer-overflow cases.  For negative sp, throw a
    1110             :      * "substring length" error because that's what should be expected
    1111             :      * according to the spec's definition of OVERLAY().
    1112             :      */
    1113          32 :     if (sp <= 0)
    1114           0 :         ereport(ERROR,
    1115             :                 (errcode(ERRCODE_SUBSTRING_ERROR),
    1116             :                  errmsg("negative substring length not allowed")));
    1117          32 :     if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
    1118           0 :         ereport(ERROR,
    1119             :                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    1120             :                  errmsg("integer out of range")));
    1121             : 
    1122          32 :     s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
    1123          32 :     s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
    1124          32 :     result = text_catenate(s1, t2);
    1125          32 :     result = text_catenate(result, s2);
    1126             : 
    1127          32 :     return result;
    1128             : }
    1129             : 
    1130             : /*
    1131             :  * textpos -
    1132             :  *    Return the position of the specified substring.
    1133             :  *    Implements the SQL POSITION() function.
    1134             :  *    Ref: A Guide To The SQL Standard, Date & Darwen, 1997
    1135             :  * - thomas 1997-07-27
    1136             :  */
    1137             : Datum
    1138          92 : textpos(PG_FUNCTION_ARGS)
    1139             : {
    1140          92 :     text       *str = PG_GETARG_TEXT_PP(0);
    1141          92 :     text       *search_str = PG_GETARG_TEXT_PP(1);
    1142             : 
    1143          92 :     PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
    1144             : }
    1145             : 
    1146             : /*
    1147             :  * text_position -
    1148             :  *  Does the real work for textpos()
    1149             :  *
    1150             :  * Inputs:
    1151             :  *      t1 - string to be searched
    1152             :  *      t2 - pattern to match within t1
    1153             :  * Result:
    1154             :  *      Character index of the first matched char, starting from 1,
    1155             :  *      or 0 if no match.
    1156             :  *
    1157             :  *  This is broken out so it can be called directly by other string processing
    1158             :  *  functions.
    1159             :  */
    1160             : static int
    1161          92 : text_position(text *t1, text *t2, Oid collid)
    1162             : {
    1163             :     TextPositionState state;
    1164             :     int         result;
    1165             : 
    1166             :     /* Empty needle always matches at position 1 */
    1167          92 :     if (VARSIZE_ANY_EXHDR(t2) < 1)
    1168           8 :         return 1;
    1169             : 
    1170             :     /* Otherwise, can't match if haystack is shorter than needle */
    1171          84 :     if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
    1172          20 :         return 0;
    1173             : 
    1174          64 :     text_position_setup(t1, t2, collid, &state);
    1175          64 :     if (!text_position_next(&state))
    1176          22 :         result = 0;
    1177             :     else
    1178          42 :         result = text_position_get_match_pos(&state);
    1179          64 :     text_position_cleanup(&state);
    1180          64 :     return result;
    1181             : }
    1182             : 
    1183             : 
    1184             : /*
    1185             :  * text_position_setup, text_position_next, text_position_cleanup -
    1186             :  *  Component steps of text_position()
    1187             :  *
    1188             :  * These are broken out so that a string can be efficiently searched for
    1189             :  * multiple occurrences of the same pattern.  text_position_next may be
    1190             :  * called multiple times, and it advances to the next match on each call.
    1191             :  * text_position_get_match_ptr() and text_position_get_match_pos() return
    1192             :  * a pointer or 1-based character position of the last match, respectively.
    1193             :  *
    1194             :  * The "state" variable is normally just a local variable in the caller.
    1195             :  *
    1196             :  * NOTE: text_position_next skips over the matched portion.  For example,
    1197             :  * searching for "xx" in "xxx" returns only one match, not two.
    1198             :  */
    1199             : 
    1200             : static void
    1201        1848 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
    1202             : {
    1203        1848 :     int         len1 = VARSIZE_ANY_EXHDR(t1);
    1204        1848 :     int         len2 = VARSIZE_ANY_EXHDR(t2);
    1205        1848 :     pg_locale_t mylocale = 0;
    1206             : 
    1207        1848 :     check_collation_set(collid);
    1208             : 
    1209        1848 :     if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
    1210           0 :         mylocale = pg_newlocale_from_collation(collid);
    1211             : 
    1212        1848 :     if (mylocale && !mylocale->deterministic)
    1213           0 :         ereport(ERROR,
    1214             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1215             :                  errmsg("nondeterministic collations are not supported for substring searches")));
    1216             : 
    1217             :     Assert(len1 > 0);
    1218             :     Assert(len2 > 0);
    1219             : 
    1220             :     /*
    1221             :      * Even with a multi-byte encoding, we perform the search using the raw
    1222             :      * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
    1223             :      * because in UTF-8 the byte sequence of one character cannot contain
    1224             :      * another character.  For other multi-byte encodings, we do the search
    1225             :      * initially as a simple byte search, ignoring multibyte issues, but
    1226             :      * verify afterwards that the match we found is at a character boundary,
    1227             :      * and continue the search if it was a false match.
    1228             :      */
    1229        1848 :     if (pg_database_encoding_max_length() == 1)
    1230             :     {
    1231          36 :         state->is_multibyte = false;
    1232          36 :         state->is_multibyte_char_in_char = false;
    1233             :     }
    1234        1812 :     else if (GetDatabaseEncoding() == PG_UTF8)
    1235             :     {
    1236        1812 :         state->is_multibyte = true;
    1237        1812 :         state->is_multibyte_char_in_char = false;
    1238             :     }
    1239             :     else
    1240             :     {
    1241           0 :         state->is_multibyte = true;
    1242           0 :         state->is_multibyte_char_in_char = true;
    1243             :     }
    1244             : 
    1245        1848 :     state->str1 = VARDATA_ANY(t1);
    1246        1848 :     state->str2 = VARDATA_ANY(t2);
    1247        1848 :     state->len1 = len1;
    1248        1848 :     state->len2 = len2;
    1249        1848 :     state->last_match = NULL;
    1250        1848 :     state->refpoint = state->str1;
    1251        1848 :     state->refpos = 0;
    1252             : 
    1253             :     /*
    1254             :      * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
    1255             :      * notes we use the terminology that the "haystack" is the string to be
    1256             :      * searched (t1) and the "needle" is the pattern being sought (t2).
    1257             :      *
    1258             :      * If the needle is empty or bigger than the haystack then there is no
    1259             :      * point in wasting cycles initializing the table.  We also choose not to
    1260             :      * use B-M-H for needles of length 1, since the skip table can't possibly
    1261             :      * save anything in that case.
    1262             :      */
    1263        1848 :     if (len1 >= len2 && len2 > 1)
    1264             :     {
    1265        1702 :         int         searchlength = len1 - len2;
    1266             :         int         skiptablemask;
    1267             :         int         last;
    1268             :         int         i;
    1269        1702 :         const char *str2 = state->str2;
    1270             : 
    1271             :         /*
    1272             :          * First we must determine how much of the skip table to use.  The
    1273             :          * declaration of TextPositionState allows up to 256 elements, but for
    1274             :          * short search problems we don't really want to have to initialize so
    1275             :          * many elements --- it would take too long in comparison to the
    1276             :          * actual search time.  So we choose a useful skip table size based on
    1277             :          * the haystack length minus the needle length.  The closer the needle
    1278             :          * length is to the haystack length the less useful skipping becomes.
    1279             :          *
    1280             :          * Note: since we use bit-masking to select table elements, the skip
    1281             :          * table size MUST be a power of 2, and so the mask must be 2^N-1.
    1282             :          */
    1283        1702 :         if (searchlength < 16)
    1284          40 :             skiptablemask = 3;
    1285        1662 :         else if (searchlength < 64)
    1286          12 :             skiptablemask = 7;
    1287        1650 :         else if (searchlength < 128)
    1288           2 :             skiptablemask = 15;
    1289        1648 :         else if (searchlength < 512)
    1290         126 :             skiptablemask = 31;
    1291        1522 :         else if (searchlength < 2048)
    1292        1428 :             skiptablemask = 63;
    1293          94 :         else if (searchlength < 4096)
    1294          32 :             skiptablemask = 127;
    1295             :         else
    1296          62 :             skiptablemask = 255;
    1297        1702 :         state->skiptablemask = skiptablemask;
    1298             : 
    1299             :         /*
    1300             :          * Initialize the skip table.  We set all elements to the needle
    1301             :          * length, since this is the correct skip distance for any character
    1302             :          * not found in the needle.
    1303             :          */
    1304      117382 :         for (i = 0; i <= skiptablemask; i++)
    1305      115680 :             state->skiptable[i] = len2;
    1306             : 
    1307             :         /*
    1308             :          * Now examine the needle.  For each character except the last one,
    1309             :          * set the corresponding table element to the appropriate skip
    1310             :          * distance.  Note that when two characters share the same skip table
    1311             :          * entry, the one later in the needle must determine the skip
    1312             :          * distance.
    1313             :          */
    1314        1702 :         last = len2 - 1;
    1315             : 
    1316       20938 :         for (i = 0; i < last; i++)
    1317       19236 :             state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
    1318             :     }
    1319        1848 : }
    1320             : 
    1321             : /*
    1322             :  * Advance to the next match, starting from the end of the previous match
    1323             :  * (or the beginning of the string, on first call).  Returns true if a match
    1324             :  * is found.
    1325             :  *
    1326             :  * Note that this refuses to match an empty-string needle.  Most callers
    1327             :  * will have handled that case specially and we'll never see it here.
    1328             :  */
    1329             : static bool
    1330        6986 : text_position_next(TextPositionState *state)
    1331             : {
    1332        6986 :     int         needle_len = state->len2;
    1333             :     char       *start_ptr;
    1334             :     char       *matchptr;
    1335             : 
    1336        6986 :     if (needle_len <= 0)
    1337           0 :         return false;           /* result for empty pattern */
    1338             : 
    1339             :     /* Start from the point right after the previous match. */
    1340        6986 :     if (state->last_match)
    1341        5130 :         start_ptr = state->last_match + needle_len;
    1342             :     else
    1343        1856 :         start_ptr = state->str1;
    1344             : 
    1345        6986 : retry:
    1346        6986 :     matchptr = text_position_next_internal(start_ptr, state);
    1347             : 
    1348        6986 :     if (!matchptr)
    1349        1798 :         return false;
    1350             : 
    1351             :     /*
    1352             :      * Found a match for the byte sequence.  If this is a multibyte encoding,
    1353             :      * where one character's byte sequence can appear inside a longer
    1354             :      * multi-byte character, we need to verify that the match was at a
    1355             :      * character boundary, not in the middle of a multi-byte character.
    1356             :      */
    1357        5188 :     if (state->is_multibyte_char_in_char)
    1358             :     {
    1359             :         /* Walk one character at a time, until we reach the match. */
    1360             : 
    1361             :         /* the search should never move backwards. */
    1362             :         Assert(state->refpoint <= matchptr);
    1363             : 
    1364           0 :         while (state->refpoint < matchptr)
    1365             :         {
    1366             :             /* step to next character. */
    1367           0 :             state->refpoint += pg_mblen(state->refpoint);
    1368           0 :             state->refpos++;
    1369             : 
    1370             :             /*
    1371             :              * If we stepped over the match's start position, then it was a
    1372             :              * false positive, where the byte sequence appeared in the middle
    1373             :              * of a multi-byte character.  Skip it, and continue the search at
    1374             :              * the next character boundary.
    1375             :              */
    1376           0 :             if (state->refpoint > matchptr)
    1377             :             {
    1378           0 :                 start_ptr = state->refpoint;
    1379           0 :                 goto retry;
    1380             :             }
    1381             :         }
    1382             :     }
    1383             : 
    1384        5188 :     state->last_match = matchptr;
    1385        5188 :     return true;
    1386             : }
    1387             : 
    1388             : /*
    1389             :  * Subroutine of text_position_next().  This searches for the raw byte
    1390             :  * sequence, ignoring any multi-byte encoding issues.  Returns the first
    1391             :  * match starting at 'start_ptr', or NULL if no match is found.
    1392             :  */
    1393             : static char *
    1394        6986 : text_position_next_internal(char *start_ptr, TextPositionState *state)
    1395             : {
    1396        6986 :     int         haystack_len = state->len1;
    1397        6986 :     int         needle_len = state->len2;
    1398        6986 :     int         skiptablemask = state->skiptablemask;
    1399        6986 :     const char *haystack = state->str1;
    1400        6986 :     const char *needle = state->str2;
    1401        6986 :     const char *haystack_end = &haystack[haystack_len];
    1402             :     const char *hptr;
    1403             : 
    1404             :     Assert(start_ptr >= haystack && start_ptr <= haystack_end);
    1405             : 
    1406        6986 :     if (needle_len == 1)
    1407             :     {
    1408             :         /* No point in using B-M-H for a one-character needle */
    1409         502 :         char        nchar = *needle;
    1410             : 
    1411         502 :         hptr = start_ptr;
    1412        3770 :         while (hptr < haystack_end)
    1413             :         {
    1414        3658 :             if (*hptr == nchar)
    1415         390 :                 return (char *) hptr;
    1416        3268 :             hptr++;
    1417             :         }
    1418             :     }
    1419             :     else
    1420             :     {
    1421        6484 :         const char *needle_last = &needle[needle_len - 1];
    1422             : 
    1423             :         /* Start at startpos plus the length of the needle */
    1424        6484 :         hptr = start_ptr + needle_len - 1;
    1425      168796 :         while (hptr < haystack_end)
    1426             :         {
    1427             :             /* Match the needle scanning *backward* */
    1428             :             const char *nptr;
    1429             :             const char *p;
    1430             : 
    1431      167110 :             nptr = needle_last;
    1432      167110 :             p = hptr;
    1433      236516 :             while (*nptr == *p)
    1434             :             {
    1435             :                 /* Matched it all?  If so, return 1-based position */
    1436       74204 :                 if (nptr == needle)
    1437        4798 :                     return (char *) p;
    1438       69406 :                 nptr--, p--;
    1439             :             }
    1440             : 
    1441             :             /*
    1442             :              * No match, so use the haystack char at hptr to decide how far to
    1443             :              * advance.  If the needle had any occurrence of that character
    1444             :              * (or more precisely, one sharing the same skiptable entry)
    1445             :              * before its last character, then we advance far enough to align
    1446             :              * the last such needle character with that haystack position.
    1447             :              * Otherwise we can advance by the whole needle length.
    1448             :              */
    1449      162312 :             hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
    1450             :         }
    1451             :     }
    1452             : 
    1453        1798 :     return 0;                   /* not found */
    1454             : }
    1455             : 
    1456             : /*
    1457             :  * Return a pointer to the current match.
    1458             :  *
    1459             :  * The returned pointer points into the original haystack string.
    1460             :  */
    1461             : static char *
    1462        5126 : text_position_get_match_ptr(TextPositionState *state)
    1463             : {
    1464        5126 :     return state->last_match;
    1465             : }
    1466             : 
    1467             : /*
    1468             :  * Return the offset of the current match.
    1469             :  *
    1470             :  * The offset is in characters, 1-based.
    1471             :  */
    1472             : static int
    1473          42 : text_position_get_match_pos(TextPositionState *state)
    1474             : {
    1475          42 :     if (!state->is_multibyte)
    1476           0 :         return state->last_match - state->str1 + 1;
    1477             :     else
    1478             :     {
    1479             :         /* Convert the byte position to char position. */
    1480         102 :         while (state->refpoint < state->last_match)
    1481             :         {
    1482          60 :             state->refpoint += pg_mblen(state->refpoint);
    1483          60 :             state->refpos++;
    1484             :         }
    1485             :         Assert(state->refpoint == state->last_match);
    1486          42 :         return state->refpos + 1;
    1487             :     }
    1488             : }
    1489             : 
    1490             : /*
    1491             :  * Reset search state to the initial state installed by text_position_setup.
    1492             :  *
    1493             :  * The next call to text_position_next will search from the beginning
    1494             :  * of the string.
    1495             :  */
    1496             : static void
    1497           8 : text_position_reset(TextPositionState *state)
    1498             : {
    1499           8 :     state->last_match = NULL;
    1500           8 :     state->refpoint = state->str1;
    1501           8 :     state->refpos = 0;
    1502           8 : }
    1503             : 
    1504             : static void
    1505        1848 : text_position_cleanup(TextPositionState *state)
    1506             : {
    1507             :     /* no cleanup needed */
    1508        1848 : }
    1509             : 
    1510             : 
    1511             : static void
    1512     8614084 : check_collation_set(Oid collid)
    1513             : {
    1514     8614084 :     if (!OidIsValid(collid))
    1515             :     {
    1516             :         /*
    1517             :          * This typically means that the parser could not resolve a conflict
    1518             :          * of implicit collations, so report it that way.
    1519             :          */
    1520           8 :         ereport(ERROR,
    1521             :                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
    1522             :                  errmsg("could not determine which collation to use for string comparison"),
    1523             :                  errhint("Use the COLLATE clause to set the collation explicitly.")));
    1524             :     }
    1525     8614076 : }
    1526             : 
    1527             : /* varstr_cmp()
    1528             :  * Comparison function for text strings with given lengths.
    1529             :  * Includes locale support, but must copy strings to temporary memory
    1530             :  *  to allow null-termination for inputs to strcoll().
    1531             :  * Returns an integer less than, equal to, or greater than zero, indicating
    1532             :  * whether arg1 is less than, equal to, or greater than arg2.
    1533             :  *
    1534             :  * Note: many functions that depend on this are marked leakproof; therefore,
    1535             :  * avoid reporting the actual contents of the input when throwing errors.
    1536             :  * All errors herein should be things that can't happen except on corrupt
    1537             :  * data, anyway; otherwise we will have trouble with indexing strings that
    1538             :  * would cause them.
    1539             :  */
    1540             : int
    1541     6089722 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
    1542             : {
    1543             :     int         result;
    1544             : 
    1545     6089722 :     check_collation_set(collid);
    1546             : 
    1547             :     /*
    1548             :      * Unfortunately, there is no strncoll(), so in the non-C locale case we
    1549             :      * have to do some memory copying.  This turns out to be significantly
    1550             :      * slower, so we optimize the case where LC_COLLATE is C.  We also try to
    1551             :      * optimize relatively-short strings by avoiding palloc/pfree overhead.
    1552             :      */
    1553     6089718 :     if (lc_collate_is_c(collid))
    1554             :     {
    1555     2946070 :         result = memcmp(arg1, arg2, Min(len1, len2));
    1556     2946070 :         if ((result == 0) && (len1 != len2))
    1557       74340 :             result = (len1 < len2) ? -1 : 1;
    1558             :     }
    1559             :     else
    1560             :     {
    1561             :         char        a1buf[TEXTBUFLEN];
    1562             :         char        a2buf[TEXTBUFLEN];
    1563             :         char       *a1p,
    1564             :                    *a2p;
    1565     3143648 :         pg_locale_t mylocale = 0;
    1566             : 
    1567     3143648 :         if (collid != DEFAULT_COLLATION_OID)
    1568           0 :             mylocale = pg_newlocale_from_collation(collid);
    1569             : 
    1570             :         /*
    1571             :          * memcmp() can't tell us which of two unequal strings sorts first,
    1572             :          * but it's a cheap way to tell if they're equal.  Testing shows that
    1573             :          * memcmp() followed by strcoll() is only trivially slower than
    1574             :          * strcoll() by itself, so we don't lose much if this doesn't work out
    1575             :          * very often, and if it does - for example, because there are many
    1576             :          * equal strings in the input - then we win big by avoiding expensive
    1577             :          * collation-aware comparisons.
    1578             :          */
    1579     3143648 :         if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
    1580     1186932 :             return 0;
    1581             : 
    1582             : #ifdef WIN32
    1583             :         /* Win32 does not have UTF-8, so we need to map to UTF-16 */
    1584             :         if (GetDatabaseEncoding() == PG_UTF8
    1585             :             && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
    1586             :         {
    1587             :             int         a1len;
    1588             :             int         a2len;
    1589             :             int         r;
    1590             : 
    1591             :             if (len1 >= TEXTBUFLEN / 2)
    1592             :             {
    1593             :                 a1len = len1 * 2 + 2;
    1594             :                 a1p = palloc(a1len);
    1595             :             }
    1596             :             else
    1597             :             {
    1598             :                 a1len = TEXTBUFLEN;
    1599             :                 a1p = a1buf;
    1600             :             }
    1601             :             if (len2 >= TEXTBUFLEN / 2)
    1602             :             {
    1603             :                 a2len = len2 * 2 + 2;
    1604             :                 a2p = palloc(a2len);
    1605             :             }
    1606             :             else
    1607             :             {
    1608             :                 a2len = TEXTBUFLEN;
    1609             :                 a2p = a2buf;
    1610             :             }
    1611             : 
    1612             :             /* stupid Microsloth API does not work for zero-length input */
    1613             :             if (len1 == 0)
    1614             :                 r = 0;
    1615             :             else
    1616             :             {
    1617             :                 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
    1618             :                                         (LPWSTR) a1p, a1len / 2);
    1619             :                 if (!r)
    1620             :                     ereport(ERROR,
    1621             :                             (errmsg("could not convert string to UTF-16: error code %lu",
    1622             :                                     GetLastError())));
    1623             :             }
    1624             :             ((LPWSTR) a1p)[r] = 0;
    1625             : 
    1626             :             if (len2 == 0)
    1627             :                 r = 0;
    1628             :             else
    1629             :             {
    1630             :                 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
    1631             :                                         (LPWSTR) a2p, a2len / 2);
    1632             :                 if (!r)
    1633             :                     ereport(ERROR,
    1634             :                             (errmsg("could not convert string to UTF-16: error code %lu",
    1635             :                                     GetLastError())));
    1636             :             }
    1637             :             ((LPWSTR) a2p)[r] = 0;
    1638             : 
    1639             :             errno = 0;
    1640             : #ifdef HAVE_LOCALE_T
    1641             :             if (mylocale)
    1642             :                 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
    1643             :             else
    1644             : #endif
    1645             :                 result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
    1646             :             if (result == 2147483647)   /* _NLSCMPERROR; missing from mingw
    1647             :                                          * headers */
    1648             :                 ereport(ERROR,
    1649             :                         (errmsg("could not compare Unicode strings: %m")));
    1650             : 
    1651             :             /* Break tie if necessary. */
    1652             :             if (result == 0 &&
    1653             :                 (!mylocale || mylocale->deterministic))
    1654             :             {
    1655             :                 result = memcmp(arg1, arg2, Min(len1, len2));
    1656             :                 if ((result == 0) && (len1 != len2))
    1657             :                     result = (len1 < len2) ? -1 : 1;
    1658             :             }
    1659             : 
    1660             :             if (a1p != a1buf)
    1661             :                 pfree(a1p);
    1662             :             if (a2p != a2buf)
    1663             :                 pfree(a2p);
    1664             : 
    1665             :             return result;
    1666             :         }
    1667             : #endif                          /* WIN32 */
    1668             : 
    1669     1956716 :         if (len1 >= TEXTBUFLEN)
    1670         220 :             a1p = (char *) palloc(len1 + 1);
    1671             :         else
    1672     1956496 :             a1p = a1buf;
    1673     1956716 :         if (len2 >= TEXTBUFLEN)
    1674          88 :             a2p = (char *) palloc(len2 + 1);
    1675             :         else
    1676     1956628 :             a2p = a2buf;
    1677             : 
    1678     1956716 :         memcpy(a1p, arg1, len1);
    1679     1956716 :         a1p[len1] = '\0';
    1680     1956716 :         memcpy(a2p, arg2, len2);
    1681     1956716 :         a2p[len2] = '\0';
    1682             : 
    1683     1956716 :         if (mylocale)
    1684             :         {
    1685           0 :             if (mylocale->provider == COLLPROVIDER_ICU)
    1686             :             {
    1687             : #ifdef USE_ICU
    1688             : #ifdef HAVE_UCOL_STRCOLLUTF8
    1689             :                 if (GetDatabaseEncoding() == PG_UTF8)
    1690             :                 {
    1691             :                     UErrorCode  status;
    1692             : 
    1693             :                     status = U_ZERO_ERROR;
    1694             :                     result = ucol_strcollUTF8(mylocale->info.icu.ucol,
    1695             :                                               arg1, len1,
    1696             :                                               arg2, len2,
    1697             :                                               &status);
    1698             :                     if (U_FAILURE(status))
    1699             :                         ereport(ERROR,
    1700             :                                 (errmsg("collation failed: %s", u_errorName(status))));
    1701             :                 }
    1702             :                 else
    1703             : #endif
    1704             :                 {
    1705             :                     int32_t     ulen1,
    1706             :                                 ulen2;
    1707             :                     UChar      *uchar1,
    1708             :                                *uchar2;
    1709             : 
    1710             :                     ulen1 = icu_to_uchar(&uchar1, arg1, len1);
    1711             :                     ulen2 = icu_to_uchar(&uchar2, arg2, len2);
    1712             : 
    1713             :                     result = ucol_strcoll(mylocale->info.icu.ucol,
    1714             :                                           uchar1, ulen1,
    1715             :                                           uchar2, ulen2);
    1716             : 
    1717             :                     pfree(uchar1);
    1718             :                     pfree(uchar2);
    1719             :                 }
    1720             : #else                           /* not USE_ICU */
    1721             :                 /* shouldn't happen */
    1722           0 :                 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
    1723             : #endif                          /* not USE_ICU */
    1724             :             }
    1725             :             else
    1726             :             {
    1727             : #ifdef HAVE_LOCALE_T
    1728           0 :                 result = strcoll_l(a1p, a2p, mylocale->info.lt);
    1729             : #else
    1730             :                 /* shouldn't happen */
    1731             :                 elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
    1732             : #endif
    1733             :             }
    1734             :         }
    1735             :         else
    1736     1956716 :             result = strcoll(a1p, a2p);
    1737             : 
    1738             :         /* Break tie if necessary. */
    1739     1956716 :         if (result == 0 &&
    1740           0 :             (!mylocale || mylocale->deterministic))
    1741           0 :             result = strcmp(a1p, a2p);
    1742             : 
    1743     1956716 :         if (a1p != a1buf)
    1744         220 :             pfree(a1p);
    1745     1956716 :         if (a2p != a2buf)
    1746          88 :             pfree(a2p);
    1747             :     }
    1748             : 
    1749     4902786 :     return result;
    1750             : }
    1751             : 
    1752             : /* text_cmp()
    1753             :  * Internal comparison function for text strings.
    1754             :  * Returns -1, 0 or 1
    1755             :  */
    1756             : static int
    1757     5008758 : text_cmp(text *arg1, text *arg2, Oid collid)
    1758             : {
    1759             :     char       *a1p,
    1760             :                *a2p;
    1761             :     int         len1,
    1762             :                 len2;
    1763             : 
    1764     5008758 :     a1p = VARDATA_ANY(arg1);
    1765     5008758 :     a2p = VARDATA_ANY(arg2);
    1766             : 
    1767     5008758 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1768     5008758 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1769             : 
    1770     5008758 :     return varstr_cmp(a1p, len1, a2p, len2, collid);
    1771             : }
    1772             : 
    1773             : /*
    1774             :  * Comparison functions for text strings.
    1775             :  *
    1776             :  * Note: btree indexes need these routines not to leak memory; therefore,
    1777             :  * be careful to free working copies of toasted datums.  Most places don't
    1778             :  * need to be so careful.
    1779             :  */
    1780             : 
    1781             : Datum
    1782     2277484 : texteq(PG_FUNCTION_ARGS)
    1783             : {
    1784     2277484 :     Oid         collid = PG_GET_COLLATION();
    1785             :     bool        result;
    1786             : 
    1787     2277484 :     check_collation_set(collid);
    1788             : 
    1789     2277484 :     if (lc_collate_is_c(collid) ||
    1790           0 :         collid == DEFAULT_COLLATION_OID ||
    1791           0 :         pg_newlocale_from_collation(collid)->deterministic)
    1792     2277484 :     {
    1793     2277484 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1794     2277484 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1795             :         Size        len1,
    1796             :                     len2;
    1797             : 
    1798             :         /*
    1799             :          * Since we only care about equality or not-equality, we can avoid all
    1800             :          * the expense of strcoll() here, and just do bitwise comparison.  In
    1801             :          * fact, we don't even have to do a bitwise comparison if we can show
    1802             :          * the lengths of the strings are unequal; which might save us from
    1803             :          * having to detoast one or both values.
    1804             :          */
    1805     2277484 :         len1 = toast_raw_datum_size(arg1);
    1806     2277484 :         len2 = toast_raw_datum_size(arg2);
    1807     2277484 :         if (len1 != len2)
    1808      631326 :             result = false;
    1809             :         else
    1810             :         {
    1811     1646158 :             text       *targ1 = DatumGetTextPP(arg1);
    1812     1646158 :             text       *targ2 = DatumGetTextPP(arg2);
    1813             : 
    1814     1646158 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1815             :                              len1 - VARHDRSZ) == 0);
    1816             : 
    1817     1646158 :             PG_FREE_IF_COPY(targ1, 0);
    1818     1646158 :             PG_FREE_IF_COPY(targ2, 1);
    1819             :         }
    1820             :     }
    1821             :     else
    1822             :     {
    1823           0 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1824           0 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1825             : 
    1826           0 :         result = (text_cmp(arg1, arg2, collid) == 0);
    1827             : 
    1828           0 :         PG_FREE_IF_COPY(arg1, 0);
    1829           0 :         PG_FREE_IF_COPY(arg2, 1);
    1830             :     }
    1831             : 
    1832     2277484 :     PG_RETURN_BOOL(result);
    1833             : }
    1834             : 
    1835             : Datum
    1836       12130 : textne(PG_FUNCTION_ARGS)
    1837             : {
    1838       12130 :     Oid         collid = PG_GET_COLLATION();
    1839             :     bool        result;
    1840             : 
    1841       12130 :     check_collation_set(collid);
    1842             : 
    1843       12130 :     if (lc_collate_is_c(collid) ||
    1844           0 :         collid == DEFAULT_COLLATION_OID ||
    1845           0 :         pg_newlocale_from_collation(collid)->deterministic)
    1846       12130 :     {
    1847       12130 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1848       12130 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1849             :         Size        len1,
    1850             :                     len2;
    1851             : 
    1852             :         /* See comment in texteq() */
    1853       12130 :         len1 = toast_raw_datum_size(arg1);
    1854       12130 :         len2 = toast_raw_datum_size(arg2);
    1855       12130 :         if (len1 != len2)
    1856         686 :             result = true;
    1857             :         else
    1858             :         {
    1859       11444 :             text       *targ1 = DatumGetTextPP(arg1);
    1860       11444 :             text       *targ2 = DatumGetTextPP(arg2);
    1861             : 
    1862       11444 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1863             :                              len1 - VARHDRSZ) != 0);
    1864             : 
    1865       11444 :             PG_FREE_IF_COPY(targ1, 0);
    1866       11444 :             PG_FREE_IF_COPY(targ2, 1);
    1867             :         }
    1868             :     }
    1869             :     else
    1870             :     {
    1871           0 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1872           0 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1873             : 
    1874           0 :         result = (text_cmp(arg1, arg2, collid) != 0);
    1875             : 
    1876           0 :         PG_FREE_IF_COPY(arg1, 0);
    1877           0 :         PG_FREE_IF_COPY(arg2, 1);
    1878             :     }
    1879             : 
    1880       12130 :     PG_RETURN_BOOL(result);
    1881             : }
    1882             : 
    1883             : Datum
    1884       88896 : text_lt(PG_FUNCTION_ARGS)
    1885             : {
    1886       88896 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1887       88896 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1888             :     bool        result;
    1889             : 
    1890       88896 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
    1891             : 
    1892       88892 :     PG_FREE_IF_COPY(arg1, 0);
    1893       88892 :     PG_FREE_IF_COPY(arg2, 1);
    1894             : 
    1895       88892 :     PG_RETURN_BOOL(result);
    1896             : }
    1897             : 
    1898             : Datum
    1899      141290 : text_le(PG_FUNCTION_ARGS)
    1900             : {
    1901      141290 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1902      141290 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1903             :     bool        result;
    1904             : 
    1905      141290 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
    1906             : 
    1907      141290 :     PG_FREE_IF_COPY(arg1, 0);
    1908      141290 :     PG_FREE_IF_COPY(arg2, 1);
    1909             : 
    1910      141290 :     PG_RETURN_BOOL(result);
    1911             : }
    1912             : 
    1913             : Datum
    1914       57926 : text_gt(PG_FUNCTION_ARGS)
    1915             : {
    1916       57926 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1917       57926 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1918             :     bool        result;
    1919             : 
    1920       57926 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
    1921             : 
    1922       57926 :     PG_FREE_IF_COPY(arg1, 0);
    1923       57926 :     PG_FREE_IF_COPY(arg2, 1);
    1924             : 
    1925       57926 :     PG_RETURN_BOOL(result);
    1926             : }
    1927             : 
    1928             : Datum
    1929       84288 : text_ge(PG_FUNCTION_ARGS)
    1930             : {
    1931       84288 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1932       84288 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1933             :     bool        result;
    1934             : 
    1935       84288 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
    1936             : 
    1937       84288 :     PG_FREE_IF_COPY(arg1, 0);
    1938       84288 :     PG_FREE_IF_COPY(arg2, 1);
    1939             : 
    1940       84288 :     PG_RETURN_BOOL(result);
    1941             : }
    1942             : 
    1943             : Datum
    1944       25132 : text_starts_with(PG_FUNCTION_ARGS)
    1945             : {
    1946       25132 :     Datum       arg1 = PG_GETARG_DATUM(0);
    1947       25132 :     Datum       arg2 = PG_GETARG_DATUM(1);
    1948       25132 :     Oid         collid = PG_GET_COLLATION();
    1949       25132 :     pg_locale_t mylocale = 0;
    1950             :     bool        result;
    1951             :     Size        len1,
    1952             :                 len2;
    1953             : 
    1954       25132 :     check_collation_set(collid);
    1955             : 
    1956       25132 :     if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
    1957           0 :         mylocale = pg_newlocale_from_collation(collid);
    1958             : 
    1959       25132 :     if (mylocale && !mylocale->deterministic)
    1960           0 :         ereport(ERROR,
    1961             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1962             :                  errmsg("nondeterministic collations are not supported for substring searches")));
    1963             : 
    1964       25132 :     len1 = toast_raw_datum_size(arg1);
    1965       25132 :     len2 = toast_raw_datum_size(arg2);
    1966       25132 :     if (len2 > len1)
    1967           0 :         result = false;
    1968             :     else
    1969             :     {
    1970       25132 :         text       *targ1 = text_substring(arg1, 1, len2, false);
    1971       25132 :         text       *targ2 = DatumGetTextPP(arg2);
    1972             : 
    1973       25132 :         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1974       25132 :                          VARSIZE_ANY_EXHDR(targ2)) == 0);
    1975             : 
    1976       25132 :         PG_FREE_IF_COPY(targ1, 0);
    1977       25132 :         PG_FREE_IF_COPY(targ2, 1);
    1978             :     }
    1979             : 
    1980       25132 :     PG_RETURN_BOOL(result);
    1981             : }
    1982             : 
    1983             : Datum
    1984     4457892 : bttextcmp(PG_FUNCTION_ARGS)
    1985             : {
    1986     4457892 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1987     4457892 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1988             :     int32       result;
    1989             : 
    1990     4457892 :     result = text_cmp(arg1, arg2, PG_GET_COLLATION());
    1991             : 
    1992     4457892 :     PG_FREE_IF_COPY(arg1, 0);
    1993     4457892 :     PG_FREE_IF_COPY(arg2, 1);
    1994             : 
    1995     4457892 :     PG_RETURN_INT32(result);
    1996             : }
    1997             : 
    1998             : Datum
    1999       48770 : bttextsortsupport(PG_FUNCTION_ARGS)
    2000             : {
    2001       48770 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    2002       48770 :     Oid         collid = ssup->ssup_collation;
    2003             :     MemoryContext oldcontext;
    2004             : 
    2005       48770 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    2006             : 
    2007             :     /* Use generic string SortSupport */
    2008       48770 :     varstr_sortsupport(ssup, TEXTOID, collid);
    2009             : 
    2010       48766 :     MemoryContextSwitchTo(oldcontext);
    2011             : 
    2012       48766 :     PG_RETURN_VOID();
    2013             : }
    2014             : 
    2015             : /*
    2016             :  * Generic sortsupport interface for character type's operator classes.
    2017             :  * Includes locale support, and support for BpChar semantics (i.e. removing
    2018             :  * trailing spaces before comparison).
    2019             :  *
    2020             :  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
    2021             :  * same representation.  Callers that always use the C collation (e.g.
    2022             :  * non-collatable type callers like bytea) may have NUL bytes in their strings;
    2023             :  * this will not work with any other collation, though.
    2024             :  */
    2025             : void
    2026      100468 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
    2027             : {
    2028      100468 :     bool        abbreviate = ssup->abbreviate;
    2029      100468 :     bool        collate_c = false;
    2030             :     VarStringSortSupport *sss;
    2031      100468 :     pg_locale_t locale = 0;
    2032             : 
    2033      100468 :     check_collation_set(collid);
    2034             : 
    2035             :     /*
    2036             :      * If possible, set ssup->comparator to a function which can be used to
    2037             :      * directly compare two datums.  If we can do this, we'll avoid the
    2038             :      * overhead of a trip through the fmgr layer for every comparison, which
    2039             :      * can be substantial.
    2040             :      *
    2041             :      * Most typically, we'll set the comparator to varlenafastcmp_locale,
    2042             :      * which uses strcoll() to perform comparisons.  We use that for the
    2043             :      * BpChar case too, but type NAME uses namefastcmp_locale. However, if
    2044             :      * LC_COLLATE = C, we can make things quite a bit faster with
    2045             :      * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
    2046             :      * memcmp() rather than strcoll().
    2047             :      */
    2048      100464 :     if (lc_collate_is_c(collid))
    2049             :     {
    2050       71264 :         if (typid == BPCHAROID)
    2051          16 :             ssup->comparator = bpcharfastcmp_c;
    2052       71248 :         else if (typid == NAMEOID)
    2053             :         {
    2054       51108 :             ssup->comparator = namefastcmp_c;
    2055             :             /* Not supporting abbreviation with type NAME, for now */
    2056       51108 :             abbreviate = false;
    2057             :         }
    2058             :         else
    2059       20140 :             ssup->comparator = varstrfastcmp_c;
    2060             : 
    2061       71264 :         collate_c = true;
    2062             :     }
    2063             :     else
    2064             :     {
    2065             :         /*
    2066             :          * We need a collation-sensitive comparison.  To make things faster,
    2067             :          * we'll figure out the collation based on the locale id and cache the
    2068             :          * result.
    2069             :          */
    2070       29200 :         if (collid != DEFAULT_COLLATION_OID)
    2071           0 :             locale = pg_newlocale_from_collation(collid);
    2072             : 
    2073             :         /*
    2074             :          * There is a further exception on Windows.  When the database
    2075             :          * encoding is UTF-8 and we are not using the C collation, complex
    2076             :          * hacks are required.  We don't currently have a comparator that
    2077             :          * handles that case, so we fall back on the slow method of having the
    2078             :          * sort code invoke bttextcmp() (in the case of text) via the fmgr
    2079             :          * trampoline.  ICU locales work just the same on Windows, however.
    2080             :          */
    2081             : #ifdef WIN32
    2082             :         if (GetDatabaseEncoding() == PG_UTF8 &&
    2083             :             !(locale && locale->provider == COLLPROVIDER_ICU))
    2084             :             return;
    2085             : #endif
    2086             : 
    2087             :         /*
    2088             :          * We use varlenafastcmp_locale except for type NAME.
    2089             :          */
    2090       29200 :         if (typid == NAMEOID)
    2091             :         {
    2092           0 :             ssup->comparator = namefastcmp_locale;
    2093             :             /* Not supporting abbreviation with type NAME, for now */
    2094           0 :             abbreviate = false;
    2095             :         }
    2096             :         else
    2097       29200 :             ssup->comparator = varlenafastcmp_locale;
    2098             :     }
    2099             : 
    2100             :     /*
    2101             :      * Unfortunately, it seems that abbreviation for non-C collations is
    2102             :      * broken on many common platforms; testing of multiple versions of glibc
    2103             :      * reveals that, for many locales, strcoll() and strxfrm() do not return
    2104             :      * consistent results, which is fatal to this optimization.  While no
    2105             :      * other libc other than Cygwin has so far been shown to have a problem,
    2106             :      * we take the conservative course of action for right now and disable
    2107             :      * this categorically.  (Users who are certain this isn't a problem on
    2108             :      * their system can define TRUST_STRXFRM.)
    2109             :      *
    2110             :      * Even apart from the risk of broken locales, it's possible that there
    2111             :      * are platforms where the use of abbreviated keys should be disabled at
    2112             :      * compile time.  Having only 4 byte datums could make worst-case
    2113             :      * performance drastically more likely, for example.  Moreover, macOS's
    2114             :      * strxfrm() implementation is known to not effectively concentrate a
    2115             :      * significant amount of entropy from the original string in earlier
    2116             :      * transformed blobs.  It's possible that other supported platforms are
    2117             :      * similarly encumbered.  So, if we ever get past disabling this
    2118             :      * categorically, we may still want or need to disable it for particular
    2119             :      * platforms.
    2120             :      */
    2121             : #ifndef TRUST_STRXFRM
    2122      100464 :     if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
    2123       29200 :         abbreviate = false;
    2124             : #endif
    2125             : 
    2126             :     /*
    2127             :      * If we're using abbreviated keys, or if we're using a locale-aware
    2128             :      * comparison, we need to initialize a VarStringSortSupport object. Both
    2129             :      * cases will make use of the temporary buffers we initialize here for
    2130             :      * scratch space (and to detect requirement for BpChar semantics from
    2131             :      * caller), and the abbreviation case requires additional state.
    2132             :      */
    2133      100464 :     if (abbreviate || !collate_c)
    2134             :     {
    2135       30652 :         sss = palloc(sizeof(VarStringSortSupport));
    2136       30652 :         sss->buf1 = palloc(TEXTBUFLEN);
    2137       30652 :         sss->buflen1 = TEXTBUFLEN;
    2138       30652 :         sss->buf2 = palloc(TEXTBUFLEN);
    2139       30652 :         sss->buflen2 = TEXTBUFLEN;
    2140             :         /* Start with invalid values */
    2141       30652 :         sss->last_len1 = -1;
    2142       30652 :         sss->last_len2 = -1;
    2143             :         /* Initialize */
    2144       30652 :         sss->last_returned = 0;
    2145       30652 :         sss->locale = locale;
    2146             : 
    2147             :         /*
    2148             :          * To avoid somehow confusing a strxfrm() blob and an original string,
    2149             :          * constantly keep track of the variety of data that buf1 and buf2
    2150             :          * currently contain.
    2151             :          *
    2152             :          * Comparisons may be interleaved with conversion calls.  Frequently,
    2153             :          * conversions and comparisons are batched into two distinct phases,
    2154             :          * but the correctness of caching cannot hinge upon this.  For
    2155             :          * comparison caching, buffer state is only trusted if cache_blob is
    2156             :          * found set to false, whereas strxfrm() caching only trusts the state
    2157             :          * when cache_blob is found set to true.
    2158             :          *
    2159             :          * Arbitrarily initialize cache_blob to true.
    2160             :          */
    2161       30652 :         sss->cache_blob = true;
    2162       30652 :         sss->collate_c = collate_c;
    2163       30652 :         sss->typid = typid;
    2164       30652 :         ssup->ssup_extra = sss;
    2165             : 
    2166             :         /*
    2167             :          * If possible, plan to use the abbreviated keys optimization.  The
    2168             :          * core code may switch back to authoritative comparator should
    2169             :          * abbreviation be aborted.
    2170             :          */
    2171       30652 :         if (abbreviate)
    2172             :         {
    2173        1452 :             sss->prop_card = 0.20;
    2174        1452 :             initHyperLogLog(&sss->abbr_card, 10);
    2175        1452 :             initHyperLogLog(&sss->full_card, 10);
    2176        1452 :             ssup->abbrev_full_comparator = ssup->comparator;
    2177        1452 :             ssup->comparator = varstrcmp_abbrev;
    2178        1452 :             ssup->abbrev_converter = varstr_abbrev_convert;
    2179        1452 :             ssup->abbrev_abort = varstr_abbrev_abort;
    2180             :         }
    2181             :     }
    2182      100464 : }
    2183             : 
    2184             : /*
    2185             :  * sortsupport comparison func (for C locale case)
    2186             :  */
    2187             : static int
    2188    61558648 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
    2189             : {
    2190    61558648 :     VarString  *arg1 = DatumGetVarStringPP(x);
    2191    61558648 :     VarString  *arg2 = DatumGetVarStringPP(y);
    2192             :     char       *a1p,
    2193             :                *a2p;
    2194             :     int         len1,
    2195             :                 len2,
    2196             :                 result;
    2197             : 
    2198    61558648 :     a1p = VARDATA_ANY(arg1);
    2199    61558648 :     a2p = VARDATA_ANY(arg2);
    2200             : 
    2201    61558648 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    2202    61558648 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    2203             : 
    2204    61558648 :     result = memcmp(a1p, a2p, Min(len1, len2));
    2205    61558648 :     if ((result == 0) && (len1 != len2))
    2206     1224756 :         result = (len1 < len2) ? -1 : 1;
    2207             : 
    2208             :     /* We can't afford to leak memory here. */
    2209    61558648 :     if (PointerGetDatum(arg1) != x)
    2210           0 :         pfree(arg1);
    2211    61558648 :     if (PointerGetDatum(arg2) != y)
    2212           0 :         pfree(arg2);
    2213             : 
    2214    61558648 :     return result;
    2215             : }
    2216             : 
    2217             : /*
    2218             :  * sortsupport comparison func (for BpChar C locale case)
    2219             :  *
    2220             :  * BpChar outsources its sortsupport to this module.  Specialization for the
    2221             :  * varstr_sortsupport BpChar case, modeled on
    2222             :  * internal_bpchar_pattern_compare().
    2223             :  */
    2224             : static int
    2225          16 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
    2226             : {
    2227          16 :     BpChar     *arg1 = DatumGetBpCharPP(x);
    2228          16 :     BpChar     *arg2 = DatumGetBpCharPP(y);
    2229             :     char       *a1p,
    2230             :                *a2p;
    2231             :     int         len1,
    2232             :                 len2,
    2233             :                 result;
    2234             : 
    2235          16 :     a1p = VARDATA_ANY(arg1);
    2236          16 :     a2p = VARDATA_ANY(arg2);
    2237             : 
    2238          16 :     len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
    2239          16 :     len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
    2240             : 
    2241          16 :     result = memcmp(a1p, a2p, Min(len1, len2));
    2242          16 :     if ((result == 0) && (len1 != len2))
    2243           0 :         result = (len1 < len2) ? -1 : 1;
    2244             : 
    2245             :     /* We can't afford to leak memory here. */
    2246          16 :     if (PointerGetDatum(arg1) != x)
    2247           0 :         pfree(arg1);
    2248          16 :     if (PointerGetDatum(arg2) != y)
    2249           0 :         pfree(arg2);
    2250             : 
    2251          16 :     return result;
    2252             : }
    2253             : 
    2254             : /*
    2255             :  * sortsupport comparison func (for NAME C locale case)
    2256             :  */
    2257             : static int
    2258    75082870 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
    2259             : {
    2260    75082870 :     Name        arg1 = DatumGetName(x);
    2261    75082870 :     Name        arg2 = DatumGetName(y);
    2262             : 
    2263    75082870 :     return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
    2264             : }
    2265             : 
    2266             : /*
    2267             :  * sortsupport comparison func (for locale case with all varlena types)
    2268             :  */
    2269             : static int
    2270    25153588 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
    2271             : {
    2272    25153588 :     VarString  *arg1 = DatumGetVarStringPP(x);
    2273    25153588 :     VarString  *arg2 = DatumGetVarStringPP(y);
    2274             :     char       *a1p,
    2275             :                *a2p;
    2276             :     int         len1,
    2277             :                 len2,
    2278             :                 result;
    2279             : 
    2280    25153588 :     a1p = VARDATA_ANY(arg1);
    2281    25153588 :     a2p = VARDATA_ANY(arg2);
    2282             : 
    2283    25153588 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    2284    25153588 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    2285             : 
    2286    25153588 :     result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
    2287             : 
    2288             :     /* We can't afford to leak memory here. */
    2289    25153588 :     if (PointerGetDatum(arg1) != x)
    2290           0 :         pfree(arg1);
    2291    25153588 :     if (PointerGetDatum(arg2) != y)
    2292           0 :         pfree(arg2);
    2293             : 
    2294    25153588 :     return result;
    2295             : }
    2296             : 
    2297             : /*
    2298             :  * sortsupport comparison func (for locale case with NAME type)
    2299             :  */
    2300             : static int
    2301           0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
    2302             : {
    2303           0 :     Name        arg1 = DatumGetName(x);
    2304           0 :     Name        arg2 = DatumGetName(y);
    2305             : 
    2306           0 :     return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
    2307           0 :                                 NameStr(*arg2), strlen(NameStr(*arg2)),
    2308             :                                 ssup);
    2309             : }
    2310             : 
    2311             : /*
    2312             :  * sortsupport comparison func for locale cases
    2313             :  */
    2314             : static int
    2315    25153588 : varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
    2316             : {
    2317    25153588 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2318             :     int         result;
    2319             :     bool        arg1_match;
    2320             : 
    2321             :     /* Fast pre-check for equality, as discussed in varstr_cmp() */
    2322    25153588 :     if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
    2323             :     {
    2324             :         /*
    2325             :          * No change in buf1 or buf2 contents, so avoid changing last_len1 or
    2326             :          * last_len2.  Existing contents of buffers might still be used by
    2327             :          * next call.
    2328             :          *
    2329             :          * It's fine to allow the comparison of BpChar padding bytes here,
    2330             :          * even though that implies that the memcmp() will usually be
    2331             :          * performed for BpChar callers (though multibyte characters could
    2332             :          * still prevent that from occurring).  The memcmp() is still very
    2333             :          * cheap, and BpChar's funny semantics have us remove trailing spaces
    2334             :          * (not limited to padding), so we need make no distinction between
    2335             :          * padding space characters and "real" space characters.
    2336             :          */
    2337     9289352 :         return 0;
    2338             :     }
    2339             : 
    2340    15864236 :     if (sss->typid == BPCHAROID)
    2341             :     {
    2342             :         /* Get true number of bytes, ignoring trailing spaces */
    2343       32972 :         len1 = bpchartruelen(a1p, len1);
    2344       32972 :         len2 = bpchartruelen(a2p, len2);
    2345             :     }
    2346             : 
    2347    15864236 :     if (len1 >= sss->buflen1)
    2348             :     {
    2349           0 :         pfree(sss->buf1);
    2350           0 :         sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    2351           0 :         sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
    2352             :     }
    2353    15864236 :     if (len2 >= sss->buflen2)
    2354             :     {
    2355           0 :         pfree(sss->buf2);
    2356           0 :         sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
    2357           0 :         sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
    2358             :     }
    2359             : 
    2360             :     /*
    2361             :      * We're likely to be asked to compare the same strings repeatedly, and
    2362             :      * memcmp() is so much cheaper than strcoll() that it pays to try to cache
    2363             :      * comparisons, even though in general there is no reason to think that
    2364             :      * that will work out (every string datum may be unique).  Caching does
    2365             :      * not slow things down measurably when it doesn't work out, and can speed
    2366             :      * things up by rather a lot when it does.  In part, this is because the
    2367             :      * memcmp() compares data from cachelines that are needed in L1 cache even
    2368             :      * when the last comparison's result cannot be reused.
    2369             :      */
    2370    15864236 :     arg1_match = true;
    2371    15864236 :     if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
    2372             :     {
    2373    14048476 :         arg1_match = false;
    2374    14048476 :         memcpy(sss->buf1, a1p, len1);
    2375    14048476 :         sss->buf1[len1] = '\0';
    2376    14048476 :         sss->last_len1 = len1;
    2377             :     }
    2378             : 
    2379             :     /*
    2380             :      * If we're comparing the same two strings as last time, we can return the
    2381             :      * same answer without calling strcoll() again.  This is more likely than
    2382             :      * it seems (at least with moderate to low cardinality sets), because
    2383             :      * quicksort compares the same pivot against many values.
    2384             :      */
    2385    15864236 :     if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
    2386             :     {
    2387     2596454 :         memcpy(sss->buf2, a2p, len2);
    2388     2596454 :         sss->buf2[len2] = '\0';
    2389     2596454 :         sss->last_len2 = len2;
    2390             :     }
    2391    13267782 :     else if (arg1_match && !sss->cache_blob)
    2392             :     {
    2393             :         /* Use result cached following last actual strcoll() call */
    2394     1562694 :         return sss->last_returned;
    2395             :     }
    2396             : 
    2397    14301542 :     if (sss->locale)
    2398             :     {
    2399           0 :         if (sss->locale->provider == COLLPROVIDER_ICU)
    2400             :         {
    2401             : #ifdef USE_ICU
    2402             : #ifdef HAVE_UCOL_STRCOLLUTF8
    2403             :             if (GetDatabaseEncoding() == PG_UTF8)
    2404             :             {
    2405             :                 UErrorCode  status;
    2406             : 
    2407             :                 status = U_ZERO_ERROR;
    2408             :                 result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
    2409             :                                           a1p, len1,
    2410             :                                           a2p, len2,
    2411             :                                           &status);
    2412             :                 if (U_FAILURE(status))
    2413             :                     ereport(ERROR,
    2414             :                             (errmsg("collation failed: %s", u_errorName(status))));
    2415             :             }
    2416             :             else
    2417             : #endif
    2418             :             {
    2419             :                 int32_t     ulen1,
    2420             :                             ulen2;
    2421             :                 UChar      *uchar1,
    2422             :                            *uchar2;
    2423             : 
    2424             :                 ulen1 = icu_to_uchar(&uchar1, a1p, len1);
    2425             :                 ulen2 = icu_to_uchar(&uchar2, a2p, len2);
    2426             : 
    2427             :                 result = ucol_strcoll(sss->locale->info.icu.ucol,
    2428             :                                       uchar1, ulen1,
    2429             :                                       uchar2, ulen2);
    2430             : 
    2431             :                 pfree(uchar1);
    2432             :                 pfree(uchar2);
    2433             :             }
    2434             : #else                           /* not USE_ICU */
    2435             :             /* shouldn't happen */
    2436           0 :             elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
    2437             : #endif                          /* not USE_ICU */
    2438             :         }
    2439             :         else
    2440             :         {
    2441             : #ifdef HAVE_LOCALE_T
    2442           0 :             result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
    2443             : #else
    2444             :             /* shouldn't happen */
    2445             :             elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
    2446             : #endif
    2447             :         }
    2448             :     }
    2449             :     else
    2450    14301542 :         result = strcoll(sss->buf1, sss->buf2);
    2451             : 
    2452             :     /* Break tie if necessary. */
    2453    14301542 :     if (result == 0 &&
    2454           0 :         (!sss->locale || sss->locale->deterministic))
    2455           0 :         result = strcmp(sss->buf1, sss->buf2);
    2456             : 
    2457             :     /* Cache result, perhaps saving an expensive strcoll() call next time */
    2458    14301542 :     sss->cache_blob = false;
    2459    14301542 :     sss->last_returned = result;
    2460    14301542 :     return result;
    2461             : }
    2462             : 
    2463             : /*
    2464             :  * Abbreviated key comparison func
    2465             :  */
    2466             : static int
    2467     3528180 : varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
    2468             : {
    2469             :     /*
    2470             :      * When 0 is returned, the core system will call varstrfastcmp_c()
    2471             :      * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale().  Even a
    2472             :      * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
    2473             :      * authoritatively, for the same reason that there is a strcoll()
    2474             :      * tie-breaker call to strcmp() in varstr_cmp().
    2475             :      */
    2476     3528180 :     if (x > y)
    2477     1538364 :         return 1;
    2478     1989816 :     else if (x == y)
    2479      527810 :         return 0;
    2480             :     else
    2481     1462006 :         return -1;
    2482             : }
    2483             : 
    2484             : /*
    2485             :  * Conversion routine for sortsupport.  Converts original to abbreviated key
    2486             :  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
    2487             :  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
    2488             :  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
    2489             :  * locale is used, or in case of bytea, just memcpy() from original instead.
    2490             :  */
    2491             : static Datum
    2492      354970 : varstr_abbrev_convert(Datum original, SortSupport ssup)
    2493             : {
    2494      354970 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2495      354970 :     VarString  *authoritative = DatumGetVarStringPP(original);
    2496      354970 :     char       *authoritative_data = VARDATA_ANY(authoritative);
    2497             : 
    2498             :     /* working state */
    2499             :     Datum       res;
    2500             :     char       *pres;
    2501             :     int         len;
    2502             :     uint32      hash;
    2503             : 
    2504      354970 :     pres = (char *) &res;
    2505             :     /* memset(), so any non-overwritten bytes are NUL */
    2506      354970 :     memset(pres, 0, sizeof(Datum));
    2507      354970 :     len = VARSIZE_ANY_EXHDR(authoritative);
    2508             : 
    2509             :     /* Get number of bytes, ignoring trailing spaces */
    2510      354970 :     if (sss->typid == BPCHAROID)
    2511           0 :         len = bpchartruelen(authoritative_data, len);
    2512             : 
    2513             :     /*
    2514             :      * If we're using the C collation, use memcpy(), rather than strxfrm(), to
    2515             :      * abbreviate keys.  The full comparator for the C locale is always
    2516             :      * memcmp().  It would be incorrect to allow bytea callers (callers that
    2517             :      * always force the C collation -- bytea isn't a collatable type, but this
    2518             :      * approach is convenient) to use strxfrm().  This is because bytea
    2519             :      * strings may contain NUL bytes.  Besides, this should be faster, too.
    2520             :      *
    2521             :      * More generally, it's okay that bytea callers can have NUL bytes in
    2522             :      * strings because varstrcmp_abbrev() need not make a distinction between
    2523             :      * terminating NUL bytes, and NUL bytes representing actual NULs in the
    2524             :      * authoritative representation.  Hopefully a comparison at or past one
    2525             :      * abbreviated key's terminating NUL byte will resolve the comparison
    2526             :      * without consulting the authoritative representation; specifically, some
    2527             :      * later non-NUL byte in the longer string can resolve the comparison
    2528             :      * against a subsequent terminating NUL in the shorter string.  There will
    2529             :      * usually be what is effectively a "length-wise" resolution there and
    2530             :      * then.
    2531             :      *
    2532             :      * If that doesn't work out -- if all bytes in the longer string
    2533             :      * positioned at or past the offset of the smaller string's (first)
    2534             :      * terminating NUL are actually representative of NUL bytes in the
    2535             :      * authoritative binary string (perhaps with some *terminating* NUL bytes
    2536             :      * towards the end of the longer string iff it happens to still be small)
    2537             :      * -- then an authoritative tie-breaker will happen, and do the right
    2538             :      * thing: explicitly consider string length.
    2539             :      */
    2540      354970 :     if (sss->collate_c)
    2541      354970 :         memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
    2542             :     else
    2543             :     {
    2544             :         Size        bsize;
    2545             : #ifdef USE_ICU
    2546             :         int32_t     ulen = -1;
    2547             :         UChar      *uchar = NULL;
    2548             : #endif
    2549             : 
    2550             :         /*
    2551             :          * We're not using the C collation, so fall back on strxfrm or ICU
    2552             :          * analogs.
    2553             :          */
    2554             : 
    2555             :         /* By convention, we use buffer 1 to store and NUL-terminate */
    2556           0 :         if (len >= sss->buflen1)
    2557             :         {
    2558           0 :             pfree(sss->buf1);
    2559           0 :             sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    2560           0 :             sss->buf1 = palloc(sss->buflen1);
    2561             :         }
    2562             : 
    2563             :         /* Might be able to reuse strxfrm() blob from last call */
    2564           0 :         if (sss->last_len1 == len && sss->cache_blob &&
    2565           0 :             memcmp(sss->buf1, authoritative_data, len) == 0)
    2566             :         {
    2567           0 :             memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
    2568             :             /* No change affecting cardinality, so no hashing required */
    2569           0 :             goto done;
    2570             :         }
    2571             : 
    2572           0 :         memcpy(sss->buf1, authoritative_data, len);
    2573             : 
    2574             :         /*
    2575             :          * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
    2576             :          * necessary for ICU, but doesn't hurt.
    2577             :          */
    2578           0 :         sss->buf1[len] = '\0';
    2579           0 :         sss->last_len1 = len;
    2580             : 
    2581             : #ifdef USE_ICU
    2582             :         /* When using ICU and not UTF8, convert string to UChar. */
    2583             :         if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
    2584             :             GetDatabaseEncoding() != PG_UTF8)
    2585             :             ulen = icu_to_uchar(&uchar, sss->buf1, len);
    2586             : #endif
    2587             : 
    2588             :         /*
    2589             :          * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
    2590             :          * and try again.  Both of these functions have the result buffer
    2591             :          * content undefined if the result did not fit, so we need to retry
    2592             :          * until everything fits, even though we only need the first few bytes
    2593             :          * in the end.  When using ucol_nextSortKeyPart(), however, we only
    2594             :          * ask for as many bytes as we actually need.
    2595             :          */
    2596             :         for (;;)
    2597             :         {
    2598             : #ifdef USE_ICU
    2599             :             if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
    2600             :             {
    2601             :                 /*
    2602             :                  * When using UTF8, use the iteration interface so we only
    2603             :                  * need to produce as many bytes as we actually need.
    2604             :                  */
    2605             :                 if (GetDatabaseEncoding() == PG_UTF8)
    2606             :                 {
    2607             :                     UCharIterator iter;
    2608             :                     uint32_t    state[2];
    2609             :                     UErrorCode  status;
    2610             : 
    2611             :                     uiter_setUTF8(&iter, sss->buf1, len);
    2612             :                     state[0] = state[1] = 0;    /* won't need that again */
    2613             :                     status = U_ZERO_ERROR;
    2614             :                     bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
    2615             :                                                  &iter,
    2616             :                                                  state,
    2617             :                                                  (uint8_t *) sss->buf2,
    2618             :                                                  Min(sizeof(Datum), sss->buflen2),
    2619             :                                                  &status);
    2620             :                     if (U_FAILURE(status))
    2621             :                         ereport(ERROR,
    2622             :                                 (errmsg("sort key generation failed: %s",
    2623             :                                         u_errorName(status))));
    2624             :                 }
    2625             :                 else
    2626             :                     bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
    2627             :                                             uchar, ulen,
    2628             :                                             (uint8_t *) sss->buf2, sss->buflen2);
    2629             :             }
    2630             :             else
    2631             : #endif
    2632             : #ifdef HAVE_LOCALE_T
    2633           0 :             if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
    2634           0 :                 bsize = strxfrm_l(sss->buf2, sss->buf1,
    2635           0 :                                   sss->buflen2, sss->locale->info.lt);
    2636             :             else
    2637             : #endif
    2638           0 :                 bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
    2639             : 
    2640           0 :             sss->last_len2 = bsize;
    2641           0 :             if (bsize < sss->buflen2)
    2642           0 :                 break;
    2643             : 
    2644             :             /*
    2645             :              * Grow buffer and retry.
    2646             :              */
    2647           0 :             pfree(sss->buf2);
    2648           0 :             sss->buflen2 = Max(bsize + 1,
    2649             :                                Min(sss->buflen2 * 2, MaxAllocSize));
    2650           0 :             sss->buf2 = palloc(sss->buflen2);
    2651             :         }
    2652             : 
    2653             :         /*
    2654             :          * Every Datum byte is always compared.  This is safe because the
    2655             :          * strxfrm() blob is itself NUL terminated, leaving no danger of
    2656             :          * misinterpreting any NUL bytes not intended to be interpreted as
    2657             :          * logically representing termination.
    2658             :          *
    2659             :          * (Actually, even if there were NUL bytes in the blob it would be
    2660             :          * okay.  See remarks on bytea case above.)
    2661             :          */
    2662           0 :         memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
    2663             : 
    2664             : #ifdef USE_ICU
    2665             :         if (uchar)
    2666             :             pfree(uchar);
    2667             : #endif
    2668             :     }
    2669             : 
    2670             :     /*
    2671             :      * Maintain approximate cardinality of both abbreviated keys and original,
    2672             :      * authoritative keys using HyperLogLog.  Used as cheap insurance against
    2673             :      * the worst case, where we do many string transformations for no saving
    2674             :      * in full strcoll()-based comparisons.  These statistics are used by
    2675             :      * varstr_abbrev_abort().
    2676             :      *
    2677             :      * First, Hash key proper, or a significant fraction of it.  Mix in length
    2678             :      * in order to compensate for cases where differences are past
    2679             :      * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
    2680             :      */
    2681      354970 :     hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
    2682             :                                    Min(len, PG_CACHE_LINE_SIZE)));
    2683             : 
    2684      354970 :     if (len > PG_CACHE_LINE_SIZE)
    2685           8 :         hash ^= DatumGetUInt32(hash_uint32((uint32) len));
    2686             : 
    2687      354970 :     addHyperLogLog(&sss->full_card, hash);
    2688             : 
    2689             :     /* Hash abbreviated key */
    2690             : #if SIZEOF_DATUM == 8
    2691             :     {
    2692             :         uint32      lohalf,
    2693             :                     hihalf;
    2694             : 
    2695      354970 :         lohalf = (uint32) res;
    2696      354970 :         hihalf = (uint32) (res >> 32);
    2697      354970 :         hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
    2698             :     }
    2699             : #else                           /* SIZEOF_DATUM != 8 */
    2700             :     hash = DatumGetUInt32(hash_uint32((uint32) res));
    2701             : #endif
    2702             : 
    2703      354970 :     addHyperLogLog(&sss->abbr_card, hash);
    2704             : 
    2705             :     /* Cache result, perhaps saving an expensive strxfrm() call next time */
    2706      354970 :     sss->cache_blob = true;
    2707      354970 : done:
    2708             : 
    2709             :     /*
    2710             :      * Byteswap on little-endian machines.
    2711             :      *
    2712             :      * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
    2713             :      * comparator) works correctly on all platforms.  If we didn't do this,
    2714             :      * the comparator would have to call memcmp() with a pair of pointers to
    2715             :      * the first byte of each abbreviated key, which is slower.
    2716             :      */
    2717      354970 :     res = DatumBigEndianToNative(res);
    2718             : 
    2719             :     /* Don't leak memory here */
    2720      354970 :     if (PointerGetDatum(authoritative) != original)
    2721           0 :         pfree(authoritative);
    2722             : 
    2723      354970 :     return res;
    2724             : }
    2725             : 
    2726             : /*
    2727             :  * Callback for estimating effectiveness of abbreviated key optimization, using
    2728             :  * heuristic rules.  Returns value indicating if the abbreviation optimization
    2729             :  * should be aborted, based on its projected effectiveness.
    2730             :  */
    2731             : static bool
    2732         964 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
    2733             : {
    2734         964 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2735             :     double      abbrev_distinct,
    2736             :                 key_distinct;
    2737             : 
    2738             :     Assert(ssup->abbreviate);
    2739             : 
    2740             :     /* Have a little patience */
    2741         964 :     if (memtupcount < 100)
    2742         464 :         return false;
    2743             : 
    2744         500 :     abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
    2745         500 :     key_distinct = estimateHyperLogLog(&sss->full_card);
    2746             : 
    2747             :     /*
    2748             :      * Clamp cardinality estimates to at least one distinct value.  While
    2749             :      * NULLs are generally disregarded, if only NULL values were seen so far,
    2750             :      * that might misrepresent costs if we failed to clamp.
    2751             :      */
    2752         500 :     if (abbrev_distinct <= 1.0)
    2753           0 :         abbrev_distinct = 1.0;
    2754             : 
    2755         500 :     if (key_distinct <= 1.0)
    2756           0 :         key_distinct = 1.0;
    2757             : 
    2758             :     /*
    2759             :      * In the worst case all abbreviated keys are identical, while at the same
    2760             :      * time there are differences within full key strings not captured in
    2761             :      * abbreviations.
    2762             :      */
    2763             : #ifdef TRACE_SORT
    2764         500 :     if (trace_sort)
    2765             :     {
    2766           0 :         double      norm_abbrev_card = abbrev_distinct / (double) memtupcount;
    2767             : 
    2768           0 :         elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
    2769             :              "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
    2770             :              memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
    2771             :              sss->prop_card);
    2772             :     }
    2773             : #endif
    2774             : 
    2775             :     /*
    2776             :      * If the number of distinct abbreviated keys approximately matches the
    2777             :      * number of distinct authoritative original keys, that's reason enough to
    2778             :      * proceed.  We can win even with a very low cardinality set if most
    2779             :      * tie-breakers only memcmp().  This is by far the most important
    2780             :      * consideration.
    2781             :      *
    2782             :      * While comparisons that are resolved at the abbreviated key level are
    2783             :      * considerably cheaper than tie-breakers resolved with memcmp(), both of
    2784             :      * those two outcomes are so much cheaper than a full strcoll() once
    2785             :      * sorting is underway that it doesn't seem worth it to weigh abbreviated
    2786             :      * cardinality against the overall size of the set in order to more
    2787             :      * accurately model costs.  Assume that an abbreviated comparison, and an
    2788             :      * abbreviated comparison with a cheap memcmp()-based authoritative
    2789             :      * resolution are equivalent.
    2790             :      */
    2791         500 :     if (abbrev_distinct > key_distinct * sss->prop_card)
    2792             :     {
    2793             :         /*
    2794             :          * When we have exceeded 10,000 tuples, decay required cardinality
    2795             :          * aggressively for next call.
    2796             :          *
    2797             :          * This is useful because the number of comparisons required on
    2798             :          * average increases at a linearithmic rate, and at roughly 10,000
    2799             :          * tuples that factor will start to dominate over the linear costs of
    2800             :          * string transformation (this is a conservative estimate).  The decay
    2801             :          * rate is chosen to be a little less aggressive than halving -- which
    2802             :          * (since we're called at points at which memtupcount has doubled)
    2803             :          * would never see the cost model actually abort past the first call
    2804             :          * following a decay.  This decay rate is mostly a precaution against
    2805             :          * a sudden, violent swing in how well abbreviated cardinality tracks
    2806             :          * full key cardinality.  The decay also serves to prevent a marginal
    2807             :          * case from being aborted too late, when too much has already been
    2808             :          * invested in string transformation.
    2809             :          *
    2810             :          * It's possible for sets of several million distinct strings with
    2811             :          * mere tens of thousands of distinct abbreviated keys to still
    2812             :          * benefit very significantly.  This will generally occur provided
    2813             :          * each abbreviated key is a proxy for a roughly uniform number of the
    2814             :          * set's full keys. If it isn't so, we hope to catch that early and
    2815             :          * abort.  If it isn't caught early, by the time the problem is
    2816             :          * apparent it's probably not worth aborting.
    2817             :          */
    2818         500 :         if (memtupcount > 10000)
    2819           0 :             sss->prop_card *= 0.65;
    2820             : 
    2821         500 :         return false;
    2822             :     }
    2823             : 
    2824             :     /*
    2825             :      * Abort abbreviation strategy.
    2826             :      *
    2827             :      * The worst case, where all abbreviated keys are identical while all
    2828             :      * original strings differ will typically only see a regression of about
    2829             :      * 10% in execution time for small to medium sized lists of strings.
    2830             :      * Whereas on modern CPUs where cache stalls are the dominant cost, we can
    2831             :      * often expect very large improvements, particularly with sets of strings
    2832             :      * of moderately high to high abbreviated cardinality.  There is little to
    2833             :      * lose but much to gain, which our strategy reflects.
    2834             :      */
    2835             : #ifdef TRACE_SORT
    2836           0 :     if (trace_sort)
    2837           0 :         elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
    2838             :              "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
    2839             :              memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
    2840             : #endif
    2841             : 
    2842           0 :     return true;
    2843             : }
    2844             : 
    2845             : /*
    2846             :  * Generic equalimage support function for character type's operator classes.
    2847             :  * Disables the use of deduplication with nondeterministic collations.
    2848             :  */
    2849             : Datum
    2850        1906 : btvarstrequalimage(PG_FUNCTION_ARGS)
    2851             : {
    2852             :     /* Oid      opcintype = PG_GETARG_OID(0); */
    2853        1906 :     Oid         collid = PG_GET_COLLATION();
    2854             : 
    2855        1906 :     check_collation_set(collid);
    2856             : 
    2857        1906 :     if (lc_collate_is_c(collid) ||
    2858           0 :         collid == DEFAULT_COLLATION_OID ||
    2859           0 :         get_collation_isdeterministic(collid))
    2860        1906 :         PG_RETURN_BOOL(true);
    2861             :     else
    2862           0 :         PG_RETURN_BOOL(false);
    2863             : }
    2864             : 
    2865             : Datum
    2866      137394 : text_larger(PG_FUNCTION_ARGS)
    2867             : {
    2868      137394 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2869      137394 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2870             :     text       *result;
    2871             : 
    2872      137394 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
    2873             : 
    2874      137394 :     PG_RETURN_TEXT_P(result);
    2875             : }
    2876             : 
    2877             : Datum
    2878       41072 : text_smaller(PG_FUNCTION_ARGS)
    2879             : {
    2880       41072 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2881       41072 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2882             :     text       *result;
    2883             : 
    2884       41072 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
    2885             : 
    2886       41072 :     PG_RETURN_TEXT_P(result);
    2887             : }
    2888             : 
    2889             : 
    2890             : /*
    2891             :  * Cross-type comparison functions for types text and name.
    2892             :  */
    2893             : 
    2894             : Datum
    2895      105138 : nameeqtext(PG_FUNCTION_ARGS)
    2896             : {
    2897      105138 :     Name        arg1 = PG_GETARG_NAME(0);
    2898      105138 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2899      105138 :     size_t      len1 = strlen(NameStr(*arg1));
    2900      105138 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2901      105138 :     Oid         collid = PG_GET_COLLATION();
    2902             :     bool        result;
    2903             : 
    2904      105138 :     check_collation_set(collid);
    2905             : 
    2906      105138 :     if (collid == C_COLLATION_OID)
    2907      181814 :         result = (len1 == len2 &&
    2908       85452 :                   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2909             :     else
    2910        8776 :         result = (varstr_cmp(NameStr(*arg1), len1,
    2911        8776 :                              VARDATA_ANY(arg2), len2,
    2912             :                              collid) == 0);
    2913             : 
    2914      105138 :     PG_FREE_IF_COPY(arg2, 1);
    2915             : 
    2916      105138 :     PG_RETURN_BOOL(result);
    2917             : }
    2918             : 
    2919             : Datum
    2920         256 : texteqname(PG_FUNCTION_ARGS)
    2921             : {
    2922         256 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2923         256 :     Name        arg2 = PG_GETARG_NAME(1);
    2924         256 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2925         256 :     size_t      len2 = strlen(NameStr(*arg2));
    2926         256 :     Oid         collid = PG_GET_COLLATION();
    2927             :     bool        result;
    2928             : 
    2929         256 :     check_collation_set(collid);
    2930             : 
    2931         256 :     if (collid == C_COLLATION_OID)
    2932         376 :         result = (len1 == len2 &&
    2933         120 :                   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2934             :     else
    2935           0 :         result = (varstr_cmp(VARDATA_ANY(arg1), len1,
    2936           0 :                              NameStr(*arg2), len2,
    2937             :                              collid) == 0);
    2938             : 
    2939         256 :     PG_FREE_IF_COPY(arg1, 0);
    2940             : 
    2941         256 :     PG_RETURN_BOOL(result);
    2942             : }
    2943             : 
    2944             : Datum
    2945           0 : namenetext(PG_FUNCTION_ARGS)
    2946             : {
    2947           0 :     Name        arg1 = PG_GETARG_NAME(0);
    2948           0 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2949           0 :     size_t      len1 = strlen(NameStr(*arg1));
    2950           0 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2951           0 :     Oid         collid = PG_GET_COLLATION();
    2952             :     bool        result;
    2953             : 
    2954           0 :     check_collation_set(collid);
    2955             : 
    2956           0 :     if (collid == C_COLLATION_OID)
    2957           0 :         result = !(len1 == len2 &&
    2958           0 :                    memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2959             :     else
    2960           0 :         result = !(varstr_cmp(NameStr(*arg1), len1,
    2961           0 :                               VARDATA_ANY(arg2), len2,
    2962             :                               collid) == 0);
    2963             : 
    2964           0 :     PG_FREE_IF_COPY(arg2, 1);
    2965             : 
    2966           0 :     PG_RETURN_BOOL(result);
    2967             : }
    2968             : 
    2969             : Datum
    2970           0 : textnename(PG_FUNCTION_ARGS)
    2971             : {
    2972           0 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2973           0 :     Name        arg2 = PG_GETARG_NAME(1);
    2974           0 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2975           0 :     size_t      len2 = strlen(NameStr(*arg2));
    2976           0 :     Oid         collid = PG_GET_COLLATION();
    2977             :     bool        result;
    2978             : 
    2979           0 :     check_collation_set(collid);
    2980             : 
    2981           0 :     if (collid == C_COLLATION_OID)
    2982           0 :         result = !(len1 == len2 &&
    2983           0 :                    memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2984             :     else
    2985           0 :         result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
    2986           0 :                               NameStr(*arg2), len2,
    2987             :                               collid) == 0);
    2988             : 
    2989           0 :     PG_FREE_IF_COPY(arg1, 0);
    2990             : 
    2991           0 :     PG_RETURN_BOOL(result);
    2992             : }
    2993             : 
    2994             : Datum
    2995       70956 : btnametextcmp(PG_FUNCTION_ARGS)
    2996             : {
    2997       70956 :     Name        arg1 = PG_GETARG_NAME(0);
    2998       70956 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2999             :     int32       result;
    3000             : 
    3001      141912 :     result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
    3002      141912 :                         VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
    3003             :                         PG_GET_COLLATION());
    3004             : 
    3005       70956 :     PG_FREE_IF_COPY(arg2, 1);
    3006             : 
    3007       70956 :     PG_RETURN_INT32(result);
    3008             : }
    3009             : 
    3010             : Datum
    3011           0 : bttextnamecmp(PG_FUNCTION_ARGS)
    3012             : {
    3013           0 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    3014           0 :     Name        arg2 = PG_GETARG_NAME(1);
    3015             :     int32       result;
    3016             : 
    3017           0 :     result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
    3018           0 :                         NameStr(*arg2), strlen(NameStr(*arg2)),
    3019             :                         PG_GET_COLLATION());
    3020             : 
    3021           0 :     PG_FREE_IF_COPY(arg1, 0);
    3022             : 
    3023           0 :     PG_RETURN_INT32(result);
    3024             : }
    3025             : 
    3026             : #define CmpCall(cmpfunc) \
    3027             :     DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
    3028             :                                           PG_GET_COLLATION(), \
    3029             :                                           PG_GETARG_DATUM(0), \
    3030             :                                           PG_GETARG_DATUM(1)))
    3031             : 
    3032             : Datum
    3033       21838 : namelttext(PG_FUNCTION_ARGS)
    3034             : {
    3035       21838 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
    3036             : }
    3037             : 
    3038             : Datum
    3039           0 : nameletext(PG_FUNCTION_ARGS)
    3040             : {
    3041           0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
    3042             : }
    3043             : 
    3044             : Datum
    3045           0 : namegttext(PG_FUNCTION_ARGS)
    3046             : {
    3047           0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
    3048             : }
    3049             : 
    3050             : Datum
    3051       20506 : namegetext(PG_FUNCTION_ARGS)
    3052             : {
    3053       20506 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
    3054             : }
    3055             : 
    3056             : Datum
    3057           0 : textltname(PG_FUNCTION_ARGS)
    3058             : {
    3059           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
    3060             : }
    3061             : 
    3062             : Datum
    3063           0 : textlename(PG_FUNCTION_ARGS)
    3064             : {
    3065           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
    3066             : }
    3067             : 
    3068             : Datum
    3069           0 : textgtname(PG_FUNCTION_ARGS)
    3070             : {
    3071           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
    3072             : }
    3073             : 
    3074             : Datum
    3075           0 : textgename(PG_FUNCTION_ARGS)
    3076             : {
    3077           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
    3078             : }
    3079             : 
    3080             : #undef CmpCall
    3081             : 
    3082             : 
    3083             : /*
    3084             :  * The following operators support character-by-character comparison
    3085             :  * of text datums, to allow building indexes suitable for LIKE clauses.
    3086             :  * Note that the regular texteq/textne comparison operators, and regular
    3087             :  * support functions 1 and 2 with "C" collation are assumed to be
    3088             :  * compatible with these!
    3089             :  */
    3090             : 
    3091             : static int
    3092      101392 : internal_text_pattern_compare(text *arg1, text *arg2)
    3093             : {
    3094             :     int         result;
    3095             :     int         len1,
    3096             :                 len2;
    3097             : 
    3098      101392 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    3099      101392 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    3100             : 
    3101      101392 :     result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    3102      101392 :     if (result != 0)
    3103      101356 :         return result;
    3104          36 :     else if (len1 < len2)
    3105           0 :         return -1;
    3106          36 :     else if (len1 > len2)
    3107          12 :         return 1;
    3108             :     else
    3109          24 :         return 0;
    3110             : }
    3111             : 
    3112             : 
    3113             : Datum
    3114       26360 : text_pattern_lt(PG_FUNCTION_ARGS)
    3115             : {
    3116       26360 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    3117       26360 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    3118             :     int         result;
    3119             : 
    3120       26360 :     result = internal_text_pattern_compare(arg1, arg2);
    3121             : 
    3122       26360 :     PG_FREE_IF_COPY(arg1, 0);
    3123       26360 :     PG_FREE_IF_COPY(arg2, 1);
    3124             : 
    3125       26360 :     PG_RETURN_BOOL(result < 0);
    3126             : }
    3127             : 
    3128             : 
    3129             : Datum
    3130       25008 : text_pattern_le(PG_FUNCTION_ARGS)
    3131             : {
    3132       25008 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    3133       25008 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    3134             :     int         result;
    3135             : 
    3136       25008 :     result = internal_text_pattern_compare(arg1, arg2);
    3137             : 
    3138       25008 :     PG_FREE_IF_COPY(arg1, 0);
    3139       25008 :     PG_FREE_IF_COPY(arg2, 1);
    3140             : 
    3141       25008 :     PG_RETURN_BOOL(result <= 0);
    3142             : }
    3143             : 
    3144             : 
    3145             : Datum
    3146       25008 : text_pattern_ge(PG_FUNCTION_ARGS)
    3147             : {
    3148       25008 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    3149       25008 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    3150             :     int         result;
    3151             : 
    3152       25008 :     result = internal_text_pattern_compare(arg1, arg2);
    3153             : 
    3154       25008 :     PG_FREE_IF_COPY(arg1, 0);
    3155       25008 :     PG_FREE_IF_COPY(arg2, 1);
    3156             : 
    3157       25008 :     PG_RETURN_BOOL(result >= 0);
    3158             : }
    3159             : 
    3160             : 
    3161             : Datum
    3162       25008 : text_pattern_gt(PG_FUNCTION_ARGS)
    3163             : {
    3164       25008 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    3165       25008 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    3166             :     int         result;
    3167             : 
    3168       25008 :     result = internal_text_pattern_compare(arg1, arg2);
    3169             : 
    3170       25008 :     PG_FREE_IF_COPY(arg1, 0);
    3171       25008 :     PG_FREE_IF_COPY(arg2, 1);
    3172             : 
    3173       25008 :     PG_RETURN_BOOL(result > 0);
    3174             : }
    3175             : 
    3176             : 
    3177             : Datum
    3178           8 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
    3179             : {
    3180           8 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    3181           8 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    3182             :     int         result;
    3183             : 
    3184           8 :     result = internal_text_pattern_compare(arg1, arg2);
    3185             : 
    3186           8 :     PG_FREE_IF_COPY(arg1, 0);
    3187           8 :     PG_FREE_IF_COPY(arg2, 1);
    3188             : 
    3189           8 :     PG_RETURN_INT32(result);
    3190             : }
    3191             : 
    3192             : 
    3193             : Datum
    3194          78 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
    3195             : {
    3196          78 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    3197             :     MemoryContext oldcontext;
    3198             : 
    3199          78 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    3200             : 
    3201             :     /* Use generic string SortSupport, forcing "C" collation */
    3202          78 :     varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
    3203             : 
    3204          78 :     MemoryContextSwitchTo(oldcontext);
    3205             : 
    3206          78 :     PG_RETURN_VOID();
    3207             : }
    3208             : 
    3209             : 
    3210             : /*-------------------------------------------------------------
    3211             :  * byteaoctetlen
    3212             :  *
    3213             :  * get the number of bytes contained in an instance of type 'bytea'
    3214             :  *-------------------------------------------------------------
    3215             :  */
    3216             : Datum
    3217          26 : byteaoctetlen(PG_FUNCTION_ARGS)
    3218             : {
    3219          26 :     Datum       str = PG_GETARG_DATUM(0);
    3220             : 
    3221             :     /* We need not detoast the input at all */
    3222          26 :     PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
    3223             : }
    3224             : 
    3225             : /*
    3226             :  * byteacat -
    3227             :  *    takes two bytea* and returns a bytea* that is the concatenation of
    3228             :  *    the two.
    3229             :  *
    3230             :  * Cloned from textcat and modified as required.
    3231             :  */
    3232             : Datum
    3233           0 : byteacat(PG_FUNCTION_ARGS)
    3234             : {
    3235           0 :     bytea      *t1 = PG_GETARG_BYTEA_PP(0);
    3236           0 :     bytea      *t2 = PG_GETARG_BYTEA_PP(1);
    3237             : 
    3238           0 :     PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
    3239             : }
    3240             : 
    3241             : /*
    3242             :  * bytea_catenate
    3243             :  *  Guts of byteacat(), broken out so it can be used by other functions
    3244             :  *
    3245             :  * Arguments can be in short-header form, but not compressed or out-of-line
    3246             :  */
    3247             : static bytea *
    3248          24 : bytea_catenate(bytea *t1, bytea *t2)
    3249             : {
    3250             :     bytea      *result;
    3251             :     int         len1,
    3252             :                 len2,
    3253             :                 len;
    3254             :     char       *ptr;
    3255             : 
    3256          24 :     len1 = VARSIZE_ANY_EXHDR(t1);
    3257          24 :     len2 = VARSIZE_ANY_EXHDR(t2);
    3258             : 
    3259             :     /* paranoia ... probably should throw error instead? */
    3260          24 :     if (len1 < 0)
    3261           0 :         len1 = 0;
    3262          24 :     if (len2 < 0)
    3263           0 :         len2 = 0;
    3264             : 
    3265          24 :     len = len1 + len2 + VARHDRSZ;
    3266          24 :     result = (bytea *) palloc(len);
    3267             : 
    3268             :     /* Set size of result string... */
    3269          24 :     SET_VARSIZE(result, len);
    3270             : 
    3271             :     /* Fill data field of result string... */
    3272          24 :     ptr = VARDATA(result);
    3273          24 :     if (len1 > 0)
    3274          24 :         memcpy(ptr, VARDATA_ANY(t1), len1);
    3275          24 :     if (len2 > 0)
    3276          12 :         memcpy(ptr + len1, VARDATA_ANY(t2), len2);
    3277             : 
    3278          24 :     return result;
    3279             : }
    3280             : 
    3281             : #define PG_STR_GET_BYTEA(str_) \
    3282             :     DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
    3283             : 
    3284             : /*
    3285             :  * bytea_substr()
    3286             :  * Return a substring starting at the specified position.
    3287             :  * Cloned from text_substr and modified as required.
    3288             :  *
    3289             :  * Input:
    3290             :  *  - string
    3291             :  *  - starting position (is one-based)
    3292             :  *  - string length (optional)
    3293             :  *
    3294             :  * If the starting position is zero or less, then return from the start of the string
    3295             :  * adjusting the length to be consistent with the "negative start" per SQL.
    3296             :  * If the length is less than zero, an ERROR is thrown. If no third argument
    3297             :  * (length) is provided, the length to the end of the string is assumed.
    3298             :  */
    3299             : Datum
    3300          52 : bytea_substr(PG_FUNCTION_ARGS)
    3301             : {
    3302          52 :     PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
    3303             :                                       PG_GETARG_INT32(1),
    3304             :                                       PG_GETARG_INT32(2),
    3305             :                                       false));
    3306             : }
    3307             : 
    3308             : /*
    3309             :  * bytea_substr_no_len -
    3310             :  *    Wrapper to avoid opr_sanity failure due to
    3311             :  *    one function accepting a different number of args.
    3312             :  */
    3313             : Datum
    3314          20 : bytea_substr_no_len(PG_FUNCTION_ARGS)
    3315             : {
    3316          20 :     PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
    3317             :                                       PG_GETARG_INT32(1),
    3318             :                                       -1,
    3319             :                                       true));
    3320             : }
    3321             : 
    3322             : static bytea *
    3323          96 : bytea_substring(Datum str,
    3324             :                 int S,
    3325             :                 int L,
    3326             :                 bool length_not_specified)
    3327             : {
    3328             :     int32       S1;             /* adjusted start position */
    3329             :     int32       L1;             /* adjusted substring length */
    3330             :     int32       E;              /* end position */
    3331             : 
    3332             :     /*
    3333             :      * The logic here should generally match text_substring().
    3334             :      */
    3335          96 :     S1 = Max(S, 1);
    3336             : 
    3337          96 :     if (length_not_specified)
    3338             :     {
    3339             :         /*
    3340             :          * Not passed a length - DatumGetByteaPSlice() grabs everything to the
    3341             :          * end of the string if we pass it a negative value for length.
    3342             :          */
    3343          32 :         L1 = -1;
    3344             :     }
    3345          64 :     else if (L < 0)
    3346             :     {
    3347             :         /* SQL99 says to throw an error for E < S, i.e., negative length */
    3348           8 :         ereport(ERROR,
    3349             :                 (errcode(ERRCODE_SUBSTRING_ERROR),
    3350             :                  errmsg("negative substring length not allowed")));
    3351             :         L1 = -1;                /* silence stupider compilers */
    3352             :     }
    3353          56 :     else if (pg_add_s32_overflow(S, L, &E))
    3354             :     {
    3355             :         /*
    3356             :          * L could be large enough for S + L to overflow, in which case the
    3357             :          * substring must run to end of string.
    3358             :          */
    3359           4 :         L1 = -1;
    3360             :     }
    3361             :     else
    3362             :     {
    3363             :         /*
    3364             :          * A zero or negative value for the end position can happen if the
    3365             :          * start was negative or one. SQL99 says to return a zero-length
    3366             :          * string.
    3367             :          */
    3368          52 :         if (E < 1)
    3369           0 :             return PG_STR_GET_BYTEA("");
    3370             : 
    3371          52 :         L1 = E - S1;
    3372             :     }
    3373             : 
    3374             :     /*
    3375             :      * If the start position is past the end of the string, SQL99 says to
    3376             :      * return a zero-length string -- DatumGetByteaPSlice() will do that for
    3377             :      * us.  We need only convert S1 to zero-based starting position.
    3378             :      */
    3379          88 :     return DatumGetByteaPSlice(str, S1 - 1, L1);
    3380             : }
    3381             : 
    3382             : /*
    3383             :  * byteaoverlay
    3384             :  *  Replace specified substring of first string with second
    3385             :  *
    3386             :  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
    3387             :  * This code is a direct implementation of what the standard says.
    3388             :  */
    3389             : Datum
    3390           4 : byteaoverlay(PG_FUNCTION_ARGS)
    3391             : {
    3392           4 :     bytea      *t1 = PG_GETARG_BYTEA_PP(0);
    3393           4 :     bytea      *t2 = PG_GETARG_BYTEA_PP(1);
    3394           4 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
    3395           4 :     int         sl = PG_GETARG_INT32(3);    /* substring length */
    3396             : 
    3397           4 :     PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
    3398             : }
    3399             : 
    3400             : Datum
    3401           8 : byteaoverlay_no_len(PG_FUNCTION_ARGS)
    3402             : {
    3403           8 :     bytea      *t1 = PG_GETARG_BYTEA_PP(0);
    3404           8 :     bytea      *t2 = PG_GETARG_BYTEA_PP(1);
    3405           8 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
    3406             :     int         sl;
    3407             : 
    3408           8 :     sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
    3409           8 :     PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
    3410             : }
    3411             : 
    3412             : static bytea *
    3413          12 : bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
    3414             : {
    3415             :     bytea      *result;
    3416             :     bytea      *s1;
    3417             :     bytea      *s2;
    3418             :     int         sp_pl_sl;
    3419             : 
    3420             :     /*
    3421             :      * Check for possible integer-overflow cases.  For negative sp, throw a
    3422             :      * "substring length" error because that's what should be expected
    3423             :      * according to the spec's definition of OVERLAY().
    3424             :      */
    3425          12 :     if (sp <= 0)
    3426           0 :         ereport(ERROR,
    3427             :                 (errcode(ERRCODE_SUBSTRING_ERROR),
    3428             :                  errmsg("negative substring length not allowed")));
    3429          12 :     if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
    3430           0 :         ereport(ERROR,
    3431             :                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    3432             :                  errmsg("integer out of range")));
    3433             : 
    3434          12 :     s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
    3435          12 :     s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
    3436          12 :     result = bytea_catenate(s1, t2);
    3437          12 :     result = bytea_catenate(result, s2);
    3438             : 
    3439          12 :     return result;
    3440             : }
    3441             : 
    3442             : /*
    3443             :  * byteapos -
    3444             :  *    Return the position of the specified substring.
    3445             :  *    Implements the SQL POSITION() function.
    3446             :  * Cloned from textpos and modified as required.
    3447             :  */
    3448             : Datum
    3449           0 : byteapos(PG_FUNCTION_ARGS)
    3450             : {
    3451           0 :     bytea      *t1 = PG_GETARG_BYTEA_PP(0);
    3452           0 :     bytea      *t2 = PG_GETARG_BYTEA_PP(1);
    3453             :     int         pos;
    3454             :     int         px,
    3455             :                 p;
    3456             :     int         len1,
    3457             :                 len2;
    3458             :     char       *p1,
    3459             :                *p2;
    3460             : 
    3461           0 :     len1 = VARSIZE_ANY_EXHDR(t1);
    3462           0 :     len2 = VARSIZE_ANY_EXHDR(t2);
    3463             : 
    3464           0 :     if (len2 <= 0)
    3465           0 :         PG_RETURN_INT32(1);     /* result for empty pattern */
    3466             : 
    3467           0 :     p1 = VARDATA_ANY(t1);
    3468           0 :     p2 = VARDATA_ANY(t2);
    3469             : 
    3470           0 :     pos = 0;
    3471           0 :     px = (len1 - len2);
    3472           0 :     for (p = 0; p <= px; p++)
    3473             :     {
    3474           0 :         if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
    3475             :         {
    3476           0 :             pos = p + 1;
    3477           0 :             break;
    3478             :         };
    3479           0 :         p1++;
    3480             :     };
    3481             : 
    3482           0 :     PG_RETURN_INT32(pos);
    3483             : }
    3484             : 
    3485             : /*-------------------------------------------------------------
    3486             :  * byteaGetByte
    3487             :  *
    3488             :  * this routine treats "bytea" as an array of bytes.
    3489             :  * It returns the Nth byte (a number between 0 and 255).
    3490             :  *-------------------------------------------------------------
    3491             :  */
    3492             : Datum
    3493           8 : byteaGetByte(PG_FUNCTION_ARGS)
    3494             : {
    3495           8 :     bytea      *v = PG_GETARG_BYTEA_PP(0);
    3496           8 :     int32       n = PG_GETARG_INT32(1);
    3497             :     int         len;
    3498             :     int         byte;
    3499             : 
    3500           8 :     len = VARSIZE_ANY_EXHDR(v);
    3501             : 
    3502           8 :     if (n < 0 || n >= len)
    3503           4 :         ereport(ERROR,
    3504             :                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
    3505             :                  errmsg("index %d out of valid range, 0..%d",
    3506             :                         n, len - 1)));
    3507             : 
    3508           4 :     byte = ((unsigned char *) VARDATA_ANY(v))[n];
    3509             : 
    3510           4 :     PG_RETURN_INT32(byte);
    3511             : }
    3512             : 
    3513             : /*-------------------------------------------------------------
    3514             :  * byteaGetBit
    3515             :  *
    3516             :  * This routine treats a "bytea" type like an array of bits.
    3517             :  * It returns the value of the Nth bit (0 or 1).
    3518             :  *
    3519             :  *-------------------------------------------------------------
    3520             :  */
    3521             : Datum
    3522           8 : byteaGetBit(PG_FUNCTION_ARGS)
    3523             : {
    3524           8 :     bytea      *v = PG_GETARG_BYTEA_PP(0);
    3525           8 :     int64       n = PG_GETARG_INT64(1);
    3526             :     int         byteNo,
    3527             :                 bitNo;
    3528             :     int         len;
    3529             :     int         byte;
    3530             : 
    3531           8 :     len = VARSIZE_ANY_EXHDR(v);
    3532             : 
    3533           8 :     if (n < 0 || n >= (int64) len * 8)
    3534           4 :         ereport(ERROR,
    3535             :                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
    3536             :                  errmsg("index %lld out of valid range, 0..%lld",
    3537             :                         (long long) n, (long long) len * 8 - 1)));
    3538             : 
    3539             :     /* n/8 is now known < len, so safe to cast to int */
    3540           4 :     byteNo = (int) (n / 8);
    3541           4 :     bitNo = (int) (n % 8);
    3542             : 
    3543           4 :     byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
    3544             : 
    3545           4 :     if (byte & (1 << bitNo))
    3546           4 :         PG_RETURN_INT32(1);
    3547             :     else
    3548           0 :         PG_RETURN_INT32(0);
    3549             : }
    3550             : 
    3551             : /*-------------------------------------------------------------
    3552             :  * byteaSetByte
    3553             :  *
    3554             :  * Given an instance of type 'bytea' creates a new one with
    3555             :  * the Nth byte set to the given value.
    3556             :  *
                        * Arguments:
                        *  0: the source bytea, fetched with PG_GETARG_BYTEA_P_COPY; that
                        *     fetched value is what gets modified and returned.
                        *  1: n, the zero-based byte index (int32).
                        *  2: the new byte value (int32).
                        *
                        * Reports ERRCODE_ARRAY_SUBSCRIPT_ERROR when n is outside
                        * 0 .. (data length - 1); consequently any index is rejected for
                        * a zero-length value.
                        *
    3557             :  *-------------------------------------------------------------
    3558             :  */
    3559             : Datum
    3560           8 : byteaSetByte(PG_FUNCTION_ARGS)
    3561             : {
    3562           8 :     bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
    3563           8 :     int32       n = PG_GETARG_INT32(1);
    3564           8 :     int32       newByte = PG_GETARG_INT32(2);
    3565             :     int         len;
    3566             : 
                           /* number of data bytes, i.e. total size minus the header */
    3567           8 :     len = VARSIZE(res) - VARHDRSZ;
    3568             : 
    3569           8 :     if (n < 0 || n >= len)
    3570           4 :         ereport(ERROR,
    3571             :                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
    3572             :                  errmsg("index %d out of valid range, 0..%d",
    3573             :                         n, len - 1)));
    3574             : 
    3575             :     /*
    3576             :      * Now set the byte.
                            *
                            * The store goes through an unsigned char pointer, so only the
                            * low-order 8 bits of newByte are kept; the value is not
                            * range-checked here.
    3577             :      */
    3578           4 :     ((unsigned char *) VARDATA(res))[n] = newByte;
    3579             : 
    3580           4 :     PG_RETURN_BYTEA_P(res);
    3581             : }
    3582             : 
    3583             : /*-------------------------------------------------------------
    3584             :  * byteaSetBit
    3585             :  *
    3586             :  * Given an instance of type 'bytea' creates a new one with
    3587             :  * the Nth bit set to the given value.
    3588             :  *
                        * Arguments:
                        *  0: the source bytea, fetched with PG_GETARG_BYTEA_P_COPY; that
                        *     fetched value is what gets modified and returned.
                        *  1: n, the zero-based bit index (int64).
                        *  2: the new bit value (int32), which must be 0 or 1.
                        *
                        * Bit n lives in byte n / 8, at position n % 8 counted from that
                        * byte's least significant bit (the mask used is 1 << (n % 8)).
                        *
                        * Reports ERRCODE_ARRAY_SUBSCRIPT_ERROR when n is outside
                        * 0 .. (8 * data length - 1), and ERRCODE_INVALID_PARAMETER_VALUE
                        * when the new bit is neither 0 nor 1.  The index is checked first.
                        *
    3589             :  *-------------------------------------------------------------
    3590             :  */
    3591             : Datum
    3592           8 : byteaSetBit(PG_FUNCTION_ARGS)
    3593             : {
    3594           8 :     bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
    3595           8 :     int64       n = PG_GETARG_INT64(1);
    3596           8 :     int32       newBit = PG_GETARG_INT32(2);
    3597             :     int         len;
    3598             :     int         oldByte,
    3599             :                 newByte;
    3600             :     int         byteNo,
    3601             :                 bitNo;
    3602             : 
                           /* number of data bytes, i.e. total size minus the header */
    3603           8 :     len = VARSIZE(res) - VARHDRSZ;
    3604             : 
                           /*
                            * len is widened to int64 before the multiplication, so the bit
                            * count is computed in 64-bit arithmetic rather than in int.
                            */
    3605           8 :     if (n < 0 || n >= (int64) len * 8)
    3606           4 :         ereport(ERROR,
    3607             :                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
    3608             :                  errmsg("index %lld out of valid range, 0..%lld",
    3609             :                         (long long) n, (long long) len * 8 - 1)));
    3610             : 
    3611             :     /* n/8 is now known < len, so safe to cast to int */
    3612           4 :     byteNo = (int) (n / 8);
    3613           4 :     bitNo = (int) (n % 8);
    3614             : 
    3615             :     /*
    3616             :      * sanity check!
    3617             :      */
    3618           4 :     if (newBit != 0 && newBit != 1)
    3619           0 :         ereport(ERROR,
    3620             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    3621             :                  errmsg("new bit must be 0 or 1")));
    3622             : 
    3623             :     /*
    3624             :      * Update the byte.
                            *
                            * Read the current byte, clear or set the selected bit in an int
                            * temporary, then store the result back into the same byte.
    3625             :      */
    3626           4 :     oldByte = ((unsigned char *) VARDATA(res))[byteNo];
    3627             : 
    3628           4 :     if (newBit == 0)
    3629           4 :         newByte = oldByte & (~(1 << bitNo));
    3630             :     else
    3631           0 :         newByte = oldByte | (1 << bitNo);
    3632             : 
    3633           4 :     ((unsigned char *) VARDATA(res))[byteNo] = newByte;
    3634             : 
    3635           4 :     PG_RETURN_BYTEA_P(res);
    3636             : }
    3637             : 
    3638             : 
    3639             : /* text_name()
    3640             :  * Converts a text type to a Name type.
                        *
                        * The result is a freshly palloc0'd buffer of NAMEDATALEN bytes into
                        * which the text's data bytes are copied.  When the input holds
                        * NAMEDATALEN or more bytes, the copy length is first reduced with
                        * pg_mbcliplen(..., NAMEDATALEN - 1); presumably that clips at a
                        * multibyte character boundary -- confirm against pg_mbcliplen.
    3641             :  */
    3642             : Datum
    3643        2882 : text_name(PG_FUNCTION_ARGS)
    3644             : {
    3645        2882 :     text       *s = PG_GETARG_TEXT_PP(0);
    3646             :     Name        result;
    3647             :     int         len;
    3648             : 
                           /* data length in bytes, not counting the varlena header */
    3649        2882 :     len = VARSIZE_ANY_EXHDR(s);
    3650             : 
    3651             :     /* Truncate oversize input */
    3652        2882 :     if (len >= NAMEDATALEN)
    3653           4 :         len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
    3654             : 
    3655             :     /* We use palloc0 here to ensure result is zero-padded */
    3656        2882 :     result = (Name) palloc0(NAMEDATALEN);
                           /* only len bytes are written; the rest of the buffer stays zero */
    3657        2882 :     memcpy(NameStr(*result), VARDATA_ANY(s), len);
    3658             : 
    3659        2882 :     PG_RETURN_NAME(result);
    3660             : }
    3661             : 
    3662             : /* name_text()
    3663             :  * Converts a Name type to a text type.
                        *
                        * The Name's contents are handed to cstring_to_text() as a C string,
                        * so the resulting text ends at the first zero byte in the Name.
    3664             :  */
    3665             : Datum
    3666      576696 : name_text(PG_FUNCTION_ARGS)
    3667             : {
    3668      576696 :     Name        s = PG_GETARG_NAME(0);
    3669             : 
    3670      576696 :     PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
    3671             : }
    3672             : 
    3673             : 
    3674             : /*
    3675             :  * textToQualifiedNameList - convert a text object to list of names
    3676             :  *
    3677             :  * This implements the input parsing needed by nextval() and other
    3678             :  * functions that take a text parameter representing a qualified name.
    3679             :  * We split the name at dots, downcase if not double-quoted, and
    3680             :  * truncate names if they're too long.
                        *
                        * The result is a List with one makeString() node per name component,
                        * in input order.  Each node wraps its own pstrdup'd copy of the
                        * component, so the result does not point into the temporary C string,
                        * which is freed before returning.
                        *
                        * Reports ERRCODE_INVALID_NAME ("invalid name syntax") if
                        * SplitIdentifierString() rejects the input, or if it yields no names
                        * at all (an empty or all-whitespace input).
    3681             :  */
    3682             : List *
    3683         878 : textToQualifiedNameList(text *textval)
    3684             : {
    3685             :     char       *rawname;
    3686         878 :     List       *result = NIL;
    3687             :     List       *namelist;
    3688             :     ListCell   *l;
    3689             : 
    3690             :     /* Convert to C string (handles possible detoasting). */
    3691             :     /* Note we rely on being able to modify rawname below. */
    3692         878 :     rawname = text_to_cstring(textval);
    3693             : 
    3694         878 :     if (!SplitIdentifierString(rawname, '.', &namelist))
    3695           0 :         ereport(ERROR,
    3696             :                 (errcode(ERRCODE_INVALID_NAME),
    3697             :                  errmsg("invalid name syntax")));
    3698             : 
                           /* SplitIdentifierString accepts an empty string; we do not */
    3699         878 :     if (namelist == NIL)
    3700           0 :         ereport(ERROR,
    3701             :                 (errcode(ERRCODE_INVALID_NAME),
    3702             :                  errmsg("invalid name syntax")));
    3703             : 
                           /* namelist entries point into rawname, so copy each one out */
    3704        1832 :     foreach(l, namelist)
    3705             :     {
    3706         954 :         char       *curname = (char *) lfirst(l);
    3707             : 
    3708         954 :         result = lappend(result, makeString(pstrdup(curname)));
    3709             :     }
    3710             : 
                           /* release the scratch string and the list of pointers into it */
    3711         878 :     pfree(rawname);
    3712         878 :     list_free(namelist);
    3713             : 
    3714         878 :     return result;
    3715             : }
    3716             : 
    3717             : /*
    3718             :  * SplitIdentifierString --- parse a string containing identifiers
    3719             :  *
    3720             :  * This is the guts of textToQualifiedNameList, and is exported for use in
    3721             :  * other situations such as parsing GUC variables.  In the GUC case, it's
    3722             :  * important to avoid memory leaks, so the API is designed to minimize the
    3723             :  * amount of stuff that needs to be allocated and freed.
    3724             :  *
    3725             :  * Inputs:
    3726             :  *  rawstring: the input string; must be overwritable!  On return, it's
    3727             :  *             been modified to contain the separated identifiers.
    3728             :  *  separator: the separator punctuation expected between identifiers
    3729             :  *             (typically '.' or ',').  Whitespace may also appear around
    3730             :  *             identifiers.
    3731             :  * Outputs:
    3732             :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    3733             :  *            rawstring.  Caller should list_free() this even on error return.
    3734             :  *
    3735             :  * Returns true if okay, false if there is a syntax error in the string.
    3736             :  *
    3737             :  * Note that an empty string is considered okay here, though not in
    3738             :  * textToQualifiedNameList.
                        *
                        * On a false return, *namelist holds only the identifiers that were
                        * completely processed before the error was detected; the identifier
                        * being parsed when the error was found is not included.
    3739             :  */
    3740             : bool
    3741       89640 : SplitIdentifierString(char *rawstring, char separator,
    3742             :                       List **namelist)
    3743             : {
    3744       89640 :     char       *nextp = rawstring;
    3745       89640 :     bool        done = false;
    3746             : 
    3747       89640 :     *namelist = NIL;
    3748             : 
    3749       89640 :     while (scanner_isspace(*nextp))
    3750           0 :         nextp++;                /* skip leading whitespace */
    3751             : 
    3752       89640 :     if (*nextp == '\0')
    3753        9672 :         return true;            /* allow empty string */
    3754             : 
    3755             :     /* At the top of the loop, we are at start of a new identifier. */
    3756             :     do
    3757             :     {
    3758             :         char       *curname;
    3759             :         char       *endp;
    3760             : 
    3761      130938 :         if (*nextp == '"')
    3762             :         {
    3763             :             /* Quoted name --- collapse quote-quote pairs, no downcasing */
    3764       17178 :             curname = nextp + 1;
    3765             :             for (;;)
    3766             :             {
    3767       17182 :                 endp = strchr(nextp + 1, '"');
    3768       17180 :                 if (endp == NULL)
    3769           0 :                     return false;   /* mismatched quotes */
    3770       17180 :                 if (endp[1] != '"')
    3771       17178 :                     break;      /* found end of quoted name */
    3772             :                 /* Collapse adjacent quotes into one quote, and look again */
                                       /*
                                        * strlen(endp) bytes starting at endp + 1 cover the rest
                                        * of the string including its terminating null, so the
                                        * tail shifts left by one.  Setting nextp = endp makes the
                                        * next search start after the surviving quote.
                                        */
    3773           2 :                 memmove(endp, endp + 1, strlen(endp));
    3774           2 :                 nextp = endp;
    3775             :             }
    3776             :             /* endp now points at the terminating quote */
    3777       17178 :             nextp = endp + 1;
    3778             :         }
    3779             :         else
    3780             :         {
    3781             :             /* Unquoted name --- extends to separator or whitespace */
    3782             :             char       *downname;
    3783             :             int         len;
    3784             : 
    3785      113760 :             curname = nextp;
    3786     1012004 :             while (*nextp && *nextp != separator &&
    3787      898246 :                    !scanner_isspace(*nextp))
    3788      898244 :                 nextp++;
    3789      113760 :             endp = nextp;
    3790      113760 :             if (curname == nextp)
    3791           0 :                 return false;   /* empty unquoted name not allowed */
    3792             : 
    3793             :             /*
    3794             :              * Downcase the identifier, using same code as main lexer does.
    3795             :              *
    3796             :              * XXX because we want to overwrite the input in-place, we cannot
    3797             :              * support a downcasing transformation that increases the string
    3798             :              * length.  This is not a problem given the current implementation
    3799             :              * of downcase_truncate_identifier, but we'll probably have to do
    3800             :              * something about this someday.
                                    *
                                    * strncpy writes exactly len bytes: when downname has len
                                    * characters no terminator is written, so the character at
                                    * *endp (still needed by the separator test below) is left
                                    * alone; when downname is shorter, the remainder is null-padded.
    3801             :              */
    3802      113760 :             len = endp - curname;
    3803      113760 :             downname = downcase_truncate_identifier(curname, len, false);
    3804             :             Assert(strlen(downname) <= len);
    3805      113760 :             strncpy(curname, downname, len);    /* strncpy is required here */
    3806      113760 :             pfree(downname);
    3807             :         }
    3808             : 
    3809      130940 :         while (scanner_isspace(*nextp))
    3810           2 :             nextp++;            /* skip trailing whitespace */
    3811             : 
    3812      130938 :         if (*nextp == separator)
    3813             :         {
    3814       50970 :             nextp++;
    3815       76786 :             while (scanner_isspace(*nextp))
    3816       25816 :                 nextp++;        /* skip leading whitespace for next */
    3817             :             /* we expect another name, so done remains false */
    3818             :         }
    3819       79968 :         else if (*nextp == '\0')
    3820       79966 :             done = true;
    3821             :         else
    3822           2 :             return false;       /* invalid syntax */
    3823             : 
    3824             :         /* Now safe to overwrite separator with a null */
    3825      130936 :         *endp = '\0';
    3826             : 
    3827             :         /* Truncate name if it's overlength */
    3828      130936 :         truncate_identifier(curname, strlen(curname), false);
    3829             : 
    3830             :         /*
    3831             :          * Finished isolating current name --- add it to list
    3832             :          */
    3833      130936 :         *namelist = lappend(*namelist, curname);
    3834             : 
    3835             :         /* Loop back if we didn't reach end of string */
    3836      130936 :     } while (!done);
    3837             : 
    3838       79966 :     return true;
    3839             : }
    3840             : 
    3841             : 
    3842             : /*
    3843             :  * SplitDirectoriesString --- parse a string containing file/directory names
    3844             :  *
    3845             :  * This works fine on file names too; the function name is historical.
    3846             :  *
    3847             :  * This is similar to SplitIdentifierString, except that the parsing
    3848             :  * rules are meant to handle pathnames instead of identifiers: there is
    3849             :  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
    3850             :  * and we apply canonicalize_path() to each extracted string.  Because of the
    3851             :  * last, the returned strings are separately palloc'd rather than being
    3852             :  * pointers into rawstring --- but we still scribble on rawstring.
    3853             :  *
    3854             :  * Inputs:
    3855             :  *  rawstring: the input string; must be modifiable!
    3856             :  *  separator: the separator punctuation expected between directories
    3857             :  *             (typically ',' or ';').  Whitespace may also appear around
    3858             :  *             directories.
    3859             :  * Outputs:
    3860             :  *  namelist: filled with a palloc'd list of directory names.
    3861             :  *            Caller should list_free_deep() this even on error return.
    3862             :  *
    3863             :  * Returns true if okay, false if there is a syntax error in the string.
    3864             :  *
    3865             :  * Note that an empty string is considered okay here.
                        *
                        * On a false return, *namelist holds only the names that were
                        * completely processed before the error was detected.
    3866             :  */
    3867             : bool
    3868         802 : SplitDirectoriesString(char *rawstring, char separator,
    3869             :                        List **namelist)
    3870             : {
    3871         802 :     char       *nextp = rawstring;
    3872         802 :     bool        done = false;
    3873             : 
    3874         802 :     *namelist = NIL;
    3875             : 
    3876         802 :     while (scanner_isspace(*nextp))
    3877           0 :         nextp++;                /* skip leading whitespace */
    3878             : 
    3879         802 :     if (*nextp == '\0')
    3880           2 :         return true;            /* allow empty string */
    3881             : 
    3882             :     /* At the top of the loop, we are at start of a new directory. */
    3883             :     do
    3884             :     {
    3885             :         char       *curname;
    3886             :         char       *endp;
    3887             : 
    3888         800 :         if (*nextp == '"')
    3889             :         {
    3890             :             /* Quoted name --- collapse quote-quote pairs */
    3891           0 :             curname = nextp + 1;
    3892             :             for (;;)
    3893             :             {
    3894           0 :                 endp = strchr(nextp + 1, '"');
    3895           0 :                 if (endp == NULL)
    3896           0 :                     return false;   /* mismatched quotes */
    3897           0 :                 if (endp[1] != '"')
    3898           0 :                     break;      /* found end of quoted name */
    3899             :                 /* Collapse adjacent quotes into one quote, and look again */
                                       /*
                                        * The move includes the terminating null, shifting the
                                        * tail left by one; resuming from endp makes the next
                                        * search start after the surviving quote.
                                        */
    3900           0 :                 memmove(endp, endp + 1, strlen(endp));
    3901           0 :                 nextp = endp;
    3902             :             }
    3903             :             /* endp now points at the terminating quote */
    3904           0 :             nextp = endp + 1;
    3905             :         }
    3906             :         else
    3907             :         {
    3908             :             /* Unquoted name --- extends to separator or end of string */
                                   /*
                                    * endp trails the scan: it is advanced to one past each
                                    * non-whitespace character seen, so at loop exit it marks the
                                    * end of the name with trailing whitespace excluded, while
                                    * embedded whitespace stays part of the name.
                                    */
    3909         800 :             curname = endp = nextp;
    3910       14070 :             while (*nextp && *nextp != separator)
    3911             :             {
    3912             :                 /* trailing whitespace should not be included in name */
    3913       13270 :                 if (!scanner_isspace(*nextp))
    3914       13270 :                     endp = nextp + 1;
    3915       13270 :                 nextp++;
    3916             :             }
    3917         800 :             if (curname == endp)
    3918           0 :                 return false;   /* empty unquoted name not allowed */
    3919             :         }
    3920             : 
    3921         800 :         while (scanner_isspace(*nextp))
    3922           0 :             nextp++;            /* skip trailing whitespace */
    3923             : 
    3924         800 :         if (*nextp == separator)
    3925             :         {
    3926           0 :             nextp++;
    3927           0 :             while (scanner_isspace(*nextp))
    3928           0 :                 nextp++;        /* skip leading whitespace for next */
    3929             :             /* we expect another name, so done remains false */
    3930             :         }
    3931         800 :         else if (*nextp == '\0')
    3932         800 :             done = true;
    3933             :         else
    3934           0 :             return false;       /* invalid syntax */
    3935             : 
    3936             :         /* Now safe to overwrite separator with a null */
    3937         800 :         *endp = '\0';
    3938             : 
    3939             :         /* Truncate path if it's overlength */
    3940         800 :         if (strlen(curname) >= MAXPGPATH)
    3941           0 :             curname[MAXPGPATH - 1] = '\0';
    3942             : 
    3943             :         /*
    3944             :          * Finished isolating current name --- add it to list
                                *
                                * The list receives a pstrdup'd copy, and canonicalize_path() is
                                * applied to that copy rather than to rawstring.
    3945             :          */
    3946         800 :         curname = pstrdup(curname);
    3947         800 :         canonicalize_path(curname);
    3948         800 :         *namelist = lappend(*namelist, curname);
    3949             : 
    3950             :         /* Loop back if we didn't reach end of string */
    3951         800 :     } while (!done);
    3952             : 
    3953         800 :     return true;
    3954             : }
    3955             : 
    3956             : 
    3957             : /*
    3958             :  * SplitGUCList --- parse a string containing identifiers or file names
    3959             :  *
    3960             :  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
    3961             :  * presuming whether the elements will be taken as identifiers or file names.
    3962             :  * We assume the input has already been through flatten_set_variable_args(),
    3963             :  * so that we need never downcase (if appropriate, that was done already).
    3964             :  * Nor do we ever truncate, since we don't know the correct max length.
    3965             :  * We disallow embedded whitespace for simplicity (it shouldn't matter,
    3966             :  * because any embedded whitespace should have led to double-quoting).
    3967             :  * Otherwise the API is identical to SplitIdentifierString.
    3968             :  *
    3969             :  * XXX it's annoying to have so many copies of this string-splitting logic.
    3970             :  * However, it's not clear that having one function with a bunch of option
    3971             :  * flags would be much better.
    3972             :  *
    3973             :  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
    3974             :  * Be sure to update that if you have to change this.
    3975             :  *
    3976             :  * Inputs:
    3977             :  *  rawstring: the input string; must be overwritable!  On return, it's
    3978             :  *             been modified to contain the separated identifiers.
    3979             :  *  separator: the separator punctuation expected between identifiers
    3980             :  *             (typically '.' or ',').  Whitespace may also appear around
    3981             :  *             identifiers.
    3982             :  * Outputs:
    3983             :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    3984             :  *            rawstring.  Caller should list_free() this even on error return.
    3985             :  *
    3986             :  * Returns true if okay, false if there is a syntax error in the string.
                        *
                        * An empty (or all-whitespace) string is accepted and yields an empty
                        * list.  On a false return, *namelist holds only the elements that
                        * were completely processed before the error was detected.
    3987             :  */
    3988             : bool
    3989         796 : SplitGUCList(char *rawstring, char separator,
    3990             :              List **namelist)
    3991             : {
    3992         796 :     char       *nextp = rawstring;
    3993         796 :     bool        done = false;
    3994             : 
    3995         796 :     *namelist = NIL;
    3996             : 
    3997         796 :     while (scanner_isspace(*nextp))
    3998           0 :         nextp++;                /* skip leading whitespace */
    3999             : 
    4000         796 :     if (*nextp == '\0')
    4001         762 :         return true;            /* allow empty string */
    4002             : 
    4003             :     /* At the top of the loop, we are at start of a new identifier. */
    4004             :     do
    4005             :     {
    4006             :         char       *curname;
    4007             :         char       *endp;
    4008             : 
    4009          46 :         if (*nextp == '"')
    4010             :         {
    4011             :             /* Quoted name --- collapse quote-quote pairs */
    4012          16 :             curname = nextp + 1;
    4013             :             for (;;)
    4014             :             {
    4015          24 :                 endp = strchr(nextp + 1, '"');
    4016          20 :                 if (endp == NULL)
    4017           0 :                     return false;   /* mismatched quotes */
    4018          20 :                 if (endp[1] != '"')
    4019          16 :                     break;      /* found end of quoted name */
    4020             :                 /* Collapse adjacent quotes into one quote, and look again */
                                       /*
                                        * The move includes the terminating null, shifting the
                                        * tail left by one; resuming from endp makes the next
                                        * search start after the surviving quote.
                                        */
    4021           4 :                 memmove(endp, endp + 1, strlen(endp));
    4022           4 :                 nextp = endp;
    4023             :             }
    4024             :             /* endp now points at the terminating quote */
    4025          16 :             nextp = endp + 1;
    4026             :         }
    4027             :         else
    4028             :         {
    4029             :             /* Unquoted name --- extends to separator or whitespace */
    4030          30 :             curname = nextp;
    4031         304 :             while (*nextp && *nextp != separator &&
    4032         274 :                    !scanner_isspace(*nextp))
    4033         274 :                 nextp++;
    4034          30 :             endp = nextp;
    4035          30 :             if (curname == nextp)
    4036           0 :                 return false;   /* empty unquoted name not allowed */
    4037             :         }
    4038             : 
    4039          46 :         while (scanner_isspace(*nextp))
    4040           0 :             nextp++;            /* skip trailing whitespace */
    4041             : 
    4042          46 :         if (*nextp == separator)
    4043             :         {
    4044          12 :             nextp++;
    4045          24 :             while (scanner_isspace(*nextp))
    4046          12 :                 nextp++;        /* skip leading whitespace for next */
    4047             :             /* we expect another name, so done remains false */
    4048             :         }
    4049          34 :         else if (*nextp == '\0')
    4050          34 :             done = true;
    4051             :         else
    4052           0 :             return false;       /* invalid syntax */
    4053             : 
    4054             :         /* Now safe to overwrite separator with a null */
    4055          46 :         *endp = '\0';
    4056             : 
    4057             :         /*
    4058             :          * Finished isolating current name --- add it to list
                                *
                                * The entry is a pointer into rawstring; nothing is copied,
                                * downcased or truncated here.
    4059             :          */
    4060          46 :         *namelist = lappend(*namelist, curname);
    4061             : 
    4062             :         /* Loop back if we didn't reach end of string */
    4063          46 :     } while (!done);
    4064             : 
    4065          34 :     return true;
    4066             : }
    4067             : 
    4068             : 
    4069             : /*****************************************************************************
    4070             :  *  Comparison Functions used for bytea
    4071             :  *
    4072             :  * Note: btree indexes need these routines not to leak memory; therefore,
    4073             :  * be careful to free working copies of toasted datums.  Most places don't
    4074             :  * need to be so careful.
    4075             :  *****************************************************************************/
    4076             : 
    4077             : Datum
    4078        7960 : byteaeq(PG_FUNCTION_ARGS)
    4079             : {
                           /*
                            * byteaeq: returns true when the two bytea arguments have the same
                            * length and identical contents, false otherwise.
                            */
    4080        7960 :     Datum       arg1 = PG_GETARG_DATUM(0);
    4081        7960 :     Datum       arg2 = PG_GETARG_DATUM(1);
    4082             :     bool        result;
    4083             :     Size        len1,
    4084             :                 len2;
    4085             : 
    4086             :     /*
    4087             :      * We can use a fast path for unequal lengths, which might save us from
    4088             :      * having to detoast one or both values.
                            *
                            * NOTE(review): toast_raw_datum_size() presumably reports the size
                            * including the varlena header, given that VARHDRSZ is subtracted
                            * from it for the memcmp length below -- confirm.
    4089             :      */
    4090        7960 :     len1 = toast_raw_datum_size(arg1);
    4091        7960 :     len2 = toast_raw_datum_size(arg2);
    4092        7960 :     if (len1 != len2)
    4093        4304 :         result = false;
    4094             :     else
    4095             :     {
    4096        3656 :         bytea      *barg1 = DatumGetByteaPP(arg1);
    4097        3656 :         bytea      *barg2 = DatumGetByteaPP(arg2);
    4098             : 
                               /* lengths are equal here, so len1 serves for both values */
    4099        3656 :         result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
    4100             :                          len1 - VARHDRSZ) == 0);
    4101             : 
                               /* release working copies, as this file's bytea comparisons do */
    4102        3656 :         PG_FREE_IF_COPY(barg1, 0);
    4103        3656 :         PG_FREE_IF_COPY(barg2, 1);
    4104             :     }
    4105             : 
    4106        7960 :     PG_RETURN_BOOL(result);
    4107             : }
    4108             : 
    4109             : Datum
    4110         512 : byteane(PG_FUNCTION_ARGS)
    4111             : {
                           /*
                            * byteane: returns true when the two bytea arguments differ in
                            * length or in contents, false when they are identical.
                            */
    4112         512 :     Datum       arg1 = PG_GETARG_DATUM(0);
    4113         512 :     Datum       arg2 = PG_GETARG_DATUM(1);
    4114             :     bool        result;
    4115             :     Size        len1,
    4116             :                 len2;
    4117             : 
    4118             :     /*
    4119             :      * We can use a fast path for unequal lengths, which might save us from
    4120             :      * having to detoast one or both values.
                            *
                            * NOTE(review): toast_raw_datum_size() presumably reports the size
                            * including the varlena header, given that VARHDRSZ is subtracted
                            * from it for the memcmp length below -- confirm.
    4121             :      */
    4122         512 :     len1 = toast_raw_datum_size(arg1);
    4123         512 :     len2 = toast_raw_datum_size(arg2);
    4124         512 :     if (len1 != len2)
    4125           0 :         result = true;
    4126             :     else
    4127             :     {
    4128         512 :         bytea      *barg1 = DatumGetByteaPP(arg1);
    4129         512 :         bytea      *barg2 = DatumGetByteaPP(arg2);
    4130             : 
                               /* lengths are equal here, so len1 serves for both values */
    4131         512 :         result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
    4132             :                          len1 - VARHDRSZ) != 0);
    4133             : 
                               /* release working copies, as this file's bytea comparisons do */
    4134         512 :         PG_FREE_IF_COPY(barg1, 0);
    4135         512 :         PG_FREE_IF_COPY(barg2, 1);
    4136             :     }
    4137             : 
    4138         512 :     PG_RETURN_BOOL(result);
    4139             : }
    4140             : 
    4141             : Datum
    4142        7302 : bytealt(PG_FUNCTION_ARGS)
    4143             : {
                           /*
                            * bytealt: "less than" for bytea.  The common prefix of the two
                            * values is compared bytewise with memcmp; when that prefix is
                            * equal, the shorter value is the lesser one.
                            */
    4144        7302 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    4145        7302 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    4146             :     int         len1,
    4147             :                 len2;
    4148             :     int         cmp;
    4149             : 
    4150        7302 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    4151        7302 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    4152             : 
                           /* compare only as many bytes as both values have */
    4153        7302 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    4154             : 
                           /* the data is not needed past this point; only cmp and lengths are */
    4155        7302 :     PG_FREE_IF_COPY(arg1, 0);
    4156        7302 :     PG_FREE_IF_COPY(arg2, 1);
    4157             : 
    4158        7302 :     PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
    4159             : }
    4160             : 
    4161             : Datum
    4162        5556 : byteale(PG_FUNCTION_ARGS)
    4163             : {
                           /*
                            * byteale: "less than or equal" for bytea.  The common prefix of
                            * the two values is compared bytewise with memcmp; when that prefix
                            * is equal, the result is true if the first value is not longer
                            * than the second.
                            */
    4164        5556 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    4165        5556 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    4166             :     int         len1,
    4167             :                 len2;
    4168             :     int         cmp;
    4169             : 
    4170        5556 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    4171        5556 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    4172             : 
                           /* compare only as many bytes as both values have */
    4173        5556 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    4174             : 
                           /* the data is not needed past this point; only cmp and lengths are */
    4175        5556 :     PG_FREE_IF_COPY(arg1, 0);
    4176        5556 :     PG_FREE_IF_COPY(arg2, 1);
    4177             : 
    4178        5556 :     PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
    4179             : }
    4180             : 
    4181             : Datum                /* bytea ">": true if arg 0 sorts after arg 1, byte by byte */
    4182        5214 : byteagt(PG_FUNCTION_ARGS)
    4183             : {
    4184        5214 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    4185        5214 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    4186             :     int         len1,
    4187             :                 len2;
    4188             :     int         cmp;
    4189             :     /* Data lengths in bytes, not counting the varlena header. */
    4190        5214 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    4191        5214 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    4192             :     /* Only the first Min(len1, len2) bytes are compared by memcmp. */
    4193        5214 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    4194             : 
    4195        5214 :     PG_FREE_IF_COPY(arg1, 0);
    4196        5214 :     PG_FREE_IF_COPY(arg2, 1);
    4197             :     /* With an equal common prefix, the longer value is the greater one. */
    4198        5214 :     PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
    4199             : }
    4200             : 
    4201             : Datum                /* bytea ">=": true if arg 0 sorts after or equal to arg 1 */
    4202        4394 : byteage(PG_FUNCTION_ARGS)
    4203             : {
    4204        4394 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    4205        4394 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    4206             :     int         len1,
    4207             :                 len2;
    4208             :     int         cmp;
    4209             :     /* Data lengths in bytes, not counting the varlena header. */
    4210        4394 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    4211        4394 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    4212             :     /* Only the first Min(len1, len2) bytes are compared by memcmp. */
    4213        4394 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    4214             : 
    4215        4394 :     PG_FREE_IF_COPY(arg1, 0);
    4216        4394 :     PG_FREE_IF_COPY(arg2, 1);
    4217             :     /* Prefix tie: true when arg 0 is no shorter than arg 1 (covers equality). */
    4218        4394 :     PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
    4219             : }
    4220             : 
    4221             : Datum                /* bytea three-way compare: result <0, 0 or >0 as arg 0 is <, = or > arg 1 */
    4222       87300 : byteacmp(PG_FUNCTION_ARGS)
    4223             : {
    4224       87300 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    4225       87300 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    4226             :     int         len1,
    4227             :                 len2;
    4228             :     int         cmp;
    4229             :     /* Data lengths in bytes, not counting the varlena header. */
    4230       87300 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    4231       87300 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    4232             :     /* memcmp's own result is returned unless the common prefix ties. */
    4233       87300 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    4234       87300 :     if ((cmp == 0) && (len1 != len2))
    4235       14572 :         cmp = (len1 < len2) ? -1 : 1;   /* prefix tie: shorter value sorts first */
    4236             : 
    4237       87300 :     PG_FREE_IF_COPY(arg1, 0);
    4238       87300 :     PG_FREE_IF_COPY(arg2, 1);
    4239             : 
    4240       87300 :     PG_RETURN_INT32(cmp);
    4241             : }
    4242             : 
    4243             : Datum                /* fill in the SortSupport passed as arg 0 for sorting bytea values */
    4244          20 : bytea_sortsupport(PG_FUNCTION_ARGS)
    4245             : {
    4246          20 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    4247             :     MemoryContext oldcontext;
    4248             :     /* Do the setup with ssup->ssup_cxt as the current memory context. */
    4249          20 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    4250             : 
    4251             :     /* Use generic string SortSupport, forcing "C" collation */
    4252          20 :     varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
    4253             :     /* Put back whichever context was current on entry. */
    4254          20 :     MemoryContextSwitchTo(oldcontext);
    4255             : 
    4256          20 :     PG_RETURN_VOID();
    4257             : }
    4258             : 
    4259             : /*
    4260             :  * appendStringInfoText
    4261             :  *
    4262             :  * Append the data bytes of text value t (header excluded) to str.
    4263             :  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
    4264             :  */
    4265             : static void
    4266     1236856 : appendStringInfoText(StringInfo str, const text *t)
    4267             : {
    4268     1236856 :     appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
    4269     1236856 : }
    4270             : 
    4271             : /*
    4272             :  * replace_text
    4273             :  * Replace every occurrence of from_sub_text (arg 1) in src_text (arg 0)
    4274             :  * with to_sub_text (arg 2).
    4275             :  *
    4276             :  * Returns src_text itself when it or from_sub_text is empty, or when
    4277             :  * nothing matches; otherwise returns a newly built text value.
    4278             :  */
    4279             : Datum
    4280        1672 : replace_text(PG_FUNCTION_ARGS)
    4281             : {
    4282        1672 :     text       *src_text = PG_GETARG_TEXT_PP(0);
    4283        1672 :     text       *from_sub_text = PG_GETARG_TEXT_PP(1);
    4284        1672 :     text       *to_sub_text = PG_GETARG_TEXT_PP(2);
    4285             :     int         src_text_len;
    4286             :     int         from_sub_text_len;
    4287             :     TextPositionState state;
    4288             :     text       *ret_text;
    4289             :     int         chunk_len;
    4290             :     char       *curr_ptr;
    4291             :     char       *start_ptr;
    4292             :     StringInfoData str;
    4293             :     bool        found;
    4294             : 
    4295        1672 :     src_text_len = VARSIZE_ANY_EXHDR(src_text);
    4296        1672 :     from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
    4297             : 
    4298             :     /* Return unmodified source string if empty source or pattern */
    4299        1672 :     if (src_text_len < 1 || from_sub_text_len < 1)
    4300             :     {
    4301           0 :         PG_RETURN_TEXT_P(src_text);
    4302             :     }
    4303             : 
    4304        1672 :     text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
    4305             : 
    4306        1672 :     found = text_position_next(&state);
    4307             : 
    4308             :     /* When the from_sub_text is not found, there is nothing to do. */
    4309        1672 :     if (!found)
    4310             :     {
    4311         514 :         text_position_cleanup(&state);
    4312         514 :         PG_RETURN_TEXT_P(src_text);
    4313             :     }
    4314        1158 :     curr_ptr = text_position_get_match_ptr(&state);    /* current match */
    4315        1158 :     start_ptr = VARDATA_ANY(src_text);  /* first source byte not yet copied */
    4316             : 
    4317        1158 :     initStringInfo(&str);
    4318             : 
    4319             :     do
    4320             :     {
    4321        4790 :         CHECK_FOR_INTERRUPTS();
    4322             : 
    4323             :         /* copy the data skipped over by last text_position_next() */
    4324        4790 :         chunk_len = curr_ptr - start_ptr;
    4325        4790 :         appendBinaryStringInfo(&str, start_ptr, chunk_len);
    4326             :         /* then the replacement, in place of the matched text */
    4327        4790 :         appendStringInfoText(&str, to_sub_text);
    4328             :         /* resume just past the match */
    4329        4790 :         start_ptr = curr_ptr + from_sub_text_len;
    4330             : 
    4331        4790 :         found = text_position_next(&state);
    4332        4790 :         if (found)
    4333        3632 :             curr_ptr = text_position_get_match_ptr(&state);
    4334             :     }
    4335        4790 :     while (found);
    4336             : 
    4337             :     /* copy trailing data */
    4338        1158 :     chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
    4339        1158 :     appendBinaryStringInfo(&str, start_ptr, chunk_len);
    4340             : 
    4341        1158 :     text_position_cleanup(&state);
    4342             :     /* Convert the accumulated bytes to a text value, then free the buffer. */
    4343        1158 :     ret_text = cstring_to_text_with_len(str.data, str.len);
    4344        1158 :     pfree(str.data);
    4345             : 
    4346        1158 :     PG_RETURN_TEXT_P(ret_text);
    4347             : }
    4348             : 
    4349             : /*
    4350             :  * check_replace_text_has_escape_char
    4351             :  *
    4352             :  * Return true if replace_text contains a backslash (the escape char).
    4353             :  */
    4354             : static bool
    4355        4160 : check_replace_text_has_escape_char(const text *replace_text)
    4356             : {
    4357        4160 :     const char *p = VARDATA_ANY(replace_text);
    4358        4160 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    4359             :     /* Step bytewise or by whole characters, depending on the encoding. */
    4360        4160 :     if (pg_database_encoding_max_length() == 1)
    4361             :     {
    4362          12 :         for (; p < p_end; p++)
    4363             :         {
    4364           0 :             if (*p == '\\')
    4365           0 :                 return true;
    4366             :         }
    4367             :     }
    4368             :     else
    4369             :     {
    4370       67986 :         for (; p < p_end; p += pg_mblen(p))
    4371             :         {
    4372       63946 :             if (*p == '\\')
    4373         108 :                 return true;
    4374             :         }
    4375             :     }
    4376             : 
    4377        4052 :     return false;
    4378             : }
    4379             : 
    4380             : /*
    4381             :  * appendStringInfoRegexpSubstr
    4382             :  *
    4383             :  * Append replace_text to str, expanding \1..\9 (pmatch[1..9]), \& (pmatch[0])
    4384             :  * and \\ (one backslash).  start_ptr is the start of the match in the source
    4385             :  * string, at logical character position data_pos.
    4386             :  */
    4387             : static void
    4388          60 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
    4389             :                              regmatch_t *pmatch,
    4390             :                              char *start_ptr, int data_pos)
    4391             : {
    4392          60 :     const char *p = VARDATA_ANY(replace_text);
    4393          60 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    4394          60 :     int         eml = pg_database_encoding_max_length();
    4395             :     /* Each pass handles one run of literal text plus the escape after it. */
    4396             :     for (;;)
    4397         122 :     {
    4398         182 :         const char *chunk_start = p;
    4399             :         int         so;
    4400             :         int         eo;
    4401             : 
    4402             :         /* Find next escape char. */
    4403         182 :         if (eml == 1)
    4404             :         {
    4405           0 :             for (; p < p_end && *p != '\\'; p++)
    4406             :                  /* nothing */ ;
    4407             :         }
    4408             :         else
    4409             :         {
    4410         990 :             for (; p < p_end && *p != '\\'; p += pg_mblen(p))
    4411             :                  /* nothing */ ;
    4412             :         }
    4413             : 
    4414             :         /* Copy the text we just scanned over, if any. */
    4415         182 :         if (p > chunk_start)
    4416          72 :             appendBinaryStringInfo(str, chunk_start, p - chunk_start);
    4417             : 
    4418             :         /* Done if at end of string, else advance over escape char. */
    4419         182 :         if (p >= p_end)
    4420          60 :             break;
    4421         122 :         p++;
    4422             :         /* p now addresses the character after the backslash, if there is one. */
    4423         122 :         if (p >= p_end)
    4424             :         {
    4425             :             /* Escape at very end of input.  Treat same as unexpected char */
    4426           0 :             appendStringInfoChar(str, '\\');
    4427           0 :             break;
    4428             :         }
    4429             :         /* Work out what the escape stands for; so/eo get the match bounds. */
    4430         122 :         if (*p >= '1' && *p <= '9')
    4431         104 :         {
    4432             :             /* Use the back reference of regexp. */
    4433         104 :             int         idx = *p - '0';
    4434             : 
    4435         104 :             so = pmatch[idx].rm_so;
    4436         104 :             eo = pmatch[idx].rm_eo;
    4437         104 :             p++;
    4438             :         }
    4439          18 :         else if (*p == '&')
    4440             :         {
    4441             :             /* Use the entire matched string. */
    4442           0 :             so = pmatch[0].rm_so;
    4443           0 :             eo = pmatch[0].rm_eo;
    4444           0 :             p++;
    4445             :         }
    4446          18 :         else if (*p == '\\')
    4447             :         {
    4448             :             /* \\ means transfer one \ to output. */
    4449          18 :             appendStringInfoChar(str, '\\');
    4450          18 :             p++;
    4451          18 :             continue;
    4452             :         }
    4453             :         else
    4454             :         {
    4455             :             /*
    4456             :              * If escape char is not followed by any expected char, just treat
    4457             :              * it as ordinary data to copy.  (XXX would it be better to throw
    4458             :              * an error?)
    4459             :              */
    4460           0 :             appendStringInfoChar(str, '\\');
    4461           0 :             continue;           /* p is not advanced: the next pass copies that char */
    4462             :         }
    4463             :         /* If so or eo is -1, nothing is appended for this reference. */
    4464         104 :         if (so != -1 && eo != -1)
    4465             :         {
    4466             :             /*
    4467             :              * Copy the text that is back reference of regexp.  Note so and eo
    4468             :              * are counted in characters not bytes.
    4469             :              */
    4470             :             char       *chunk_start;
    4471             :             int         chunk_len;
    4472             : 
    4473             :             Assert(so >= data_pos);
    4474         104 :             chunk_start = start_ptr;
    4475         104 :             chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
    4476         104 :             chunk_len = charlen_to_bytelen(chunk_start, eo - so);
    4477         104 :             appendBinaryStringInfo(str, chunk_start, chunk_len);
    4478             :         }
    4479             :     }
    4480          60 : }
    4481             : 
    4482             : #define REGEXP_REPLACE_BACKREF_CNT      10  /* pmatch[] size: whole match plus \1..\9 */
    4483             : 
    4484             : /*
    4485             :  * replace_text_regexp
    4486             :  *
    4487             :  * Replace the text in src_text that matches regexp with replace_text:
    4488             :  * every match if glob is true, otherwise only the first one.
    4489             :  * Note: to avoid having to include regex.h in builtins.h, we declare
    4490             :  * the regexp argument as void *, but really it's regex_t *.
    4491             :  */
    4492             : text *
    4493        4160 : replace_text_regexp(text *src_text, void *regexp,
    4494             :                     text *replace_text, bool glob)
    4495             : {
    4496             :     text       *ret_text;
    4497        4160 :     regex_t    *re = (regex_t *) regexp;
    4498        4160 :     int         src_text_len = VARSIZE_ANY_EXHDR(src_text);
    4499             :     StringInfoData buf;
    4500             :     regmatch_t  pmatch[REGEXP_REPLACE_BACKREF_CNT];
    4501             :     pg_wchar   *data;
    4502             :     size_t      data_len;
    4503             :     int         search_start;
    4504             :     int         data_pos;
    4505             :     char       *start_ptr;
    4506             :     bool        have_escape;
    4507             : 
    4508        4160 :     initStringInfo(&buf);
    4509             : 
    4510             :     /* Convert data string to wide characters. */
    4511        4160 :     data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
    4512        4160 :     data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
    4513             : 
    4514             :     /* Check whether replace_text has escape char. */
    4515        4160 :     have_escape = check_replace_text_has_escape_char(replace_text);
    4516             : 
    4517             :     /* start_ptr points to the data_pos'th character of src_text */
    4518        4160 :     start_ptr = (char *) VARDATA_ANY(src_text);
    4519        4160 :     data_pos = 0;
    4520             :     /* search_start, like data_pos, is a character index into data[]. */
    4521        4160 :     search_start = 0;
    4522        6358 :     while (search_start <= data_len)
    4523             :     {
    4524             :         int         regexec_result;
    4525             : 
    4526        6354 :         CHECK_FOR_INTERRUPTS();
    4527             : 
    4528        6354 :         regexec_result = pg_regexec(re,
    4529             :                                     data,
    4530             :                                     data_len,
    4531             :                                     search_start,
    4532             :                                     NULL,   /* no details */
    4533             :                                     REGEXP_REPLACE_BACKREF_CNT,
    4534             :                                     pmatch,
    4535             :                                     0);
    4536             : 
    4537        6354 :         if (regexec_result == REG_NOMATCH)
    4538        3650 :             break;
    4539             :         /* Any result other than REG_OKAY at this point is raised as an error. */
    4540        2704 :         if (regexec_result != REG_OKAY)
    4541             :         {
    4542             :             char        errMsg[100];
    4543             : 
    4544           0 :             CHECK_FOR_INTERRUPTS();
    4545           0 :             pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
    4546           0 :             ereport(ERROR,
    4547             :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    4548             :                      errmsg("regular expression failed: %s", errMsg)));
    4549             :         }
    4550             : 
    4551             :         /*
    4552             :          * Copy the text to the left of the match position.  Note we are given
    4553             :          * character not byte indexes.
    4554             :          */
    4555        2704 :         if (pmatch[0].rm_so - data_pos > 0)
    4556             :         {
    4557             :             int         chunk_len;
    4558             : 
    4559        2642 :             chunk_len = charlen_to_bytelen(start_ptr,
    4560        2642 :                                            pmatch[0].rm_so - data_pos);
    4561        2642 :             appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    4562             : 
    4563             :             /*
    4564             :              * Advance start_ptr over that text, to avoid multiple rescans of
    4565             :              * it if the replace_text contains multiple back-references.
    4566             :              */
    4567        2642 :             start_ptr += chunk_len;
    4568        2642 :             data_pos = pmatch[0].rm_so;
    4569             :         }
    4570             : 
    4571             :         /*
    4572             :          * Copy the replace_text. Process back references when the
    4573             :          * replace_text has escape characters.
    4574             :          */
    4575        2704 :         if (have_escape)
    4576          60 :             appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
    4577             :                                          start_ptr, data_pos);
    4578             :         else
    4579        2644 :             appendStringInfoText(&buf, replace_text);
    4580             : 
    4581             :         /* Advance start_ptr and data_pos over the matched text. */
    4582        5408 :         start_ptr += charlen_to_bytelen(start_ptr,
    4583        2704 :                                         pmatch[0].rm_eo - data_pos);
    4584        2704 :         data_pos = pmatch[0].rm_eo;
    4585             : 
    4586             :         /*
    4587             :          * When global option is off, replace the first instance only.
    4588             :          */
    4589        2704 :         if (!glob)
    4590         506 :             break;
    4591             : 
    4592             :         /*
    4593             :          * Advance search position.  Normally we start the next search at the
    4594             :          * end of the previous match; but if the match was of zero length, we
    4595             :          * have to advance by one character, or we'd just find the same match
    4596             :          * again.
    4597             :          */
    4598        2198 :         search_start = data_pos;
    4599        2198 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    4600           8 :             search_start++;
    4601             :     }
    4602             : 
    4603             :     /*
    4604             :      * Copy the text to the right of the last match.
    4605             :      */
    4606        4160 :     if (data_pos < data_len)
    4607             :     {
    4608             :         int         chunk_len;
    4609             : 
    4610        3978 :         chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
    4611        3978 :         appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    4612             :     }
    4613             :     /* Build the result text, then release the output and wide-char buffers. */
    4614        4160 :     ret_text = cstring_to_text_with_len(buf.data, buf.len);
    4615        4160 :     pfree(buf.data);
    4616        4160 :     pfree(data);
    4617             : 
    4618        4160 :     return ret_text;
    4619             : }
    4620             : 
    4621             : /*
    4622             :  * split_part
    4623             :  * Split the input string (arg 0) at each occurrence of the separator (arg 1)
    4624             :  * and return the N'th field (arg 2; 1 based, negative counts from end).
    4625             :  */
    4626             : Datum
    4627          68 : split_part(PG_FUNCTION_ARGS)
    4628             : {
    4629          68 :     text       *inputstring = PG_GETARG_TEXT_PP(0);
    4630          68 :     text       *fldsep = PG_GETARG_TEXT_PP(1);
    4631          68 :     int         fldnum = PG_GETARG_INT32(2);
    4632             :     int         inputstring_len;
    4633             :     int         fldsep_len;
    4634             :     TextPositionState state;
    4635             :     char       *start_ptr;
    4636             :     char       *end_ptr;
    4637             :     text       *result_text;
    4638             :     bool        found;
    4639             : 
    4640             :     /* field number is 1 based */
    4641          68 :     if (fldnum == 0)
    4642           4 :         ereport(ERROR,
    4643             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4644             :                  errmsg("field position must not be zero")));
    4645             : 
    4646          64 :     inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    4647          64 :     fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    4648             : 
    4649             :     /* return empty string for empty input string */
    4650          64 :     if (inputstring_len < 1)
    4651           8 :         PG_RETURN_TEXT_P(cstring_to_text(""));
    4652             : 
    4653             :     /* handle empty field separator */
    4654          56 :     if (fldsep_len < 1)
    4655             :     {
    4656             :         /* if first or last field, return input string, else empty string */
    4657          16 :         if (fldnum == 1 || fldnum == -1)
    4658           8 :             PG_RETURN_TEXT_P(inputstring);
    4659             :         else
    4660           8 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    4661             :     }
    4662             : 
    4663             :     /* find the first field separator */
    4664          40 :     text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
    4665             : 
    4666          40 :     found = text_position_next(&state);
    4667             : 
    4668             :     /* special case if fldsep not found at all */
    4669          40 :     if (!found)
    4670             :     {
    4671           8 :         text_position_cleanup(&state);
    4672             :         /* if first or last field, return input string, else empty string */
    4673           8 :         if (fldnum == 1 || fldnum == -1)
    4674           4 :             PG_RETURN_TEXT_P(inputstring);
    4675             :         else
    4676           4 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    4677             :     }
    4678             : 
    4679             :     /*
    4680             :      * take care of a negative field number (i.e. count from the right) by
    4681             :      * converting to a positive field number; we need total number of fields
    4682             :      */
    4683          32 :     if (fldnum < 0)
    4684             :     {
    4685             :         /* we found a fldsep, so there are at least two fields */
    4686          16 :         int         numfields = 2;
    4687             :         /* each further separator found adds one more field */
    4688          24 :         while (text_position_next(&state))
    4689           8 :             numfields++;
    4690             : 
    4691             :         /* special case of last field does not require an extra pass */
    4692          16 :         if (fldnum == -1)
    4693             :         {
    4694           4 :             start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
    4695           4 :             end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
    4696           4 :             text_position_cleanup(&state);
    4697           4 :             PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
    4698             :                                                       end_ptr - start_ptr));
    4699             :         }
    4700             : 
    4701             :         /* else, convert fldnum to positive notation */
    4702          12 :         fldnum += numfields + 1;
    4703             : 
    4704             :         /* if nonexistent field, return empty string */
    4705          12 :         if (fldnum <= 0)
    4706             :         {
    4707           4 :             text_position_cleanup(&state);
    4708           4 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    4709             :         }
    4710             : 
    4711             :         /* reset to pointing at first match, but now with positive fldnum */
    4712           8 :         text_position_reset(&state);
    4713           8 :         found = text_position_next(&state);
    4714             :         Assert(found);
    4715             :     }
    4716             : 
    4717             :     /* identify bounds of first field */
    4718          24 :     start_ptr = VARDATA_ANY(inputstring);
    4719          24 :     end_ptr = text_position_get_match_ptr(&state);
    4720             :     /* Move on one field per pass until fldnum hits zero or separators run out. */
    4721          44 :     while (found && --fldnum > 0)
    4722             :     {
    4723             :         /* identify bounds of next field */
    4724          20 :         start_ptr = end_ptr + fldsep_len;
    4725          20 :         found = text_position_next(&state);
    4726          20 :         if (found)
    4727          12 :             end_ptr = text_position_get_match_ptr(&state);
    4728             :     }
    4729             : 
    4730          24 :     text_position_cleanup(&state);
    4731             :     /* fldnum still positive: the loop stopped because no separator was left. */
    4732          24 :     if (fldnum > 0)
    4733             :     {
    4734             :         /* N'th field separator not found */
    4735             :         /* if last field requested, return it, else empty string */
    4736           8 :         if (fldnum == 1)
    4737             :         {
    4738           4 :             int         last_len = start_ptr - VARDATA_ANY(inputstring);
    4739             : 
    4740           4 :             result_text = cstring_to_text_with_len(start_ptr,
    4741             :                                                    inputstring_len - last_len);
    4742             :         }
    4743             :         else
    4744           4 :             result_text = cstring_to_text("");
    4745             :     }
    4746             :     else
    4747             :     {
    4748             :         /* non-last field requested */
    4749          16 :         result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
    4750             :     }
    4751             : 
    4752          24 :     PG_RETURN_TEXT_P(result_text);
    4753             : }
    4754             : 
    4755             : /*
    4756             :  * Return true if txt1 and txt2 are equal per texteq() under collation collid.
    4757             :  */
    4758             : static bool
    4759         112 : text_isequal(text *txt1, text *txt2, Oid collid)
    4760             : {
    4761         112 :     return DatumGetBool(DirectFunctionCall2Coll(texteq,
    4762             :                                                 collid,
    4763             :                                                 PointerGetDatum(txt1),
    4764             :                                                 PointerGetDatum(txt2)));
    4765             : }
    4766             : 
    4767             : /*
    4768             :  * text_to_array
    4769             :  * Parse the input string and return a text array of its elements, based on
    4770             :  * the provided field separator; the work is delegated to split_text().
    4771             :  */
    4772             : Datum
    4773          72 : text_to_array(PG_FUNCTION_ARGS)
    4774             : {
    4775             :     SplitTextOutputData tstate;
    4776             : 
    4777             :     /* For array output, tstate should start as all zeroes */
    4778          72 :     memset(&tstate, 0, sizeof(tstate));
    4779             :     /* A false result from split_text() becomes an SQL NULL. */
    4780          72 :     if (!split_text(fcinfo, &tstate))
    4781           4 :         PG_RETURN_NULL();
    4782             :     /* No array state was accumulated: return an empty text array. */
    4783          68 :     if (tstate.astate == NULL)
    4784           4 :         PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
    4785             :     /* Otherwise build the array result in the current memory context. */
    4786          64 :     PG_RETURN_ARRAYTYPE_P(makeArrayResult(tstate.astate,
    4787             :                                           CurrentMemoryContext));
    4788             : }
    4789             : 
    4790             : /*
    4791             :  * text_to_array_null
    4792             :  * Parse input string and return text array of elements, based on provided
    4793             :  * field separator and null string.  Simply forwards fcinfo to text_to_array().
    4794             :  *
    4795             :  * This is a separate entry point only to prevent the regression tests from
    4796             :  * complaining about different argument sets for the same internal function.
    4797             :  */
    4798             : Datum
    4799          16 : text_to_array_null(PG_FUNCTION_ARGS)
    4800             : {
    4801          16 :     return text_to_array(fcinfo);
    4802             : }
    4803             : 
    4804             : /*
    4805             :  * text_to_table
    4806             :  * parse input string and return table of elements,
    4807             :  * based on provided field separator
    4808             :  */
    4809             : Datum
    4810          56 : text_to_table(PG_FUNCTION_ARGS)
    4811             : {
    4812          56 :     ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
    4813             :     SplitTextOutputData tstate;
    4814             :     MemoryContext old_cxt;
    4815             : 
    4816             :     /* check to see if caller supports us returning a tuplestore */
    4817          56 :     if (rsi == NULL || !IsA(rsi, ReturnSetInfo))
    4818           0 :         ereport(ERROR,
    4819             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    4820             :                  errmsg("set-valued function called in context that cannot accept a set")));
    4821          56 :     if (!(rsi->allowedModes & SFRM_Materialize))
    4822           0 :         ereport(ERROR,
    4823             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    4824             :                  errmsg("materialize mode required, but it is not allowed in this context")));
    4825             : 
    4826             :     /* OK, prepare tuplestore in per-query memory */
    4827          56 :     old_cxt = MemoryContextSwitchTo(rsi->econtext->ecxt_per_query_memory);
    4828             : 
    4829          56 :     tstate.astate = NULL;
    4830          56 :     tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc);
    4831          56 :     tstate.tupstore = tuplestore_begin_heap(true, false, work_mem);
    4832             : 
    4833          56 :     MemoryContextSwitchTo(old_cxt);
    4834             : 
    4835          56 :     (void) split_text(fcinfo, &tstate);
    4836             : 
    4837             :     tuplestore_donestoring(tstate.tupstore);
    4838             : 
    4839          56 :     rsi->returnMode = SFRM_Materialize;
    4840          56 :     rsi->setResult = tstate.tupstore;
    4841          56 :     rsi->setDesc = tstate.tupdesc;
    4842             : 
    4843          56 :     return (Datum) 0;
    4844             : }
    4845             : 
    4846             : /*
    4847             :  * text_to_table_null
    4848             :  * parse input string and return table of elements,
    4849             :  * based on provided field separator and null string
    4850             :  *
    4851             :  * This is a separate entry point only to prevent the regression tests from
    4852             :  * complaining about different argument sets for the same internal function.
    4853             :  */
    4854             : Datum
    4855          16 : text_to_table_null(PG_FUNCTION_ARGS)
    4856             : {
    4857          16 :     return text_to_table(fcinfo);
    4858             : }
    4859             : 
    4860             : /*
    4861             :  * Common code for text_to_array, text_to_array_null, text_to_table
    4862             :  * and text_to_table_null functions.
    4863             :  *
    4864             :  * These are not strict so we have to test for null inputs explicitly.
    4865             :  * Returns false if result is to be null, else returns true.
    4866             :  *
    4867             :  * Note that if the result is valid but empty (zero elements), we return
    4868             :  * without changing *tstate --- caller must handle that case, too.
    4869             :  */
    4870             : static bool
    4871         128 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
    4872             : {
    4873             :     text       *inputstring;
    4874             :     text       *fldsep;
    4875             :     text       *null_string;
    4876         128 :     Oid         collation = PG_GET_COLLATION();
    4877             :     int         inputstring_len;
    4878             :     int         fldsep_len;
    4879             :     char       *start_ptr;
    4880             :     text       *result_text;
    4881             : 
    4882             :     /* when input string is NULL, then result is NULL too */
    4883         128 :     if (PG_ARGISNULL(0))
    4884           8 :         return false;
    4885             : 
    4886         120 :     inputstring = PG_GETARG_TEXT_PP(0);
    4887             : 
    4888             :     /* fldsep can be NULL */
    4889         120 :     if (!PG_ARGISNULL(1))
    4890         112 :         fldsep = PG_GETARG_TEXT_PP(1);
    4891             :     else
    4892           8 :         fldsep = NULL;
    4893             : 
    4894             :     /* null_string can be NULL or omitted */
    4895         120 :     if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
    4896          32 :         null_string = PG_GETARG_TEXT_PP(2);
    4897             :     else
    4898          88 :         null_string = NULL;
    4899             : 
    4900         120 :     if (fldsep != NULL)
    4901             :     {
    4902             :         /*
    4903             :          * Normal case with non-null fldsep.  Use the text_position machinery
    4904             :          * to search for occurrences of fldsep.
    4905             :          */
    4906             :         TextPositionState state;
    4907             : 
    4908         112 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    4909         112 :         fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    4910             : 
    4911             :         /* return empty set for empty input string */
    4912         112 :         if (inputstring_len < 1)
    4913          40 :             return true;
    4914             : 
    4915             :         /* empty field separator: return input string as a one-element set */
    4916         104 :         if (fldsep_len < 1)
    4917             :         {
    4918          32 :             split_text_accum_result(tstate, inputstring,
    4919             :                                     null_string, collation);
    4920          32 :             return true;
    4921             :         }
    4922             : 
    4923          72 :         text_position_setup(inputstring, fldsep, collation, &state);
    4924             : 
    4925          72 :         start_ptr = VARDATA_ANY(inputstring);
    4926             : 
    4927             :         for (;;)
    4928         296 :         {
    4929             :             bool        found;
    4930             :             char       *end_ptr;
    4931             :             int         chunk_len;
    4932             : 
    4933         368 :             CHECK_FOR_INTERRUPTS();
    4934             : 
    4935         368 :             found = text_position_next(&state);
    4936         368 :             if (!found)
    4937             :             {
    4938             :                 /* fetch last field */
    4939          72 :                 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
    4940          72 :                 end_ptr = NULL; /* not used, but some compilers complain */
    4941             :             }
    4942             :             else
    4943             :             {
    4944             :                 /* fetch non-last field */
    4945         296 :                 end_ptr = text_position_get_match_ptr(&state);
    4946         296 :                 chunk_len = end_ptr - start_ptr;
    4947             :             }
    4948             : 
    4949             :             /* build a temp text datum to pass to split_text_accum_result */
    4950         368 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    4951             : 
    4952             :             /* stash away this field */
    4953         368 :             split_text_accum_result(tstate, result_text,
    4954             :                                     null_string, collation);
    4955             : 
    4956         368 :             pfree(result_text);
    4957             : 
    4958         368 :             if (!found)
    4959          72 :                 break;
    4960             : 
    4961         296 :             start_ptr = end_ptr + fldsep_len;
    4962             :         }
    4963             : 
    4964          72 :         text_position_cleanup(&state);
    4965             :     }
    4966             :     else
    4967             :     {
    4968             :         /*
    4969             :          * When fldsep is NULL, each character in the input string becomes a
    4970             :          * separate element in the result set.  The separator is effectively
    4971             :          * the space between characters.
    4972             :          */
    4973           8 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    4974             : 
    4975           8 :         start_ptr = VARDATA_ANY(inputstring);
    4976             : 
    4977          48 :         while (inputstring_len > 0)
    4978             :         {
    4979          40 :             int         chunk_len = pg_mblen(start_ptr);
    4980             : 
    4981          40 :             CHECK_FOR_INTERRUPTS();
    4982             : 
    4983             :             /* build a temp text datum to pass to split_text_accum_result */
    4984          40 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    4985             : 
    4986             :             /* stash away this field */
    4987          40 :             split_text_accum_result(tstate, result_text,
    4988             :                                     null_string, collation);
    4989             : 
    4990          40 :             pfree(result_text);
    4991             : 
    4992          40 :             start_ptr += chunk_len;
    4993          40 :             inputstring_len -= chunk_len;
    4994             :         }
    4995             :     }
    4996             : 
    4997          80 :     return true;
    4998             : }
    4999             : 
    5000             : /*
    5001             :  * Add text item to result set (table or array).
    5002             :  *
    5003             :  * This is also responsible for checking to see if the item matches
    5004             :  * the null_string, in which case we should emit NULL instead.
    5005             :  */
    5006             : static void
    5007         440 : split_text_accum_result(SplitTextOutputData *tstate,
    5008             :                         text *field_value,
    5009             :                         text *null_string,
    5010             :                         Oid collation)
    5011             : {
    5012         440 :     bool        is_null = false;
    5013             : 
    5014         440 :     if (null_string && text_isequal(field_value, null_string, collation))
    5015          32 :         is_null = true;
    5016             : 
    5017         440 :     if (tstate->tupstore)
    5018             :     {
    5019             :         Datum       values[1];
    5020             :         bool        nulls[1];
    5021             : 
    5022         152 :         values[0] = PointerGetDatum(field_value);
    5023         152 :         nulls[0] = is_null;
    5024             : 
    5025         152 :         tuplestore_putvalues(tstate->tupstore,
    5026             :                              tstate->tupdesc,
    5027             :                              values,
    5028             :                              nulls);
    5029             :     }
    5030             :     else
    5031             :     {
    5032         288 :         tstate->astate = accumArrayResult(tstate->astate,
    5033             :                                           PointerGetDatum(field_value),
    5034             :                                           is_null,
    5035             :                                           TEXTOID,
    5036             :                                           CurrentMemoryContext);
    5037             :     }
    5038         440 : }
    5039             : 
    5040             : /*
    5041             :  * array_to_text
    5042             :  * concatenate Cstring representation of input array elements
    5043             :  * using provided field separator
    5044             :  */
    5045             : Datum
    5046       36410 : array_to_text(PG_FUNCTION_ARGS)
    5047             : {
    5048       36410 :     ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
    5049       36410 :     char       *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    5050             : 
    5051       36410 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
    5052             : }
    5053             : 
    5054             : /*
    5055             :  * array_to_text_null
    5056             :  * concatenate Cstring representation of input array elements
    5057             :  * using provided field separator and null string
    5058             :  *
    5059             :  * This version is not strict so we have to test for null inputs explicitly.
    5060             :  */
    5061             : Datum
    5062           8 : array_to_text_null(PG_FUNCTION_ARGS)
    5063             : {
    5064             :     ArrayType  *v;
    5065             :     char       *fldsep;
    5066             :     char       *null_string;
    5067             : 
    5068             :     /* returns NULL when first or second parameter is NULL */
    5069           8 :     if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
    5070           0 :         PG_RETURN_NULL();
    5071             : 
    5072           8 :     v = PG_GETARG_ARRAYTYPE_P(0);
    5073           8 :     fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    5074             : 
    5075             :     /* NULL null string is passed through as a null pointer */
    5076           8 :     if (!PG_ARGISNULL(2))
    5077           4 :         null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
    5078             :     else
    5079           4 :         null_string = NULL;
    5080             : 
    5081           8 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
    5082             : }
    5083             : 
    5084             : /*
    5085             :  * common code for array_to_text and array_to_text_null functions
    5086             :  */
    5087             : static text *
    5088       36430 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
    5089             :                        const char *fldsep, const char *null_string)
    5090             : {
    5091             :     text       *result;
    5092             :     int         nitems,
    5093             :                *dims,
    5094             :                 ndims;
    5095             :     Oid         element_type;
    5096             :     int         typlen;
    5097             :     bool        typbyval;
    5098             :     char        typalign;
    5099             :     StringInfoData buf;
    5100       36430 :     bool        printed = false;
    5101             :     char       *p;
    5102             :     bits8      *bitmap;
    5103             :     int         bitmask;
    5104             :     int         i;
    5105             :     ArrayMetaState *my_extra;
    5106             : 
    5107       36430 :     ndims = ARR_NDIM(v);
    5108       36430 :     dims = ARR_DIMS(v);
    5109       36430 :     nitems = ArrayGetNItems(ndims, dims);
    5110             : 
    5111             :     /* if there are no elements, return an empty string */
    5112       36430 :     if (nitems == 0)
    5113       21138 :         return cstring_to_text_with_len("", 0);
    5114             : 
    5115       15292 :     element_type = ARR_ELEMTYPE(v);
    5116       15292 :     initStringInfo(&buf);
    5117             : 
    5118             :     /*
    5119             :      * We arrange to look up info about element type, including its output
    5120             :      * conversion proc, only once per series of calls, assuming the element
    5121             :      * type doesn't change underneath us.
    5122             :      */
    5123       15292 :     my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    5124       15292 :     if (my_extra == NULL)
    5125             :     {
    5126         936 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    5127             :                                                       sizeof(ArrayMetaState));
    5128         936 :         my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    5129         936 :         my_extra->element_type = ~element_type;
    5130             :     }
    5131             : 
    5132       15292 :     if (my_extra->element_type != element_type)
    5133             :     {
    5134             :         /*
    5135             :          * Get info about element type, including its output conversion proc
    5136             :          */
    5137         936 :         get_type_io_data(element_type, IOFunc_output,
    5138             :                          &my_extra->typlen, &my_extra->typbyval,
    5139             :                          &my_extra->typalign, &my_extra->typdelim,
    5140             :                          &my_extra->typioparam, &my_extra->typiofunc);
    5141         936 :         fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
    5142         936 :                       fcinfo->flinfo->fn_mcxt);
    5143         936 :         my_extra->element_type = element_type;
    5144             :     }
    5145       15292 :     typlen = my_extra->typlen;
    5146       15292 :     typbyval = my_extra->typbyval;
    5147       15292 :     typalign = my_extra->typalign;
    5148             : 
    5149       15292 :     p = ARR_DATA_PTR(v);
    5150       15292 :     bitmap = ARR_NULLBITMAP(v);
    5151       15292 :     bitmask = 1;
    5152             : 
    5153       51700 :     for (i = 0; i < nitems; i++)
    5154             :     {
    5155             :         Datum       itemvalue;
    5156             :         char       *value;
    5157             : 
    5158             :         /* Get source element, checking for NULL */
    5159       36408 :         if (bitmap && (*bitmap & bitmask) == 0)
    5160             :         {
    5161             :             /* if null_string is NULL, we just ignore null elements */
    5162          16 :             if (null_string != NULL)
    5163             :             {
    5164           4 :                 if (printed)
    5165           4 :                     appendStringInfo(&buf, "%s%s", fldsep, null_string);
    5166             :                 else
    5167           0 :                     appendStringInfoString(&buf, null_string);
    5168           4 :                 printed = true;
    5169             :             }
    5170             :         }
    5171             :         else
    5172             :         {
    5173       36396 :             itemvalue = fetch_att(p, typbyval, typlen);
    5174             : 
    5175       36396 :             value = OutputFunctionCall(&my_extra->proc, itemvalue);
    5176             : 
    5177       36396 :             if (printed)
    5178       21104 :                 appendStringInfo(&buf, "%s%s", fldsep, value);
    5179             :             else
    5180       15292 :                 appendStringInfoString(&buf, value);
    5181       36396 :             printed = true;
    5182             : 
    5183       36396 :             p = att_addlength_pointer(p, typlen, p);
    5184       36396 :             p = (char *) att_align_nominal(p, typalign);
    5185             :         }
    5186             : 
    5187             :         /* advance bitmap pointer if any */
    5188       36408 :         if (bitmap)
    5189             :         {
    5190          72 :             bitmask <<= 1;
    5191          72 :             if (bitmask == 0x100)
    5192             :             {
    5193           0 :                 bitmap++;
    5194           0 :                 bitmask = 1;
    5195             :             }
    5196             :         }
    5197             :     }
    5198             : 
    5199       15292 :     result = cstring_to_text_with_len(buf.data, buf.len);
    5200       15292 :     pfree(buf.data);
    5201             : 
    5202       15292 :     return result;
    5203             : }
    5204             : 
    5205             : #define HEXBASE 16
    5206             : /*
    5207             :  * Convert an int32 to a string containing a base 16 (hex) representation of
    5208             :  * the number.
    5209             :  */
    5210             : Datum
    5211       34056 : to_hex32(PG_FUNCTION_ARGS)
    5212             : {
    5213       34056 :     uint32      value = (uint32) PG_GETARG_INT32(0);
    5214             :     char       *ptr;
    5215       34056 :     const char *digits = "0123456789abcdef";
    5216             :     char        buf[32];        /* bigger than needed, but reasonable */
    5217             : 
    5218       34056 :     ptr = buf + sizeof(buf) - 1;
    5219       34056 :     *ptr = '\0';
    5220             : 
    5221             :     do
    5222             :     {
    5223       65896 :         *--ptr = digits[value % HEXBASE];
    5224       65896 :         value /= HEXBASE;
    5225       65896 :     } while (ptr > buf && value);
    5226             : 
    5227       34056 :     PG_RETURN_TEXT_P(cstring_to_text(ptr));
    5228             : }
    5229             : 
    5230             : /*
    5231             :  * Convert an int64 to a string containing a base 16 (hex) representation of
    5232             :  * the number.
    5233             :  */
    5234             : Datum
    5235           4 : to_hex64(PG_FUNCTION_ARGS)
    5236             : {
    5237           4 :     uint64      value = (uint64) PG_GETARG_INT64(0);
    5238             :     char       *ptr;
    5239           4 :     const char *digits = "0123456789abcdef";
    5240             :     char        buf[32];        /* bigger than needed, but reasonable */
    5241             : 
    5242           4 :     ptr = buf + sizeof(buf) - 1;
    5243           4 :     *ptr = '\0';
    5244             : 
    5245             :     do
    5246             :     {
    5247          32 :         *--ptr = digits[value % HEXBASE];
    5248          32 :         value /= HEXBASE;
    5249          32 :     } while (ptr > buf && value);
    5250             : 
    5251           4 :     PG_RETURN_TEXT_P(cstring_to_text(ptr));
    5252             : }
    5253             : 
    5254             : /*
    5255             :  * Return the size of a datum, possibly compressed
    5256             :  *
    5257             :  * Works on any data type
    5258             :  */
    5259             : Datum
    5260         102 : pg_column_size(PG_FUNCTION_ARGS)
    5261             : {
    5262         102 :     Datum       value = PG_GETARG_DATUM(0);
    5263             :     int32       result;
    5264             :     int         typlen;
    5265             : 
    5266             :     /* On first call, get the input type's typlen, and save at *fn_extra */
    5267         102 :     if (fcinfo->flinfo->fn_extra == NULL)
    5268             :     {
    5269             :         /* Lookup the datatype of the supplied argument */
    5270         102 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    5271             : 
    5272         102 :         typlen = get_typlen(argtypeid);
    5273         102 :         if (typlen == 0)        /* should not happen */
    5274           0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    5275             : 
    5276         102 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    5277             :                                                       sizeof(int));
    5278         102 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    5279             :     }
    5280             :     else
    5281           0 :         typlen = *((int *) fcinfo->flinfo->fn_extra);
    5282             : 
    5283         102 :     if (typlen == -1)
    5284             :     {
    5285             :         /* varlena type, possibly toasted */
    5286         102 :         result = toast_datum_size(value);
    5287             :     }
    5288           0 :     else if (typlen == -2)
    5289             :     {
    5290             :         /* cstring */
    5291           0 :         result = strlen(DatumGetCString(value)) + 1;
    5292             :     }
    5293             :     else
    5294             :     {
    5295             :         /* ordinary fixed-width type */
    5296           0 :         result = typlen;
    5297             :     }
    5298             : 
    5299         102 :     PG_RETURN_INT32(result);
    5300             : }
    5301             : 
    5302             : /*
    5303             :  * string_agg - Concatenates values and returns string.
    5304             :  *
    5305             :  * Syntax: string_agg(value text, delimiter text) RETURNS text
    5306             :  *
    5307             :  * Note: Any NULL values are ignored. The first-call delimiter isn't
    5308             :  * actually used at all, and on subsequent calls the delimiter precedes
    5309             :  * the associated value.
    5310             :  */
    5311             : 
    5312             : /* subroutine to initialize state */
    5313             : static StringInfo
    5314         904 : makeStringAggState(FunctionCallInfo fcinfo)
    5315             : {
    5316             :     StringInfo  state;
    5317             :     MemoryContext aggcontext;
    5318             :     MemoryContext oldcontext;
    5319             : 
    5320         904 :     if (!AggCheckCallContext(fcinfo, &aggcontext))
    5321             :     {
    5322             :         /* cannot be called directly because of internal-type argument */
    5323           0 :         elog(ERROR, "string_agg_transfn called in non-aggregate context");
    5324             :     }
    5325             : 
    5326             :     /*
    5327             :      * Create state in aggregate context.  It'll stay there across subsequent
    5328             :      * calls.
    5329             :      */
    5330         904 :     oldcontext = MemoryContextSwitchTo(aggcontext);
    5331         904 :     state = makeStringInfo();
    5332         904 :     MemoryContextSwitchTo(oldcontext);
    5333             : 
    5334         904 :     return state;
    5335             : }
    5336             : 
    5337             : Datum
    5338      615186 : string_agg_transfn(PG_FUNCTION_ARGS)
    5339             : {
    5340             :     StringInfo  state;
    5341             : 
    5342      615186 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    5343             : 
    5344             :     /* Append the value unless null. */
    5345      615186 :     if (!PG_ARGISNULL(1))
    5346             :     {
    5347             :         /* On the first time through, we ignore the delimiter. */
    5348      615154 :         if (state == NULL)
    5349         886 :             state = makeStringAggState(fcinfo);
    5350      614268 :         else if (!PG_ARGISNULL(2))
    5351      614268 :             appendStringInfoText(state, PG_GETARG_TEXT_PP(2));  /* delimiter */
    5352             : 
    5353      615154 :         appendStringInfoText(state, PG_GETARG_TEXT_PP(1));  /* value */
    5354             :     }
    5355             : 
    5356             :     /*
    5357             :      * The transition type for string_agg() is declared to be "internal",
    5358             :      * which is a pass-by-value type the same size as a pointer.
    5359             :      */
    5360      615186 :     PG_RETURN_POINTER(state);
    5361             : }
    5362             : 
    5363             : Datum
    5364         926 : string_agg_finalfn(PG_FUNCTION_ARGS)
    5365             : {
    5366             :     StringInfo  state;
    5367             : 
    5368             :     /* cannot be called directly because of internal-type argument */
    5369             :     Assert(AggCheckCallContext(fcinfo, NULL));
    5370             : 
    5371         926 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    5372             : 
    5373         926 :     if (state != NULL)
    5374         886 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
    5375             :     else
    5376          40 :         PG_RETURN_NULL();
    5377             : }
    5378             : 
    5379             : /*
    5380             :  * Prepare cache with fmgr info for the output functions of the datatypes of
    5381             :  * the arguments of a concat-like function, beginning with argument "argidx".
    5382             :  * (Arguments before that will have corresponding slots in the resulting
    5383             :  * FmgrInfo array, but we don't fill those slots.)
    5384             :  */
    5385             : static FmgrInfo *
    5386          24 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
    5387             : {
    5388             :     FmgrInfo   *foutcache;
    5389             :     int         i;
    5390             : 
    5391             :     /* We keep the info in fn_mcxt so it survives across calls */
    5392          24 :     foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    5393          24 :                                                 PG_NARGS() * sizeof(FmgrInfo));
    5394             : 
    5395         120 :     for (i = argidx; i < PG_NARGS(); i++)
    5396             :     {
    5397             :         Oid         valtype;
    5398             :         Oid         typOutput;
    5399             :         bool        typIsVarlena;
    5400             : 
    5401          96 :         valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
    5402          96 :         if (!OidIsValid(valtype))
    5403           0 :             elog(ERROR, "could not determine data type of concat() input");
    5404             : 
    5405          96 :         getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
    5406          96 :         fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
    5407             :     }
    5408             : 
    5409          24 :     fcinfo->flinfo->fn_extra = foutcache;
    5410             : 
    5411          24 :     return foutcache;
    5412             : }
    5413             : 
    5414             : /*
    5415             :  * Implementation of both concat() and concat_ws().
    5416             :  *
    5417             :  * sepstr is the separator string to place between values.
    5418             :  * argidx identifies the first argument to concatenate (counting from zero);
    5419             :  * note that this must be constant across any one series of calls.
                     :  * (The output-function cache stored in flinfo->fn_extra is built on the
                     :  * first call using that argidx and is reused as-is afterwards.)
                     :  * fcinfo is the calling function's call info, which supplies the arguments.
    5420             :  *
    5421             :  * Returns NULL if result should be NULL, else text value.
                     :  * Within this function, NULL is returned only for a VARIADIC call whose
                     :  * array argument is NULL; a VARIADIC call with a non-null array returns
                     :  * whatever array_to_text_internal() returns.  In the non-VARIADIC path
                     :  * null arguments are skipped (no separator is emitted for them) and the
                     :  * result is always a text value built from the accumulated buffer.
    5422             :  */
    5423             : static text *
    5424          44 : concat_internal(const char *sepstr, int argidx,
    5425             :                 FunctionCallInfo fcinfo)
    5426             : {
    5427             :     text       *result;
    5428             :     StringInfoData str;     /* accumulates the output in the non-VARIADIC path */
    5429             :     FmgrInfo   *foutcache;  /* per-argument output function info */
    5430          44 :     bool        first_arg = true;   /* true until the first non-null value is appended */
    5431             :     int         i;
    5432             : 
    5433             :     /*
    5434             :      * concat(VARIADIC some-array) is essentially equivalent to
    5435             :      * array_to_text(), ie concat the array elements with the given separator.
    5436             :      * So we just pass the case off to that code.
    5437             :      */
    5438          44 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    5439             :     {
    5440             :         ArrayType  *arr;
    5441             : 
    5442             :         /* Should have just the one argument */
    5443             :         Assert(argidx == PG_NARGS() - 1);
    5444             : 
    5445             :         /* concat(VARIADIC NULL) is defined as NULL */
    5446          20 :         if (PG_ARGISNULL(argidx))
    5447           8 :             return NULL;
    5448             : 
    5449             :         /*
    5450             :          * Non-null argument had better be an array.  We assume that any call
    5451             :          * context that could let get_fn_expr_variadic return true will have
    5452             :          * checked that a VARIADIC-labeled parameter actually is an array.  So
    5453             :          * it should be okay to just Assert that it's an array rather than
    5454             :          * doing a full-fledged error check.
    5455             :          */
    5456             :         Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
    5457             : 
    5458             :         /* OK, safe to fetch the array value */
    5459          12 :         arr = PG_GETARG_ARRAYTYPE_P(argidx);
    5460             : 
    5461             :         /*
    5462             :          * And serialize the array.  We tell array_to_text to ignore null
    5463             :          * elements, which matches the behavior of the loop below.
    5464             :          */
    5465          12 :         return array_to_text_internal(fcinfo, arr, sepstr, NULL);
    5466             :     }
    5467             : 
    5468             :     /* Normal case without explicit VARIADIC marker */
    5469          24 :     initStringInfo(&str);
    5470             : 
    5471             :     /* Get output function info, building it if first time through */
    5472          24 :     foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
    5473          24 :     if (foutcache == NULL)
    5474          24 :         foutcache = build_concat_foutcache(fcinfo, argidx);
    5475             : 
    5476         120 :     for (i = argidx; i < PG_NARGS(); i++)
    5477             :     {
    5478          96 :         if (!PG_ARGISNULL(i))
    5479             :         {
    5480          88 :             Datum       value = PG_GETARG_DATUM(i);
    5481             : 
    5482             :             /* add separator if appropriate */
    5483          88 :             if (first_arg)
    5484          24 :                 first_arg = false;
    5485             :             else
    5486          64 :                 appendStringInfoString(&str, sepstr);
    5487             : 
    5488             :             /* call the appropriate type output function, append the result */
    5489          88 :             appendStringInfoString(&str,
    5490          88 :                                    OutputFunctionCall(&foutcache[i], value));
    5491             :         }
    5492             :     }
    5493             : 
    5494          24 :     result = cstring_to_text_with_len(str.data, str.len);
    5495          24 :     pfree(str.data);        /* buffer contents were copied into result above */
    5496             : 
    5497          24 :     return result;
    5498             : }
    5499             : 
    5500             : /*
    5501             :  * Concatenate all arguments. NULL arguments are ignored.
                     :  *
                     :  * This is a thin wrapper: it calls concat_internal() with an empty
                     :  * separator, starting from argument 0.  If concat_internal() returns
                     :  * NULL the function returns a null result; otherwise it returns the
                     :  * text value.
    5502             :  */
    5503             : Datum
    5504          20 : text_concat(PG_FUNCTION_ARGS)
    5505             : {
    5506             :     text       *result;
    5507             : 
    5508          20 :     result = concat_internal("", 0, fcinfo);
    5509          20 :     if (result == NULL)
    5510           4 :         PG_RETURN_NULL();
    5511          16 :     PG_RETURN_TEXT_P(result);
    5512             : }
    5513             : 
    5514             : /*
    5515             :  * Concatenate all but first argument value with separators. The first
    5516             :  * parameter is used as the separator. NULL arguments are ignored.
                     :  *
                     :  * A null separator yields a null result without looking at the other
                     :  * arguments.  Otherwise the separator is converted to a C string and
                     :  * concat_internal() is called starting from argument 1; a NULL return
                     :  * from it also becomes a null result.
    5517             :  */
    5518             : Datum
    5519          28 : text_concat_ws(PG_FUNCTION_ARGS)
    5520             : {
    5521             :     char       *sep;        /* separator as a C string */
    5522             :     text       *result;
    5523             : 
    5524             :     /* return NULL when separator is NULL */
    5525          28 :     if (PG_ARGISNULL(0))
    5526           4 :         PG_RETURN_NULL();
    5527          24 :     sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
    5528             : 
    5529          24 :     result = concat_internal(sep, 1, fcinfo);
    5530          24 :     if (result == NULL)
    5531           4 :         PG_RETURN_NULL();
    5532          20 :     PG_RETURN_TEXT_P(result);
    5533             : }
    5534             : 
    5535             : /*
    5536             :  * Return first n characters in the string. When n is negative,
    5537             :  * return all but last |n| characters.
                     :  *
                     :  * For n >= 0 the work is delegated to text_substring(), asking for n
                     :  * characters starting at position 1.  For n < 0 the number of characters
                     :  * to keep is computed as (character count + n), converted to a byte
                     :  * length with pg_mbcharcliplen(), and that many leading bytes are copied.
                     :  *
                     :  * NOTE(review): when |n| is at least the character count, the number of
                     :  * characters to keep is zero or negative; this presumably relies on
                     :  * pg_mbcharcliplen() returning 0 for a non-positive limit -- confirm.
    5538             :  */
    5539             : Datum
    5540        1256 : text_left(PG_FUNCTION_ARGS)
    5541             : {
    5542        1256 :     int         n = PG_GETARG_INT32(1);
    5543             : 
    5544        1256 :     if (n < 0)
    5545             :     {
    5546          20 :         text       *str = PG_GETARG_TEXT_PP(0);
    5547          20 :         const char *p = VARDATA_ANY(str);
    5548          20 :         int         len = VARSIZE_ANY_EXHDR(str);   /* input length in bytes */
    5549             :         int         rlen;   /* result length in bytes */
    5550             : 
    5551          20 :         n = pg_mbstrlen_with_len(p, len) + n;   /* n is negative: characters to keep */
    5552          20 :         rlen = pg_mbcharcliplen(p, len, n);
    5553          20 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
    5554             :     }
    5555             :     else
    5556        1236 :         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
    5557             : }
    5558             : 
    5559             : /*
    5560             :  * Return last n characters in the string. When n is negative,
    5561             :  * return all but first |n| characters.
                     :  *
                     :  * Both cases reduce to computing "n" as the number of leading characters
                     :  * to skip: |n| for negative input, or (character count - n) otherwise.
                     :  * pg_mbcharcliplen() converts that to a byte offset "off", and the result
                     :  * is the bytes from p + off to the end of the input.
                     :  *
                     :  * NOTE(review): "n = -n" overflows if n is the most negative int32, which
                     :  * is undefined behavior in ISO C; confirm whether that input needs
                     :  * special handling.  Also, when the requested n exceeds the character
                     :  * count, the skip count goes negative; this presumably relies on
                     :  * pg_mbcharcliplen() returning 0 for a non-positive limit -- confirm.
    5562             :  */
    5563             : Datum
    5564          44 : text_right(PG_FUNCTION_ARGS)
    5565             : {
    5566          44 :     text       *str = PG_GETARG_TEXT_PP(0);
    5567          44 :     const char *p = VARDATA_ANY(str);
    5568          44 :     int         len = VARSIZE_ANY_EXHDR(str);   /* input length in bytes */
    5569          44 :     int         n = PG_GETARG_INT32(1);
    5570             :     int         off;        /* byte offset where the result starts */
    5571             : 
    5572          44 :     if (n < 0)
    5573          20 :         n = -n;
    5574             :     else
    5575          24 :         n = pg_mbstrlen_with_len(p, len) - n;
    5576          44 :     off = pg_mbcharcliplen(p, len, n);
    5577             : 
    5578          44 :     PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
    5579             : }
    5580             : 
    5581             : /*
    5582             :  * Return reversed string
                     :  *
                     :  * The result has the same byte length as the input.  The input is read
                     :  * front to back while "dst" walks backward from the end of the result
                     :  * buffer.  When the database encoding can use more than one byte per
                     :  * character, each character's bytes are copied as a unit, so the order
                     :  * of characters is reversed but the bytes inside a character are not.
                     :  *
                     :  * NOTE(review): the multibyte loop trusts pg_mblen(p) not to report a
                     :  * length that runs past endp; if it did, dst would be moved before the
                     :  * start of the result data.  Confirm that the input is known to be
                     :  * validly encoded at this point.
    5583             :  */
    5584             : Datum
    5585           4 : text_reverse(PG_FUNCTION_ARGS)
    5586             : {
    5587           4 :     text       *str = PG_GETARG_TEXT_PP(0);
    5588           4 :     const char *p = VARDATA_ANY(str);
    5589           4 :     int         len = VARSIZE_ANY_EXHDR(str);   /* input length in bytes */
    5590           4 :     const char *endp = p + len;
    5591             :     text       *result;
    5592             :     char       *dst;        /* write position, moves toward the start */
    5593             : 
    5594           4 :     result = palloc(len + VARHDRSZ);
    5595           4 :     dst = (char *) VARDATA(result) + len;   /* one past the last data byte */
    5596           4 :     SET_VARSIZE(result, len + VARHDRSZ);
    5597             : 
    5598           4 :     if (pg_database_encoding_max_length() > 1)
    5599             :     {
    5600             :         /* multibyte version */
    5601          24 :         while (p < endp)
    5602             :         {
    5603             :             int         sz;     /* byte length of the character at p */
    5604             : 
    5605          20 :             sz = pg_mblen(p);
    5606          20 :             dst -= sz;
    5607          20 :             memcpy(dst, p, sz);
    5608          20 :             p += sz;
    5609             :         }
    5610             :     }
    5611             :     else
    5612             :     {
    5613             :         /* single byte version */
    5614           0 :         while (p < endp)
    5615           0 :             *(--dst) = *p++;
    5616             :     }
    5617             : 
    5618           4 :     PG_RETURN_TEXT_P(result);
    5619             : }
    5620             : 
    5621             : 
    5622             : /*
    5623             :  * Support macros for text_format()
                     :  *
                     :  * TEXT_FORMAT_FLAG_MINUS is the bit that text_format_parse_format() sets
                     :  * in its "flags" output when a '-' flag is seen in a format specifier.
                     :  *
                     :  * ADVANCE_PARSE_POINTER pre-increments "ptr" and raises an
                     :  * "unterminated format() type specifier" error if that makes it reach
                     :  * or pass "end_ptr".  So whenever control continues past the macro, ptr
                     :  * still points at a character inside the string.  The macro modifies its
                     :  * first argument, which therefore must be an lvalue.
    5624             :  */
    5625             : #define TEXT_FORMAT_FLAG_MINUS  0x0001  /* is minus flag present? */
    5626             : 
    5627             : #define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
    5628             :     do { \
    5629             :         if (++(ptr) >= (end_ptr)) \
    5630             :             ereport(ERROR, \
    5631             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
    5632             :                      errmsg("unterminated format() type specifier"), \
    5633             :                      errhint("For a single \"%%\" use \"%%%%\"."))); \
    5634             :     } while (0)
    5635             : 
    5636             : /*
    5637             :  * Returns a formatted string
                     :  *
                     :  * Argument 0 is the format text; if it is null the result is null.
                     :  * Values to be formatted come either from the remaining arguments or,
                     :  * when the call is marked VARIADIC, from the elements of the array passed
                     :  * as argument 1 (a null array is treated as having zero elements).
                     :  *
                     :  * The format string is copied to the output except for '%' sequences:
                     :  * "%%" emits a single '%', and anything else is parsed by
                     :  * text_format_parse_format() and must end in one of the conversion
                     :  * characters 's', 'I' or 'L', which are handled by
                     :  * text_format_string_conversion().
                     :  *
                     :  * "arg" tracks the next argument position to consume.  An explicit
                     :  * position in a specifier resets it, and it is advanced past each
                     :  * argument used, so later specifiers without a position continue from
                     :  * there.
                     :  *
                     :  * Errors are raised for an unterminated or unrecognized specifier, for a
                     :  * reference to an argument position beyond the supplied arguments, and
                     :  * when an argument's data type cannot be determined.
    5638             :  */
    5639             : Datum
    5640        9030 : text_format(PG_FUNCTION_ARGS)
    5641             : {
    5642             :     text       *fmt;
    5643             :     StringInfoData str;     /* output buffer */
    5644             :     const char *cp;
    5645             :     const char *start_ptr;
    5646             :     const char *end_ptr;
    5647             :     text       *result;
    5648             :     int         arg;
    5649             :     bool        funcvariadic;
    5650             :     int         nargs;      /* number of argument positions, counting the format as position 0 */
    5651        9030 :     Datum      *elements = NULL;
    5652        9030 :     bool       *nulls = NULL;
    5653        9030 :     Oid         element_type = InvalidOid;
    5654        9030 :     Oid         prev_type = InvalidOid; /* type that typoutputfinfo was last set up for */
    5655        9030 :     Oid         prev_width_type = InvalidOid;   /* likewise for typoutputinfo_width */
    5656             :     FmgrInfo    typoutputfinfo;
    5657             :     FmgrInfo    typoutputinfo_width;
    5658             : 
    5659             :     /* When format string is null, immediately return null */
    5660        9030 :     if (PG_ARGISNULL(0))
    5661           4 :         PG_RETURN_NULL();
    5662             : 
    5663             :     /* If argument is marked VARIADIC, expand array into elements */
    5664        9026 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    5665             :     {
    5666             :         ArrayType  *arr;
    5667             :         int16       elmlen;
    5668             :         bool        elmbyval;
    5669             :         char        elmalign;
    5670             :         int         nitems;
    5671             : 
    5672             :         /* Should have just the one argument */
    5673             :         Assert(PG_NARGS() == 2);
    5674             : 
    5675             :         /* If argument is NULL, we treat it as zero-length array */
    5676          32 :         if (PG_ARGISNULL(1))
    5677           4 :             nitems = 0;
    5678             :         else
    5679             :         {
    5680             :             /*
    5681             :              * Non-null argument had better be an array.  We assume that any
    5682             :              * call context that could let get_fn_expr_variadic return true
    5683             :              * will have checked that a VARIADIC-labeled parameter actually is
    5684             :              * an array.  So it should be okay to just Assert that it's an
    5685             :              * array rather than doing a full-fledged error check.
    5686             :              */
    5687             :             Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
    5688             : 
    5689             :             /* OK, safe to fetch the array value */
    5690          28 :             arr = PG_GETARG_ARRAYTYPE_P(1);
    5691             : 
    5692             :             /* Get info about array element type */
    5693          28 :             element_type = ARR_ELEMTYPE(arr);
    5694          28 :             get_typlenbyvalalign(element_type,
    5695             :                                  &elmlen, &elmbyval, &elmalign);
    5696             : 
    5697             :             /* Extract all array elements */
    5698          28 :             deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
    5699             :                               &elements, &nulls, &nitems);
    5700             :         }
    5701             : 
    5702          32 :         nargs = nitems + 1;     /* +1 so positions line up with the non-variadic case */
    5703          32 :         funcvariadic = true;
    5704             :     }
    5705             :     else
    5706             :     {
    5707             :         /* Non-variadic case, we'll process the arguments individually */
    5708        8994 :         nargs = PG_NARGS();
    5709        8994 :         funcvariadic = false;
    5710             :     }
    5711             : 
    5712             :     /* Setup for main loop. */
    5713        9026 :     fmt = PG_GETARG_TEXT_PP(0);
    5714        9026 :     start_ptr = VARDATA_ANY(fmt);
    5715        9026 :     end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
    5716        9026 :     initStringInfo(&str);
    5717        9026 :     arg = 1;                    /* next argument position to print */
    5718             : 
    5719             :     /* Scan format string, looking for conversion specifiers. */
    5720      268146 :     for (cp = start_ptr; cp < end_ptr; cp++)
    5721             :     {
    5722             :         int         argpos;
    5723             :         int         widthpos;
    5724             :         int         flags;
    5725             :         int         width;
    5726             :         Datum       value;
    5727             :         bool        isNull;
    5728             :         Oid         typid;
    5729             : 
    5730             :         /*
    5731             :          * If it's not the start of a conversion specifier, just copy it to
    5732             :          * the output buffer.
    5733             :          */
    5734      259160 :         if (*cp != '%')
    5735             :         {
    5736      240682 :             appendStringInfoCharMacro(&str, *cp);
    5737      240694 :             continue;
    5738             :         }
    5739             : 
    5740       18478 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5741             : 
    5742             :         /* Easy case: %% outputs a single % */
    5743       18478 :         if (*cp == '%')
    5744             :         {
    5745          12 :             appendStringInfoCharMacro(&str, *cp);
    5746          12 :             continue;
    5747             :         }
    5748             : 
    5749             :         /* Parse the optional portions of the format specifier */
    5750       18466 :         cp = text_format_parse_format(cp, end_ptr,
    5751             :                                       &argpos, &widthpos,
    5752             :                                       &flags, &width);
    5753             : 
    5754             :         /*
    5755             :          * Next we should see the main conversion specifier.  Whether or not
    5756             :          * an argument position was present, it's known that at least one
    5757             :          * character remains in the string at this point.  Experience suggests
    5758             :          * that it's worth checking that that character is one of the expected
    5759             :          * ones before we try to fetch arguments, so as to produce the least
    5760             :          * confusing response to a mis-formatted specifier.
    5761             :          */
    5762       18450 :         if (strchr("sIL", *cp) == NULL)
    5763           4 :             ereport(ERROR,
    5764             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5765             :                      errmsg("unrecognized format() type specifier \"%.*s\"",
    5766             :                             pg_mblen(cp), cp),
    5767             :                      errhint("For a single \"%%\" use \"%%%%\".")));
    5768             : 
    5769             :         /* If indirect width was specified, get its value */
    5770       18446 :         if (widthpos >= 0)
    5771             :         {
    5772             :             /* Collect the specified or next argument position */
    5773          28 :             if (widthpos > 0)
    5774          24 :                 arg = widthpos;
    5775          28 :             if (arg >= nargs)
    5776           0 :                 ereport(ERROR,
    5777             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5778             :                          errmsg("too few arguments for format()")));
    5779             : 
    5780             :             /* Get the value and type of the selected argument */
    5781          28 :             if (!funcvariadic)
    5782             :             {
    5783          28 :                 value = PG_GETARG_DATUM(arg);
    5784          28 :                 isNull = PG_ARGISNULL(arg);
    5785          28 :                 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    5786             :             }
    5787             :             else
    5788             :             {
    5789           0 :                 value = elements[arg - 1];  /* arrays are 0-based; position 1 is element 0 */
    5790           0 :                 isNull = nulls[arg - 1];
    5791           0 :                 typid = element_type;
    5792             :             }
    5793          28 :             if (!OidIsValid(typid))
    5794           0 :                 elog(ERROR, "could not determine data type of format() input");
    5795             : 
    5796          28 :             arg++;
    5797             : 
    5798             :             /* We can treat NULL width the same as zero */
    5799          28 :             if (isNull)
    5800           4 :                 width = 0;
    5801          24 :             else if (typid == INT4OID)
    5802          24 :                 width = DatumGetInt32(value);
    5803           0 :             else if (typid == INT2OID)
    5804           0 :                 width = DatumGetInt16(value);
    5805             :             else
    5806             :             {
    5807             :                 /* For less-usual datatypes, convert to text then to int */
    5808             :                 char       *str;    /* NOTE: shadows the function-level StringInfoData "str" inside this block */
    5809             : 
    5810           0 :                 if (typid != prev_width_type)
    5811             :                 {
    5812             :                     Oid         typoutputfunc;
    5813             :                     bool        typIsVarlena;
    5814             : 
    5815           0 :                     getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    5816           0 :                     fmgr_info(typoutputfunc, &typoutputinfo_width);
    5817           0 :                     prev_width_type = typid;
    5818             :                 }
    5819             : 
    5820           0 :                 str = OutputFunctionCall(&typoutputinfo_width, value);
    5821             : 
    5822             :                 /* pg_strtoint32 will complain about bad data or overflow */
    5823           0 :                 width = pg_strtoint32(str);
    5824             : 
    5825           0 :                 pfree(str);
    5826             :             }
    5827             :         }
    5828             : 
    5829             :         /* Collect the specified or next argument position */
    5830       18446 :         if (argpos > 0)
    5831          88 :             arg = argpos;
    5832       18446 :         if (arg >= nargs)
    5833          16 :             ereport(ERROR,
    5834             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5835             :                      errmsg("too few arguments for format()")));
    5836             : 
    5837             :         /* Get the value and type of the selected argument */
    5838       18430 :         if (!funcvariadic)
    5839             :         {
    5840       17582 :             value = PG_GETARG_DATUM(arg);
    5841       17582 :             isNull = PG_ARGISNULL(arg);
    5842       17582 :             typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    5843             :         }
    5844             :         else
    5845             :         {
    5846         848 :             value = elements[arg - 1];  /* arrays are 0-based; position 1 is element 0 */
    5847         848 :             isNull = nulls[arg - 1];
    5848         848 :             typid = element_type;
    5849             :         }
    5850       18430 :         if (!OidIsValid(typid))
    5851           0 :             elog(ERROR, "could not determine data type of format() input");
    5852             : 
    5853       18430 :         arg++;
    5854             : 
    5855             :         /*
    5856             :          * Get the appropriate typOutput function, reusing previous one if
    5857             :          * same type as previous argument.  That's particularly useful in the
    5858             :          * variadic-array case, but often saves work even for ordinary calls.
    5859             :          */
    5860       18430 :         if (typid != prev_type)
    5861             :         {
    5862             :             Oid         typoutputfunc;
    5863             :             bool        typIsVarlena;
    5864             : 
    5865       10202 :             getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    5866       10202 :             fmgr_info(typoutputfunc, &typoutputfinfo);
    5867       10202 :             prev_type = typid;
    5868             :         }
    5869             : 
    5870             :         /*
    5871             :          * And now we can format the value.
    5872             :          */
    5873       18430 :         switch (*cp)
    5874             :         {
    5875       18430 :             case 's':
    5876             :             case 'I':
    5877             :             case 'L':
    5878       18430 :                 text_format_string_conversion(&str, *cp, &typoutputfinfo,
    5879             :                                               value, isNull,
    5880             :                                               flags, width);
    5881       18426 :                 break;
    5882           0 :             default:
    5883             :                 /* should not get here, because of previous check */
    5884           0 :                 ereport(ERROR,
    5885             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5886             :                          errmsg("unrecognized format() type specifier \"%.*s\"",
    5887             :                                 pg_mblen(cp), cp),
    5888             :                          errhint("For a single \"%%\" use \"%%%%\".")));
    5889             :                 break;
    5890             :         }
    5891             :     }
    5892             : 
    5893             :     /* Don't need deconstruct_array results anymore. */
    5894        8986 :     if (elements != NULL)
    5895          28 :         pfree(elements);
    5896        8986 :     if (nulls != NULL)
    5897          28 :         pfree(nulls);
    5898             : 
    5899             :     /* Generate results. */
    5900        8986 :     result = cstring_to_text_with_len(str.data, str.len);
    5901        8986 :     pfree(str.data);
    5902             : 
    5903        8986 :     PG_RETURN_TEXT_P(result);
    5904             : }
    5905             : 
    5906             : /*
    5907             :  * Parse contiguous digits as a decimal number.
    5908             :  *
    5909             :  * Returns true if some digits could be parsed.
    5910             :  * The value is returned into *value, and *ptr is advanced to the next
    5911             :  * character to be parsed.
                     :  * If there are no digits, false is returned, *value is set to zero and
                     :  * *ptr is left where it was.
    5912             :  *
    5913             :  * Note parsing invariant: at least one character is known available before
    5914             :  * string end (end_ptr) at entry, and this is still true at exit.
                     :  * (The invariant is kept by ADVANCE_PARSE_POINTER, which raises an error
                     :  * instead of returning if the digits run up to the end of the string.)
                     :  *
                     :  * A "number is out of range" error is raised if accumulating the digits
                     :  * would overflow a signed 32-bit integer.
    5915             :  */
    5916             : static bool
    5917       36908 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
    5918             : {
    5919       36908 :     bool        found = false;
    5920       36908 :     const char *cp = *ptr;
    5921       36908 :     int         val = 0;
    5922             : 
    5923       37116 :     while (*cp >= '0' && *cp <= '9')
    5924             :     {
    5925         212 :         int8        digit = (*cp - '0');
    5926             : 
    5927         212 :         if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
    5928         212 :             unlikely(pg_add_s32_overflow(val, digit, &val)))
    5929           0 :             ereport(ERROR,
    5930             :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    5931             :                      errmsg("number is out of range")));
    5932         212 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5933         208 :         found = true;
    5934             :     }
    5935             : 
    5936       36904 :     *ptr = cp;
    5937       36904 :     *value = val;
    5938             : 
    5939       36904 :     return found;
    5940             : }
    5941             : 
    5942             : /*
    5943             :  * Parse a format specifier (generally following the SUS printf spec).
    5944             :  *
    5945             :  * We have already advanced over the initial '%', and we are looking for
    5946             :  * [argpos][flags][width]type (but the type character is not consumed here).
    5947             :  *
    5948             :  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
    5949             :  * Output parameters:
    5950             :  *  argpos: argument position for value to be printed.  -1 means unspecified.
    5951             :  *  widthpos: argument position for width.  Zero means the argument position
    5952             :  *          was unspecified (ie, take the next arg) and -1 means no width
    5953             :  *          argument (width was omitted or specified as a constant).
    5954             :  *  flags: bitmask of flags.
    5955             :  *  width: directly-specified width value.  Zero means the width was omitted
    5956             :  *          (note it's not necessary to distinguish this case from an explicit
    5957             :  *          zero width value).
    5958             :  *
    5959             :  * The function result is the next character position to be parsed, ie, the
    5960             :  * location where the type character is/should be.
    5961             :  *
    5962             :  * Note parsing invariant: at least one character is known available before
    5963             :  * string end (end_ptr) at entry, and this is still true at exit.
                     :  *
                     :  * A leading number is ambiguous until we see what follows it: if the next
                     :  * character is '$' the number is an argument position, otherwise it is a
                     :  * direct width and parsing stops there (no flags are looked for after it).
                     :  *
                     :  * Errors are raised for an explicit argument position of 0, for a width
                     :  * argument position that is not followed by '$', and (through
                     :  * ADVANCE_PARSE_POINTER and text_format_parse_digits) for a specifier
                     :  * that runs into the end of the string or a number that is out of range.
    5964             :  */
    5965             : static const char *
    5966       18466 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
    5967             :                          int *argpos, int *widthpos,
    5968             :                          int *flags, int *width)
    5969             : {
    5970       18466 :     const char *cp = start_ptr;
    5971             :     int         n;
    5972             : 
    5973             :     /* set defaults for output parameters */
    5974       18466 :     *argpos = -1;
    5975       18466 :     *widthpos = -1;
    5976       18466 :     *flags = 0;
    5977       18466 :     *width = 0;
    5978             : 
    5979             :     /* try to identify first number */
    5980       18466 :     if (text_format_parse_digits(&cp, end_ptr, &n))
    5981             :     {
    5982         116 :         if (*cp != '$')
    5983             :         {
    5984             :             /* Must be just a width and a type, so we're done */
    5985          16 :             *width = n;
    5986          16 :             return cp;
    5987             :         }
    5988             :         /* The number was argument position */
    5989         100 :         *argpos = n;
    5990             :         /* Explicit 0 for argument index is immediately refused */
    5991         100 :         if (n == 0)
    5992           4 :             ereport(ERROR,
    5993             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5994             :                      errmsg("format specifies argument 0, but arguments are numbered from 1")));
    5995          96 :         ADVANCE_PARSE_POINTER(cp, end_ptr);     /* step over the '$' */
    5996             :     }
    5997             : 
    5998             :     /* Handle flags (only minus is supported now) */
    5999       18462 :     while (*cp == '-')
    6000             :     {
    6001          20 :         *flags |= TEXT_FORMAT_FLAG_MINUS;
    6002          20 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    6003             :     }
    6004             : 
    6005       18442 :     if (*cp == '*')
    6006             :     {
    6007             :         /* Handle indirect width */
    6008          32 :         ADVANCE_PARSE_POINTER(cp, end_ptr);     /* step over the '*' */
    6009          32 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    6010             :         {
    6011             :             /* number in this position must be closed by $ */
    6012          28 :             if (*cp != '$')
    6013           0 :                 ereport(ERROR,
    6014             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    6015             :                          errmsg("width argument position must be ended by \"$\"")));
    6016             :             /* The number was width argument position */
    6017          28 :             *widthpos = n;
    6018             :             /* Explicit 0 for argument index is immediately refused */
    6019          28 :             if (n == 0)
    6020           4 :                 ereport(ERROR,
    6021             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    6022             :                          errmsg("format specifies argument 0, but arguments are numbered from 1")));
    6023          24 :             ADVANCE_PARSE_POINTER(cp, end_ptr);     /* step over the '$' */
    6024             :         }
    6025             :         else
    6026           4 :             *widthpos = 0;      /* width's argument position is unspecified */
    6027             :     }
    6028             :     else
    6029             :     {
    6030             :         /* Check for direct width specification */
    6031       18410 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    6032          20 :             *width = n;
    6033             :     }
    6034             : 
    6035             :     /* cp should now be pointing at type character */
    6036       18434 :     return cp;
    6037             : }
    6038             : 
    6039             : /*
    6040             :  * Format a %s, %I, or %L conversion
    6041             :  */
    6042             : static void
    6043       18430 : text_format_string_conversion(StringInfo buf, char conversion,
    6044             :                               FmgrInfo *typOutputInfo,
    6045             :                               Datum value, bool isNull,
    6046             :                               int flags, int width)
    6047             : {
    6048             :     char       *str;
    6049             : 
    6050             :     /* Handle NULL arguments before trying to stringify the value. */
    6051       18430 :     if (isNull)
    6052             :     {
    6053          44 :         if (conversion == 's')
    6054          12 :             text_format_append_string(buf, "", flags, width);
    6055          32 :         else if (conversion == 'L')
    6056          28 :             text_format_append_string(buf, "NULL", flags, width);
    6057           4 :         else if (conversion == 'I')
    6058           4 :             ereport(ERROR,
    6059             :                     (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
    6060             :                      errmsg("null values cannot be formatted as an SQL identifier")));
    6061          40 :         return;
    6062             :     }
    6063             : 
    6064             :     /* Stringify. */
    6065       18386 :     str = OutputFunctionCall(typOutputInfo, value);
    6066             : 
    6067             :     /* Escape. */
    6068       18386 :     if (conversion == 'I')
    6069             :     {
    6070             :         /* quote_identifier may or may not allocate a new string. */
    6071        1336 :         text_format_append_string(buf, quote_identifier(str), flags, width);
    6072             :     }
    6073       17050 :     else if (conversion == 'L')
    6074             :     {
    6075        1018 :         char       *qstr = quote_literal_cstr(str);
    6076             : 
    6077        1018 :         text_format_append_string(buf, qstr, flags, width);
    6078             :         /* quote_literal_cstr() always allocates a new string */
    6079        1018 :         pfree(qstr);
    6080             :     }
    6081             :     else
    6082       16032 :         text_format_append_string(buf, str, flags, width);
    6083             : 
    6084             :     /* Cleanup. */
    6085       18386 :     pfree(str);
    6086             : }
    6087             : 
    6088             : /*
    6089             :  * Append str to buf, padding as directed by flags/width
    6090             :  */
    6091             : static void
    6092       18426 : text_format_append_string(StringInfo buf, const char *str,
    6093             :                           int flags, int width)
    6094             : {
    6095       18426 :     bool        align_to_left = false;
    6096             :     int         len;
    6097             : 
    6098             :     /* fast path for typical easy case */
    6099       18426 :     if (width == 0)
    6100             :     {
    6101       18370 :         appendStringInfoString(buf, str);
    6102       18370 :         return;
    6103             :     }
    6104             : 
    6105          56 :     if (width < 0)
    6106             :     {
    6107             :         /* Negative width: implicit '-' flag, then take absolute value */
    6108           4 :         align_to_left = true;
    6109             :         /* -INT_MIN is undefined */
    6110           4 :         if (width <= INT_MIN)
    6111           0 :             ereport(ERROR,
    6112             :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    6113             :                      errmsg("number is out of range")));
    6114           4 :         width = -width;
    6115             :     }
    6116          52 :     else if (flags & TEXT_FORMAT_FLAG_MINUS)
    6117          16 :         align_to_left = true;
    6118             : 
    6119          56 :     len = pg_mbstrlen(str);
    6120          56 :     if (align_to_left)
    6121             :     {
    6122             :         /* left justify */
    6123          20 :         appendStringInfoString(buf, str);
    6124          20 :         if (len < width)
    6125          20 :             appendStringInfoSpaces(buf, width - len);
    6126             :     }
    6127             :     else
    6128             :     {
    6129             :         /* right justify */
    6130          36 :         if (len < width)
    6131          36 :             appendStringInfoSpaces(buf, width - len);
    6132          36 :         appendStringInfoString(buf, str);
    6133             :     }
    6134             : }
    6135             : 
    6136             : /*
    6137             :  * text_format_nv - nonvariadic wrapper for text_format function.
    6138             :  *
    6139             :  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
    6140             :  * which checks that all built-in functions that share the implementing C
    6141             :  * function take the same number of arguments.
    6142             :  */
    6143             : Datum
    6144          20 : text_format_nv(PG_FUNCTION_ARGS)
    6145             : {
    6146          20 :     return text_format(fcinfo);
    6147             : }
    6148             : 
    6149             : /*
    6150             :  * Helper function for Levenshtein distance functions. Faster than memcmp(),
    6151             :  * for this use case.
    6152             :  */
    6153             : static inline bool
    6154           0 : rest_of_char_same(const char *s1, const char *s2, int len)
    6155             : {
    6156           0 :     while (len > 0)
    6157             :     {
    6158           0 :         len--;
    6159           0 :         if (s1[len] != s2[len])
    6160           0 :             return false;
    6161             :     }
    6162           0 :     return true;
    6163             : }
    6164             : 
    6165             : /* Expand each Levenshtein distance variant */
    6166             : #include "levenshtein.c"
    6167             : #define LEVENSHTEIN_LESS_EQUAL
    6168             : #include "levenshtein.c"
    6169             : 
    6170             : 
    6171             : /*
    6172             :  * Unicode support
    6173             :  */
    6174             : 
    6175             : static UnicodeNormalizationForm
    6176         104 : unicode_norm_form_from_string(const char *formstr)
    6177             : {
    6178         104 :     UnicodeNormalizationForm form = -1;
    6179             : 
    6180             :     /*
    6181             :      * Might as well check this while we're here.
    6182             :      */
    6183         104 :     if (GetDatabaseEncoding() != PG_UTF8)
    6184           0 :         ereport(ERROR,
    6185             :                 (errcode(ERRCODE_SYNTAX_ERROR),
    6186             :                  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
    6187             : 
    6188         104 :     if (pg_strcasecmp(formstr, "NFC") == 0)
    6189          36 :         form = UNICODE_NFC;
    6190          68 :     else if (pg_strcasecmp(formstr, "NFD") == 0)
    6191          20 :         form = UNICODE_NFD;
    6192          48 :     else if (pg_strcasecmp(formstr, "NFKC") == 0)
    6193          20 :         form = UNICODE_NFKC;
    6194          28 :     else if (pg_strcasecmp(formstr, "NFKD") == 0)
    6195          20 :         form = UNICODE_NFKD;
    6196             :     else
    6197           8 :         ereport(ERROR,
    6198             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    6199             :                  errmsg("invalid normalization form: %s", formstr)));
    6200             : 
    6201          96 :     return form;
    6202             : }
    6203             : 
    6204             : Datum
    6205          28 : unicode_normalize_func(PG_FUNCTION_ARGS)
    6206             : {
    6207          28 :     text       *input = PG_GETARG_TEXT_PP(0);
    6208          28 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    6209             :     UnicodeNormalizationForm form;
    6210             :     int         size;
    6211             :     pg_wchar   *input_chars;
    6212             :     pg_wchar   *output_chars;
    6213             :     unsigned char *p;
    6214             :     text       *result;
    6215             :     int         i;
    6216             : 
    6217          28 :     form = unicode_norm_form_from_string(formstr);
    6218             : 
    6219             :     /* convert to pg_wchar */
    6220          24 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    6221          24 :     input_chars = palloc((size + 1) * sizeof(pg_wchar));
    6222          24 :     p = (unsigned char *) VARDATA_ANY(input);
    6223         108 :     for (i = 0; i < size; i++)
    6224             :     {
    6225          84 :         input_chars[i] = utf8_to_unicode(p);
    6226          84 :         p += pg_utf_mblen(p);
    6227             :     }
    6228          24 :     input_chars[i] = (pg_wchar) '\0';
    6229             :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    6230             : 
    6231             :     /* action */
    6232          24 :     output_chars = unicode_normalize(form, input_chars);
    6233             : 
    6234             :     /* convert back to UTF-8 string */
    6235          24 :     size = 0;
    6236         104 :     for (pg_wchar *wp = output_chars; *wp; wp++)
    6237             :     {
    6238             :         unsigned char buf[4];
    6239             : 
    6240          80 :         unicode_to_utf8(*wp, buf);
    6241          80 :         size += pg_utf_mblen(buf);
    6242             :     }
    6243             : 
    6244          24 :     result = palloc(size + VARHDRSZ);
    6245          24 :     SET_VARSIZE(result, size + VARHDRSZ);
    6246             : 
    6247          24 :     p = (unsigned char *) VARDATA_ANY(result);
    6248         104 :     for (pg_wchar *wp = output_chars; *wp; wp++)
    6249             :     {
    6250          80 :         unicode_to_utf8(*wp, p);
    6251          80 :         p += pg_utf_mblen(p);
    6252             :     }
    6253             :     Assert((char *) p == (char *) result + size + VARHDRSZ);
    6254             : 
    6255          24 :     PG_RETURN_TEXT_P(result);
    6256             : }
    6257             : 
    6258             : /*
    6259             :  * Check whether the string is in the specified Unicode normalization form.
    6260             :  *
    6261             :  * This is done by converting the string to the specified normal form and then
    6262             :  * comparing that to the original string.  To speed that up, we also apply the
    6263             :  * "quick check" algorithm specified in UAX #15, which can give a yes or no
    6264             :  * answer for many strings by just scanning the string once.
    6265             :  *
    6266             :  * This function should generally be optimized for the case where the string
    6267             :  * is in fact normalized.  In that case, we'll end up looking at the entire
    6268             :  * string, so it's probably not worth doing any incremental conversion etc.
    6269             :  */
    6270             : Datum
    6271          76 : unicode_is_normalized(PG_FUNCTION_ARGS)
    6272             : {
    6273          76 :     text       *input = PG_GETARG_TEXT_PP(0);
    6274          76 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    6275             :     UnicodeNormalizationForm form;
    6276             :     int         size;
    6277             :     pg_wchar   *input_chars;
    6278             :     pg_wchar   *output_chars;
    6279             :     unsigned char *p;
    6280             :     int         i;
    6281             :     UnicodeNormalizationQC quickcheck;
    6282             :     int         output_size;
    6283             :     bool        result;
    6284             : 
    6285          76 :     form = unicode_norm_form_from_string(formstr);
    6286             : 
    6287             :     /* convert to pg_wchar */
    6288          72 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    6289          72 :     input_chars = palloc((size + 1) * sizeof(pg_wchar));
    6290          72 :     p = (unsigned char *) VARDATA_ANY(input);
    6291         320 :     for (i = 0; i < size; i++)
    6292             :     {
    6293         248 :         input_chars[i] = utf8_to_unicode(p);
    6294         248 :         p += pg_utf_mblen(p);
    6295             :     }
    6296          72 :     input_chars[i] = (pg_wchar) '\0';
    6297             :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    6298             : 
    6299             :     /* quick check (see UAX #15) */
    6300          72 :     quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
    6301          72 :     if (quickcheck == UNICODE_NORM_QC_YES)
    6302          20 :         PG_RETURN_BOOL(true);
    6303          52 :     else if (quickcheck == UNICODE_NORM_QC_NO)
    6304           8 :         PG_RETURN_BOOL(false);
    6305             : 
    6306             :     /* normalize and compare with original */
    6307          44 :     output_chars = unicode_normalize(form, input_chars);
    6308             : 
    6309          44 :     output_size = 0;
    6310         208 :     for (pg_wchar *wp = output_chars; *wp; wp++)
    6311         164 :         output_size++;
    6312             : 
    6313          60 :     result = (size == output_size) &&
    6314          16 :         (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
    6315             : 
    6316          44 :     PG_RETURN_BOOL(result);
    6317             : }

Generated by: LCOV version 1.13