Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * varlena.c
4 : * Functions for the variable-length built-in types.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/varlena.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include <ctype.h>
18 : #include <limits.h>
19 :
20 : #include "access/detoast.h"
21 : #include "access/toast_compression.h"
22 : #include "catalog/pg_collation.h"
23 : #include "catalog/pg_type.h"
24 : #include "common/hashfn.h"
25 : #include "common/int.h"
26 : #include "common/unicode_category.h"
27 : #include "common/unicode_norm.h"
28 : #include "common/unicode_version.h"
29 : #include "funcapi.h"
30 : #include "lib/hyperloglog.h"
31 : #include "libpq/pqformat.h"
32 : #include "miscadmin.h"
33 : #include "nodes/execnodes.h"
34 : #include "parser/scansup.h"
35 : #include "port/pg_bswap.h"
36 : #include "regex/regex.h"
37 : #include "utils/builtins.h"
38 : #include "utils/bytea.h"
39 : #include "utils/guc.h"
40 : #include "utils/lsyscache.h"
41 : #include "utils/memutils.h"
42 : #include "utils/pg_locale.h"
43 : #include "utils/sortsupport.h"
44 : #include "utils/varlena.h"
45 :
46 :
/* GUC variable: selects the format byteaout() emits (hex by default) */
int			bytea_output = BYTEA_OUTPUT_HEX;

/* Alias for struct varlena; see the DatumGetVarStringP[P] macros below */
typedef struct varlena VarString;
51 :
/*
 * State for text_position_* functions.
 *
 * A pointer to this struct is taken by text_position_setup(),
 * text_position_next(), text_position_next_internal(),
 * text_position_get_match_ptr(), text_position_get_match_pos() and
 * text_position_cleanup().
 */
typedef struct
{
	pg_locale_t locale;			/* collation used for substring matching */
	bool		is_multibyte_char_in_char;	/* need to check char boundaries? */
	bool		greedy;			/* find longest possible substring? */

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* length of str1 in bytes */
	int			len2;			/* length of str2 in bytes */

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	/*
	 * Note that with nondeterministic collations, the length of the last
	 * match is not necessarily equal to the length of the "needle" passed in.
	 */
	char	   *last_match;		/* pointer to last match in 'str1' */
	int			last_match_len; /* length of last match */
	int			last_match_len_tmp; /* same but for internal use */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
87 :
/*
 * NOTE(review): presumably the private working state of the sort-support
 * routines declared in this file (varstrfastcmp_*, varstr_abbrev_*) --
 * confirm against their definitions.
 */
typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;		/* Allocated length of buf1 */
	int			buflen2;		/* Allocated length of buf2 */
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;		/* NOTE(review): presumably "collation is C";
								 * confirm against the setup code */
	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
	pg_locale_t locale;			/* NOTE(review): presumably the locale used
								 * for comparisons; confirm */
} VarStringSortSupport;
106 :
/*
 * Output data for split_text(): we output either to an array or a table.
 * tupstore and tupdesc must be set up in advance to output to a table.
 */
typedef struct
{
	ArrayBuildState *astate;	/* accumulator used for array output */
	Tuplestorestate *tupstore;	/* destination used for table output */
	TupleDesc	tupdesc;		/* row descriptor used for table output */
} SplitTextOutputData;
117 :
/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024

/*
 * Detoast a Datum and view the result as a VarString.  The "PP" form uses
 * PG_DETOAST_DATUM_PACKED instead of PG_DETOAST_DATUM.
 */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
126 :
/*
 * Forward declarations of the file-local (static) helper functions.
 */
static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
							int32 start,
							int32 length,
							bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);
static int	text_position(text *t1, text *t2, Oid collid);
static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
static bool text_position_next(TextPositionState *state);
static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
static char *text_position_get_match_ptr(TextPositionState *state);
static int	text_position_get_match_pos(TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);
static void check_collation_set(Oid collid);
static int	text_cmp(text *arg1, text *arg2, Oid collid);
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
							  int S,
							  int L,
							  bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
static void appendStringInfoText(StringInfo str, const text *t);
static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
static void split_text_accum_result(SplitTextOutputData *tstate,
									text *field_value,
									text *null_string,
									Oid collation);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
									const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
									 int *value);
static const char *text_format_parse_format(const char *start_ptr,
											const char *end_ptr,
											int *argpos, int *widthpos,
											int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
										  FmgrInfo *typOutputInfo,
										  Datum value, bool isNull,
										  int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
									  int flags, int width);
178 :
179 :
180 : /*****************************************************************************
181 : * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
182 : *****************************************************************************/
183 :
184 : /*
185 : * cstring_to_text
186 : *
187 : * Create a text value from a null-terminated C string.
188 : *
189 : * The new text value is freshly palloc'd with a full-size VARHDR.
190 : */
191 : text *
192 24706598 : cstring_to_text(const char *s)
193 : {
194 24706598 : return cstring_to_text_with_len(s, strlen(s));
195 : }
196 :
197 : /*
198 : * cstring_to_text_with_len
199 : *
200 : * Same as cstring_to_text except the caller specifies the string length;
201 : * the string need not be null_terminated.
202 : */
203 : text *
204 27357610 : cstring_to_text_with_len(const char *s, int len)
205 : {
206 27357610 : text *result = (text *) palloc(len + VARHDRSZ);
207 :
208 27357610 : SET_VARSIZE(result, len + VARHDRSZ);
209 27357610 : memcpy(VARDATA(result), s, len);
210 :
211 27357610 : return result;
212 : }
213 :
214 : /*
215 : * text_to_cstring
216 : *
217 : * Create a palloc'd, null-terminated C string from a text value.
218 : *
219 : * We support being passed a compressed or toasted text value.
220 : * This is a bit bogus since such values shouldn't really be referred to as
221 : * "text *", but it seems useful for robustness. If we didn't handle that
222 : * case here, we'd need another routine that did, anyway.
223 : */
224 : char *
225 16110304 : text_to_cstring(const text *t)
226 : {
227 : /* must cast away the const, unfortunately */
228 16110304 : text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
229 16110304 : int len = VARSIZE_ANY_EXHDR(tunpacked);
230 : char *result;
231 :
232 16110304 : result = (char *) palloc(len + 1);
233 16110304 : memcpy(result, VARDATA_ANY(tunpacked), len);
234 16110304 : result[len] = '\0';
235 :
236 16110304 : if (tunpacked != t)
237 42986 : pfree(tunpacked);
238 :
239 16110304 : return result;
240 : }
241 :
242 : /*
243 : * text_to_cstring_buffer
244 : *
245 : * Copy a text value into a caller-supplied buffer of size dst_len.
246 : *
247 : * The text string is truncated if necessary to fit. The result is
248 : * guaranteed null-terminated (unless dst_len == 0).
249 : *
250 : * We support being passed a compressed or toasted text value.
251 : * This is a bit bogus since such values shouldn't really be referred to as
252 : * "text *", but it seems useful for robustness. If we didn't handle that
253 : * case here, we'd need another routine that did, anyway.
254 : */
255 : void
256 978 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
257 : {
258 : /* must cast away the const, unfortunately */
259 978 : text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
260 978 : size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
261 :
262 978 : if (dst_len > 0)
263 : {
264 978 : dst_len--;
265 978 : if (dst_len >= src_len)
266 978 : dst_len = src_len;
267 : else /* ensure truncation is encoding-safe */
268 0 : dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
269 978 : memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
270 978 : dst[dst_len] = '\0';
271 : }
272 :
273 978 : if (srcunpacked != src)
274 0 : pfree(srcunpacked);
275 978 : }
276 :
277 :
278 : /*****************************************************************************
279 : * USER I/O ROUTINES *
280 : *****************************************************************************/
281 :
282 :
/* VAL: numeric value of a digit character; DIG: digit character for a value */
#define VAL(CH)			((CH) - '0')
#define DIG(VAL)		((VAL) + '0')
285 :
286 : /*
287 : * byteain - converts from printable representation of byte array
288 : *
289 : * Non-printable characters must be passed as '\nnn' (octal) and are
290 : * converted to internal form. '\' must be passed as '\\'.
291 : * ereport(ERROR, ...) if bad form.
292 : *
293 : * BUGS:
294 : * The input is scanned twice.
295 : * The error checking of input is minimal.
296 : */
297 : Datum
298 1385964 : byteain(PG_FUNCTION_ARGS)
299 : {
300 1385964 : char *inputText = PG_GETARG_CSTRING(0);
301 1385964 : Node *escontext = fcinfo->context;
302 : char *tp;
303 : char *rp;
304 : int bc;
305 : bytea *result;
306 :
307 : /* Recognize hex input */
308 1385964 : if (inputText[0] == '\\' && inputText[1] == 'x')
309 : {
310 111308 : size_t len = strlen(inputText);
311 :
312 111308 : bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
313 111308 : result = palloc(bc);
314 111308 : bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
315 : escontext);
316 111296 : SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
317 :
318 111296 : PG_RETURN_BYTEA_P(result);
319 : }
320 :
321 : /* Else, it's the traditional escaped style */
322 9563988 : for (bc = 0, tp = inputText; *tp != '\0'; bc++)
323 : {
324 8289344 : if (tp[0] != '\\')
325 8288326 : tp++;
326 1018 : else if ((tp[0] == '\\') &&
327 1018 : (tp[1] >= '0' && tp[1] <= '3') &&
328 1006 : (tp[2] >= '0' && tp[2] <= '7') &&
329 1006 : (tp[3] >= '0' && tp[3] <= '7'))
330 1006 : tp += 4;
331 12 : else if ((tp[0] == '\\') &&
332 12 : (tp[1] == '\\'))
333 0 : tp += 2;
334 : else
335 : {
336 : /*
337 : * one backslash, not followed by another or ### valid octal
338 : */
339 12 : ereturn(escontext, (Datum) 0,
340 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
341 : errmsg("invalid input syntax for type %s", "bytea")));
342 : }
343 : }
344 :
345 1274644 : bc += VARHDRSZ;
346 :
347 1274644 : result = (bytea *) palloc(bc);
348 1274644 : SET_VARSIZE(result, bc);
349 :
350 1274644 : tp = inputText;
351 1274644 : rp = VARDATA(result);
352 9563946 : while (*tp != '\0')
353 : {
354 8289302 : if (tp[0] != '\\')
355 8288296 : *rp++ = *tp++;
356 1006 : else if ((tp[0] == '\\') &&
357 1006 : (tp[1] >= '0' && tp[1] <= '3') &&
358 1006 : (tp[2] >= '0' && tp[2] <= '7') &&
359 1006 : (tp[3] >= '0' && tp[3] <= '7'))
360 : {
361 1006 : bc = VAL(tp[1]);
362 1006 : bc <<= 3;
363 1006 : bc += VAL(tp[2]);
364 1006 : bc <<= 3;
365 1006 : *rp++ = bc + VAL(tp[3]);
366 :
367 1006 : tp += 4;
368 : }
369 0 : else if ((tp[0] == '\\') &&
370 0 : (tp[1] == '\\'))
371 : {
372 0 : *rp++ = '\\';
373 0 : tp += 2;
374 : }
375 : else
376 : {
377 : /*
378 : * We should never get here. The first pass should not allow it.
379 : */
380 0 : ereturn(escontext, (Datum) 0,
381 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
382 : errmsg("invalid input syntax for type %s", "bytea")));
383 : }
384 : }
385 :
386 1274644 : PG_RETURN_BYTEA_P(result);
387 : }
388 :
389 : /*
390 : * byteaout - converts to printable representation of byte array
391 : *
392 : * In the traditional escaped format, non-printable characters are
393 : * printed as '\nnn' (octal) and '\' as '\\'.
394 : */
395 : Datum
396 559590 : byteaout(PG_FUNCTION_ARGS)
397 : {
398 559590 : bytea *vlena = PG_GETARG_BYTEA_PP(0);
399 : char *result;
400 : char *rp;
401 :
402 559590 : if (bytea_output == BYTEA_OUTPUT_HEX)
403 : {
404 : /* Print hex format */
405 559206 : rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
406 559206 : *rp++ = '\\';
407 559206 : *rp++ = 'x';
408 559206 : rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
409 : }
410 384 : else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
411 : {
412 : /* Print traditional escaped format */
413 : char *vp;
414 : uint64 len;
415 : int i;
416 :
417 384 : len = 1; /* empty string has 1 char */
418 384 : vp = VARDATA_ANY(vlena);
419 217660 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
420 : {
421 217276 : if (*vp == '\\')
422 0 : len += 2;
423 217276 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
424 498 : len += 4;
425 : else
426 216778 : len++;
427 : }
428 :
429 : /*
430 : * In principle len can't overflow uint32 if the input fit in 1GB, but
431 : * for safety let's check rather than relying on palloc's internal
432 : * check.
433 : */
434 384 : if (len > MaxAllocSize)
435 0 : ereport(ERROR,
436 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
437 : errmsg_internal("result of bytea output conversion is too large")));
438 384 : rp = result = (char *) palloc(len);
439 :
440 384 : vp = VARDATA_ANY(vlena);
441 217660 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
442 : {
443 217276 : if (*vp == '\\')
444 : {
445 0 : *rp++ = '\\';
446 0 : *rp++ = '\\';
447 : }
448 217276 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
449 498 : {
450 : int val; /* holds unprintable chars */
451 :
452 498 : val = *vp;
453 498 : rp[0] = '\\';
454 498 : rp[3] = DIG(val & 07);
455 498 : val >>= 3;
456 498 : rp[2] = DIG(val & 07);
457 498 : val >>= 3;
458 498 : rp[1] = DIG(val & 03);
459 498 : rp += 4;
460 : }
461 : else
462 216778 : *rp++ = *vp;
463 : }
464 : }
465 : else
466 : {
467 0 : elog(ERROR, "unrecognized \"bytea_output\" setting: %d",
468 : bytea_output);
469 : rp = result = NULL; /* keep compiler quiet */
470 : }
471 559590 : *rp = '\0';
472 559590 : PG_RETURN_CSTRING(result);
473 : }
474 :
475 : /*
476 : * bytearecv - converts external binary format to bytea
477 : */
478 : Datum
479 107710 : bytearecv(PG_FUNCTION_ARGS)
480 : {
481 107710 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
482 : bytea *result;
483 : int nbytes;
484 :
485 107710 : nbytes = buf->len - buf->cursor;
486 107710 : result = (bytea *) palloc(nbytes + VARHDRSZ);
487 107710 : SET_VARSIZE(result, nbytes + VARHDRSZ);
488 107710 : pq_copymsgbytes(buf, VARDATA(result), nbytes);
489 107710 : PG_RETURN_BYTEA_P(result);
490 : }
491 :
492 : /*
493 : * byteasend - converts bytea to binary format
494 : *
495 : * This is a special case: just copy the input...
496 : */
497 : Datum
498 68968 : byteasend(PG_FUNCTION_ARGS)
499 : {
500 68968 : bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
501 :
502 68968 : PG_RETURN_BYTEA_P(vlena);
503 : }
504 :
505 : Datum
506 258774 : bytea_string_agg_transfn(PG_FUNCTION_ARGS)
507 : {
508 : StringInfo state;
509 :
510 258774 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
511 :
512 : /* Append the value unless null, preceding it with the delimiter. */
513 258774 : if (!PG_ARGISNULL(1))
514 : {
515 243774 : bytea *value = PG_GETARG_BYTEA_PP(1);
516 243774 : bool isfirst = false;
517 :
518 : /*
519 : * You might think we can just throw away the first delimiter, however
520 : * we must keep it as we may be a parallel worker doing partial
521 : * aggregation building a state to send to the main process. We need
522 : * to keep the delimiter of every aggregation so that the combine
523 : * function can properly join up the strings of two separately
524 : * partially aggregated results. The first delimiter is only stripped
525 : * off in the final function. To know how much to strip off the front
526 : * of the string, we store the length of the first delimiter in the
527 : * StringInfo's cursor field, which we don't otherwise need here.
528 : */
529 243774 : if (state == NULL)
530 : {
531 148 : state = makeStringAggState(fcinfo);
532 148 : isfirst = true;
533 : }
534 :
535 243774 : if (!PG_ARGISNULL(2))
536 : {
537 243762 : bytea *delim = PG_GETARG_BYTEA_PP(2);
538 :
539 243762 : appendBinaryStringInfo(state, VARDATA_ANY(delim),
540 243762 : VARSIZE_ANY_EXHDR(delim));
541 243762 : if (isfirst)
542 142 : state->cursor = VARSIZE_ANY_EXHDR(delim);
543 : }
544 :
545 243774 : appendBinaryStringInfo(state, VARDATA_ANY(value),
546 243774 : VARSIZE_ANY_EXHDR(value));
547 : }
548 :
549 : /*
550 : * The transition type for string_agg() is declared to be "internal",
551 : * which is a pass-by-value type the same size as a pointer.
552 : */
553 258774 : if (state)
554 258738 : PG_RETURN_POINTER(state);
555 36 : PG_RETURN_NULL();
556 : }
557 :
558 : Datum
559 154 : bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
560 : {
561 : StringInfo state;
562 :
563 : /* cannot be called directly because of internal-type argument */
564 : Assert(AggCheckCallContext(fcinfo, NULL));
565 :
566 154 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
567 :
568 154 : if (state != NULL)
569 : {
570 : /* As per comment in transfn, strip data before the cursor position */
571 : bytea *result;
572 148 : int strippedlen = state->len - state->cursor;
573 :
574 148 : result = (bytea *) palloc(strippedlen + VARHDRSZ);
575 148 : SET_VARSIZE(result, strippedlen + VARHDRSZ);
576 148 : memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
577 148 : PG_RETURN_BYTEA_P(result);
578 : }
579 : else
580 6 : PG_RETURN_NULL();
581 : }
582 :
583 : /*
584 : * textin - converts cstring to internal representation
585 : */
586 : Datum
587 21519474 : textin(PG_FUNCTION_ARGS)
588 : {
589 21519474 : char *inputText = PG_GETARG_CSTRING(0);
590 :
591 21519474 : PG_RETURN_TEXT_P(cstring_to_text(inputText));
592 : }
593 :
594 : /*
595 : * textout - converts internal representation to cstring
596 : */
597 : Datum
598 8073976 : textout(PG_FUNCTION_ARGS)
599 : {
600 8073976 : Datum txt = PG_GETARG_DATUM(0);
601 :
602 8073976 : PG_RETURN_CSTRING(TextDatumGetCString(txt));
603 : }
604 :
605 : /*
606 : * textrecv - converts external binary format to text
607 : */
608 : Datum
609 48 : textrecv(PG_FUNCTION_ARGS)
610 : {
611 48 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
612 : text *result;
613 : char *str;
614 : int nbytes;
615 :
616 48 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
617 :
618 48 : result = cstring_to_text_with_len(str, nbytes);
619 48 : pfree(str);
620 48 : PG_RETURN_TEXT_P(result);
621 : }
622 :
623 : /*
624 : * textsend - converts text to binary format
625 : */
626 : Datum
627 4902 : textsend(PG_FUNCTION_ARGS)
628 : {
629 4902 : text *t = PG_GETARG_TEXT_PP(0);
630 : StringInfoData buf;
631 :
632 4902 : pq_begintypsend(&buf);
633 4902 : pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
634 4902 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
635 : }
636 :
637 :
638 : /*
639 : * unknownin - converts cstring to internal representation
640 : */
641 : Datum
642 0 : unknownin(PG_FUNCTION_ARGS)
643 : {
644 0 : char *str = PG_GETARG_CSTRING(0);
645 :
646 : /* representation is same as cstring */
647 0 : PG_RETURN_CSTRING(pstrdup(str));
648 : }
649 :
650 : /*
651 : * unknownout - converts internal representation to cstring
652 : */
653 : Datum
654 940 : unknownout(PG_FUNCTION_ARGS)
655 : {
656 : /* representation is same as cstring */
657 940 : char *str = PG_GETARG_CSTRING(0);
658 :
659 940 : PG_RETURN_CSTRING(pstrdup(str));
660 : }
661 :
662 : /*
663 : * unknownrecv - converts external binary format to unknown
664 : */
665 : Datum
666 0 : unknownrecv(PG_FUNCTION_ARGS)
667 : {
668 0 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
669 : char *str;
670 : int nbytes;
671 :
672 0 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
673 : /* representation is same as cstring */
674 0 : PG_RETURN_CSTRING(str);
675 : }
676 :
677 : /*
678 : * unknownsend - converts unknown to binary format
679 : */
680 : Datum
681 0 : unknownsend(PG_FUNCTION_ARGS)
682 : {
683 : /* representation is same as cstring */
684 0 : char *str = PG_GETARG_CSTRING(0);
685 : StringInfoData buf;
686 :
687 0 : pq_begintypsend(&buf);
688 0 : pq_sendtext(&buf, str, strlen(str));
689 0 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
690 : }
691 :
692 :
693 : /* ========== PUBLIC ROUTINES ========== */
694 :
695 : /*
696 : * textlen -
697 : * returns the logical length of a text*
698 : * (which is less than the VARSIZE of the text*)
699 : */
700 : Datum
701 430748 : textlen(PG_FUNCTION_ARGS)
702 : {
703 430748 : Datum str = PG_GETARG_DATUM(0);
704 :
705 : /* try to avoid decompressing argument */
706 430748 : PG_RETURN_INT32(text_length(str));
707 : }
708 :
709 : /*
710 : * text_length -
711 : * Does the real work for textlen()
712 : *
713 : * This is broken out so it can be called directly by other string processing
714 : * functions. Note that the argument is passed as a Datum, to indicate that
715 : * it may still be in compressed form. We can avoid decompressing it at all
716 : * in some cases.
717 : */
718 : static int32
719 430760 : text_length(Datum str)
720 : {
721 : /* fastpath when max encoding length is one */
722 430760 : if (pg_database_encoding_max_length() == 1)
723 20 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
724 : else
725 : {
726 430740 : text *t = DatumGetTextPP(str);
727 :
728 430740 : PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
729 : VARSIZE_ANY_EXHDR(t)));
730 : }
731 : }
732 :
733 : /*
734 : * textoctetlen -
735 : * returns the physical length of a text*
736 : * (which is less than the VARSIZE of the text*)
737 : */
738 : Datum
739 70 : textoctetlen(PG_FUNCTION_ARGS)
740 : {
741 70 : Datum str = PG_GETARG_DATUM(0);
742 :
743 : /* We need not detoast the input at all */
744 70 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
745 : }
746 :
747 : /*
748 : * textcat -
749 : * takes two text* and returns a text* that is the concatenation of
750 : * the two.
751 : *
752 : * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
753 : * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
754 : * Allocate space for output in all cases.
755 : * XXX - thomas 1997-07-10
756 : */
757 : Datum
758 1934346 : textcat(PG_FUNCTION_ARGS)
759 : {
760 1934346 : text *t1 = PG_GETARG_TEXT_PP(0);
761 1934346 : text *t2 = PG_GETARG_TEXT_PP(1);
762 :
763 1934346 : PG_RETURN_TEXT_P(text_catenate(t1, t2));
764 : }
765 :
766 : /*
767 : * text_catenate
768 : * Guts of textcat(), broken out so it can be used by other functions
769 : *
770 : * Arguments can be in short-header form, but not compressed or out-of-line
771 : */
772 : static text *
773 1934426 : text_catenate(text *t1, text *t2)
774 : {
775 : text *result;
776 : int len1,
777 : len2,
778 : len;
779 : char *ptr;
780 :
781 1934426 : len1 = VARSIZE_ANY_EXHDR(t1);
782 1934426 : len2 = VARSIZE_ANY_EXHDR(t2);
783 :
784 : /* paranoia ... probably should throw error instead? */
785 1934426 : if (len1 < 0)
786 0 : len1 = 0;
787 1934426 : if (len2 < 0)
788 0 : len2 = 0;
789 :
790 1934426 : len = len1 + len2 + VARHDRSZ;
791 1934426 : result = (text *) palloc(len);
792 :
793 : /* Set size of result string... */
794 1934426 : SET_VARSIZE(result, len);
795 :
796 : /* Fill data field of result string... */
797 1934426 : ptr = VARDATA(result);
798 1934426 : if (len1 > 0)
799 1933602 : memcpy(ptr, VARDATA_ANY(t1), len1);
800 1934426 : if (len2 > 0)
801 1934216 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
802 :
803 1934426 : return result;
804 : }
805 :
/*
 * charlen_to_bytelen()
 *	Return how many bytes the first n characters at *p occupy.
 *
 * The caller must guarantee that n characters are really present; the
 * string does not have to be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cur = p;

	/* In a single-byte encoding every character is exactly one byte */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* Otherwise step over the characters one at a time */
	while (n > 0)
	{
		cur += pg_mblen(cur);
		n--;
	}

	return cur - p;
}
831 :
832 : /*
833 : * text_substr()
834 : * Return a substring starting at the specified position.
835 : * - thomas 1997-12-31
836 : *
837 : * Input:
838 : * - string
839 : * - starting position (is one-based)
840 : * - string length
841 : *
842 : * If the starting position is zero or less, then return from the start of the string
843 : * adjusting the length to be consistent with the "negative start" per SQL.
844 : * If the length is less than zero, return the remaining string.
845 : *
846 : * Added multibyte support.
847 : * - Tatsuo Ishii 1998-4-21
848 : * Changed behavior if starting position is less than one to conform to SQL behavior.
849 : * Formerly returned the entire string; now returns a portion.
850 : * - Thomas Lockhart 1998-12-10
851 : * Now uses faster TOAST-slicing interface
852 : * - John Gray 2002-02-22
853 : * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
854 : * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
855 : * error; if E < 1, return '', not entire string). Fixed MB related bug when
856 : * S > LC and < LC + 4 sometimes garbage characters are returned.
857 : * - Joe Conway 2002-08-10
858 : */
859 : Datum
860 588748 : text_substr(PG_FUNCTION_ARGS)
861 : {
862 588748 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
863 : PG_GETARG_INT32(1),
864 : PG_GETARG_INT32(2),
865 : false));
866 : }
867 :
868 : /*
869 : * text_substr_no_len -
870 : * Wrapper to avoid opr_sanity failure due to
871 : * one function accepting a different number of args.
872 : */
873 : Datum
874 36 : text_substr_no_len(PG_FUNCTION_ARGS)
875 : {
876 36 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
877 : PG_GETARG_INT32(1),
878 : -1, true));
879 : }
880 :
881 : /*
882 : * text_substring -
883 : * Does the real work for text_substr() and text_substr_no_len()
884 : *
885 : * This is broken out so it can be called directly by other string processing
886 : * functions. Note that the argument is passed as a Datum, to indicate that
887 : * it may still be in compressed/toasted form. We can avoid detoasting all
888 : * of it in some cases.
889 : *
890 : * The result is always a freshly palloc'd datum.
891 : */
892 : static text *
893 628896 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
894 : {
895 628896 : int32 eml = pg_database_encoding_max_length();
896 628896 : int32 S = start; /* start position */
897 : int32 S1; /* adjusted start position */
898 : int32 L1; /* adjusted substring length */
899 : int32 E; /* end position */
900 :
901 : /*
902 : * SQL99 says S can be zero or negative (which we don't document), but we
903 : * still must fetch from the start of the string.
904 : * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org
905 : */
906 628896 : S1 = Max(S, 1);
907 :
908 : /* life is easy if the encoding max length is 1 */
909 628896 : if (eml == 1)
910 : {
911 22 : if (length_not_specified) /* special case - get length to end of
912 : * string */
913 0 : L1 = -1;
914 22 : else if (length < 0)
915 : {
916 : /* SQL99 says to throw an error for E < S, i.e., negative length */
917 0 : ereport(ERROR,
918 : (errcode(ERRCODE_SUBSTRING_ERROR),
919 : errmsg("negative substring length not allowed")));
920 : L1 = -1; /* silence stupider compilers */
921 : }
922 22 : else if (pg_add_s32_overflow(S, length, &E))
923 : {
924 : /*
925 : * L could be large enough for S + L to overflow, in which case
926 : * the substring must run to end of string.
927 : */
928 0 : L1 = -1;
929 : }
930 : else
931 : {
932 : /*
933 : * A zero or negative value for the end position can happen if the
934 : * start was negative or one. SQL99 says to return a zero-length
935 : * string.
936 : */
937 22 : if (E < 1)
938 0 : return cstring_to_text("");
939 :
940 22 : L1 = E - S1;
941 : }
942 :
943 : /*
944 : * If the start position is past the end of the string, SQL99 says to
945 : * return a zero-length string -- DatumGetTextPSlice() will do that
946 : * for us. We need only convert S1 to zero-based starting position.
947 : */
948 22 : return DatumGetTextPSlice(str, S1 - 1, L1);
949 : }
950 628874 : else if (eml > 1)
951 : {
952 : /*
953 : * When encoding max length is > 1, we can't get LC without
954 : * detoasting, so we'll grab a conservatively large slice now and go
955 : * back later to do the right thing
956 : */
957 : int32 slice_start;
958 : int32 slice_size;
959 : int32 slice_strlen;
960 : text *slice;
961 : int32 E1;
962 : int32 i;
963 : char *p;
964 : char *s;
965 : text *ret;
966 :
967 : /*
968 : * We need to start at position zero because there is no way to know
969 : * in advance which byte offset corresponds to the supplied start
970 : * position.
971 : */
972 628874 : slice_start = 0;
973 :
974 628874 : if (length_not_specified) /* special case - get length to end of
975 : * string */
976 76 : slice_size = L1 = -1;
977 628798 : else if (length < 0)
978 : {
979 : /* SQL99 says to throw an error for E < S, i.e., negative length */
980 12 : ereport(ERROR,
981 : (errcode(ERRCODE_SUBSTRING_ERROR),
982 : errmsg("negative substring length not allowed")));
983 : slice_size = L1 = -1; /* silence stupider compilers */
984 : }
985 628786 : else if (pg_add_s32_overflow(S, length, &E))
986 : {
987 : /*
988 : * L could be large enough for S + L to overflow, in which case
989 : * the substring must run to end of string.
990 : */
991 6 : slice_size = L1 = -1;
992 : }
993 : else
994 : {
995 : /*
996 : * A zero or negative value for the end position can happen if the
997 : * start was negative or one. SQL99 says to return a zero-length
998 : * string.
999 : */
1000 628780 : if (E < 1)
1001 0 : return cstring_to_text("");
1002 :
1003 : /*
1004 : * if E is past the end of the string, the tuple toaster will
1005 : * truncate the length for us
1006 : */
1007 628780 : L1 = E - S1;
1008 :
1009 : /*
1010 : * Total slice size in bytes can't be any longer than the start
1011 : * position plus substring length times the encoding max length.
1012 : * If that overflows, we can just use -1.
1013 : */
1014 628780 : if (pg_mul_s32_overflow(E, eml, &slice_size))
1015 6 : slice_size = -1;
1016 : }
1017 :
1018 : /*
1019 : * If we're working with an untoasted source, no need to do an extra
1020 : * copying step.
1021 : */
1022 628862 : if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
1023 628808 : VARATT_IS_EXTERNAL(DatumGetPointer(str)))
1024 324 : slice = DatumGetTextPSlice(str, slice_start, slice_size);
1025 : else
1026 628538 : slice = (text *) DatumGetPointer(str);
1027 :
1028 : /* see if we got back an empty string */
1029 628862 : if (VARSIZE_ANY_EXHDR(slice) == 0)
1030 : {
1031 0 : if (slice != (text *) DatumGetPointer(str))
1032 0 : pfree(slice);
1033 0 : return cstring_to_text("");
1034 : }
1035 :
1036 : /* Now we can get the actual length of the slice in MB characters */
1037 628862 : slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1038 628862 : VARSIZE_ANY_EXHDR(slice));
1039 :
1040 : /*
1041 : * Check that the start position wasn't > slice_strlen. If so, SQL99
1042 : * says to return a zero-length string.
1043 : */
1044 628862 : if (S1 > slice_strlen)
1045 : {
1046 22 : if (slice != (text *) DatumGetPointer(str))
1047 0 : pfree(slice);
1048 22 : return cstring_to_text("");
1049 : }
1050 :
1051 : /*
1052 : * Adjust L1 and E1 now that we know the slice string length. Again
1053 : * remember that S1 is one based, and slice_start is zero based.
1054 : */
1055 628840 : if (L1 > -1)
1056 628780 : E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1057 : else
1058 60 : E1 = slice_start + 1 + slice_strlen;
1059 :
1060 : /*
1061 : * Find the start position in the slice; remember S1 is not zero based
1062 : */
1063 628840 : p = VARDATA_ANY(slice);
1064 5450186 : for (i = 0; i < S1 - 1; i++)
1065 4821346 : p += pg_mblen(p);
1066 :
1067 : /* hang onto a pointer to our start position */
1068 628840 : s = p;
1069 :
1070 : /*
1071 : * Count the actual bytes used by the substring of the requested
1072 : * length.
1073 : */
1074 9793252 : for (i = S1; i < E1; i++)
1075 9164412 : p += pg_mblen(p);
1076 :
1077 628840 : ret = (text *) palloc(VARHDRSZ + (p - s));
1078 628840 : SET_VARSIZE(ret, VARHDRSZ + (p - s));
1079 628840 : memcpy(VARDATA(ret), s, (p - s));
1080 :
1081 628840 : if (slice != (text *) DatumGetPointer(str))
1082 324 : pfree(slice);
1083 :
1084 628840 : return ret;
1085 : }
1086 : else
1087 0 : elog(ERROR, "invalid backend encoding: encoding max length < 1");
1088 :
1089 : /* not reached: suppress compiler warning */
1090 : return NULL;
1091 : }
1092 :
1093 : /*
1094 : * textoverlay
1095 : * Replace specified substring of first string with second
1096 : *
1097 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1098 : * This code is a direct implementation of what the standard says.
1099 : */
1100 : Datum
1101 28 : textoverlay(PG_FUNCTION_ARGS)
1102 : {
1103 28 : text *t1 = PG_GETARG_TEXT_PP(0);
1104 28 : text *t2 = PG_GETARG_TEXT_PP(1);
1105 28 : int sp = PG_GETARG_INT32(2); /* substring start position */
1106 28 : int sl = PG_GETARG_INT32(3); /* substring length */
1107 :
1108 28 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1109 : }
1110 :
1111 : Datum
1112 12 : textoverlay_no_len(PG_FUNCTION_ARGS)
1113 : {
1114 12 : text *t1 = PG_GETARG_TEXT_PP(0);
1115 12 : text *t2 = PG_GETARG_TEXT_PP(1);
1116 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
1117 : int sl;
1118 :
1119 12 : sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1120 12 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1121 : }
1122 :
1123 : static text *
1124 40 : text_overlay(text *t1, text *t2, int sp, int sl)
1125 : {
1126 : text *result;
1127 : text *s1;
1128 : text *s2;
1129 : int sp_pl_sl;
1130 :
1131 : /*
1132 : * Check for possible integer-overflow cases. For negative sp, throw a
1133 : * "substring length" error because that's what should be expected
1134 : * according to the spec's definition of OVERLAY().
1135 : */
1136 40 : if (sp <= 0)
1137 0 : ereport(ERROR,
1138 : (errcode(ERRCODE_SUBSTRING_ERROR),
1139 : errmsg("negative substring length not allowed")));
1140 40 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1141 0 : ereport(ERROR,
1142 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1143 : errmsg("integer out of range")));
1144 :
1145 40 : s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1146 40 : s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1147 40 : result = text_catenate(s1, t2);
1148 40 : result = text_catenate(result, s2);
1149 :
1150 40 : return result;
1151 : }
1152 :
1153 : /*
1154 : * textpos -
1155 : * Return the position of the specified substring.
1156 : * Implements the SQL POSITION() function.
1157 : * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1158 : * - thomas 1997-07-27
1159 : */
1160 : Datum
1161 130 : textpos(PG_FUNCTION_ARGS)
1162 : {
1163 130 : text *str = PG_GETARG_TEXT_PP(0);
1164 130 : text *search_str = PG_GETARG_TEXT_PP(1);
1165 :
1166 130 : PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1167 : }
1168 :
1169 : /*
1170 : * text_position -
1171 : * Does the real work for textpos()
1172 : *
1173 : * Inputs:
1174 : * t1 - string to be searched
1175 : * t2 - pattern to match within t1
1176 : * Result:
1177 : * Character index of the first matched char, starting from 1,
1178 : * or 0 if no match.
1179 : *
1180 : * This is broken out so it can be called directly by other string processing
1181 : * functions.
1182 : */
1183 : static int
1184 130 : text_position(text *t1, text *t2, Oid collid)
1185 : {
1186 : TextPositionState state;
1187 : int result;
1188 :
1189 130 : check_collation_set(collid);
1190 :
1191 : /* Empty needle always matches at position 1 */
1192 130 : if (VARSIZE_ANY_EXHDR(t2) < 1)
1193 12 : return 1;
1194 :
1195 : /* Otherwise, can't match if haystack is shorter than needle */
1196 118 : if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
1197 22 : pg_newlocale_from_collation(collid)->deterministic)
1198 22 : return 0;
1199 :
1200 96 : text_position_setup(t1, t2, collid, &state);
1201 : /* don't need greedy mode here */
1202 96 : state.greedy = false;
1203 :
1204 96 : if (!text_position_next(&state))
1205 24 : result = 0;
1206 : else
1207 72 : result = text_position_get_match_pos(&state);
1208 96 : text_position_cleanup(&state);
1209 96 : return result;
1210 : }
1211 :
1212 :
1213 : /*
1214 : * text_position_setup, text_position_next, text_position_cleanup -
1215 : * Component steps of text_position()
1216 : *
1217 : * These are broken out so that a string can be efficiently searched for
1218 : * multiple occurrences of the same pattern. text_position_next may be
1219 : * called multiple times, and it advances to the next match on each call.
1220 : * text_position_get_match_ptr() and text_position_get_match_pos() return
1221 : * a pointer or 1-based character position of the last match, respectively.
1222 : *
1223 : * The "state" variable is normally just a local variable in the caller.
1224 : *
1225 : * NOTE: text_position_next skips over the matched portion. For example,
1226 : * searching for "xx" in "xxx" returns only one match, not two.
1227 : */
1228 :
1229 : static void
1230 1688 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1231 : {
1232 1688 : int len1 = VARSIZE_ANY_EXHDR(t1);
1233 1688 : int len2 = VARSIZE_ANY_EXHDR(t2);
1234 :
1235 1688 : check_collation_set(collid);
1236 :
1237 1688 : state->locale = pg_newlocale_from_collation(collid);
1238 :
1239 : /*
1240 : * Most callers need greedy mode, but some might want to unset this to
1241 : * optimize.
1242 : */
1243 1688 : state->greedy = true;
1244 :
1245 : Assert(len2 > 0);
1246 :
1247 : /*
1248 : * Even with a multi-byte encoding, we perform the search using the raw
1249 : * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1250 : * because in UTF-8 the byte sequence of one character cannot contain
1251 : * another character. For other multi-byte encodings, we do the search
1252 : * initially as a simple byte search, ignoring multibyte issues, but
1253 : * verify afterwards that the match we found is at a character boundary,
1254 : * and continue the search if it was a false match.
1255 : */
1256 1688 : if (pg_database_encoding_max_length() == 1)
1257 108 : state->is_multibyte_char_in_char = false;
1258 1580 : else if (GetDatabaseEncoding() == PG_UTF8)
1259 1580 : state->is_multibyte_char_in_char = false;
1260 : else
1261 0 : state->is_multibyte_char_in_char = true;
1262 :
1263 1688 : state->str1 = VARDATA_ANY(t1);
1264 1688 : state->str2 = VARDATA_ANY(t2);
1265 1688 : state->len1 = len1;
1266 1688 : state->len2 = len2;
1267 1688 : state->last_match = NULL;
1268 1688 : state->refpoint = state->str1;
1269 1688 : state->refpos = 0;
1270 :
1271 : /*
1272 : * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1273 : * notes we use the terminology that the "haystack" is the string to be
1274 : * searched (t1) and the "needle" is the pattern being sought (t2).
1275 : *
1276 : * If the needle is empty or bigger than the haystack then there is no
1277 : * point in wasting cycles initializing the table. We also choose not to
1278 : * use B-M-H for needles of length 1, since the skip table can't possibly
1279 : * save anything in that case.
1280 : *
1281 : * (With nondeterministic collations, the search is already
1282 : * multibyte-aware, so we don't need this.)
1283 : */
1284 1688 : if (len1 >= len2 && len2 > 1 && state->locale->deterministic)
1285 : {
1286 1360 : int searchlength = len1 - len2;
1287 : int skiptablemask;
1288 : int last;
1289 : int i;
1290 1360 : const char *str2 = state->str2;
1291 :
1292 : /*
1293 : * First we must determine how much of the skip table to use. The
1294 : * declaration of TextPositionState allows up to 256 elements, but for
1295 : * short search problems we don't really want to have to initialize so
1296 : * many elements --- it would take too long in comparison to the
1297 : * actual search time. So we choose a useful skip table size based on
1298 : * the haystack length minus the needle length. The closer the needle
1299 : * length is to the haystack length the less useful skipping becomes.
1300 : *
1301 : * Note: since we use bit-masking to select table elements, the skip
1302 : * table size MUST be a power of 2, and so the mask must be 2^N-1.
1303 : */
1304 1360 : if (searchlength < 16)
1305 114 : skiptablemask = 3;
1306 1246 : else if (searchlength < 64)
1307 16 : skiptablemask = 7;
1308 1230 : else if (searchlength < 128)
1309 14 : skiptablemask = 15;
1310 1216 : else if (searchlength < 512)
1311 244 : skiptablemask = 31;
1312 972 : else if (searchlength < 2048)
1313 746 : skiptablemask = 63;
1314 226 : else if (searchlength < 4096)
1315 154 : skiptablemask = 127;
1316 : else
1317 72 : skiptablemask = 255;
1318 1360 : state->skiptablemask = skiptablemask;
1319 :
1320 : /*
1321 : * Initialize the skip table. We set all elements to the needle
1322 : * length, since this is the correct skip distance for any character
1323 : * not found in the needle.
1324 : */
1325 95864 : for (i = 0; i <= skiptablemask; i++)
1326 94504 : state->skiptable[i] = len2;
1327 :
1328 : /*
1329 : * Now examine the needle. For each character except the last one,
1330 : * set the corresponding table element to the appropriate skip
1331 : * distance. Note that when two characters share the same skip table
1332 : * entry, the one later in the needle must determine the skip
1333 : * distance.
1334 : */
1335 1360 : last = len2 - 1;
1336 :
1337 18044 : for (i = 0; i < last; i++)
1338 16684 : state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1339 : }
1340 1688 : }
1341 :
1342 : /*
1343 : * Advance to the next match, starting from the end of the previous match
1344 : * (or the beginning of the string, on first call). Returns true if a match
1345 : * is found.
1346 : *
1347 : * Note that this refuses to match an empty-string needle. Most callers
1348 : * will have handled that case specially and we'll never see it here.
1349 : */
1350 : static bool
1351 7628 : text_position_next(TextPositionState *state)
1352 : {
1353 7628 : int needle_len = state->len2;
1354 : char *start_ptr;
1355 : char *matchptr;
1356 :
1357 7628 : if (needle_len <= 0)
1358 0 : return false; /* result for empty pattern */
1359 :
1360 : /* Start from the point right after the previous match. */
1361 7628 : if (state->last_match)
1362 5928 : start_ptr = state->last_match + state->last_match_len;
1363 : else
1364 1700 : start_ptr = state->str1;
1365 :
1366 7628 : retry:
1367 7628 : matchptr = text_position_next_internal(start_ptr, state);
1368 :
1369 7628 : if (!matchptr)
1370 1604 : return false;
1371 :
1372 : /*
1373 : * Found a match for the byte sequence. If this is a multibyte encoding,
1374 : * where one character's byte sequence can appear inside a longer
1375 : * multi-byte character, we need to verify that the match was at a
1376 : * character boundary, not in the middle of a multi-byte character.
1377 : */
1378 6024 : if (state->is_multibyte_char_in_char && state->locale->deterministic)
1379 : {
1380 : /* Walk one character at a time, until we reach the match. */
1381 :
1382 : /* the search should never move backwards. */
1383 : Assert(state->refpoint <= matchptr);
1384 :
1385 0 : while (state->refpoint < matchptr)
1386 : {
1387 : /* step to next character. */
1388 0 : state->refpoint += pg_mblen(state->refpoint);
1389 0 : state->refpos++;
1390 :
1391 : /*
1392 : * If we stepped over the match's start position, then it was a
1393 : * false positive, where the byte sequence appeared in the middle
1394 : * of a multi-byte character. Skip it, and continue the search at
1395 : * the next character boundary.
1396 : */
1397 0 : if (state->refpoint > matchptr)
1398 : {
1399 0 : start_ptr = state->refpoint;
1400 0 : goto retry;
1401 : }
1402 : }
1403 : }
1404 :
1405 6024 : state->last_match = matchptr;
1406 6024 : state->last_match_len = state->last_match_len_tmp;
1407 6024 : return true;
1408 : }
1409 :
1410 : /*
1411 : * Subroutine of text_position_next(). This searches for the raw byte
1412 : * sequence, ignoring any multi-byte encoding issues. Returns the first
1413 : * match starting at 'start_ptr', or NULL if no match is found.
1414 : */
1415 : static char *
1416 7628 : text_position_next_internal(char *start_ptr, TextPositionState *state)
1417 : {
1418 7628 : int haystack_len = state->len1;
1419 7628 : int needle_len = state->len2;
1420 7628 : int skiptablemask = state->skiptablemask;
1421 7628 : const char *haystack = state->str1;
1422 7628 : const char *needle = state->str2;
1423 7628 : const char *haystack_end = &haystack[haystack_len];
1424 : const char *hptr;
1425 :
1426 : Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1427 :
1428 7628 : state->last_match_len_tmp = needle_len;
1429 :
1430 7628 : if (!state->locale->deterministic)
1431 : {
1432 : /*
1433 : * With a nondeterministic collation, we have to use an unoptimized
1434 : * route. We walk through the haystack and see if at each position
1435 : * there is a substring of the remaining string that is equal to the
1436 : * needle under the given collation.
1437 : *
1438 : * Note, the found substring could have a different length than the
1439 : * needle, including being empty. Callers that want to skip over the
1440 : * found string need to read the length of the found substring from
1441 : * last_match_len rather than just using the length of their needle.
1442 : *
1443 : * Most callers will require "greedy" semantics, meaning that we need
1444 : * to find the longest such substring, not the shortest. For callers
1445 : * that don't need greedy semantics, we can finish on the first match.
1446 : */
1447 240 : const char *result_hptr = NULL;
1448 :
1449 240 : hptr = start_ptr;
1450 642 : while (hptr < haystack_end)
1451 : {
1452 : /*
1453 : * First check the common case that there is a match in the
1454 : * haystack of exactly the length of the needle.
1455 : */
1456 534 : if (!state->greedy &&
1457 108 : haystack_end - hptr >= needle_len &&
1458 54 : pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
1459 12 : return (char *) hptr;
1460 :
1461 : /*
1462 : * Else check if any of the possible substrings starting at hptr
1463 : * are equal to the needle.
1464 : */
1465 2586 : for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
1466 : {
1467 2064 : if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
1468 : {
1469 132 : state->last_match_len_tmp = (test_end - hptr);
1470 132 : result_hptr = hptr;
1471 132 : if (!state->greedy)
1472 0 : break;
1473 : }
1474 : }
1475 522 : if (result_hptr)
1476 120 : break;
1477 :
1478 402 : hptr += pg_mblen(hptr);
1479 : }
1480 :
1481 228 : return (char *) result_hptr;
1482 : }
1483 7388 : else if (needle_len == 1)
1484 : {
1485 : /* No point in using B-M-H for a one-character needle */
1486 760 : char nchar = *needle;
1487 :
1488 760 : hptr = start_ptr;
1489 5878 : while (hptr < haystack_end)
1490 : {
1491 5712 : if (*hptr == nchar)
1492 594 : return (char *) hptr;
1493 5118 : hptr++;
1494 : }
1495 : }
1496 : else
1497 : {
1498 6628 : const char *needle_last = &needle[needle_len - 1];
1499 :
1500 : /* Start at startpos plus the length of the needle */
1501 6628 : hptr = start_ptr + needle_len - 1;
1502 169760 : while (hptr < haystack_end)
1503 : {
1504 : /* Match the needle scanning *backward* */
1505 : const char *nptr;
1506 : const char *p;
1507 :
1508 168430 : nptr = needle_last;
1509 168430 : p = hptr;
1510 247434 : while (*nptr == *p)
1511 : {
1512 : /* Matched it all? If so, return 1-based position */
1513 84302 : if (nptr == needle)
1514 5298 : return (char *) p;
1515 79004 : nptr--, p--;
1516 : }
1517 :
1518 : /*
1519 : * No match, so use the haystack char at hptr to decide how far to
1520 : * advance. If the needle had any occurrence of that character
1521 : * (or more precisely, one sharing the same skiptable entry)
1522 : * before its last character, then we advance far enough to align
1523 : * the last such needle character with that haystack position.
1524 : * Otherwise we can advance by the whole needle length.
1525 : */
1526 163132 : hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1527 : }
1528 : }
1529 :
1530 1496 : return 0; /* not found */
1531 : }
1532 :
1533 : /*
1534 : * Return a pointer to the current match.
1535 : *
1536 : * The returned pointer points into the original haystack string.
1537 : */
1538 : static char *
1539 5922 : text_position_get_match_ptr(TextPositionState *state)
1540 : {
1541 5922 : return state->last_match;
1542 : }
1543 :
1544 : /*
1545 : * Return the offset of the current match.
1546 : *
1547 : * The offset is in characters, 1-based.
1548 : */
1549 : static int
1550 72 : text_position_get_match_pos(TextPositionState *state)
1551 : {
1552 : /* Convert the byte position to char position. */
1553 144 : state->refpos += pg_mbstrlen_with_len(state->refpoint,
1554 72 : state->last_match - state->refpoint);
1555 72 : state->refpoint = state->last_match;
1556 72 : return state->refpos + 1;
1557 : }
1558 :
1559 : /*
1560 : * Reset search state to the initial state installed by text_position_setup.
1561 : *
1562 : * The next call to text_position_next will search from the beginning
1563 : * of the string.
1564 : */
1565 : static void
1566 12 : text_position_reset(TextPositionState *state)
1567 : {
1568 12 : state->last_match = NULL;
1569 12 : state->refpoint = state->str1;
1570 12 : state->refpos = 0;
1571 12 : }
1572 :
1573 : static void
1574 1688 : text_position_cleanup(TextPositionState *state)
1575 : {
1576 : /* no cleanup needed */
1577 1688 : }
1578 :
1579 :
1580 : static void
1581 16792886 : check_collation_set(Oid collid)
1582 : {
1583 16792886 : if (!OidIsValid(collid))
1584 : {
1585 : /*
1586 : * This typically means that the parser could not resolve a conflict
1587 : * of implicit collations, so report it that way.
1588 : */
1589 30 : ereport(ERROR,
1590 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
1591 : errmsg("could not determine which collation to use for string comparison"),
1592 : errhint("Use the COLLATE clause to set the collation explicitly.")));
1593 : }
1594 16792856 : }
1595 :
1596 : /*
1597 : * varstr_cmp()
1598 : *
1599 : * Comparison function for text strings with given lengths, using the
1600 : * appropriate locale. Returns an integer less than, equal to, or greater than
1601 : * zero, indicating whether arg1 is less than, equal to, or greater than arg2.
1602 : *
1603 : * Note: many functions that depend on this are marked leakproof; therefore,
1604 : * avoid reporting the actual contents of the input when throwing errors.
1605 : * All errors herein should be things that can't happen except on corrupt
1606 : * data, anyway; otherwise we will have trouble with indexing strings that
1607 : * would cause them.
1608 : */
1609 : int
1610 9790138 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1611 : {
1612 : int result;
1613 : pg_locale_t mylocale;
1614 :
1615 9790138 : check_collation_set(collid);
1616 :
1617 9790120 : mylocale = pg_newlocale_from_collation(collid);
1618 :
1619 9790120 : if (mylocale->collate_is_c)
1620 : {
1621 3809724 : result = memcmp(arg1, arg2, Min(len1, len2));
1622 3809724 : if ((result == 0) && (len1 != len2))
1623 121616 : result = (len1 < len2) ? -1 : 1;
1624 : }
1625 : else
1626 : {
1627 : /*
1628 : * memcmp() can't tell us which of two unequal strings sorts first,
1629 : * but it's a cheap way to tell if they're equal. Testing shows that
1630 : * memcmp() followed by strcoll() is only trivially slower than
1631 : * strcoll() by itself, so we don't lose much if this doesn't work out
1632 : * very often, and if it does - for example, because there are many
1633 : * equal strings in the input - then we win big by avoiding expensive
1634 : * collation-aware comparisons.
1635 : */
1636 5980396 : if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1637 1505640 : return 0;
1638 :
1639 4474756 : result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
1640 :
1641 : /* Break tie if necessary. */
1642 4474756 : if (result == 0 && mylocale->deterministic)
1643 : {
1644 0 : result = memcmp(arg1, arg2, Min(len1, len2));
1645 0 : if ((result == 0) && (len1 != len2))
1646 0 : result = (len1 < len2) ? -1 : 1;
1647 : }
1648 : }
1649 :
1650 8284480 : return result;
1651 : }
1652 :
1653 : /* text_cmp()
1654 : * Internal comparison function for text strings.
1655 : * Returns -1, 0 or 1
1656 : */
1657 : static int
1658 7630972 : text_cmp(text *arg1, text *arg2, Oid collid)
1659 : {
1660 : char *a1p,
1661 : *a2p;
1662 : int len1,
1663 : len2;
1664 :
1665 7630972 : a1p = VARDATA_ANY(arg1);
1666 7630972 : a2p = VARDATA_ANY(arg2);
1667 :
1668 7630972 : len1 = VARSIZE_ANY_EXHDR(arg1);
1669 7630972 : len2 = VARSIZE_ANY_EXHDR(arg2);
1670 :
1671 7630972 : return varstr_cmp(a1p, len1, a2p, len2, collid);
1672 : }
1673 :
1674 : /*
1675 : * Comparison functions for text strings.
1676 : *
1677 : * Note: btree indexes need these routines not to leak memory; therefore,
1678 : * be careful to free working copies of toasted datums. Most places don't
1679 : * need to be so careful.
1680 : */
1681 :
1682 : Datum
1683 6594788 : texteq(PG_FUNCTION_ARGS)
1684 : {
1685 6594788 : Oid collid = PG_GET_COLLATION();
1686 6594788 : pg_locale_t mylocale = 0;
1687 : bool result;
1688 :
1689 6594788 : check_collation_set(collid);
1690 :
1691 6594788 : mylocale = pg_newlocale_from_collation(collid);
1692 :
1693 6594788 : if (mylocale->deterministic)
1694 : {
1695 6586348 : Datum arg1 = PG_GETARG_DATUM(0);
1696 6586348 : Datum arg2 = PG_GETARG_DATUM(1);
1697 : Size len1,
1698 : len2;
1699 :
1700 : /*
1701 : * Since we only care about equality or not-equality, we can avoid all
1702 : * the expense of strcoll() here, and just do bitwise comparison. In
1703 : * fact, we don't even have to do a bitwise comparison if we can show
1704 : * the lengths of the strings are unequal; which might save us from
1705 : * having to detoast one or both values.
1706 : */
1707 6586348 : len1 = toast_raw_datum_size(arg1);
1708 6586348 : len2 = toast_raw_datum_size(arg2);
1709 6586348 : if (len1 != len2)
1710 3138376 : result = false;
1711 : else
1712 : {
1713 3447972 : text *targ1 = DatumGetTextPP(arg1);
1714 3447972 : text *targ2 = DatumGetTextPP(arg2);
1715 :
1716 3447972 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1717 : len1 - VARHDRSZ) == 0);
1718 :
1719 3447972 : PG_FREE_IF_COPY(targ1, 0);
1720 3447972 : PG_FREE_IF_COPY(targ2, 1);
1721 : }
1722 : }
1723 : else
1724 : {
1725 8440 : text *arg1 = PG_GETARG_TEXT_PP(0);
1726 8440 : text *arg2 = PG_GETARG_TEXT_PP(1);
1727 :
1728 8440 : result = (text_cmp(arg1, arg2, collid) == 0);
1729 :
1730 8440 : PG_FREE_IF_COPY(arg1, 0);
1731 8440 : PG_FREE_IF_COPY(arg2, 1);
1732 : }
1733 :
1734 6594788 : PG_RETURN_BOOL(result);
1735 : }
1736 :
1737 : Datum
1738 22758 : textne(PG_FUNCTION_ARGS)
1739 : {
1740 22758 : Oid collid = PG_GET_COLLATION();
1741 : pg_locale_t mylocale;
1742 : bool result;
1743 :
1744 22758 : check_collation_set(collid);
1745 :
1746 22758 : mylocale = pg_newlocale_from_collation(collid);
1747 :
1748 22758 : if (mylocale->deterministic)
1749 : {
1750 22734 : Datum arg1 = PG_GETARG_DATUM(0);
1751 22734 : Datum arg2 = PG_GETARG_DATUM(1);
1752 : Size len1,
1753 : len2;
1754 :
1755 : /* See comment in texteq() */
1756 22734 : len1 = toast_raw_datum_size(arg1);
1757 22734 : len2 = toast_raw_datum_size(arg2);
1758 22734 : if (len1 != len2)
1759 4424 : result = true;
1760 : else
1761 : {
1762 18310 : text *targ1 = DatumGetTextPP(arg1);
1763 18310 : text *targ2 = DatumGetTextPP(arg2);
1764 :
1765 18310 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1766 : len1 - VARHDRSZ) != 0);
1767 :
1768 18310 : PG_FREE_IF_COPY(targ1, 0);
1769 18310 : PG_FREE_IF_COPY(targ2, 1);
1770 : }
1771 : }
1772 : else
1773 : {
1774 24 : text *arg1 = PG_GETARG_TEXT_PP(0);
1775 24 : text *arg2 = PG_GETARG_TEXT_PP(1);
1776 :
1777 24 : result = (text_cmp(arg1, arg2, collid) != 0);
1778 :
1779 24 : PG_FREE_IF_COPY(arg1, 0);
1780 24 : PG_FREE_IF_COPY(arg2, 1);
1781 : }
1782 :
1783 22758 : PG_RETURN_BOOL(result);
1784 : }
1785 :
1786 : Datum
1787 211848 : text_lt(PG_FUNCTION_ARGS)
1788 : {
1789 211848 : text *arg1 = PG_GETARG_TEXT_PP(0);
1790 211848 : text *arg2 = PG_GETARG_TEXT_PP(1);
1791 : bool result;
1792 :
1793 211848 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1794 :
1795 211830 : PG_FREE_IF_COPY(arg1, 0);
1796 211830 : PG_FREE_IF_COPY(arg2, 1);
1797 :
1798 211830 : PG_RETURN_BOOL(result);
1799 : }
1800 :
1801 : Datum
1802 318114 : text_le(PG_FUNCTION_ARGS)
1803 : {
1804 318114 : text *arg1 = PG_GETARG_TEXT_PP(0);
1805 318114 : text *arg2 = PG_GETARG_TEXT_PP(1);
1806 : bool result;
1807 :
1808 318114 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1809 :
1810 318114 : PG_FREE_IF_COPY(arg1, 0);
1811 318114 : PG_FREE_IF_COPY(arg2, 1);
1812 :
1813 318114 : PG_RETURN_BOOL(result);
1814 : }
1815 :
1816 : Datum
1817 195830 : text_gt(PG_FUNCTION_ARGS)
1818 : {
1819 195830 : text *arg1 = PG_GETARG_TEXT_PP(0);
1820 195830 : text *arg2 = PG_GETARG_TEXT_PP(1);
1821 : bool result;
1822 :
1823 195830 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1824 :
1825 195830 : PG_FREE_IF_COPY(arg1, 0);
1826 195830 : PG_FREE_IF_COPY(arg2, 1);
1827 :
1828 195830 : PG_RETURN_BOOL(result);
1829 : }
1830 :
1831 : Datum
1832 177896 : text_ge(PG_FUNCTION_ARGS)
1833 : {
1834 177896 : text *arg1 = PG_GETARG_TEXT_PP(0);
1835 177896 : text *arg2 = PG_GETARG_TEXT_PP(1);
1836 : bool result;
1837 :
1838 177896 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1839 :
1840 177896 : PG_FREE_IF_COPY(arg1, 0);
1841 177896 : PG_FREE_IF_COPY(arg2, 1);
1842 :
1843 177896 : PG_RETURN_BOOL(result);
1844 : }
1845 :
1846 : Datum
1847 37914 : text_starts_with(PG_FUNCTION_ARGS)
1848 : {
1849 37914 : Datum arg1 = PG_GETARG_DATUM(0);
1850 37914 : Datum arg2 = PG_GETARG_DATUM(1);
1851 37914 : Oid collid = PG_GET_COLLATION();
1852 : pg_locale_t mylocale;
1853 : bool result;
1854 : Size len1,
1855 : len2;
1856 :
1857 37914 : check_collation_set(collid);
1858 :
1859 37914 : mylocale = pg_newlocale_from_collation(collid);
1860 :
1861 37914 : if (!mylocale->deterministic)
1862 0 : ereport(ERROR,
1863 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1864 : errmsg("nondeterministic collations are not supported for substring searches")));
1865 :
1866 37914 : len1 = toast_raw_datum_size(arg1);
1867 37914 : len2 = toast_raw_datum_size(arg2);
1868 37914 : if (len2 > len1)
1869 0 : result = false;
1870 : else
1871 : {
1872 37914 : text *targ1 = text_substring(arg1, 1, len2, false);
1873 37914 : text *targ2 = DatumGetTextPP(arg2);
1874 :
1875 37914 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1876 37914 : VARSIZE_ANY_EXHDR(targ2)) == 0);
1877 :
1878 37914 : PG_FREE_IF_COPY(targ1, 0);
1879 37914 : PG_FREE_IF_COPY(targ2, 1);
1880 : }
1881 :
1882 37914 : PG_RETURN_BOOL(result);
1883 : }
1884 :
1885 : Datum
1886 6403184 : bttextcmp(PG_FUNCTION_ARGS)
1887 : {
1888 6403184 : text *arg1 = PG_GETARG_TEXT_PP(0);
1889 6403184 : text *arg2 = PG_GETARG_TEXT_PP(1);
1890 : int32 result;
1891 :
1892 6403184 : result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1893 :
1894 6403184 : PG_FREE_IF_COPY(arg1, 0);
1895 6403184 : PG_FREE_IF_COPY(arg2, 1);
1896 :
1897 6403184 : PG_RETURN_INT32(result);
1898 : }
1899 :
1900 : Datum
1901 84468 : bttextsortsupport(PG_FUNCTION_ARGS)
1902 : {
1903 84468 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1904 84468 : Oid collid = ssup->ssup_collation;
1905 : MemoryContext oldcontext;
1906 :
1907 84468 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1908 :
1909 : /* Use generic string SortSupport */
1910 84468 : varstr_sortsupport(ssup, TEXTOID, collid);
1911 :
1912 84456 : MemoryContextSwitchTo(oldcontext);
1913 :
1914 84456 : PG_RETURN_VOID();
1915 : }
1916 :
1917 : /*
1918 : * Generic sortsupport interface for character type's operator classes.
1919 : * Includes locale support, and support for BpChar semantics (i.e. removing
1920 : * trailing spaces before comparison).
1921 : *
1922 : * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1923 : * same representation. Callers that always use the C collation (e.g.
1924 : * non-collatable type callers like bytea) may have NUL bytes in their strings;
1925 : * this will not work with any other collation, though.
1926 : */
1927 : void
1928 140284 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
1929 : {
1930 140284 : bool abbreviate = ssup->abbreviate;
1931 140284 : bool collate_c = false;
1932 : VarStringSortSupport *sss;
1933 : pg_locale_t locale;
1934 :
1935 140284 : check_collation_set(collid);
1936 :
1937 140272 : locale = pg_newlocale_from_collation(collid);
1938 :
1939 : /*
1940 : * If possible, set ssup->comparator to a function which can be used to
1941 : * directly compare two datums. If we can do this, we'll avoid the
1942 : * overhead of a trip through the fmgr layer for every comparison, which
1943 : * can be substantial.
1944 : *
1945 : * Most typically, we'll set the comparator to varlenafastcmp_locale,
1946 : * which uses strcoll() to perform comparisons. We use that for the
1947 : * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1948 : * LC_COLLATE = C, we can make things quite a bit faster with
1949 : * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1950 : * memcmp() rather than strcoll().
1951 : */
1952 140272 : if (locale->collate_is_c)
1953 : {
1954 95588 : if (typid == BPCHAROID)
1955 308 : ssup->comparator = bpcharfastcmp_c;
1956 95280 : else if (typid == NAMEOID)
1957 : {
1958 54752 : ssup->comparator = namefastcmp_c;
1959 : /* Not supporting abbreviation with type NAME, for now */
1960 54752 : abbreviate = false;
1961 : }
1962 : else
1963 40528 : ssup->comparator = varstrfastcmp_c;
1964 :
1965 95588 : collate_c = true;
1966 : }
1967 : else
1968 : {
1969 : /*
1970 : * We use varlenafastcmp_locale except for type NAME.
1971 : */
1972 44684 : if (typid == NAMEOID)
1973 : {
1974 0 : ssup->comparator = namefastcmp_locale;
1975 : /* Not supporting abbreviation with type NAME, for now */
1976 0 : abbreviate = false;
1977 : }
1978 : else
1979 44684 : ssup->comparator = varlenafastcmp_locale;
1980 :
1981 : /*
1982 : * Unfortunately, it seems that abbreviation for non-C collations is
1983 : * broken on many common platforms; see pg_strxfrm_enabled().
1984 : *
1985 : * Even apart from the risk of broken locales, it's possible that
1986 : * there are platforms where the use of abbreviated keys should be
1987 : * disabled at compile time. Having only 4 byte datums could make
1988 : * worst-case performance drastically more likely, for example.
1989 : * Moreover, macOS's strxfrm() implementation is known to not
1990 : * effectively concentrate a significant amount of entropy from the
1991 : * original string in earlier transformed blobs. It's possible that
1992 : * other supported platforms are similarly encumbered. So, if we ever
1993 : * get past disabling this categorically, we may still want or need to
1994 : * disable it for particular platforms.
1995 : */
1996 44684 : if (!pg_strxfrm_enabled(locale))
1997 43894 : abbreviate = false;
1998 : }
1999 :
2000 : /*
2001 : * If we're using abbreviated keys, or if we're using a locale-aware
2002 : * comparison, we need to initialize a VarStringSortSupport object. Both
2003 : * cases will make use of the temporary buffers we initialize here for
2004 : * scratch space (and to detect requirement for BpChar semantics from
2005 : * caller), and the abbreviation case requires additional state.
2006 : */
2007 140272 : if (abbreviate || !collate_c)
2008 : {
2009 67816 : sss = palloc(sizeof(VarStringSortSupport));
2010 67816 : sss->buf1 = palloc(TEXTBUFLEN);
2011 67816 : sss->buflen1 = TEXTBUFLEN;
2012 67816 : sss->buf2 = palloc(TEXTBUFLEN);
2013 67816 : sss->buflen2 = TEXTBUFLEN;
2014 : /* Start with invalid values */
2015 67816 : sss->last_len1 = -1;
2016 67816 : sss->last_len2 = -1;
2017 : /* Initialize */
2018 67816 : sss->last_returned = 0;
2019 67816 : if (collate_c)
2020 23132 : sss->locale = NULL;
2021 : else
2022 44684 : sss->locale = locale;
2023 :
2024 : /*
2025 : * To avoid somehow confusing a strxfrm() blob and an original string,
2026 : * constantly keep track of the variety of data that buf1 and buf2
2027 : * currently contain.
2028 : *
2029 : * Comparisons may be interleaved with conversion calls. Frequently,
2030 : * conversions and comparisons are batched into two distinct phases,
2031 : * but the correctness of caching cannot hinge upon this. For
2032 : * comparison caching, buffer state is only trusted if cache_blob is
2033 : * found set to false, whereas strxfrm() caching only trusts the state
2034 : * when cache_blob is found set to true.
2035 : *
2036 : * Arbitrarily initialize cache_blob to true.
2037 : */
2038 67816 : sss->cache_blob = true;
2039 67816 : sss->collate_c = collate_c;
2040 67816 : sss->typid = typid;
2041 67816 : ssup->ssup_extra = sss;
2042 :
2043 : /*
2044 : * If possible, plan to use the abbreviated keys optimization. The
2045 : * core code may switch back to authoritative comparator should
2046 : * abbreviation be aborted.
2047 : */
2048 67816 : if (abbreviate)
2049 : {
2050 23724 : sss->prop_card = 0.20;
2051 23724 : initHyperLogLog(&sss->abbr_card, 10);
2052 23724 : initHyperLogLog(&sss->full_card, 10);
2053 23724 : ssup->abbrev_full_comparator = ssup->comparator;
2054 23724 : ssup->comparator = ssup_datum_unsigned_cmp;
2055 23724 : ssup->abbrev_converter = varstr_abbrev_convert;
2056 23724 : ssup->abbrev_abort = varstr_abbrev_abort;
2057 : }
2058 : }
2059 140272 : }
2060 :
2061 : /*
2062 : * sortsupport comparison func (for C locale case)
2063 : */
2064 : static int
2065 41463928 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2066 : {
2067 41463928 : VarString *arg1 = DatumGetVarStringPP(x);
2068 41463928 : VarString *arg2 = DatumGetVarStringPP(y);
2069 : char *a1p,
2070 : *a2p;
2071 : int len1,
2072 : len2,
2073 : result;
2074 :
2075 41463928 : a1p = VARDATA_ANY(arg1);
2076 41463928 : a2p = VARDATA_ANY(arg2);
2077 :
2078 41463928 : len1 = VARSIZE_ANY_EXHDR(arg1);
2079 41463928 : len2 = VARSIZE_ANY_EXHDR(arg2);
2080 :
2081 41463928 : result = memcmp(a1p, a2p, Min(len1, len2));
2082 41463928 : if ((result == 0) && (len1 != len2))
2083 1150456 : result = (len1 < len2) ? -1 : 1;
2084 :
2085 : /* We can't afford to leak memory here. */
2086 41463928 : if (PointerGetDatum(arg1) != x)
2087 2 : pfree(arg1);
2088 41463928 : if (PointerGetDatum(arg2) != y)
2089 2 : pfree(arg2);
2090 :
2091 41463928 : return result;
2092 : }
2093 :
2094 : /*
2095 : * sortsupport comparison func (for BpChar C locale case)
2096 : *
2097 : * BpChar outsources its sortsupport to this module. Specialization for the
2098 : * varstr_sortsupport BpChar case, modeled on
2099 : * internal_bpchar_pattern_compare().
2100 : */
2101 : static int
2102 62424 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2103 : {
2104 62424 : BpChar *arg1 = DatumGetBpCharPP(x);
2105 62424 : BpChar *arg2 = DatumGetBpCharPP(y);
2106 : char *a1p,
2107 : *a2p;
2108 : int len1,
2109 : len2,
2110 : result;
2111 :
2112 62424 : a1p = VARDATA_ANY(arg1);
2113 62424 : a2p = VARDATA_ANY(arg2);
2114 :
2115 62424 : len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2116 62424 : len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2117 :
2118 62424 : result = memcmp(a1p, a2p, Min(len1, len2));
2119 62424 : if ((result == 0) && (len1 != len2))
2120 4 : result = (len1 < len2) ? -1 : 1;
2121 :
2122 : /* We can't afford to leak memory here. */
2123 62424 : if (PointerGetDatum(arg1) != x)
2124 0 : pfree(arg1);
2125 62424 : if (PointerGetDatum(arg2) != y)
2126 0 : pfree(arg2);
2127 :
2128 62424 : return result;
2129 : }
2130 :
2131 : /*
2132 : * sortsupport comparison func (for NAME C locale case)
2133 : */
2134 : static int
2135 38023972 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2136 : {
2137 38023972 : Name arg1 = DatumGetName(x);
2138 38023972 : Name arg2 = DatumGetName(y);
2139 :
2140 38023972 : return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2141 : }
2142 :
2143 : /*
2144 : * sortsupport comparison func (for locale case with all varlena types)
2145 : */
2146 : static int
2147 35981984 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2148 : {
2149 35981984 : VarString *arg1 = DatumGetVarStringPP(x);
2150 35981984 : VarString *arg2 = DatumGetVarStringPP(y);
2151 : char *a1p,
2152 : *a2p;
2153 : int len1,
2154 : len2,
2155 : result;
2156 :
2157 35981984 : a1p = VARDATA_ANY(arg1);
2158 35981984 : a2p = VARDATA_ANY(arg2);
2159 :
2160 35981984 : len1 = VARSIZE_ANY_EXHDR(arg1);
2161 35981984 : len2 = VARSIZE_ANY_EXHDR(arg2);
2162 :
2163 35981984 : result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2164 :
2165 : /* We can't afford to leak memory here. */
2166 35981984 : if (PointerGetDatum(arg1) != x)
2167 4 : pfree(arg1);
2168 35981984 : if (PointerGetDatum(arg2) != y)
2169 4 : pfree(arg2);
2170 :
2171 35981984 : return result;
2172 : }
2173 :
2174 : /*
2175 : * sortsupport comparison func (for locale case with NAME type)
2176 : */
2177 : static int
2178 0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2179 : {
2180 0 : Name arg1 = DatumGetName(x);
2181 0 : Name arg2 = DatumGetName(y);
2182 :
2183 0 : return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2184 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2185 : ssup);
2186 : }
2187 :
2188 : /*
2189 : * sortsupport comparison func for locale cases
2190 : */
2191 : static int
2192 35981984 : varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2193 : {
2194 35981984 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2195 : int result;
2196 : bool arg1_match;
2197 :
2198 : /* Fast pre-check for equality, as discussed in varstr_cmp() */
2199 35981984 : if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2200 : {
2201 : /*
2202 : * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2203 : * last_len2. Existing contents of buffers might still be used by
2204 : * next call.
2205 : *
2206 : * It's fine to allow the comparison of BpChar padding bytes here,
2207 : * even though that implies that the memcmp() will usually be
2208 : * performed for BpChar callers (though multibyte characters could
2209 : * still prevent that from occurring). The memcmp() is still very
2210 : * cheap, and BpChar's funny semantics have us remove trailing spaces
2211 : * (not limited to padding), so we need make no distinction between
2212 : * padding space characters and "real" space characters.
2213 : */
2214 9274380 : return 0;
2215 : }
2216 :
2217 26707604 : if (sss->typid == BPCHAROID)
2218 : {
2219 : /* Get true number of bytes, ignoring trailing spaces */
2220 34496 : len1 = bpchartruelen(a1p, len1);
2221 34496 : len2 = bpchartruelen(a2p, len2);
2222 : }
2223 :
2224 26707604 : if (len1 >= sss->buflen1)
2225 : {
2226 14 : sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2227 14 : sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2228 : }
2229 26707604 : if (len2 >= sss->buflen2)
2230 : {
2231 10 : sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2232 10 : sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2233 : }
2234 :
2235 : /*
2236 : * We're likely to be asked to compare the same strings repeatedly, and
2237 : * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2238 : * comparisons, even though in general there is no reason to think that
2239 : * that will work out (every string datum may be unique). Caching does
2240 : * not slow things down measurably when it doesn't work out, and can speed
2241 : * things up by rather a lot when it does. In part, this is because the
2242 : * memcmp() compares data from cachelines that are needed in L1 cache even
2243 : * when the last comparison's result cannot be reused.
2244 : */
2245 26707604 : arg1_match = true;
2246 26707604 : if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2247 : {
2248 24726646 : arg1_match = false;
2249 24726646 : memcpy(sss->buf1, a1p, len1);
2250 24726646 : sss->buf1[len1] = '\0';
2251 24726646 : sss->last_len1 = len1;
2252 : }
2253 :
2254 : /*
2255 : * If we're comparing the same two strings as last time, we can return the
2256 : * same answer without calling strcoll() again. This is more likely than
2257 : * it seems (at least with moderate to low cardinality sets), because
2258 : * quicksort compares the same pivot against many values.
2259 : */
2260 26707604 : if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2261 : {
2262 4065954 : memcpy(sss->buf2, a2p, len2);
2263 4065954 : sss->buf2[len2] = '\0';
2264 4065954 : sss->last_len2 = len2;
2265 : }
2266 22641650 : else if (arg1_match && !sss->cache_blob)
2267 : {
2268 : /* Use result cached following last actual strcoll() call */
2269 1554488 : return sss->last_returned;
2270 : }
2271 :
2272 25153116 : result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
2273 :
2274 : /* Break tie if necessary. */
2275 25153116 : if (result == 0 && sss->locale->deterministic)
2276 0 : result = strcmp(sss->buf1, sss->buf2);
2277 :
2278 : /* Cache result, perhaps saving an expensive strcoll() call next time */
2279 25153116 : sss->cache_blob = false;
2280 25153116 : sss->last_returned = result;
2281 25153116 : return result;
2282 : }
2283 :
2284 : /*
2285 : * Conversion routine for sortsupport. Converts original to abbreviated key
2286 : * representation. Our encoding strategy is simple -- pack the first 8 bytes
2287 : * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2288 : * stored in reverse order), and treat it as an unsigned integer. When the "C"
2289 : * locale is used, or in case of bytea, just memcpy() from original instead.
2290 : */
2291 : static Datum
2292 829642 : varstr_abbrev_convert(Datum original, SortSupport ssup)
2293 : {
2294 829642 : const size_t max_prefix_bytes = sizeof(Datum);
2295 829642 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2296 829642 : VarString *authoritative = DatumGetVarStringPP(original);
2297 829642 : char *authoritative_data = VARDATA_ANY(authoritative);
2298 :
2299 : /* working state */
2300 : Datum res;
2301 : char *pres;
2302 : int len;
2303 : uint32 hash;
2304 :
2305 829642 : pres = (char *) &res;
2306 : /* memset(), so any non-overwritten bytes are NUL */
2307 829642 : memset(pres, 0, max_prefix_bytes);
2308 829642 : len = VARSIZE_ANY_EXHDR(authoritative);
2309 :
2310 : /* Get number of bytes, ignoring trailing spaces */
2311 829642 : if (sss->typid == BPCHAROID)
2312 1010 : len = bpchartruelen(authoritative_data, len);
2313 :
2314 : /*
2315 : * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2316 : * abbreviate keys. The full comparator for the C locale is always
2317 : * memcmp(). It would be incorrect to allow bytea callers (callers that
2318 : * always force the C collation -- bytea isn't a collatable type, but this
2319 : * approach is convenient) to use strxfrm(). This is because bytea
2320 : * strings may contain NUL bytes. Besides, this should be faster, too.
2321 : *
2322 : * More generally, it's okay that bytea callers can have NUL bytes in
2323 : * strings because abbreviated cmp need not make a distinction between
2324 : * terminating NUL bytes, and NUL bytes representing actual NULs in the
2325 : * authoritative representation. Hopefully a comparison at or past one
2326 : * abbreviated key's terminating NUL byte will resolve the comparison
2327 : * without consulting the authoritative representation; specifically, some
2328 : * later non-NUL byte in the longer string can resolve the comparison
2329 : * against a subsequent terminating NUL in the shorter string. There will
2330 : * usually be what is effectively a "length-wise" resolution there and
2331 : * then.
2332 : *
2333 : * If that doesn't work out -- if all bytes in the longer string
2334 : * positioned at or past the offset of the smaller string's (first)
2335 : * terminating NUL are actually representative of NUL bytes in the
2336 : * authoritative binary string (perhaps with some *terminating* NUL bytes
2337 : * towards the end of the longer string iff it happens to still be small)
2338 : * -- then an authoritative tie-breaker will happen, and do the right
2339 : * thing: explicitly consider string length.
2340 : */
2341 829642 : if (sss->collate_c)
2342 827818 : memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
2343 : else
2344 : {
2345 : Size bsize;
2346 :
2347 : /*
2348 : * We're not using the C collation, so fall back on strxfrm or ICU
2349 : * analogs.
2350 : */
2351 :
2352 : /* By convention, we use buffer 1 to store and NUL-terminate */
2353 1824 : if (len >= sss->buflen1)
2354 : {
2355 0 : sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2356 0 : sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2357 : }
2358 :
2359 : /* Might be able to reuse strxfrm() blob from last call */
2360 1824 : if (sss->last_len1 == len && sss->cache_blob &&
2361 912 : memcmp(sss->buf1, authoritative_data, len) == 0)
2362 : {
2363 168 : memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
2364 : /* No change affecting cardinality, so no hashing required */
2365 168 : goto done;
2366 : }
2367 :
2368 1656 : memcpy(sss->buf1, authoritative_data, len);
2369 :
2370 : /*
2371 : * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
2372 : */
2373 1656 : sss->buf1[len] = '\0';
2374 1656 : sss->last_len1 = len;
2375 :
2376 1656 : if (pg_strxfrm_prefix_enabled(sss->locale))
2377 : {
2378 1656 : if (sss->buflen2 < max_prefix_bytes)
2379 : {
2380 0 : sss->buflen2 = Max(max_prefix_bytes,
2381 : Min(sss->buflen2 * 2, MaxAllocSize));
2382 0 : sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2383 : }
2384 :
2385 1656 : bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
2386 : max_prefix_bytes, sss->locale);
2387 1656 : sss->last_len2 = bsize;
2388 : }
2389 : else
2390 : {
2391 : /*
2392 : * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
2393 : * again. The pg_strxfrm() function leaves the result buffer
2394 : * content undefined if the result did not fit, so we need to
2395 : * retry until everything fits, even though we only need the first
2396 : * few bytes in the end.
2397 : */
2398 : for (;;)
2399 : {
2400 0 : bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
2401 : sss->locale);
2402 :
2403 0 : sss->last_len2 = bsize;
2404 0 : if (bsize < sss->buflen2)
2405 0 : break;
2406 :
2407 : /*
2408 : * Grow buffer and retry.
2409 : */
2410 0 : sss->buflen2 = Max(bsize + 1,
2411 : Min(sss->buflen2 * 2, MaxAllocSize));
2412 0 : sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2413 : }
2414 : }
2415 :
2416 : /*
2417 : * Every Datum byte is always compared. This is safe because the
2418 : * strxfrm() blob is itself NUL terminated, leaving no danger of
2419 : * misinterpreting any NUL bytes not intended to be interpreted as
2420 : * logically representing termination.
2421 : *
2422 : * (Actually, even if there were NUL bytes in the blob it would be
2423 : * okay. See remarks on bytea case above.)
2424 : */
2425 1656 : memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
2426 : }
2427 :
2428 : /*
2429 : * Maintain approximate cardinality of both abbreviated keys and original,
2430 : * authoritative keys using HyperLogLog. Used as cheap insurance against
2431 : * the worst case, where we do many string transformations for no saving
2432 : * in full strcoll()-based comparisons. These statistics are used by
2433 : * varstr_abbrev_abort().
2434 : *
2435 : * First, Hash key proper, or a significant fraction of it. Mix in length
2436 : * in order to compensate for cases where differences are past
2437 : * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2438 : */
2439 829474 : hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2440 : Min(len, PG_CACHE_LINE_SIZE)));
2441 :
2442 829474 : if (len > PG_CACHE_LINE_SIZE)
2443 190 : hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2444 :
2445 829474 : addHyperLogLog(&sss->full_card, hash);
2446 :
2447 : /* Hash abbreviated key */
2448 : #if SIZEOF_DATUM == 8
2449 : {
2450 : uint32 lohalf,
2451 : hihalf;
2452 :
2453 829474 : lohalf = (uint32) res;
2454 829474 : hihalf = (uint32) (res >> 32);
2455 829474 : hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2456 : }
2457 : #else /* SIZEOF_DATUM != 8 */
2458 : hash = DatumGetUInt32(hash_uint32((uint32) res));
2459 : #endif
2460 :
2461 829474 : addHyperLogLog(&sss->abbr_card, hash);
2462 :
2463 : /* Cache result, perhaps saving an expensive strxfrm() call next time */
2464 829474 : sss->cache_blob = true;
2465 829642 : done:
2466 :
2467 : /*
2468 : * Byteswap on little-endian machines.
2469 : *
2470 : * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2471 : * 3-way comparator) works correctly on all platforms. If we didn't do
2472 : * this, the comparator would have to call memcmp() with a pair of
2473 : * pointers to the first byte of each abbreviated key, which is slower.
2474 : */
2475 829642 : res = DatumBigEndianToNative(res);
2476 :
2477 : /* Don't leak memory here */
2478 829642 : if (PointerGetDatum(authoritative) != original)
2479 4 : pfree(authoritative);
2480 :
2481 829642 : return res;
2482 : }
2483 :
2484 : /*
2485 : * Callback for estimating effectiveness of abbreviated key optimization, using
2486 : * heuristic rules. Returns value indicating if the abbreviation optimization
2487 : * should be aborted, based on its projected effectiveness.
2488 : */
2489 : static bool
2490 2264 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2491 : {
2492 2264 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2493 : double abbrev_distinct,
2494 : key_distinct;
2495 :
2496 : Assert(ssup->abbreviate);
2497 :
2498 : /* Have a little patience */
2499 2264 : if (memtupcount < 100)
2500 1284 : return false;
2501 :
2502 980 : abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2503 980 : key_distinct = estimateHyperLogLog(&sss->full_card);
2504 :
2505 : /*
2506 : * Clamp cardinality estimates to at least one distinct value. While
2507 : * NULLs are generally disregarded, if only NULL values were seen so far,
2508 : * that might misrepresent costs if we failed to clamp.
2509 : */
2510 980 : if (abbrev_distinct <= 1.0)
2511 0 : abbrev_distinct = 1.0;
2512 :
2513 980 : if (key_distinct <= 1.0)
2514 0 : key_distinct = 1.0;
2515 :
2516 : /*
2517 : * In the worst case all abbreviated keys are identical, while at the same
2518 : * time there are differences within full key strings not captured in
2519 : * abbreviations.
2520 : */
2521 980 : if (trace_sort)
2522 : {
2523 0 : double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2524 :
2525 0 : elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2526 : "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2527 : memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2528 : sss->prop_card);
2529 : }
2530 :
2531 : /*
2532 : * If the number of distinct abbreviated keys approximately matches the
2533 : * number of distinct authoritative original keys, that's reason enough to
2534 : * proceed. We can win even with a very low cardinality set if most
2535 : * tie-breakers only memcmp(). This is by far the most important
2536 : * consideration.
2537 : *
2538 : * While comparisons that are resolved at the abbreviated key level are
2539 : * considerably cheaper than tie-breakers resolved with memcmp(), both of
2540 : * those two outcomes are so much cheaper than a full strcoll() once
2541 : * sorting is underway that it doesn't seem worth it to weigh abbreviated
2542 : * cardinality against the overall size of the set in order to more
2543 : * accurately model costs. Assume that an abbreviated comparison, and an
2544 : * abbreviated comparison with a cheap memcmp()-based authoritative
2545 : * resolution are equivalent.
2546 : */
2547 980 : if (abbrev_distinct > key_distinct * sss->prop_card)
2548 : {
2549 : /*
2550 : * When we have exceeded 10,000 tuples, decay required cardinality
2551 : * aggressively for next call.
2552 : *
2553 : * This is useful because the number of comparisons required on
2554 : * average increases at a linearithmic rate, and at roughly 10,000
2555 : * tuples that factor will start to dominate over the linear costs of
2556 : * string transformation (this is a conservative estimate). The decay
2557 : * rate is chosen to be a little less aggressive than halving -- which
2558 : * (since we're called at points at which memtupcount has doubled)
2559 : * would never see the cost model actually abort past the first call
2560 : * following a decay. This decay rate is mostly a precaution against
2561 : * a sudden, violent swing in how well abbreviated cardinality tracks
2562 : * full key cardinality. The decay also serves to prevent a marginal
2563 : * case from being aborted too late, when too much has already been
2564 : * invested in string transformation.
2565 : *
2566 : * It's possible for sets of several million distinct strings with
2567 : * mere tens of thousands of distinct abbreviated keys to still
2568 : * benefit very significantly. This will generally occur provided
2569 : * each abbreviated key is a proxy for a roughly uniform number of the
2570 : * set's full keys. If it isn't so, we hope to catch that early and
2571 : * abort. If it isn't caught early, by the time the problem is
2572 : * apparent it's probably not worth aborting.
2573 : */
2574 980 : if (memtupcount > 10000)
2575 4 : sss->prop_card *= 0.65;
2576 :
2577 980 : return false;
2578 : }
2579 :
2580 : /*
2581 : * Abort abbreviation strategy.
2582 : *
2583 : * The worst case, where all abbreviated keys are identical while all
2584 : * original strings differ will typically only see a regression of about
2585 : * 10% in execution time for small to medium sized lists of strings.
2586 : * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2587 : * often expect very large improvements, particularly with sets of strings
2588 : * of moderately high to high abbreviated cardinality. There is little to
2589 : * lose but much to gain, which our strategy reflects.
2590 : */
2591 0 : if (trace_sort)
2592 0 : elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2593 : "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2594 : memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2595 :
2596 0 : return true;
2597 : }
2598 :
2599 : /*
2600 : * Generic equalimage support function for character type's operator classes.
2601 : * Disables the use of deduplication with nondeterministic collations.
2602 : */
2603 : Datum
2604 8464 : btvarstrequalimage(PG_FUNCTION_ARGS)
2605 : {
2606 : /* Oid opcintype = PG_GETARG_OID(0); */
2607 8464 : Oid collid = PG_GET_COLLATION();
2608 : pg_locale_t locale;
2609 :
2610 8464 : check_collation_set(collid);
2611 :
2612 8464 : locale = pg_newlocale_from_collation(collid);
2613 :
2614 8464 : PG_RETURN_BOOL(locale->deterministic);
2615 : }
2616 :
2617 : Datum
2618 229560 : text_larger(PG_FUNCTION_ARGS)
2619 : {
2620 229560 : text *arg1 = PG_GETARG_TEXT_PP(0);
2621 229560 : text *arg2 = PG_GETARG_TEXT_PP(1);
2622 : text *result;
2623 :
2624 229560 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2625 :
2626 229560 : PG_RETURN_TEXT_P(result);
2627 : }
2628 :
2629 : Datum
2630 86076 : text_smaller(PG_FUNCTION_ARGS)
2631 : {
2632 86076 : text *arg1 = PG_GETARG_TEXT_PP(0);
2633 86076 : text *arg2 = PG_GETARG_TEXT_PP(1);
2634 : text *result;
2635 :
2636 86076 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2637 :
2638 86076 : PG_RETURN_TEXT_P(result);
2639 : }
2640 :
2641 :
2642 : /*
2643 : * Cross-type comparison functions for types text and name.
2644 : */
2645 :
2646 : Datum
2647 188878 : nameeqtext(PG_FUNCTION_ARGS)
2648 : {
2649 188878 : Name arg1 = PG_GETARG_NAME(0);
2650 188878 : text *arg2 = PG_GETARG_TEXT_PP(1);
2651 188878 : size_t len1 = strlen(NameStr(*arg1));
2652 188878 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2653 188878 : Oid collid = PG_GET_COLLATION();
2654 : bool result;
2655 :
2656 188878 : check_collation_set(collid);
2657 :
2658 188878 : if (collid == C_COLLATION_OID)
2659 254748 : result = (len1 == len2 &&
2660 123254 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2661 : else
2662 57384 : result = (varstr_cmp(NameStr(*arg1), len1,
2663 57384 : VARDATA_ANY(arg2), len2,
2664 : collid) == 0);
2665 :
2666 188878 : PG_FREE_IF_COPY(arg2, 1);
2667 :
2668 188878 : PG_RETURN_BOOL(result);
2669 : }
2670 :
2671 : Datum
2672 7808 : texteqname(PG_FUNCTION_ARGS)
2673 : {
2674 7808 : text *arg1 = PG_GETARG_TEXT_PP(0);
2675 7808 : Name arg2 = PG_GETARG_NAME(1);
2676 7808 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2677 7808 : size_t len2 = strlen(NameStr(*arg2));
2678 7808 : Oid collid = PG_GET_COLLATION();
2679 : bool result;
2680 :
2681 7808 : check_collation_set(collid);
2682 :
2683 7808 : if (collid == C_COLLATION_OID)
2684 568 : result = (len1 == len2 &&
2685 182 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2686 : else
2687 7422 : result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2688 7422 : NameStr(*arg2), len2,
2689 : collid) == 0);
2690 :
2691 7808 : PG_FREE_IF_COPY(arg1, 0);
2692 :
2693 7808 : PG_RETURN_BOOL(result);
2694 : }
2695 :
2696 : Datum
2697 18 : namenetext(PG_FUNCTION_ARGS)
2698 : {
2699 18 : Name arg1 = PG_GETARG_NAME(0);
2700 18 : text *arg2 = PG_GETARG_TEXT_PP(1);
2701 18 : size_t len1 = strlen(NameStr(*arg1));
2702 18 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2703 18 : Oid collid = PG_GET_COLLATION();
2704 : bool result;
2705 :
2706 18 : check_collation_set(collid);
2707 :
2708 18 : if (collid == C_COLLATION_OID)
2709 0 : result = !(len1 == len2 &&
2710 0 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2711 : else
2712 18 : result = !(varstr_cmp(NameStr(*arg1), len1,
2713 18 : VARDATA_ANY(arg2), len2,
2714 : collid) == 0);
2715 :
2716 18 : PG_FREE_IF_COPY(arg2, 1);
2717 :
2718 18 : PG_RETURN_BOOL(result);
2719 : }
2720 :
2721 : Datum
2722 18 : textnename(PG_FUNCTION_ARGS)
2723 : {
2724 18 : text *arg1 = PG_GETARG_TEXT_PP(0);
2725 18 : Name arg2 = PG_GETARG_NAME(1);
2726 18 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2727 18 : size_t len2 = strlen(NameStr(*arg2));
2728 18 : Oid collid = PG_GET_COLLATION();
2729 : bool result;
2730 :
2731 18 : check_collation_set(collid);
2732 :
2733 18 : if (collid == C_COLLATION_OID)
2734 0 : result = !(len1 == len2 &&
2735 0 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2736 : else
2737 18 : result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2738 18 : NameStr(*arg2), len2,
2739 : collid) == 0);
2740 :
2741 18 : PG_FREE_IF_COPY(arg1, 0);
2742 :
2743 18 : PG_RETURN_BOOL(result);
2744 : }
2745 :
2746 : Datum
2747 108296 : btnametextcmp(PG_FUNCTION_ARGS)
2748 : {
2749 108296 : Name arg1 = PG_GETARG_NAME(0);
2750 108296 : text *arg2 = PG_GETARG_TEXT_PP(1);
2751 : int32 result;
2752 :
2753 216592 : result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2754 216592 : VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2755 : PG_GET_COLLATION());
2756 :
2757 108296 : PG_FREE_IF_COPY(arg2, 1);
2758 :
2759 108296 : PG_RETURN_INT32(result);
2760 : }
2761 :
2762 : Datum
2763 0 : bttextnamecmp(PG_FUNCTION_ARGS)
2764 : {
2765 0 : text *arg1 = PG_GETARG_TEXT_PP(0);
2766 0 : Name arg2 = PG_GETARG_NAME(1);
2767 : int32 result;
2768 :
2769 0 : result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2770 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2771 : PG_GET_COLLATION());
2772 :
2773 0 : PG_FREE_IF_COPY(arg1, 0);
2774 :
2775 0 : PG_RETURN_INT32(result);
2776 : }
2777 :
2778 : #define CmpCall(cmpfunc) \
2779 : DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2780 : PG_GET_COLLATION(), \
2781 : PG_GETARG_DATUM(0), \
2782 : PG_GETARG_DATUM(1)))
2783 :
2784 : Datum
2785 56482 : namelttext(PG_FUNCTION_ARGS)
2786 : {
2787 56482 : PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2788 : }
2789 :
2790 : Datum
2791 0 : nameletext(PG_FUNCTION_ARGS)
2792 : {
2793 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
2794 : }
2795 :
2796 : Datum
2797 0 : namegttext(PG_FUNCTION_ARGS)
2798 : {
2799 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
2800 : }
2801 :
2802 : Datum
2803 39330 : namegetext(PG_FUNCTION_ARGS)
2804 : {
2805 39330 : PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
2806 : }
2807 :
2808 : Datum
2809 0 : textltname(PG_FUNCTION_ARGS)
2810 : {
2811 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
2812 : }
2813 :
2814 : Datum
2815 0 : textlename(PG_FUNCTION_ARGS)
2816 : {
2817 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
2818 : }
2819 :
2820 : Datum
2821 0 : textgtname(PG_FUNCTION_ARGS)
2822 : {
2823 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
2824 : }
2825 :
2826 : Datum
2827 0 : textgename(PG_FUNCTION_ARGS)
2828 : {
2829 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
2830 : }
2831 :
2832 : #undef CmpCall
2833 :
2834 :
2835 : /*
2836 : * The following operators support character-by-character comparison
2837 : * of text datums, to allow building indexes suitable for LIKE clauses.
2838 : * Note that the regular texteq/textne comparison operators, and regular
2839 : * support functions 1 and 2 with "C" collation are assumed to be
2840 : * compatible with these!
2841 : */
2842 :
2843 : static int
2844 152158 : internal_text_pattern_compare(text *arg1, text *arg2)
2845 : {
2846 : int result;
2847 : int len1,
2848 : len2;
2849 :
2850 152158 : len1 = VARSIZE_ANY_EXHDR(arg1);
2851 152158 : len2 = VARSIZE_ANY_EXHDR(arg2);
2852 :
2853 152158 : result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2854 152158 : if (result != 0)
2855 152092 : return result;
2856 66 : else if (len1 < len2)
2857 0 : return -1;
2858 66 : else if (len1 > len2)
2859 18 : return 1;
2860 : else
2861 48 : return 0;
2862 : }
2863 :
2864 :
2865 : Datum
2866 39580 : text_pattern_lt(PG_FUNCTION_ARGS)
2867 : {
2868 39580 : text *arg1 = PG_GETARG_TEXT_PP(0);
2869 39580 : text *arg2 = PG_GETARG_TEXT_PP(1);
2870 : int result;
2871 :
2872 39580 : result = internal_text_pattern_compare(arg1, arg2);
2873 :
2874 39580 : PG_FREE_IF_COPY(arg1, 0);
2875 39580 : PG_FREE_IF_COPY(arg2, 1);
2876 :
2877 39580 : PG_RETURN_BOOL(result < 0);
2878 : }
2879 :
2880 :
2881 : Datum
2882 37510 : text_pattern_le(PG_FUNCTION_ARGS)
2883 : {
2884 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2885 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2886 : int result;
2887 :
2888 37510 : result = internal_text_pattern_compare(arg1, arg2);
2889 :
2890 37510 : PG_FREE_IF_COPY(arg1, 0);
2891 37510 : PG_FREE_IF_COPY(arg2, 1);
2892 :
2893 37510 : PG_RETURN_BOOL(result <= 0);
2894 : }
2895 :
2896 :
2897 : Datum
2898 37534 : text_pattern_ge(PG_FUNCTION_ARGS)
2899 : {
2900 37534 : text *arg1 = PG_GETARG_TEXT_PP(0);
2901 37534 : text *arg2 = PG_GETARG_TEXT_PP(1);
2902 : int result;
2903 :
2904 37534 : result = internal_text_pattern_compare(arg1, arg2);
2905 :
2906 37534 : PG_FREE_IF_COPY(arg1, 0);
2907 37534 : PG_FREE_IF_COPY(arg2, 1);
2908 :
2909 37534 : PG_RETURN_BOOL(result >= 0);
2910 : }
2911 :
2912 :
2913 : Datum
2914 37510 : text_pattern_gt(PG_FUNCTION_ARGS)
2915 : {
2916 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2917 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2918 : int result;
2919 :
2920 37510 : result = internal_text_pattern_compare(arg1, arg2);
2921 :
2922 37510 : PG_FREE_IF_COPY(arg1, 0);
2923 37510 : PG_FREE_IF_COPY(arg2, 1);
2924 :
2925 37510 : PG_RETURN_BOOL(result > 0);
2926 : }
2927 :
2928 :
2929 : Datum
2930 24 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
2931 : {
2932 24 : text *arg1 = PG_GETARG_TEXT_PP(0);
2933 24 : text *arg2 = PG_GETARG_TEXT_PP(1);
2934 : int result;
2935 :
2936 24 : result = internal_text_pattern_compare(arg1, arg2);
2937 :
2938 24 : PG_FREE_IF_COPY(arg1, 0);
2939 24 : PG_FREE_IF_COPY(arg2, 1);
2940 :
2941 24 : PG_RETURN_INT32(result);
2942 : }
2943 :
2944 :
2945 : Datum
2946 116 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2947 : {
2948 116 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2949 : MemoryContext oldcontext;
2950 :
2951 116 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2952 :
2953 : /* Use generic string SortSupport, forcing "C" collation */
2954 116 : varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
2955 :
2956 116 : MemoryContextSwitchTo(oldcontext);
2957 :
2958 116 : PG_RETURN_VOID();
2959 : }
2960 :
2961 :
2962 : /*-------------------------------------------------------------
2963 : * byteaoctetlen
2964 : *
2965 : * get the number of bytes contained in an instance of type 'bytea'
2966 : *-------------------------------------------------------------
2967 : */
2968 : Datum
2969 602 : byteaoctetlen(PG_FUNCTION_ARGS)
2970 : {
2971 602 : Datum str = PG_GETARG_DATUM(0);
2972 :
2973 : /* We need not detoast the input at all */
2974 602 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2975 : }
2976 :
2977 : /*
2978 : * byteacat -
2979 : * takes two bytea* and returns a bytea* that is the concatenation of
2980 : * the two.
2981 : *
2982 : * Cloned from textcat and modified as required.
2983 : */
2984 : Datum
2985 1522 : byteacat(PG_FUNCTION_ARGS)
2986 : {
2987 1522 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
2988 1522 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
2989 :
2990 1522 : PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2991 : }
2992 :
2993 : /*
2994 : * bytea_catenate
2995 : * Guts of byteacat(), broken out so it can be used by other functions
2996 : *
2997 : * Arguments can be in short-header form, but not compressed or out-of-line
2998 : */
2999 : static bytea *
3000 1558 : bytea_catenate(bytea *t1, bytea *t2)
3001 : {
3002 : bytea *result;
3003 : int len1,
3004 : len2,
3005 : len;
3006 : char *ptr;
3007 :
3008 1558 : len1 = VARSIZE_ANY_EXHDR(t1);
3009 1558 : len2 = VARSIZE_ANY_EXHDR(t2);
3010 :
3011 : /* paranoia ... probably should throw error instead? */
3012 1558 : if (len1 < 0)
3013 0 : len1 = 0;
3014 1558 : if (len2 < 0)
3015 0 : len2 = 0;
3016 :
3017 1558 : len = len1 + len2 + VARHDRSZ;
3018 1558 : result = (bytea *) palloc(len);
3019 :
3020 : /* Set size of result string... */
3021 1558 : SET_VARSIZE(result, len);
3022 :
3023 : /* Fill data field of result string... */
3024 1558 : ptr = VARDATA(result);
3025 1558 : if (len1 > 0)
3026 1558 : memcpy(ptr, VARDATA_ANY(t1), len1);
3027 1558 : if (len2 > 0)
3028 1540 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3029 :
3030 1558 : return result;
3031 : }
3032 :
/* Build a bytea from a C string by passing it through byteain(). */
#define PG_STR_GET_BYTEA(cstr) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(cstr)))
3035 :
3036 : /*
3037 : * bytea_substr()
3038 : * Return a substring starting at the specified position.
3039 : * Cloned from text_substr and modified as required.
3040 : *
3041 : * Input:
3042 : * - string
3043 : * - starting position (is one-based)
3044 : * - string length (optional)
3045 : *
3046 : * If the starting position is zero or less, then return from the start of the string
3047 : * adjusting the length to be consistent with the "negative start" per SQL.
3048 : * If the length is less than zero, an ERROR is thrown. If no third argument
3049 : * (length) is provided, the length to the end of the string is assumed.
3050 : */
3051 : Datum
3052 86 : bytea_substr(PG_FUNCTION_ARGS)
3053 : {
3054 86 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3055 : PG_GETARG_INT32(1),
3056 : PG_GETARG_INT32(2),
3057 : false));
3058 : }
3059 :
3060 : /*
3061 : * bytea_substr_no_len -
3062 : * Wrapper to avoid opr_sanity failure due to
3063 : * one function accepting a different number of args.
3064 : */
3065 : Datum
3066 3900 : bytea_substr_no_len(PG_FUNCTION_ARGS)
3067 : {
3068 3900 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3069 : PG_GETARG_INT32(1),
3070 : -1,
3071 : true));
3072 : }
3073 :
3074 : static bytea *
3075 4022 : bytea_substring(Datum str,
3076 : int S,
3077 : int L,
3078 : bool length_not_specified)
3079 : {
3080 : int32 S1; /* adjusted start position */
3081 : int32 L1; /* adjusted substring length */
3082 : int32 E; /* end position */
3083 :
3084 : /*
3085 : * The logic here should generally match text_substring().
3086 : */
3087 4022 : S1 = Max(S, 1);
3088 :
3089 4022 : if (length_not_specified)
3090 : {
3091 : /*
3092 : * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3093 : * end of the string if we pass it a negative value for length.
3094 : */
3095 3918 : L1 = -1;
3096 : }
3097 104 : else if (L < 0)
3098 : {
3099 : /* SQL99 says to throw an error for E < S, i.e., negative length */
3100 12 : ereport(ERROR,
3101 : (errcode(ERRCODE_SUBSTRING_ERROR),
3102 : errmsg("negative substring length not allowed")));
3103 : L1 = -1; /* silence stupider compilers */
3104 : }
3105 92 : else if (pg_add_s32_overflow(S, L, &E))
3106 : {
3107 : /*
3108 : * L could be large enough for S + L to overflow, in which case the
3109 : * substring must run to end of string.
3110 : */
3111 6 : L1 = -1;
3112 : }
3113 : else
3114 : {
3115 : /*
3116 : * A zero or negative value for the end position can happen if the
3117 : * start was negative or one. SQL99 says to return a zero-length
3118 : * string.
3119 : */
3120 86 : if (E < 1)
3121 0 : return PG_STR_GET_BYTEA("");
3122 :
3123 86 : L1 = E - S1;
3124 : }
3125 :
3126 : /*
3127 : * If the start position is past the end of the string, SQL99 says to
3128 : * return a zero-length string -- DatumGetByteaPSlice() will do that for
3129 : * us. We need only convert S1 to zero-based starting position.
3130 : */
3131 4010 : return DatumGetByteaPSlice(str, S1 - 1, L1);
3132 : }
3133 :
3134 : /*
3135 : * byteaoverlay
3136 : * Replace specified substring of first string with second
3137 : *
3138 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3139 : * This code is a direct implementation of what the standard says.
3140 : */
3141 : Datum
3142 6 : byteaoverlay(PG_FUNCTION_ARGS)
3143 : {
3144 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3145 6 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3146 6 : int sp = PG_GETARG_INT32(2); /* substring start position */
3147 6 : int sl = PG_GETARG_INT32(3); /* substring length */
3148 :
3149 6 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3150 : }
3151 :
3152 : Datum
3153 12 : byteaoverlay_no_len(PG_FUNCTION_ARGS)
3154 : {
3155 12 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3156 12 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3157 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
3158 : int sl;
3159 :
3160 12 : sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3161 12 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3162 : }
3163 :
3164 : static bytea *
3165 18 : bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3166 : {
3167 : bytea *result;
3168 : bytea *s1;
3169 : bytea *s2;
3170 : int sp_pl_sl;
3171 :
3172 : /*
3173 : * Check for possible integer-overflow cases. For negative sp, throw a
3174 : * "substring length" error because that's what should be expected
3175 : * according to the spec's definition of OVERLAY().
3176 : */
3177 18 : if (sp <= 0)
3178 0 : ereport(ERROR,
3179 : (errcode(ERRCODE_SUBSTRING_ERROR),
3180 : errmsg("negative substring length not allowed")));
3181 18 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3182 0 : ereport(ERROR,
3183 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3184 : errmsg("integer out of range")));
3185 :
3186 18 : s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3187 18 : s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3188 18 : result = bytea_catenate(s1, t2);
3189 18 : result = bytea_catenate(result, s2);
3190 :
3191 18 : return result;
3192 : }
3193 :
3194 : /*
3195 : * bit_count
3196 : */
3197 : Datum
3198 6 : bytea_bit_count(PG_FUNCTION_ARGS)
3199 : {
3200 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3201 :
3202 6 : PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3203 : }
3204 :
3205 : /*
3206 : * byteapos -
3207 : * Return the position of the specified substring.
3208 : * Implements the SQL POSITION() function.
3209 : * Cloned from textpos and modified as required.
3210 : */
3211 : Datum
3212 30 : byteapos(PG_FUNCTION_ARGS)
3213 : {
3214 30 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3215 30 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3216 : int pos;
3217 : int px,
3218 : p;
3219 : int len1,
3220 : len2;
3221 : char *p1,
3222 : *p2;
3223 :
3224 30 : len1 = VARSIZE_ANY_EXHDR(t1);
3225 30 : len2 = VARSIZE_ANY_EXHDR(t2);
3226 :
3227 30 : if (len2 <= 0)
3228 6 : PG_RETURN_INT32(1); /* result for empty pattern */
3229 :
3230 24 : p1 = VARDATA_ANY(t1);
3231 24 : p2 = VARDATA_ANY(t2);
3232 :
3233 24 : pos = 0;
3234 24 : px = (len1 - len2);
3235 54 : for (p = 0; p <= px; p++)
3236 : {
3237 42 : if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3238 : {
3239 12 : pos = p + 1;
3240 12 : break;
3241 : };
3242 30 : p1++;
3243 : };
3244 :
3245 24 : PG_RETURN_INT32(pos);
3246 : }
3247 :
3248 : /*-------------------------------------------------------------
3249 : * byteaGetByte
3250 : *
3251 : * this routine treats "bytea" as an array of bytes.
3252 : * It returns the Nth byte (a number between 0 and 255).
3253 : *-------------------------------------------------------------
3254 : */
3255 : Datum
3256 60 : byteaGetByte(PG_FUNCTION_ARGS)
3257 : {
3258 60 : bytea *v = PG_GETARG_BYTEA_PP(0);
3259 60 : int32 n = PG_GETARG_INT32(1);
3260 : int len;
3261 : int byte;
3262 :
3263 60 : len = VARSIZE_ANY_EXHDR(v);
3264 :
3265 60 : if (n < 0 || n >= len)
3266 6 : ereport(ERROR,
3267 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3268 : errmsg("index %d out of valid range, 0..%d",
3269 : n, len - 1)));
3270 :
3271 54 : byte = ((unsigned char *) VARDATA_ANY(v))[n];
3272 :
3273 54 : PG_RETURN_INT32(byte);
3274 : }
3275 :
3276 : /*-------------------------------------------------------------
3277 : * byteaGetBit
3278 : *
3279 : * This routine treats a "bytea" type like an array of bits.
3280 : * It returns the value of the Nth bit (0 or 1).
3281 : *
3282 : *-------------------------------------------------------------
3283 : */
3284 : Datum
3285 12 : byteaGetBit(PG_FUNCTION_ARGS)
3286 : {
3287 12 : bytea *v = PG_GETARG_BYTEA_PP(0);
3288 12 : int64 n = PG_GETARG_INT64(1);
3289 : int byteNo,
3290 : bitNo;
3291 : int len;
3292 : int byte;
3293 :
3294 12 : len = VARSIZE_ANY_EXHDR(v);
3295 :
3296 12 : if (n < 0 || n >= (int64) len * 8)
3297 6 : ereport(ERROR,
3298 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3299 : errmsg("index %" PRId64 " out of valid range, 0..%" PRId64,
3300 : n, (int64) len * 8 - 1)));
3301 :
3302 : /* n/8 is now known < len, so safe to cast to int */
3303 6 : byteNo = (int) (n / 8);
3304 6 : bitNo = (int) (n % 8);
3305 :
3306 6 : byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3307 :
3308 6 : if (byte & (1 << bitNo))
3309 6 : PG_RETURN_INT32(1);
3310 : else
3311 0 : PG_RETURN_INT32(0);
3312 : }
3313 :
3314 : /*-------------------------------------------------------------
3315 : * byteaSetByte
3316 : *
3317 : * Given an instance of type 'bytea' creates a new one with
3318 : * the Nth byte set to the given value.
3319 : *
3320 : *-------------------------------------------------------------
3321 : */
3322 : Datum
3323 12 : byteaSetByte(PG_FUNCTION_ARGS)
3324 : {
3325 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3326 12 : int32 n = PG_GETARG_INT32(1);
3327 12 : int32 newByte = PG_GETARG_INT32(2);
3328 : int len;
3329 :
3330 12 : len = VARSIZE(res) - VARHDRSZ;
3331 :
3332 12 : if (n < 0 || n >= len)
3333 6 : ereport(ERROR,
3334 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3335 : errmsg("index %d out of valid range, 0..%d",
3336 : n, len - 1)));
3337 :
3338 : /*
3339 : * Now set the byte.
3340 : */
3341 6 : ((unsigned char *) VARDATA(res))[n] = newByte;
3342 :
3343 6 : PG_RETURN_BYTEA_P(res);
3344 : }
3345 :
3346 : /*-------------------------------------------------------------
3347 : * byteaSetBit
3348 : *
3349 : * Given an instance of type 'bytea' creates a new one with
3350 : * the Nth bit set to the given value.
3351 : *
3352 : *-------------------------------------------------------------
3353 : */
3354 : Datum
3355 12 : byteaSetBit(PG_FUNCTION_ARGS)
3356 : {
3357 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3358 12 : int64 n = PG_GETARG_INT64(1);
3359 12 : int32 newBit = PG_GETARG_INT32(2);
3360 : int len;
3361 : int oldByte,
3362 : newByte;
3363 : int byteNo,
3364 : bitNo;
3365 :
3366 12 : len = VARSIZE(res) - VARHDRSZ;
3367 :
3368 12 : if (n < 0 || n >= (int64) len * 8)
3369 6 : ereport(ERROR,
3370 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3371 : errmsg("index %" PRId64 " out of valid range, 0..%" PRId64,
3372 : n, (int64) len * 8 - 1)));
3373 :
3374 : /* n/8 is now known < len, so safe to cast to int */
3375 6 : byteNo = (int) (n / 8);
3376 6 : bitNo = (int) (n % 8);
3377 :
3378 : /*
3379 : * sanity check!
3380 : */
3381 6 : if (newBit != 0 && newBit != 1)
3382 0 : ereport(ERROR,
3383 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3384 : errmsg("new bit must be 0 or 1")));
3385 :
3386 : /*
3387 : * Update the byte.
3388 : */
3389 6 : oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3390 :
3391 6 : if (newBit == 0)
3392 6 : newByte = oldByte & (~(1 << bitNo));
3393 : else
3394 0 : newByte = oldByte | (1 << bitNo);
3395 :
3396 6 : ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3397 :
3398 6 : PG_RETURN_BYTEA_P(res);
3399 : }
3400 :
3401 : /*
3402 : * Return reversed bytea
3403 : */
3404 : Datum
3405 18 : bytea_reverse(PG_FUNCTION_ARGS)
3406 : {
3407 18 : bytea *v = PG_GETARG_BYTEA_PP(0);
3408 18 : const char *p = VARDATA_ANY(v);
3409 18 : int len = VARSIZE_ANY_EXHDR(v);
3410 18 : const char *endp = p + len;
3411 18 : bytea *result = palloc(len + VARHDRSZ);
3412 18 : char *dst = (char *) VARDATA(result) + len;
3413 :
3414 18 : SET_VARSIZE(result, len + VARHDRSZ);
3415 :
3416 36 : while (p < endp)
3417 18 : *(--dst) = *p++;
3418 :
3419 18 : PG_RETURN_BYTEA_P(result);
3420 : }
3421 :
3422 :
3423 : /* text_name()
3424 : * Converts a text type to a Name type.
3425 : */
3426 : Datum
3427 30590 : text_name(PG_FUNCTION_ARGS)
3428 : {
3429 30590 : text *s = PG_GETARG_TEXT_PP(0);
3430 : Name result;
3431 : int len;
3432 :
3433 30590 : len = VARSIZE_ANY_EXHDR(s);
3434 :
3435 : /* Truncate oversize input */
3436 30590 : if (len >= NAMEDATALEN)
3437 6 : len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3438 :
3439 : /* We use palloc0 here to ensure result is zero-padded */
3440 30590 : result = (Name) palloc0(NAMEDATALEN);
3441 30590 : memcpy(NameStr(*result), VARDATA_ANY(s), len);
3442 :
3443 30590 : PG_RETURN_NAME(result);
3444 : }
3445 :
3446 : /* name_text()
3447 : * Converts a Name type to a text type.
3448 : */
3449 : Datum
3450 647668 : name_text(PG_FUNCTION_ARGS)
3451 : {
3452 647668 : Name s = PG_GETARG_NAME(0);
3453 :
3454 647668 : PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3455 : }
3456 :
3457 :
3458 : /*
3459 : * textToQualifiedNameList - convert a text object to list of names
3460 : *
3461 : * This implements the input parsing needed by nextval() and other
3462 : * functions that take a text parameter representing a qualified name.
3463 : * We split the name at dots, downcase if not double-quoted, and
3464 : * truncate names if they're too long.
3465 : */
3466 : List *
3467 5408 : textToQualifiedNameList(text *textval)
3468 : {
3469 : char *rawname;
3470 5408 : List *result = NIL;
3471 : List *namelist;
3472 : ListCell *l;
3473 :
3474 : /* Convert to C string (handles possible detoasting). */
3475 : /* Note we rely on being able to modify rawname below. */
3476 5408 : rawname = text_to_cstring(textval);
3477 :
3478 5408 : if (!SplitIdentifierString(rawname, '.', &namelist))
3479 0 : ereport(ERROR,
3480 : (errcode(ERRCODE_INVALID_NAME),
3481 : errmsg("invalid name syntax")));
3482 :
3483 5408 : if (namelist == NIL)
3484 0 : ereport(ERROR,
3485 : (errcode(ERRCODE_INVALID_NAME),
3486 : errmsg("invalid name syntax")));
3487 :
3488 10926 : foreach(l, namelist)
3489 : {
3490 5518 : char *curname = (char *) lfirst(l);
3491 :
3492 5518 : result = lappend(result, makeString(pstrdup(curname)));
3493 : }
3494 :
3495 5408 : pfree(rawname);
3496 5408 : list_free(namelist);
3497 :
3498 5408 : return result;
3499 : }
3500 :
3501 : /*
3502 : * SplitIdentifierString --- parse a string containing identifiers
3503 : *
3504 : * This is the guts of textToQualifiedNameList, and is exported for use in
3505 : * other situations such as parsing GUC variables. In the GUC case, it's
3506 : * important to avoid memory leaks, so the API is designed to minimize the
3507 : * amount of stuff that needs to be allocated and freed.
3508 : *
3509 : * Inputs:
3510 : * rawstring: the input string; must be overwritable! On return, it's
3511 : * been modified to contain the separated identifiers.
3512 : * separator: the separator punctuation expected between identifiers
3513 : * (typically '.' or ','). Whitespace may also appear around
3514 : * identifiers.
3515 : * Outputs:
3516 : * namelist: filled with a palloc'd list of pointers to identifiers within
3517 : * rawstring. Caller should list_free() this even on error return.
3518 : *
3519 : * Returns true if okay, false if there is a syntax error in the string.
3520 : *
3521 : * Note that an empty string is considered okay here, though not in
3522 : * textToQualifiedNameList.
3523 : */
3524 : bool
3525 253140 : SplitIdentifierString(char *rawstring, char separator,
3526 : List **namelist)
3527 : {
3528 253140 : char *nextp = rawstring;
3529 253140 : bool done = false;
3530 :
3531 253140 : *namelist = NIL;
3532 :
3533 253146 : while (scanner_isspace(*nextp))
3534 6 : nextp++; /* skip leading whitespace */
3535 :
3536 253140 : if (*nextp == '\0')
3537 26898 : return true; /* allow empty string */
3538 :
3539 : /* At the top of the loop, we are at start of a new identifier. */
3540 : do
3541 : {
3542 : char *curname;
3543 : char *endp;
3544 :
3545 410712 : if (*nextp == '"')
3546 : {
3547 : /* Quoted name --- collapse quote-quote pairs, no downcasing */
3548 37896 : curname = nextp + 1;
3549 : for (;;)
3550 : {
3551 37900 : endp = strchr(nextp + 1, '"');
3552 37898 : if (endp == NULL)
3553 0 : return false; /* mismatched quotes */
3554 37898 : if (endp[1] != '"')
3555 37896 : break; /* found end of quoted name */
3556 : /* Collapse adjacent quotes into one quote, and look again */
3557 2 : memmove(endp, endp + 1, strlen(endp));
3558 2 : nextp = endp;
3559 : }
3560 : /* endp now points at the terminating quote */
3561 37896 : nextp = endp + 1;
3562 : }
3563 : else
3564 : {
3565 : /* Unquoted name --- extends to separator or whitespace */
3566 : char *downname;
3567 : int len;
3568 :
3569 372816 : curname = nextp;
3570 3352576 : while (*nextp && *nextp != separator &&
3571 2979762 : !scanner_isspace(*nextp))
3572 2979760 : nextp++;
3573 372816 : endp = nextp;
3574 372816 : if (curname == nextp)
3575 0 : return false; /* empty unquoted name not allowed */
3576 :
3577 : /*
3578 : * Downcase the identifier, using same code as main lexer does.
3579 : *
3580 : * XXX because we want to overwrite the input in-place, we cannot
3581 : * support a downcasing transformation that increases the string
3582 : * length. This is not a problem given the current implementation
3583 : * of downcase_truncate_identifier, but we'll probably have to do
3584 : * something about this someday.
3585 : */
3586 372816 : len = endp - curname;
3587 372816 : downname = downcase_truncate_identifier(curname, len, false);
3588 : Assert(strlen(downname) <= len);
3589 372816 : strncpy(curname, downname, len); /* strncpy is required here */
3590 372816 : pfree(downname);
3591 : }
3592 :
3593 410714 : while (scanner_isspace(*nextp))
3594 2 : nextp++; /* skip trailing whitespace */
3595 :
3596 410712 : if (*nextp == separator)
3597 : {
3598 184470 : nextp++;
3599 344876 : while (scanner_isspace(*nextp))
3600 160406 : nextp++; /* skip leading whitespace for next */
3601 : /* we expect another name, so done remains false */
3602 : }
3603 226242 : else if (*nextp == '\0')
3604 226240 : done = true;
3605 : else
3606 2 : return false; /* invalid syntax */
3607 :
3608 : /* Now safe to overwrite separator with a null */
3609 410710 : *endp = '\0';
3610 :
3611 : /* Truncate name if it's overlength */
3612 410710 : truncate_identifier(curname, strlen(curname), false);
3613 :
3614 : /*
3615 : * Finished isolating current name --- add it to list
3616 : */
3617 410710 : *namelist = lappend(*namelist, curname);
3618 :
3619 : /* Loop back if we didn't reach end of string */
3620 410710 : } while (!done);
3621 :
3622 226240 : return true;
3623 : }
3624 :
3625 :
3626 : /*
3627 : * SplitDirectoriesString --- parse a string containing file/directory names
3628 : *
3629 : * This works fine on file names too; the function name is historical.
3630 : *
3631 : * This is similar to SplitIdentifierString, except that the parsing
3632 : * rules are meant to handle pathnames instead of identifiers: there is
3633 : * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3634 : * and we apply canonicalize_path() to each extracted string. Because of the
3635 : * last, the returned strings are separately palloc'd rather than being
3636 : * pointers into rawstring --- but we still scribble on rawstring.
3637 : *
3638 : * Inputs:
3639 : * rawstring: the input string; must be modifiable!
3640 : * separator: the separator punctuation expected between directories
3641 : * (typically ',' or ';'). Whitespace may also appear around
3642 : * directories.
3643 : * Outputs:
3644 : * namelist: filled with a palloc'd list of directory names.
3645 : * Caller should list_free_deep() this even on error return.
3646 : *
3647 : * Returns true if okay, false if there is a syntax error in the string.
3648 : *
3649 : * Note that an empty string is considered okay here.
3650 : */
3651 : bool
3652 1718 : SplitDirectoriesString(char *rawstring, char separator,
3653 : List **namelist)
3654 : {
3655 1718 : char *nextp = rawstring;
3656 1718 : bool done = false;
3657 :
3658 1718 : *namelist = NIL;
3659 :
3660 1718 : while (scanner_isspace(*nextp))
3661 0 : nextp++; /* skip leading whitespace */
3662 :
3663 1718 : if (*nextp == '\0')
3664 2 : return true; /* allow empty string */
3665 :
3666 : /* At the top of the loop, we are at start of a new directory. */
3667 : do
3668 : {
3669 : char *curname;
3670 : char *endp;
3671 :
3672 1718 : if (*nextp == '"')
3673 : {
3674 : /* Quoted name --- collapse quote-quote pairs */
3675 0 : curname = nextp + 1;
3676 : for (;;)
3677 : {
3678 0 : endp = strchr(nextp + 1, '"');
3679 0 : if (endp == NULL)
3680 0 : return false; /* mismatched quotes */
3681 0 : if (endp[1] != '"')
3682 0 : break; /* found end of quoted name */
3683 : /* Collapse adjacent quotes into one quote, and look again */
3684 0 : memmove(endp, endp + 1, strlen(endp));
3685 0 : nextp = endp;
3686 : }
3687 : /* endp now points at the terminating quote */
3688 0 : nextp = endp + 1;
3689 : }
3690 : else
3691 : {
3692 : /* Unquoted name --- extends to separator or end of string */
3693 1718 : curname = endp = nextp;
3694 28736 : while (*nextp && *nextp != separator)
3695 : {
3696 : /* trailing whitespace should not be included in name */
3697 27018 : if (!scanner_isspace(*nextp))
3698 27018 : endp = nextp + 1;
3699 27018 : nextp++;
3700 : }
3701 1718 : if (curname == endp)
3702 0 : return false; /* empty unquoted name not allowed */
3703 : }
3704 :
3705 1718 : while (scanner_isspace(*nextp))
3706 0 : nextp++; /* skip trailing whitespace */
3707 :
3708 1718 : if (*nextp == separator)
3709 : {
3710 2 : nextp++;
3711 2 : while (scanner_isspace(*nextp))
3712 0 : nextp++; /* skip leading whitespace for next */
3713 : /* we expect another name, so done remains false */
3714 : }
3715 1716 : else if (*nextp == '\0')
3716 1716 : done = true;
3717 : else
3718 0 : return false; /* invalid syntax */
3719 :
3720 : /* Now safe to overwrite separator with a null */
3721 1718 : *endp = '\0';
3722 :
3723 : /* Truncate path if it's overlength */
3724 1718 : if (strlen(curname) >= MAXPGPATH)
3725 0 : curname[MAXPGPATH - 1] = '\0';
3726 :
3727 : /*
3728 : * Finished isolating current name --- add it to list
3729 : */
3730 1718 : curname = pstrdup(curname);
3731 1718 : canonicalize_path(curname);
3732 1718 : *namelist = lappend(*namelist, curname);
3733 :
3734 : /* Loop back if we didn't reach end of string */
3735 1718 : } while (!done);
3736 :
3737 1716 : return true;
3738 : }
3739 :
3740 :
3741 : /*
3742 : * SplitGUCList --- parse a string containing identifiers or file names
3743 : *
3744 : * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3745 : * presuming whether the elements will be taken as identifiers or file names.
3746 : * We assume the input has already been through flatten_set_variable_args(),
3747 : * so that we need never downcase (if appropriate, that was done already).
3748 : * Nor do we ever truncate, since we don't know the correct max length.
3749 : * We disallow embedded whitespace for simplicity (it shouldn't matter,
3750 : * because any embedded whitespace should have led to double-quoting).
3751 : * Otherwise the API is identical to SplitIdentifierString.
3752 : *
3753 : * XXX it's annoying to have so many copies of this string-splitting logic.
3754 : * However, it's not clear that having one function with a bunch of option
3755 : * flags would be much better.
3756 : *
3757 : * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3758 : * Be sure to update that if you have to change this.
3759 : *
3760 : * Inputs:
3761 : * rawstring: the input string; must be overwritable! On return, it's
3762 : * been modified to contain the separated identifiers.
3763 : * separator: the separator punctuation expected between identifiers
3764 : * (typically '.' or ','). Whitespace may also appear around
3765 : * identifiers.
3766 : * Outputs:
3767 : * namelist: filled with a palloc'd list of pointers to identifiers within
3768 : * rawstring. Caller should list_free() this even on error return.
3769 : *
3770 : * Returns true if okay, false if there is a syntax error in the string.
3771 : */
3772 : bool
3773 3758 : SplitGUCList(char *rawstring, char separator,
3774 : List **namelist)
3775 : {
3776 3758 : char *nextp = rawstring;
3777 3758 : bool done = false;
3778 :
3779 3758 : *namelist = NIL;
3780 :
3781 3758 : while (scanner_isspace(*nextp))
3782 0 : nextp++; /* skip leading whitespace */
3783 :
3784 3758 : if (*nextp == '\0')
3785 3684 : return true; /* allow empty string */
3786 :
3787 : /* At the top of the loop, we are at start of a new identifier. */
3788 : do
3789 : {
3790 : char *curname;
3791 : char *endp;
3792 :
3793 100 : if (*nextp == '"')
3794 : {
3795 : /* Quoted name --- collapse quote-quote pairs */
3796 24 : curname = nextp + 1;
3797 : for (;;)
3798 : {
3799 36 : endp = strchr(nextp + 1, '"');
3800 30 : if (endp == NULL)
3801 0 : return false; /* mismatched quotes */
3802 30 : if (endp[1] != '"')
3803 24 : break; /* found end of quoted name */
3804 : /* Collapse adjacent quotes into one quote, and look again */
3805 6 : memmove(endp, endp + 1, strlen(endp));
3806 6 : nextp = endp;
3807 : }
3808 : /* endp now points at the terminating quote */
3809 24 : nextp = endp + 1;
3810 : }
3811 : else
3812 : {
3813 : /* Unquoted name --- extends to separator or whitespace */
3814 76 : curname = nextp;
3815 718 : while (*nextp && *nextp != separator &&
3816 642 : !scanner_isspace(*nextp))
3817 642 : nextp++;
3818 76 : endp = nextp;
3819 76 : if (curname == nextp)
3820 0 : return false; /* empty unquoted name not allowed */
3821 : }
3822 :
3823 100 : while (scanner_isspace(*nextp))
3824 0 : nextp++; /* skip trailing whitespace */
3825 :
3826 100 : if (*nextp == separator)
3827 : {
3828 26 : nextp++;
3829 44 : while (scanner_isspace(*nextp))
3830 18 : nextp++; /* skip leading whitespace for next */
3831 : /* we expect another name, so done remains false */
3832 : }
3833 74 : else if (*nextp == '\0')
3834 74 : done = true;
3835 : else
3836 0 : return false; /* invalid syntax */
3837 :
3838 : /* Now safe to overwrite separator with a null */
3839 100 : *endp = '\0';
3840 :
3841 : /*
3842 : * Finished isolating current name --- add it to list
3843 : */
3844 100 : *namelist = lappend(*namelist, curname);
3845 :
3846 : /* Loop back if we didn't reach end of string */
3847 100 : } while (!done);
3848 :
3849 74 : return true;
3850 : }
3851 :
3852 :
3853 : /*****************************************************************************
3854 : * Comparison Functions used for bytea
3855 : *
3856 : * Note: btree indexes need these routines not to leak memory; therefore,
3857 : * be careful to free working copies of toasted datums. Most places don't
3858 : * need to be so careful.
3859 : *****************************************************************************/
3860 :
3861 : Datum
3862 10390 : byteaeq(PG_FUNCTION_ARGS)
3863 : {
3864 10390 : Datum arg1 = PG_GETARG_DATUM(0);
3865 10390 : Datum arg2 = PG_GETARG_DATUM(1);
3866 : bool result;
3867 : Size len1,
3868 : len2;
3869 :
3870 : /*
3871 : * We can use a fast path for unequal lengths, which might save us from
3872 : * having to detoast one or both values.
3873 : */
3874 10390 : len1 = toast_raw_datum_size(arg1);
3875 10390 : len2 = toast_raw_datum_size(arg2);
3876 10390 : if (len1 != len2)
3877 4316 : result = false;
3878 : else
3879 : {
3880 6074 : bytea *barg1 = DatumGetByteaPP(arg1);
3881 6074 : bytea *barg2 = DatumGetByteaPP(arg2);
3882 :
3883 6074 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3884 : len1 - VARHDRSZ) == 0);
3885 :
3886 6074 : PG_FREE_IF_COPY(barg1, 0);
3887 6074 : PG_FREE_IF_COPY(barg2, 1);
3888 : }
3889 :
3890 10390 : PG_RETURN_BOOL(result);
3891 : }
3892 :
3893 : Datum
3894 768 : byteane(PG_FUNCTION_ARGS)
3895 : {
3896 768 : Datum arg1 = PG_GETARG_DATUM(0);
3897 768 : Datum arg2 = PG_GETARG_DATUM(1);
3898 : bool result;
3899 : Size len1,
3900 : len2;
3901 :
3902 : /*
3903 : * We can use a fast path for unequal lengths, which might save us from
3904 : * having to detoast one or both values.
3905 : */
3906 768 : len1 = toast_raw_datum_size(arg1);
3907 768 : len2 = toast_raw_datum_size(arg2);
3908 768 : if (len1 != len2)
3909 0 : result = true;
3910 : else
3911 : {
3912 768 : bytea *barg1 = DatumGetByteaPP(arg1);
3913 768 : bytea *barg2 = DatumGetByteaPP(arg2);
3914 :
3915 768 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3916 : len1 - VARHDRSZ) != 0);
3917 :
3918 768 : PG_FREE_IF_COPY(barg1, 0);
3919 768 : PG_FREE_IF_COPY(barg2, 1);
3920 : }
3921 :
3922 768 : PG_RETURN_BOOL(result);
3923 : }
3924 :
3925 : Datum
3926 8318 : bytealt(PG_FUNCTION_ARGS)
3927 : {
3928 8318 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3929 8318 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3930 : int len1,
3931 : len2;
3932 : int cmp;
3933 :
3934 8318 : len1 = VARSIZE_ANY_EXHDR(arg1);
3935 8318 : len2 = VARSIZE_ANY_EXHDR(arg2);
3936 :
3937 8318 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3938 :
3939 8318 : PG_FREE_IF_COPY(arg1, 0);
3940 8318 : PG_FREE_IF_COPY(arg2, 1);
3941 :
3942 8318 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3943 : }
3944 :
3945 : Datum
3946 6356 : byteale(PG_FUNCTION_ARGS)
3947 : {
3948 6356 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3949 6356 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3950 : int len1,
3951 : len2;
3952 : int cmp;
3953 :
3954 6356 : len1 = VARSIZE_ANY_EXHDR(arg1);
3955 6356 : len2 = VARSIZE_ANY_EXHDR(arg2);
3956 :
3957 6356 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3958 :
3959 6356 : PG_FREE_IF_COPY(arg1, 0);
3960 6356 : PG_FREE_IF_COPY(arg2, 1);
3961 :
3962 6356 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3963 : }
3964 :
3965 : Datum
3966 6228 : byteagt(PG_FUNCTION_ARGS)
3967 : {
3968 6228 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3969 6228 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3970 : int len1,
3971 : len2;
3972 : int cmp;
3973 :
3974 6228 : len1 = VARSIZE_ANY_EXHDR(arg1);
3975 6228 : len2 = VARSIZE_ANY_EXHDR(arg2);
3976 :
3977 6228 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3978 :
3979 6228 : PG_FREE_IF_COPY(arg1, 0);
3980 6228 : PG_FREE_IF_COPY(arg2, 1);
3981 :
3982 6228 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3983 : }
3984 :
3985 : Datum
3986 5010 : byteage(PG_FUNCTION_ARGS)
3987 : {
3988 5010 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3989 5010 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3990 : int len1,
3991 : len2;
3992 : int cmp;
3993 :
3994 5010 : len1 = VARSIZE_ANY_EXHDR(arg1);
3995 5010 : len2 = VARSIZE_ANY_EXHDR(arg2);
3996 :
3997 5010 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3998 :
3999 5010 : PG_FREE_IF_COPY(arg1, 0);
4000 5010 : PG_FREE_IF_COPY(arg2, 1);
4001 :
4002 5010 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4003 : }
4004 :
4005 : Datum
4006 87420 : byteacmp(PG_FUNCTION_ARGS)
4007 : {
4008 87420 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4009 87420 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4010 : int len1,
4011 : len2;
4012 : int cmp;
4013 :
4014 87420 : len1 = VARSIZE_ANY_EXHDR(arg1);
4015 87420 : len2 = VARSIZE_ANY_EXHDR(arg2);
4016 :
4017 87420 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4018 87420 : if ((cmp == 0) && (len1 != len2))
4019 14576 : cmp = (len1 < len2) ? -1 : 1;
4020 :
4021 87420 : PG_FREE_IF_COPY(arg1, 0);
4022 87420 : PG_FREE_IF_COPY(arg2, 1);
4023 :
4024 87420 : PG_RETURN_INT32(cmp);
4025 : }
4026 :
4027 : Datum
4028 24 : bytea_larger(PG_FUNCTION_ARGS)
4029 : {
4030 24 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4031 24 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4032 : bytea *result;
4033 : int len1,
4034 : len2;
4035 : int cmp;
4036 :
4037 24 : len1 = VARSIZE_ANY_EXHDR(arg1);
4038 24 : len2 = VARSIZE_ANY_EXHDR(arg2);
4039 :
4040 24 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4041 24 : result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2);
4042 :
4043 24 : PG_RETURN_BYTEA_P(result);
4044 : }
4045 :
4046 : Datum
4047 24 : bytea_smaller(PG_FUNCTION_ARGS)
4048 : {
4049 24 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4050 24 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4051 : bytea *result;
4052 : int len1,
4053 : len2;
4054 : int cmp;
4055 :
4056 24 : len1 = VARSIZE_ANY_EXHDR(arg1);
4057 24 : len2 = VARSIZE_ANY_EXHDR(arg2);
4058 :
4059 24 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4060 24 : result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? arg1 : arg2);
4061 :
4062 24 : PG_RETURN_BYTEA_P(result);
4063 : }
4064 :
4065 : Datum
4066 30 : bytea_sortsupport(PG_FUNCTION_ARGS)
4067 : {
4068 30 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4069 : MemoryContext oldcontext;
4070 :
4071 30 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4072 :
4073 : /* Use generic string SortSupport, forcing "C" collation */
4074 30 : varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4075 :
4076 30 : MemoryContextSwitchTo(oldcontext);
4077 :
4078 30 : PG_RETURN_VOID();
4079 : }
4080 :
4081 : /* Cast bytea -> int2 */
4082 : Datum
4083 36 : bytea_int2(PG_FUNCTION_ARGS)
4084 : {
4085 36 : bytea *v = PG_GETARG_BYTEA_PP(0);
4086 36 : int len = VARSIZE_ANY_EXHDR(v);
4087 : uint16 result;
4088 :
4089 : /* Check that the byte array is not too long */
4090 36 : if (len > sizeof(result))
4091 6 : ereport(ERROR,
4092 : errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
4093 : errmsg("smallint out of range"));
4094 :
4095 : /* Convert it to an integer; most significant bytes come first */
4096 30 : result = 0;
4097 72 : for (int i = 0; i < len; i++)
4098 : {
4099 42 : result <<= BITS_PER_BYTE;
4100 42 : result |= ((unsigned char *) VARDATA_ANY(v))[i];
4101 : }
4102 :
4103 30 : PG_RETURN_INT16(result);
4104 : }
4105 :
4106 : /* Cast bytea -> int4 */
4107 : Datum
4108 36 : bytea_int4(PG_FUNCTION_ARGS)
4109 : {
4110 36 : bytea *v = PG_GETARG_BYTEA_PP(0);
4111 36 : int len = VARSIZE_ANY_EXHDR(v);
4112 : uint32 result;
4113 :
4114 : /* Check that the byte array is not too long */
4115 36 : if (len > sizeof(result))
4116 6 : ereport(ERROR,
4117 : errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
4118 : errmsg("integer out of range"));
4119 :
4120 : /* Convert it to an integer; most significant bytes come first */
4121 30 : result = 0;
4122 108 : for (int i = 0; i < len; i++)
4123 : {
4124 78 : result <<= BITS_PER_BYTE;
4125 78 : result |= ((unsigned char *) VARDATA_ANY(v))[i];
4126 : }
4127 :
4128 30 : PG_RETURN_INT32(result);
4129 : }
4130 :
4131 : /* Cast bytea -> int8 */
4132 : Datum
4133 36 : bytea_int8(PG_FUNCTION_ARGS)
4134 : {
4135 36 : bytea *v = PG_GETARG_BYTEA_PP(0);
4136 36 : int len = VARSIZE_ANY_EXHDR(v);
4137 : uint64 result;
4138 :
4139 : /* Check that the byte array is not too long */
4140 36 : if (len > sizeof(result))
4141 6 : ereport(ERROR,
4142 : errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
4143 : errmsg("bigint out of range"));
4144 :
4145 : /* Convert it to an integer; most significant bytes come first */
4146 30 : result = 0;
4147 180 : for (int i = 0; i < len; i++)
4148 : {
4149 150 : result <<= BITS_PER_BYTE;
4150 150 : result |= ((unsigned char *) VARDATA_ANY(v))[i];
4151 : }
4152 :
4153 30 : PG_RETURN_INT64(result);
4154 : }
4155 :
4156 : /* Cast int2 -> bytea; can just use int2send() */
4157 : Datum
4158 12 : int2_bytea(PG_FUNCTION_ARGS)
4159 : {
4160 12 : return int2send(fcinfo);
4161 : }
4162 :
4163 : /* Cast int4 -> bytea; can just use int4send() */
4164 : Datum
4165 12 : int4_bytea(PG_FUNCTION_ARGS)
4166 : {
4167 12 : return int4send(fcinfo);
4168 : }
4169 :
4170 : /* Cast int8 -> bytea; can just use int8send() */
4171 : Datum
4172 12 : int8_bytea(PG_FUNCTION_ARGS)
4173 : {
4174 12 : return int8send(fcinfo);
4175 : }
4176 :
4177 : /*
4178 : * appendStringInfoText
4179 : *
4180 : * Append a text to str.
4181 : * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4182 : */
4183 : static void
4184 1706226 : appendStringInfoText(StringInfo str, const text *t)
4185 : {
4186 1706226 : appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4187 1706226 : }
4188 :
4189 : /*
4190 : * replace_text
4191 : * replace all occurrences of 'old_sub_str' in 'orig_str'
4192 : * with 'new_sub_str' to form 'new_str'
4193 : *
4194 : * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4195 : * otherwise returns 'new_str'
4196 : */
4197 : Datum
4198 1332 : replace_text(PG_FUNCTION_ARGS)
4199 : {
4200 1332 : text *src_text = PG_GETARG_TEXT_PP(0);
4201 1332 : text *from_sub_text = PG_GETARG_TEXT_PP(1);
4202 1332 : text *to_sub_text = PG_GETARG_TEXT_PP(2);
4203 : int src_text_len;
4204 : int from_sub_text_len;
4205 : TextPositionState state;
4206 : text *ret_text;
4207 : int chunk_len;
4208 : char *curr_ptr;
4209 : char *start_ptr;
4210 : StringInfoData str;
4211 : bool found;
4212 :
4213 1332 : src_text_len = VARSIZE_ANY_EXHDR(src_text);
4214 1332 : from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4215 :
4216 : /* Return unmodified source string if empty source or pattern */
4217 1332 : if (src_text_len < 1 || from_sub_text_len < 1)
4218 : {
4219 0 : PG_RETURN_TEXT_P(src_text);
4220 : }
4221 :
4222 1332 : text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4223 :
4224 1332 : found = text_position_next(&state);
4225 :
4226 : /* When the from_sub_text is not found, there is nothing to do. */
4227 1332 : if (!found)
4228 : {
4229 298 : text_position_cleanup(&state);
4230 298 : PG_RETURN_TEXT_P(src_text);
4231 : }
4232 1034 : curr_ptr = text_position_get_match_ptr(&state);
4233 1034 : start_ptr = VARDATA_ANY(src_text);
4234 :
4235 1034 : initStringInfo(&str);
4236 :
4237 : do
4238 : {
4239 5314 : CHECK_FOR_INTERRUPTS();
4240 :
4241 : /* copy the data skipped over by last text_position_next() */
4242 5314 : chunk_len = curr_ptr - start_ptr;
4243 5314 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4244 :
4245 5314 : appendStringInfoText(&str, to_sub_text);
4246 :
4247 5314 : start_ptr = curr_ptr + state.last_match_len;
4248 :
4249 5314 : found = text_position_next(&state);
4250 5314 : if (found)
4251 4280 : curr_ptr = text_position_get_match_ptr(&state);
4252 : }
4253 5314 : while (found);
4254 :
4255 : /* copy trailing data */
4256 1034 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4257 1034 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4258 :
4259 1034 : text_position_cleanup(&state);
4260 :
4261 1034 : ret_text = cstring_to_text_with_len(str.data, str.len);
4262 1034 : pfree(str.data);
4263 :
4264 1034 : PG_RETURN_TEXT_P(ret_text);
4265 : }
4266 :
4267 : /*
4268 : * check_replace_text_has_escape
4269 : *
4270 : * Returns 0 if text contains no backslashes that need processing.
4271 : * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4272 : * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4273 : */
4274 : static int
4275 16790 : check_replace_text_has_escape(const text *replace_text)
4276 : {
4277 16790 : int result = 0;
4278 16790 : const char *p = VARDATA_ANY(replace_text);
4279 16790 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4280 :
4281 16834 : while (p < p_end)
4282 : {
4283 : /* Find next escape char, if any. */
4284 15768 : p = memchr(p, '\\', p_end - p);
4285 15768 : if (p == NULL)
4286 14946 : break;
4287 822 : p++;
4288 : /* Note: a backslash at the end doesn't require extra processing. */
4289 822 : if (p < p_end)
4290 : {
4291 822 : if (*p >= '1' && *p <= '9')
4292 778 : return 2; /* Found a submatch specifier, so done */
4293 44 : result = 1; /* Found some other sequence, keep looking */
4294 44 : p++;
4295 : }
4296 : }
4297 16012 : return result;
4298 : }
4299 :
4300 : /*
4301 : * appendStringInfoRegexpSubstr
4302 : *
4303 : * Append replace_text to str, substituting regexp back references for
4304 : * \n escapes. start_ptr is the start of the match in the source string,
4305 : * at logical character position data_pos.
4306 : */
4307 : static void
4308 236 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4309 : regmatch_t *pmatch,
4310 : char *start_ptr, int data_pos)
4311 : {
4312 236 : const char *p = VARDATA_ANY(replace_text);
4313 236 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4314 :
4315 574 : while (p < p_end)
4316 : {
4317 518 : const char *chunk_start = p;
4318 : int so;
4319 : int eo;
4320 :
4321 : /* Find next escape char, if any. */
4322 518 : p = memchr(p, '\\', p_end - p);
4323 518 : if (p == NULL)
4324 174 : p = p_end;
4325 :
4326 : /* Copy the text we just scanned over, if any. */
4327 518 : if (p > chunk_start)
4328 318 : appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4329 :
4330 : /* Done if at end of string, else advance over escape char. */
4331 518 : if (p >= p_end)
4332 174 : break;
4333 344 : p++;
4334 :
4335 344 : if (p >= p_end)
4336 : {
4337 : /* Escape at very end of input. Treat same as unexpected char */
4338 6 : appendStringInfoChar(str, '\\');
4339 6 : break;
4340 : }
4341 :
4342 338 : if (*p >= '1' && *p <= '9')
4343 278 : {
4344 : /* Use the back reference of regexp. */
4345 278 : int idx = *p - '0';
4346 :
4347 278 : so = pmatch[idx].rm_so;
4348 278 : eo = pmatch[idx].rm_eo;
4349 278 : p++;
4350 : }
4351 60 : else if (*p == '&')
4352 : {
4353 : /* Use the entire matched string. */
4354 18 : so = pmatch[0].rm_so;
4355 18 : eo = pmatch[0].rm_eo;
4356 18 : p++;
4357 : }
4358 42 : else if (*p == '\\')
4359 : {
4360 : /* \\ means transfer one \ to output. */
4361 36 : appendStringInfoChar(str, '\\');
4362 36 : p++;
4363 36 : continue;
4364 : }
4365 : else
4366 : {
4367 : /*
4368 : * If escape char is not followed by any expected char, just treat
4369 : * it as ordinary data to copy. (XXX would it be better to throw
4370 : * an error?)
4371 : */
4372 6 : appendStringInfoChar(str, '\\');
4373 6 : continue;
4374 : }
4375 :
4376 296 : if (so >= 0 && eo >= 0)
4377 : {
4378 : /*
4379 : * Copy the text that is back reference of regexp. Note so and eo
4380 : * are counted in characters not bytes.
4381 : */
4382 : char *chunk_start;
4383 : int chunk_len;
4384 :
4385 : Assert(so >= data_pos);
4386 296 : chunk_start = start_ptr;
4387 296 : chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4388 296 : chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4389 296 : appendBinaryStringInfo(str, chunk_start, chunk_len);
4390 : }
4391 : }
4392 236 : }
4393 :
4394 : /*
4395 : * replace_text_regexp
4396 : *
4397 : * replace substring(s) in src_text that match pattern with replace_text.
4398 : * The replace_text can contain backslash markers to substitute
4399 : * (parts of) the matched text.
4400 : *
4401 : * cflags: regexp compile flags.
4402 : * collation: collation to use.
4403 : * search_start: the character (not byte) offset in src_text at which to
4404 : * begin searching.
4405 : * n: if 0, replace all matches; if > 0, replace only the N'th match.
4406 : */
4407 : text *
4408 16790 : replace_text_regexp(text *src_text, text *pattern_text,
4409 : text *replace_text,
4410 : int cflags, Oid collation,
4411 : int search_start, int n)
4412 : {
4413 : text *ret_text;
4414 : regex_t *re;
4415 16790 : int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4416 16790 : int nmatches = 0;
4417 : StringInfoData buf;
4418 : regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4419 16790 : int nmatch = lengthof(pmatch);
4420 : pg_wchar *data;
4421 : size_t data_len;
4422 : int data_pos;
4423 : char *start_ptr;
4424 : int escape_status;
4425 :
4426 16790 : initStringInfo(&buf);
4427 :
4428 : /* Convert data string to wide characters. */
4429 16790 : data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4430 16790 : data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4431 :
4432 : /* Check whether replace_text has escapes, especially regexp submatches. */
4433 16790 : escape_status = check_replace_text_has_escape(replace_text);
4434 :
4435 : /* If no regexp submatches, we can use REG_NOSUB. */
4436 16790 : if (escape_status < 2)
4437 : {
4438 16012 : cflags |= REG_NOSUB;
4439 : /* Also tell pg_regexec we only want the whole-match location. */
4440 16012 : nmatch = 1;
4441 : }
4442 :
4443 : /* Prepare the regexp. */
4444 16790 : re = RE_compile_and_cache(pattern_text, cflags, collation);
4445 :
4446 : /* start_ptr points to the data_pos'th character of src_text */
4447 16790 : start_ptr = (char *) VARDATA_ANY(src_text);
4448 16790 : data_pos = 0;
4449 :
4450 23108 : while (search_start <= data_len)
4451 : {
4452 : int regexec_result;
4453 :
4454 23102 : CHECK_FOR_INTERRUPTS();
4455 :
4456 23102 : regexec_result = pg_regexec(re,
4457 : data,
4458 : data_len,
4459 : search_start,
4460 : NULL, /* no details */
4461 : nmatch,
4462 : pmatch,
4463 : 0);
4464 :
4465 23102 : if (regexec_result == REG_NOMATCH)
4466 14958 : break;
4467 :
4468 8144 : if (regexec_result != REG_OKAY)
4469 : {
4470 : char errMsg[100];
4471 :
4472 0 : pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4473 0 : ereport(ERROR,
4474 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4475 : errmsg("regular expression failed: %s", errMsg)));
4476 : }
4477 :
4478 : /*
4479 : * Count matches, and decide whether to replace this match.
4480 : */
4481 8144 : nmatches++;
4482 8144 : if (n > 0 && nmatches != n)
4483 : {
4484 : /*
4485 : * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4486 : * we treat the matched text as if it weren't matched, and copy it
4487 : * to the output later.)
4488 : */
4489 60 : search_start = pmatch[0].rm_eo;
4490 60 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4491 0 : search_start++;
4492 60 : continue;
4493 : }
4494 :
4495 : /*
4496 : * Copy the text to the left of the match position. Note we are given
4497 : * character not byte indexes.
4498 : */
4499 8084 : if (pmatch[0].rm_so - data_pos > 0)
4500 : {
4501 : int chunk_len;
4502 :
4503 7910 : chunk_len = charlen_to_bytelen(start_ptr,
4504 7910 : pmatch[0].rm_so - data_pos);
4505 7910 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4506 :
4507 : /*
4508 : * Advance start_ptr over that text, to avoid multiple rescans of
4509 : * it if the replace_text contains multiple back-references.
4510 : */
4511 7910 : start_ptr += chunk_len;
4512 7910 : data_pos = pmatch[0].rm_so;
4513 : }
4514 :
4515 : /*
4516 : * Copy the replace_text, processing escapes if any are present.
4517 : */
4518 8084 : if (escape_status > 0)
4519 236 : appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4520 : start_ptr, data_pos);
4521 : else
4522 7848 : appendStringInfoText(&buf, replace_text);
4523 :
4524 : /* Advance start_ptr and data_pos over the matched text. */
4525 16168 : start_ptr += charlen_to_bytelen(start_ptr,
4526 8084 : pmatch[0].rm_eo - data_pos);
4527 8084 : data_pos = pmatch[0].rm_eo;
4528 :
4529 : /*
4530 : * If we only want to replace one occurrence, we're done.
4531 : */
4532 8084 : if (n > 0)
4533 1826 : break;
4534 :
4535 : /*
4536 : * Advance search position. Normally we start the next search at the
4537 : * end of the previous match; but if the match was of zero length, we
4538 : * have to advance by one character, or we'd just find the same match
4539 : * again.
4540 : */
4541 6258 : search_start = data_pos;
4542 6258 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4543 12 : search_start++;
4544 : }
4545 :
4546 : /*
4547 : * Copy the text to the right of the last match.
4548 : */
4549 16790 : if (data_pos < data_len)
4550 : {
4551 : int chunk_len;
4552 :
4553 16088 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4554 16088 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4555 : }
4556 :
4557 16790 : ret_text = cstring_to_text_with_len(buf.data, buf.len);
4558 16790 : pfree(buf.data);
4559 16790 : pfree(data);
4560 :
4561 16790 : return ret_text;
4562 : }
4563 :
4564 : /*
4565 : * split_part
4566 : * parse input string based on provided field separator
4567 : * return N'th item (1 based, negative counts from end)
4568 : */
4569 : Datum
4570 150 : split_part(PG_FUNCTION_ARGS)
4571 : {
4572 150 : text *inputstring = PG_GETARG_TEXT_PP(0);
4573 150 : text *fldsep = PG_GETARG_TEXT_PP(1);
4574 150 : int fldnum = PG_GETARG_INT32(2);
4575 : int inputstring_len;
4576 : int fldsep_len;
4577 : TextPositionState state;
4578 : char *start_ptr;
4579 : char *end_ptr;
4580 : text *result_text;
4581 : bool found;
4582 :
4583 : /* field number is 1 based */
4584 150 : if (fldnum == 0)
4585 6 : ereport(ERROR,
4586 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4587 : errmsg("field position must not be zero")));
4588 :
4589 144 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4590 144 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4591 :
4592 : /* return empty string for empty input string */
4593 144 : if (inputstring_len < 1)
4594 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4595 :
4596 : /* handle empty field separator */
4597 132 : if (fldsep_len < 1)
4598 : {
4599 : /* if first or last field, return input string, else empty string */
4600 24 : if (fldnum == 1 || fldnum == -1)
4601 12 : PG_RETURN_TEXT_P(inputstring);
4602 : else
4603 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4604 : }
4605 :
4606 : /* find the first field separator */
4607 108 : text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4608 :
4609 108 : found = text_position_next(&state);
4610 :
4611 : /* special case if fldsep not found at all */
4612 108 : if (!found)
4613 : {
4614 24 : text_position_cleanup(&state);
4615 : /* if first or last field, return input string, else empty string */
4616 24 : if (fldnum == 1 || fldnum == -1)
4617 12 : PG_RETURN_TEXT_P(inputstring);
4618 : else
4619 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4620 : }
4621 :
4622 : /*
4623 : * take care of a negative field number (i.e. count from the right) by
4624 : * converting to a positive field number; we need total number of fields
4625 : */
4626 84 : if (fldnum < 0)
4627 : {
4628 : /* we found a fldsep, so there are at least two fields */
4629 42 : int numfields = 2;
4630 :
4631 54 : while (text_position_next(&state))
4632 12 : numfields++;
4633 :
4634 : /* special case of last field does not require an extra pass */
4635 42 : if (fldnum == -1)
4636 : {
4637 24 : start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
4638 24 : end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4639 24 : text_position_cleanup(&state);
4640 24 : PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4641 : end_ptr - start_ptr));
4642 : }
4643 :
4644 : /* else, convert fldnum to positive notation */
4645 18 : fldnum += numfields + 1;
4646 :
4647 : /* if nonexistent field, return empty string */
4648 18 : if (fldnum <= 0)
4649 : {
4650 6 : text_position_cleanup(&state);
4651 6 : PG_RETURN_TEXT_P(cstring_to_text(""));
4652 : }
4653 :
4654 : /* reset to pointing at first match, but now with positive fldnum */
4655 12 : text_position_reset(&state);
4656 12 : found = text_position_next(&state);
4657 : Assert(found);
4658 : }
4659 :
4660 : /* identify bounds of first field */
4661 54 : start_ptr = VARDATA_ANY(inputstring);
4662 54 : end_ptr = text_position_get_match_ptr(&state);
4663 :
4664 102 : while (found && --fldnum > 0)
4665 : {
4666 : /* identify bounds of next field */
4667 48 : start_ptr = end_ptr + state.last_match_len;
4668 48 : found = text_position_next(&state);
4669 48 : if (found)
4670 18 : end_ptr = text_position_get_match_ptr(&state);
4671 : }
4672 :
4673 54 : text_position_cleanup(&state);
4674 :
4675 54 : if (fldnum > 0)
4676 : {
4677 : /* N'th field separator not found */
4678 : /* if last field requested, return it, else empty string */
4679 30 : if (fldnum == 1)
4680 : {
4681 24 : int last_len = start_ptr - VARDATA_ANY(inputstring);
4682 :
4683 24 : result_text = cstring_to_text_with_len(start_ptr,
4684 : inputstring_len - last_len);
4685 : }
4686 : else
4687 6 : result_text = cstring_to_text("");
4688 : }
4689 : else
4690 : {
4691 : /* non-last field requested */
4692 24 : result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4693 : }
4694 :
4695 54 : PG_RETURN_TEXT_P(result_text);
4696 : }
4697 :
4698 : /*
4699 : * Convenience function to return true when two text params are equal.
4700 : */
4701 : static bool
4702 384 : text_isequal(text *txt1, text *txt2, Oid collid)
4703 : {
4704 384 : return DatumGetBool(DirectFunctionCall2Coll(texteq,
4705 : collid,
4706 : PointerGetDatum(txt1),
4707 : PointerGetDatum(txt2)));
4708 : }
4709 :
4710 : /*
4711 : * text_to_array
4712 : * parse input string and return text array of elements,
4713 : * based on provided field separator
4714 : */
4715 : Datum
4716 170 : text_to_array(PG_FUNCTION_ARGS)
4717 : {
4718 : SplitTextOutputData tstate;
4719 :
4720 : /* For array output, tstate should start as all zeroes */
4721 170 : memset(&tstate, 0, sizeof(tstate));
4722 :
4723 170 : if (!split_text(fcinfo, &tstate))
4724 6 : PG_RETURN_NULL();
4725 :
4726 164 : if (tstate.astate == NULL)
4727 6 : PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4728 :
4729 158 : PG_RETURN_DATUM(makeArrayResult(tstate.astate,
4730 : CurrentMemoryContext));
4731 : }
4732 :
4733 : /*
4734 : * text_to_array_null
4735 : * parse input string and return text array of elements,
4736 : * based on provided field separator and null string
4737 : *
4738 : * This is a separate entry point only to prevent the regression tests from
4739 : * complaining about different argument sets for the same internal function.
4740 : */
4741 : Datum
4742 60 : text_to_array_null(PG_FUNCTION_ARGS)
4743 : {
4744 60 : return text_to_array(fcinfo);
4745 : }
4746 :
4747 : /*
4748 : * text_to_table
4749 : * parse input string and return table of elements,
4750 : * based on provided field separator
4751 : */
4752 : Datum
4753 84 : text_to_table(PG_FUNCTION_ARGS)
4754 : {
4755 84 : ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4756 : SplitTextOutputData tstate;
4757 :
4758 84 : tstate.astate = NULL;
4759 84 : InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
4760 84 : tstate.tupstore = rsi->setResult;
4761 84 : tstate.tupdesc = rsi->setDesc;
4762 :
4763 84 : (void) split_text(fcinfo, &tstate);
4764 :
4765 84 : return (Datum) 0;
4766 : }
4767 :
4768 : /*
4769 : * text_to_table_null
4770 : * parse input string and return table of elements,
4771 : * based on provided field separator and null string
4772 : *
4773 : * This is a separate entry point only to prevent the regression tests from
4774 : * complaining about different argument sets for the same internal function.
4775 : */
4776 : Datum
4777 24 : text_to_table_null(PG_FUNCTION_ARGS)
4778 : {
4779 24 : return text_to_table(fcinfo);
4780 : }
4781 :
4782 : /*
4783 : * Common code for text_to_array, text_to_array_null, text_to_table
4784 : * and text_to_table_null functions.
4785 : *
4786 : * These are not strict so we have to test for null inputs explicitly.
4787 : * Returns false if result is to be null, else returns true.
4788 : *
4789 : * Note that if the result is valid but empty (zero elements), we return
4790 : * without changing *tstate --- caller must handle that case, too.
4791 : */
4792 : static bool
4793 254 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4794 : {
4795 : text *inputstring;
4796 : text *fldsep;
4797 : text *null_string;
4798 254 : Oid collation = PG_GET_COLLATION();
4799 : int inputstring_len;
4800 : int fldsep_len;
4801 : char *start_ptr;
4802 : text *result_text;
4803 :
4804 : /* when input string is NULL, then result is NULL too */
4805 254 : if (PG_ARGISNULL(0))
4806 12 : return false;
4807 :
4808 242 : inputstring = PG_GETARG_TEXT_PP(0);
4809 :
4810 : /* fldsep can be NULL */
4811 242 : if (!PG_ARGISNULL(1))
4812 212 : fldsep = PG_GETARG_TEXT_PP(1);
4813 : else
4814 30 : fldsep = NULL;
4815 :
4816 : /* null_string can be NULL or omitted */
4817 242 : if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4818 84 : null_string = PG_GETARG_TEXT_PP(2);
4819 : else
4820 158 : null_string = NULL;
4821 :
4822 242 : if (fldsep != NULL)
4823 : {
4824 : /*
4825 : * Normal case with non-null fldsep. Use the text_position machinery
4826 : * to search for occurrences of fldsep.
4827 : */
4828 : TextPositionState state;
4829 :
4830 212 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4831 212 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4832 :
4833 : /* return empty set for empty input string */
4834 212 : if (inputstring_len < 1)
4835 60 : return true;
4836 :
4837 : /* empty field separator: return input string as a one-element set */
4838 200 : if (fldsep_len < 1)
4839 : {
4840 48 : split_text_accum_result(tstate, inputstring,
4841 : null_string, collation);
4842 48 : return true;
4843 : }
4844 :
4845 152 : text_position_setup(inputstring, fldsep, collation, &state);
4846 :
4847 152 : start_ptr = VARDATA_ANY(inputstring);
4848 :
4849 : for (;;)
4850 512 : {
4851 : bool found;
4852 : char *end_ptr;
4853 : int chunk_len;
4854 :
4855 664 : CHECK_FOR_INTERRUPTS();
4856 :
4857 664 : found = text_position_next(&state);
4858 664 : if (!found)
4859 : {
4860 : /* fetch last field */
4861 152 : chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4862 152 : end_ptr = NULL; /* not used, but some compilers complain */
4863 : }
4864 : else
4865 : {
4866 : /* fetch non-last field */
4867 512 : end_ptr = text_position_get_match_ptr(&state);
4868 512 : chunk_len = end_ptr - start_ptr;
4869 : }
4870 :
4871 : /* build a temp text datum to pass to split_text_accum_result */
4872 664 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4873 :
4874 : /* stash away this field */
4875 664 : split_text_accum_result(tstate, result_text,
4876 : null_string, collation);
4877 :
4878 664 : pfree(result_text);
4879 :
4880 664 : if (!found)
4881 152 : break;
4882 :
4883 512 : start_ptr = end_ptr + state.last_match_len;
4884 : }
4885 :
4886 152 : text_position_cleanup(&state);
4887 : }
4888 : else
4889 : {
4890 : /*
4891 : * When fldsep is NULL, each character in the input string becomes a
4892 : * separate element in the result set. The separator is effectively
4893 : * the space between characters.
4894 : */
4895 30 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4896 :
4897 30 : start_ptr = VARDATA_ANY(inputstring);
4898 :
4899 252 : while (inputstring_len > 0)
4900 : {
4901 222 : int chunk_len = pg_mblen(start_ptr);
4902 :
4903 222 : CHECK_FOR_INTERRUPTS();
4904 :
4905 : /* build a temp text datum to pass to split_text_accum_result */
4906 222 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4907 :
4908 : /* stash away this field */
4909 222 : split_text_accum_result(tstate, result_text,
4910 : null_string, collation);
4911 :
4912 222 : pfree(result_text);
4913 :
4914 222 : start_ptr += chunk_len;
4915 222 : inputstring_len -= chunk_len;
4916 : }
4917 : }
4918 :
4919 182 : return true;
4920 : }
4921 :
4922 : /*
4923 : * Add text item to result set (table or array).
4924 : *
4925 : * This is also responsible for checking to see if the item matches
4926 : * the null_string, in which case we should emit NULL instead.
4927 : */
4928 : static void
4929 934 : split_text_accum_result(SplitTextOutputData *tstate,
4930 : text *field_value,
4931 : text *null_string,
4932 : Oid collation)
4933 : {
4934 934 : bool is_null = false;
4935 :
4936 934 : if (null_string && text_isequal(field_value, null_string, collation))
4937 72 : is_null = true;
4938 :
4939 934 : if (tstate->tupstore)
4940 : {
4941 : Datum values[1];
4942 : bool nulls[1];
4943 :
4944 228 : values[0] = PointerGetDatum(field_value);
4945 228 : nulls[0] = is_null;
4946 :
4947 228 : tuplestore_putvalues(tstate->tupstore,
4948 : tstate->tupdesc,
4949 : values,
4950 : nulls);
4951 : }
4952 : else
4953 : {
4954 706 : tstate->astate = accumArrayResult(tstate->astate,
4955 : PointerGetDatum(field_value),
4956 : is_null,
4957 : TEXTOID,
4958 : CurrentMemoryContext);
4959 : }
4960 934 : }
4961 :
4962 : /*
4963 : * array_to_text
4964 : * concatenate Cstring representation of input array elements
4965 : * using provided field separator
4966 : */
4967 : Datum
4968 75310 : array_to_text(PG_FUNCTION_ARGS)
4969 : {
4970 75310 : ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
4971 75310 : char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4972 :
4973 75310 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4974 : }
4975 :
4976 : /*
4977 : * array_to_text_null
4978 : * concatenate Cstring representation of input array elements
4979 : * using provided field separator and null string
4980 : *
4981 : * This version is not strict so we have to test for null inputs explicitly.
4982 : */
4983 : Datum
4984 12 : array_to_text_null(PG_FUNCTION_ARGS)
4985 : {
4986 : ArrayType *v;
4987 : char *fldsep;
4988 : char *null_string;
4989 :
4990 : /* returns NULL when first or second parameter is NULL */
4991 12 : if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4992 0 : PG_RETURN_NULL();
4993 :
4994 12 : v = PG_GETARG_ARRAYTYPE_P(0);
4995 12 : fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4996 :
4997 : /* NULL null string is passed through as a null pointer */
4998 12 : if (!PG_ARGISNULL(2))
4999 6 : null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5000 : else
5001 6 : null_string = NULL;
5002 :
5003 12 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5004 : }
5005 :
5006 : /*
5007 : * common code for array_to_text and array_to_text_null functions
5008 : */
5009 : static text *
5010 75340 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
5011 : const char *fldsep, const char *null_string)
5012 : {
5013 : text *result;
5014 : int nitems,
5015 : *dims,
5016 : ndims;
5017 : Oid element_type;
5018 : int typlen;
5019 : bool typbyval;
5020 : char typalign;
5021 : StringInfoData buf;
5022 75340 : bool printed = false;
5023 : char *p;
5024 : bits8 *bitmap;
5025 : int bitmask;
5026 : int i;
5027 : ArrayMetaState *my_extra;
5028 :
5029 75340 : ndims = ARR_NDIM(v);
5030 75340 : dims = ARR_DIMS(v);
5031 75340 : nitems = ArrayGetNItems(ndims, dims);
5032 :
5033 : /* if there are no elements, return an empty string */
5034 75340 : if (nitems == 0)
5035 50376 : return cstring_to_text_with_len("", 0);
5036 :
5037 24964 : element_type = ARR_ELEMTYPE(v);
5038 24964 : initStringInfo(&buf);
5039 :
5040 : /*
5041 : * We arrange to look up info about element type, including its output
5042 : * conversion proc, only once per series of calls, assuming the element
5043 : * type doesn't change underneath us.
5044 : */
5045 24964 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5046 24964 : if (my_extra == NULL)
5047 : {
5048 1452 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5049 : sizeof(ArrayMetaState));
5050 1452 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5051 1452 : my_extra->element_type = ~element_type;
5052 : }
5053 :
5054 24964 : if (my_extra->element_type != element_type)
5055 : {
5056 : /*
5057 : * Get info about element type, including its output conversion proc
5058 : */
5059 1452 : get_type_io_data(element_type, IOFunc_output,
5060 : &my_extra->typlen, &my_extra->typbyval,
5061 : &my_extra->typalign, &my_extra->typdelim,
5062 : &my_extra->typioparam, &my_extra->typiofunc);
5063 1452 : fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5064 1452 : fcinfo->flinfo->fn_mcxt);
5065 1452 : my_extra->element_type = element_type;
5066 : }
5067 24964 : typlen = my_extra->typlen;
5068 24964 : typbyval = my_extra->typbyval;
5069 24964 : typalign = my_extra->typalign;
5070 :
5071 24964 : p = ARR_DATA_PTR(v);
5072 24964 : bitmap = ARR_NULLBITMAP(v);
5073 24964 : bitmask = 1;
5074 :
5075 84842 : for (i = 0; i < nitems; i++)
5076 : {
5077 : Datum itemvalue;
5078 : char *value;
5079 :
5080 : /* Get source element, checking for NULL */
5081 59878 : if (bitmap && (*bitmap & bitmask) == 0)
5082 : {
5083 : /* if null_string is NULL, we just ignore null elements */
5084 18 : if (null_string != NULL)
5085 : {
5086 6 : if (printed)
5087 6 : appendStringInfo(&buf, "%s%s", fldsep, null_string);
5088 : else
5089 0 : appendStringInfoString(&buf, null_string);
5090 6 : printed = true;
5091 : }
5092 : }
5093 : else
5094 : {
5095 59860 : itemvalue = fetch_att(p, typbyval, typlen);
5096 :
5097 59860 : value = OutputFunctionCall(&my_extra->proc, itemvalue);
5098 :
5099 59860 : if (printed)
5100 34896 : appendStringInfo(&buf, "%s%s", fldsep, value);
5101 : else
5102 24964 : appendStringInfoString(&buf, value);
5103 59860 : printed = true;
5104 :
5105 59860 : p = att_addlength_pointer(p, typlen, p);
5106 59860 : p = (char *) att_align_nominal(p, typalign);
5107 : }
5108 :
5109 : /* advance bitmap pointer if any */
5110 59878 : if (bitmap)
5111 : {
5112 108 : bitmask <<= 1;
5113 108 : if (bitmask == 0x100)
5114 : {
5115 0 : bitmap++;
5116 0 : bitmask = 1;
5117 : }
5118 : }
5119 : }
5120 :
5121 24964 : result = cstring_to_text_with_len(buf.data, buf.len);
5122 24964 : pfree(buf.data);
5123 :
5124 24964 : return result;
5125 : }
5126 :
5127 : /*
5128 : * Workhorse for to_bin, to_oct, and to_hex. Note that base must be > 1 and <=
5129 : * 16.
5130 : */
5131 : static inline text *
5132 38750 : convert_to_base(uint64 value, int base)
5133 : {
5134 38750 : const char *digits = "0123456789abcdef";
5135 :
5136 : /* We size the buffer for to_bin's longest possible return value. */
5137 : char buf[sizeof(uint64) * BITS_PER_BYTE];
5138 38750 : char *const end = buf + sizeof(buf);
5139 38750 : char *ptr = end;
5140 :
5141 : Assert(base > 1);
5142 : Assert(base <= 16);
5143 :
5144 : do
5145 : {
5146 75974 : *--ptr = digits[value % base];
5147 75974 : value /= base;
5148 75974 : } while (ptr > buf && value);
5149 :
5150 38750 : return cstring_to_text_with_len(ptr, end - ptr);
5151 : }
5152 :
5153 : /*
5154 : * Convert an integer to a string containing a base-2 (binary) representation
5155 : * of the number.
5156 : */
5157 : Datum
5158 12 : to_bin32(PG_FUNCTION_ARGS)
5159 : {
5160 12 : uint64 value = (uint32) PG_GETARG_INT32(0);
5161 :
5162 12 : PG_RETURN_TEXT_P(convert_to_base(value, 2));
5163 : }
5164 : Datum
5165 12 : to_bin64(PG_FUNCTION_ARGS)
5166 : {
5167 12 : uint64 value = (uint64) PG_GETARG_INT64(0);
5168 :
5169 12 : PG_RETURN_TEXT_P(convert_to_base(value, 2));
5170 : }
5171 :
5172 : /*
5173 : * Convert an integer to a string containing a base-8 (oct) representation of
5174 : * the number.
5175 : */
5176 : Datum
5177 12 : to_oct32(PG_FUNCTION_ARGS)
5178 : {
5179 12 : uint64 value = (uint32) PG_GETARG_INT32(0);
5180 :
5181 12 : PG_RETURN_TEXT_P(convert_to_base(value, 8));
5182 : }
5183 : Datum
5184 12 : to_oct64(PG_FUNCTION_ARGS)
5185 : {
5186 12 : uint64 value = (uint64) PG_GETARG_INT64(0);
5187 :
5188 12 : PG_RETURN_TEXT_P(convert_to_base(value, 8));
5189 : }
5190 :
5191 : /*
5192 : * Convert an integer to a string containing a base-16 (hex) representation of
5193 : * the number.
5194 : */
5195 : Datum
5196 38690 : to_hex32(PG_FUNCTION_ARGS)
5197 : {
5198 38690 : uint64 value = (uint32) PG_GETARG_INT32(0);
5199 :
5200 38690 : PG_RETURN_TEXT_P(convert_to_base(value, 16));
5201 : }
5202 : Datum
5203 12 : to_hex64(PG_FUNCTION_ARGS)
5204 : {
5205 12 : uint64 value = (uint64) PG_GETARG_INT64(0);
5206 :
5207 12 : PG_RETURN_TEXT_P(convert_to_base(value, 16));
5208 : }
5209 :
5210 : /*
5211 : * Return the size of a datum, possibly compressed
5212 : *
5213 : * Works on any data type
5214 : */
5215 : Datum
5216 122 : pg_column_size(PG_FUNCTION_ARGS)
5217 : {
5218 122 : Datum value = PG_GETARG_DATUM(0);
5219 : int32 result;
5220 : int typlen;
5221 :
5222 : /* On first call, get the input type's typlen, and save at *fn_extra */
5223 122 : if (fcinfo->flinfo->fn_extra == NULL)
5224 : {
5225 : /* Lookup the datatype of the supplied argument */
5226 122 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5227 :
5228 122 : typlen = get_typlen(argtypeid);
5229 122 : if (typlen == 0) /* should not happen */
5230 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5231 :
5232 122 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5233 : sizeof(int));
5234 122 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5235 : }
5236 : else
5237 0 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5238 :
5239 122 : if (typlen == -1)
5240 : {
5241 : /* varlena type, possibly toasted */
5242 122 : result = toast_datum_size(value);
5243 : }
5244 0 : else if (typlen == -2)
5245 : {
5246 : /* cstring */
5247 0 : result = strlen(DatumGetCString(value)) + 1;
5248 : }
5249 : else
5250 : {
5251 : /* ordinary fixed-width type */
5252 0 : result = typlen;
5253 : }
5254 :
5255 122 : PG_RETURN_INT32(result);
5256 : }
5257 :
5258 : /*
5259 : * Return the compression method stored in the compressed attribute. Return
5260 : * NULL for non varlena type or uncompressed data.
5261 : */
5262 : Datum
5263 162 : pg_column_compression(PG_FUNCTION_ARGS)
5264 : {
5265 : int typlen;
5266 : char *result;
5267 : ToastCompressionId cmid;
5268 :
5269 : /* On first call, get the input type's typlen, and save at *fn_extra */
5270 162 : if (fcinfo->flinfo->fn_extra == NULL)
5271 : {
5272 : /* Lookup the datatype of the supplied argument */
5273 108 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5274 :
5275 108 : typlen = get_typlen(argtypeid);
5276 108 : if (typlen == 0) /* should not happen */
5277 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5278 :
5279 108 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5280 : sizeof(int));
5281 108 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5282 : }
5283 : else
5284 54 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5285 :
5286 162 : if (typlen != -1)
5287 0 : PG_RETURN_NULL();
5288 :
5289 : /* get the compression method id stored in the compressed varlena */
5290 162 : cmid = toast_get_compression_id((struct varlena *)
5291 162 : DatumGetPointer(PG_GETARG_DATUM(0)));
5292 162 : if (cmid == TOAST_INVALID_COMPRESSION_ID)
5293 6 : PG_RETURN_NULL();
5294 :
5295 : /* convert compression method id to compression method name */
5296 156 : switch (cmid)
5297 : {
5298 66 : case TOAST_PGLZ_COMPRESSION_ID:
5299 66 : result = "pglz";
5300 66 : break;
5301 90 : case TOAST_LZ4_COMPRESSION_ID:
5302 90 : result = "lz4";
5303 90 : break;
5304 0 : default:
5305 0 : elog(ERROR, "invalid compression method id %d", cmid);
5306 : }
5307 :
5308 156 : PG_RETURN_TEXT_P(cstring_to_text(result));
5309 : }
5310 :
5311 : /*
5312 : * Return the chunk_id of the on-disk TOASTed value. Return NULL if the value
5313 : * is un-TOASTed or not on-disk.
5314 : */
5315 : Datum
5316 12 : pg_column_toast_chunk_id(PG_FUNCTION_ARGS)
5317 : {
5318 : int typlen;
5319 : struct varlena *attr;
5320 : struct varatt_external toast_pointer;
5321 :
5322 : /* On first call, get the input type's typlen, and save at *fn_extra */
5323 12 : if (fcinfo->flinfo->fn_extra == NULL)
5324 : {
5325 : /* Lookup the datatype of the supplied argument */
5326 12 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5327 :
5328 12 : typlen = get_typlen(argtypeid);
5329 12 : if (typlen == 0) /* should not happen */
5330 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5331 :
5332 12 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5333 : sizeof(int));
5334 12 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5335 : }
5336 : else
5337 0 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5338 :
5339 12 : if (typlen != -1)
5340 0 : PG_RETURN_NULL();
5341 :
5342 12 : attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0));
5343 :
5344 12 : if (!VARATT_IS_EXTERNAL_ONDISK(attr))
5345 6 : PG_RETURN_NULL();
5346 :
5347 6 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
5348 :
5349 6 : PG_RETURN_OID(toast_pointer.va_valueid);
5350 : }
5351 :
5352 : /*
5353 : * string_agg - Concatenates values and returns string.
5354 : *
5355 : * Syntax: string_agg(value text, delimiter text) RETURNS text
5356 : *
5357 : * Note: Any NULL values are ignored. The first-call delimiter isn't
5358 : * actually used at all, and on subsequent calls the delimiter precedes
5359 : * the associated value.
5360 : */
5361 :
5362 : /* subroutine to initialize state */
5363 : static StringInfo
5364 2334 : makeStringAggState(FunctionCallInfo fcinfo)
5365 : {
5366 : StringInfo state;
5367 : MemoryContext aggcontext;
5368 : MemoryContext oldcontext;
5369 :
5370 2334 : if (!AggCheckCallContext(fcinfo, &aggcontext))
5371 : {
5372 : /* cannot be called directly because of internal-type argument */
5373 0 : elog(ERROR, "string_agg_transfn called in non-aggregate context");
5374 : }
5375 :
5376 : /*
5377 : * Create state in aggregate context. It'll stay there across subsequent
5378 : * calls.
5379 : */
5380 2334 : oldcontext = MemoryContextSwitchTo(aggcontext);
5381 2334 : state = makeStringInfo();
5382 2334 : MemoryContextSwitchTo(oldcontext);
5383 :
5384 2334 : return state;
5385 : }
5386 :
5387 : Datum
5388 861580 : string_agg_transfn(PG_FUNCTION_ARGS)
5389 : {
5390 : StringInfo state;
5391 :
5392 861580 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5393 :
5394 : /* Append the value unless null, preceding it with the delimiter. */
5395 861580 : if (!PG_ARGISNULL(1))
5396 : {
5397 846532 : text *value = PG_GETARG_TEXT_PP(1);
5398 846532 : bool isfirst = false;
5399 :
5400 : /*
5401 : * You might think we can just throw away the first delimiter, however
5402 : * we must keep it as we may be a parallel worker doing partial
5403 : * aggregation building a state to send to the main process. We need
5404 : * to keep the delimiter of every aggregation so that the combine
5405 : * function can properly join up the strings of two separately
5406 : * partially aggregated results. The first delimiter is only stripped
5407 : * off in the final function. To know how much to strip off the front
5408 : * of the string, we store the length of the first delimiter in the
5409 : * StringInfo's cursor field, which we don't otherwise need here.
5410 : */
5411 846532 : if (state == NULL)
5412 : {
5413 1946 : state = makeStringAggState(fcinfo);
5414 1946 : isfirst = true;
5415 : }
5416 :
5417 846532 : if (!PG_ARGISNULL(2))
5418 : {
5419 846532 : text *delim = PG_GETARG_TEXT_PP(2);
5420 :
5421 846532 : appendStringInfoText(state, delim);
5422 846532 : if (isfirst)
5423 1946 : state->cursor = VARSIZE_ANY_EXHDR(delim);
5424 : }
5425 :
5426 846532 : appendStringInfoText(state, value);
5427 : }
5428 :
5429 : /*
5430 : * The transition type for string_agg() is declared to be "internal",
5431 : * which is a pass-by-value type the same size as a pointer.
5432 : */
5433 861580 : if (state)
5434 861502 : PG_RETURN_POINTER(state);
5435 78 : PG_RETURN_NULL();
5436 : }
5437 :
5438 : /*
5439 : * string_agg_combine
5440 : * Aggregate combine function for string_agg(text) and string_agg(bytea)
5441 : */
5442 : Datum
5443 120 : string_agg_combine(PG_FUNCTION_ARGS)
5444 : {
5445 : StringInfo state1;
5446 : StringInfo state2;
5447 : MemoryContext agg_context;
5448 :
5449 120 : if (!AggCheckCallContext(fcinfo, &agg_context))
5450 0 : elog(ERROR, "aggregate function called in non-aggregate context");
5451 :
5452 120 : state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5453 120 : state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
5454 :
5455 120 : if (state2 == NULL)
5456 : {
5457 : /*
5458 : * NULL state2 is easy, just return state1, which we know is already
5459 : * in the agg_context
5460 : */
5461 0 : if (state1 == NULL)
5462 0 : PG_RETURN_NULL();
5463 0 : PG_RETURN_POINTER(state1);
5464 : }
5465 :
5466 120 : if (state1 == NULL)
5467 : {
5468 : /* We must copy state2's data into the agg_context */
5469 : MemoryContext old_context;
5470 :
5471 120 : old_context = MemoryContextSwitchTo(agg_context);
5472 120 : state1 = makeStringAggState(fcinfo);
5473 120 : appendBinaryStringInfo(state1, state2->data, state2->len);
5474 120 : state1->cursor = state2->cursor;
5475 120 : MemoryContextSwitchTo(old_context);
5476 : }
5477 0 : else if (state2->len > 0)
5478 : {
5479 : /* Combine ... state1->cursor does not change in this case */
5480 0 : appendBinaryStringInfo(state1, state2->data, state2->len);
5481 : }
5482 :
5483 120 : PG_RETURN_POINTER(state1);
5484 : }
5485 :
5486 : /*
5487 : * string_agg_serialize
5488 : * Aggregate serialize function for string_agg(text) and string_agg(bytea)
5489 : *
5490 : * This is strict, so we need not handle NULL input
5491 : */
5492 : Datum
5493 120 : string_agg_serialize(PG_FUNCTION_ARGS)
5494 : {
5495 : StringInfo state;
5496 : StringInfoData buf;
5497 : bytea *result;
5498 :
5499 : /* cannot be called directly because of internal-type argument */
5500 : Assert(AggCheckCallContext(fcinfo, NULL));
5501 :
5502 120 : state = (StringInfo) PG_GETARG_POINTER(0);
5503 :
5504 120 : pq_begintypsend(&buf);
5505 :
5506 : /* cursor */
5507 120 : pq_sendint(&buf, state->cursor, 4);
5508 :
5509 : /* data */
5510 120 : pq_sendbytes(&buf, state->data, state->len);
5511 :
5512 120 : result = pq_endtypsend(&buf);
5513 :
5514 120 : PG_RETURN_BYTEA_P(result);
5515 : }
5516 :
5517 : /*
5518 : * string_agg_deserialize
5519 : * Aggregate deserial function for string_agg(text) and string_agg(bytea)
5520 : *
5521 : * This is strict, so we need not handle NULL input
5522 : */
5523 : Datum
5524 120 : string_agg_deserialize(PG_FUNCTION_ARGS)
5525 : {
5526 : bytea *sstate;
5527 : StringInfo result;
5528 : StringInfoData buf;
5529 : char *data;
5530 : int datalen;
5531 :
5532 : /* cannot be called directly because of internal-type argument */
5533 : Assert(AggCheckCallContext(fcinfo, NULL));
5534 :
5535 120 : sstate = PG_GETARG_BYTEA_PP(0);
5536 :
5537 : /*
5538 : * Initialize a StringInfo so that we can "receive" it using the standard
5539 : * recv-function infrastructure.
5540 : */
5541 120 : initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
5542 120 : VARSIZE_ANY_EXHDR(sstate));
5543 :
5544 120 : result = makeStringAggState(fcinfo);
5545 :
5546 : /* cursor */
5547 120 : result->cursor = pq_getmsgint(&buf, 4);
5548 :
5549 : /* data */
5550 120 : datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
5551 120 : data = (char *) pq_getmsgbytes(&buf, datalen);
5552 120 : appendBinaryStringInfo(result, data, datalen);
5553 :
5554 120 : pq_getmsgend(&buf);
5555 :
5556 120 : PG_RETURN_POINTER(result);
5557 : }
5558 :
5559 : Datum
5560 2018 : string_agg_finalfn(PG_FUNCTION_ARGS)
5561 : {
5562 : StringInfo state;
5563 :
5564 : /* cannot be called directly because of internal-type argument */
5565 : Assert(AggCheckCallContext(fcinfo, NULL));
5566 :
5567 2018 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5568 :
5569 2018 : if (state != NULL)
5570 : {
5571 : /* As per comment in transfn, strip data before the cursor position */
5572 1946 : PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
5573 : state->len - state->cursor));
5574 : }
5575 : else
5576 72 : PG_RETURN_NULL();
5577 : }
5578 :
5579 : /*
5580 : * Prepare cache with fmgr info for the output functions of the datatypes of
5581 : * the arguments of a concat-like function, beginning with argument "argidx".
5582 : * (Arguments before that will have corresponding slots in the resulting
5583 : * FmgrInfo array, but we don't fill those slots.)
5584 : */
5585 : static FmgrInfo *
5586 106 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5587 : {
5588 : FmgrInfo *foutcache;
5589 : int i;
5590 :
5591 : /* We keep the info in fn_mcxt so it survives across calls */
5592 106 : foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5593 106 : PG_NARGS() * sizeof(FmgrInfo));
5594 :
5595 400 : for (i = argidx; i < PG_NARGS(); i++)
5596 : {
5597 : Oid valtype;
5598 : Oid typOutput;
5599 : bool typIsVarlena;
5600 :
5601 294 : valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5602 294 : if (!OidIsValid(valtype))
5603 0 : elog(ERROR, "could not determine data type of concat() input");
5604 :
5605 294 : getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5606 294 : fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5607 : }
5608 :
5609 106 : fcinfo->flinfo->fn_extra = foutcache;
5610 :
5611 106 : return foutcache;
5612 : }
5613 :
5614 : /*
5615 : * Implementation of both concat() and concat_ws().
5616 : *
5617 : * sepstr is the separator string to place between values.
5618 : * argidx identifies the first argument to concatenate (counting from zero);
5619 : * note that this must be constant across any one series of calls.
5620 : *
5621 : * Returns NULL if result should be NULL, else text value.
5622 : */
5623 : static text *
5624 264 : concat_internal(const char *sepstr, int argidx,
5625 : FunctionCallInfo fcinfo)
5626 : {
5627 : text *result;
5628 : StringInfoData str;
5629 : FmgrInfo *foutcache;
5630 264 : bool first_arg = true;
5631 : int i;
5632 :
5633 : /*
5634 : * concat(VARIADIC some-array) is essentially equivalent to
5635 : * array_to_text(), ie concat the array elements with the given separator.
5636 : * So we just pass the case off to that code.
5637 : */
5638 264 : if (get_fn_expr_variadic(fcinfo->flinfo))
5639 : {
5640 : ArrayType *arr;
5641 :
5642 : /* Should have just the one argument */
5643 : Assert(argidx == PG_NARGS() - 1);
5644 :
5645 : /* concat(VARIADIC NULL) is defined as NULL */
5646 30 : if (PG_ARGISNULL(argidx))
5647 12 : return NULL;
5648 :
5649 : /*
5650 : * Non-null argument had better be an array. We assume that any call
5651 : * context that could let get_fn_expr_variadic return true will have
5652 : * checked that a VARIADIC-labeled parameter actually is an array. So
5653 : * it should be okay to just Assert that it's an array rather than
5654 : * doing a full-fledged error check.
5655 : */
5656 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5657 :
5658 : /* OK, safe to fetch the array value */
5659 18 : arr = PG_GETARG_ARRAYTYPE_P(argidx);
5660 :
5661 : /*
5662 : * And serialize the array. We tell array_to_text to ignore null
5663 : * elements, which matches the behavior of the loop below.
5664 : */
5665 18 : return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5666 : }
5667 :
5668 : /* Normal case without explicit VARIADIC marker */
5669 234 : initStringInfo(&str);
5670 :
5671 : /* Get output function info, building it if first time through */
5672 234 : foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5673 234 : if (foutcache == NULL)
5674 106 : foutcache = build_concat_foutcache(fcinfo, argidx);
5675 :
5676 822 : for (i = argidx; i < PG_NARGS(); i++)
5677 : {
5678 588 : if (!PG_ARGISNULL(i))
5679 : {
5680 510 : Datum value = PG_GETARG_DATUM(i);
5681 :
5682 : /* add separator if appropriate */
5683 510 : if (first_arg)
5684 228 : first_arg = false;
5685 : else
5686 282 : appendStringInfoString(&str, sepstr);
5687 :
5688 : /* call the appropriate type output function, append the result */
5689 510 : appendStringInfoString(&str,
5690 510 : OutputFunctionCall(&foutcache[i], value));
5691 : }
5692 : }
5693 :
5694 234 : result = cstring_to_text_with_len(str.data, str.len);
5695 234 : pfree(str.data);
5696 :
5697 234 : return result;
5698 : }
5699 :
5700 : /*
5701 : * Concatenate all arguments. NULL arguments are ignored.
5702 : */
5703 : Datum
5704 186 : text_concat(PG_FUNCTION_ARGS)
5705 : {
5706 : text *result;
5707 :
5708 186 : result = concat_internal("", 0, fcinfo);
5709 186 : if (result == NULL)
5710 6 : PG_RETURN_NULL();
5711 180 : PG_RETURN_TEXT_P(result);
5712 : }
5713 :
5714 : /*
5715 : * Concatenate all but first argument value with separators. The first
5716 : * parameter is used as the separator. NULL arguments are ignored.
5717 : */
5718 : Datum
5719 84 : text_concat_ws(PG_FUNCTION_ARGS)
5720 : {
5721 : char *sep;
5722 : text *result;
5723 :
5724 : /* return NULL when separator is NULL */
5725 84 : if (PG_ARGISNULL(0))
5726 6 : PG_RETURN_NULL();
5727 78 : sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5728 :
5729 78 : result = concat_internal(sep, 1, fcinfo);
5730 78 : if (result == NULL)
5731 6 : PG_RETURN_NULL();
5732 72 : PG_RETURN_TEXT_P(result);
5733 : }
5734 :
5735 : /*
5736 : * Return first n characters in the string. When n is negative,
5737 : * return all but last |n| characters.
5738 : */
5739 : Datum
5740 2148 : text_left(PG_FUNCTION_ARGS)
5741 : {
5742 2148 : int n = PG_GETARG_INT32(1);
5743 :
5744 2148 : if (n < 0)
5745 : {
5746 30 : text *str = PG_GETARG_TEXT_PP(0);
5747 30 : const char *p = VARDATA_ANY(str);
5748 30 : int len = VARSIZE_ANY_EXHDR(str);
5749 : int rlen;
5750 :
5751 30 : n = pg_mbstrlen_with_len(p, len) + n;
5752 30 : rlen = pg_mbcharcliplen(p, len, n);
5753 30 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5754 : }
5755 : else
5756 2118 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5757 : }
5758 :
5759 : /*
5760 : * Return last n characters in the string. When n is negative,
5761 : * return all but first |n| characters.
5762 : */
5763 : Datum
5764 66 : text_right(PG_FUNCTION_ARGS)
5765 : {
5766 66 : text *str = PG_GETARG_TEXT_PP(0);
5767 66 : const char *p = VARDATA_ANY(str);
5768 66 : int len = VARSIZE_ANY_EXHDR(str);
5769 66 : int n = PG_GETARG_INT32(1);
5770 : int off;
5771 :
5772 66 : if (n < 0)
5773 30 : n = -n;
5774 : else
5775 36 : n = pg_mbstrlen_with_len(p, len) - n;
5776 66 : off = pg_mbcharcliplen(p, len, n);
5777 :
5778 66 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5779 : }
5780 :
5781 : /*
5782 : * Return reversed string
5783 : */
5784 : Datum
5785 6 : text_reverse(PG_FUNCTION_ARGS)
5786 : {
5787 6 : text *str = PG_GETARG_TEXT_PP(0);
5788 6 : const char *p = VARDATA_ANY(str);
5789 6 : int len = VARSIZE_ANY_EXHDR(str);
5790 6 : const char *endp = p + len;
5791 : text *result;
5792 : char *dst;
5793 :
5794 6 : result = palloc(len + VARHDRSZ);
5795 6 : dst = (char *) VARDATA(result) + len;
5796 6 : SET_VARSIZE(result, len + VARHDRSZ);
5797 :
5798 6 : if (pg_database_encoding_max_length() > 1)
5799 : {
5800 : /* multibyte version */
5801 36 : while (p < endp)
5802 : {
5803 : int sz;
5804 :
5805 30 : sz = pg_mblen(p);
5806 30 : dst -= sz;
5807 30 : memcpy(dst, p, sz);
5808 30 : p += sz;
5809 : }
5810 : }
5811 : else
5812 : {
5813 : /* single byte version */
5814 0 : while (p < endp)
5815 0 : *(--dst) = *p++;
5816 : }
5817 :
5818 6 : PG_RETURN_TEXT_P(result);
5819 : }
5820 :
5821 :
/*
 * Support macros for text_format()
 */

/* Bit in the format "flags" word: set when the minus flag is present. */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Step "ptr" forward by one and raise an "unterminated format() type
 * specifier" error if that leaves it at or beyond "end_ptr".
 *
 * "ptr" is modified in place (pre-increment), so it must be an lvalue.
 * The body is wrapped in do { ... } while (0) so that the macro can be used
 * as a single statement.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5835 :
5836 : /*
5837 : * Returns a formatted string
5838 : */
5839 : Datum
5840 33090 : text_format(PG_FUNCTION_ARGS)
5841 : {
5842 : text *fmt;
5843 : StringInfoData str;
5844 : const char *cp;
5845 : const char *start_ptr;
5846 : const char *end_ptr;
5847 : text *result;
5848 : int arg;
5849 : bool funcvariadic;
5850 : int nargs;
5851 33090 : Datum *elements = NULL;
5852 33090 : bool *nulls = NULL;
5853 33090 : Oid element_type = InvalidOid;
5854 33090 : Oid prev_type = InvalidOid;
5855 33090 : Oid prev_width_type = InvalidOid;
5856 : FmgrInfo typoutputfinfo;
5857 : FmgrInfo typoutputinfo_width;
5858 :
5859 : /* When format string is null, immediately return null */
5860 33090 : if (PG_ARGISNULL(0))
5861 6 : PG_RETURN_NULL();
5862 :
5863 : /* If argument is marked VARIADIC, expand array into elements */
5864 33084 : if (get_fn_expr_variadic(fcinfo->flinfo))
5865 : {
5866 : ArrayType *arr;
5867 : int16 elmlen;
5868 : bool elmbyval;
5869 : char elmalign;
5870 : int nitems;
5871 :
5872 : /* Should have just the one argument */
5873 : Assert(PG_NARGS() == 2);
5874 :
5875 : /* If argument is NULL, we treat it as zero-length array */
5876 48 : if (PG_ARGISNULL(1))
5877 6 : nitems = 0;
5878 : else
5879 : {
5880 : /*
5881 : * Non-null argument had better be an array. We assume that any
5882 : * call context that could let get_fn_expr_variadic return true
5883 : * will have checked that a VARIADIC-labeled parameter actually is
5884 : * an array. So it should be okay to just Assert that it's an
5885 : * array rather than doing a full-fledged error check.
5886 : */
5887 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5888 :
5889 : /* OK, safe to fetch the array value */
5890 42 : arr = PG_GETARG_ARRAYTYPE_P(1);
5891 :
5892 : /* Get info about array element type */
5893 42 : element_type = ARR_ELEMTYPE(arr);
5894 42 : get_typlenbyvalalign(element_type,
5895 : &elmlen, &elmbyval, &elmalign);
5896 :
5897 : /* Extract all array elements */
5898 42 : deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5899 : &elements, &nulls, &nitems);
5900 : }
5901 :
5902 48 : nargs = nitems + 1;
5903 48 : funcvariadic = true;
5904 : }
5905 : else
5906 : {
5907 : /* Non-variadic case, we'll process the arguments individually */
5908 33036 : nargs = PG_NARGS();
5909 33036 : funcvariadic = false;
5910 : }
5911 :
5912 : /* Setup for main loop. */
5913 33084 : fmt = PG_GETARG_TEXT_PP(0);
5914 33084 : start_ptr = VARDATA_ANY(fmt);
5915 33084 : end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5916 33084 : initStringInfo(&str);
5917 33084 : arg = 1; /* next argument position to print */
5918 :
5919 : /* Scan format string, looking for conversion specifiers. */
5920 1011050 : for (cp = start_ptr; cp < end_ptr; cp++)
5921 : {
5922 : int argpos;
5923 : int widthpos;
5924 : int flags;
5925 : int width;
5926 : Datum value;
5927 : bool isNull;
5928 : Oid typid;
5929 :
5930 : /*
5931 : * If it's not the start of a conversion specifier, just copy it to
5932 : * the output buffer.
5933 : */
5934 978026 : if (*cp != '%')
5935 : {
5936 912284 : appendStringInfoCharMacro(&str, *cp);
5937 912302 : continue;
5938 : }
5939 :
5940 65742 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5941 :
5942 : /* Easy case: %% outputs a single % */
5943 65742 : if (*cp == '%')
5944 : {
5945 18 : appendStringInfoCharMacro(&str, *cp);
5946 18 : continue;
5947 : }
5948 :
5949 : /* Parse the optional portions of the format specifier */
5950 65724 : cp = text_format_parse_format(cp, end_ptr,
5951 : &argpos, &widthpos,
5952 : &flags, &width);
5953 :
5954 : /*
5955 : * Next we should see the main conversion specifier. Whether or not
5956 : * an argument position was present, it's known that at least one
5957 : * character remains in the string at this point. Experience suggests
5958 : * that it's worth checking that that character is one of the expected
5959 : * ones before we try to fetch arguments, so as to produce the least
5960 : * confusing response to a mis-formatted specifier.
5961 : */
5962 65700 : if (strchr("sIL", *cp) == NULL)
5963 6 : ereport(ERROR,
5964 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5965 : errmsg("unrecognized format() type specifier \"%.*s\"",
5966 : pg_mblen(cp), cp),
5967 : errhint("For a single \"%%\" use \"%%%%\".")));
5968 :
5969 : /* If indirect width was specified, get its value */
5970 65694 : if (widthpos >= 0)
5971 : {
5972 : /* Collect the specified or next argument position */
5973 42 : if (widthpos > 0)
5974 36 : arg = widthpos;
5975 42 : if (arg >= nargs)
5976 0 : ereport(ERROR,
5977 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5978 : errmsg("too few arguments for format()")));
5979 :
5980 : /* Get the value and type of the selected argument */
5981 42 : if (!funcvariadic)
5982 : {
5983 42 : value = PG_GETARG_DATUM(arg);
5984 42 : isNull = PG_ARGISNULL(arg);
5985 42 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5986 : }
5987 : else
5988 : {
5989 0 : value = elements[arg - 1];
5990 0 : isNull = nulls[arg - 1];
5991 0 : typid = element_type;
5992 : }
5993 42 : if (!OidIsValid(typid))
5994 0 : elog(ERROR, "could not determine data type of format() input");
5995 :
5996 42 : arg++;
5997 :
5998 : /* We can treat NULL width the same as zero */
5999 42 : if (isNull)
6000 6 : width = 0;
6001 36 : else if (typid == INT4OID)
6002 36 : width = DatumGetInt32(value);
6003 0 : else if (typid == INT2OID)
6004 0 : width = DatumGetInt16(value);
6005 : else
6006 : {
6007 : /* For less-usual datatypes, convert to text then to int */
6008 : char *str;
6009 :
6010 0 : if (typid != prev_width_type)
6011 : {
6012 : Oid typoutputfunc;
6013 : bool typIsVarlena;
6014 :
6015 0 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
6016 0 : fmgr_info(typoutputfunc, &typoutputinfo_width);
6017 0 : prev_width_type = typid;
6018 : }
6019 :
6020 0 : str = OutputFunctionCall(&typoutputinfo_width, value);
6021 :
6022 : /* pg_strtoint32 will complain about bad data or overflow */
6023 0 : width = pg_strtoint32(str);
6024 :
6025 0 : pfree(str);
6026 : }
6027 : }
6028 :
6029 : /* Collect the specified or next argument position */
6030 65694 : if (argpos > 0)
6031 132 : arg = argpos;
6032 65694 : if (arg >= nargs)
6033 24 : ereport(ERROR,
6034 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6035 : errmsg("too few arguments for format()")));
6036 :
6037 : /* Get the value and type of the selected argument */
6038 65670 : if (!funcvariadic)
6039 : {
6040 64398 : value = PG_GETARG_DATUM(arg);
6041 64398 : isNull = PG_ARGISNULL(arg);
6042 64398 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
6043 : }
6044 : else
6045 : {
6046 1272 : value = elements[arg - 1];
6047 1272 : isNull = nulls[arg - 1];
6048 1272 : typid = element_type;
6049 : }
6050 65670 : if (!OidIsValid(typid))
6051 0 : elog(ERROR, "could not determine data type of format() input");
6052 :
6053 65670 : arg++;
6054 :
6055 : /*
6056 : * Get the appropriate typOutput function, reusing previous one if
6057 : * same type as previous argument. That's particularly useful in the
6058 : * variadic-array case, but often saves work even for ordinary calls.
6059 : */
6060 65670 : if (typid != prev_type)
6061 : {
6062 : Oid typoutputfunc;
6063 : bool typIsVarlena;
6064 :
6065 34152 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
6066 34152 : fmgr_info(typoutputfunc, &typoutputfinfo);
6067 34152 : prev_type = typid;
6068 : }
6069 :
6070 : /*
6071 : * And now we can format the value.
6072 : */
6073 65670 : switch (*cp)
6074 : {
6075 65670 : case 's':
6076 : case 'I':
6077 : case 'L':
6078 65670 : text_format_string_conversion(&str, *cp, &typoutputfinfo,
6079 : value, isNull,
6080 : flags, width);
6081 65664 : break;
6082 0 : default:
6083 : /* should not get here, because of previous check */
6084 0 : ereport(ERROR,
6085 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6086 : errmsg("unrecognized format() type specifier \"%.*s\"",
6087 : pg_mblen(cp), cp),
6088 : errhint("For a single \"%%\" use \"%%%%\".")));
6089 : break;
6090 : }
6091 : }
6092 :
6093 : /* Don't need deconstruct_array results anymore. */
6094 33024 : if (elements != NULL)
6095 42 : pfree(elements);
6096 33024 : if (nulls != NULL)
6097 42 : pfree(nulls);
6098 :
6099 : /* Generate results. */
6100 33024 : result = cstring_to_text_with_len(str.data, str.len);
6101 33024 : pfree(str.data);
6102 :
6103 33024 : PG_RETURN_TEXT_P(result);
6104 : }
6105 :
6106 : /*
6107 : * Parse contiguous digits as a decimal number.
6108 : *
6109 : * Returns true if some digits could be parsed.
6110 : * The value is returned into *value, and *ptr is advanced to the next
6111 : * character to be parsed.
6112 : *
6113 : * Note parsing invariant: at least one character is known available before
6114 : * string end (end_ptr) at entry, and this is still true at exit.
6115 : */
6116 : static bool
6117 131412 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
6118 : {
6119 131412 : bool found = false;
6120 131412 : const char *cp = *ptr;
6121 131412 : int val = 0;
6122 :
6123 131724 : while (*cp >= '0' && *cp <= '9')
6124 : {
6125 318 : int8 digit = (*cp - '0');
6126 :
6127 318 : if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
6128 318 : unlikely(pg_add_s32_overflow(val, digit, &val)))
6129 0 : ereport(ERROR,
6130 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6131 : errmsg("number is out of range")));
6132 318 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6133 312 : found = true;
6134 : }
6135 :
6136 131406 : *ptr = cp;
6137 131406 : *value = val;
6138 :
6139 131406 : return found;
6140 : }
6141 :
6142 : /*
6143 : * Parse a format specifier (generally following the SUS printf spec).
6144 : *
6145 : * We have already advanced over the initial '%', and we are looking for
6146 : * [argpos][flags][width]type (but the type character is not consumed here).
6147 : *
6148 : * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
6149 : * Output parameters:
6150 : * argpos: argument position for value to be printed. -1 means unspecified.
6151 : * widthpos: argument position for width. Zero means the argument position
6152 : * was unspecified (ie, take the next arg) and -1 means no width
6153 : * argument (width was omitted or specified as a constant).
6154 : * flags: bitmask of flags.
6155 : * width: directly-specified width value. Zero means the width was omitted
6156 : * (note it's not necessary to distinguish this case from an explicit
6157 : * zero width value).
6158 : *
6159 : * The function result is the next character position to be parsed, ie, the
6160 : * location where the type character is/should be.
6161 : *
6162 : * Note parsing invariant: at least one character is known available before
6163 : * string end (end_ptr) at entry, and this is still true at exit.
6164 : */
6165 : static const char *
6166 65724 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
6167 : int *argpos, int *widthpos,
6168 : int *flags, int *width)
6169 : {
6170 65724 : const char *cp = start_ptr;
6171 : int n;
6172 :
6173 : /* set defaults for output parameters */
6174 65724 : *argpos = -1;
6175 65724 : *widthpos = -1;
6176 65724 : *flags = 0;
6177 65724 : *width = 0;
6178 :
6179 : /* try to identify first number */
6180 65724 : if (text_format_parse_digits(&cp, end_ptr, &n))
6181 : {
6182 174 : if (*cp != '$')
6183 : {
6184 : /* Must be just a width and a type, so we're done */
6185 24 : *width = n;
6186 24 : return cp;
6187 : }
6188 : /* The number was argument position */
6189 150 : *argpos = n;
6190 : /* Explicit 0 for argument index is immediately refused */
6191 150 : if (n == 0)
6192 6 : ereport(ERROR,
6193 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6194 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
6195 144 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6196 : }
6197 :
6198 : /* Handle flags (only minus is supported now) */
6199 65718 : while (*cp == '-')
6200 : {
6201 30 : *flags |= TEXT_FORMAT_FLAG_MINUS;
6202 30 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6203 : }
6204 :
6205 65688 : if (*cp == '*')
6206 : {
6207 : /* Handle indirect width */
6208 48 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6209 48 : if (text_format_parse_digits(&cp, end_ptr, &n))
6210 : {
6211 : /* number in this position must be closed by $ */
6212 42 : if (*cp != '$')
6213 0 : ereport(ERROR,
6214 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6215 : errmsg("width argument position must be ended by \"$\"")));
6216 : /* The number was width argument position */
6217 42 : *widthpos = n;
6218 : /* Explicit 0 for argument index is immediately refused */
6219 42 : if (n == 0)
6220 6 : ereport(ERROR,
6221 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6222 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
6223 36 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6224 : }
6225 : else
6226 6 : *widthpos = 0; /* width's argument position is unspecified */
6227 : }
6228 : else
6229 : {
6230 : /* Check for direct width specification */
6231 65640 : if (text_format_parse_digits(&cp, end_ptr, &n))
6232 30 : *width = n;
6233 : }
6234 :
6235 : /* cp should now be pointing at type character */
6236 65676 : return cp;
6237 : }
6238 :
6239 : /*
6240 : * Format a %s, %I, or %L conversion
6241 : */
6242 : static void
6243 65670 : text_format_string_conversion(StringInfo buf, char conversion,
6244 : FmgrInfo *typOutputInfo,
6245 : Datum value, bool isNull,
6246 : int flags, int width)
6247 : {
6248 : char *str;
6249 :
6250 : /* Handle NULL arguments before trying to stringify the value. */
6251 65670 : if (isNull)
6252 : {
6253 342 : if (conversion == 's')
6254 270 : text_format_append_string(buf, "", flags, width);
6255 72 : else if (conversion == 'L')
6256 66 : text_format_append_string(buf, "NULL", flags, width);
6257 6 : else if (conversion == 'I')
6258 6 : ereport(ERROR,
6259 : (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6260 : errmsg("null values cannot be formatted as an SQL identifier")));
6261 336 : return;
6262 : }
6263 :
6264 : /* Stringify. */
6265 65328 : str = OutputFunctionCall(typOutputInfo, value);
6266 :
6267 : /* Escape. */
6268 65328 : if (conversion == 'I')
6269 : {
6270 : /* quote_identifier may or may not allocate a new string. */
6271 4896 : text_format_append_string(buf, quote_identifier(str), flags, width);
6272 : }
6273 60432 : else if (conversion == 'L')
6274 : {
6275 3232 : char *qstr = quote_literal_cstr(str);
6276 :
6277 3232 : text_format_append_string(buf, qstr, flags, width);
6278 : /* quote_literal_cstr() always allocates a new string */
6279 3232 : pfree(qstr);
6280 : }
6281 : else
6282 57200 : text_format_append_string(buf, str, flags, width);
6283 :
6284 : /* Cleanup. */
6285 65328 : pfree(str);
6286 : }
6287 :
6288 : /*
6289 : * Append str to buf, padding as directed by flags/width
6290 : */
6291 : static void
6292 65664 : text_format_append_string(StringInfo buf, const char *str,
6293 : int flags, int width)
6294 : {
6295 65664 : bool align_to_left = false;
6296 : int len;
6297 :
6298 : /* fast path for typical easy case */
6299 65664 : if (width == 0)
6300 : {
6301 65580 : appendStringInfoString(buf, str);
6302 65580 : return;
6303 : }
6304 :
6305 84 : if (width < 0)
6306 : {
6307 : /* Negative width: implicit '-' flag, then take absolute value */
6308 6 : align_to_left = true;
6309 : /* -INT_MIN is undefined */
6310 6 : if (width <= INT_MIN)
6311 0 : ereport(ERROR,
6312 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6313 : errmsg("number is out of range")));
6314 6 : width = -width;
6315 : }
6316 78 : else if (flags & TEXT_FORMAT_FLAG_MINUS)
6317 24 : align_to_left = true;
6318 :
6319 84 : len = pg_mbstrlen(str);
6320 84 : if (align_to_left)
6321 : {
6322 : /* left justify */
6323 30 : appendStringInfoString(buf, str);
6324 30 : if (len < width)
6325 30 : appendStringInfoSpaces(buf, width - len);
6326 : }
6327 : else
6328 : {
6329 : /* right justify */
6330 54 : if (len < width)
6331 54 : appendStringInfoSpaces(buf, width - len);
6332 54 : appendStringInfoString(buf, str);
6333 : }
6334 : }
6335 :
6336 : /*
6337 : * text_format_nv - nonvariadic wrapper for text_format function.
6338 : *
6339 : * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6340 : * which checks that all built-in functions that share the implementing C
6341 : * function take the same number of arguments.
6342 : */
6343 : Datum
6344 3810 : text_format_nv(PG_FUNCTION_ARGS)
6345 : {
6346 3810 : return text_format(fcinfo);
6347 : }
6348 :
/*
 * Helper function for Levenshtein distance functions: report whether the
 * first len bytes of s1 and s2 are identical.  Bytes are compared from the
 * last one backwards.  Faster than memcmp(), for this use case.
 *
 * A len of zero or less compares nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6364 :
6365 : /* Expand each Levenshtein distance variant */
6366 : #include "levenshtein.c"
6367 : #define LEVENSHTEIN_LESS_EQUAL
6368 : #include "levenshtein.c"
6369 :
6370 :
6371 : /*
6372 : * The following *ClosestMatch() functions can be used to determine whether a
6373 : * user-provided string resembles any known valid values, which is useful for
6374 : * providing hints in log messages, among other things. Use these functions
6375 : * like so:
6376 : *
6377 : * initClosestMatch(&state, source_string, max_distance);
6378 : *
6379 : * for (int i = 0; i < num_valid_strings; i++)
6380 : * updateClosestMatch(&state, valid_strings[i]);
6381 : *
6382 : * closestMatch = getClosestMatch(&state);
6383 : */
6384 :
6385 : /*
6386 : * Initialize the given state with the source string and maximum Levenshtein
6387 : * distance to consider.
6388 : */
6389 : void
6390 60 : initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
6391 : {
6392 : Assert(state);
6393 : Assert(max_d >= 0);
6394 :
6395 60 : state->source = source;
6396 60 : state->min_d = -1;
6397 60 : state->max_d = max_d;
6398 60 : state->match = NULL;
6399 60 : }
6400 :
6401 : /*
6402 : * If the candidate string is a closer match than the current one saved (or
6403 : * there is no match saved), save it as the closest match.
6404 : *
6405 : * If the source or candidate string is NULL, empty, or too long, this function
6406 : * takes no action. Likewise, if the Levenshtein distance exceeds the maximum
6407 : * allowed or more than half the characters are different, no action is taken.
6408 : */
6409 : void
6410 372 : updateClosestMatch(ClosestMatchState *state, const char *candidate)
6411 : {
6412 : int dist;
6413 :
6414 : Assert(state);
6415 :
6416 372 : if (state->source == NULL || state->source[0] == '\0' ||
6417 372 : candidate == NULL || candidate[0] == '\0')
6418 0 : return;
6419 :
6420 : /*
6421 : * To avoid ERROR-ing, we check the lengths here instead of setting
6422 : * 'trusted' to false in the call to varstr_levenshtein_less_equal().
6423 : */
6424 372 : if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
6425 372 : strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
6426 0 : return;
6427 :
6428 372 : dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
6429 372 : candidate, strlen(candidate), 1, 1, 1,
6430 : state->max_d, true);
6431 372 : if (dist <= state->max_d &&
6432 56 : dist <= strlen(state->source) / 2 &&
6433 14 : (state->min_d == -1 || dist < state->min_d))
6434 : {
6435 14 : state->min_d = dist;
6436 14 : state->match = candidate;
6437 : }
6438 : }
6439 :
6440 : /*
6441 : * Return the closest match. If no suitable candidates were provided via
6442 : * updateClosestMatch(), return NULL.
6443 : */
6444 : const char *
6445 60 : getClosestMatch(ClosestMatchState *state)
6446 : {
6447 : Assert(state);
6448 :
6449 60 : return state->match;
6450 : }
6451 :
6452 :
6453 : /*
6454 : * Unicode support
6455 : */
6456 :
6457 : static UnicodeNormalizationForm
6458 210 : unicode_norm_form_from_string(const char *formstr)
6459 : {
6460 210 : UnicodeNormalizationForm form = -1;
6461 :
6462 : /*
6463 : * Might as well check this while we're here.
6464 : */
6465 210 : if (GetDatabaseEncoding() != PG_UTF8)
6466 0 : ereport(ERROR,
6467 : (errcode(ERRCODE_SYNTAX_ERROR),
6468 : errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6469 :
6470 210 : if (pg_strcasecmp(formstr, "NFC") == 0)
6471 66 : form = UNICODE_NFC;
6472 144 : else if (pg_strcasecmp(formstr, "NFD") == 0)
6473 60 : form = UNICODE_NFD;
6474 84 : else if (pg_strcasecmp(formstr, "NFKC") == 0)
6475 36 : form = UNICODE_NFKC;
6476 48 : else if (pg_strcasecmp(formstr, "NFKD") == 0)
6477 36 : form = UNICODE_NFKD;
6478 : else
6479 12 : ereport(ERROR,
6480 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6481 : errmsg("invalid normalization form: %s", formstr)));
6482 :
6483 198 : return form;
6484 : }
6485 :
6486 : /*
6487 : * Returns version of Unicode used by Postgres in "major.minor" format (the
6488 : * same format as the Unicode version reported by ICU). The third component
6489 : * ("update version") never involves additions to the character repertoire and
6490 : * is unimportant for most purposes.
6491 : *
6492 : * See: https://unicode.org/versions/
6493 : */
6494 : Datum
6495 6 : unicode_version(PG_FUNCTION_ARGS)
6496 : {
6497 6 : PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
6498 : }
6499 :
6500 : /*
6501 : * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
6502 : */
6503 : Datum
6504 2 : icu_unicode_version(PG_FUNCTION_ARGS)
6505 : {
6506 : #ifdef USE_ICU
6507 2 : PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
6508 : #else
6509 : PG_RETURN_NULL();
6510 : #endif
6511 : }
6512 :
6513 : /*
6514 : * Check whether the string contains only assigned Unicode code
6515 : * points. Requires that the database encoding is UTF-8.
6516 : */
6517 : Datum
6518 12 : unicode_assigned(PG_FUNCTION_ARGS)
6519 : {
6520 12 : text *input = PG_GETARG_TEXT_PP(0);
6521 : unsigned char *p;
6522 : int size;
6523 :
6524 12 : if (GetDatabaseEncoding() != PG_UTF8)
6525 0 : ereport(ERROR,
6526 : (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
6527 :
6528 : /* convert to pg_wchar */
6529 12 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6530 12 : p = (unsigned char *) VARDATA_ANY(input);
6531 48 : for (int i = 0; i < size; i++)
6532 : {
6533 42 : pg_wchar uchar = utf8_to_unicode(p);
6534 42 : int category = unicode_category(uchar);
6535 :
6536 42 : if (category == PG_U_UNASSIGNED)
6537 6 : PG_RETURN_BOOL(false);
6538 :
6539 36 : p += pg_utf_mblen(p);
6540 : }
6541 :
6542 6 : PG_RETURN_BOOL(true);
6543 : }
6544 :
6545 : Datum
6546 72 : unicode_normalize_func(PG_FUNCTION_ARGS)
6547 : {
6548 72 : text *input = PG_GETARG_TEXT_PP(0);
6549 72 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6550 : UnicodeNormalizationForm form;
6551 : int size;
6552 : pg_wchar *input_chars;
6553 : pg_wchar *output_chars;
6554 : unsigned char *p;
6555 : text *result;
6556 : int i;
6557 :
6558 72 : form = unicode_norm_form_from_string(formstr);
6559 :
6560 : /* convert to pg_wchar */
6561 66 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6562 66 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6563 66 : p = (unsigned char *) VARDATA_ANY(input);
6564 288 : for (i = 0; i < size; i++)
6565 : {
6566 222 : input_chars[i] = utf8_to_unicode(p);
6567 222 : p += pg_utf_mblen(p);
6568 : }
6569 66 : input_chars[i] = (pg_wchar) '\0';
6570 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6571 :
6572 : /* action */
6573 66 : output_chars = unicode_normalize(form, input_chars);
6574 :
6575 : /* convert back to UTF-8 string */
6576 66 : size = 0;
6577 306 : for (pg_wchar *wp = output_chars; *wp; wp++)
6578 : {
6579 : unsigned char buf[4];
6580 :
6581 240 : unicode_to_utf8(*wp, buf);
6582 240 : size += pg_utf_mblen(buf);
6583 : }
6584 :
6585 66 : result = palloc(size + VARHDRSZ);
6586 66 : SET_VARSIZE(result, size + VARHDRSZ);
6587 :
6588 66 : p = (unsigned char *) VARDATA_ANY(result);
6589 306 : for (pg_wchar *wp = output_chars; *wp; wp++)
6590 : {
6591 240 : unicode_to_utf8(*wp, p);
6592 240 : p += pg_utf_mblen(p);
6593 : }
6594 : Assert((char *) p == (char *) result + size + VARHDRSZ);
6595 :
6596 66 : PG_RETURN_TEXT_P(result);
6597 : }
6598 :
6599 : /*
6600 : * Check whether the string is in the specified Unicode normalization form.
6601 : *
6602 : * This is done by converting the string to the specified normal form and then
6603 : * comparing that to the original string. To speed that up, we also apply the
6604 : * "quick check" algorithm specified in UAX #15, which can give a yes or no
6605 : * answer for many strings by just scanning the string once.
6606 : *
6607 : * This function should generally be optimized for the case where the string
6608 : * is in fact normalized. In that case, we'll end up looking at the entire
6609 : * string, so it's probably not worth doing any incremental conversion etc.
6610 : */
6611 : Datum
6612 138 : unicode_is_normalized(PG_FUNCTION_ARGS)
6613 : {
6614 138 : text *input = PG_GETARG_TEXT_PP(0);
6615 138 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6616 : UnicodeNormalizationForm form;
6617 : int size;
6618 : pg_wchar *input_chars;
6619 : pg_wchar *output_chars;
6620 : unsigned char *p;
6621 : int i;
6622 : UnicodeNormalizationQC quickcheck;
6623 : int output_size;
6624 : bool result;
6625 :
6626 138 : form = unicode_norm_form_from_string(formstr);
6627 :
6628 : /* convert to pg_wchar */
6629 132 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6630 132 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6631 132 : p = (unsigned char *) VARDATA_ANY(input);
6632 504 : for (i = 0; i < size; i++)
6633 : {
6634 372 : input_chars[i] = utf8_to_unicode(p);
6635 372 : p += pg_utf_mblen(p);
6636 : }
6637 132 : input_chars[i] = (pg_wchar) '\0';
6638 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6639 :
6640 : /* quick check (see UAX #15) */
6641 132 : quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6642 132 : if (quickcheck == UNICODE_NORM_QC_YES)
6643 42 : PG_RETURN_BOOL(true);
6644 90 : else if (quickcheck == UNICODE_NORM_QC_NO)
6645 12 : PG_RETURN_BOOL(false);
6646 :
6647 : /* normalize and compare with original */
6648 78 : output_chars = unicode_normalize(form, input_chars);
6649 :
6650 78 : output_size = 0;
6651 324 : for (pg_wchar *wp = output_chars; *wp; wp++)
6652 246 : output_size++;
6653 :
6654 114 : result = (size == output_size) &&
6655 36 : (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6656 :
6657 78 : PG_RETURN_BOOL(result);
6658 : }
6659 :
/*
 * Report whether the first n bytes at instr are all hexadecimal digits.
 *
 * Scanning stops at the first byte that is not a hex digit, so bytes beyond
 * it are never examined.  With n == 0 the answer is true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	while (n-- > 0)
	{
		/* cast keeps isxdigit() away from negative char values */
		if (!isxdigit((unsigned char) *instr++))
			return false;
	}

	return true;
}
6672 :
6673 : static unsigned int
6674 504 : hexval(unsigned char c)
6675 : {
6676 504 : if (c >= '0' && c <= '9')
6677 384 : return c - '0';
6678 120 : if (c >= 'a' && c <= 'f')
6679 60 : return c - 'a' + 0xA;
6680 60 : if (c >= 'A' && c <= 'F')
6681 60 : return c - 'A' + 0xA;
6682 0 : elog(ERROR, "invalid hexadecimal digit");
6683 : return 0; /* not reached */
6684 : }
6685 :
/*
 * Translate the first n hexadecimal digits at instr into a number.
 *
 * The first digit is the most significant one: digit i contributes its value
 * shifted left by 4 * (n - i - 1) bits.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int result = 0;
	size_t		shift = 4 * n;	/* bit position just above the next digit */

	for (const char *p = instr; shift > 0; p++)
	{
		shift -= 4;
		result += hexval(*p) << shift;
	}

	return result;
}
6699 :
6700 : /*
6701 : * Replaces Unicode escape sequences by Unicode characters
6702 : */
6703 : Datum
6704 66 : unistr(PG_FUNCTION_ARGS)
6705 : {
6706 66 : text *input_text = PG_GETARG_TEXT_PP(0);
6707 : char *instr;
6708 : int len;
6709 : StringInfoData str;
6710 : text *result;
6711 66 : pg_wchar pair_first = 0;
6712 : char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6713 :
6714 66 : instr = VARDATA_ANY(input_text);
6715 66 : len = VARSIZE_ANY_EXHDR(input_text);
6716 :
6717 66 : initStringInfo(&str);
6718 :
6719 510 : while (len > 0)
6720 : {
6721 486 : if (instr[0] == '\\')
6722 : {
6723 102 : if (len >= 2 &&
6724 102 : instr[1] == '\\')
6725 : {
6726 6 : if (pair_first)
6727 0 : goto invalid_pair;
6728 6 : appendStringInfoChar(&str, '\\');
6729 6 : instr += 2;
6730 6 : len -= 2;
6731 : }
6732 96 : else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6733 66 : (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6734 30 : {
6735 : pg_wchar unicode;
6736 42 : int offset = instr[1] == 'u' ? 2 : 1;
6737 :
6738 42 : unicode = hexval_n(instr + offset, 4);
6739 :
6740 42 : if (!is_valid_unicode_codepoint(unicode))
6741 0 : ereport(ERROR,
6742 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6743 : errmsg("invalid Unicode code point: %04X", unicode));
6744 :
6745 42 : if (pair_first)
6746 : {
6747 12 : if (is_utf16_surrogate_second(unicode))
6748 : {
6749 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6750 0 : pair_first = 0;
6751 : }
6752 : else
6753 12 : goto invalid_pair;
6754 : }
6755 30 : else if (is_utf16_surrogate_second(unicode))
6756 0 : goto invalid_pair;
6757 :
6758 30 : if (is_utf16_surrogate_first(unicode))
6759 18 : pair_first = unicode;
6760 : else
6761 : {
6762 12 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6763 12 : appendStringInfoString(&str, cbuf);
6764 : }
6765 :
6766 30 : instr += 4 + offset;
6767 30 : len -= 4 + offset;
6768 : }
6769 54 : else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6770 12 : {
6771 : pg_wchar unicode;
6772 :
6773 24 : unicode = hexval_n(instr + 2, 6);
6774 :
6775 24 : if (!is_valid_unicode_codepoint(unicode))
6776 6 : ereport(ERROR,
6777 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6778 : errmsg("invalid Unicode code point: %04X", unicode));
6779 :
6780 18 : if (pair_first)
6781 : {
6782 6 : if (is_utf16_surrogate_second(unicode))
6783 : {
6784 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6785 0 : pair_first = 0;
6786 : }
6787 : else
6788 6 : goto invalid_pair;
6789 : }
6790 12 : else if (is_utf16_surrogate_second(unicode))
6791 0 : goto invalid_pair;
6792 :
6793 12 : if (is_utf16_surrogate_first(unicode))
6794 6 : pair_first = unicode;
6795 : else
6796 : {
6797 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6798 6 : appendStringInfoString(&str, cbuf);
6799 : }
6800 :
6801 12 : instr += 8;
6802 12 : len -= 8;
6803 : }
6804 30 : else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6805 12 : {
6806 : pg_wchar unicode;
6807 :
6808 24 : unicode = hexval_n(instr + 2, 8);
6809 :
6810 24 : if (!is_valid_unicode_codepoint(unicode))
6811 6 : ereport(ERROR,
6812 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6813 : errmsg("invalid Unicode code point: %04X", unicode));
6814 :
6815 18 : if (pair_first)
6816 : {
6817 6 : if (is_utf16_surrogate_second(unicode))
6818 : {
6819 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6820 0 : pair_first = 0;
6821 : }
6822 : else
6823 6 : goto invalid_pair;
6824 : }
6825 12 : else if (is_utf16_surrogate_second(unicode))
6826 0 : goto invalid_pair;
6827 :
6828 12 : if (is_utf16_surrogate_first(unicode))
6829 6 : pair_first = unicode;
6830 : else
6831 : {
6832 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6833 6 : appendStringInfoString(&str, cbuf);
6834 : }
6835 :
6836 12 : instr += 10;
6837 12 : len -= 10;
6838 : }
6839 : else
6840 6 : ereport(ERROR,
6841 : (errcode(ERRCODE_SYNTAX_ERROR),
6842 : errmsg("invalid Unicode escape"),
6843 : errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6844 : }
6845 : else
6846 : {
6847 384 : if (pair_first)
6848 0 : goto invalid_pair;
6849 :
6850 384 : appendStringInfoChar(&str, *instr++);
6851 384 : len--;
6852 : }
6853 : }
6854 :
6855 : /* unfinished surrogate pair? */
6856 24 : if (pair_first)
6857 6 : goto invalid_pair;
6858 :
6859 18 : result = cstring_to_text_with_len(str.data, str.len);
6860 18 : pfree(str.data);
6861 :
6862 18 : PG_RETURN_TEXT_P(result);
6863 :
6864 30 : invalid_pair:
6865 30 : ereport(ERROR,
6866 : (errcode(ERRCODE_SYNTAX_ERROR),
6867 : errmsg("invalid Unicode surrogate pair")));
6868 : PG_RETURN_NULL(); /* keep compiler quiet */
6869 : }
|