Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * varlena.c
4 : * Functions for the variable-length built-in types.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/varlena.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include <ctype.h>
18 : #include <limits.h>
19 :
20 : #include "access/detoast.h"
21 : #include "access/toast_compression.h"
22 : #include "catalog/pg_collation.h"
23 : #include "catalog/pg_type.h"
24 : #include "common/hashfn.h"
25 : #include "common/int.h"
26 : #include "common/unicode_category.h"
27 : #include "common/unicode_norm.h"
28 : #include "common/unicode_version.h"
29 : #include "funcapi.h"
30 : #include "lib/hyperloglog.h"
31 : #include "libpq/pqformat.h"
32 : #include "miscadmin.h"
33 : #include "nodes/execnodes.h"
34 : #include "parser/scansup.h"
35 : #include "port/pg_bswap.h"
36 : #include "regex/regex.h"
37 : #include "utils/builtins.h"
38 : #include "utils/bytea.h"
39 : #include "utils/guc.h"
40 : #include "utils/lsyscache.h"
41 : #include "utils/memutils.h"
42 : #include "utils/pg_locale.h"
43 : #include "utils/sortsupport.h"
44 : #include "utils/varlena.h"
45 :
46 :
47 : /* GUC variable: selects the bytea text output format; it starts out as hex */
48 : int bytea_output = BYTEA_OUTPUT_HEX;
49 :
50 : typedef struct varlena VarString; /* generic varlena, the pointer type produced by the DatumGetVarStringP* macros */
51 :
52 : /*
53 : * State for text_position_* functions. text_position_setup() is handed a pointer to one of these, and the _next, _get_match_* and _cleanup routines all take it as their argument.
54 : */
55 : typedef struct
56 : {
57 : pg_locale_t locale; /* collation used for substring matching */
58 : bool is_multibyte_char_in_char; /* need to check char boundaries? */
59 : bool greedy; /* find longest possible substring? */
60 :
61 : char *str1; /* haystack string */
62 : char *str2; /* needle string */
63 : int len1; /* length of str1 (haystack) in bytes */
64 : int len2; /* length of str2 (needle) in bytes */
65 :
66 : /* Skip table for Boyer-Moore-Horspool search algorithm: */
67 : int skiptablemask; /* mask for ANDing with skiptable subscripts */
68 : int skiptable[256]; /* skip distance for given mismatched char */
69 :
70 : /*
71 : * Note that with nondeterministic collations, the length of the last
72 : * match is not necessarily equal to the length of the "needle" passed in.
73 : */
74 : char *last_match; /* pointer to last match in 'str1' */
75 : int last_match_len; /* length of last match */
76 : int last_match_len_tmp; /* same but for internal use */
77 :
78 : /*
79 : * Sometimes we need to convert the byte position of a match to a
80 : * character position. These store the last position that was converted,
81 : * so that on the next call, we can continue from that point, rather than
82 : * count characters from the very beginning.
83 : */
84 : char *refpoint; /* pointer within original haystack string */
85 : int refpos; /* 0-based character offset of the same point */
86 : } TextPositionState;
87 :
88 : typedef struct /* NOTE(review): presumably the private state behind the varstr sort-support routines declared below -- confirm at the point of use */
89 : {
90 : char *buf1; /* 1st string, or abbreviation original string
91 : * buf */
92 : char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
93 : int buflen1; /* Allocated length of buf1 */
94 : int buflen2; /* Allocated length of buf2 */
95 : int last_len1; /* Length of last buf1 string/strxfrm() input */
96 : int last_len2; /* Length of last buf2 string/strxfrm() blob */
97 : int last_returned; /* Last comparison result (cache) */
98 : bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
99 : bool collate_c; /* presumably true when the C collation is in use -- TODO confirm */
100 : Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
101 : hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
102 : hyperLogLogState full_card; /* Full key cardinality state */
103 : double prop_card; /* Required cardinality proportion */
104 : pg_locale_t locale; /* locale object; presumably the one used for locale-aware comparisons -- TODO confirm */
105 : } VarStringSortSupport;
106 :
107 : /*
108 : * Output data for split_text(): we output either to an array or a table.
109 : * tupstore and tupdesc must be set up in advance to output to a table.
110 : */
111 : typedef struct
112 : {
113 : ArrayBuildState *astate; /* array build state, for the output-to-array case */
114 : Tuplestorestate *tupstore; /* tuple store, for the output-to-table case */
115 : TupleDesc tupdesc; /* tuple descriptor, for the output-to-table case */
116 : } SplitTextOutputData;
117 :
118 : /*
119 : * This should be large enough that most strings will fit, but small enough
120 : * that we feel comfortable putting it on the stack
121 : */
122 : #define TEXTBUFLEN 1024
123 :
124 : #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X)) /* detoast X via PG_DETOAST_DATUM and view it as a VarString */
125 : #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X)) /* same, but via PG_DETOAST_DATUM_PACKED */
126 :
127 : static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup); /* forward declarations of this file's static routines start here */
128 : static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
129 : static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
130 : static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
131 : static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
132 : static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
133 : static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
134 : static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
135 : static int32 text_length(Datum str); /* takes a Datum, so the argument may still be toasted */
136 : static text *text_catenate(text *t1, text *t2);
137 : static text *text_substring(Datum str,
138 : int32 start,
139 : int32 length,
140 : bool length_not_specified);
141 : static text *text_overlay(text *t1, text *t2, int sp, int sl);
142 : static int text_position(text *t1, text *t2, Oid collid);
143 : static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state); /* the text_position_* family shares a TextPositionState */
144 : static bool text_position_next(TextPositionState *state);
145 : static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
146 : static char *text_position_get_match_ptr(TextPositionState *state);
147 : static int text_position_get_match_pos(TextPositionState *state);
148 : static void text_position_cleanup(TextPositionState *state);
149 : static void check_collation_set(Oid collid);
150 : static int text_cmp(text *arg1, text *arg2, Oid collid);
151 : static bytea *bytea_catenate(bytea *t1, bytea *t2);
152 : static bytea *bytea_substring(Datum str,
153 : int S,
154 : int L,
155 : bool length_not_specified);
156 : static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
157 : static void appendStringInfoText(StringInfo str, const text *t);
158 : static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate); /* results go through a SplitTextOutputData */
159 : static void split_text_accum_result(SplitTextOutputData *tstate,
160 : text *field_value,
161 : text *null_string,
162 : Oid collation);
163 : static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
164 : const char *fldsep, const char *null_string);
165 : static StringInfo makeStringAggState(FunctionCallInfo fcinfo); /* called by bytea_string_agg_transfn() to create its state on first use */
166 : static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
167 : int *value);
168 : static const char *text_format_parse_format(const char *start_ptr,
169 : const char *end_ptr,
170 : int *argpos, int *widthpos,
171 : int *flags, int *width);
172 : static void text_format_string_conversion(StringInfo buf, char conversion,
173 : FmgrInfo *typOutputInfo,
174 : Datum value, bool isNull,
175 : int flags, int width);
176 : static void text_format_append_string(StringInfo buf, const char *str,
177 : int flags, int width);
178 :
179 :
180 : /*****************************************************************************
181 : * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
182 : *****************************************************************************/
183 :
184 : /*
185 : * cstring_to_text
186 : *
187 : * Create a text value from a null-terminated C string.
188 : *
189 : * The new text value is freshly palloc'd with a full-size VARHDR.
190 : */
191 : text *
192 23171408 : cstring_to_text(const char *s)
193 : {
194 23171408 : return cstring_to_text_with_len(s, strlen(s));
195 : }
196 :
197 : /*
198 : * cstring_to_text_with_len
199 : *
200 : * Same as cstring_to_text except the caller specifies the string length;
201 : * the string need not be null_terminated.
202 : */
203 : text *
204 25813784 : cstring_to_text_with_len(const char *s, int len)
205 : {
206 25813784 : text *result = (text *) palloc(len + VARHDRSZ);
207 :
208 25813784 : SET_VARSIZE(result, len + VARHDRSZ);
209 25813784 : memcpy(VARDATA(result), s, len);
210 :
211 25813784 : return result;
212 : }
213 :
214 : /*
215 : * text_to_cstring
216 : *
217 : * Create a palloc'd, null-terminated C string from a text value.
218 : *
219 : * We support being passed a compressed or toasted text value.
220 : * This is a bit bogus since such values shouldn't really be referred to as
221 : * "text *", but it seems useful for robustness. If we didn't handle that
222 : * case here, we'd need another routine that did, anyway.
223 : */
224 : char *
225 15422016 : text_to_cstring(const text *t)
226 : {
227 : /* must cast away the const, unfortunately */
228 15422016 : text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
229 15422016 : int len = VARSIZE_ANY_EXHDR(tunpacked);
230 : char *result;
231 :
232 15422016 : result = (char *) palloc(len + 1);
233 15422016 : memcpy(result, VARDATA_ANY(tunpacked), len);
234 15422016 : result[len] = '\0';
235 :
236 15422016 : if (tunpacked != t)
237 42932 : pfree(tunpacked);
238 :
239 15422016 : return result;
240 : }
241 :
242 : /*
243 : * text_to_cstring_buffer
244 : *
245 : * Copy a text value into a caller-supplied buffer of size dst_len.
246 : *
247 : * The text string is truncated if necessary to fit. The result is
248 : * guaranteed null-terminated (unless dst_len == 0).
249 : *
250 : * We support being passed a compressed or toasted text value.
251 : * This is a bit bogus since such values shouldn't really be referred to as
252 : * "text *", but it seems useful for robustness. If we didn't handle that
253 : * case here, we'd need another routine that did, anyway.
254 : */
255 : void
256 978 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
257 : {
258 : /* must cast away the const, unfortunately */
259 978 : text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
260 978 : size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
261 :
262 978 : if (dst_len > 0)
263 : {
264 978 : dst_len--;
265 978 : if (dst_len >= src_len)
266 978 : dst_len = src_len;
267 : else /* ensure truncation is encoding-safe */
268 0 : dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
269 978 : memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
270 978 : dst[dst_len] = '\0';
271 : }
272 :
273 978 : if (srcunpacked != src)
274 0 : pfree(srcunpacked);
275 978 : }
276 :
277 :
278 : /*****************************************************************************
279 : * USER I/O ROUTINES *
280 : *****************************************************************************/
281 :
282 :
283 : #define VAL(CH) ((CH) - '0') /* digit character -> its numeric value */
284 : #define DIG(VAL) ((VAL) + '0') /* small number -> its digit character (the parameter is named VAL, unrelated to the macro above) */
285 :
286 : /*
287 : * byteain - converts from printable representation of byte array
288 : *
289 : * Non-printable characters must be passed as '\nnn' (octal) and are
290 : * converted to internal form. '\' must be passed as '\\'.
291 : * ereport(ERROR, ...) if bad form.
292 : *
293 : * BUGS:
294 : * The input is scanned twice.
295 : * The error checking of input is minimal.
296 : */
297 : Datum
298 986182 : byteain(PG_FUNCTION_ARGS)
299 : {
300 986182 : char *inputText = PG_GETARG_CSTRING(0);
301 986182 : Node *escontext = fcinfo->context;
302 : char *tp;
303 : char *rp;
304 : int bc;
305 : bytea *result;
306 :
307 : /* Recognize hex input */
308 986182 : if (inputText[0] == '\\' && inputText[1] == 'x')
309 : {
310 111158 : size_t len = strlen(inputText);
311 :
312 111158 : bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
313 111158 : result = palloc(bc);
314 111158 : bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
315 : escontext);
316 111146 : SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
317 :
318 111146 : PG_RETURN_BYTEA_P(result);
319 : }
320 :
321 : /* Else, it's the traditional escaped style */
322 8103152 : for (bc = 0, tp = inputText; *tp != '\0'; bc++)
323 : {
324 7228140 : if (tp[0] != '\\')
325 7227122 : tp++;
326 1018 : else if ((tp[0] == '\\') &&
327 1018 : (tp[1] >= '0' && tp[1] <= '3') &&
328 1006 : (tp[2] >= '0' && tp[2] <= '7') &&
329 1006 : (tp[3] >= '0' && tp[3] <= '7'))
330 1006 : tp += 4;
331 12 : else if ((tp[0] == '\\') &&
332 12 : (tp[1] == '\\'))
333 0 : tp += 2;
334 : else
335 : {
336 : /*
337 : * one backslash, not followed by another or ### valid octal
338 : */
339 12 : ereturn(escontext, (Datum) 0,
340 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
341 : errmsg("invalid input syntax for type %s", "bytea")));
342 : }
343 : }
344 :
345 875012 : bc += VARHDRSZ;
346 :
347 875012 : result = (bytea *) palloc(bc);
348 875012 : SET_VARSIZE(result, bc);
349 :
350 875012 : tp = inputText;
351 875012 : rp = VARDATA(result);
352 8103110 : while (*tp != '\0')
353 : {
354 7228098 : if (tp[0] != '\\')
355 7227092 : *rp++ = *tp++;
356 1006 : else if ((tp[0] == '\\') &&
357 1006 : (tp[1] >= '0' && tp[1] <= '3') &&
358 1006 : (tp[2] >= '0' && tp[2] <= '7') &&
359 1006 : (tp[3] >= '0' && tp[3] <= '7'))
360 : {
361 1006 : bc = VAL(tp[1]);
362 1006 : bc <<= 3;
363 1006 : bc += VAL(tp[2]);
364 1006 : bc <<= 3;
365 1006 : *rp++ = bc + VAL(tp[3]);
366 :
367 1006 : tp += 4;
368 : }
369 0 : else if ((tp[0] == '\\') &&
370 0 : (tp[1] == '\\'))
371 : {
372 0 : *rp++ = '\\';
373 0 : tp += 2;
374 : }
375 : else
376 : {
377 : /*
378 : * We should never get here. The first pass should not allow it.
379 : */
380 0 : ereturn(escontext, (Datum) 0,
381 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
382 : errmsg("invalid input syntax for type %s", "bytea")));
383 : }
384 : }
385 :
386 875012 : PG_RETURN_BYTEA_P(result);
387 : }
388 :
389 : /*
390 : * byteaout - converts to printable representation of byte array
391 : *
392 : * In the traditional escaped format, non-printable characters are
393 : * printed as '\nnn' (octal) and '\' as '\\'.
394 : */
395 : Datum
396 159892 : byteaout(PG_FUNCTION_ARGS)
397 : {
398 159892 : bytea *vlena = PG_GETARG_BYTEA_PP(0);
399 : char *result;
400 : char *rp;
401 :
402 159892 : if (bytea_output == BYTEA_OUTPUT_HEX)
403 : {
404 : /* Print hex format */
405 159508 : rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
406 159508 : *rp++ = '\\';
407 159508 : *rp++ = 'x';
408 159508 : rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
409 : }
410 384 : else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
411 : {
412 : /* Print traditional escaped format */
413 : char *vp;
414 : uint64 len;
415 : int i;
416 :
417 384 : len = 1; /* empty string has 1 char */
418 384 : vp = VARDATA_ANY(vlena);
419 217660 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
420 : {
421 217276 : if (*vp == '\\')
422 0 : len += 2;
423 217276 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
424 498 : len += 4;
425 : else
426 216778 : len++;
427 : }
428 :
429 : /*
430 : * In principle len can't overflow uint32 if the input fit in 1GB, but
431 : * for safety let's check rather than relying on palloc's internal
432 : * check.
433 : */
434 384 : if (len > MaxAllocSize)
435 0 : ereport(ERROR,
436 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
437 : errmsg_internal("result of bytea output conversion is too large")));
438 384 : rp = result = (char *) palloc(len);
439 :
440 384 : vp = VARDATA_ANY(vlena);
441 217660 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
442 : {
443 217276 : if (*vp == '\\')
444 : {
445 0 : *rp++ = '\\';
446 0 : *rp++ = '\\';
447 : }
448 217276 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
449 498 : {
450 : int val; /* holds unprintable chars */
451 :
452 498 : val = *vp;
453 498 : rp[0] = '\\';
454 498 : rp[3] = DIG(val & 07);
455 498 : val >>= 3;
456 498 : rp[2] = DIG(val & 07);
457 498 : val >>= 3;
458 498 : rp[1] = DIG(val & 03);
459 498 : rp += 4;
460 : }
461 : else
462 216778 : *rp++ = *vp;
463 : }
464 : }
465 : else
466 : {
467 0 : elog(ERROR, "unrecognized \"bytea_output\" setting: %d",
468 : bytea_output);
469 : rp = result = NULL; /* keep compiler quiet */
470 : }
471 159892 : *rp = '\0';
472 159892 : PG_RETURN_CSTRING(result);
473 : }
474 :
475 : /*
476 : * bytearecv - converts external binary format to bytea
477 : */
478 : Datum
479 107710 : bytearecv(PG_FUNCTION_ARGS)
480 : {
481 107710 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
482 : bytea *result;
483 : int nbytes;
484 :
485 107710 : nbytes = buf->len - buf->cursor;
486 107710 : result = (bytea *) palloc(nbytes + VARHDRSZ);
487 107710 : SET_VARSIZE(result, nbytes + VARHDRSZ);
488 107710 : pq_copymsgbytes(buf, VARDATA(result), nbytes);
489 107710 : PG_RETURN_BYTEA_P(result);
490 : }
491 :
492 : /*
493 : * byteasend - converts bytea to binary format
494 : *
495 : * This is a special case: just copy the input...
496 : */
497 : Datum
498 68974 : byteasend(PG_FUNCTION_ARGS)
499 : {
500 68974 : bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
501 :
502 68974 : PG_RETURN_BYTEA_P(vlena);
503 : }
504 :
505 : Datum
506 258774 : bytea_string_agg_transfn(PG_FUNCTION_ARGS)
507 : {
508 : StringInfo state;
509 :
510 258774 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
511 :
512 : /* Append the value unless null, preceding it with the delimiter. */
513 258774 : if (!PG_ARGISNULL(1))
514 : {
515 243774 : bytea *value = PG_GETARG_BYTEA_PP(1);
516 243774 : bool isfirst = false;
517 :
518 : /*
519 : * You might think we can just throw away the first delimiter, however
520 : * we must keep it as we may be a parallel worker doing partial
521 : * aggregation building a state to send to the main process. We need
522 : * to keep the delimiter of every aggregation so that the combine
523 : * function can properly join up the strings of two separately
524 : * partially aggregated results. The first delimiter is only stripped
525 : * off in the final function. To know how much to strip off the front
526 : * of the string, we store the length of the first delimiter in the
527 : * StringInfo's cursor field, which we don't otherwise need here.
528 : */
529 243774 : if (state == NULL)
530 : {
531 148 : state = makeStringAggState(fcinfo);
532 148 : isfirst = true;
533 : }
534 :
535 243774 : if (!PG_ARGISNULL(2))
536 : {
537 243762 : bytea *delim = PG_GETARG_BYTEA_PP(2);
538 :
539 243762 : appendBinaryStringInfo(state, VARDATA_ANY(delim),
540 243762 : VARSIZE_ANY_EXHDR(delim));
541 243762 : if (isfirst)
542 142 : state->cursor = VARSIZE_ANY_EXHDR(delim);
543 : }
544 :
545 243774 : appendBinaryStringInfo(state, VARDATA_ANY(value),
546 243774 : VARSIZE_ANY_EXHDR(value));
547 : }
548 :
549 : /*
550 : * The transition type for string_agg() is declared to be "internal",
551 : * which is a pass-by-value type the same size as a pointer.
552 : */
553 258774 : if (state)
554 258738 : PG_RETURN_POINTER(state);
555 36 : PG_RETURN_NULL();
556 : }
557 :
558 : Datum
559 154 : bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
560 : {
561 : StringInfo state;
562 :
563 : /* cannot be called directly because of internal-type argument */
564 : Assert(AggCheckCallContext(fcinfo, NULL));
565 :
566 154 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
567 :
568 154 : if (state != NULL)
569 : {
570 : /* As per comment in transfn, strip data before the cursor position */
571 : bytea *result;
572 148 : int strippedlen = state->len - state->cursor;
573 :
574 148 : result = (bytea *) palloc(strippedlen + VARHDRSZ);
575 148 : SET_VARSIZE(result, strippedlen + VARHDRSZ);
576 148 : memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
577 148 : PG_RETURN_BYTEA_P(result);
578 : }
579 : else
580 6 : PG_RETURN_NULL();
581 : }
582 :
583 : /*
584 : * textin - converts cstring to internal representation
585 : */
586 : Datum
587 20090222 : textin(PG_FUNCTION_ARGS)
588 : {
589 20090222 : char *inputText = PG_GETARG_CSTRING(0);
590 :
591 20090222 : PG_RETURN_TEXT_P(cstring_to_text(inputText));
592 : }
593 :
594 : /*
595 : * textout - converts internal representation to cstring
596 : */
597 : Datum
598 7419814 : textout(PG_FUNCTION_ARGS)
599 : {
600 7419814 : Datum txt = PG_GETARG_DATUM(0);
601 :
602 7419814 : PG_RETURN_CSTRING(TextDatumGetCString(txt));
603 : }
604 :
605 : /*
606 : * textrecv - converts external binary format to text
607 : */
608 : Datum
609 48 : textrecv(PG_FUNCTION_ARGS)
610 : {
611 48 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
612 : text *result;
613 : char *str;
614 : int nbytes;
615 :
616 48 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
617 :
618 48 : result = cstring_to_text_with_len(str, nbytes);
619 48 : pfree(str);
620 48 : PG_RETURN_TEXT_P(result);
621 : }
622 :
623 : /*
624 : * textsend - converts text to binary format
625 : */
626 : Datum
627 4914 : textsend(PG_FUNCTION_ARGS)
628 : {
629 4914 : text *t = PG_GETARG_TEXT_PP(0);
630 : StringInfoData buf;
631 :
632 4914 : pq_begintypsend(&buf);
633 4914 : pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
634 4914 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
635 : }
636 :
637 :
638 : /*
639 : * unknownin - converts cstring to internal representation
640 : */
641 : Datum
642 0 : unknownin(PG_FUNCTION_ARGS)
643 : {
644 0 : char *str = PG_GETARG_CSTRING(0);
645 :
646 : /* representation is same as cstring */
647 0 : PG_RETURN_CSTRING(pstrdup(str));
648 : }
649 :
650 : /*
651 : * unknownout - converts internal representation to cstring
652 : */
653 : Datum
654 940 : unknownout(PG_FUNCTION_ARGS)
655 : {
656 : /* representation is same as cstring */
657 940 : char *str = PG_GETARG_CSTRING(0);
658 :
659 940 : PG_RETURN_CSTRING(pstrdup(str));
660 : }
661 :
662 : /*
663 : * unknownrecv - converts external binary format to unknown
664 : */
665 : Datum
666 0 : unknownrecv(PG_FUNCTION_ARGS)
667 : {
668 0 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
669 : char *str;
670 : int nbytes;
671 :
672 0 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
673 : /* representation is same as cstring */
674 0 : PG_RETURN_CSTRING(str);
675 : }
676 :
677 : /*
678 : * unknownsend - converts unknown to binary format
679 : */
680 : Datum
681 0 : unknownsend(PG_FUNCTION_ARGS)
682 : {
683 : /* representation is same as cstring */
684 0 : char *str = PG_GETARG_CSTRING(0);
685 : StringInfoData buf;
686 :
687 0 : pq_begintypsend(&buf);
688 0 : pq_sendtext(&buf, str, strlen(str));
689 0 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
690 : }
691 :
692 :
693 : /* ========== PUBLIC ROUTINES ========== */
694 :
695 : /*
696 : * textlen -
697 : * returns the logical length of a text*
698 : * (which is less than the VARSIZE of the text*)
699 : */
700 : Datum
701 430718 : textlen(PG_FUNCTION_ARGS)
702 : {
703 430718 : Datum str = PG_GETARG_DATUM(0);
704 :
705 : /* try to avoid decompressing argument */
706 430718 : PG_RETURN_INT32(text_length(str));
707 : }
708 :
709 : /*
710 : * text_length -
711 : * Does the real work for textlen()
712 : *
713 : * This is broken out so it can be called directly by other string processing
714 : * functions. Note that the argument is passed as a Datum, to indicate that
715 : * it may still be in compressed form. We can avoid decompressing it at all
716 : * in some cases.
717 : */
718 : static int32
719 430730 : text_length(Datum str)
720 : {
721 : /* fastpath when max encoding length is one */
722 430730 : if (pg_database_encoding_max_length() == 1)
723 20 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
724 : else
725 : {
726 430710 : text *t = DatumGetTextPP(str);
727 :
728 430710 : PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
729 : VARSIZE_ANY_EXHDR(t)));
730 : }
731 : }
732 :
733 : /*
734 : * textoctetlen -
735 : * returns the physical length of a text*
736 : * (which is less than the VARSIZE of the text*)
737 : */
738 : Datum
739 70 : textoctetlen(PG_FUNCTION_ARGS)
740 : {
741 70 : Datum str = PG_GETARG_DATUM(0);
742 :
743 : /* We need not detoast the input at all */
744 70 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
745 : }
746 :
747 : /*
748 : * textcat -
749 : * takes two text* and returns a text* that is the concatenation of
750 : * the two.
751 : *
752 : * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
753 : * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
754 : * Allocate space for output in all cases.
755 : * XXX - thomas 1997-07-10
756 : */
757 : Datum
758 1817434 : textcat(PG_FUNCTION_ARGS)
759 : {
760 1817434 : text *t1 = PG_GETARG_TEXT_PP(0);
761 1817434 : text *t2 = PG_GETARG_TEXT_PP(1);
762 :
763 1817434 : PG_RETURN_TEXT_P(text_catenate(t1, t2));
764 : }
765 :
766 : /*
767 : * text_catenate
768 : * Guts of textcat(), broken out so it can be used by other functions
769 : *
770 : * Arguments can be in short-header form, but not compressed or out-of-line
771 : */
772 : static text *
773 1817514 : text_catenate(text *t1, text *t2)
774 : {
775 : text *result;
776 : int len1,
777 : len2,
778 : len;
779 : char *ptr;
780 :
781 1817514 : len1 = VARSIZE_ANY_EXHDR(t1);
782 1817514 : len2 = VARSIZE_ANY_EXHDR(t2);
783 :
784 : /* paranoia ... probably should throw error instead? */
785 1817514 : if (len1 < 0)
786 0 : len1 = 0;
787 1817514 : if (len2 < 0)
788 0 : len2 = 0;
789 :
790 1817514 : len = len1 + len2 + VARHDRSZ;
791 1817514 : result = (text *) palloc(len);
792 :
793 : /* Set size of result string... */
794 1817514 : SET_VARSIZE(result, len);
795 :
796 : /* Fill data field of result string... */
797 1817514 : ptr = VARDATA(result);
798 1817514 : if (len1 > 0)
799 1816696 : memcpy(ptr, VARDATA_ANY(t1), len1);
800 1817514 : if (len2 > 0)
801 1817304 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
802 :
803 1817514 : return result;
804 : }
805 :
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller must make sure that n characters really are present; the
 * string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* Optimization for single-byte encodings */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* otherwise step over the characters one at a time */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}
831 :
832 : /*
833 : * text_substr()
834 : * Return a substring starting at the specified position.
835 : * - thomas 1997-12-31
836 : *
837 : * Input:
838 : * - string
839 : * - starting position (is one-based)
840 : * - string length
841 : *
842 : * If the starting position is zero or less, then return from the start of the string
843 : * adjusting the length to be consistent with the "negative start" per SQL.
844 : * If the length is less than zero, return the remaining string.
845 : *
846 : * Added multibyte support.
847 : * - Tatsuo Ishii 1998-4-21
848 : * Changed behavior if starting position is less than one to conform to SQL behavior.
849 : * Formerly returned the entire string; now returns a portion.
850 : * - Thomas Lockhart 1998-12-10
851 : * Now uses faster TOAST-slicing interface
852 : * - John Gray 2002-02-22
853 : * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
854 : * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
855 : * error; if E < 1, return '', not entire string). Fixed MB related bug when
856 : * S > LC and < LC + 4 sometimes garbage characters are returned.
857 : * - Joe Conway 2002-08-10
858 : */
859 : Datum
860 590368 : text_substr(PG_FUNCTION_ARGS)
861 : {
862 590368 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
863 : PG_GETARG_INT32(1),
864 : PG_GETARG_INT32(2),
865 : false));
866 : }
867 :
868 : /*
869 : * text_substr_no_len -
870 : * Wrapper to avoid opr_sanity failure due to
871 : * one function accepting a different number of args.
872 : */
873 : Datum
874 36 : text_substr_no_len(PG_FUNCTION_ARGS)
875 : {
876 36 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
877 : PG_GETARG_INT32(1),
878 : -1, true));
879 : }
880 :
881 : /*
882 : * text_substring -
883 : * Does the real work for text_substr() and text_substr_no_len()
884 : *
885 : * This is broken out so it can be called directly by other string processing
886 : * functions. Note that the argument is passed as a Datum, to indicate that
887 : * it may still be in compressed/toasted form. We can avoid detoasting all
888 : * of it in some cases.
889 : *
890 : * The result is always a freshly palloc'd datum.
891 : */
892 : static text *
893 630504 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
894 : {
895 630504 : int32 eml = pg_database_encoding_max_length();
896 630504 : int32 S = start; /* start position */
897 : int32 S1; /* adjusted start position */
898 : int32 L1; /* adjusted substring length */
899 : int32 E; /* end position */
900 :
901 : /*
902 : * SQL99 says S can be zero or negative (which we don't document), but we
903 : * still must fetch from the start of the string.
904 : * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org
905 : */
906 630504 : S1 = Max(S, 1);
907 :
908 : /* life is easy if the encoding max length is 1 */
909 630504 : if (eml == 1)
910 : {
911 22 : if (length_not_specified) /* special case - get length to end of
912 : * string */
913 0 : L1 = -1;
914 22 : else if (length < 0)
915 : {
916 : /* SQL99 says to throw an error for E < S, i.e., negative length */
917 0 : ereport(ERROR,
918 : (errcode(ERRCODE_SUBSTRING_ERROR),
919 : errmsg("negative substring length not allowed")));
920 : L1 = -1; /* silence stupider compilers */
921 : }
922 22 : else if (pg_add_s32_overflow(S, length, &E))
923 : {
924 : /*
925 : * L could be large enough for S + L to overflow, in which case
926 : * the substring must run to end of string.
927 : */
928 0 : L1 = -1;
929 : }
930 : else
931 : {
932 : /*
933 : * A zero or negative value for the end position can happen if the
934 : * start was negative or one. SQL99 says to return a zero-length
935 : * string.
936 : */
937 22 : if (E < 1)
938 0 : return cstring_to_text("");
939 :
940 22 : L1 = E - S1;
941 : }
942 :
943 : /*
944 : * If the start position is past the end of the string, SQL99 says to
945 : * return a zero-length string -- DatumGetTextPSlice() will do that
946 : * for us. We need only convert S1 to zero-based starting position.
947 : */
948 22 : return DatumGetTextPSlice(str, S1 - 1, L1);
949 : }
950 630482 : else if (eml > 1)
951 : {
952 : /*
953 : * When encoding max length is > 1, we can't get LC without
954 : * detoasting, so we'll grab a conservatively large slice now and go
955 : * back later to do the right thing
956 : */
957 : int32 slice_start;
958 : int32 slice_size;
959 : int32 slice_strlen;
960 : text *slice;
961 : int32 E1;
962 : int32 i;
963 : char *p;
964 : char *s;
965 : text *ret;
966 :
967 : /*
968 : * We need to start at position zero because there is no way to know
969 : * in advance which byte offset corresponds to the supplied start
970 : * position.
971 : */
972 630482 : slice_start = 0;
973 :
974 630482 : if (length_not_specified) /* special case - get length to end of
975 : * string */
976 76 : slice_size = L1 = -1;
977 630406 : else if (length < 0)
978 : {
979 : /* SQL99 says to throw an error for E < S, i.e., negative length */
980 12 : ereport(ERROR,
981 : (errcode(ERRCODE_SUBSTRING_ERROR),
982 : errmsg("negative substring length not allowed")));
983 : slice_size = L1 = -1; /* silence stupider compilers */
984 : }
985 630394 : else if (pg_add_s32_overflow(S, length, &E))
986 : {
987 : /*
988 : * L could be large enough for S + L to overflow, in which case
989 : * the substring must run to end of string.
990 : */
991 6 : slice_size = L1 = -1;
992 : }
993 : else
994 : {
995 : /*
996 : * A zero or negative value for the end position can happen if the
997 : * start was negative or one. SQL99 says to return a zero-length
998 : * string.
999 : */
1000 630388 : if (E < 1)
1001 0 : return cstring_to_text("");
1002 :
1003 : /*
1004 : * if E is past the end of the string, the tuple toaster will
1005 : * truncate the length for us
1006 : */
1007 630388 : L1 = E - S1;
1008 :
1009 : /*
1010 : * Total slice size in bytes can't be any longer than the start
1011 : * position plus substring length times the encoding max length.
1012 : * If that overflows, we can just use -1.
1013 : */
1014 630388 : if (pg_mul_s32_overflow(E, eml, &slice_size))
1015 6 : slice_size = -1;
1016 : }
1017 :
1018 : /*
1019 : * If we're working with an untoasted source, no need to do an extra
1020 : * copying step.
1021 : */
1022 630470 : if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
1023 630416 : VARATT_IS_EXTERNAL(DatumGetPointer(str)))
1024 324 : slice = DatumGetTextPSlice(str, slice_start, slice_size);
1025 : else
1026 630146 : slice = (text *) DatumGetPointer(str);
1027 :
1028 : /* see if we got back an empty string */
1029 630470 : if (VARSIZE_ANY_EXHDR(slice) == 0)
1030 : {
1031 0 : if (slice != (text *) DatumGetPointer(str))
1032 0 : pfree(slice);
1033 0 : return cstring_to_text("");
1034 : }
1035 :
1036 : /* Now we can get the actual length of the slice in MB characters */
1037 630470 : slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1038 630470 : VARSIZE_ANY_EXHDR(slice));
1039 :
1040 : /*
1041 : * Check that the start position wasn't > slice_strlen. If so, SQL99
1042 : * says to return a zero-length string.
1043 : */
1044 630470 : if (S1 > slice_strlen)
1045 : {
1046 22 : if (slice != (text *) DatumGetPointer(str))
1047 0 : pfree(slice);
1048 22 : return cstring_to_text("");
1049 : }
1050 :
1051 : /*
1052 : * Adjust L1 and E1 now that we know the slice string length. Again
1053 : * remember that S1 is one based, and slice_start is zero based.
1054 : */
1055 630448 : if (L1 > -1)
1056 630388 : E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1057 : else
1058 60 : E1 = slice_start + 1 + slice_strlen;
1059 :
1060 : /*
1061 : * Find the start position in the slice; remember S1 is not zero based
1062 : */
1063 630448 : p = VARDATA_ANY(slice);
1064 5451794 : for (i = 0; i < S1 - 1; i++)
1065 4821346 : p += pg_mblen(p);
1066 :
1067 : /* hang onto a pointer to our start position */
1068 630448 : s = p;
1069 :
1070 : /*
1071 : * Count the actual bytes used by the substring of the requested
1072 : * length.
1073 : */
1074 9804544 : for (i = S1; i < E1; i++)
1075 9174096 : p += pg_mblen(p);
1076 :
1077 630448 : ret = (text *) palloc(VARHDRSZ + (p - s));
1078 630448 : SET_VARSIZE(ret, VARHDRSZ + (p - s));
1079 630448 : memcpy(VARDATA(ret), s, (p - s));
1080 :
1081 630448 : if (slice != (text *) DatumGetPointer(str))
1082 324 : pfree(slice);
1083 :
1084 630448 : return ret;
1085 : }
1086 : else
1087 0 : elog(ERROR, "invalid backend encoding: encoding max length < 1");
1088 :
1089 : /* not reached: suppress compiler warning */
1090 : return NULL;
1091 : }
1092 :
1093 : /*
1094 : * textoverlay
1095 : * Replace specified substring of first string with second
1096 : *
1097 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1098 : * This code is a direct implementation of what the standard says.
1099 : */
1100 : Datum
1101 28 : textoverlay(PG_FUNCTION_ARGS)
1102 : {
1103 28 : text *t1 = PG_GETARG_TEXT_PP(0);
1104 28 : text *t2 = PG_GETARG_TEXT_PP(1);
1105 28 : int sp = PG_GETARG_INT32(2); /* substring start position */
1106 28 : int sl = PG_GETARG_INT32(3); /* substring length */
1107 :
1108 28 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1109 : }
1110 :
1111 : Datum
1112 12 : textoverlay_no_len(PG_FUNCTION_ARGS)
1113 : {
1114 12 : text *t1 = PG_GETARG_TEXT_PP(0);
1115 12 : text *t2 = PG_GETARG_TEXT_PP(1);
1116 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
1117 : int sl;
1118 :
1119 12 : sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1120 12 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1121 : }
1122 :
1123 : static text *
1124 40 : text_overlay(text *t1, text *t2, int sp, int sl)
1125 : {
1126 : text *result;
1127 : text *s1;
1128 : text *s2;
1129 : int sp_pl_sl;
1130 :
1131 : /*
1132 : * Check for possible integer-overflow cases. For negative sp, throw a
1133 : * "substring length" error because that's what should be expected
1134 : * according to the spec's definition of OVERLAY().
1135 : */
1136 40 : if (sp <= 0)
1137 0 : ereport(ERROR,
1138 : (errcode(ERRCODE_SUBSTRING_ERROR),
1139 : errmsg("negative substring length not allowed")));
1140 40 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1141 0 : ereport(ERROR,
1142 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1143 : errmsg("integer out of range")));
1144 :
1145 40 : s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1146 40 : s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1147 40 : result = text_catenate(s1, t2);
1148 40 : result = text_catenate(result, s2);
1149 :
1150 40 : return result;
1151 : }
1152 :
1153 : /*
1154 : * textpos -
1155 : * Return the position of the specified substring.
1156 : * Implements the SQL POSITION() function.
1157 : * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1158 : * - thomas 1997-07-27
1159 : */
1160 : Datum
1161 130 : textpos(PG_FUNCTION_ARGS)
1162 : {
1163 130 : text *str = PG_GETARG_TEXT_PP(0);
1164 130 : text *search_str = PG_GETARG_TEXT_PP(1);
1165 :
1166 130 : PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1167 : }
1168 :
1169 : /*
1170 : * text_position -
1171 : * Does the real work for textpos()
1172 : *
1173 : * Inputs:
1174 : * t1 - string to be searched
1175 : * t2 - pattern to match within t1
1176 : * Result:
1177 : * Character index of the first matched char, starting from 1,
1178 : * or 0 if no match.
1179 : *
1180 : * This is broken out so it can be called directly by other string processing
1181 : * functions.
1182 : */
1183 : static int
1184 130 : text_position(text *t1, text *t2, Oid collid)
1185 : {
1186 : TextPositionState state;
1187 : int result;
1188 :
1189 130 : check_collation_set(collid);
1190 :
1191 : /* Empty needle always matches at position 1 */
1192 130 : if (VARSIZE_ANY_EXHDR(t2) < 1)
1193 12 : return 1;
1194 :
1195 : /* Otherwise, can't match if haystack is shorter than needle */
1196 118 : if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
1197 22 : pg_newlocale_from_collation(collid)->deterministic)
1198 22 : return 0;
1199 :
1200 96 : text_position_setup(t1, t2, collid, &state);
1201 : /* don't need greedy mode here */
1202 96 : state.greedy = false;
1203 :
1204 96 : if (!text_position_next(&state))
1205 24 : result = 0;
1206 : else
1207 72 : result = text_position_get_match_pos(&state);
1208 96 : text_position_cleanup(&state);
1209 96 : return result;
1210 : }
1211 :
1212 :
1213 : /*
1214 : * text_position_setup, text_position_next, text_position_cleanup -
1215 : * Component steps of text_position()
1216 : *
1217 : * These are broken out so that a string can be efficiently searched for
1218 : * multiple occurrences of the same pattern. text_position_next may be
1219 : * called multiple times, and it advances to the next match on each call.
1220 : * text_position_get_match_ptr() and text_position_get_match_pos() return
1221 : * a pointer or 1-based character position of the last match, respectively.
1222 : *
1223 : * The "state" variable is normally just a local variable in the caller.
1224 : *
1225 : * NOTE: text_position_next skips over the matched portion. For example,
1226 : * searching for "xx" in "xxx" returns only one match, not two.
1227 : */
1228 :
1229 : static void
1230 1638 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1231 : {
1232 1638 : int len1 = VARSIZE_ANY_EXHDR(t1);
1233 1638 : int len2 = VARSIZE_ANY_EXHDR(t2);
1234 :
1235 1638 : check_collation_set(collid);
1236 :
1237 1638 : state->locale = pg_newlocale_from_collation(collid);
1238 :
1239 : /*
1240 : * Most callers need greedy mode, but some might want to unset this to
1241 : * optimize.
1242 : */
1243 1638 : state->greedy = true;
1244 :
1245 : Assert(len2 > 0);
1246 :
1247 : /*
1248 : * Even with a multi-byte encoding, we perform the search using the raw
1249 : * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1250 : * because in UTF-8 the byte sequence of one character cannot contain
1251 : * another character. For other multi-byte encodings, we do the search
1252 : * initially as a simple byte search, ignoring multibyte issues, but
1253 : * verify afterwards that the match we found is at a character boundary,
1254 : * and continue the search if it was a false match.
1255 : */
1256 1638 : if (pg_database_encoding_max_length() == 1)
1257 108 : state->is_multibyte_char_in_char = false;
1258 1530 : else if (GetDatabaseEncoding() == PG_UTF8)
1259 1530 : state->is_multibyte_char_in_char = false;
1260 : else
1261 0 : state->is_multibyte_char_in_char = true;
1262 :
1263 1638 : state->str1 = VARDATA_ANY(t1);
1264 1638 : state->str2 = VARDATA_ANY(t2);
1265 1638 : state->len1 = len1;
1266 1638 : state->len2 = len2;
1267 1638 : state->last_match = NULL;
1268 1638 : state->refpoint = state->str1;
1269 1638 : state->refpos = 0;
1270 :
1271 : /*
1272 : * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1273 : * notes we use the terminology that the "haystack" is the string to be
1274 : * searched (t1) and the "needle" is the pattern being sought (t2).
1275 : *
1276 : * If the needle is empty or bigger than the haystack then there is no
1277 : * point in wasting cycles initializing the table. We also choose not to
1278 : * use B-M-H for needles of length 1, since the skip table can't possibly
1279 : * save anything in that case.
1280 : *
1281 : * (With nondeterministic collations, the search is already
1282 : * multibyte-aware, so we don't need this.)
1283 : */
1284 1638 : if (len1 >= len2 && len2 > 1 && state->locale->deterministic)
1285 : {
1286 1310 : int searchlength = len1 - len2;
1287 : int skiptablemask;
1288 : int last;
1289 : int i;
1290 1310 : const char *str2 = state->str2;
1291 :
1292 : /*
1293 : * First we must determine how much of the skip table to use. The
1294 : * declaration of TextPositionState allows up to 256 elements, but for
1295 : * short search problems we don't really want to have to initialize so
1296 : * many elements --- it would take too long in comparison to the
1297 : * actual search time. So we choose a useful skip table size based on
1298 : * the haystack length minus the needle length. The closer the needle
1299 : * length is to the haystack length the less useful skipping becomes.
1300 : *
1301 : * Note: since we use bit-masking to select table elements, the skip
1302 : * table size MUST be a power of 2, and so the mask must be 2^N-1.
1303 : */
1304 1310 : if (searchlength < 16)
1305 114 : skiptablemask = 3;
1306 1196 : else if (searchlength < 64)
1307 16 : skiptablemask = 7;
1308 1180 : else if (searchlength < 128)
1309 14 : skiptablemask = 15;
1310 1166 : else if (searchlength < 512)
1311 214 : skiptablemask = 31;
1312 952 : else if (searchlength < 2048)
1313 734 : skiptablemask = 63;
1314 218 : else if (searchlength < 4096)
1315 148 : skiptablemask = 127;
1316 : else
1317 70 : skiptablemask = 255;
1318 1310 : state->skiptablemask = skiptablemask;
1319 :
1320 : /*
1321 : * Initialize the skip table. We set all elements to the needle
1322 : * length, since this is the correct skip distance for any character
1323 : * not found in the needle.
1324 : */
1325 92806 : for (i = 0; i <= skiptablemask; i++)
1326 91496 : state->skiptable[i] = len2;
1327 :
1328 : /*
1329 : * Now examine the needle. For each character except the last one,
1330 : * set the corresponding table element to the appropriate skip
1331 : * distance. Note that when two characters share the same skip table
1332 : * entry, the one later in the needle must determine the skip
1333 : * distance.
1334 : */
1335 1310 : last = len2 - 1;
1336 :
1337 17294 : for (i = 0; i < last; i++)
1338 15984 : state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1339 : }
1340 1638 : }
1341 :
1342 : /*
1343 : * Advance to the next match, starting from the end of the previous match
1344 : * (or the beginning of the string, on first call). Returns true if a match
1345 : * is found.
1346 : *
1347 : * Note that this refuses to match an empty-string needle. Most callers
1348 : * will have handled that case specially and we'll never see it here.
1349 : */
1350 : static bool
1351 7426 : text_position_next(TextPositionState *state)
1352 : {
1353 7426 : int needle_len = state->len2;
1354 : char *start_ptr;
1355 : char *matchptr;
1356 :
1357 7426 : if (needle_len <= 0)
1358 0 : return false; /* result for empty pattern */
1359 :
1360 : /* Start from the point right after the previous match. */
1361 7426 : if (state->last_match)
1362 5776 : start_ptr = state->last_match + state->last_match_len;
1363 : else
1364 1650 : start_ptr = state->str1;
1365 :
1366 7426 : retry:
1367 7426 : matchptr = text_position_next_internal(start_ptr, state);
1368 :
1369 7426 : if (!matchptr)
1370 1554 : return false;
1371 :
1372 : /*
1373 : * Found a match for the byte sequence. If this is a multibyte encoding,
1374 : * where one character's byte sequence can appear inside a longer
1375 : * multi-byte character, we need to verify that the match was at a
1376 : * character boundary, not in the middle of a multi-byte character.
1377 : */
1378 5872 : if (state->is_multibyte_char_in_char && state->locale->deterministic)
1379 : {
1380 : /* Walk one character at a time, until we reach the match. */
1381 :
1382 : /* the search should never move backwards. */
1383 : Assert(state->refpoint <= matchptr);
1384 :
1385 0 : while (state->refpoint < matchptr)
1386 : {
1387 : /* step to next character. */
1388 0 : state->refpoint += pg_mblen(state->refpoint);
1389 0 : state->refpos++;
1390 :
1391 : /*
1392 : * If we stepped over the match's start position, then it was a
1393 : * false positive, where the byte sequence appeared in the middle
1394 : * of a multi-byte character. Skip it, and continue the search at
1395 : * the next character boundary.
1396 : */
1397 0 : if (state->refpoint > matchptr)
1398 : {
1399 0 : start_ptr = state->refpoint;
1400 0 : goto retry;
1401 : }
1402 : }
1403 : }
1404 :
1405 5872 : state->last_match = matchptr;
1406 5872 : state->last_match_len = state->last_match_len_tmp;
1407 5872 : return true;
1408 : }
1409 :
1410 : /*
1411 : * Subroutine of text_position_next(). This searches for the raw byte
1412 : * sequence, ignoring any multi-byte encoding issues. Returns the first
1413 : * match starting at 'start_ptr', or NULL if no match is found.
1414 : */
1415 : static char *
1416 7426 : text_position_next_internal(char *start_ptr, TextPositionState *state)
1417 : {
1418 7426 : int haystack_len = state->len1;
1419 7426 : int needle_len = state->len2;
1420 7426 : int skiptablemask = state->skiptablemask;
1421 7426 : const char *haystack = state->str1;
1422 7426 : const char *needle = state->str2;
1423 7426 : const char *haystack_end = &haystack[haystack_len];
1424 : const char *hptr;
1425 :
1426 : Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1427 :
1428 7426 : state->last_match_len_tmp = needle_len;
1429 :
1430 7426 : if (!state->locale->deterministic)
1431 : {
1432 : /*
1433 : * With a nondeterministic collation, we have to use an unoptimized
1434 : * route. We walk through the haystack and see if at each position
1435 : * there is a substring of the remaining string that is equal to the
1436 : * needle under the given collation.
1437 : *
1438 : * Note, the found substring could have a different length than the
1439 : * needle, including being empty. Callers that want to skip over the
1440 : * found string need to read the length of the found substring from
1441 : * last_match_len rather than just using the length of their needle.
1442 : *
1443 : * Most callers will require "greedy" semantics, meaning that we need
1444 : * to find the longest such substring, not the shortest. For callers
1445 : * that don't need greedy semantics, we can finish on the first match.
1446 : */
1447 240 : const char *result_hptr = NULL;
1448 :
1449 240 : hptr = start_ptr;
1450 642 : while (hptr < haystack_end)
1451 : {
1452 : /*
1453 : * First check the common case that there is a match in the
1454 : * haystack of exactly the length of the needle.
1455 : */
1456 534 : if (!state->greedy &&
1457 108 : haystack_end - hptr >= needle_len &&
1458 54 : pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
1459 12 : return (char *) hptr;
1460 :
1461 : /*
1462 : * Else check if any of the possible substrings starting at hptr
1463 : * are equal to the needle.
1464 : */
1465 2586 : for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
1466 : {
1467 2064 : if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
1468 : {
1469 132 : state->last_match_len_tmp = (test_end - hptr);
1470 132 : result_hptr = hptr;
1471 132 : if (!state->greedy)
1472 0 : break;
1473 : }
1474 : }
1475 522 : if (result_hptr)
1476 120 : break;
1477 :
1478 402 : hptr += pg_mblen(hptr);
1479 : }
1480 :
1481 228 : return (char *) result_hptr;
1482 : }
1483 7186 : else if (needle_len == 1)
1484 : {
1485 : /* No point in using B-M-H for a one-character needle */
1486 760 : char nchar = *needle;
1487 :
1488 760 : hptr = start_ptr;
1489 5878 : while (hptr < haystack_end)
1490 : {
1491 5712 : if (*hptr == nchar)
1492 594 : return (char *) hptr;
1493 5118 : hptr++;
1494 : }
1495 : }
1496 : else
1497 : {
1498 6426 : const char *needle_last = &needle[needle_len - 1];
1499 :
1500 : /* Start at startpos plus the length of the needle */
1501 6426 : hptr = start_ptr + needle_len - 1;
1502 165256 : while (hptr < haystack_end)
1503 : {
1504 : /* Match the needle scanning *backward* */
1505 : const char *nptr;
1506 : const char *p;
1507 :
1508 163976 : nptr = needle_last;
1509 163976 : p = hptr;
1510 240692 : while (*nptr == *p)
1511 : {
1512 : /* Matched it all? If so, return 1-based position */
1513 81862 : if (nptr == needle)
1514 5146 : return (char *) p;
1515 76716 : nptr--, p--;
1516 : }
1517 :
1518 : /*
1519 : * No match, so use the haystack char at hptr to decide how far to
1520 : * advance. If the needle had any occurrence of that character
1521 : * (or more precisely, one sharing the same skiptable entry)
1522 : * before its last character, then we advance far enough to align
1523 : * the last such needle character with that haystack position.
1524 : * Otherwise we can advance by the whole needle length.
1525 : */
1526 158830 : hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1527 : }
1528 : }
1529 :
1530 1446 : return 0; /* not found */
1531 : }
1532 :
1533 : /*
1534 : * Return a pointer to the current match.
1535 : *
1536 : * The returned pointer points into the original haystack string.
1537 : */
1538 : static char *
1539 5770 : text_position_get_match_ptr(TextPositionState *state)
1540 : {
1541 5770 : return state->last_match;
1542 : }
1543 :
1544 : /*
1545 : * Return the offset of the current match.
1546 : *
1547 : * The offset is in characters, 1-based.
1548 : */
1549 : static int
1550 72 : text_position_get_match_pos(TextPositionState *state)
1551 : {
1552 : /* Convert the byte position to char position. */
1553 144 : state->refpos += pg_mbstrlen_with_len(state->refpoint,
1554 72 : state->last_match - state->refpoint);
1555 72 : state->refpoint = state->last_match;
1556 72 : return state->refpos + 1;
1557 : }
1558 :
1559 : /*
1560 : * Reset search state to the initial state installed by text_position_setup.
1561 : *
1562 : * The next call to text_position_next will search from the beginning
1563 : * of the string.
1564 : */
1565 : static void
1566 12 : text_position_reset(TextPositionState *state)
1567 : {
1568 12 : state->last_match = NULL;
1569 12 : state->refpoint = state->str1;
1570 12 : state->refpos = 0;
1571 12 : }
1572 :
/*
 * Counterpart of text_position_setup, to be called when a search is done.
 *
 * Currently a no-op: the setup code only stores pointers into the caller's
 * strings and fills the fixed-size state struct, so there is nothing to
 * release.
 */
static void
text_position_cleanup(TextPositionState *state)
{
	/* no cleanup needed */
}
1578 :
1579 :
1580 : static void
1581 13063004 : check_collation_set(Oid collid)
1582 : {
1583 13063004 : if (!OidIsValid(collid))
1584 : {
1585 : /*
1586 : * This typically means that the parser could not resolve a conflict
1587 : * of implicit collations, so report it that way.
1588 : */
1589 30 : ereport(ERROR,
1590 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
1591 : errmsg("could not determine which collation to use for string comparison"),
1592 : errhint("Use the COLLATE clause to set the collation explicitly.")));
1593 : }
1594 13062974 : }
1595 :
1596 : /*
1597 : * varstr_cmp()
1598 : *
1599 : * Comparison function for text strings with given lengths, using the
1600 : * appropriate locale. Returns an integer less than, equal to, or greater than
1601 : * zero, indicating whether arg1 is less than, equal to, or greater than arg2.
1602 : *
1603 : * Note: many functions that depend on this are marked leakproof; therefore,
1604 : * avoid reporting the actual contents of the input when throwing errors.
1605 : * All errors herein should be things that can't happen except on corrupt
1606 : * data, anyway; otherwise we will have trouble with indexing strings that
1607 : * would cause them.
1608 : */
1609 : int
1610 6487314 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1611 : {
1612 : int result;
1613 : pg_locale_t mylocale;
1614 :
1615 6487314 : check_collation_set(collid);
1616 :
1617 6487296 : mylocale = pg_newlocale_from_collation(collid);
1618 :
1619 6487296 : if (mylocale->collate_is_c)
1620 : {
1621 3115844 : result = memcmp(arg1, arg2, Min(len1, len2));
1622 3115844 : if ((result == 0) && (len1 != len2))
1623 113320 : result = (len1 < len2) ? -1 : 1;
1624 : }
1625 : else
1626 : {
1627 : /*
1628 : * memcmp() can't tell us which of two unequal strings sorts first,
1629 : * but it's a cheap way to tell if they're equal. Testing shows that
1630 : * memcmp() followed by strcoll() is only trivially slower than
1631 : * strcoll() by itself, so we don't lose much if this doesn't work out
1632 : * very often, and if it does - for example, because there are many
1633 : * equal strings in the input - then we win big by avoiding expensive
1634 : * collation-aware comparisons.
1635 : */
1636 3371452 : if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1637 1288394 : return 0;
1638 :
1639 2083058 : result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
1640 :
1641 : /* Break tie if necessary. */
1642 2083058 : if (result == 0 && mylocale->deterministic)
1643 : {
1644 0 : result = memcmp(arg1, arg2, Min(len1, len2));
1645 0 : if ((result == 0) && (len1 != len2))
1646 0 : result = (len1 < len2) ? -1 : 1;
1647 : }
1648 : }
1649 :
1650 5198902 : return result;
1651 : }
1652 :
1653 : /* text_cmp()
1654 : * Internal comparison function for text strings.
1655 : * Returns -1, 0 or 1
1656 : */
1657 : static int
1658 4937186 : text_cmp(text *arg1, text *arg2, Oid collid)
1659 : {
1660 : char *a1p,
1661 : *a2p;
1662 : int len1,
1663 : len2;
1664 :
1665 4937186 : a1p = VARDATA_ANY(arg1);
1666 4937186 : a2p = VARDATA_ANY(arg2);
1667 :
1668 4937186 : len1 = VARSIZE_ANY_EXHDR(arg1);
1669 4937186 : len2 = VARSIZE_ANY_EXHDR(arg2);
1670 :
1671 4937186 : return varstr_cmp(a1p, len1, a2p, len2, collid);
1672 : }
1673 :
1674 : /*
1675 : * Comparison functions for text strings.
1676 : *
1677 : * Note: btree indexes need these routines not to leak memory; therefore,
1678 : * be careful to free working copies of toasted datums. Most places don't
1679 : * need to be so careful.
1680 : */
1681 :
1682 : Datum
1683 6181796 : texteq(PG_FUNCTION_ARGS)
1684 : {
1685 6181796 : Oid collid = PG_GET_COLLATION();
1686 6181796 : pg_locale_t mylocale = 0;
1687 : bool result;
1688 :
1689 6181796 : check_collation_set(collid);
1690 :
1691 6181796 : mylocale = pg_newlocale_from_collation(collid);
1692 :
1693 6181796 : if (mylocale->deterministic)
1694 : {
1695 6173356 : Datum arg1 = PG_GETARG_DATUM(0);
1696 6173356 : Datum arg2 = PG_GETARG_DATUM(1);
1697 : Size len1,
1698 : len2;
1699 :
1700 : /*
1701 : * Since we only care about equality or not-equality, we can avoid all
1702 : * the expense of strcoll() here, and just do bitwise comparison. In
1703 : * fact, we don't even have to do a bitwise comparison if we can show
1704 : * the lengths of the strings are unequal; which might save us from
1705 : * having to detoast one or both values.
1706 : */
1707 6173356 : len1 = toast_raw_datum_size(arg1);
1708 6173356 : len2 = toast_raw_datum_size(arg2);
1709 6173356 : if (len1 != len2)
1710 2886152 : result = false;
1711 : else
1712 : {
1713 3287204 : text *targ1 = DatumGetTextPP(arg1);
1714 3287204 : text *targ2 = DatumGetTextPP(arg2);
1715 :
1716 3287204 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1717 : len1 - VARHDRSZ) == 0);
1718 :
1719 3287204 : PG_FREE_IF_COPY(targ1, 0);
1720 3287204 : PG_FREE_IF_COPY(targ2, 1);
1721 : }
1722 : }
1723 : else
1724 : {
1725 8440 : text *arg1 = PG_GETARG_TEXT_PP(0);
1726 8440 : text *arg2 = PG_GETARG_TEXT_PP(1);
1727 :
1728 8440 : result = (text_cmp(arg1, arg2, collid) == 0);
1729 :
1730 8440 : PG_FREE_IF_COPY(arg1, 0);
1731 8440 : PG_FREE_IF_COPY(arg2, 1);
1732 : }
1733 :
1734 6181796 : PG_RETURN_BOOL(result);
1735 : }
1736 :
1737 : Datum
1738 22626 : textne(PG_FUNCTION_ARGS)
1739 : {
1740 22626 : Oid collid = PG_GET_COLLATION();
1741 : pg_locale_t mylocale;
1742 : bool result;
1743 :
1744 22626 : check_collation_set(collid);
1745 :
1746 22626 : mylocale = pg_newlocale_from_collation(collid);
1747 :
1748 22626 : if (mylocale->deterministic)
1749 : {
1750 22602 : Datum arg1 = PG_GETARG_DATUM(0);
1751 22602 : Datum arg2 = PG_GETARG_DATUM(1);
1752 : Size len1,
1753 : len2;
1754 :
1755 : /* See comment in texteq() */
1756 22602 : len1 = toast_raw_datum_size(arg1);
1757 22602 : len2 = toast_raw_datum_size(arg2);
1758 22602 : if (len1 != len2)
1759 4356 : result = true;
1760 : else
1761 : {
1762 18246 : text *targ1 = DatumGetTextPP(arg1);
1763 18246 : text *targ2 = DatumGetTextPP(arg2);
1764 :
1765 18246 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1766 : len1 - VARHDRSZ) != 0);
1767 :
1768 18246 : PG_FREE_IF_COPY(targ1, 0);
1769 18246 : PG_FREE_IF_COPY(targ2, 1);
1770 : }
1771 : }
1772 : else
1773 : {
1774 24 : text *arg1 = PG_GETARG_TEXT_PP(0);
1775 24 : text *arg2 = PG_GETARG_TEXT_PP(1);
1776 :
1777 24 : result = (text_cmp(arg1, arg2, collid) != 0);
1778 :
1779 24 : PG_FREE_IF_COPY(arg1, 0);
1780 24 : PG_FREE_IF_COPY(arg2, 1);
1781 : }
1782 :
1783 22626 : PG_RETURN_BOOL(result);
1784 : }
1785 :
1786 : Datum
1787 208762 : text_lt(PG_FUNCTION_ARGS)
1788 : {
1789 208762 : text *arg1 = PG_GETARG_TEXT_PP(0);
1790 208762 : text *arg2 = PG_GETARG_TEXT_PP(1);
1791 : bool result;
1792 :
1793 208762 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1794 :
1795 208744 : PG_FREE_IF_COPY(arg1, 0);
1796 208744 : PG_FREE_IF_COPY(arg2, 1);
1797 :
1798 208744 : PG_RETURN_BOOL(result);
1799 : }
1800 :
1801 : Datum
1802 316946 : text_le(PG_FUNCTION_ARGS)
1803 : {
1804 316946 : text *arg1 = PG_GETARG_TEXT_PP(0);
1805 316946 : text *arg2 = PG_GETARG_TEXT_PP(1);
1806 : bool result;
1807 :
1808 316946 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1809 :
1810 316946 : PG_FREE_IF_COPY(arg1, 0);
1811 316946 : PG_FREE_IF_COPY(arg2, 1);
1812 :
1813 316946 : PG_RETURN_BOOL(result);
1814 : }
1815 :
1816 : Datum
1817 195830 : text_gt(PG_FUNCTION_ARGS)
1818 : {
1819 195830 : text *arg1 = PG_GETARG_TEXT_PP(0);
1820 195830 : text *arg2 = PG_GETARG_TEXT_PP(1);
1821 : bool result;
1822 :
1823 195830 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1824 :
1825 195830 : PG_FREE_IF_COPY(arg1, 0);
1826 195830 : PG_FREE_IF_COPY(arg2, 1);
1827 :
1828 195830 : PG_RETURN_BOOL(result);
1829 : }
1830 :
1831 : Datum
1832 177896 : text_ge(PG_FUNCTION_ARGS)
1833 : {
1834 177896 : text *arg1 = PG_GETARG_TEXT_PP(0);
1835 177896 : text *arg2 = PG_GETARG_TEXT_PP(1);
1836 : bool result;
1837 :
1838 177896 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1839 :
1840 177896 : PG_FREE_IF_COPY(arg1, 0);
1841 177896 : PG_FREE_IF_COPY(arg2, 1);
1842 :
1843 177896 : PG_RETURN_BOOL(result);
1844 : }
1845 :
1846 : Datum
1847 37914 : text_starts_with(PG_FUNCTION_ARGS)
1848 : {
1849 37914 : Datum arg1 = PG_GETARG_DATUM(0);
1850 37914 : Datum arg2 = PG_GETARG_DATUM(1);
1851 37914 : Oid collid = PG_GET_COLLATION();
1852 : pg_locale_t mylocale;
1853 : bool result;
1854 : Size len1,
1855 : len2;
1856 :
1857 37914 : check_collation_set(collid);
1858 :
1859 37914 : mylocale = pg_newlocale_from_collation(collid);
1860 :
1861 37914 : if (!mylocale->deterministic)
1862 0 : ereport(ERROR,
1863 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1864 : errmsg("nondeterministic collations are not supported for substring searches")));
1865 :
1866 37914 : len1 = toast_raw_datum_size(arg1);
1867 37914 : len2 = toast_raw_datum_size(arg2);
1868 37914 : if (len2 > len1)
1869 0 : result = false;
1870 : else
1871 : {
1872 37914 : text *targ1 = text_substring(arg1, 1, len2, false);
1873 37914 : text *targ2 = DatumGetTextPP(arg2);
1874 :
1875 37914 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1876 37914 : VARSIZE_ANY_EXHDR(targ2)) == 0);
1877 :
1878 37914 : PG_FREE_IF_COPY(targ1, 0);
1879 37914 : PG_FREE_IF_COPY(targ2, 1);
1880 : }
1881 :
1882 37914 : PG_RETURN_BOOL(result);
1883 : }
1884 :
1885 : Datum
1886 3713652 : bttextcmp(PG_FUNCTION_ARGS)
1887 : {
1888 3713652 : text *arg1 = PG_GETARG_TEXT_PP(0);
1889 3713652 : text *arg2 = PG_GETARG_TEXT_PP(1);
1890 : int32 result;
1891 :
1892 3713652 : result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1893 :
1894 3713652 : PG_FREE_IF_COPY(arg1, 0);
1895 3713652 : PG_FREE_IF_COPY(arg2, 1);
1896 :
1897 3713652 : PG_RETURN_INT32(result);
1898 : }
1899 :
1900 : Datum
1901 83572 : bttextsortsupport(PG_FUNCTION_ARGS)
1902 : {
1903 83572 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1904 83572 : Oid collid = ssup->ssup_collation;
1905 : MemoryContext oldcontext;
1906 :
1907 83572 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1908 :
1909 : /* Use generic string SortSupport */
1910 83572 : varstr_sortsupport(ssup, TEXTOID, collid);
1911 :
1912 83560 : MemoryContextSwitchTo(oldcontext);
1913 :
1914 83560 : PG_RETURN_VOID();
1915 : }
1916 :
1917 : /*
1918 : * Generic sortsupport interface for character type's operator classes.
1919 : * Includes locale support, and support for BpChar semantics (i.e. removing
1920 : * trailing spaces before comparison).
1921 : *
1922 : * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1923 : * same representation. Callers that always use the C collation (e.g.
1924 : * non-collatable type callers like bytea) may have NUL bytes in their strings;
1925 : * this will not work with any other collation, though.
1926 : */
1927 : void
1928 138908 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
1929 : {
1930 138908 : bool abbreviate = ssup->abbreviate;
1931 138908 : bool collate_c = false;
1932 : VarStringSortSupport *sss;
1933 : pg_locale_t locale;
1934 :
1935 138908 : check_collation_set(collid);
1936 :
1937 138896 : locale = pg_newlocale_from_collation(collid);
1938 :
1939 : /*
1940 : * If possible, set ssup->comparator to a function which can be used to
1941 : * directly compare two datums. If we can do this, we'll avoid the
1942 : * overhead of a trip through the fmgr layer for every comparison, which
1943 : * can be substantial.
1944 : *
1945 : * Most typically, we'll set the comparator to varlenafastcmp_locale,
1946 : * which uses strcoll() to perform comparisons. We use that for the
1947 : * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1948 : * LC_COLLATE = C, we can make things quite a bit faster with
1949 : * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1950 : * memcmp() rather than strcoll().
1951 : */
1952 138896 : if (locale->collate_is_c)
1953 : {
1954 94468 : if (typid == BPCHAROID)
1955 308 : ssup->comparator = bpcharfastcmp_c;
1956 94160 : else if (typid == NAMEOID)
1957 : {
1958 54256 : ssup->comparator = namefastcmp_c;
1959 : /* Not supporting abbreviation with type NAME, for now */
1960 54256 : abbreviate = false;
1961 : }
1962 : else
1963 39904 : ssup->comparator = varstrfastcmp_c;
1964 :
1965 94468 : collate_c = true;
1966 : }
1967 : else
1968 : {
1969 : /*
1970 : * We use varlenafastcmp_locale except for type NAME.
1971 : */
1972 44428 : if (typid == NAMEOID)
1973 : {
1974 0 : ssup->comparator = namefastcmp_locale;
1975 : /* Not supporting abbreviation with type NAME, for now */
1976 0 : abbreviate = false;
1977 : }
1978 : else
1979 44428 : ssup->comparator = varlenafastcmp_locale;
1980 :
1981 : /*
1982 : * Unfortunately, it seems that abbreviation for non-C collations is
1983 : * broken on many common platforms; see pg_strxfrm_enabled().
1984 : *
1985 : * Even apart from the risk of broken locales, it's possible that
1986 : * there are platforms where the use of abbreviated keys should be
1987 : * disabled at compile time. Having only 4 byte datums could make
1988 : * worst-case performance drastically more likely, for example.
1989 : * Moreover, macOS's strxfrm() implementation is known to not
1990 : * effectively concentrate a significant amount of entropy from the
1991 : * original string in earlier transformed blobs. It's possible that
1992 : * other supported platforms are similarly encumbered. So, if we ever
1993 : * get past disabling this categorically, we may still want or need to
1994 : * disable it for particular platforms.
1995 : */
1996 44428 : if (!pg_strxfrm_enabled(locale))
1997 43638 : abbreviate = false;
1998 : }
1999 :
2000 : /*
2001 : * If we're using abbreviated keys, or if we're using a locale-aware
2002 : * comparison, we need to initialize a VarStringSortSupport object. Both
2003 : * cases will make use of the temporary buffers we initialize here for
2004 : * scratch space (and to detect requirement for BpChar semantics from
2005 : * caller), and the abbreviation case requires additional state.
2006 : */
2007 138896 : if (abbreviate || !collate_c)
2008 : {
2009 67358 : sss = palloc(sizeof(VarStringSortSupport));
2010 67358 : sss->buf1 = palloc(TEXTBUFLEN);
2011 67358 : sss->buflen1 = TEXTBUFLEN;
2012 67358 : sss->buf2 = palloc(TEXTBUFLEN);
2013 67358 : sss->buflen2 = TEXTBUFLEN;
2014 : /* Start with invalid values */
2015 67358 : sss->last_len1 = -1;
2016 67358 : sss->last_len2 = -1;
2017 : /* Initialize */
2018 67358 : sss->last_returned = 0;
2019 67358 : if (collate_c)
2020 22930 : sss->locale = NULL;
2021 : else
2022 44428 : sss->locale = locale;
2023 :
2024 : /*
2025 : * To avoid somehow confusing a strxfrm() blob and an original string,
2026 : * constantly keep track of the variety of data that buf1 and buf2
2027 : * currently contain.
2028 : *
2029 : * Comparisons may be interleaved with conversion calls. Frequently,
2030 : * conversions and comparisons are batched into two distinct phases,
2031 : * but the correctness of caching cannot hinge upon this. For
2032 : * comparison caching, buffer state is only trusted if cache_blob is
2033 : * found set to false, whereas strxfrm() caching only trusts the state
2034 : * when cache_blob is found set to true.
2035 : *
2036 : * Arbitrarily initialize cache_blob to true.
2037 : */
2038 67358 : sss->cache_blob = true;
2039 67358 : sss->collate_c = collate_c;
2040 67358 : sss->typid = typid;
2041 67358 : ssup->ssup_extra = sss;
2042 :
2043 : /*
2044 : * If possible, plan to use the abbreviated keys optimization. The
2045 : * core code may switch back to authoritative comparator should
2046 : * abbreviation be aborted.
2047 : */
2048 67358 : if (abbreviate)
2049 : {
2050 23522 : sss->prop_card = 0.20;
2051 23522 : initHyperLogLog(&sss->abbr_card, 10);
2052 23522 : initHyperLogLog(&sss->full_card, 10);
2053 23522 : ssup->abbrev_full_comparator = ssup->comparator;
2054 23522 : ssup->comparator = ssup_datum_unsigned_cmp;
2055 23522 : ssup->abbrev_converter = varstr_abbrev_convert;
2056 23522 : ssup->abbrev_abort = varstr_abbrev_abort;
2057 : }
2058 : }
2059 138896 : }
2060 :
2061 : /*
2062 : * sortsupport comparison func (for C locale case)
2063 : */
2064 : static int
2065 39114086 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2066 : {
2067 39114086 : VarString *arg1 = DatumGetVarStringPP(x);
2068 39114086 : VarString *arg2 = DatumGetVarStringPP(y);
2069 : char *a1p,
2070 : *a2p;
2071 : int len1,
2072 : len2,
2073 : result;
2074 :
2075 39114086 : a1p = VARDATA_ANY(arg1);
2076 39114086 : a2p = VARDATA_ANY(arg2);
2077 :
2078 39114086 : len1 = VARSIZE_ANY_EXHDR(arg1);
2079 39114086 : len2 = VARSIZE_ANY_EXHDR(arg2);
2080 :
2081 39114086 : result = memcmp(a1p, a2p, Min(len1, len2));
2082 39114086 : if ((result == 0) && (len1 != len2))
2083 1040690 : result = (len1 < len2) ? -1 : 1;
2084 :
2085 : /* We can't afford to leak memory here. */
2086 39114086 : if (PointerGetDatum(arg1) != x)
2087 2 : pfree(arg1);
2088 39114086 : if (PointerGetDatum(arg2) != y)
2089 2 : pfree(arg2);
2090 :
2091 39114086 : return result;
2092 : }
2093 :
2094 : /*
2095 : * sortsupport comparison func (for BpChar C locale case)
2096 : *
2097 : * BpChar outsources its sortsupport to this module. Specialization for the
2098 : * varstr_sortsupport BpChar case, modeled on
2099 : * internal_bpchar_pattern_compare().
2100 : */
2101 : static int
2102 62420 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2103 : {
2104 62420 : BpChar *arg1 = DatumGetBpCharPP(x);
2105 62420 : BpChar *arg2 = DatumGetBpCharPP(y);
2106 : char *a1p,
2107 : *a2p;
2108 : int len1,
2109 : len2,
2110 : result;
2111 :
2112 62420 : a1p = VARDATA_ANY(arg1);
2113 62420 : a2p = VARDATA_ANY(arg2);
2114 :
2115 62420 : len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2116 62420 : len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2117 :
2118 62420 : result = memcmp(a1p, a2p, Min(len1, len2));
2119 62420 : if ((result == 0) && (len1 != len2))
2120 4 : result = (len1 < len2) ? -1 : 1;
2121 :
2122 : /* We can't afford to leak memory here. */
2123 62420 : if (PointerGetDatum(arg1) != x)
2124 0 : pfree(arg1);
2125 62420 : if (PointerGetDatum(arg2) != y)
2126 0 : pfree(arg2);
2127 :
2128 62420 : return result;
2129 : }
2130 :
2131 : /*
2132 : * sortsupport comparison func (for NAME C locale case)
2133 : */
2134 : static int
2135 36486100 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2136 : {
2137 36486100 : Name arg1 = DatumGetName(x);
2138 36486100 : Name arg2 = DatumGetName(y);
2139 :
2140 36486100 : return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2141 : }
2142 :
2143 : /*
2144 : * sortsupport comparison func (for locale case with all varlena types)
2145 : */
2146 : static int
2147 34804198 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2148 : {
2149 34804198 : VarString *arg1 = DatumGetVarStringPP(x);
2150 34804198 : VarString *arg2 = DatumGetVarStringPP(y);
2151 : char *a1p,
2152 : *a2p;
2153 : int len1,
2154 : len2,
2155 : result;
2156 :
2157 34804198 : a1p = VARDATA_ANY(arg1);
2158 34804198 : a2p = VARDATA_ANY(arg2);
2159 :
2160 34804198 : len1 = VARSIZE_ANY_EXHDR(arg1);
2161 34804198 : len2 = VARSIZE_ANY_EXHDR(arg2);
2162 :
2163 34804198 : result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2164 :
2165 : /* We can't afford to leak memory here. */
2166 34804198 : if (PointerGetDatum(arg1) != x)
2167 4 : pfree(arg1);
2168 34804198 : if (PointerGetDatum(arg2) != y)
2169 4 : pfree(arg2);
2170 :
2171 34804198 : return result;
2172 : }
2173 :
2174 : /*
2175 : * sortsupport comparison func (for locale case with NAME type)
2176 : */
2177 : static int
2178 0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2179 : {
2180 0 : Name arg1 = DatumGetName(x);
2181 0 : Name arg2 = DatumGetName(y);
2182 :
2183 0 : return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2184 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2185 : ssup);
2186 : }
2187 :
2188 : /*
2189 : * sortsupport comparison func for locale cases
2190 : */
2191 : static int
2192 34804198 : varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2193 : {
2194 34804198 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2195 : int result;
2196 : bool arg1_match;
2197 :
2198 : /* Fast pre-check for equality, as discussed in varstr_cmp() */
2199 34804198 : if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2200 : {
2201 : /*
2202 : * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2203 : * last_len2. Existing contents of buffers might still be used by
2204 : * next call.
2205 : *
2206 : * It's fine to allow the comparison of BpChar padding bytes here,
2207 : * even though that implies that the memcmp() will usually be
2208 : * performed for BpChar callers (though multibyte characters could
2209 : * still prevent that from occurring). The memcmp() is still very
2210 : * cheap, and BpChar's funny semantics have us remove trailing spaces
2211 : * (not limited to padding), so we need make no distinction between
2212 : * padding space characters and "real" space characters.
2213 : */
2214 9258876 : return 0;
2215 : }
2216 :
2217 25545322 : if (sss->typid == BPCHAROID)
2218 : {
2219 : /* Get true number of bytes, ignoring trailing spaces */
2220 37998 : len1 = bpchartruelen(a1p, len1);
2221 37998 : len2 = bpchartruelen(a2p, len2);
2222 : }
2223 :
2224 25545322 : if (len1 >= sss->buflen1)
2225 : {
2226 14 : sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2227 14 : sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2228 : }
2229 25545322 : if (len2 >= sss->buflen2)
2230 : {
2231 10 : sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2232 10 : sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2233 : }
2234 :
2235 : /*
2236 : * We're likely to be asked to compare the same strings repeatedly, and
2237 : * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2238 : * comparisons, even though in general there is no reason to think that
2239 : * that will work out (every string datum may be unique). Caching does
2240 : * not slow things down measurably when it doesn't work out, and can speed
2241 : * things up by rather a lot when it does. In part, this is because the
2242 : * memcmp() compares data from cachelines that are needed in L1 cache even
2243 : * when the last comparison's result cannot be reused.
2244 : */
2245 25545322 : arg1_match = true;
2246 25545322 : if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2247 : {
2248 23588356 : arg1_match = false;
2249 23588356 : memcpy(sss->buf1, a1p, len1);
2250 23588356 : sss->buf1[len1] = '\0';
2251 23588356 : sss->last_len1 = len1;
2252 : }
2253 :
2254 : /*
2255 : * If we're comparing the same two strings as last time, we can return the
2256 : * same answer without calling strcoll() again. This is more likely than
2257 : * it seems (at least with moderate to low cardinality sets), because
2258 : * quicksort compares the same pivot against many values.
2259 : */
2260 25545322 : if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2261 : {
2262 3868636 : memcpy(sss->buf2, a2p, len2);
2263 3868636 : sss->buf2[len2] = '\0';
2264 3868636 : sss->last_len2 = len2;
2265 : }
2266 21676686 : else if (arg1_match && !sss->cache_blob)
2267 : {
2268 : /* Use result cached following last actual strcoll() call */
2269 1550758 : return sss->last_returned;
2270 : }
2271 :
2272 23994564 : result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
2273 :
2274 : /* Break tie if necessary. */
2275 23994564 : if (result == 0 && sss->locale->deterministic)
2276 0 : result = strcmp(sss->buf1, sss->buf2);
2277 :
2278 : /* Cache result, perhaps saving an expensive strcoll() call next time */
2279 23994564 : sss->cache_blob = false;
2280 23994564 : sss->last_returned = result;
2281 23994564 : return result;
2282 : }
2283 :
2284 : /*
2285 : * Conversion routine for sortsupport. Converts original to abbreviated key
2286 : * representation. Our encoding strategy is simple -- pack the first 8 bytes
2287 : * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2288 : * stored in reverse order), and treat it as an unsigned integer. When the "C"
2289 : * locale is used, or in case of bytea, just memcpy() from original instead.
2290 : */
2291 : static Datum
2292 827368 : varstr_abbrev_convert(Datum original, SortSupport ssup)
2293 : {
2294 827368 : const size_t max_prefix_bytes = sizeof(Datum);
2295 827368 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2296 827368 : VarString *authoritative = DatumGetVarStringPP(original);
2297 827368 : char *authoritative_data = VARDATA_ANY(authoritative);
2298 :
2299 : /* working state */
2300 : Datum res;
2301 : char *pres;
2302 : int len;
2303 : uint32 hash;
2304 :
2305 827368 : pres = (char *) &res;
2306 : /* memset(), so any non-overwritten bytes are NUL */
2307 827368 : memset(pres, 0, max_prefix_bytes);
2308 827368 : len = VARSIZE_ANY_EXHDR(authoritative);
2309 :
2310 : /* Get number of bytes, ignoring trailing spaces */
2311 827368 : if (sss->typid == BPCHAROID)
2312 1010 : len = bpchartruelen(authoritative_data, len);
2313 :
2314 : /*
2315 : * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2316 : * abbreviate keys. The full comparator for the C locale is always
2317 : * memcmp(). It would be incorrect to allow bytea callers (callers that
2318 : * always force the C collation -- bytea isn't a collatable type, but this
2319 : * approach is convenient) to use strxfrm(). This is because bytea
2320 : * strings may contain NUL bytes. Besides, this should be faster, too.
2321 : *
2322 : * More generally, it's okay that bytea callers can have NUL bytes in
2323 : * strings because abbreviated cmp need not make a distinction between
2324 : * terminating NUL bytes, and NUL bytes representing actual NULs in the
2325 : * authoritative representation. Hopefully a comparison at or past one
2326 : * abbreviated key's terminating NUL byte will resolve the comparison
2327 : * without consulting the authoritative representation; specifically, some
2328 : * later non-NUL byte in the longer string can resolve the comparison
2329 : * against a subsequent terminating NUL in the shorter string. There will
2330 : * usually be what is effectively a "length-wise" resolution there and
2331 : * then.
2332 : *
2333 : * If that doesn't work out -- if all bytes in the longer string
2334 : * positioned at or past the offset of the smaller string's (first)
2335 : * terminating NUL are actually representative of NUL bytes in the
2336 : * authoritative binary string (perhaps with some *terminating* NUL bytes
2337 : * towards the end of the longer string iff it happens to still be small)
2338 : * -- then an authoritative tie-breaker will happen, and do the right
2339 : * thing: explicitly consider string length.
2340 : */
2341 827368 : if (sss->collate_c)
2342 825544 : memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
2343 : else
2344 : {
2345 : Size bsize;
2346 :
2347 : /*
2348 : * We're not using the C collation, so fall back on strxfrm or ICU
2349 : * analogs.
2350 : */
2351 :
2352 : /* By convention, we use buffer 1 to store and NUL-terminate */
2353 1824 : if (len >= sss->buflen1)
2354 : {
2355 0 : sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2356 0 : sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2357 : }
2358 :
2359 : /* Might be able to reuse strxfrm() blob from last call */
2360 1824 : if (sss->last_len1 == len && sss->cache_blob &&
2361 912 : memcmp(sss->buf1, authoritative_data, len) == 0)
2362 : {
2363 168 : memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
2364 : /* No change affecting cardinality, so no hashing required */
2365 168 : goto done;
2366 : }
2367 :
2368 1656 : memcpy(sss->buf1, authoritative_data, len);
2369 :
2370 : /*
2371 : * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
2372 : */
2373 1656 : sss->buf1[len] = '\0';
2374 1656 : sss->last_len1 = len;
2375 :
2376 1656 : if (pg_strxfrm_prefix_enabled(sss->locale))
2377 : {
2378 1656 : if (sss->buflen2 < max_prefix_bytes)
2379 : {
2380 0 : sss->buflen2 = Max(max_prefix_bytes,
2381 : Min(sss->buflen2 * 2, MaxAllocSize));
2382 0 : sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2383 : }
2384 :
2385 1656 : bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
2386 : max_prefix_bytes, sss->locale);
2387 1656 : sss->last_len2 = bsize;
2388 : }
2389 : else
2390 : {
2391 : /*
2392 : * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
2393 : * again. The pg_strxfrm() function leaves the result buffer
2394 : * content undefined if the result did not fit, so we need to
2395 : * retry until everything fits, even though we only need the first
2396 : * few bytes in the end.
2397 : */
2398 : for (;;)
2399 : {
2400 0 : bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
2401 : sss->locale);
2402 :
2403 0 : sss->last_len2 = bsize;
2404 0 : if (bsize < sss->buflen2)
2405 0 : break;
2406 :
2407 : /*
2408 : * Grow buffer and retry.
2409 : */
2410 0 : sss->buflen2 = Max(bsize + 1,
2411 : Min(sss->buflen2 * 2, MaxAllocSize));
2412 0 : sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2413 : }
2414 : }
2415 :
2416 : /*
2417 : * Every Datum byte is always compared. This is safe because the
2418 : * strxfrm() blob is itself NUL terminated, leaving no danger of
2419 : * misinterpreting any NUL bytes not intended to be interpreted as
2420 : * logically representing termination.
2421 : *
2422 : * (Actually, even if there were NUL bytes in the blob it would be
2423 : * okay. See remarks on bytea case above.)
2424 : */
2425 1656 : memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
2426 : }
2427 :
2428 : /*
2429 : * Maintain approximate cardinality of both abbreviated keys and original,
2430 : * authoritative keys using HyperLogLog. Used as cheap insurance against
2431 : * the worst case, where we do many string transformations for no saving
2432 : * in full strcoll()-based comparisons. These statistics are used by
2433 : * varstr_abbrev_abort().
2434 : *
2435 : * First, Hash key proper, or a significant fraction of it. Mix in length
2436 : * in order to compensate for cases where differences are past
2437 : * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2438 : */
2439 827200 : hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2440 : Min(len, PG_CACHE_LINE_SIZE)));
2441 :
2442 827200 : if (len > PG_CACHE_LINE_SIZE)
2443 184 : hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2444 :
2445 827200 : addHyperLogLog(&sss->full_card, hash);
2446 :
2447 : /* Hash abbreviated key */
2448 : #if SIZEOF_DATUM == 8
2449 : {
2450 : uint32 lohalf,
2451 : hihalf;
2452 :
2453 827200 : lohalf = (uint32) res;
2454 827200 : hihalf = (uint32) (res >> 32);
2455 827200 : hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2456 : }
2457 : #else /* SIZEOF_DATUM != 8 */
2458 : hash = DatumGetUInt32(hash_uint32((uint32) res));
2459 : #endif
2460 :
2461 827200 : addHyperLogLog(&sss->abbr_card, hash);
2462 :
2463 : /* Cache result, perhaps saving an expensive strxfrm() call next time */
2464 827200 : sss->cache_blob = true;
2465 827368 : done:
2466 :
2467 : /*
2468 : * Byteswap on little-endian machines.
2469 : *
2470 : * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2471 : * 3-way comparator) works correctly on all platforms. If we didn't do
2472 : * this, the comparator would have to call memcmp() with a pair of
2473 : * pointers to the first byte of each abbreviated key, which is slower.
2474 : */
2475 827368 : res = DatumBigEndianToNative(res);
2476 :
2477 : /* Don't leak memory here */
2478 827368 : if (PointerGetDatum(authoritative) != original)
2479 4 : pfree(authoritative);
2480 :
2481 827368 : return res;
2482 : }
2483 :
2484 : /*
2485 : * Callback for estimating effectiveness of abbreviated key optimization, using
2486 : * heuristic rules. Returns value indicating if the abbreviation optimization
2487 : * should be aborted, based on its projected effectiveness.
2488 : */
2489 : static bool
2490 2224 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2491 : {
2492 2224 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2493 : double abbrev_distinct,
2494 : key_distinct;
2495 :
2496 : Assert(ssup->abbreviate);
2497 :
2498 : /* Have a little patience */
2499 2224 : if (memtupcount < 100)
2500 1244 : return false;
2501 :
2502 980 : abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2503 980 : key_distinct = estimateHyperLogLog(&sss->full_card);
2504 :
2505 : /*
2506 : * Clamp cardinality estimates to at least one distinct value. While
2507 : * NULLs are generally disregarded, if only NULL values were seen so far,
2508 : * that might misrepresent costs if we failed to clamp.
2509 : */
2510 980 : if (abbrev_distinct <= 1.0)
2511 0 : abbrev_distinct = 1.0;
2512 :
2513 980 : if (key_distinct <= 1.0)
2514 0 : key_distinct = 1.0;
2515 :
2516 : /*
2517 : * In the worst case all abbreviated keys are identical, while at the same
2518 : * time there are differences within full key strings not captured in
2519 : * abbreviations.
2520 : */
2521 980 : if (trace_sort)
2522 : {
2523 0 : double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2524 :
2525 0 : elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2526 : "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2527 : memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2528 : sss->prop_card);
2529 : }
2530 :
2531 : /*
2532 : * If the number of distinct abbreviated keys approximately matches the
2533 : * number of distinct authoritative original keys, that's reason enough to
2534 : * proceed. We can win even with a very low cardinality set if most
2535 : * tie-breakers only memcmp(). This is by far the most important
2536 : * consideration.
2537 : *
2538 : * While comparisons that are resolved at the abbreviated key level are
2539 : * considerably cheaper than tie-breakers resolved with memcmp(), both of
2540 : * those two outcomes are so much cheaper than a full strcoll() once
2541 : * sorting is underway that it doesn't seem worth it to weigh abbreviated
2542 : * cardinality against the overall size of the set in order to more
2543 : * accurately model costs. Assume that an abbreviated comparison, and an
2544 : * abbreviated comparison with a cheap memcmp()-based authoritative
2545 : * resolution are equivalent.
2546 : */
2547 980 : if (abbrev_distinct > key_distinct * sss->prop_card)
2548 : {
2549 : /*
2550 : * When we have exceeded 10,000 tuples, decay required cardinality
2551 : * aggressively for next call.
2552 : *
2553 : * This is useful because the number of comparisons required on
2554 : * average increases at a linearithmic rate, and at roughly 10,000
2555 : * tuples that factor will start to dominate over the linear costs of
2556 : * string transformation (this is a conservative estimate). The decay
2557 : * rate is chosen to be a little less aggressive than halving -- which
2558 : * (since we're called at points at which memtupcount has doubled)
2559 : * would never see the cost model actually abort past the first call
2560 : * following a decay. This decay rate is mostly a precaution against
2561 : * a sudden, violent swing in how well abbreviated cardinality tracks
2562 : * full key cardinality. The decay also serves to prevent a marginal
2563 : * case from being aborted too late, when too much has already been
2564 : * invested in string transformation.
2565 : *
2566 : * It's possible for sets of several million distinct strings with
2567 : * mere tens of thousands of distinct abbreviated keys to still
2568 : * benefit very significantly. This will generally occur provided
2569 : * each abbreviated key is a proxy for a roughly uniform number of the
2570 : * set's full keys. If it isn't so, we hope to catch that early and
2571 : * abort. If it isn't caught early, by the time the problem is
2572 : * apparent it's probably not worth aborting.
2573 : */
2574 980 : if (memtupcount > 10000)
2575 4 : sss->prop_card *= 0.65;
2576 :
2577 980 : return false;
2578 : }
2579 :
2580 : /*
2581 : * Abort abbreviation strategy.
2582 : *
2583 : * The worst case, where all abbreviated keys are identical while all
2584 : * original strings differ will typically only see a regression of about
2585 : * 10% in execution time for small to medium sized lists of strings.
2586 : * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2587 : * often expect very large improvements, particularly with sets of strings
2588 : * of moderately high to high abbreviated cardinality. There is little to
2589 : * lose but much to gain, which our strategy reflects.
2590 : */
2591 0 : if (trace_sort)
2592 0 : elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2593 : "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2594 : memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2595 :
2596 0 : return true;
2597 : }
2598 :
2599 : /*
2600 : * Generic equalimage support function for character type's operator classes.
2601 : * Disables the use of deduplication with nondeterministic collations.
2602 : */
2603 : Datum
2604 8374 : btvarstrequalimage(PG_FUNCTION_ARGS)
2605 : {
2606 : /* Oid opcintype = PG_GETARG_OID(0); */
2607 8374 : Oid collid = PG_GET_COLLATION();
2608 : pg_locale_t locale;
2609 :
2610 8374 : check_collation_set(collid);
2611 :
2612 8374 : locale = pg_newlocale_from_collation(collid);
2613 :
2614 8374 : PG_RETURN_BOOL(locale->deterministic);
2615 : }
2616 :
2617 : Datum
2618 229560 : text_larger(PG_FUNCTION_ARGS)
2619 : {
2620 229560 : text *arg1 = PG_GETARG_TEXT_PP(0);
2621 229560 : text *arg2 = PG_GETARG_TEXT_PP(1);
2622 : text *result;
2623 :
2624 229560 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2625 :
2626 229560 : PG_RETURN_TEXT_P(result);
2627 : }
2628 :
2629 : Datum
2630 86076 : text_smaller(PG_FUNCTION_ARGS)
2631 : {
2632 86076 : text *arg1 = PG_GETARG_TEXT_PP(0);
2633 86076 : text *arg2 = PG_GETARG_TEXT_PP(1);
2634 : text *result;
2635 :
2636 86076 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2637 :
2638 86076 : PG_RETURN_TEXT_P(result);
2639 : }
2640 :
2641 :
2642 : /*
2643 : * Cross-type comparison functions for types text and name.
2644 : */
2645 :
2646 : Datum
2647 176468 : nameeqtext(PG_FUNCTION_ARGS)
2648 : {
2649 176468 : Name arg1 = PG_GETARG_NAME(0);
2650 176468 : text *arg2 = PG_GETARG_TEXT_PP(1);
2651 176468 : size_t len1 = strlen(NameStr(*arg1));
2652 176468 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2653 176468 : Oid collid = PG_GET_COLLATION();
2654 : bool result;
2655 :
2656 176468 : check_collation_set(collid);
2657 :
2658 176468 : if (collid == C_COLLATION_OID)
2659 253840 : result = (len1 == len2 &&
2660 123050 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2661 : else
2662 45678 : result = (varstr_cmp(NameStr(*arg1), len1,
2663 45678 : VARDATA_ANY(arg2), len2,
2664 : collid) == 0);
2665 :
2666 176468 : PG_FREE_IF_COPY(arg2, 1);
2667 :
2668 176468 : PG_RETURN_BOOL(result);
2669 : }
2670 :
2671 : Datum
2672 7800 : texteqname(PG_FUNCTION_ARGS)
2673 : {
2674 7800 : text *arg1 = PG_GETARG_TEXT_PP(0);
2675 7800 : Name arg2 = PG_GETARG_NAME(1);
2676 7800 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2677 7800 : size_t len2 = strlen(NameStr(*arg2));
2678 7800 : Oid collid = PG_GET_COLLATION();
2679 : bool result;
2680 :
2681 7800 : check_collation_set(collid);
2682 :
2683 7800 : if (collid == C_COLLATION_OID)
2684 568 : result = (len1 == len2 &&
2685 182 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2686 : else
2687 7414 : result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2688 7414 : NameStr(*arg2), len2,
2689 : collid) == 0);
2690 :
2691 7800 : PG_FREE_IF_COPY(arg1, 0);
2692 :
2693 7800 : PG_RETURN_BOOL(result);
2694 : }
2695 :
2696 : Datum
2697 18 : namenetext(PG_FUNCTION_ARGS)
2698 : {
2699 18 : Name arg1 = PG_GETARG_NAME(0);
2700 18 : text *arg2 = PG_GETARG_TEXT_PP(1);
2701 18 : size_t len1 = strlen(NameStr(*arg1));
2702 18 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2703 18 : Oid collid = PG_GET_COLLATION();
2704 : bool result;
2705 :
2706 18 : check_collation_set(collid);
2707 :
2708 18 : if (collid == C_COLLATION_OID)
2709 0 : result = !(len1 == len2 &&
2710 0 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2711 : else
2712 18 : result = !(varstr_cmp(NameStr(*arg1), len1,
2713 18 : VARDATA_ANY(arg2), len2,
2714 : collid) == 0);
2715 :
2716 18 : PG_FREE_IF_COPY(arg2, 1);
2717 :
2718 18 : PG_RETURN_BOOL(result);
2719 : }
2720 :
2721 : Datum
2722 18 : textnename(PG_FUNCTION_ARGS)
2723 : {
2724 18 : text *arg1 = PG_GETARG_TEXT_PP(0);
2725 18 : Name arg2 = PG_GETARG_NAME(1);
2726 18 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2727 18 : size_t len2 = strlen(NameStr(*arg2));
2728 18 : Oid collid = PG_GET_COLLATION();
2729 : bool result;
2730 :
2731 18 : check_collation_set(collid);
2732 :
2733 18 : if (collid == C_COLLATION_OID)
2734 0 : result = !(len1 == len2 &&
2735 0 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2736 : else
2737 18 : result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2738 18 : NameStr(*arg2), len2,
2739 : collid) == 0);
2740 :
2741 18 : PG_FREE_IF_COPY(arg1, 0);
2742 :
2743 18 : PG_RETURN_BOOL(result);
2744 : }
2745 :
2746 : Datum
2747 98720 : btnametextcmp(PG_FUNCTION_ARGS)
2748 : {
2749 98720 : Name arg1 = PG_GETARG_NAME(0);
2750 98720 : text *arg2 = PG_GETARG_TEXT_PP(1);
2751 : int32 result;
2752 :
2753 197440 : result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2754 197440 : VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2755 : PG_GET_COLLATION());
2756 :
2757 98720 : PG_FREE_IF_COPY(arg2, 1);
2758 :
2759 98720 : PG_RETURN_INT32(result);
2760 : }
2761 :
2762 : Datum
2763 0 : bttextnamecmp(PG_FUNCTION_ARGS)
2764 : {
2765 0 : text *arg1 = PG_GETARG_TEXT_PP(0);
2766 0 : Name arg2 = PG_GETARG_NAME(1);
2767 : int32 result;
2768 :
2769 0 : result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2770 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2771 : PG_GET_COLLATION());
2772 :
2773 0 : PG_FREE_IF_COPY(arg1, 0);
2774 :
2775 0 : PG_RETURN_INT32(result);
2776 : }
2777 :
2778 : #define CmpCall(cmpfunc) \
2779 : DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2780 : PG_GET_COLLATION(), \
2781 : PG_GETARG_DATUM(0), \
2782 : PG_GETARG_DATUM(1)))
2783 :
2784 : Datum
2785 51462 : namelttext(PG_FUNCTION_ARGS)
2786 : {
2787 51462 : PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2788 : }
2789 :
2790 : Datum
2791 0 : nameletext(PG_FUNCTION_ARGS)
2792 : {
2793 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
2794 : }
2795 :
2796 : Datum
2797 0 : namegttext(PG_FUNCTION_ARGS)
2798 : {
2799 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
2800 : }
2801 :
2802 : Datum
2803 34868 : namegetext(PG_FUNCTION_ARGS)
2804 : {
2805 34868 : PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
2806 : }
2807 :
2808 : Datum
2809 0 : textltname(PG_FUNCTION_ARGS)
2810 : {
2811 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
2812 : }
2813 :
2814 : Datum
2815 0 : textlename(PG_FUNCTION_ARGS)
2816 : {
2817 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
2818 : }
2819 :
2820 : Datum
2821 0 : textgtname(PG_FUNCTION_ARGS)
2822 : {
2823 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
2824 : }
2825 :
2826 : Datum
2827 0 : textgename(PG_FUNCTION_ARGS)
2828 : {
2829 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
2830 : }
2831 :
2832 : #undef CmpCall
2833 :
2834 :
2835 : /*
2836 : * The following operators support character-by-character comparison
2837 : * of text datums, to allow building indexes suitable for LIKE clauses.
2838 : * Note that the regular texteq/textne comparison operators, and regular
2839 : * support functions 1 and 2 with "C" collation are assumed to be
2840 : * compatible with these!
2841 : */
2842 :
2843 : static int
2844 152158 : internal_text_pattern_compare(text *arg1, text *arg2)
2845 : {
2846 : int result;
2847 : int len1,
2848 : len2;
2849 :
2850 152158 : len1 = VARSIZE_ANY_EXHDR(arg1);
2851 152158 : len2 = VARSIZE_ANY_EXHDR(arg2);
2852 :
2853 152158 : result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2854 152158 : if (result != 0)
2855 152092 : return result;
2856 66 : else if (len1 < len2)
2857 0 : return -1;
2858 66 : else if (len1 > len2)
2859 18 : return 1;
2860 : else
2861 48 : return 0;
2862 : }
2863 :
2864 :
2865 : Datum
2866 39580 : text_pattern_lt(PG_FUNCTION_ARGS)
2867 : {
2868 39580 : text *arg1 = PG_GETARG_TEXT_PP(0);
2869 39580 : text *arg2 = PG_GETARG_TEXT_PP(1);
2870 : int result;
2871 :
2872 39580 : result = internal_text_pattern_compare(arg1, arg2);
2873 :
2874 39580 : PG_FREE_IF_COPY(arg1, 0);
2875 39580 : PG_FREE_IF_COPY(arg2, 1);
2876 :
2877 39580 : PG_RETURN_BOOL(result < 0);
2878 : }
2879 :
2880 :
2881 : Datum
2882 37510 : text_pattern_le(PG_FUNCTION_ARGS)
2883 : {
2884 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2885 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2886 : int result;
2887 :
2888 37510 : result = internal_text_pattern_compare(arg1, arg2);
2889 :
2890 37510 : PG_FREE_IF_COPY(arg1, 0);
2891 37510 : PG_FREE_IF_COPY(arg2, 1);
2892 :
2893 37510 : PG_RETURN_BOOL(result <= 0);
2894 : }
2895 :
2896 :
2897 : Datum
2898 37534 : text_pattern_ge(PG_FUNCTION_ARGS)
2899 : {
2900 37534 : text *arg1 = PG_GETARG_TEXT_PP(0);
2901 37534 : text *arg2 = PG_GETARG_TEXT_PP(1);
2902 : int result;
2903 :
2904 37534 : result = internal_text_pattern_compare(arg1, arg2);
2905 :
2906 37534 : PG_FREE_IF_COPY(arg1, 0);
2907 37534 : PG_FREE_IF_COPY(arg2, 1);
2908 :
2909 37534 : PG_RETURN_BOOL(result >= 0);
2910 : }
2911 :
2912 :
2913 : Datum
2914 37510 : text_pattern_gt(PG_FUNCTION_ARGS)
2915 : {
2916 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2917 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2918 : int result;
2919 :
2920 37510 : result = internal_text_pattern_compare(arg1, arg2);
2921 :
2922 37510 : PG_FREE_IF_COPY(arg1, 0);
2923 37510 : PG_FREE_IF_COPY(arg2, 1);
2924 :
2925 37510 : PG_RETURN_BOOL(result > 0);
2926 : }
2927 :
2928 :
2929 : Datum
2930 24 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
2931 : {
2932 24 : text *arg1 = PG_GETARG_TEXT_PP(0);
2933 24 : text *arg2 = PG_GETARG_TEXT_PP(1);
2934 : int result;
2935 :
2936 24 : result = internal_text_pattern_compare(arg1, arg2);
2937 :
2938 24 : PG_FREE_IF_COPY(arg1, 0);
2939 24 : PG_FREE_IF_COPY(arg2, 1);
2940 :
2941 24 : PG_RETURN_INT32(result);
2942 : }
2943 :
2944 :
2945 : Datum
2946 116 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2947 : {
2948 116 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2949 : MemoryContext oldcontext;
2950 :
2951 116 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2952 :
2953 : /* Use generic string SortSupport, forcing "C" collation */
2954 116 : varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
2955 :
2956 116 : MemoryContextSwitchTo(oldcontext);
2957 :
2958 116 : PG_RETURN_VOID();
2959 : }
2960 :
2961 :
2962 : /*-------------------------------------------------------------
2963 : * byteaoctetlen
2964 : *
2965 : * get the number of bytes contained in an instance of type 'bytea'
2966 : *-------------------------------------------------------------
2967 : */
2968 : Datum
2969 602 : byteaoctetlen(PG_FUNCTION_ARGS)
2970 : {
2971 602 : Datum str = PG_GETARG_DATUM(0);
2972 :
2973 : /* We need not detoast the input at all */
2974 602 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2975 : }
2976 :
2977 : /*
2978 : * byteacat -
2979 : * takes two bytea* and returns a bytea* that is the concatenation of
2980 : * the two.
2981 : *
2982 : * Cloned from textcat and modified as required.
2983 : */
2984 : Datum
2985 1522 : byteacat(PG_FUNCTION_ARGS)
2986 : {
2987 1522 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
2988 1522 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
2989 :
2990 1522 : PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2991 : }
2992 :
2993 : /*
2994 : * bytea_catenate
2995 : * Guts of byteacat(), broken out so it can be used by other functions
2996 : *
2997 : * Arguments can be in short-header form, but not compressed or out-of-line
2998 : */
2999 : static bytea *
3000 1558 : bytea_catenate(bytea *t1, bytea *t2)
3001 : {
3002 : bytea *result;
3003 : int len1,
3004 : len2,
3005 : len;
3006 : char *ptr;
3007 :
3008 1558 : len1 = VARSIZE_ANY_EXHDR(t1);
3009 1558 : len2 = VARSIZE_ANY_EXHDR(t2);
3010 :
3011 : /* paranoia ... probably should throw error instead? */
3012 1558 : if (len1 < 0)
3013 0 : len1 = 0;
3014 1558 : if (len2 < 0)
3015 0 : len2 = 0;
3016 :
3017 1558 : len = len1 + len2 + VARHDRSZ;
3018 1558 : result = (bytea *) palloc(len);
3019 :
3020 : /* Set size of result string... */
3021 1558 : SET_VARSIZE(result, len);
3022 :
3023 : /* Fill data field of result string... */
3024 1558 : ptr = VARDATA(result);
3025 1558 : if (len1 > 0)
3026 1558 : memcpy(ptr, VARDATA_ANY(t1), len1);
3027 1558 : if (len2 > 0)
3028 1540 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3029 :
3030 1558 : return result;
3031 : }
3032 :
/*
 * PG_STR_GET_BYTEA
 *		Pass the C string str_ through byteain() and yield the result as a
 *		bytea pointer.
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3035 :
3036 : /*
3037 : * bytea_substr()
3038 : * Return a substring starting at the specified position.
3039 : * Cloned from text_substr and modified as required.
3040 : *
3041 : * Input:
3042 : * - string
3043 : * - starting position (is one-based)
3044 : * - string length (optional)
3045 : *
3046 : * If the starting position is zero or less, then return from the start of the string
3047 : * adjusting the length to be consistent with the "negative start" per SQL.
3048 : * If the length is less than zero, an ERROR is thrown. If no third argument
3049 : * (length) is provided, the length to the end of the string is assumed.
3050 : */
3051 : Datum
3052 86 : bytea_substr(PG_FUNCTION_ARGS)
3053 : {
3054 86 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3055 : PG_GETARG_INT32(1),
3056 : PG_GETARG_INT32(2),
3057 : false));
3058 : }
3059 :
3060 : /*
3061 : * bytea_substr_no_len -
3062 : * Wrapper to avoid opr_sanity failure due to
3063 : * one function accepting a different number of args.
3064 : */
3065 : Datum
3066 3900 : bytea_substr_no_len(PG_FUNCTION_ARGS)
3067 : {
3068 3900 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3069 : PG_GETARG_INT32(1),
3070 : -1,
3071 : true));
3072 : }
3073 :
3074 : static bytea *
3075 4022 : bytea_substring(Datum str,
3076 : int S,
3077 : int L,
3078 : bool length_not_specified)
3079 : {
3080 : int32 S1; /* adjusted start position */
3081 : int32 L1; /* adjusted substring length */
3082 : int32 E; /* end position */
3083 :
3084 : /*
3085 : * The logic here should generally match text_substring().
3086 : */
3087 4022 : S1 = Max(S, 1);
3088 :
3089 4022 : if (length_not_specified)
3090 : {
3091 : /*
3092 : * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3093 : * end of the string if we pass it a negative value for length.
3094 : */
3095 3918 : L1 = -1;
3096 : }
3097 104 : else if (L < 0)
3098 : {
3099 : /* SQL99 says to throw an error for E < S, i.e., negative length */
3100 12 : ereport(ERROR,
3101 : (errcode(ERRCODE_SUBSTRING_ERROR),
3102 : errmsg("negative substring length not allowed")));
3103 : L1 = -1; /* silence stupider compilers */
3104 : }
3105 92 : else if (pg_add_s32_overflow(S, L, &E))
3106 : {
3107 : /*
3108 : * L could be large enough for S + L to overflow, in which case the
3109 : * substring must run to end of string.
3110 : */
3111 6 : L1 = -1;
3112 : }
3113 : else
3114 : {
3115 : /*
3116 : * A zero or negative value for the end position can happen if the
3117 : * start was negative or one. SQL99 says to return a zero-length
3118 : * string.
3119 : */
3120 86 : if (E < 1)
3121 0 : return PG_STR_GET_BYTEA("");
3122 :
3123 86 : L1 = E - S1;
3124 : }
3125 :
3126 : /*
3127 : * If the start position is past the end of the string, SQL99 says to
3128 : * return a zero-length string -- DatumGetByteaPSlice() will do that for
3129 : * us. We need only convert S1 to zero-based starting position.
3130 : */
3131 4010 : return DatumGetByteaPSlice(str, S1 - 1, L1);
3132 : }
3133 :
3134 : /*
3135 : * byteaoverlay
3136 : * Replace specified substring of first string with second
3137 : *
3138 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3139 : * This code is a direct implementation of what the standard says.
3140 : */
3141 : Datum
3142 6 : byteaoverlay(PG_FUNCTION_ARGS)
3143 : {
3144 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3145 6 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3146 6 : int sp = PG_GETARG_INT32(2); /* substring start position */
3147 6 : int sl = PG_GETARG_INT32(3); /* substring length */
3148 :
3149 6 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3150 : }
3151 :
3152 : Datum
3153 12 : byteaoverlay_no_len(PG_FUNCTION_ARGS)
3154 : {
3155 12 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3156 12 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3157 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
3158 : int sl;
3159 :
3160 12 : sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3161 12 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3162 : }
3163 :
3164 : static bytea *
3165 18 : bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3166 : {
3167 : bytea *result;
3168 : bytea *s1;
3169 : bytea *s2;
3170 : int sp_pl_sl;
3171 :
3172 : /*
3173 : * Check for possible integer-overflow cases. For negative sp, throw a
3174 : * "substring length" error because that's what should be expected
3175 : * according to the spec's definition of OVERLAY().
3176 : */
3177 18 : if (sp <= 0)
3178 0 : ereport(ERROR,
3179 : (errcode(ERRCODE_SUBSTRING_ERROR),
3180 : errmsg("negative substring length not allowed")));
3181 18 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3182 0 : ereport(ERROR,
3183 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3184 : errmsg("integer out of range")));
3185 :
3186 18 : s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3187 18 : s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3188 18 : result = bytea_catenate(s1, t2);
3189 18 : result = bytea_catenate(result, s2);
3190 :
3191 18 : return result;
3192 : }
3193 :
3194 : /*
3195 : * bit_count
3196 : */
3197 : Datum
3198 6 : bytea_bit_count(PG_FUNCTION_ARGS)
3199 : {
3200 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3201 :
3202 6 : PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3203 : }
3204 :
3205 : /*
3206 : * byteapos -
3207 : * Return the position of the specified substring.
3208 : * Implements the SQL POSITION() function.
3209 : * Cloned from textpos and modified as required.
3210 : */
3211 : Datum
3212 0 : byteapos(PG_FUNCTION_ARGS)
3213 : {
3214 0 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3215 0 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3216 : int pos;
3217 : int px,
3218 : p;
3219 : int len1,
3220 : len2;
3221 : char *p1,
3222 : *p2;
3223 :
3224 0 : len1 = VARSIZE_ANY_EXHDR(t1);
3225 0 : len2 = VARSIZE_ANY_EXHDR(t2);
3226 :
3227 0 : if (len2 <= 0)
3228 0 : PG_RETURN_INT32(1); /* result for empty pattern */
3229 :
3230 0 : p1 = VARDATA_ANY(t1);
3231 0 : p2 = VARDATA_ANY(t2);
3232 :
3233 0 : pos = 0;
3234 0 : px = (len1 - len2);
3235 0 : for (p = 0; p <= px; p++)
3236 : {
3237 0 : if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3238 : {
3239 0 : pos = p + 1;
3240 0 : break;
3241 : };
3242 0 : p1++;
3243 : };
3244 :
3245 0 : PG_RETURN_INT32(pos);
3246 : }
3247 :
3248 : /*-------------------------------------------------------------
3249 : * byteaGetByte
3250 : *
3251 : * this routine treats "bytea" as an array of bytes.
3252 : * It returns the Nth byte (a number between 0 and 255).
3253 : *-------------------------------------------------------------
3254 : */
3255 : Datum
3256 60 : byteaGetByte(PG_FUNCTION_ARGS)
3257 : {
3258 60 : bytea *v = PG_GETARG_BYTEA_PP(0);
3259 60 : int32 n = PG_GETARG_INT32(1);
3260 : int len;
3261 : int byte;
3262 :
3263 60 : len = VARSIZE_ANY_EXHDR(v);
3264 :
3265 60 : if (n < 0 || n >= len)
3266 6 : ereport(ERROR,
3267 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3268 : errmsg("index %d out of valid range, 0..%d",
3269 : n, len - 1)));
3270 :
3271 54 : byte = ((unsigned char *) VARDATA_ANY(v))[n];
3272 :
3273 54 : PG_RETURN_INT32(byte);
3274 : }
3275 :
3276 : /*-------------------------------------------------------------
3277 : * byteaGetBit
3278 : *
3279 : * This routine treats a "bytea" type like an array of bits.
3280 : * It returns the value of the Nth bit (0 or 1).
3281 : *
3282 : *-------------------------------------------------------------
3283 : */
3284 : Datum
3285 12 : byteaGetBit(PG_FUNCTION_ARGS)
3286 : {
3287 12 : bytea *v = PG_GETARG_BYTEA_PP(0);
3288 12 : int64 n = PG_GETARG_INT64(1);
3289 : int byteNo,
3290 : bitNo;
3291 : int len;
3292 : int byte;
3293 :
3294 12 : len = VARSIZE_ANY_EXHDR(v);
3295 :
3296 12 : if (n < 0 || n >= (int64) len * 8)
3297 6 : ereport(ERROR,
3298 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3299 : errmsg("index %lld out of valid range, 0..%lld",
3300 : (long long) n, (long long) len * 8 - 1)));
3301 :
3302 : /* n/8 is now known < len, so safe to cast to int */
3303 6 : byteNo = (int) (n / 8);
3304 6 : bitNo = (int) (n % 8);
3305 :
3306 6 : byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3307 :
3308 6 : if (byte & (1 << bitNo))
3309 6 : PG_RETURN_INT32(1);
3310 : else
3311 0 : PG_RETURN_INT32(0);
3312 : }
3313 :
3314 : /*-------------------------------------------------------------
3315 : * byteaSetByte
3316 : *
3317 : * Given an instance of type 'bytea' creates a new one with
3318 : * the Nth byte set to the given value.
3319 : *
3320 : *-------------------------------------------------------------
3321 : */
3322 : Datum
3323 12 : byteaSetByte(PG_FUNCTION_ARGS)
3324 : {
3325 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3326 12 : int32 n = PG_GETARG_INT32(1);
3327 12 : int32 newByte = PG_GETARG_INT32(2);
3328 : int len;
3329 :
3330 12 : len = VARSIZE(res) - VARHDRSZ;
3331 :
3332 12 : if (n < 0 || n >= len)
3333 6 : ereport(ERROR,
3334 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3335 : errmsg("index %d out of valid range, 0..%d",
3336 : n, len - 1)));
3337 :
3338 : /*
3339 : * Now set the byte.
3340 : */
3341 6 : ((unsigned char *) VARDATA(res))[n] = newByte;
3342 :
3343 6 : PG_RETURN_BYTEA_P(res);
3344 : }
3345 :
3346 : /*-------------------------------------------------------------
3347 : * byteaSetBit
3348 : *
3349 : * Given an instance of type 'bytea' creates a new one with
3350 : * the Nth bit set to the given value.
3351 : *
3352 : *-------------------------------------------------------------
3353 : */
3354 : Datum
3355 12 : byteaSetBit(PG_FUNCTION_ARGS)
3356 : {
3357 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3358 12 : int64 n = PG_GETARG_INT64(1);
3359 12 : int32 newBit = PG_GETARG_INT32(2);
3360 : int len;
3361 : int oldByte,
3362 : newByte;
3363 : int byteNo,
3364 : bitNo;
3365 :
3366 12 : len = VARSIZE(res) - VARHDRSZ;
3367 :
3368 12 : if (n < 0 || n >= (int64) len * 8)
3369 6 : ereport(ERROR,
3370 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3371 : errmsg("index %lld out of valid range, 0..%lld",
3372 : (long long) n, (long long) len * 8 - 1)));
3373 :
3374 : /* n/8 is now known < len, so safe to cast to int */
3375 6 : byteNo = (int) (n / 8);
3376 6 : bitNo = (int) (n % 8);
3377 :
3378 : /*
3379 : * sanity check!
3380 : */
3381 6 : if (newBit != 0 && newBit != 1)
3382 0 : ereport(ERROR,
3383 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3384 : errmsg("new bit must be 0 or 1")));
3385 :
3386 : /*
3387 : * Update the byte.
3388 : */
3389 6 : oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3390 :
3391 6 : if (newBit == 0)
3392 6 : newByte = oldByte & (~(1 << bitNo));
3393 : else
3394 0 : newByte = oldByte | (1 << bitNo);
3395 :
3396 6 : ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3397 :
3398 6 : PG_RETURN_BYTEA_P(res);
3399 : }
3400 :
3401 :
3402 : /* text_name()
3403 : * Converts a text type to a Name type.
3404 : */
3405 : Datum
3406 30578 : text_name(PG_FUNCTION_ARGS)
3407 : {
3408 30578 : text *s = PG_GETARG_TEXT_PP(0);
3409 : Name result;
3410 : int len;
3411 :
3412 30578 : len = VARSIZE_ANY_EXHDR(s);
3413 :
3414 : /* Truncate oversize input */
3415 30578 : if (len >= NAMEDATALEN)
3416 6 : len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3417 :
3418 : /* We use palloc0 here to ensure result is zero-padded */
3419 30578 : result = (Name) palloc0(NAMEDATALEN);
3420 30578 : memcpy(NameStr(*result), VARDATA_ANY(s), len);
3421 :
3422 30578 : PG_RETURN_NAME(result);
3423 : }
3424 :
3425 : /* name_text()
3426 : * Converts a Name type to a text type.
3427 : */
3428 : Datum
3429 647682 : name_text(PG_FUNCTION_ARGS)
3430 : {
3431 647682 : Name s = PG_GETARG_NAME(0);
3432 :
3433 647682 : PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3434 : }
3435 :
3436 :
3437 : /*
3438 : * textToQualifiedNameList - convert a text object to list of names
3439 : *
3440 : * This implements the input parsing needed by nextval() and other
3441 : * functions that take a text parameter representing a qualified name.
3442 : * We split the name at dots, downcase if not double-quoted, and
3443 : * truncate names if they're too long.
3444 : */
3445 : List *
3446 5516 : textToQualifiedNameList(text *textval)
3447 : {
3448 : char *rawname;
3449 5516 : List *result = NIL;
3450 : List *namelist;
3451 : ListCell *l;
3452 :
3453 : /* Convert to C string (handles possible detoasting). */
3454 : /* Note we rely on being able to modify rawname below. */
3455 5516 : rawname = text_to_cstring(textval);
3456 :
3457 5516 : if (!SplitIdentifierString(rawname, '.', &namelist))
3458 0 : ereport(ERROR,
3459 : (errcode(ERRCODE_INVALID_NAME),
3460 : errmsg("invalid name syntax")));
3461 :
3462 5516 : if (namelist == NIL)
3463 0 : ereport(ERROR,
3464 : (errcode(ERRCODE_INVALID_NAME),
3465 : errmsg("invalid name syntax")));
3466 :
3467 11250 : foreach(l, namelist)
3468 : {
3469 5734 : char *curname = (char *) lfirst(l);
3470 :
3471 5734 : result = lappend(result, makeString(pstrdup(curname)));
3472 : }
3473 :
3474 5516 : pfree(rawname);
3475 5516 : list_free(namelist);
3476 :
3477 5516 : return result;
3478 : }
3479 :
3480 : /*
3481 : * SplitIdentifierString --- parse a string containing identifiers
3482 : *
3483 : * This is the guts of textToQualifiedNameList, and is exported for use in
3484 : * other situations such as parsing GUC variables. In the GUC case, it's
3485 : * important to avoid memory leaks, so the API is designed to minimize the
3486 : * amount of stuff that needs to be allocated and freed.
3487 : *
3488 : * Inputs:
3489 : * rawstring: the input string; must be overwritable! On return, it's
3490 : * been modified to contain the separated identifiers.
3491 : * separator: the separator punctuation expected between identifiers
3492 : * (typically '.' or ','). Whitespace may also appear around
3493 : * identifiers.
3494 : * Outputs:
3495 : * namelist: filled with a palloc'd list of pointers to identifiers within
3496 : * rawstring. Caller should list_free() this even on error return.
3497 : *
3498 : * Returns true if okay, false if there is a syntax error in the string.
3499 : *
3500 : * Note that an empty string is considered okay here, though not in
3501 : * textToQualifiedNameList.
3502 : */
3503 : bool
3504 266418 : SplitIdentifierString(char *rawstring, char separator,
3505 : List **namelist)
3506 : {
3507 266418 : char *nextp = rawstring;
3508 266418 : bool done = false;
3509 :
3510 266418 : *namelist = NIL;
3511 :
3512 266424 : while (scanner_isspace(*nextp))
3513 6 : nextp++; /* skip leading whitespace */
3514 :
3515 266418 : if (*nextp == '\0')
3516 31868 : return true; /* allow empty string */
3517 :
3518 : /* At the top of the loop, we are at start of a new identifier. */
3519 : do
3520 : {
3521 : char *curname;
3522 : char *endp;
3523 :
3524 429220 : if (*nextp == '"')
3525 : {
3526 : /* Quoted name --- collapse quote-quote pairs, no downcasing */
3527 43854 : curname = nextp + 1;
3528 : for (;;)
3529 : {
3530 43858 : endp = strchr(nextp + 1, '"');
3531 43856 : if (endp == NULL)
3532 0 : return false; /* mismatched quotes */
3533 43856 : if (endp[1] != '"')
3534 43854 : break; /* found end of quoted name */
3535 : /* Collapse adjacent quotes into one quote, and look again */
3536 2 : memmove(endp, endp + 1, strlen(endp));
3537 2 : nextp = endp;
3538 : }
3539 : /* endp now points at the terminating quote */
3540 43854 : nextp = endp + 1;
3541 : }
3542 : else
3543 : {
3544 : /* Unquoted name --- extends to separator or whitespace */
3545 : char *downname;
3546 : int len;
3547 :
3548 385366 : curname = nextp;
3549 3477460 : while (*nextp && *nextp != separator &&
3550 3092096 : !scanner_isspace(*nextp))
3551 3092094 : nextp++;
3552 385366 : endp = nextp;
3553 385366 : if (curname == nextp)
3554 0 : return false; /* empty unquoted name not allowed */
3555 :
3556 : /*
3557 : * Downcase the identifier, using same code as main lexer does.
3558 : *
3559 : * XXX because we want to overwrite the input in-place, we cannot
3560 : * support a downcasing transformation that increases the string
3561 : * length. This is not a problem given the current implementation
3562 : * of downcase_truncate_identifier, but we'll probably have to do
3563 : * something about this someday.
3564 : */
3565 385366 : len = endp - curname;
3566 385366 : downname = downcase_truncate_identifier(curname, len, false);
3567 : Assert(strlen(downname) <= len);
3568 385366 : strncpy(curname, downname, len); /* strncpy is required here */
3569 385366 : pfree(downname);
3570 : }
3571 :
3572 429222 : while (scanner_isspace(*nextp))
3573 2 : nextp++; /* skip trailing whitespace */
3574 :
3575 429220 : if (*nextp == separator)
3576 : {
3577 194670 : nextp++;
3578 361920 : while (scanner_isspace(*nextp))
3579 167250 : nextp++; /* skip leading whitespace for next */
3580 : /* we expect another name, so done remains false */
3581 : }
3582 234550 : else if (*nextp == '\0')
3583 234548 : done = true;
3584 : else
3585 2 : return false; /* invalid syntax */
3586 :
3587 : /* Now safe to overwrite separator with a null */
3588 429218 : *endp = '\0';
3589 :
3590 : /* Truncate name if it's overlength */
3591 429218 : truncate_identifier(curname, strlen(curname), false);
3592 :
3593 : /*
3594 : * Finished isolating current name --- add it to list
3595 : */
3596 429218 : *namelist = lappend(*namelist, curname);
3597 :
3598 : /* Loop back if we didn't reach end of string */
3599 429218 : } while (!done);
3600 :
3601 234548 : return true;
3602 : }
3603 :
3604 :
3605 : /*
3606 : * SplitDirectoriesString --- parse a string containing file/directory names
3607 : *
3608 : * This works fine on file names too; the function name is historical.
3609 : *
3610 : * This is similar to SplitIdentifierString, except that the parsing
3611 : * rules are meant to handle pathnames instead of identifiers: there is
3612 : * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3613 : * and we apply canonicalize_path() to each extracted string. Because of the
3614 : * last, the returned strings are separately palloc'd rather than being
3615 : * pointers into rawstring --- but we still scribble on rawstring.
3616 : *
3617 : * Inputs:
3618 : * rawstring: the input string; must be modifiable!
3619 : * separator: the separator punctuation expected between directories
3620 : * (typically ',' or ';'). Whitespace may also appear around
3621 : * directories.
3622 : * Outputs:
3623 : * namelist: filled with a palloc'd list of directory names.
3624 : * Caller should list_free_deep() this even on error return.
3625 : *
3626 : * Returns true if okay, false if there is a syntax error in the string.
3627 : *
3628 : * Note that an empty string is considered okay here.
3629 : */
3630 : bool
3631 1614 : SplitDirectoriesString(char *rawstring, char separator,
3632 : List **namelist)
3633 : {
3634 1614 : char *nextp = rawstring;
3635 1614 : bool done = false;
3636 :
3637 1614 : *namelist = NIL;
3638 :
3639 1614 : while (scanner_isspace(*nextp))
3640 0 : nextp++; /* skip leading whitespace */
3641 :
3642 1614 : if (*nextp == '\0')
3643 2 : return true; /* allow empty string */
3644 :
3645 : /* At the top of the loop, we are at start of a new directory. */
3646 : do
3647 : {
3648 : char *curname;
3649 : char *endp;
3650 :
3651 1614 : if (*nextp == '"')
3652 : {
3653 : /* Quoted name --- collapse quote-quote pairs */
3654 0 : curname = nextp + 1;
3655 : for (;;)
3656 : {
3657 0 : endp = strchr(nextp + 1, '"');
3658 0 : if (endp == NULL)
3659 0 : return false; /* mismatched quotes */
3660 0 : if (endp[1] != '"')
3661 0 : break; /* found end of quoted name */
3662 : /* Collapse adjacent quotes into one quote, and look again */
3663 0 : memmove(endp, endp + 1, strlen(endp));
3664 0 : nextp = endp;
3665 : }
3666 : /* endp now points at the terminating quote */
3667 0 : nextp = endp + 1;
3668 : }
3669 : else
3670 : {
3671 : /* Unquoted name --- extends to separator or end of string */
3672 1614 : curname = endp = nextp;
3673 27050 : while (*nextp && *nextp != separator)
3674 : {
3675 : /* trailing whitespace should not be included in name */
3676 25436 : if (!scanner_isspace(*nextp))
3677 25436 : endp = nextp + 1;
3678 25436 : nextp++;
3679 : }
3680 1614 : if (curname == endp)
3681 0 : return false; /* empty unquoted name not allowed */
3682 : }
3683 :
3684 1614 : while (scanner_isspace(*nextp))
3685 0 : nextp++; /* skip trailing whitespace */
3686 :
3687 1614 : if (*nextp == separator)
3688 : {
3689 2 : nextp++;
3690 2 : while (scanner_isspace(*nextp))
3691 0 : nextp++; /* skip leading whitespace for next */
3692 : /* we expect another name, so done remains false */
3693 : }
3694 1612 : else if (*nextp == '\0')
3695 1612 : done = true;
3696 : else
3697 0 : return false; /* invalid syntax */
3698 :
3699 : /* Now safe to overwrite separator with a null */
3700 1614 : *endp = '\0';
3701 :
3702 : /* Truncate path if it's overlength */
3703 1614 : if (strlen(curname) >= MAXPGPATH)
3704 0 : curname[MAXPGPATH - 1] = '\0';
3705 :
3706 : /*
3707 : * Finished isolating current name --- add it to list
3708 : */
3709 1614 : curname = pstrdup(curname);
3710 1614 : canonicalize_path(curname);
3711 1614 : *namelist = lappend(*namelist, curname);
3712 :
3713 : /* Loop back if we didn't reach end of string */
3714 1614 : } while (!done);
3715 :
3716 1612 : return true;
3717 : }
3718 :
3719 :
3720 : /*
3721 : * SplitGUCList --- parse a string containing identifiers or file names
3722 : *
3723 : * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3724 : * presuming whether the elements will be taken as identifiers or file names.
3725 : * We assume the input has already been through flatten_set_variable_args(),
3726 : * so that we need never downcase (if appropriate, that was done already).
3727 : * Nor do we ever truncate, since we don't know the correct max length.
3728 : * We disallow embedded whitespace for simplicity (it shouldn't matter,
3729 : * because any embedded whitespace should have led to double-quoting).
3730 : * Otherwise the API is identical to SplitIdentifierString.
3731 : *
3732 : * XXX it's annoying to have so many copies of this string-splitting logic.
3733 : * However, it's not clear that having one function with a bunch of option
3734 : * flags would be much better.
3735 : *
3736 : * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3737 : * Be sure to update that if you have to change this.
3738 : *
3739 : * Inputs:
3740 : * rawstring: the input string; must be overwritable! On return, it's
3741 : * been modified to contain the separated identifiers.
3742 : * separator: the separator punctuation expected between identifiers
3743 : * (typically '.' or ','). Whitespace may also appear around
3744 : * identifiers.
3745 : * Outputs:
3746 : * namelist: filled with a palloc'd list of pointers to identifiers within
3747 : * rawstring. Caller should list_free() this even on error return.
3748 : *
3749 : * Returns true if okay, false if there is a syntax error in the string.
3750 : */
3751 : bool
3752 3554 : SplitGUCList(char *rawstring, char separator,
3753 : List **namelist)
3754 : {
3755 3554 : char *nextp = rawstring;
3756 3554 : bool done = false;
3757 :
3758 3554 : *namelist = NIL;
3759 :
3760 3554 : while (scanner_isspace(*nextp))
3761 0 : nextp++; /* skip leading whitespace */
3762 :
3763 3554 : if (*nextp == '\0')
3764 3480 : return true; /* allow empty string */
3765 :
3766 : /* At the top of the loop, we are at start of a new identifier. */
3767 : do
3768 : {
3769 : char *curname;
3770 : char *endp;
3771 :
3772 100 : if (*nextp == '"')
3773 : {
3774 : /* Quoted name --- collapse quote-quote pairs */
3775 24 : curname = nextp + 1;
3776 : for (;;)
3777 : {
3778 36 : endp = strchr(nextp + 1, '"');
3779 30 : if (endp == NULL)
3780 0 : return false; /* mismatched quotes */
3781 30 : if (endp[1] != '"')
3782 24 : break; /* found end of quoted name */
3783 : /* Collapse adjacent quotes into one quote, and look again */
3784 6 : memmove(endp, endp + 1, strlen(endp));
3785 6 : nextp = endp;
3786 : }
3787 : /* endp now points at the terminating quote */
3788 24 : nextp = endp + 1;
3789 : }
3790 : else
3791 : {
3792 : /* Unquoted name --- extends to separator or whitespace */
3793 76 : curname = nextp;
3794 718 : while (*nextp && *nextp != separator &&
3795 642 : !scanner_isspace(*nextp))
3796 642 : nextp++;
3797 76 : endp = nextp;
3798 76 : if (curname == nextp)
3799 0 : return false; /* empty unquoted name not allowed */
3800 : }
3801 :
3802 100 : while (scanner_isspace(*nextp))
3803 0 : nextp++; /* skip trailing whitespace */
3804 :
3805 100 : if (*nextp == separator)
3806 : {
3807 26 : nextp++;
3808 44 : while (scanner_isspace(*nextp))
3809 18 : nextp++; /* skip leading whitespace for next */
3810 : /* we expect another name, so done remains false */
3811 : }
3812 74 : else if (*nextp == '\0')
3813 74 : done = true;
3814 : else
3815 0 : return false; /* invalid syntax */
3816 :
3817 : /* Now safe to overwrite separator with a null */
3818 100 : *endp = '\0';
3819 :
3820 : /*
3821 : * Finished isolating current name --- add it to list
3822 : */
3823 100 : *namelist = lappend(*namelist, curname);
3824 :
3825 : /* Loop back if we didn't reach end of string */
3826 100 : } while (!done);
3827 :
3828 74 : return true;
3829 : }
3830 :
3831 :
3832 : /*****************************************************************************
3833 : * Comparison Functions used for bytea
3834 : *
3835 : * Note: btree indexes need these routines not to leak memory; therefore,
3836 : * be careful to free working copies of toasted datums. Most places don't
3837 : * need to be so careful.
3838 : *****************************************************************************/
3839 :
3840 : Datum
3841 10390 : byteaeq(PG_FUNCTION_ARGS)
3842 : {
3843 10390 : Datum arg1 = PG_GETARG_DATUM(0);
3844 10390 : Datum arg2 = PG_GETARG_DATUM(1);
3845 : bool result;
3846 : Size len1,
3847 : len2;
3848 :
3849 : /*
3850 : * We can use a fast path for unequal lengths, which might save us from
3851 : * having to detoast one or both values.
3852 : */
3853 10390 : len1 = toast_raw_datum_size(arg1);
3854 10390 : len2 = toast_raw_datum_size(arg2);
3855 10390 : if (len1 != len2)
3856 4316 : result = false;
3857 : else
3858 : {
3859 6074 : bytea *barg1 = DatumGetByteaPP(arg1);
3860 6074 : bytea *barg2 = DatumGetByteaPP(arg2);
3861 :
3862 6074 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3863 : len1 - VARHDRSZ) == 0);
3864 :
3865 6074 : PG_FREE_IF_COPY(barg1, 0);
3866 6074 : PG_FREE_IF_COPY(barg2, 1);
3867 : }
3868 :
3869 10390 : PG_RETURN_BOOL(result);
3870 : }
3871 :
3872 : Datum
3873 768 : byteane(PG_FUNCTION_ARGS)
3874 : {
3875 768 : Datum arg1 = PG_GETARG_DATUM(0);
3876 768 : Datum arg2 = PG_GETARG_DATUM(1);
3877 : bool result;
3878 : Size len1,
3879 : len2;
3880 :
3881 : /*
3882 : * We can use a fast path for unequal lengths, which might save us from
3883 : * having to detoast one or both values.
3884 : */
3885 768 : len1 = toast_raw_datum_size(arg1);
3886 768 : len2 = toast_raw_datum_size(arg2);
3887 768 : if (len1 != len2)
3888 0 : result = true;
3889 : else
3890 : {
3891 768 : bytea *barg1 = DatumGetByteaPP(arg1);
3892 768 : bytea *barg2 = DatumGetByteaPP(arg2);
3893 :
3894 768 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3895 : len1 - VARHDRSZ) != 0);
3896 :
3897 768 : PG_FREE_IF_COPY(barg1, 0);
3898 768 : PG_FREE_IF_COPY(barg2, 1);
3899 : }
3900 :
3901 768 : PG_RETURN_BOOL(result);
3902 : }
3903 :
3904 : Datum
3905 8316 : bytealt(PG_FUNCTION_ARGS)
3906 : {
3907 8316 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3908 8316 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3909 : int len1,
3910 : len2;
3911 : int cmp;
3912 :
3913 8316 : len1 = VARSIZE_ANY_EXHDR(arg1);
3914 8316 : len2 = VARSIZE_ANY_EXHDR(arg2);
3915 :
3916 8316 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3917 :
3918 8316 : PG_FREE_IF_COPY(arg1, 0);
3919 8316 : PG_FREE_IF_COPY(arg2, 1);
3920 :
3921 8316 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3922 : }
3923 :
3924 : Datum
3925 6356 : byteale(PG_FUNCTION_ARGS)
3926 : {
3927 6356 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3928 6356 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3929 : int len1,
3930 : len2;
3931 : int cmp;
3932 :
3933 6356 : len1 = VARSIZE_ANY_EXHDR(arg1);
3934 6356 : len2 = VARSIZE_ANY_EXHDR(arg2);
3935 :
3936 6356 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3937 :
3938 6356 : PG_FREE_IF_COPY(arg1, 0);
3939 6356 : PG_FREE_IF_COPY(arg2, 1);
3940 :
3941 6356 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3942 : }
3943 :
3944 : Datum
3945 6228 : byteagt(PG_FUNCTION_ARGS)
3946 : {
3947 6228 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3948 6228 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3949 : int len1,
3950 : len2;
3951 : int cmp;
3952 :
3953 6228 : len1 = VARSIZE_ANY_EXHDR(arg1);
3954 6228 : len2 = VARSIZE_ANY_EXHDR(arg2);
3955 :
3956 6228 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3957 :
3958 6228 : PG_FREE_IF_COPY(arg1, 0);
3959 6228 : PG_FREE_IF_COPY(arg2, 1);
3960 :
3961 6228 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3962 : }
3963 :
3964 : Datum
3965 5010 : byteage(PG_FUNCTION_ARGS)
3966 : {
3967 5010 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3968 5010 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3969 : int len1,
3970 : len2;
3971 : int cmp;
3972 :
3973 5010 : len1 = VARSIZE_ANY_EXHDR(arg1);
3974 5010 : len2 = VARSIZE_ANY_EXHDR(arg2);
3975 :
3976 5010 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3977 :
3978 5010 : PG_FREE_IF_COPY(arg1, 0);
3979 5010 : PG_FREE_IF_COPY(arg2, 1);
3980 :
3981 5010 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3982 : }
3983 :
3984 : Datum
3985 87420 : byteacmp(PG_FUNCTION_ARGS)
3986 : {
3987 87420 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3988 87420 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3989 : int len1,
3990 : len2;
3991 : int cmp;
3992 :
3993 87420 : len1 = VARSIZE_ANY_EXHDR(arg1);
3994 87420 : len2 = VARSIZE_ANY_EXHDR(arg2);
3995 :
3996 87420 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3997 87420 : if ((cmp == 0) && (len1 != len2))
3998 14506 : cmp = (len1 < len2) ? -1 : 1;
3999 :
4000 87420 : PG_FREE_IF_COPY(arg1, 0);
4001 87420 : PG_FREE_IF_COPY(arg2, 1);
4002 :
4003 87420 : PG_RETURN_INT32(cmp);
4004 : }
4005 :
4006 : Datum
4007 24 : bytea_larger(PG_FUNCTION_ARGS)
4008 : {
4009 24 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4010 24 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4011 : bytea *result;
4012 : int len1,
4013 : len2;
4014 : int cmp;
4015 :
4016 24 : len1 = VARSIZE_ANY_EXHDR(arg1);
4017 24 : len2 = VARSIZE_ANY_EXHDR(arg2);
4018 :
4019 24 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4020 24 : result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2);
4021 :
4022 24 : PG_RETURN_BYTEA_P(result);
4023 : }
4024 :
4025 : Datum
4026 24 : bytea_smaller(PG_FUNCTION_ARGS)
4027 : {
4028 24 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4029 24 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4030 : bytea *result;
4031 : int len1,
4032 : len2;
4033 : int cmp;
4034 :
4035 24 : len1 = VARSIZE_ANY_EXHDR(arg1);
4036 24 : len2 = VARSIZE_ANY_EXHDR(arg2);
4037 :
4038 24 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4039 24 : result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? arg1 : arg2);
4040 :
4041 24 : PG_RETURN_BYTEA_P(result);
4042 : }
4043 :
4044 : Datum
4045 32 : bytea_sortsupport(PG_FUNCTION_ARGS)
4046 : {
4047 32 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4048 : MemoryContext oldcontext;
4049 :
4050 32 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4051 :
4052 : /* Use generic string SortSupport, forcing "C" collation */
4053 32 : varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4054 :
4055 32 : MemoryContextSwitchTo(oldcontext);
4056 :
4057 32 : PG_RETURN_VOID();
4058 : }
4059 :
4060 : /*
4061 : * appendStringInfoText
4062 : *
4063 : * Append a text to str.
4064 : * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4065 : */
4066 : static void
4067 1706784 : appendStringInfoText(StringInfo str, const text *t)
4068 : {
4069 1706784 : appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4070 1706784 : }
4071 :
4072 : /*
4073 : * replace_text
4074 : * replace all occurrences of 'old_sub_str' in 'orig_str'
4075 : * with 'new_sub_str' to form 'new_str'
4076 : *
4077 : * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4078 : * otherwise returns 'new_str'
4079 : */
4080 : Datum
4081 1282 : replace_text(PG_FUNCTION_ARGS)
4082 : {
4083 1282 : text *src_text = PG_GETARG_TEXT_PP(0);
4084 1282 : text *from_sub_text = PG_GETARG_TEXT_PP(1);
4085 1282 : text *to_sub_text = PG_GETARG_TEXT_PP(2);
4086 : int src_text_len;
4087 : int from_sub_text_len;
4088 : TextPositionState state;
4089 : text *ret_text;
4090 : int chunk_len;
4091 : char *curr_ptr;
4092 : char *start_ptr;
4093 : StringInfoData str;
4094 : bool found;
4095 :
4096 1282 : src_text_len = VARSIZE_ANY_EXHDR(src_text);
4097 1282 : from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4098 :
4099 : /* Return unmodified source string if empty source or pattern */
4100 1282 : if (src_text_len < 1 || from_sub_text_len < 1)
4101 : {
4102 0 : PG_RETURN_TEXT_P(src_text);
4103 : }
4104 :
4105 1282 : text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4106 :
4107 1282 : found = text_position_next(&state);
4108 :
4109 : /* When the from_sub_text is not found, there is nothing to do. */
4110 1282 : if (!found)
4111 : {
4112 292 : text_position_cleanup(&state);
4113 292 : PG_RETURN_TEXT_P(src_text);
4114 : }
4115 990 : curr_ptr = text_position_get_match_ptr(&state);
4116 990 : start_ptr = VARDATA_ANY(src_text);
4117 :
4118 990 : initStringInfo(&str);
4119 :
4120 : do
4121 : {
4122 5162 : CHECK_FOR_INTERRUPTS();
4123 :
4124 : /* copy the data skipped over by last text_position_next() */
4125 5162 : chunk_len = curr_ptr - start_ptr;
4126 5162 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4127 :
4128 5162 : appendStringInfoText(&str, to_sub_text);
4129 :
4130 5162 : start_ptr = curr_ptr + state.last_match_len;
4131 :
4132 5162 : found = text_position_next(&state);
4133 5162 : if (found)
4134 4172 : curr_ptr = text_position_get_match_ptr(&state);
4135 : }
4136 5162 : while (found);
4137 :
4138 : /* copy trailing data */
4139 990 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4140 990 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4141 :
4142 990 : text_position_cleanup(&state);
4143 :
4144 990 : ret_text = cstring_to_text_with_len(str.data, str.len);
4145 990 : pfree(str.data);
4146 :
4147 990 : PG_RETURN_TEXT_P(ret_text);
4148 : }
4149 :
4150 : /*
4151 : * check_replace_text_has_escape
4152 : *
4153 : * Returns 0 if text contains no backslashes that need processing.
4154 : * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4155 : * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4156 : */
4157 : static int
4158 12950 : check_replace_text_has_escape(const text *replace_text)
4159 : {
4160 12950 : int result = 0;
4161 12950 : const char *p = VARDATA_ANY(replace_text);
4162 12950 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4163 :
4164 12994 : while (p < p_end)
4165 : {
4166 : /* Find next escape char, if any. */
4167 11982 : p = memchr(p, '\\', p_end - p);
4168 11982 : if (p == NULL)
4169 11160 : break;
4170 822 : p++;
4171 : /* Note: a backslash at the end doesn't require extra processing. */
4172 822 : if (p < p_end)
4173 : {
4174 822 : if (*p >= '1' && *p <= '9')
4175 778 : return 2; /* Found a submatch specifier, so done */
4176 44 : result = 1; /* Found some other sequence, keep looking */
4177 44 : p++;
4178 : }
4179 : }
4180 12172 : return result;
4181 : }
4182 :
4183 : /*
4184 : * appendStringInfoRegexpSubstr
4185 : *
4186 : * Append replace_text to str, substituting regexp back references for
4187 : * \n escapes. start_ptr is the start of the match in the source string,
4188 : * at logical character position data_pos.
4189 : */
4190 : static void
4191 236 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4192 : regmatch_t *pmatch,
4193 : char *start_ptr, int data_pos)
4194 : {
4195 236 : const char *p = VARDATA_ANY(replace_text);
4196 236 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4197 :
4198 574 : while (p < p_end)
4199 : {
4200 518 : const char *chunk_start = p;
4201 : int so;
4202 : int eo;
4203 :
4204 : /* Find next escape char, if any. */
4205 518 : p = memchr(p, '\\', p_end - p);
4206 518 : if (p == NULL)
4207 174 : p = p_end;
4208 :
4209 : /* Copy the text we just scanned over, if any. */
4210 518 : if (p > chunk_start)
4211 318 : appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4212 :
4213 : /* Done if at end of string, else advance over escape char. */
4214 518 : if (p >= p_end)
4215 174 : break;
4216 344 : p++;
4217 :
4218 344 : if (p >= p_end)
4219 : {
4220 : /* Escape at very end of input. Treat same as unexpected char */
4221 6 : appendStringInfoChar(str, '\\');
4222 6 : break;
4223 : }
4224 :
4225 338 : if (*p >= '1' && *p <= '9')
4226 278 : {
4227 : /* Use the back reference of regexp. */
4228 278 : int idx = *p - '0';
4229 :
4230 278 : so = pmatch[idx].rm_so;
4231 278 : eo = pmatch[idx].rm_eo;
4232 278 : p++;
4233 : }
4234 60 : else if (*p == '&')
4235 : {
4236 : /* Use the entire matched string. */
4237 18 : so = pmatch[0].rm_so;
4238 18 : eo = pmatch[0].rm_eo;
4239 18 : p++;
4240 : }
4241 42 : else if (*p == '\\')
4242 : {
4243 : /* \\ means transfer one \ to output. */
4244 36 : appendStringInfoChar(str, '\\');
4245 36 : p++;
4246 36 : continue;
4247 : }
4248 : else
4249 : {
4250 : /*
4251 : * If escape char is not followed by any expected char, just treat
4252 : * it as ordinary data to copy. (XXX would it be better to throw
4253 : * an error?)
4254 : */
4255 6 : appendStringInfoChar(str, '\\');
4256 6 : continue;
4257 : }
4258 :
4259 296 : if (so >= 0 && eo >= 0)
4260 : {
4261 : /*
4262 : * Copy the text that is back reference of regexp. Note so and eo
4263 : * are counted in characters not bytes.
4264 : */
4265 : char *chunk_start;
4266 : int chunk_len;
4267 :
4268 : Assert(so >= data_pos);
4269 296 : chunk_start = start_ptr;
4270 296 : chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4271 296 : chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4272 296 : appendBinaryStringInfo(str, chunk_start, chunk_len);
4273 : }
4274 : }
4275 236 : }
4276 :
4277 : /*
4278 : * replace_text_regexp
4279 : *
4280 : * replace substring(s) in src_text that match pattern with replace_text.
4281 : * The replace_text can contain backslash markers to substitute
4282 : * (parts of) the matched text.
4283 : *
4284 : * cflags: regexp compile flags.
4285 : * collation: collation to use.
4286 : * search_start: the character (not byte) offset in src_text at which to
4287 : * begin searching.
4288 : * n: if 0, replace all matches; if > 0, replace only the N'th match.
4289 : */
4290 : text *
4291 12950 : replace_text_regexp(text *src_text, text *pattern_text,
4292 : text *replace_text,
4293 : int cflags, Oid collation,
4294 : int search_start, int n)
4295 : {
4296 : text *ret_text;
4297 : regex_t *re;
4298 12950 : int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4299 12950 : int nmatches = 0;
4300 : StringInfoData buf;
4301 : regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4302 12950 : int nmatch = lengthof(pmatch);
4303 : pg_wchar *data;
4304 : size_t data_len;
4305 : int data_pos;
4306 : char *start_ptr;
4307 : int escape_status;
4308 :
4309 12950 : initStringInfo(&buf);
4310 :
4311 : /* Convert data string to wide characters. */
4312 12950 : data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4313 12950 : data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4314 :
4315 : /* Check whether replace_text has escapes, especially regexp submatches. */
4316 12950 : escape_status = check_replace_text_has_escape(replace_text);
4317 :
4318 : /* If no regexp submatches, we can use REG_NOSUB. */
4319 12950 : if (escape_status < 2)
4320 : {
4321 12172 : cflags |= REG_NOSUB;
4322 : /* Also tell pg_regexec we only want the whole-match location. */
4323 12172 : nmatch = 1;
4324 : }
4325 :
4326 : /* Prepare the regexp. */
4327 12950 : re = RE_compile_and_cache(pattern_text, cflags, collation);
4328 :
4329 : /* start_ptr points to the data_pos'th character of src_text */
4330 12950 : start_ptr = (char *) VARDATA_ANY(src_text);
4331 12950 : data_pos = 0;
4332 :
4333 18676 : while (search_start <= data_len)
4334 : {
4335 : int regexec_result;
4336 :
4337 18670 : CHECK_FOR_INTERRUPTS();
4338 :
4339 18670 : regexec_result = pg_regexec(re,
4340 : data,
4341 : data_len,
4342 : search_start,
4343 : NULL, /* no details */
4344 : nmatch,
4345 : pmatch,
4346 : 0);
4347 :
4348 18670 : if (regexec_result == REG_NOMATCH)
4349 11240 : break;
4350 :
4351 7430 : if (regexec_result != REG_OKAY)
4352 : {
4353 : char errMsg[100];
4354 :
4355 0 : pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4356 0 : ereport(ERROR,
4357 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4358 : errmsg("regular expression failed: %s", errMsg)));
4359 : }
4360 :
4361 : /*
4362 : * Count matches, and decide whether to replace this match.
4363 : */
4364 7430 : nmatches++;
4365 7430 : if (n > 0 && nmatches != n)
4366 : {
4367 : /*
4368 : * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4369 : * we treat the matched text as if it weren't matched, and copy it
4370 : * to the output later.)
4371 : */
4372 60 : search_start = pmatch[0].rm_eo;
4373 60 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4374 0 : search_start++;
4375 60 : continue;
4376 : }
4377 :
4378 : /*
4379 : * Copy the text to the left of the match position. Note we are given
4380 : * character not byte indexes.
4381 : */
4382 7370 : if (pmatch[0].rm_so - data_pos > 0)
4383 : {
4384 : int chunk_len;
4385 :
4386 7196 : chunk_len = charlen_to_bytelen(start_ptr,
4387 7196 : pmatch[0].rm_so - data_pos);
4388 7196 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4389 :
4390 : /*
4391 : * Advance start_ptr over that text, to avoid multiple rescans of
4392 : * it if the replace_text contains multiple back-references.
4393 : */
4394 7196 : start_ptr += chunk_len;
4395 7196 : data_pos = pmatch[0].rm_so;
4396 : }
4397 :
4398 : /*
4399 : * Copy the replace_text, processing escapes if any are present.
4400 : */
4401 7370 : if (escape_status > 0)
4402 236 : appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4403 : start_ptr, data_pos);
4404 : else
4405 7134 : appendStringInfoText(&buf, replace_text);
4406 :
4407 : /* Advance start_ptr and data_pos over the matched text. */
4408 14740 : start_ptr += charlen_to_bytelen(start_ptr,
4409 7370 : pmatch[0].rm_eo - data_pos);
4410 7370 : data_pos = pmatch[0].rm_eo;
4411 :
4412 : /*
4413 : * If we only want to replace one occurrence, we're done.
4414 : */
4415 7370 : if (n > 0)
4416 1704 : break;
4417 :
4418 : /*
4419 : * Advance search position. Normally we start the next search at the
4420 : * end of the previous match; but if the match was of zero length, we
4421 : * have to advance by one character, or we'd just find the same match
4422 : * again.
4423 : */
4424 5666 : search_start = data_pos;
4425 5666 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4426 12 : search_start++;
4427 : }
4428 :
4429 : /*
4430 : * Copy the text to the right of the last match.
4431 : */
4432 12950 : if (data_pos < data_len)
4433 : {
4434 : int chunk_len;
4435 :
4436 12378 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4437 12378 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4438 : }
4439 :
4440 12950 : ret_text = cstring_to_text_with_len(buf.data, buf.len);
4441 12950 : pfree(buf.data);
4442 12950 : pfree(data);
4443 :
4444 12950 : return ret_text;
4445 : }
4446 :
4447 : /*
4448 : * split_part
4449 : * parse input string based on provided field separator
4450 : * return N'th item (1 based, negative counts from end)
4451 : */
4452 : Datum
4453 150 : split_part(PG_FUNCTION_ARGS)
4454 : {
4455 150 : text *inputstring = PG_GETARG_TEXT_PP(0);
4456 150 : text *fldsep = PG_GETARG_TEXT_PP(1);
4457 150 : int fldnum = PG_GETARG_INT32(2);
4458 : int inputstring_len;
4459 : int fldsep_len;
4460 : TextPositionState state;
4461 : char *start_ptr;
4462 : char *end_ptr;
4463 : text *result_text;
4464 : bool found;
4465 :
4466 : /* field number is 1 based */
4467 150 : if (fldnum == 0)
4468 6 : ereport(ERROR,
4469 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4470 : errmsg("field position must not be zero")));
4471 :
4472 144 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4473 144 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4474 :
4475 : /* return empty string for empty input string */
4476 144 : if (inputstring_len < 1)
4477 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4478 :
4479 : /* handle empty field separator */
4480 132 : if (fldsep_len < 1)
4481 : {
4482 : /* if first or last field, return input string, else empty string */
4483 24 : if (fldnum == 1 || fldnum == -1)
4484 12 : PG_RETURN_TEXT_P(inputstring);
4485 : else
4486 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4487 : }
4488 :
4489 : /* find the first field separator */
4490 108 : text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4491 :
4492 108 : found = text_position_next(&state);
4493 :
4494 : /* special case if fldsep not found at all */
4495 108 : if (!found)
4496 : {
4497 24 : text_position_cleanup(&state);
4498 : /* if first or last field, return input string, else empty string */
4499 24 : if (fldnum == 1 || fldnum == -1)
4500 12 : PG_RETURN_TEXT_P(inputstring);
4501 : else
4502 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4503 : }
4504 :
4505 : /*
4506 : * take care of a negative field number (i.e. count from the right) by
4507 : * converting to a positive field number; we need total number of fields
4508 : */
4509 84 : if (fldnum < 0)
4510 : {
4511 : /* we found a fldsep, so there are at least two fields */
4512 42 : int numfields = 2;
4513 :
4514 54 : while (text_position_next(&state))
4515 12 : numfields++;
4516 :
4517 : /* special case of last field does not require an extra pass */
4518 42 : if (fldnum == -1)
4519 : {
4520 24 : start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
4521 24 : end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4522 24 : text_position_cleanup(&state);
4523 24 : PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4524 : end_ptr - start_ptr));
4525 : }
4526 :
4527 : /* else, convert fldnum to positive notation */
4528 18 : fldnum += numfields + 1;
4529 :
4530 : /* if nonexistent field, return empty string */
4531 18 : if (fldnum <= 0)
4532 : {
4533 6 : text_position_cleanup(&state);
4534 6 : PG_RETURN_TEXT_P(cstring_to_text(""));
4535 : }
4536 :
4537 : /* reset to pointing at first match, but now with positive fldnum */
4538 12 : text_position_reset(&state);
4539 12 : found = text_position_next(&state);
4540 : Assert(found);
4541 : }
4542 :
4543 : /* identify bounds of first field */
4544 54 : start_ptr = VARDATA_ANY(inputstring);
4545 54 : end_ptr = text_position_get_match_ptr(&state);
4546 :
4547 102 : while (found && --fldnum > 0)
4548 : {
4549 : /* identify bounds of next field */
4550 48 : start_ptr = end_ptr + state.last_match_len;
4551 48 : found = text_position_next(&state);
4552 48 : if (found)
4553 18 : end_ptr = text_position_get_match_ptr(&state);
4554 : }
4555 :
4556 54 : text_position_cleanup(&state);
4557 :
4558 54 : if (fldnum > 0)
4559 : {
4560 : /* N'th field separator not found */
4561 : /* if last field requested, return it, else empty string */
4562 30 : if (fldnum == 1)
4563 : {
4564 24 : int last_len = start_ptr - VARDATA_ANY(inputstring);
4565 :
4566 24 : result_text = cstring_to_text_with_len(start_ptr,
4567 : inputstring_len - last_len);
4568 : }
4569 : else
4570 6 : result_text = cstring_to_text("");
4571 : }
4572 : else
4573 : {
4574 : /* non-last field requested */
4575 24 : result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4576 : }
4577 :
4578 54 : PG_RETURN_TEXT_P(result_text);
4579 : }
4580 :
4581 : /*
4582 : * Convenience function to return true when two text params are equal.
4583 : */
4584 : static bool
4585 384 : text_isequal(text *txt1, text *txt2, Oid collid)
4586 : {
4587 384 : return DatumGetBool(DirectFunctionCall2Coll(texteq,
4588 : collid,
4589 : PointerGetDatum(txt1),
4590 : PointerGetDatum(txt2)));
4591 : }
4592 :
4593 : /*
4594 : * text_to_array
4595 : * parse input string and return text array of elements,
4596 : * based on provided field separator
4597 : */
4598 : Datum
4599 170 : text_to_array(PG_FUNCTION_ARGS)
4600 : {
4601 : SplitTextOutputData tstate;
4602 :
4603 : /* For array output, tstate should start as all zeroes */
4604 170 : memset(&tstate, 0, sizeof(tstate));
4605 :
4606 170 : if (!split_text(fcinfo, &tstate))
4607 6 : PG_RETURN_NULL();
4608 :
4609 164 : if (tstate.astate == NULL)
4610 6 : PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4611 :
4612 158 : PG_RETURN_DATUM(makeArrayResult(tstate.astate,
4613 : CurrentMemoryContext));
4614 : }
4615 :
4616 : /*
4617 : * text_to_array_null
4618 : * parse input string and return text array of elements,
4619 : * based on provided field separator and null string
4620 : *
4621 : * This is a separate entry point only to prevent the regression tests from
4622 : * complaining about different argument sets for the same internal function.
4623 : */
4624 : Datum
4625 60 : text_to_array_null(PG_FUNCTION_ARGS)
4626 : {
4627 60 : return text_to_array(fcinfo);
4628 : }
4629 :
4630 : /*
4631 : * text_to_table
4632 : * parse input string and return table of elements,
4633 : * based on provided field separator
4634 : */
4635 : Datum
4636 84 : text_to_table(PG_FUNCTION_ARGS)
4637 : {
4638 84 : ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4639 : SplitTextOutputData tstate;
4640 :
4641 84 : tstate.astate = NULL;
4642 84 : InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
4643 84 : tstate.tupstore = rsi->setResult;
4644 84 : tstate.tupdesc = rsi->setDesc;
4645 :
4646 84 : (void) split_text(fcinfo, &tstate);
4647 :
4648 84 : return (Datum) 0;
4649 : }
4650 :
4651 : /*
4652 : * text_to_table_null
4653 : * parse input string and return table of elements,
4654 : * based on provided field separator and null string
4655 : *
4656 : * This is a separate entry point only to prevent the regression tests from
4657 : * complaining about different argument sets for the same internal function.
4658 : */
4659 : Datum
4660 24 : text_to_table_null(PG_FUNCTION_ARGS)
4661 : {
4662 24 : return text_to_table(fcinfo);
4663 : }
4664 :
4665 : /*
4666 : * Common code for text_to_array, text_to_array_null, text_to_table
4667 : * and text_to_table_null functions.
4668 : *
4669 : * These are not strict so we have to test for null inputs explicitly.
4670 : * Returns false if result is to be null, else returns true.
4671 : *
4672 : * Note that if the result is valid but empty (zero elements), we return
4673 : * without changing *tstate --- caller must handle that case, too.
4674 : */
4675 : static bool
4676 254 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4677 : {
4678 : text *inputstring;
4679 : text *fldsep;
4680 : text *null_string;
4681 254 : Oid collation = PG_GET_COLLATION();
4682 : int inputstring_len;
4683 : int fldsep_len;
4684 : char *start_ptr;
4685 : text *result_text;
4686 :
4687 : /* when input string is NULL, then result is NULL too */
4688 254 : if (PG_ARGISNULL(0))
4689 12 : return false;
4690 :
4691 242 : inputstring = PG_GETARG_TEXT_PP(0);
4692 :
4693 : /* fldsep can be NULL */
4694 242 : if (!PG_ARGISNULL(1))
4695 212 : fldsep = PG_GETARG_TEXT_PP(1);
4696 : else
4697 30 : fldsep = NULL;
4698 :
4699 : /* null_string can be NULL or omitted */
4700 242 : if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4701 84 : null_string = PG_GETARG_TEXT_PP(2);
4702 : else
4703 158 : null_string = NULL;
4704 :
4705 242 : if (fldsep != NULL)
4706 : {
4707 : /*
4708 : * Normal case with non-null fldsep. Use the text_position machinery
4709 : * to search for occurrences of fldsep.
4710 : */
4711 : TextPositionState state;
4712 :
4713 212 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4714 212 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4715 :
4716 : /* return empty set for empty input string */
4717 212 : if (inputstring_len < 1)
4718 60 : return true;
4719 :
4720 : /* empty field separator: return input string as a one-element set */
4721 200 : if (fldsep_len < 1)
4722 : {
4723 48 : split_text_accum_result(tstate, inputstring,
4724 : null_string, collation);
4725 48 : return true;
4726 : }
4727 :
4728 152 : text_position_setup(inputstring, fldsep, collation, &state);
4729 :
4730 152 : start_ptr = VARDATA_ANY(inputstring);
4731 :
4732 : for (;;)
4733 512 : {
4734 : bool found;
4735 : char *end_ptr;
4736 : int chunk_len;
4737 :
4738 664 : CHECK_FOR_INTERRUPTS();
4739 :
4740 664 : found = text_position_next(&state);
4741 664 : if (!found)
4742 : {
4743 : /* fetch last field */
4744 152 : chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4745 152 : end_ptr = NULL; /* not used, but some compilers complain */
4746 : }
4747 : else
4748 : {
4749 : /* fetch non-last field */
4750 512 : end_ptr = text_position_get_match_ptr(&state);
4751 512 : chunk_len = end_ptr - start_ptr;
4752 : }
4753 :
4754 : /* build a temp text datum to pass to split_text_accum_result */
4755 664 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4756 :
4757 : /* stash away this field */
4758 664 : split_text_accum_result(tstate, result_text,
4759 : null_string, collation);
4760 :
4761 664 : pfree(result_text);
4762 :
4763 664 : if (!found)
4764 152 : break;
4765 :
4766 512 : start_ptr = end_ptr + state.last_match_len;
4767 : }
4768 :
4769 152 : text_position_cleanup(&state);
4770 : }
4771 : else
4772 : {
4773 : /*
4774 : * When fldsep is NULL, each character in the input string becomes a
4775 : * separate element in the result set. The separator is effectively
4776 : * the space between characters.
4777 : */
4778 30 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4779 :
4780 30 : start_ptr = VARDATA_ANY(inputstring);
4781 :
4782 252 : while (inputstring_len > 0)
4783 : {
4784 222 : int chunk_len = pg_mblen(start_ptr);
4785 :
4786 222 : CHECK_FOR_INTERRUPTS();
4787 :
4788 : /* build a temp text datum to pass to split_text_accum_result */
4789 222 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4790 :
4791 : /* stash away this field */
4792 222 : split_text_accum_result(tstate, result_text,
4793 : null_string, collation);
4794 :
4795 222 : pfree(result_text);
4796 :
4797 222 : start_ptr += chunk_len;
4798 222 : inputstring_len -= chunk_len;
4799 : }
4800 : }
4801 :
4802 182 : return true;
4803 : }
4804 :
4805 : /*
4806 : * Add text item to result set (table or array).
4807 : *
4808 : * This is also responsible for checking to see if the item matches
4809 : * the null_string, in which case we should emit NULL instead.
4810 : */
4811 : static void
4812 934 : split_text_accum_result(SplitTextOutputData *tstate,
4813 : text *field_value,
4814 : text *null_string,
4815 : Oid collation)
4816 : {
4817 934 : bool is_null = false;
4818 :
4819 934 : if (null_string && text_isequal(field_value, null_string, collation))
4820 72 : is_null = true;
4821 :
4822 934 : if (tstate->tupstore)
4823 : {
4824 : Datum values[1];
4825 : bool nulls[1];
4826 :
4827 228 : values[0] = PointerGetDatum(field_value);
4828 228 : nulls[0] = is_null;
4829 :
4830 228 : tuplestore_putvalues(tstate->tupstore,
4831 : tstate->tupdesc,
4832 : values,
4833 : nulls);
4834 : }
4835 : else
4836 : {
4837 706 : tstate->astate = accumArrayResult(tstate->astate,
4838 : PointerGetDatum(field_value),
4839 : is_null,
4840 : TEXTOID,
4841 : CurrentMemoryContext);
4842 : }
4843 934 : }
4844 :
4845 : /*
4846 : * array_to_text
4847 : * concatenate Cstring representation of input array elements
4848 : * using provided field separator
4849 : */
4850 : Datum
4851 74644 : array_to_text(PG_FUNCTION_ARGS)
4852 : {
4853 74644 : ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
4854 74644 : char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4855 :
4856 74644 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4857 : }
4858 :
4859 : /*
4860 : * array_to_text_null
4861 : * concatenate Cstring representation of input array elements
4862 : * using provided field separator and null string
4863 : *
4864 : * This version is not strict so we have to test for null inputs explicitly.
4865 : */
4866 : Datum
4867 12 : array_to_text_null(PG_FUNCTION_ARGS)
4868 : {
4869 : ArrayType *v;
4870 : char *fldsep;
4871 : char *null_string;
4872 :
4873 : /* returns NULL when first or second parameter is NULL */
4874 12 : if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4875 0 : PG_RETURN_NULL();
4876 :
4877 12 : v = PG_GETARG_ARRAYTYPE_P(0);
4878 12 : fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4879 :
4880 : /* NULL null string is passed through as a null pointer */
4881 12 : if (!PG_ARGISNULL(2))
4882 6 : null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4883 : else
4884 6 : null_string = NULL;
4885 :
4886 12 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4887 : }
4888 :
4889 : /*
4890 : * common code for array_to_text and array_to_text_null functions
4891 : */
4892 : static text *
4893 74674 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4894 : const char *fldsep, const char *null_string)
4895 : {
4896 : text *result;
4897 : int nitems,
4898 : *dims,
4899 : ndims;
4900 : Oid element_type;
4901 : int typlen;
4902 : bool typbyval;
4903 : char typalign;
4904 : StringInfoData buf;
4905 74674 : bool printed = false;
4906 : char *p;
4907 : bits8 *bitmap;
4908 : int bitmask;
4909 : int i;
4910 : ArrayMetaState *my_extra;
4911 :
4912 74674 : ndims = ARR_NDIM(v);
4913 74674 : dims = ARR_DIMS(v);
4914 74674 : nitems = ArrayGetNItems(ndims, dims);
4915 :
4916 : /* if there are no elements, return an empty string */
4917 74674 : if (nitems == 0)
4918 49828 : return cstring_to_text_with_len("", 0);
4919 :
4920 24846 : element_type = ARR_ELEMTYPE(v);
4921 24846 : initStringInfo(&buf);
4922 :
4923 : /*
4924 : * We arrange to look up info about element type, including its output
4925 : * conversion proc, only once per series of calls, assuming the element
4926 : * type doesn't change underneath us.
4927 : */
4928 24846 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4929 24846 : if (my_extra == NULL)
4930 : {
4931 1438 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4932 : sizeof(ArrayMetaState));
4933 1438 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4934 1438 : my_extra->element_type = ~element_type;
4935 : }
4936 :
4937 24846 : if (my_extra->element_type != element_type)
4938 : {
4939 : /*
4940 : * Get info about element type, including its output conversion proc
4941 : */
4942 1438 : get_type_io_data(element_type, IOFunc_output,
4943 : &my_extra->typlen, &my_extra->typbyval,
4944 : &my_extra->typalign, &my_extra->typdelim,
4945 : &my_extra->typioparam, &my_extra->typiofunc);
4946 1438 : fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4947 1438 : fcinfo->flinfo->fn_mcxt);
4948 1438 : my_extra->element_type = element_type;
4949 : }
4950 24846 : typlen = my_extra->typlen;
4951 24846 : typbyval = my_extra->typbyval;
4952 24846 : typalign = my_extra->typalign;
4953 :
4954 24846 : p = ARR_DATA_PTR(v);
4955 24846 : bitmap = ARR_NULLBITMAP(v);
4956 24846 : bitmask = 1;
4957 :
4958 84722 : for (i = 0; i < nitems; i++)
4959 : {
4960 : Datum itemvalue;
4961 : char *value;
4962 :
4963 : /* Get source element, checking for NULL */
4964 59876 : if (bitmap && (*bitmap & bitmask) == 0)
4965 : {
4966 : /* if null_string is NULL, we just ignore null elements */
4967 18 : if (null_string != NULL)
4968 : {
4969 6 : if (printed)
4970 6 : appendStringInfo(&buf, "%s%s", fldsep, null_string);
4971 : else
4972 0 : appendStringInfoString(&buf, null_string);
4973 6 : printed = true;
4974 : }
4975 : }
4976 : else
4977 : {
4978 59858 : itemvalue = fetch_att(p, typbyval, typlen);
4979 :
4980 59858 : value = OutputFunctionCall(&my_extra->proc, itemvalue);
4981 :
4982 59858 : if (printed)
4983 35012 : appendStringInfo(&buf, "%s%s", fldsep, value);
4984 : else
4985 24846 : appendStringInfoString(&buf, value);
4986 59858 : printed = true;
4987 :
4988 59858 : p = att_addlength_pointer(p, typlen, p);
4989 59858 : p = (char *) att_align_nominal(p, typalign);
4990 : }
4991 :
4992 : /* advance bitmap pointer if any */
4993 59876 : if (bitmap)
4994 : {
4995 108 : bitmask <<= 1;
4996 108 : if (bitmask == 0x100)
4997 : {
4998 0 : bitmap++;
4999 0 : bitmask = 1;
5000 : }
5001 : }
5002 : }
5003 :
5004 24846 : result = cstring_to_text_with_len(buf.data, buf.len);
5005 24846 : pfree(buf.data);
5006 :
5007 24846 : return result;
5008 : }
5009 :
5010 : /*
5011 : * Workhorse for to_bin, to_oct, and to_hex. Note that base must be > 1 and <=
5012 : * 16.
5013 : */
5014 : static inline text *
5015 38750 : convert_to_base(uint64 value, int base)
5016 : {
5017 38750 : const char *digits = "0123456789abcdef";
5018 :
5019 : /* We size the buffer for to_bin's longest possible return value. */
5020 : char buf[sizeof(uint64) * BITS_PER_BYTE];
5021 38750 : char *const end = buf + sizeof(buf);
5022 38750 : char *ptr = end;
5023 :
5024 : Assert(base > 1);
5025 : Assert(base <= 16);
5026 :
5027 : do
5028 : {
5029 75974 : *--ptr = digits[value % base];
5030 75974 : value /= base;
5031 75974 : } while (ptr > buf && value);
5032 :
5033 38750 : return cstring_to_text_with_len(ptr, end - ptr);
5034 : }
5035 :
5036 : /*
5037 : * Convert an integer to a string containing a base-2 (binary) representation
5038 : * of the number.
5039 : */
5040 : Datum
5041 12 : to_bin32(PG_FUNCTION_ARGS)
5042 : {
5043 12 : uint64 value = (uint32) PG_GETARG_INT32(0);
5044 :
5045 12 : PG_RETURN_TEXT_P(convert_to_base(value, 2));
5046 : }
5047 : Datum
5048 12 : to_bin64(PG_FUNCTION_ARGS)
5049 : {
5050 12 : uint64 value = (uint64) PG_GETARG_INT64(0);
5051 :
5052 12 : PG_RETURN_TEXT_P(convert_to_base(value, 2));
5053 : }
5054 :
5055 : /*
5056 : * Convert an integer to a string containing a base-8 (oct) representation of
5057 : * the number.
5058 : */
5059 : Datum
5060 12 : to_oct32(PG_FUNCTION_ARGS)
5061 : {
5062 12 : uint64 value = (uint32) PG_GETARG_INT32(0);
5063 :
5064 12 : PG_RETURN_TEXT_P(convert_to_base(value, 8));
5065 : }
5066 : Datum
5067 12 : to_oct64(PG_FUNCTION_ARGS)
5068 : {
5069 12 : uint64 value = (uint64) PG_GETARG_INT64(0);
5070 :
5071 12 : PG_RETURN_TEXT_P(convert_to_base(value, 8));
5072 : }
5073 :
5074 : /*
5075 : * Convert an integer to a string containing a base-16 (hex) representation of
5076 : * the number.
5077 : */
5078 : Datum
5079 38690 : to_hex32(PG_FUNCTION_ARGS)
5080 : {
5081 38690 : uint64 value = (uint32) PG_GETARG_INT32(0);
5082 :
5083 38690 : PG_RETURN_TEXT_P(convert_to_base(value, 16));
5084 : }
5085 : Datum
5086 12 : to_hex64(PG_FUNCTION_ARGS)
5087 : {
5088 12 : uint64 value = (uint64) PG_GETARG_INT64(0);
5089 :
5090 12 : PG_RETURN_TEXT_P(convert_to_base(value, 16));
5091 : }
5092 :
5093 : /*
5094 : * Return the size of a datum, possibly compressed
5095 : *
5096 : * Works on any data type
5097 : */
5098 : Datum
5099 122 : pg_column_size(PG_FUNCTION_ARGS)
5100 : {
5101 122 : Datum value = PG_GETARG_DATUM(0);
5102 : int32 result;
5103 : int typlen;
5104 :
5105 : /* On first call, get the input type's typlen, and save at *fn_extra */
5106 122 : if (fcinfo->flinfo->fn_extra == NULL)
5107 : {
5108 : /* Lookup the datatype of the supplied argument */
5109 122 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5110 :
5111 122 : typlen = get_typlen(argtypeid);
5112 122 : if (typlen == 0) /* should not happen */
5113 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5114 :
5115 122 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5116 : sizeof(int));
5117 122 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5118 : }
5119 : else
5120 0 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5121 :
5122 122 : if (typlen == -1)
5123 : {
5124 : /* varlena type, possibly toasted */
5125 122 : result = toast_datum_size(value);
5126 : }
5127 0 : else if (typlen == -2)
5128 : {
5129 : /* cstring */
5130 0 : result = strlen(DatumGetCString(value)) + 1;
5131 : }
5132 : else
5133 : {
5134 : /* ordinary fixed-width type */
5135 0 : result = typlen;
5136 : }
5137 :
5138 122 : PG_RETURN_INT32(result);
5139 : }
5140 :
5141 : /*
5142 : * Return the compression method stored in the compressed attribute. Return
5143 : * NULL for non varlena type or uncompressed data.
5144 : */
5145 : Datum
5146 162 : pg_column_compression(PG_FUNCTION_ARGS)
5147 : {
5148 : int typlen;
5149 : char *result;
5150 : ToastCompressionId cmid;
5151 :
5152 : /* On first call, get the input type's typlen, and save at *fn_extra */
5153 162 : if (fcinfo->flinfo->fn_extra == NULL)
5154 : {
5155 : /* Lookup the datatype of the supplied argument */
5156 108 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5157 :
5158 108 : typlen = get_typlen(argtypeid);
5159 108 : if (typlen == 0) /* should not happen */
5160 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5161 :
5162 108 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5163 : sizeof(int));
5164 108 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5165 : }
5166 : else
5167 54 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5168 :
5169 162 : if (typlen != -1)
5170 0 : PG_RETURN_NULL();
5171 :
5172 : /* get the compression method id stored in the compressed varlena */
5173 162 : cmid = toast_get_compression_id((struct varlena *)
5174 162 : DatumGetPointer(PG_GETARG_DATUM(0)));
5175 162 : if (cmid == TOAST_INVALID_COMPRESSION_ID)
5176 6 : PG_RETURN_NULL();
5177 :
5178 : /* convert compression method id to compression method name */
5179 156 : switch (cmid)
5180 : {
5181 66 : case TOAST_PGLZ_COMPRESSION_ID:
5182 66 : result = "pglz";
5183 66 : break;
5184 90 : case TOAST_LZ4_COMPRESSION_ID:
5185 90 : result = "lz4";
5186 90 : break;
5187 0 : default:
5188 0 : elog(ERROR, "invalid compression method id %d", cmid);
5189 : }
5190 :
5191 156 : PG_RETURN_TEXT_P(cstring_to_text(result));
5192 : }
5193 :
5194 : /*
5195 : * Return the chunk_id of the on-disk TOASTed value. Return NULL if the value
5196 : * is un-TOASTed or not on-disk.
5197 : */
5198 : Datum
5199 12 : pg_column_toast_chunk_id(PG_FUNCTION_ARGS)
5200 : {
5201 : int typlen;
5202 : struct varlena *attr;
5203 : struct varatt_external toast_pointer;
5204 :
5205 : /* On first call, get the input type's typlen, and save at *fn_extra */
5206 12 : if (fcinfo->flinfo->fn_extra == NULL)
5207 : {
5208 : /* Lookup the datatype of the supplied argument */
5209 12 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5210 :
5211 12 : typlen = get_typlen(argtypeid);
5212 12 : if (typlen == 0) /* should not happen */
5213 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5214 :
5215 12 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5216 : sizeof(int));
5217 12 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5218 : }
5219 : else
5220 0 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5221 :
5222 12 : if (typlen != -1)
5223 0 : PG_RETURN_NULL();
5224 :
5225 12 : attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0));
5226 :
5227 12 : if (!VARATT_IS_EXTERNAL_ONDISK(attr))
5228 6 : PG_RETURN_NULL();
5229 :
5230 6 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
5231 :
5232 6 : PG_RETURN_OID(toast_pointer.va_valueid);
5233 : }
5234 :
5235 : /*
5236 : * string_agg - Concatenates values and returns string.
5237 : *
5238 : * Syntax: string_agg(value text, delimiter text) RETURNS text
5239 : *
5240 : * Note: Any NULL values are ignored. The first-call delimiter isn't
5241 : * actually used at all, and on subsequent calls the delimiter precedes
5242 : * the associated value.
5243 : */
5244 :
5245 : /* subroutine to initialize state */
5246 : static StringInfo
5247 2334 : makeStringAggState(FunctionCallInfo fcinfo)
5248 : {
5249 : StringInfo state;
5250 : MemoryContext aggcontext;
5251 : MemoryContext oldcontext;
5252 :
5253 2334 : if (!AggCheckCallContext(fcinfo, &aggcontext))
5254 : {
5255 : /* cannot be called directly because of internal-type argument */
5256 0 : elog(ERROR, "string_agg_transfn called in non-aggregate context");
5257 : }
5258 :
5259 : /*
5260 : * Create state in aggregate context. It'll stay there across subsequent
5261 : * calls.
5262 : */
5263 2334 : oldcontext = MemoryContextSwitchTo(aggcontext);
5264 2334 : state = makeStringInfo();
5265 2334 : MemoryContextSwitchTo(oldcontext);
5266 :
5267 2334 : return state;
5268 : }
5269 :
5270 : Datum
5271 862292 : string_agg_transfn(PG_FUNCTION_ARGS)
5272 : {
5273 : StringInfo state;
5274 :
5275 862292 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5276 :
5277 : /* Append the value unless null, preceding it with the delimiter. */
5278 862292 : if (!PG_ARGISNULL(1))
5279 : {
5280 847244 : text *value = PG_GETARG_TEXT_PP(1);
5281 847244 : bool isfirst = false;
5282 :
5283 : /*
5284 : * You might think we can just throw away the first delimiter, however
5285 : * we must keep it as we may be a parallel worker doing partial
5286 : * aggregation building a state to send to the main process. We need
5287 : * to keep the delimiter of every aggregation so that the combine
5288 : * function can properly join up the strings of two separately
5289 : * partially aggregated results. The first delimiter is only stripped
5290 : * off in the final function. To know how much to strip off the front
5291 : * of the string, we store the length of the first delimiter in the
5292 : * StringInfo's cursor field, which we don't otherwise need here.
5293 : */
5294 847244 : if (state == NULL)
5295 : {
5296 1946 : state = makeStringAggState(fcinfo);
5297 1946 : isfirst = true;
5298 : }
5299 :
5300 847244 : if (!PG_ARGISNULL(2))
5301 : {
5302 847244 : text *delim = PG_GETARG_TEXT_PP(2);
5303 :
5304 847244 : appendStringInfoText(state, delim);
5305 847244 : if (isfirst)
5306 1946 : state->cursor = VARSIZE_ANY_EXHDR(delim);
5307 : }
5308 :
5309 847244 : appendStringInfoText(state, value);
5310 : }
5311 :
5312 : /*
5313 : * The transition type for string_agg() is declared to be "internal",
5314 : * which is a pass-by-value type the same size as a pointer.
5315 : */
5316 862292 : if (state)
5317 862214 : PG_RETURN_POINTER(state);
5318 78 : PG_RETURN_NULL();
5319 : }
5320 :
5321 : /*
5322 : * string_agg_combine
5323 : * Aggregate combine function for string_agg(text) and string_agg(bytea)
5324 : */
5325 : Datum
5326 120 : string_agg_combine(PG_FUNCTION_ARGS)
5327 : {
5328 : StringInfo state1;
5329 : StringInfo state2;
5330 : MemoryContext agg_context;
5331 :
5332 120 : if (!AggCheckCallContext(fcinfo, &agg_context))
5333 0 : elog(ERROR, "aggregate function called in non-aggregate context");
5334 :
5335 120 : state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5336 120 : state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
5337 :
5338 120 : if (state2 == NULL)
5339 : {
5340 : /*
5341 : * NULL state2 is easy, just return state1, which we know is already
5342 : * in the agg_context
5343 : */
5344 0 : if (state1 == NULL)
5345 0 : PG_RETURN_NULL();
5346 0 : PG_RETURN_POINTER(state1);
5347 : }
5348 :
5349 120 : if (state1 == NULL)
5350 : {
5351 : /* We must copy state2's data into the agg_context */
5352 : MemoryContext old_context;
5353 :
5354 120 : old_context = MemoryContextSwitchTo(agg_context);
5355 120 : state1 = makeStringAggState(fcinfo);
5356 120 : appendBinaryStringInfo(state1, state2->data, state2->len);
5357 120 : state1->cursor = state2->cursor;
5358 120 : MemoryContextSwitchTo(old_context);
5359 : }
5360 0 : else if (state2->len > 0)
5361 : {
5362 : /* Combine ... state1->cursor does not change in this case */
5363 0 : appendBinaryStringInfo(state1, state2->data, state2->len);
5364 : }
5365 :
5366 120 : PG_RETURN_POINTER(state1);
5367 : }
5368 :
5369 : /*
5370 : * string_agg_serialize
5371 : * Aggregate serialize function for string_agg(text) and string_agg(bytea)
5372 : *
5373 : * This is strict, so we need not handle NULL input
5374 : */
5375 : Datum
5376 120 : string_agg_serialize(PG_FUNCTION_ARGS)
5377 : {
5378 : StringInfo state;
5379 : StringInfoData buf;
5380 : bytea *result;
5381 :
5382 : /* cannot be called directly because of internal-type argument */
5383 : Assert(AggCheckCallContext(fcinfo, NULL));
5384 :
5385 120 : state = (StringInfo) PG_GETARG_POINTER(0);
5386 :
5387 120 : pq_begintypsend(&buf);
5388 :
5389 : /* cursor */
5390 120 : pq_sendint(&buf, state->cursor, 4);
5391 :
5392 : /* data */
5393 120 : pq_sendbytes(&buf, state->data, state->len);
5394 :
5395 120 : result = pq_endtypsend(&buf);
5396 :
5397 120 : PG_RETURN_BYTEA_P(result);
5398 : }
5399 :
5400 : /*
5401 : * string_agg_deserialize
5402 : * Aggregate deserial function for string_agg(text) and string_agg(bytea)
5403 : *
5404 : * This is strict, so we need not handle NULL input
5405 : */
5406 : Datum
5407 120 : string_agg_deserialize(PG_FUNCTION_ARGS)
5408 : {
5409 : bytea *sstate;
5410 : StringInfo result;
5411 : StringInfoData buf;
5412 : char *data;
5413 : int datalen;
5414 :
5415 : /* cannot be called directly because of internal-type argument */
5416 : Assert(AggCheckCallContext(fcinfo, NULL));
5417 :
5418 120 : sstate = PG_GETARG_BYTEA_PP(0);
5419 :
5420 : /*
5421 : * Initialize a StringInfo so that we can "receive" it using the standard
5422 : * recv-function infrastructure.
5423 : */
5424 120 : initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
5425 120 : VARSIZE_ANY_EXHDR(sstate));
5426 :
5427 120 : result = makeStringAggState(fcinfo);
5428 :
5429 : /* cursor */
5430 120 : result->cursor = pq_getmsgint(&buf, 4);
5431 :
5432 : /* data */
5433 120 : datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
5434 120 : data = (char *) pq_getmsgbytes(&buf, datalen);
5435 120 : appendBinaryStringInfo(result, data, datalen);
5436 :
5437 120 : pq_getmsgend(&buf);
5438 :
5439 120 : PG_RETURN_POINTER(result);
5440 : }
5441 :
5442 : Datum
5443 2018 : string_agg_finalfn(PG_FUNCTION_ARGS)
5444 : {
5445 : StringInfo state;
5446 :
5447 : /* cannot be called directly because of internal-type argument */
5448 : Assert(AggCheckCallContext(fcinfo, NULL));
5449 :
5450 2018 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5451 :
5452 2018 : if (state != NULL)
5453 : {
5454 : /* As per comment in transfn, strip data before the cursor position */
5455 1946 : PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
5456 : state->len - state->cursor));
5457 : }
5458 : else
5459 72 : PG_RETURN_NULL();
5460 : }
5461 :
5462 : /*
5463 : * Prepare cache with fmgr info for the output functions of the datatypes of
5464 : * the arguments of a concat-like function, beginning with argument "argidx".
5465 : * (Arguments before that will have corresponding slots in the resulting
5466 : * FmgrInfo array, but we don't fill those slots.)
5467 : */
5468 : static FmgrInfo *
5469 106 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5470 : {
5471 : FmgrInfo *foutcache;
5472 : int i;
5473 :
5474 : /* We keep the info in fn_mcxt so it survives across calls */
5475 106 : foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5476 106 : PG_NARGS() * sizeof(FmgrInfo));
5477 :
5478 400 : for (i = argidx; i < PG_NARGS(); i++)
5479 : {
5480 : Oid valtype;
5481 : Oid typOutput;
5482 : bool typIsVarlena;
5483 :
5484 294 : valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5485 294 : if (!OidIsValid(valtype))
5486 0 : elog(ERROR, "could not determine data type of concat() input");
5487 :
5488 294 : getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5489 294 : fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5490 : }
5491 :
5492 106 : fcinfo->flinfo->fn_extra = foutcache;
5493 :
5494 106 : return foutcache;
5495 : }
5496 :
5497 : /*
5498 : * Implementation of both concat() and concat_ws().
5499 : *
5500 : * sepstr is the separator string to place between values.
5501 : * argidx identifies the first argument to concatenate (counting from zero);
5502 : * note that this must be constant across any one series of calls.
5503 : *
5504 : * Returns NULL if result should be NULL, else text value.
5505 : */
5506 : static text *
5507 264 : concat_internal(const char *sepstr, int argidx,
5508 : FunctionCallInfo fcinfo)
5509 : {
5510 : text *result;
5511 : StringInfoData str;
5512 : FmgrInfo *foutcache;
5513 264 : bool first_arg = true;
5514 : int i;
5515 :
5516 : /*
5517 : * concat(VARIADIC some-array) is essentially equivalent to
5518 : * array_to_text(), ie concat the array elements with the given separator.
5519 : * So we just pass the case off to that code.
5520 : */
5521 264 : if (get_fn_expr_variadic(fcinfo->flinfo))
5522 : {
5523 : ArrayType *arr;
5524 :
5525 : /* Should have just the one argument */
5526 : Assert(argidx == PG_NARGS() - 1);
5527 :
5528 : /* concat(VARIADIC NULL) is defined as NULL */
5529 30 : if (PG_ARGISNULL(argidx))
5530 12 : return NULL;
5531 :
5532 : /*
5533 : * Non-null argument had better be an array. We assume that any call
5534 : * context that could let get_fn_expr_variadic return true will have
5535 : * checked that a VARIADIC-labeled parameter actually is an array. So
5536 : * it should be okay to just Assert that it's an array rather than
5537 : * doing a full-fledged error check.
5538 : */
5539 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5540 :
5541 : /* OK, safe to fetch the array value */
5542 18 : arr = PG_GETARG_ARRAYTYPE_P(argidx);
5543 :
5544 : /*
5545 : * And serialize the array. We tell array_to_text to ignore null
5546 : * elements, which matches the behavior of the loop below.
5547 : */
5548 18 : return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5549 : }
5550 :
5551 : /* Normal case without explicit VARIADIC marker */
5552 234 : initStringInfo(&str);
5553 :
5554 : /* Get output function info, building it if first time through */
5555 234 : foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5556 234 : if (foutcache == NULL)
5557 106 : foutcache = build_concat_foutcache(fcinfo, argidx);
5558 :
5559 822 : for (i = argidx; i < PG_NARGS(); i++)
5560 : {
5561 588 : if (!PG_ARGISNULL(i))
5562 : {
5563 510 : Datum value = PG_GETARG_DATUM(i);
5564 :
5565 : /* add separator if appropriate */
5566 510 : if (first_arg)
5567 228 : first_arg = false;
5568 : else
5569 282 : appendStringInfoString(&str, sepstr);
5570 :
5571 : /* call the appropriate type output function, append the result */
5572 510 : appendStringInfoString(&str,
5573 510 : OutputFunctionCall(&foutcache[i], value));
5574 : }
5575 : }
5576 :
5577 234 : result = cstring_to_text_with_len(str.data, str.len);
5578 234 : pfree(str.data);
5579 :
5580 234 : return result;
5581 : }
5582 :
5583 : /*
5584 : * Concatenate all arguments. NULL arguments are ignored.
5585 : */
5586 : Datum
5587 186 : text_concat(PG_FUNCTION_ARGS)
5588 : {
5589 : text *result;
5590 :
5591 186 : result = concat_internal("", 0, fcinfo);
5592 186 : if (result == NULL)
5593 6 : PG_RETURN_NULL();
5594 180 : PG_RETURN_TEXT_P(result);
5595 : }
5596 :
5597 : /*
5598 : * Concatenate all but first argument value with separators. The first
5599 : * parameter is used as the separator. NULL arguments are ignored.
5600 : */
5601 : Datum
5602 84 : text_concat_ws(PG_FUNCTION_ARGS)
5603 : {
5604 : char *sep;
5605 : text *result;
5606 :
5607 : /* return NULL when separator is NULL */
5608 84 : if (PG_ARGISNULL(0))
5609 6 : PG_RETURN_NULL();
5610 78 : sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5611 :
5612 78 : result = concat_internal(sep, 1, fcinfo);
5613 78 : if (result == NULL)
5614 6 : PG_RETURN_NULL();
5615 72 : PG_RETURN_TEXT_P(result);
5616 : }
5617 :
5618 : /*
5619 : * Return first n characters in the string. When n is negative,
5620 : * return all but last |n| characters.
5621 : */
5622 : Datum
5623 2136 : text_left(PG_FUNCTION_ARGS)
5624 : {
5625 2136 : int n = PG_GETARG_INT32(1);
5626 :
5627 2136 : if (n < 0)
5628 : {
5629 30 : text *str = PG_GETARG_TEXT_PP(0);
5630 30 : const char *p = VARDATA_ANY(str);
5631 30 : int len = VARSIZE_ANY_EXHDR(str);
5632 : int rlen;
5633 :
5634 30 : n = pg_mbstrlen_with_len(p, len) + n;
5635 30 : rlen = pg_mbcharcliplen(p, len, n);
5636 30 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5637 : }
5638 : else
5639 2106 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5640 : }
5641 :
5642 : /*
5643 : * Return last n characters in the string. When n is negative,
5644 : * return all but first |n| characters.
5645 : */
5646 : Datum
5647 66 : text_right(PG_FUNCTION_ARGS)
5648 : {
5649 66 : text *str = PG_GETARG_TEXT_PP(0);
5650 66 : const char *p = VARDATA_ANY(str);
5651 66 : int len = VARSIZE_ANY_EXHDR(str);
5652 66 : int n = PG_GETARG_INT32(1);
5653 : int off;
5654 :
5655 66 : if (n < 0)
5656 30 : n = -n;
5657 : else
5658 36 : n = pg_mbstrlen_with_len(p, len) - n;
5659 66 : off = pg_mbcharcliplen(p, len, n);
5660 :
5661 66 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5662 : }
5663 :
5664 : /*
5665 : * Return reversed string
5666 : */
5667 : Datum
5668 6 : text_reverse(PG_FUNCTION_ARGS)
5669 : {
5670 6 : text *str = PG_GETARG_TEXT_PP(0);
5671 6 : const char *p = VARDATA_ANY(str);
5672 6 : int len = VARSIZE_ANY_EXHDR(str);
5673 6 : const char *endp = p + len;
5674 : text *result;
5675 : char *dst;
5676 :
5677 6 : result = palloc(len + VARHDRSZ);
5678 6 : dst = (char *) VARDATA(result) + len;
5679 6 : SET_VARSIZE(result, len + VARHDRSZ);
5680 :
5681 6 : if (pg_database_encoding_max_length() > 1)
5682 : {
5683 : /* multibyte version */
5684 36 : while (p < endp)
5685 : {
5686 : int sz;
5687 :
5688 30 : sz = pg_mblen(p);
5689 30 : dst -= sz;
5690 30 : memcpy(dst, p, sz);
5691 30 : p += sz;
5692 : }
5693 : }
5694 : else
5695 : {
5696 : /* single byte version */
5697 0 : while (p < endp)
5698 0 : *(--dst) = *p++;
5699 : }
5700 :
5701 6 : PG_RETURN_TEXT_P(result);
5702 : }
5703 :
5704 :
/*
 * Helper macros used by text_format() and its subroutines
 */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* set when the minus flag is given */

/*
 * Step "ptr" forward by one character; raise an error if that reaches or
 * passes "end_ptr", since a format specifier must not end the string.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if ((end_ptr) <= ++(ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5718 :
5719 : /*
5720 : * Returns a formatted string
5721 : */
5722 : Datum
5723 31306 : text_format(PG_FUNCTION_ARGS)
5724 : {
5725 : text *fmt;
5726 : StringInfoData str;
5727 : const char *cp;
5728 : const char *start_ptr;
5729 : const char *end_ptr;
5730 : text *result;
5731 : int arg;
5732 : bool funcvariadic;
5733 : int nargs;
5734 31306 : Datum *elements = NULL;
5735 31306 : bool *nulls = NULL;
5736 31306 : Oid element_type = InvalidOid;
5737 31306 : Oid prev_type = InvalidOid;
5738 31306 : Oid prev_width_type = InvalidOid;
5739 : FmgrInfo typoutputfinfo;
5740 : FmgrInfo typoutputinfo_width;
5741 :
5742 : /* When format string is null, immediately return null */
5743 31306 : if (PG_ARGISNULL(0))
5744 6 : PG_RETURN_NULL();
5745 :
5746 : /* If argument is marked VARIADIC, expand array into elements */
5747 31300 : if (get_fn_expr_variadic(fcinfo->flinfo))
5748 : {
5749 : ArrayType *arr;
5750 : int16 elmlen;
5751 : bool elmbyval;
5752 : char elmalign;
5753 : int nitems;
5754 :
5755 : /* Should have just the one argument */
5756 : Assert(PG_NARGS() == 2);
5757 :
5758 : /* If argument is NULL, we treat it as zero-length array */
5759 48 : if (PG_ARGISNULL(1))
5760 6 : nitems = 0;
5761 : else
5762 : {
5763 : /*
5764 : * Non-null argument had better be an array. We assume that any
5765 : * call context that could let get_fn_expr_variadic return true
5766 : * will have checked that a VARIADIC-labeled parameter actually is
5767 : * an array. So it should be okay to just Assert that it's an
5768 : * array rather than doing a full-fledged error check.
5769 : */
5770 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5771 :
5772 : /* OK, safe to fetch the array value */
5773 42 : arr = PG_GETARG_ARRAYTYPE_P(1);
5774 :
5775 : /* Get info about array element type */
5776 42 : element_type = ARR_ELEMTYPE(arr);
5777 42 : get_typlenbyvalalign(element_type,
5778 : &elmlen, &elmbyval, &elmalign);
5779 :
5780 : /* Extract all array elements */
5781 42 : deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5782 : &elements, &nulls, &nitems);
5783 : }
5784 :
5785 48 : nargs = nitems + 1;
5786 48 : funcvariadic = true;
5787 : }
5788 : else
5789 : {
5790 : /* Non-variadic case, we'll process the arguments individually */
5791 31252 : nargs = PG_NARGS();
5792 31252 : funcvariadic = false;
5793 : }
5794 :
5795 : /* Setup for main loop. */
5796 31300 : fmt = PG_GETARG_TEXT_PP(0);
5797 31300 : start_ptr = VARDATA_ANY(fmt);
5798 31300 : end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5799 31300 : initStringInfo(&str);
5800 31300 : arg = 1; /* next argument position to print */
5801 :
5802 : /* Scan format string, looking for conversion specifiers. */
5803 875098 : for (cp = start_ptr; cp < end_ptr; cp++)
5804 : {
5805 : int argpos;
5806 : int widthpos;
5807 : int flags;
5808 : int width;
5809 : Datum value;
5810 : bool isNull;
5811 : Oid typid;
5812 :
5813 : /*
5814 : * If it's not the start of a conversion specifier, just copy it to
5815 : * the output buffer.
5816 : */
5817 843858 : if (*cp != '%')
5818 : {
5819 781634 : appendStringInfoCharMacro(&str, *cp);
5820 781652 : continue;
5821 : }
5822 :
5823 62224 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5824 :
5825 : /* Easy case: %% outputs a single % */
5826 62224 : if (*cp == '%')
5827 : {
5828 18 : appendStringInfoCharMacro(&str, *cp);
5829 18 : continue;
5830 : }
5831 :
5832 : /* Parse the optional portions of the format specifier */
5833 62206 : cp = text_format_parse_format(cp, end_ptr,
5834 : &argpos, &widthpos,
5835 : &flags, &width);
5836 :
5837 : /*
5838 : * Next we should see the main conversion specifier. Whether or not
5839 : * an argument position was present, it's known that at least one
5840 : * character remains in the string at this point. Experience suggests
5841 : * that it's worth checking that that character is one of the expected
5842 : * ones before we try to fetch arguments, so as to produce the least
5843 : * confusing response to a mis-formatted specifier.
5844 : */
5845 62182 : if (strchr("sIL", *cp) == NULL)
5846 6 : ereport(ERROR,
5847 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5848 : errmsg("unrecognized format() type specifier \"%.*s\"",
5849 : pg_mblen(cp), cp),
5850 : errhint("For a single \"%%\" use \"%%%%\".")));
5851 :
5852 : /* If indirect width was specified, get its value */
5853 62176 : if (widthpos >= 0)
5854 : {
5855 : /* Collect the specified or next argument position */
5856 42 : if (widthpos > 0)
5857 36 : arg = widthpos;
5858 42 : if (arg >= nargs)
5859 0 : ereport(ERROR,
5860 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5861 : errmsg("too few arguments for format()")));
5862 :
5863 : /* Get the value and type of the selected argument */
5864 42 : if (!funcvariadic)
5865 : {
5866 42 : value = PG_GETARG_DATUM(arg);
5867 42 : isNull = PG_ARGISNULL(arg);
5868 42 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5869 : }
5870 : else
5871 : {
5872 0 : value = elements[arg - 1];
5873 0 : isNull = nulls[arg - 1];
5874 0 : typid = element_type;
5875 : }
5876 42 : if (!OidIsValid(typid))
5877 0 : elog(ERROR, "could not determine data type of format() input");
5878 :
5879 42 : arg++;
5880 :
5881 : /* We can treat NULL width the same as zero */
5882 42 : if (isNull)
5883 6 : width = 0;
5884 36 : else if (typid == INT4OID)
5885 36 : width = DatumGetInt32(value);
5886 0 : else if (typid == INT2OID)
5887 0 : width = DatumGetInt16(value);
5888 : else
5889 : {
5890 : /* For less-usual datatypes, convert to text then to int */
5891 : char *str;
5892 :
5893 0 : if (typid != prev_width_type)
5894 : {
5895 : Oid typoutputfunc;
5896 : bool typIsVarlena;
5897 :
5898 0 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5899 0 : fmgr_info(typoutputfunc, &typoutputinfo_width);
5900 0 : prev_width_type = typid;
5901 : }
5902 :
5903 0 : str = OutputFunctionCall(&typoutputinfo_width, value);
5904 :
5905 : /* pg_strtoint32 will complain about bad data or overflow */
5906 0 : width = pg_strtoint32(str);
5907 :
5908 0 : pfree(str);
5909 : }
5910 : }
5911 :
5912 : /* Collect the specified or next argument position */
5913 62176 : if (argpos > 0)
5914 132 : arg = argpos;
5915 62176 : if (arg >= nargs)
5916 24 : ereport(ERROR,
5917 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5918 : errmsg("too few arguments for format()")));
5919 :
5920 : /* Get the value and type of the selected argument */
5921 62152 : if (!funcvariadic)
5922 : {
5923 60880 : value = PG_GETARG_DATUM(arg);
5924 60880 : isNull = PG_ARGISNULL(arg);
5925 60880 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5926 : }
5927 : else
5928 : {
5929 1272 : value = elements[arg - 1];
5930 1272 : isNull = nulls[arg - 1];
5931 1272 : typid = element_type;
5932 : }
5933 62152 : if (!OidIsValid(typid))
5934 0 : elog(ERROR, "could not determine data type of format() input");
5935 :
5936 62152 : arg++;
5937 :
5938 : /*
5939 : * Get the appropriate typOutput function, reusing previous one if
5940 : * same type as previous argument. That's particularly useful in the
5941 : * variadic-array case, but often saves work even for ordinary calls.
5942 : */
5943 62152 : if (typid != prev_type)
5944 : {
5945 : Oid typoutputfunc;
5946 : bool typIsVarlena;
5947 :
5948 30634 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5949 30634 : fmgr_info(typoutputfunc, &typoutputfinfo);
5950 30634 : prev_type = typid;
5951 : }
5952 :
5953 : /*
5954 : * And now we can format the value.
5955 : */
5956 62152 : switch (*cp)
5957 : {
5958 62152 : case 's':
5959 : case 'I':
5960 : case 'L':
5961 62152 : text_format_string_conversion(&str, *cp, &typoutputfinfo,
5962 : value, isNull,
5963 : flags, width);
5964 62146 : break;
5965 0 : default:
5966 : /* should not get here, because of previous check */
5967 0 : ereport(ERROR,
5968 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5969 : errmsg("unrecognized format() type specifier \"%.*s\"",
5970 : pg_mblen(cp), cp),
5971 : errhint("For a single \"%%\" use \"%%%%\".")));
5972 : break;
5973 : }
5974 : }
5975 :
5976 : /* Don't need deconstruct_array results anymore. */
5977 31240 : if (elements != NULL)
5978 42 : pfree(elements);
5979 31240 : if (nulls != NULL)
5980 42 : pfree(nulls);
5981 :
5982 : /* Generate results. */
5983 31240 : result = cstring_to_text_with_len(str.data, str.len);
5984 31240 : pfree(str.data);
5985 :
5986 31240 : PG_RETURN_TEXT_P(result);
5987 : }
5988 :
5989 : /*
5990 : * Parse contiguous digits as a decimal number.
5991 : *
5992 : * Returns true if some digits could be parsed.
5993 : * The value is returned into *value, and *ptr is advanced to the next
5994 : * character to be parsed.
5995 : *
5996 : * Note parsing invariant: at least one character is known available before
5997 : * string end (end_ptr) at entry, and this is still true at exit.
5998 : */
5999 : static bool
6000 124376 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
6001 : {
6002 124376 : bool found = false;
6003 124376 : const char *cp = *ptr;
6004 124376 : int val = 0;
6005 :
6006 124688 : while (*cp >= '0' && *cp <= '9')
6007 : {
6008 318 : int8 digit = (*cp - '0');
6009 :
6010 318 : if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
6011 318 : unlikely(pg_add_s32_overflow(val, digit, &val)))
6012 0 : ereport(ERROR,
6013 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6014 : errmsg("number is out of range")));
6015 318 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6016 312 : found = true;
6017 : }
6018 :
6019 124370 : *ptr = cp;
6020 124370 : *value = val;
6021 :
6022 124370 : return found;
6023 : }
6024 :
6025 : /*
6026 : * Parse a format specifier (generally following the SUS printf spec).
6027 : *
6028 : * We have already advanced over the initial '%', and we are looking for
6029 : * [argpos][flags][width]type (but the type character is not consumed here).
6030 : *
6031 : * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
6032 : * Output parameters:
6033 : * argpos: argument position for value to be printed. -1 means unspecified.
6034 : * widthpos: argument position for width. Zero means the argument position
6035 : * was unspecified (ie, take the next arg) and -1 means no width
6036 : * argument (width was omitted or specified as a constant).
6037 : * flags: bitmask of flags.
6038 : * width: directly-specified width value. Zero means the width was omitted
6039 : * (note it's not necessary to distinguish this case from an explicit
6040 : * zero width value).
6041 : *
6042 : * The function result is the next character position to be parsed, ie, the
6043 : * location where the type character is/should be.
6044 : *
6045 : * Note parsing invariant: at least one character is known available before
6046 : * string end (end_ptr) at entry, and this is still true at exit.
6047 : */
6048 : static const char *
6049 62206 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
6050 : int *argpos, int *widthpos,
6051 : int *flags, int *width)
6052 : {
6053 62206 : const char *cp = start_ptr;
6054 : int n;
6055 :
6056 : /* set defaults for output parameters */
6057 62206 : *argpos = -1;
6058 62206 : *widthpos = -1;
6059 62206 : *flags = 0;
6060 62206 : *width = 0;
6061 :
6062 : /* try to identify first number */
6063 62206 : if (text_format_parse_digits(&cp, end_ptr, &n))
6064 : {
6065 174 : if (*cp != '$')
6066 : {
6067 : /* Must be just a width and a type, so we're done */
6068 24 : *width = n;
6069 24 : return cp;
6070 : }
6071 : /* The number was argument position */
6072 150 : *argpos = n;
6073 : /* Explicit 0 for argument index is immediately refused */
6074 150 : if (n == 0)
6075 6 : ereport(ERROR,
6076 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6077 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
6078 144 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6079 : }
6080 :
6081 : /* Handle flags (only minus is supported now) */
6082 62200 : while (*cp == '-')
6083 : {
6084 30 : *flags |= TEXT_FORMAT_FLAG_MINUS;
6085 30 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6086 : }
6087 :
6088 62170 : if (*cp == '*')
6089 : {
6090 : /* Handle indirect width */
6091 48 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6092 48 : if (text_format_parse_digits(&cp, end_ptr, &n))
6093 : {
6094 : /* number in this position must be closed by $ */
6095 42 : if (*cp != '$')
6096 0 : ereport(ERROR,
6097 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6098 : errmsg("width argument position must be ended by \"$\"")));
6099 : /* The number was width argument position */
6100 42 : *widthpos = n;
6101 : /* Explicit 0 for argument index is immediately refused */
6102 42 : if (n == 0)
6103 6 : ereport(ERROR,
6104 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6105 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
6106 36 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6107 : }
6108 : else
6109 6 : *widthpos = 0; /* width's argument position is unspecified */
6110 : }
6111 : else
6112 : {
6113 : /* Check for direct width specification */
6114 62122 : if (text_format_parse_digits(&cp, end_ptr, &n))
6115 30 : *width = n;
6116 : }
6117 :
6118 : /* cp should now be pointing at type character */
6119 62158 : return cp;
6120 : }
6121 :
6122 : /*
6123 : * Format a %s, %I, or %L conversion
6124 : */
6125 : static void
6126 62152 : text_format_string_conversion(StringInfo buf, char conversion,
6127 : FmgrInfo *typOutputInfo,
6128 : Datum value, bool isNull,
6129 : int flags, int width)
6130 : {
6131 : char *str;
6132 :
6133 : /* Handle NULL arguments before trying to stringify the value. */
6134 62152 : if (isNull)
6135 : {
6136 342 : if (conversion == 's')
6137 270 : text_format_append_string(buf, "", flags, width);
6138 72 : else if (conversion == 'L')
6139 66 : text_format_append_string(buf, "NULL", flags, width);
6140 6 : else if (conversion == 'I')
6141 6 : ereport(ERROR,
6142 : (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6143 : errmsg("null values cannot be formatted as an SQL identifier")));
6144 336 : return;
6145 : }
6146 :
6147 : /* Stringify. */
6148 61810 : str = OutputFunctionCall(typOutputInfo, value);
6149 :
6150 : /* Escape. */
6151 61810 : if (conversion == 'I')
6152 : {
6153 : /* quote_identifier may or may not allocate a new string. */
6154 3162 : text_format_append_string(buf, quote_identifier(str), flags, width);
6155 : }
6156 58648 : else if (conversion == 'L')
6157 : {
6158 3232 : char *qstr = quote_literal_cstr(str);
6159 :
6160 3232 : text_format_append_string(buf, qstr, flags, width);
6161 : /* quote_literal_cstr() always allocates a new string */
6162 3232 : pfree(qstr);
6163 : }
6164 : else
6165 55416 : text_format_append_string(buf, str, flags, width);
6166 :
6167 : /* Cleanup. */
6168 61810 : pfree(str);
6169 : }
6170 :
6171 : /*
6172 : * Append str to buf, padding as directed by flags/width
6173 : */
6174 : static void
6175 62146 : text_format_append_string(StringInfo buf, const char *str,
6176 : int flags, int width)
6177 : {
6178 62146 : bool align_to_left = false;
6179 : int len;
6180 :
6181 : /* fast path for typical easy case */
6182 62146 : if (width == 0)
6183 : {
6184 62062 : appendStringInfoString(buf, str);
6185 62062 : return;
6186 : }
6187 :
6188 84 : if (width < 0)
6189 : {
6190 : /* Negative width: implicit '-' flag, then take absolute value */
6191 6 : align_to_left = true;
6192 : /* -INT_MIN is undefined */
6193 6 : if (width <= INT_MIN)
6194 0 : ereport(ERROR,
6195 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6196 : errmsg("number is out of range")));
6197 6 : width = -width;
6198 : }
6199 78 : else if (flags & TEXT_FORMAT_FLAG_MINUS)
6200 24 : align_to_left = true;
6201 :
6202 84 : len = pg_mbstrlen(str);
6203 84 : if (align_to_left)
6204 : {
6205 : /* left justify */
6206 30 : appendStringInfoString(buf, str);
6207 30 : if (len < width)
6208 30 : appendStringInfoSpaces(buf, width - len);
6209 : }
6210 : else
6211 : {
6212 : /* right justify */
6213 54 : if (len < width)
6214 54 : appendStringInfoSpaces(buf, width - len);
6215 54 : appendStringInfoString(buf, str);
6216 : }
6217 : }
6218 :
6219 : /*
6220 : * text_format_nv - nonvariadic wrapper for text_format function.
6221 : *
6222 : * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6223 : * which checks that all built-in functions that share the implementing C
6224 : * function take the same number of arguments.
6225 : */
6226 : Datum
6227 3810 : text_format_nv(PG_FUNCTION_ARGS)
6228 : {
6229 3810 : return text_format(fcinfo);
6230 : }
6231 :
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * len bytes of s1 and s2 are identical.  Faster than memcmp() for this use
 * case.
 *
 * Bytes are compared from the last one towards the first.  A len of zero
 * or less compares nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int i = len; i > 0; i--)
	{
		if (s1[i - 1] != s2[i - 1])
			return false;
	}

	return true;
}
6247 :
6248 : /* Expand each Levenshtein distance variant */
6249 : #include "levenshtein.c"
6250 : #define LEVENSHTEIN_LESS_EQUAL
6251 : #include "levenshtein.c"
6252 :
6253 :
6254 : /*
6255 : * The following *ClosestMatch() functions can be used to determine whether a
6256 : * user-provided string resembles any known valid values, which is useful for
6257 : * providing hints in log messages, among other things. Use these functions
6258 : * like so:
6259 : *
6260 : * initClosestMatch(&state, source_string, max_distance);
6261 : *
6262 : * for (int i = 0; i < num_valid_strings; i++)
6263 : * updateClosestMatch(&state, valid_strings[i]);
6264 : *
6265 : * closestMatch = getClosestMatch(&state);
6266 : */
6267 :
6268 : /*
6269 : * Initialize the given state with the source string and maximum Levenshtein
6270 : * distance to consider.
6271 : */
6272 : void
6273 56 : initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
6274 : {
6275 : Assert(state);
6276 : Assert(max_d >= 0);
6277 :
6278 56 : state->source = source;
6279 56 : state->min_d = -1;
6280 56 : state->max_d = max_d;
6281 56 : state->match = NULL;
6282 56 : }
6283 :
6284 : /*
6285 : * If the candidate string is a closer match than the current one saved (or
6286 : * there is no match saved), save it as the closest match.
6287 : *
6288 : * If the source or candidate string is NULL, empty, or too long, this function
6289 : * takes no action. Likewise, if the Levenshtein distance exceeds the maximum
6290 : * allowed or more than half the characters are different, no action is taken.
6291 : */
6292 : void
6293 360 : updateClosestMatch(ClosestMatchState *state, const char *candidate)
6294 : {
6295 : int dist;
6296 :
6297 : Assert(state);
6298 :
6299 360 : if (state->source == NULL || state->source[0] == '\0' ||
6300 360 : candidate == NULL || candidate[0] == '\0')
6301 0 : return;
6302 :
6303 : /*
6304 : * To avoid ERROR-ing, we check the lengths here instead of setting
6305 : * 'trusted' to false in the call to varstr_levenshtein_less_equal().
6306 : */
6307 360 : if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
6308 360 : strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
6309 0 : return;
6310 :
6311 360 : dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
6312 360 : candidate, strlen(candidate), 1, 1, 1,
6313 : state->max_d, true);
6314 360 : if (dist <= state->max_d &&
6315 56 : dist <= strlen(state->source) / 2 &&
6316 14 : (state->min_d == -1 || dist < state->min_d))
6317 : {
6318 14 : state->min_d = dist;
6319 14 : state->match = candidate;
6320 : }
6321 : }
6322 :
6323 : /*
6324 : * Return the closest match. If no suitable candidates were provided via
6325 : * updateClosestMatch(), return NULL.
6326 : */
6327 : const char *
6328 56 : getClosestMatch(ClosestMatchState *state)
6329 : {
6330 : Assert(state);
6331 :
6332 56 : return state->match;
6333 : }
6334 :
6335 :
6336 : /*
6337 : * Unicode support
6338 : */
6339 :
6340 : static UnicodeNormalizationForm
6341 210 : unicode_norm_form_from_string(const char *formstr)
6342 : {
6343 210 : UnicodeNormalizationForm form = -1;
6344 :
6345 : /*
6346 : * Might as well check this while we're here.
6347 : */
6348 210 : if (GetDatabaseEncoding() != PG_UTF8)
6349 0 : ereport(ERROR,
6350 : (errcode(ERRCODE_SYNTAX_ERROR),
6351 : errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6352 :
6353 210 : if (pg_strcasecmp(formstr, "NFC") == 0)
6354 66 : form = UNICODE_NFC;
6355 144 : else if (pg_strcasecmp(formstr, "NFD") == 0)
6356 60 : form = UNICODE_NFD;
6357 84 : else if (pg_strcasecmp(formstr, "NFKC") == 0)
6358 36 : form = UNICODE_NFKC;
6359 48 : else if (pg_strcasecmp(formstr, "NFKD") == 0)
6360 36 : form = UNICODE_NFKD;
6361 : else
6362 12 : ereport(ERROR,
6363 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6364 : errmsg("invalid normalization form: %s", formstr)));
6365 :
6366 198 : return form;
6367 : }
6368 :
6369 : /*
6370 : * Returns version of Unicode used by Postgres in "major.minor" format (the
6371 : * same format as the Unicode version reported by ICU). The third component
6372 : * ("update version") never involves additions to the character repertoire and
6373 : * is unimportant for most purposes.
6374 : *
6375 : * See: https://unicode.org/versions/
6376 : */
6377 : Datum
6378 6 : unicode_version(PG_FUNCTION_ARGS)
6379 : {
6380 6 : PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
6381 : }
6382 :
6383 : /*
6384 : * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
6385 : */
6386 : Datum
6387 2 : icu_unicode_version(PG_FUNCTION_ARGS)
6388 : {
6389 : #ifdef USE_ICU
6390 2 : PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
6391 : #else
6392 : PG_RETURN_NULL();
6393 : #endif
6394 : }
6395 :
6396 : /*
6397 : * Check whether the string contains only assigned Unicode code
6398 : * points. Requires that the database encoding is UTF-8.
6399 : */
6400 : Datum
6401 12 : unicode_assigned(PG_FUNCTION_ARGS)
6402 : {
6403 12 : text *input = PG_GETARG_TEXT_PP(0);
6404 : unsigned char *p;
6405 : int size;
6406 :
6407 12 : if (GetDatabaseEncoding() != PG_UTF8)
6408 0 : ereport(ERROR,
6409 : (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
6410 :
6411 : /* convert to pg_wchar */
6412 12 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6413 12 : p = (unsigned char *) VARDATA_ANY(input);
6414 48 : for (int i = 0; i < size; i++)
6415 : {
6416 42 : pg_wchar uchar = utf8_to_unicode(p);
6417 42 : int category = unicode_category(uchar);
6418 :
6419 42 : if (category == PG_U_UNASSIGNED)
6420 6 : PG_RETURN_BOOL(false);
6421 :
6422 36 : p += pg_utf_mblen(p);
6423 : }
6424 :
6425 6 : PG_RETURN_BOOL(true);
6426 : }
6427 :
6428 : Datum
6429 72 : unicode_normalize_func(PG_FUNCTION_ARGS)
6430 : {
6431 72 : text *input = PG_GETARG_TEXT_PP(0);
6432 72 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6433 : UnicodeNormalizationForm form;
6434 : int size;
6435 : pg_wchar *input_chars;
6436 : pg_wchar *output_chars;
6437 : unsigned char *p;
6438 : text *result;
6439 : int i;
6440 :
6441 72 : form = unicode_norm_form_from_string(formstr);
6442 :
6443 : /* convert to pg_wchar */
6444 66 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6445 66 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6446 66 : p = (unsigned char *) VARDATA_ANY(input);
6447 288 : for (i = 0; i < size; i++)
6448 : {
6449 222 : input_chars[i] = utf8_to_unicode(p);
6450 222 : p += pg_utf_mblen(p);
6451 : }
6452 66 : input_chars[i] = (pg_wchar) '\0';
6453 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6454 :
6455 : /* action */
6456 66 : output_chars = unicode_normalize(form, input_chars);
6457 :
6458 : /* convert back to UTF-8 string */
6459 66 : size = 0;
6460 306 : for (pg_wchar *wp = output_chars; *wp; wp++)
6461 : {
6462 : unsigned char buf[4];
6463 :
6464 240 : unicode_to_utf8(*wp, buf);
6465 240 : size += pg_utf_mblen(buf);
6466 : }
6467 :
6468 66 : result = palloc(size + VARHDRSZ);
6469 66 : SET_VARSIZE(result, size + VARHDRSZ);
6470 :
6471 66 : p = (unsigned char *) VARDATA_ANY(result);
6472 306 : for (pg_wchar *wp = output_chars; *wp; wp++)
6473 : {
6474 240 : unicode_to_utf8(*wp, p);
6475 240 : p += pg_utf_mblen(p);
6476 : }
6477 : Assert((char *) p == (char *) result + size + VARHDRSZ);
6478 :
6479 66 : PG_RETURN_TEXT_P(result);
6480 : }
6481 :
6482 : /*
6483 : * Check whether the string is in the specified Unicode normalization form.
6484 : *
6485 : * This is done by converting the string to the specified normal form and then
6486 : * comparing that to the original string. To speed that up, we also apply the
6487 : * "quick check" algorithm specified in UAX #15, which can give a yes or no
6488 : * answer for many strings by just scanning the string once.
6489 : *
6490 : * This function should generally be optimized for the case where the string
6491 : * is in fact normalized. In that case, we'll end up looking at the entire
6492 : * string, so it's probably not worth doing any incremental conversion etc.
6493 : */
6494 : Datum
6495 138 : unicode_is_normalized(PG_FUNCTION_ARGS)
6496 : {
6497 138 : text *input = PG_GETARG_TEXT_PP(0);
6498 138 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6499 : UnicodeNormalizationForm form;
6500 : int size;
6501 : pg_wchar *input_chars;
6502 : pg_wchar *output_chars;
6503 : unsigned char *p;
6504 : int i;
6505 : UnicodeNormalizationQC quickcheck;
6506 : int output_size;
6507 : bool result;
6508 :
6509 138 : form = unicode_norm_form_from_string(formstr);
6510 :
6511 : /* convert to pg_wchar */
6512 132 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6513 132 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6514 132 : p = (unsigned char *) VARDATA_ANY(input);
6515 504 : for (i = 0; i < size; i++)
6516 : {
6517 372 : input_chars[i] = utf8_to_unicode(p);
6518 372 : p += pg_utf_mblen(p);
6519 : }
6520 132 : input_chars[i] = (pg_wchar) '\0';
6521 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6522 :
6523 : /* quick check (see UAX #15) */
6524 132 : quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6525 132 : if (quickcheck == UNICODE_NORM_QC_YES)
6526 42 : PG_RETURN_BOOL(true);
6527 90 : else if (quickcheck == UNICODE_NORM_QC_NO)
6528 12 : PG_RETURN_BOOL(false);
6529 :
6530 : /* normalize and compare with original */
6531 78 : output_chars = unicode_normalize(form, input_chars);
6532 :
6533 78 : output_size = 0;
6534 324 : for (pg_wchar *wp = output_chars; *wp; wp++)
6535 246 : output_size++;
6536 :
6537 114 : result = (size == output_size) &&
6538 36 : (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6539 :
6540 78 : PG_RETURN_BOOL(result);
6541 : }
6542 :
/*
 * Report whether the first n bytes at instr are all hexadecimal digits.
 *
 * Scanning stops at the first byte that is not a hex digit, so nothing
 * beyond it is read.  With n == 0 the result is true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	while (n-- > 0)
	{
		/* cast keeps isxdigit() well-defined for bytes with the high bit set */
		if (!isxdigit((unsigned char) *instr++))
			return false;
	}

	return true;
}
6555 :
6556 : static unsigned int
6557 504 : hexval(unsigned char c)
6558 : {
6559 504 : if (c >= '0' && c <= '9')
6560 384 : return c - '0';
6561 120 : if (c >= 'a' && c <= 'f')
6562 60 : return c - 'a' + 0xA;
6563 60 : if (c >= 'A' && c <= 'F')
6564 60 : return c - 'A' + 0xA;
6565 0 : elog(ERROR, "invalid hexadecimal digit");
6566 : return 0; /* not reached */
6567 : }
6568 :
/*
 * Convert the first n characters of instr, taken as hexadecimal digits with
 * the most significant digit first, into a number.
 *
 * hexval() raises an error for any character that is not a hex digit.
 * NOTE(review): the callers visible in this file pass n of 4, 6 or 8; the
 * result is presumably not meaningful for more digits than fit in an
 * unsigned int.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int acc = 0;

	/* each step moves the digits gathered so far up by one nibble */
	for (const char *digit = instr; n > 0; digit++, n--)
		acc = (acc << 4) + hexval(*digit);

	return acc;
}
6582 :
6583 : /*
6584 : * Replaces Unicode escape sequences by Unicode characters
6585 : */
6586 : Datum
6587 66 : unistr(PG_FUNCTION_ARGS)
6588 : {
6589 66 : text *input_text = PG_GETARG_TEXT_PP(0);
6590 : char *instr;
6591 : int len;
6592 : StringInfoData str;
6593 : text *result;
6594 66 : pg_wchar pair_first = 0;
6595 : char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6596 :
6597 66 : instr = VARDATA_ANY(input_text);
6598 66 : len = VARSIZE_ANY_EXHDR(input_text);
6599 :
6600 66 : initStringInfo(&str);
6601 :
6602 510 : while (len > 0)
6603 : {
6604 486 : if (instr[0] == '\\')
6605 : {
6606 102 : if (len >= 2 &&
6607 102 : instr[1] == '\\')
6608 : {
6609 6 : if (pair_first)
6610 0 : goto invalid_pair;
6611 6 : appendStringInfoChar(&str, '\\');
6612 6 : instr += 2;
6613 6 : len -= 2;
6614 : }
6615 96 : else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6616 66 : (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6617 30 : {
6618 : pg_wchar unicode;
6619 42 : int offset = instr[1] == 'u' ? 2 : 1;
6620 :
6621 42 : unicode = hexval_n(instr + offset, 4);
6622 :
6623 42 : if (!is_valid_unicode_codepoint(unicode))
6624 0 : ereport(ERROR,
6625 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6626 : errmsg("invalid Unicode code point: %04X", unicode));
6627 :
6628 42 : if (pair_first)
6629 : {
6630 12 : if (is_utf16_surrogate_second(unicode))
6631 : {
6632 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6633 0 : pair_first = 0;
6634 : }
6635 : else
6636 12 : goto invalid_pair;
6637 : }
6638 30 : else if (is_utf16_surrogate_second(unicode))
6639 0 : goto invalid_pair;
6640 :
6641 30 : if (is_utf16_surrogate_first(unicode))
6642 18 : pair_first = unicode;
6643 : else
6644 : {
6645 12 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6646 12 : appendStringInfoString(&str, cbuf);
6647 : }
6648 :
6649 30 : instr += 4 + offset;
6650 30 : len -= 4 + offset;
6651 : }
6652 54 : else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6653 12 : {
6654 : pg_wchar unicode;
6655 :
6656 24 : unicode = hexval_n(instr + 2, 6);
6657 :
6658 24 : if (!is_valid_unicode_codepoint(unicode))
6659 6 : ereport(ERROR,
6660 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6661 : errmsg("invalid Unicode code point: %04X", unicode));
6662 :
6663 18 : if (pair_first)
6664 : {
6665 6 : if (is_utf16_surrogate_second(unicode))
6666 : {
6667 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6668 0 : pair_first = 0;
6669 : }
6670 : else
6671 6 : goto invalid_pair;
6672 : }
6673 12 : else if (is_utf16_surrogate_second(unicode))
6674 0 : goto invalid_pair;
6675 :
6676 12 : if (is_utf16_surrogate_first(unicode))
6677 6 : pair_first = unicode;
6678 : else
6679 : {
6680 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6681 6 : appendStringInfoString(&str, cbuf);
6682 : }
6683 :
6684 12 : instr += 8;
6685 12 : len -= 8;
6686 : }
6687 30 : else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6688 12 : {
6689 : pg_wchar unicode;
6690 :
6691 24 : unicode = hexval_n(instr + 2, 8);
6692 :
6693 24 : if (!is_valid_unicode_codepoint(unicode))
6694 6 : ereport(ERROR,
6695 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6696 : errmsg("invalid Unicode code point: %04X", unicode));
6697 :
6698 18 : if (pair_first)
6699 : {
6700 6 : if (is_utf16_surrogate_second(unicode))
6701 : {
6702 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6703 0 : pair_first = 0;
6704 : }
6705 : else
6706 6 : goto invalid_pair;
6707 : }
6708 12 : else if (is_utf16_surrogate_second(unicode))
6709 0 : goto invalid_pair;
6710 :
6711 12 : if (is_utf16_surrogate_first(unicode))
6712 6 : pair_first = unicode;
6713 : else
6714 : {
6715 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6716 6 : appendStringInfoString(&str, cbuf);
6717 : }
6718 :
6719 12 : instr += 10;
6720 12 : len -= 10;
6721 : }
6722 : else
6723 6 : ereport(ERROR,
6724 : (errcode(ERRCODE_SYNTAX_ERROR),
6725 : errmsg("invalid Unicode escape"),
6726 : errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6727 : }
6728 : else
6729 : {
6730 384 : if (pair_first)
6731 0 : goto invalid_pair;
6732 :
6733 384 : appendStringInfoChar(&str, *instr++);
6734 384 : len--;
6735 : }
6736 : }
6737 :
6738 : /* unfinished surrogate pair? */
6739 24 : if (pair_first)
6740 6 : goto invalid_pair;
6741 :
6742 18 : result = cstring_to_text_with_len(str.data, str.len);
6743 18 : pfree(str.data);
6744 :
6745 18 : PG_RETURN_TEXT_P(result);
6746 :
6747 30 : invalid_pair:
6748 30 : ereport(ERROR,
6749 : (errcode(ERRCODE_SYNTAX_ERROR),
6750 : errmsg("invalid Unicode surrogate pair")));
6751 : PG_RETURN_NULL(); /* keep compiler quiet */
6752 : }
|