Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * varlena.c
4 : * Functions for the variable-length built-in types.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/varlena.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include <ctype.h>
18 : #include <limits.h>
19 :
20 : #include "access/detoast.h"
21 : #include "access/toast_compression.h"
22 : #include "catalog/pg_collation.h"
23 : #include "catalog/pg_type.h"
24 : #include "common/hashfn.h"
25 : #include "common/int.h"
26 : #include "common/unicode_category.h"
27 : #include "common/unicode_norm.h"
28 : #include "common/unicode_version.h"
29 : #include "funcapi.h"
30 : #include "lib/hyperloglog.h"
31 : #include "libpq/pqformat.h"
32 : #include "miscadmin.h"
33 : #include "nodes/execnodes.h"
34 : #include "parser/scansup.h"
35 : #include "port/pg_bswap.h"
36 : #include "regex/regex.h"
37 : #include "utils/builtins.h"
38 : #include "utils/bytea.h"
39 : #include "utils/guc.h"
40 : #include "utils/lsyscache.h"
41 : #include "utils/memutils.h"
42 : #include "utils/pg_locale.h"
43 : #include "utils/sortsupport.h"
44 : #include "utils/varlena.h"
45 :
46 :
47 : /* GUC variable: selects the output format used by byteaout() (hex or escape) */
48 : int bytea_output = BYTEA_OUTPUT_HEX;
49 : /* VarString: alias for struct varlena, used by the DatumGetVarString* macros */
50 : typedef struct varlena VarString;
51 :
52 : /*
53 : * State for text_position_* functions.
 *
 * A pointer to this struct is passed to text_position_setup(),
 * text_position_next() and the other text_position_* helpers declared
 * later in this file. It carries the two strings being searched, the
 * Boyer-Moore-Horspool skip table, and bookkeeping about the most recent
 * match.
54 : */
55 : typedef struct
56 : {
57 : bool is_multibyte_char_in_char; /* need to check char boundaries? */
58 :
59 : char *str1; /* haystack string */
60 : char *str2; /* needle string */
61 : int len1; /* string lengths in bytes (len1/len2 presumably pair with str1/str2) */
62 : int len2;
63 :
64 : /* Skip table for Boyer-Moore-Horspool search algorithm: */
65 : int skiptablemask; /* mask for ANDing with skiptable subscripts */
66 : int skiptable[256]; /* skip distance for given mismatched char */
67 :
68 : char *last_match; /* pointer to last match in 'str1' */
69 :
70 : /*
71 : * Sometimes we need to convert the byte position of a match to a
72 : * character position. These store the last position that was converted,
73 : * so that on the next call, we can continue from that point, rather than
74 : * count characters from the very beginning.
75 : */
76 : char *refpoint; /* pointer within original haystack string */
77 : int refpos; /* 0-based character offset of the same point */
78 : } TextPositionState;
79 : /* Working state for the varstr sort-support routines (varstrfastcmp_*, varstr_abbrev_*) declared below */
80 : typedef struct
81 : {
82 : char *buf1; /* 1st string, or abbreviation original string
83 : * buf */
84 : char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
85 : int buflen1; /* Allocated length of buf1 */
86 : int buflen2; /* Allocated length of buf2 */
87 : int last_len1; /* Length of last buf1 string/strxfrm() input */
88 : int last_len2; /* Length of last buf2 string/strxfrm() blob */
89 : int last_returned; /* Last comparison result (cache) */
90 : bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
91 : bool collate_c; /* NOTE(review): presumably true when the sort uses the C collation; confirm where this is set */
92 : Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
93 : hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
94 : hyperLogLogState full_card; /* Full key cardinality state */
95 : double prop_card; /* Required cardinality proportion */
96 : pg_locale_t locale; /* NOTE(review): presumably the locale used for non-C comparisons; confirm where this is set */
97 : } VarStringSortSupport;
98 :
99 : /*
100 : * Output data for split_text(): we output either to an array or a table.
101 : * tupstore and tupdesc must be set up in advance to output to a table.
 *
 * A pointer to this struct is handed to split_text() and
 * split_text_accum_result() (see their prototypes later in this file).
102 : */
103 : typedef struct
104 : {
105 : ArrayBuildState *astate; /* array build state (presumably used for array output) */
106 : Tuplestorestate *tupstore; /* destination tuplestore, for table output */
107 : TupleDesc tupdesc; /* tuple descriptor, for table output */
108 : } SplitTextOutputData;
109 :
110 : /*
111 : * This should be large enough that most strings will fit, but small enough
112 : * that we feel comfortable putting it on the stack
113 : */
114 : #define TEXTBUFLEN 1024
115 : /* Detoast a Datum and view it as a VarString; the PP variant uses the "packed" detoast macro */
116 : #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
117 : #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
118 : /* Forward declarations of the file-local (static) helper functions */
119 : static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
120 : static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
121 : static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
122 : static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
123 : static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
124 : static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
125 : static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
126 : static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
127 : static int32 text_length(Datum str);
128 : static text *text_catenate(text *t1, text *t2);
129 : static text *text_substring(Datum str,
130 : int32 start,
131 : int32 length,
132 : bool length_not_specified);
133 : static text *text_overlay(text *t1, text *t2, int sp, int sl);
134 : static int text_position(text *t1, text *t2, Oid collid);
135 : static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
136 : static bool text_position_next(TextPositionState *state);
137 : static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
138 : static char *text_position_get_match_ptr(TextPositionState *state);
139 : static int text_position_get_match_pos(TextPositionState *state);
140 : static void text_position_cleanup(TextPositionState *state);
141 : static void check_collation_set(Oid collid);
142 : static int text_cmp(text *arg1, text *arg2, Oid collid);
143 : static bytea *bytea_catenate(bytea *t1, bytea *t2);
144 : static bytea *bytea_substring(Datum str,
145 : int S,
146 : int L,
147 : bool length_not_specified);
148 : static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
149 : static void appendStringInfoText(StringInfo str, const text *t);
150 : static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
151 : static void split_text_accum_result(SplitTextOutputData *tstate,
152 : text *field_value,
153 : text *null_string,
154 : Oid collation);
155 : static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
156 : const char *fldsep, const char *null_string);
157 : static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
158 : static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
159 : int *value);
160 : static const char *text_format_parse_format(const char *start_ptr,
161 : const char *end_ptr,
162 : int *argpos, int *widthpos,
163 : int *flags, int *width);
164 : static void text_format_string_conversion(StringInfo buf, char conversion,
165 : FmgrInfo *typOutputInfo,
166 : Datum value, bool isNull,
167 : int flags, int width);
168 : static void text_format_append_string(StringInfo buf, const char *str,
169 : int flags, int width);
170 :
171 :
172 : /*****************************************************************************
173 : * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
174 : *****************************************************************************/
175 :
176 : /*
177 : * cstring_to_text
178 : *
179 : * Create a text value from a null-terminated C string.
180 : *
181 : * The new text value is freshly palloc'd with a full-size VARHDR.
182 : */
183 : text *
184 22339306 : cstring_to_text(const char *s)
185 : {
186 22339306 : return cstring_to_text_with_len(s, strlen(s));
187 : }
188 :
189 : /*
190 : * cstring_to_text_with_len
191 : *
192 : * Same as cstring_to_text except the caller specifies the string length;
193 : * the string need not be null_terminated.
194 : */
195 : text *
196 23746736 : cstring_to_text_with_len(const char *s, int len)
197 : {
198 23746736 : text *result = (text *) palloc(len + VARHDRSZ);
199 :
200 23746736 : SET_VARSIZE(result, len + VARHDRSZ);
201 23746736 : memcpy(VARDATA(result), s, len);
202 :
203 23746736 : return result;
204 : }
205 :
206 : /*
207 : * text_to_cstring
208 : *
209 : * Create a palloc'd, null-terminated C string from a text value.
210 : *
211 : * We support being passed a compressed or toasted text value.
212 : * This is a bit bogus since such values shouldn't really be referred to as
213 : * "text *", but it seems useful for robustness. If we didn't handle that
214 : * case here, we'd need another routine that did, anyway.
215 : */
216 : char *
217 13472086 : text_to_cstring(const text *t)
218 : {
219 : /* must cast away the const, unfortunately */
220 13472086 : text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
221 13472086 : int len = VARSIZE_ANY_EXHDR(tunpacked);
222 : char *result;
223 :
224 13472086 : result = (char *) palloc(len + 1);
225 13472086 : memcpy(result, VARDATA_ANY(tunpacked), len);
226 13472086 : result[len] = '\0';
227 :
228 13472086 : if (tunpacked != t)
229 29666 : pfree(tunpacked);
230 :
231 13472086 : return result;
232 : }
233 :
234 : /*
235 : * text_to_cstring_buffer
236 : *
237 : * Copy a text value into a caller-supplied buffer of size dst_len.
238 : *
239 : * The text string is truncated if necessary to fit. The result is
240 : * guaranteed null-terminated (unless dst_len == 0).
241 : *
242 : * We support being passed a compressed or toasted text value.
243 : * This is a bit bogus since such values shouldn't really be referred to as
244 : * "text *", but it seems useful for robustness. If we didn't handle that
245 : * case here, we'd need another routine that did, anyway.
246 : */
247 : void
248 940 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
249 : {
250 : /* must cast away the const, unfortunately */
251 940 : text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
252 940 : size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
253 :
254 940 : if (dst_len > 0)
255 : {
256 940 : dst_len--;
257 940 : if (dst_len >= src_len)
258 940 : dst_len = src_len;
259 : else /* ensure truncation is encoding-safe */
260 0 : dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
261 940 : memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
262 940 : dst[dst_len] = '\0';
263 : }
264 :
265 940 : if (srcunpacked != src)
266 0 : pfree(srcunpacked);
267 940 : }
268 :
269 :
270 : /*****************************************************************************
271 : * USER I/O ROUTINES *
272 : *****************************************************************************/
273 :
274 : /* VAL: ASCII digit character -> numeric value; DIG: numeric value -> ASCII digit character */
275 : #define VAL(CH) ((CH) - '0')
276 : #define DIG(VAL) ((VAL) + '0')
277 :
278 : /*
279 : * byteain - converts from printable representation of byte array
280 : *
281 : * Non-printable characters must be passed as '\nnn' (octal) and are
282 : * converted to internal form. '\' must be passed as '\\'.
283 : * ereport(ERROR, ...) if bad form.
284 : *
285 : * BUGS:
286 : * The input is scanned twice.
287 : * The error checking of input is minimal.
288 : */
289 : Datum
290 982174 : byteain(PG_FUNCTION_ARGS)
291 : {
292 982174 : char *inputText = PG_GETARG_CSTRING(0);
293 982174 : Node *escontext = fcinfo->context;
294 : char *tp;
295 : char *rp;
296 : int bc;
297 : bytea *result;
298 :
299 : /* Recognize hex input */
300 982174 : if (inputText[0] == '\\' && inputText[1] == 'x')
301 : {
302 111128 : size_t len = strlen(inputText);
303 :
304 111128 : bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
305 111128 : result = palloc(bc);
306 111128 : bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
307 : escontext);
308 111116 : SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
309 :
310 111116 : PG_RETURN_BYTEA_P(result);
311 : }
312 :
313 : /* Else, it's the traditional escaped style */
314 8066748 : for (bc = 0, tp = inputText; *tp != '\0'; bc++)
315 : {
316 7195714 : if (tp[0] != '\\')
317 7194698 : tp++;
318 1016 : else if ((tp[0] == '\\') &&
319 1016 : (tp[1] >= '0' && tp[1] <= '3') &&
320 1004 : (tp[2] >= '0' && tp[2] <= '7') &&
321 1004 : (tp[3] >= '0' && tp[3] <= '7'))
322 1004 : tp += 4;
323 12 : else if ((tp[0] == '\\') &&
324 12 : (tp[1] == '\\'))
325 0 : tp += 2;
326 : else
327 : {
328 : /*
329 : * one backslash, not followed by another or ### valid octal
330 : */
331 12 : ereturn(escontext, (Datum) 0,
332 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
333 : errmsg("invalid input syntax for type %s", "bytea")));
334 : }
335 : }
336 :
337 871034 : bc += VARHDRSZ;
338 :
339 871034 : result = (bytea *) palloc(bc);
340 871034 : SET_VARSIZE(result, bc);
341 :
342 871034 : tp = inputText;
343 871034 : rp = VARDATA(result);
344 8066706 : while (*tp != '\0')
345 : {
346 7195672 : if (tp[0] != '\\')
347 7194668 : *rp++ = *tp++;
348 1004 : else if ((tp[0] == '\\') &&
349 1004 : (tp[1] >= '0' && tp[1] <= '3') &&
350 1004 : (tp[2] >= '0' && tp[2] <= '7') &&
351 1004 : (tp[3] >= '0' && tp[3] <= '7'))
352 : {
353 1004 : bc = VAL(tp[1]);
354 1004 : bc <<= 3;
355 1004 : bc += VAL(tp[2]);
356 1004 : bc <<= 3;
357 1004 : *rp++ = bc + VAL(tp[3]);
358 :
359 1004 : tp += 4;
360 : }
361 0 : else if ((tp[0] == '\\') &&
362 0 : (tp[1] == '\\'))
363 : {
364 0 : *rp++ = '\\';
365 0 : tp += 2;
366 : }
367 : else
368 : {
369 : /*
370 : * We should never get here. The first pass should not allow it.
371 : */
372 0 : ereturn(escontext, (Datum) 0,
373 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
374 : errmsg("invalid input syntax for type %s", "bytea")));
375 : }
376 : }
377 :
378 871034 : PG_RETURN_BYTEA_P(result);
379 : }
380 :
381 : /*
382 : * byteaout - converts to printable representation of byte array
383 : *
384 : * In the traditional escaped format, non-printable characters are
385 : * printed as '\nnn' (octal) and '\' as '\\'.
386 : */
387 : Datum
388 158828 : byteaout(PG_FUNCTION_ARGS)
389 : {
390 158828 : bytea *vlena = PG_GETARG_BYTEA_PP(0);
391 : char *result;
392 : char *rp;
393 :
394 158828 : if (bytea_output == BYTEA_OUTPUT_HEX)
395 : {
396 : /* Print hex format */
397 158444 : rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
398 158444 : *rp++ = '\\';
399 158444 : *rp++ = 'x';
400 158444 : rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
401 : }
402 384 : else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
403 : {
404 : /* Print traditional escaped format */
405 : char *vp;
406 : uint64 len;
407 : int i;
408 :
409 384 : len = 1; /* empty string has 1 char */
410 384 : vp = VARDATA_ANY(vlena);
411 217660 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
412 : {
413 217276 : if (*vp == '\\')
414 0 : len += 2;
415 217276 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
416 498 : len += 4;
417 : else
418 216778 : len++;
419 : }
420 :
421 : /*
422 : * In principle len can't overflow uint32 if the input fit in 1GB, but
423 : * for safety let's check rather than relying on palloc's internal
424 : * check.
425 : */
426 384 : if (len > MaxAllocSize)
427 0 : ereport(ERROR,
428 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
429 : errmsg_internal("result of bytea output conversion is too large")));
430 384 : rp = result = (char *) palloc(len);
431 :
432 384 : vp = VARDATA_ANY(vlena);
433 217660 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
434 : {
435 217276 : if (*vp == '\\')
436 : {
437 0 : *rp++ = '\\';
438 0 : *rp++ = '\\';
439 : }
440 217276 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
441 498 : {
442 : int val; /* holds unprintable chars */
443 :
444 498 : val = *vp;
445 498 : rp[0] = '\\';
446 498 : rp[3] = DIG(val & 07);
447 498 : val >>= 3;
448 498 : rp[2] = DIG(val & 07);
449 498 : val >>= 3;
450 498 : rp[1] = DIG(val & 03);
451 498 : rp += 4;
452 : }
453 : else
454 216778 : *rp++ = *vp;
455 : }
456 : }
457 : else
458 : {
459 0 : elog(ERROR, "unrecognized bytea_output setting: %d",
460 : bytea_output);
461 : rp = result = NULL; /* keep compiler quiet */
462 : }
463 158828 : *rp = '\0';
464 158828 : PG_RETURN_CSTRING(result);
465 : }
466 :
467 : /*
468 : * bytearecv - converts external binary format to bytea
469 : */
470 : Datum
471 107710 : bytearecv(PG_FUNCTION_ARGS)
472 : {
473 107710 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
474 : bytea *result;
475 : int nbytes;
476 :
477 107710 : nbytes = buf->len - buf->cursor;
478 107710 : result = (bytea *) palloc(nbytes + VARHDRSZ);
479 107710 : SET_VARSIZE(result, nbytes + VARHDRSZ);
480 107710 : pq_copymsgbytes(buf, VARDATA(result), nbytes);
481 107710 : PG_RETURN_BYTEA_P(result);
482 : }
483 :
484 : /*
485 : * byteasend - converts bytea to binary format
486 : *
487 : * This is a special case: just copy the input...
488 : */
489 : Datum
490 68984 : byteasend(PG_FUNCTION_ARGS)
491 : {
492 68984 : bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
493 :
494 68984 : PG_RETURN_BYTEA_P(vlena);
495 : }
496 :
497 : Datum
498 258774 : bytea_string_agg_transfn(PG_FUNCTION_ARGS)
499 : {
500 : StringInfo state;
501 :
502 258774 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
503 :
504 : /* Append the value unless null, preceding it with the delimiter. */
505 258774 : if (!PG_ARGISNULL(1))
506 : {
507 243774 : bytea *value = PG_GETARG_BYTEA_PP(1);
508 243774 : bool isfirst = false;
509 :
510 : /*
511 : * You might think we can just throw away the first delimiter, however
512 : * we must keep it as we may be a parallel worker doing partial
513 : * aggregation building a state to send to the main process. We need
514 : * to keep the delimiter of every aggregation so that the combine
515 : * function can properly join up the strings of two separately
516 : * partially aggregated results. The first delimiter is only stripped
517 : * off in the final function. To know how much to strip off the front
518 : * of the string, we store the length of the first delimiter in the
519 : * StringInfo's cursor field, which we don't otherwise need here.
520 : */
521 243774 : if (state == NULL)
522 : {
523 148 : state = makeStringAggState(fcinfo);
524 148 : isfirst = true;
525 : }
526 :
527 243774 : if (!PG_ARGISNULL(2))
528 : {
529 243762 : bytea *delim = PG_GETARG_BYTEA_PP(2);
530 :
531 243762 : appendBinaryStringInfo(state, VARDATA_ANY(delim),
532 243762 : VARSIZE_ANY_EXHDR(delim));
533 243762 : if (isfirst)
534 142 : state->cursor = VARSIZE_ANY_EXHDR(delim);
535 : }
536 :
537 243774 : appendBinaryStringInfo(state, VARDATA_ANY(value),
538 243774 : VARSIZE_ANY_EXHDR(value));
539 : }
540 :
541 : /*
542 : * The transition type for string_agg() is declared to be "internal",
543 : * which is a pass-by-value type the same size as a pointer.
544 : */
545 258774 : if (state)
546 258738 : PG_RETURN_POINTER(state);
547 36 : PG_RETURN_NULL();
548 : }
549 :
550 : Datum
551 154 : bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
552 : {
553 : StringInfo state;
554 :
555 : /* cannot be called directly because of internal-type argument */
556 : Assert(AggCheckCallContext(fcinfo, NULL));
557 :
558 154 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
559 :
560 154 : if (state != NULL)
561 : {
562 : /* As per comment in transfn, strip data before the cursor position */
563 : bytea *result;
564 148 : int strippedlen = state->len - state->cursor;
565 :
566 148 : result = (bytea *) palloc(strippedlen + VARHDRSZ);
567 148 : SET_VARSIZE(result, strippedlen + VARHDRSZ);
568 148 : memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
569 148 : PG_RETURN_BYTEA_P(result);
570 : }
571 : else
572 6 : PG_RETURN_NULL();
573 : }
574 :
575 : /*
576 : * textin - converts cstring to internal representation
577 : */
578 : Datum
579 19667042 : textin(PG_FUNCTION_ARGS)
580 : {
581 19667042 : char *inputText = PG_GETARG_CSTRING(0);
582 :
583 19667042 : PG_RETURN_TEXT_P(cstring_to_text(inputText));
584 : }
585 :
586 : /*
587 : * textout - converts internal representation to cstring
588 : */
589 : Datum
590 6948574 : textout(PG_FUNCTION_ARGS)
591 : {
592 6948574 : Datum txt = PG_GETARG_DATUM(0);
593 :
594 6948574 : PG_RETURN_CSTRING(TextDatumGetCString(txt));
595 : }
596 :
597 : /*
598 : * textrecv - converts external binary format to text
599 : */
600 : Datum
601 48 : textrecv(PG_FUNCTION_ARGS)
602 : {
603 48 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
604 : text *result;
605 : char *str;
606 : int nbytes;
607 :
608 48 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
609 :
610 48 : result = cstring_to_text_with_len(str, nbytes);
611 48 : pfree(str);
612 48 : PG_RETURN_TEXT_P(result);
613 : }
614 :
615 : /*
616 : * textsend - converts text to binary format
617 : */
618 : Datum
619 4936 : textsend(PG_FUNCTION_ARGS)
620 : {
621 4936 : text *t = PG_GETARG_TEXT_PP(0);
622 : StringInfoData buf;
623 :
624 4936 : pq_begintypsend(&buf);
625 4936 : pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
626 4936 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
627 : }
628 :
629 :
630 : /*
631 : * unknownin - converts cstring to internal representation
632 : */
633 : Datum
634 0 : unknownin(PG_FUNCTION_ARGS)
635 : {
636 0 : char *str = PG_GETARG_CSTRING(0);
637 :
638 : /* representation is same as cstring */
639 0 : PG_RETURN_CSTRING(pstrdup(str));
640 : }
641 :
642 : /*
643 : * unknownout - converts internal representation to cstring
644 : */
645 : Datum
646 682 : unknownout(PG_FUNCTION_ARGS)
647 : {
648 : /* representation is same as cstring */
649 682 : char *str = PG_GETARG_CSTRING(0);
650 :
651 682 : PG_RETURN_CSTRING(pstrdup(str));
652 : }
653 :
654 : /*
655 : * unknownrecv - converts external binary format to unknown
656 : */
657 : Datum
658 0 : unknownrecv(PG_FUNCTION_ARGS)
659 : {
660 0 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
661 : char *str;
662 : int nbytes;
663 :
664 0 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
665 : /* representation is same as cstring */
666 0 : PG_RETURN_CSTRING(str);
667 : }
668 :
669 : /*
670 : * unknownsend - converts unknown to binary format
671 : */
672 : Datum
673 0 : unknownsend(PG_FUNCTION_ARGS)
674 : {
675 : /* representation is same as cstring */
676 0 : char *str = PG_GETARG_CSTRING(0);
677 : StringInfoData buf;
678 :
679 0 : pq_begintypsend(&buf);
680 0 : pq_sendtext(&buf, str, strlen(str));
681 0 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
682 : }
683 :
684 :
685 : /* ========== PUBLIC ROUTINES ========== */
686 :
687 : /*
688 : * textlen -
689 : * returns the logical length of a text*
690 : * (which is less than the VARSIZE of the text*)
691 : */
692 : Datum
693 430600 : textlen(PG_FUNCTION_ARGS)
694 : {
695 430600 : Datum str = PG_GETARG_DATUM(0);
696 :
697 : /* try to avoid decompressing argument */
698 430600 : PG_RETURN_INT32(text_length(str));
699 : }
700 :
701 : /*
702 : * text_length -
703 : * Does the real work for textlen()
704 : *
705 : * This is broken out so it can be called directly by other string processing
706 : * functions. Note that the argument is passed as a Datum, to indicate that
707 : * it may still be in compressed form. We can avoid decompressing it at all
708 : * in some cases.
709 : */
710 : static int32
711 430612 : text_length(Datum str)
712 : {
713 : /* fastpath when max encoding length is one */
714 430612 : if (pg_database_encoding_max_length() == 1)
715 289310 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
716 : else
717 : {
718 141302 : text *t = DatumGetTextPP(str);
719 :
720 141302 : PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
721 : VARSIZE_ANY_EXHDR(t)));
722 : }
723 : }
724 :
725 : /*
726 : * textoctetlen -
727 : * returns the physical length of a text*
728 : * (which is less than the VARSIZE of the text*)
729 : */
730 : Datum
731 62 : textoctetlen(PG_FUNCTION_ARGS)
732 : {
733 62 : Datum str = PG_GETARG_DATUM(0);
734 :
735 : /* We need not detoast the input at all */
736 62 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
737 : }
738 :
739 : /*
740 : * textcat -
741 : * takes two text* and returns a text* that is the concatenation of
742 : * the two.
743 : *
744 : * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
745 : * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
746 : * Allocate space for output in all cases.
747 : * XXX - thomas 1997-07-10
748 : */
749 : Datum
750 1821252 : textcat(PG_FUNCTION_ARGS)
751 : {
752 1821252 : text *t1 = PG_GETARG_TEXT_PP(0);
753 1821252 : text *t2 = PG_GETARG_TEXT_PP(1);
754 :
755 1821252 : PG_RETURN_TEXT_P(text_catenate(t1, t2));
756 : }
757 :
758 : /*
759 : * text_catenate
760 : * Guts of textcat(), broken out so it can be used by other functions
761 : *
762 : * Arguments can be in short-header form, but not compressed or out-of-line
763 : */
764 : static text *
765 1821332 : text_catenate(text *t1, text *t2)
766 : {
767 : text *result;
768 : int len1,
769 : len2,
770 : len;
771 : char *ptr;
772 :
773 1821332 : len1 = VARSIZE_ANY_EXHDR(t1);
774 1821332 : len2 = VARSIZE_ANY_EXHDR(t2);
775 :
776 : /* paranoia ... probably should throw error instead? */
777 1821332 : if (len1 < 0)
778 0 : len1 = 0;
779 1821332 : if (len2 < 0)
780 0 : len2 = 0;
781 :
782 1821332 : len = len1 + len2 + VARHDRSZ;
783 1821332 : result = (text *) palloc(len);
784 :
785 : /* Set size of result string... */
786 1821332 : SET_VARSIZE(result, len);
787 :
788 : /* Fill data field of result string... */
789 1821332 : ptr = VARDATA(result);
790 1821332 : if (len1 > 0)
791 1818068 : memcpy(ptr, VARDATA_ANY(t1), len1);
792 1821332 : if (len2 > 0)
793 1821122 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
794 :
795 1821332 : return result;
796 : }
797 :
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller must guarantee that n characters really are present at p; the
 * string itself does not have to be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* Optimization for single-byte encodings: bytes == characters */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* otherwise step over the characters one at a time */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}
823 :
824 : /*
825 : * text_substr()
826 : * Return a substring starting at the specified position.
827 : * - thomas 1997-12-31
828 : *
829 : * Input:
830 : * - string
831 : * - starting position (is one-based)
832 : * - string length
833 : *
834 : * If the starting position is zero or less, then return from the start of the string
835 : * adjusting the length to be consistent with the "negative start" per SQL.
836 : * If the length is less than zero, return the remaining string.
837 : *
838 : * Added multibyte support.
839 : * - Tatsuo Ishii 1998-4-21
840 : * Changed behavior if starting position is less than one to conform to SQL behavior.
841 : * Formerly returned the entire string; now returns a portion.
842 : * - Thomas Lockhart 1998-12-10
843 : * Now uses faster TOAST-slicing interface
844 : * - John Gray 2002-02-22
845 : * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
846 : * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
847 : * error; if E < 1, return '', not entire string). Fixed MB related bug when
848 : * S > LC and < LC + 4 sometimes garbage characters are returned.
849 : * - Joe Conway 2002-08-10
850 : */
851 : Datum
852 1647106 : text_substr(PG_FUNCTION_ARGS)
853 : {
854 1647106 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
855 : PG_GETARG_INT32(1),
856 : PG_GETARG_INT32(2),
857 : false));
858 : }
859 :
860 : /*
861 : * text_substr_no_len -
862 : * Wrapper to avoid opr_sanity failure due to
863 : * one function accepting a different number of args.
864 : */
865 : Datum
866 36 : text_substr_no_len(PG_FUNCTION_ARGS)
867 : {
868 36 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
869 : PG_GETARG_INT32(1),
870 : -1, true));
871 : }
872 :
873 : /*
874 : * text_substring -
875 : * Does the real work for text_substr() and text_substr_no_len()
876 : *
877 : * This is broken out so it can be called directly by other string processing
878 : * functions. Note that the argument is passed as a Datum, to indicate that
879 : * it may still be in compressed/toasted form. We can avoid detoasting all
880 : * of it in some cases.
881 : *
882 : * The result is always a freshly palloc'd datum.
883 : */
884 : static text *
885 1686990 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
886 : {
887 1686990 : int32 eml = pg_database_encoding_max_length();
888 1686990 : int32 S = start; /* start position */
889 : int32 S1; /* adjusted start position */
890 : int32 L1; /* adjusted substring length */
891 : int32 E; /* end position */
892 :
893 : /*
894 : * SQL99 says S can be zero or negative, but we still must fetch from the
895 : * start of the string.
896 : */
897 1686990 : S1 = Max(S, 1);
898 :
899 : /* life is easy if the encoding max length is 1 */
900 1686990 : if (eml == 1)
901 : {
902 1600688 : if (length_not_specified) /* special case - get length to end of
903 : * string */
904 58 : L1 = -1;
905 1600630 : else if (length < 0)
906 : {
907 : /* SQL99 says to throw an error for E < S, i.e., negative length */
908 8 : ereport(ERROR,
909 : (errcode(ERRCODE_SUBSTRING_ERROR),
910 : errmsg("negative substring length not allowed")));
911 : L1 = -1; /* silence stupider compilers */
912 : }
913 1600622 : else if (pg_add_s32_overflow(S, length, &E))
914 : {
915 : /*
916 : * L could be large enough for S + L to overflow, in which case
917 : * the substring must run to end of string.
918 : */
919 4 : L1 = -1;
920 : }
921 : else
922 : {
923 : /*
924 : * A zero or negative value for the end position can happen if the
925 : * start was negative or one. SQL99 says to return a zero-length
926 : * string.
927 : */
928 1600618 : if (E < 1)
929 0 : return cstring_to_text("");
930 :
931 1600618 : L1 = E - S1;
932 : }
933 :
934 : /*
935 : * If the start position is past the end of the string, SQL99 says to
936 : * return a zero-length string -- DatumGetTextPSlice() will do that
937 : * for us. We need only convert S1 to zero-based starting position.
938 : */
939 1600680 : return DatumGetTextPSlice(str, S1 - 1, L1);
940 : }
941 86302 : else if (eml > 1)
942 : {
943 : /*
944 : * When encoding max length is > 1, we can't get LC without
945 : * detoasting, so we'll grab a conservatively large slice now and go
946 : * back later to do the right thing
947 : */
948 : int32 slice_start;
949 : int32 slice_size;
950 : int32 slice_strlen;
951 : text *slice;
952 : int32 E1;
953 : int32 i;
954 : char *p;
955 : char *s;
956 : text *ret;
957 :
958 : /*
959 : * We need to start at position zero because there is no way to know
960 : * in advance which byte offset corresponds to the supplied start
961 : * position.
962 : */
963 86302 : slice_start = 0;
964 :
965 86302 : if (length_not_specified) /* special case - get length to end of
966 : * string */
967 18 : slice_size = L1 = -1;
968 86284 : else if (length < 0)
969 : {
970 : /* SQL99 says to throw an error for E < S, i.e., negative length */
971 4 : ereport(ERROR,
972 : (errcode(ERRCODE_SUBSTRING_ERROR),
973 : errmsg("negative substring length not allowed")));
974 : slice_size = L1 = -1; /* silence stupider compilers */
975 : }
976 86280 : else if (pg_add_s32_overflow(S, length, &E))
977 : {
978 : /*
979 : * L could be large enough for S + L to overflow, in which case
980 : * the substring must run to end of string.
981 : */
982 2 : slice_size = L1 = -1;
983 : }
984 : else
985 : {
986 : /*
987 : * A zero or negative value for the end position can happen if the
988 : * start was negative or one. SQL99 says to return a zero-length
989 : * string.
990 : */
991 86278 : if (E < 1)
992 0 : return cstring_to_text("");
993 :
994 : /*
995 : * if E is past the end of the string, the tuple toaster will
996 : * truncate the length for us
997 : */
998 86278 : L1 = E - S1;
999 :
1000 : /*
1001 : * Total slice size in bytes can't be any longer than the start
1002 : * position plus substring length times the encoding max length.
1003 : * If that overflows, we can just use -1.
1004 : */
1005 86278 : if (pg_mul_s32_overflow(E, eml, &slice_size))
1006 2 : slice_size = -1;
1007 : }
1008 :
1009 : /*
1010 : * If we're working with an untoasted source, no need to do an extra
1011 : * copying step.
1012 : */
1013 86298 : if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
1014 86280 : VARATT_IS_EXTERNAL(DatumGetPointer(str)))
1015 44 : slice = DatumGetTextPSlice(str, slice_start, slice_size);
1016 : else
1017 86254 : slice = (text *) DatumGetPointer(str);
1018 :
1019 : /* see if we got back an empty string */
1020 86298 : if (VARSIZE_ANY_EXHDR(slice) == 0)
1021 : {
1022 0 : if (slice != (text *) DatumGetPointer(str))
1023 0 : pfree(slice);
1024 0 : return cstring_to_text("");
1025 : }
1026 :
1027 : /* Now we can get the actual length of the slice in MB characters */
1028 86298 : slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1029 86298 : VARSIZE_ANY_EXHDR(slice));
1030 :
1031 : /*
1032 : * Check that the start position wasn't > slice_strlen. If so, SQL99
1033 : * says to return a zero-length string.
1034 : */
1035 86298 : if (S1 > slice_strlen)
1036 : {
1037 2 : if (slice != (text *) DatumGetPointer(str))
1038 0 : pfree(slice);
1039 2 : return cstring_to_text("");
1040 : }
1041 :
1042 : /*
1043 : * Adjust L1 and E1 now that we know the slice string length. Again
1044 : * remember that S1 is one based, and slice_start is zero based.
1045 : */
1046 86296 : if (L1 > -1)
1047 86278 : E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1048 : else
1049 18 : E1 = slice_start + 1 + slice_strlen;
1050 :
1051 : /*
1052 : * Find the start position in the slice; remember S1 is not zero based
1053 : */
1054 86296 : p = VARDATA_ANY(slice);
1055 1693212 : for (i = 0; i < S1 - 1; i++)
1056 1606916 : p += pg_mblen(p);
1057 :
1058 : /* hang onto a pointer to our start position */
1059 86296 : s = p;
1060 :
1061 : /*
1062 : * Count the actual bytes used by the substring of the requested
1063 : * length.
1064 : */
1065 2476392 : for (i = S1; i < E1; i++)
1066 2390096 : p += pg_mblen(p);
1067 :
1068 86296 : ret = (text *) palloc(VARHDRSZ + (p - s));
1069 86296 : SET_VARSIZE(ret, VARHDRSZ + (p - s));
1070 86296 : memcpy(VARDATA(ret), s, (p - s));
1071 :
1072 86296 : if (slice != (text *) DatumGetPointer(str))
1073 44 : pfree(slice);
1074 :
1075 86296 : return ret;
1076 : }
1077 : else
1078 0 : elog(ERROR, "invalid backend encoding: encoding max length < 1");
1079 :
1080 : /* not reached: suppress compiler warning */
1081 : return NULL;
1082 : }
1083 :
1084 : /*
1085 : * textoverlay
1086 : * Replace specified substring of first string with second
1087 : *
1088 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1089 : * This code is a direct implementation of what the standard says.
1090 : */
1091 : Datum
1092 28 : textoverlay(PG_FUNCTION_ARGS)
1093 : {
1094 28 : text *t1 = PG_GETARG_TEXT_PP(0);
1095 28 : text *t2 = PG_GETARG_TEXT_PP(1);
1096 28 : int sp = PG_GETARG_INT32(2); /* substring start position */
1097 28 : int sl = PG_GETARG_INT32(3); /* substring length */
1098 :
1099 28 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1100 : }
1101 :
1102 : Datum
1103 12 : textoverlay_no_len(PG_FUNCTION_ARGS)
1104 : {
1105 12 : text *t1 = PG_GETARG_TEXT_PP(0);
1106 12 : text *t2 = PG_GETARG_TEXT_PP(1);
1107 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
1108 : int sl;
1109 :
1110 12 : sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1111 12 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1112 : }
1113 :
1114 : static text *
1115 40 : text_overlay(text *t1, text *t2, int sp, int sl)
1116 : {
1117 : text *result;
1118 : text *s1;
1119 : text *s2;
1120 : int sp_pl_sl;
1121 :
1122 : /*
1123 : * Check for possible integer-overflow cases. For negative sp, throw a
1124 : * "substring length" error because that's what should be expected
1125 : * according to the spec's definition of OVERLAY().
1126 : */
1127 40 : if (sp <= 0)
1128 0 : ereport(ERROR,
1129 : (errcode(ERRCODE_SUBSTRING_ERROR),
1130 : errmsg("negative substring length not allowed")));
1131 40 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1132 0 : ereport(ERROR,
1133 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1134 : errmsg("integer out of range")));
1135 :
1136 40 : s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1137 40 : s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1138 40 : result = text_catenate(s1, t2);
1139 40 : result = text_catenate(result, s2);
1140 :
1141 40 : return result;
1142 : }
1143 :
1144 : /*
1145 : * textpos -
1146 : * Return the position of the specified substring.
1147 : * Implements the SQL POSITION() function.
1148 : * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1149 : * - thomas 1997-07-27
1150 : */
1151 : Datum
1152 106 : textpos(PG_FUNCTION_ARGS)
1153 : {
1154 106 : text *str = PG_GETARG_TEXT_PP(0);
1155 106 : text *search_str = PG_GETARG_TEXT_PP(1);
1156 :
1157 106 : PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1158 : }
1159 :
1160 : /*
1161 : * text_position -
1162 : * Does the real work for textpos()
1163 : *
1164 : * Inputs:
1165 : * t1 - string to be searched
1166 : * t2 - pattern to match within t1
1167 : * Result:
1168 : * Character index of the first matched char, starting from 1,
1169 : * or 0 if no match.
1170 : *
1171 : * This is broken out so it can be called directly by other string processing
1172 : * functions.
1173 : */
1174 : static int
1175 106 : text_position(text *t1, text *t2, Oid collid)
1176 : {
1177 : TextPositionState state;
1178 : int result;
1179 :
1180 : /* Empty needle always matches at position 1 */
1181 106 : if (VARSIZE_ANY_EXHDR(t2) < 1)
1182 12 : return 1;
1183 :
1184 : /* Otherwise, can't match if haystack is shorter than needle */
1185 94 : if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1186 22 : return 0;
1187 :
1188 72 : text_position_setup(t1, t2, collid, &state);
1189 72 : if (!text_position_next(&state))
1190 24 : result = 0;
1191 : else
1192 48 : result = text_position_get_match_pos(&state);
1193 72 : text_position_cleanup(&state);
1194 72 : return result;
1195 : }
1196 :
1197 :
1198 : /*
1199 : * text_position_setup, text_position_next, text_position_cleanup -
1200 : * Component steps of text_position()
1201 : *
1202 : * These are broken out so that a string can be efficiently searched for
1203 : * multiple occurrences of the same pattern. text_position_next may be
1204 : * called multiple times, and it advances to the next match on each call.
1205 : * text_position_get_match_ptr() and text_position_get_match_pos() return
1206 : * a pointer or 1-based character position of the last match, respectively.
1207 : *
1208 : * The "state" variable is normally just a local variable in the caller.
1209 : *
1210 : * NOTE: text_position_next skips over the matched portion. For example,
1211 : * searching for "xx" in "xxx" returns only one match, not two.
1212 : */
1213 :
1214 : static void
1215 1232 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1216 : {
1217 1232 : int len1 = VARSIZE_ANY_EXHDR(t1);
1218 1232 : int len2 = VARSIZE_ANY_EXHDR(t2);
1219 1232 : pg_locale_t mylocale = 0;
1220 :
1221 1232 : check_collation_set(collid);
1222 :
1223 1232 : if (!lc_collate_is_c(collid))
1224 76 : mylocale = pg_newlocale_from_collation(collid);
1225 :
1226 1232 : if (!pg_locale_deterministic(mylocale))
1227 4 : ereport(ERROR,
1228 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1229 : errmsg("nondeterministic collations are not supported for substring searches")));
1230 :
1231 : Assert(len1 > 0);
1232 : Assert(len2 > 0);
1233 :
1234 : /*
1235 : * Even with a multi-byte encoding, we perform the search using the raw
1236 : * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1237 : * because in UTF-8 the byte sequence of one character cannot contain
1238 : * another character. For other multi-byte encodings, we do the search
1239 : * initially as a simple byte search, ignoring multibyte issues, but
1240 : * verify afterwards that the match we found is at a character boundary,
1241 : * and continue the search if it was a false match.
1242 : */
1243 1228 : if (pg_database_encoding_max_length() == 1)
1244 1040 : state->is_multibyte_char_in_char = false;
1245 188 : else if (GetDatabaseEncoding() == PG_UTF8)
1246 188 : state->is_multibyte_char_in_char = false;
1247 : else
1248 0 : state->is_multibyte_char_in_char = true;
1249 :
1250 1228 : state->str1 = VARDATA_ANY(t1);
1251 1228 : state->str2 = VARDATA_ANY(t2);
1252 1228 : state->len1 = len1;
1253 1228 : state->len2 = len2;
1254 1228 : state->last_match = NULL;
1255 1228 : state->refpoint = state->str1;
1256 1228 : state->refpos = 0;
1257 :
1258 : /*
1259 : * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1260 : * notes we use the terminology that the "haystack" is the string to be
1261 : * searched (t1) and the "needle" is the pattern being sought (t2).
1262 : *
1263 : * If the needle is empty or bigger than the haystack then there is no
1264 : * point in wasting cycles initializing the table. We also choose not to
1265 : * use B-M-H for needles of length 1, since the skip table can't possibly
1266 : * save anything in that case.
1267 : */
1268 1228 : if (len1 >= len2 && len2 > 1)
1269 : {
1270 1024 : int searchlength = len1 - len2;
1271 : int skiptablemask;
1272 : int last;
1273 : int i;
1274 1024 : const char *str2 = state->str2;
1275 :
1276 : /*
1277 : * First we must determine how much of the skip table to use. The
1278 : * declaration of TextPositionState allows up to 256 elements, but for
1279 : * short search problems we don't really want to have to initialize so
1280 : * many elements --- it would take too long in comparison to the
1281 : * actual search time. So we choose a useful skip table size based on
1282 : * the haystack length minus the needle length. The closer the needle
1283 : * length is to the haystack length the less useful skipping becomes.
1284 : *
1285 : * Note: since we use bit-masking to select table elements, the skip
1286 : * table size MUST be a power of 2, and so the mask must be 2^N-1.
1287 : */
1288 1024 : if (searchlength < 16)
1289 54 : skiptablemask = 3;
1290 970 : else if (searchlength < 64)
1291 16 : skiptablemask = 7;
1292 954 : else if (searchlength < 128)
1293 14 : skiptablemask = 15;
1294 940 : else if (searchlength < 512)
1295 200 : skiptablemask = 31;
1296 740 : else if (searchlength < 2048)
1297 602 : skiptablemask = 63;
1298 138 : else if (searchlength < 4096)
1299 68 : skiptablemask = 127;
1300 : else
1301 70 : skiptablemask = 255;
1302 1024 : state->skiptablemask = skiptablemask;
1303 :
1304 : /*
1305 : * Initialize the skip table. We set all elements to the needle
1306 : * length, since this is the correct skip distance for any character
1307 : * not found in the needle.
1308 : */
1309 73144 : for (i = 0; i <= skiptablemask; i++)
1310 72120 : state->skiptable[i] = len2;
1311 :
1312 : /*
1313 : * Now examine the needle. For each character except the last one,
1314 : * set the corresponding table element to the appropriate skip
1315 : * distance. Note that when two characters share the same skip table
1316 : * entry, the one later in the needle must determine the skip
1317 : * distance.
1318 : */
1319 1024 : last = len2 - 1;
1320 :
1321 13968 : for (i = 0; i < last; i++)
1322 12944 : state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1323 : }
1324 1228 : }
1325 :
1326 : /*
1327 : * Advance to the next match, starting from the end of the previous match
1328 : * (or the beginning of the string, on first call). Returns true if a match
1329 : * is found.
1330 : *
1331 : * Note that this refuses to match an empty-string needle. Most callers
1332 : * will have handled that case specially and we'll never see it here.
1333 : */
1334 : static bool
1335 5988 : text_position_next(TextPositionState *state)
1336 : {
1337 5988 : int needle_len = state->len2;
1338 : char *start_ptr;
1339 : char *matchptr;
1340 :
1341 5988 : if (needle_len <= 0)
1342 0 : return false; /* result for empty pattern */
1343 :
1344 : /* Start from the point right after the previous match. */
1345 5988 : if (state->last_match)
1346 4748 : start_ptr = state->last_match + needle_len;
1347 : else
1348 1240 : start_ptr = state->str1;
1349 :
1350 5988 : retry:
1351 5988 : matchptr = text_position_next_internal(start_ptr, state);
1352 :
1353 5988 : if (!matchptr)
1354 1168 : return false;
1355 :
1356 : /*
1357 : * Found a match for the byte sequence. If this is a multibyte encoding,
1358 : * where one character's byte sequence can appear inside a longer
1359 : * multi-byte character, we need to verify that the match was at a
1360 : * character boundary, not in the middle of a multi-byte character.
1361 : */
1362 4820 : if (state->is_multibyte_char_in_char)
1363 : {
1364 : /* Walk one character at a time, until we reach the match. */
1365 :
1366 : /* the search should never move backwards. */
1367 : Assert(state->refpoint <= matchptr);
1368 :
1369 0 : while (state->refpoint < matchptr)
1370 : {
1371 : /* step to next character. */
1372 0 : state->refpoint += pg_mblen(state->refpoint);
1373 0 : state->refpos++;
1374 :
1375 : /*
1376 : * If we stepped over the match's start position, then it was a
1377 : * false positive, where the byte sequence appeared in the middle
1378 : * of a multi-byte character. Skip it, and continue the search at
1379 : * the next character boundary.
1380 : */
1381 0 : if (state->refpoint > matchptr)
1382 : {
1383 0 : start_ptr = state->refpoint;
1384 0 : goto retry;
1385 : }
1386 : }
1387 : }
1388 :
1389 4820 : state->last_match = matchptr;
1390 4820 : return true;
1391 : }
1392 :
1393 : /*
1394 : * Subroutine of text_position_next(). This searches for the raw byte
1395 : * sequence, ignoring any multi-byte encoding issues. Returns the first
1396 : * match starting at 'start_ptr', or NULL if no match is found.
1397 : */
1398 : static char *
1399 5988 : text_position_next_internal(char *start_ptr, TextPositionState *state)
1400 : {
1401 5988 : int haystack_len = state->len1;
1402 5988 : int needle_len = state->len2;
1403 5988 : int skiptablemask = state->skiptablemask;
1404 5988 : const char *haystack = state->str1;
1405 5988 : const char *needle = state->str2;
1406 5988 : const char *haystack_end = &haystack[haystack_len];
1407 : const char *hptr;
1408 :
1409 : Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1410 :
1411 5988 : if (needle_len == 1)
1412 : {
1413 : /* No point in using B-M-H for a one-character needle */
1414 742 : char nchar = *needle;
1415 :
1416 742 : hptr = start_ptr;
1417 5710 : while (hptr < haystack_end)
1418 : {
1419 5548 : if (*hptr == nchar)
1420 580 : return (char *) hptr;
1421 4968 : hptr++;
1422 : }
1423 : }
1424 : else
1425 : {
1426 5246 : const char *needle_last = &needle[needle_len - 1];
1427 :
1428 : /* Start at startpos plus the length of the needle */
1429 5246 : hptr = start_ptr + needle_len - 1;
1430 137830 : while (hptr < haystack_end)
1431 : {
1432 : /* Match the needle scanning *backward* */
1433 : const char *nptr;
1434 : const char *p;
1435 :
1436 136824 : nptr = needle_last;
1437 136824 : p = hptr;
1438 200468 : while (*nptr == *p)
1439 : {
1440 : /* Matched it all? If so, return 1-based position */
1441 67884 : if (nptr == needle)
1442 4240 : return (char *) p;
1443 63644 : nptr--, p--;
1444 : }
1445 :
1446 : /*
1447 : * No match, so use the haystack char at hptr to decide how far to
1448 : * advance. If the needle had any occurrence of that character
1449 : * (or more precisely, one sharing the same skiptable entry)
1450 : * before its last character, then we advance far enough to align
1451 : * the last such needle character with that haystack position.
1452 : * Otherwise we can advance by the whole needle length.
1453 : */
1454 132584 : hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1455 : }
1456 : }
1457 :
1458 1168 : return 0; /* not found */
1459 : }
1460 :
1461 : /*
1462 : * Return a pointer to the current match.
1463 : *
1464 : * The returned pointer points into the original haystack string.
1465 : */
1466 : static char *
1467 4742 : text_position_get_match_ptr(TextPositionState *state)
1468 : {
1469 4742 : return state->last_match;
1470 : }
1471 :
1472 : /*
1473 : * Return the offset of the current match.
1474 : *
1475 : * The offset is in characters, 1-based.
1476 : */
1477 : static int
1478 48 : text_position_get_match_pos(TextPositionState *state)
1479 : {
1480 : /* Convert the byte position to char position. */
1481 96 : state->refpos += pg_mbstrlen_with_len(state->refpoint,
1482 48 : state->last_match - state->refpoint);
1483 48 : state->refpoint = state->last_match;
1484 48 : return state->refpos + 1;
1485 : }
1486 :
1487 : /*
1488 : * Reset search state to the initial state installed by text_position_setup.
1489 : *
1490 : * The next call to text_position_next will search from the beginning
1491 : * of the string.
1492 : */
1493 : static void
1494 12 : text_position_reset(TextPositionState *state)
1495 : {
1496 12 : state->last_match = NULL;
1497 12 : state->refpoint = state->str1;
1498 12 : state->refpos = 0;
1499 12 : }
1500 :
1501 : static void
1502 1228 : text_position_cleanup(TextPositionState *state)
1503 : {
1504 : /* no cleanup needed */
1505 1228 : }
1506 :
1507 :
1508 : static void
1509 12402046 : check_collation_set(Oid collid)
1510 : {
1511 12402046 : if (!OidIsValid(collid))
1512 : {
1513 : /*
1514 : * This typically means that the parser could not resolve a conflict
1515 : * of implicit collations, so report it that way.
1516 : */
1517 18 : ereport(ERROR,
1518 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
1519 : errmsg("could not determine which collation to use for string comparison"),
1520 : errhint("Use the COLLATE clause to set the collation explicitly.")));
1521 : }
1522 12402028 : }
1523 :
1524 : /* varstr_cmp()
1525 : * Comparison function for text strings with given lengths.
1526 : * Includes locale support, but must copy strings to temporary memory
1527 : * to allow null-termination for inputs to strcoll().
1528 : * Returns an integer less than, equal to, or greater than zero, indicating
1529 : * whether arg1 is less than, equal to, or greater than arg2.
1530 : *
1531 : * Note: many functions that depend on this are marked leakproof; therefore,
1532 : * avoid reporting the actual contents of the input when throwing errors.
1533 : * All errors herein should be things that can't happen except on corrupt
1534 : * data, anyway; otherwise we will have trouble with indexing strings that
1535 : * would cause them.
1536 : */
1537 : int
1538 5939262 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1539 : {
1540 : int result;
1541 :
1542 5939262 : check_collation_set(collid);
1543 :
1544 : /*
1545 : * Unfortunately, there is no strncoll(), so in the non-C locale case we
1546 : * have to do some memory copying. This turns out to be significantly
1547 : * slower, so we optimize the case where LC_COLLATE is C. We also try to
1548 : * optimize relatively-short strings by avoiding palloc/pfree overhead.
1549 : */
1550 5939252 : if (lc_collate_is_c(collid))
1551 : {
1552 4728662 : result = memcmp(arg1, arg2, Min(len1, len2));
1553 4728662 : if ((result == 0) && (len1 != len2))
1554 194160 : result = (len1 < len2) ? -1 : 1;
1555 : }
1556 : else
1557 : {
1558 : pg_locale_t mylocale;
1559 :
1560 1210590 : mylocale = pg_newlocale_from_collation(collid);
1561 :
1562 : /*
1563 : * memcmp() can't tell us which of two unequal strings sorts first,
1564 : * but it's a cheap way to tell if they're equal. Testing shows that
1565 : * memcmp() followed by strcoll() is only trivially slower than
1566 : * strcoll() by itself, so we don't lose much if this doesn't work out
1567 : * very often, and if it does - for example, because there are many
1568 : * equal strings in the input - then we win big by avoiding expensive
1569 : * collation-aware comparisons.
1570 : */
1571 1210590 : if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1572 566282 : return 0;
1573 :
1574 644308 : result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
1575 :
1576 : /* Break tie if necessary. */
1577 644308 : if (result == 0 && pg_locale_deterministic(mylocale))
1578 : {
1579 0 : result = memcmp(arg1, arg2, Min(len1, len2));
1580 0 : if ((result == 0) && (len1 != len2))
1581 0 : result = (len1 < len2) ? -1 : 1;
1582 : }
1583 : }
1584 :
1585 5372970 : return result;
1586 : }
1587 :
1588 : /* text_cmp()
1589 : * Internal comparison function for text strings.
1590 : * Returns -1, 0 or 1
1591 : */
1592 : static int
1593 4468994 : text_cmp(text *arg1, text *arg2, Oid collid)
1594 : {
1595 : char *a1p,
1596 : *a2p;
1597 : int len1,
1598 : len2;
1599 :
1600 4468994 : a1p = VARDATA_ANY(arg1);
1601 4468994 : a2p = VARDATA_ANY(arg2);
1602 :
1603 4468994 : len1 = VARSIZE_ANY_EXHDR(arg1);
1604 4468994 : len2 = VARSIZE_ANY_EXHDR(arg2);
1605 :
1606 4468994 : return varstr_cmp(a1p, len1, a2p, len2, collid);
1607 : }
1608 :
1609 : /*
1610 : * Comparison functions for text strings.
1611 : *
1612 : * Note: btree indexes need these routines not to leak memory; therefore,
1613 : * be careful to free working copies of toasted datums. Most places don't
1614 : * need to be so careful.
1615 : */
1616 :
1617 : Datum
1618 6101420 : texteq(PG_FUNCTION_ARGS)
1619 : {
1620 6101420 : Oid collid = PG_GET_COLLATION();
1621 6101420 : bool locale_is_c = false;
1622 6101420 : pg_locale_t mylocale = 0;
1623 : bool result;
1624 :
1625 6101420 : check_collation_set(collid);
1626 :
1627 6101420 : if (lc_collate_is_c(collid))
1628 4982296 : locale_is_c = true;
1629 : else
1630 1119124 : mylocale = pg_newlocale_from_collation(collid);
1631 :
1632 6101420 : if (locale_is_c || pg_locale_deterministic(mylocale))
1633 6101256 : {
1634 6101256 : Datum arg1 = PG_GETARG_DATUM(0);
1635 6101256 : Datum arg2 = PG_GETARG_DATUM(1);
1636 : Size len1,
1637 : len2;
1638 :
1639 : /*
1640 : * Since we only care about equality or not-equality, we can avoid all
1641 : * the expense of strcoll() here, and just do bitwise comparison. In
1642 : * fact, we don't even have to do a bitwise comparison if we can show
1643 : * the lengths of the strings are unequal; which might save us from
1644 : * having to detoast one or both values.
1645 : */
1646 6101256 : len1 = toast_raw_datum_size(arg1);
1647 6101256 : len2 = toast_raw_datum_size(arg2);
1648 6101256 : if (len1 != len2)
1649 2883936 : result = false;
1650 : else
1651 : {
1652 3217320 : text *targ1 = DatumGetTextPP(arg1);
1653 3217320 : text *targ2 = DatumGetTextPP(arg2);
1654 :
1655 3217320 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1656 : len1 - VARHDRSZ) == 0);
1657 :
1658 3217320 : PG_FREE_IF_COPY(targ1, 0);
1659 3217320 : PG_FREE_IF_COPY(targ2, 1);
1660 : }
1661 : }
1662 : else
1663 : {
1664 164 : text *arg1 = PG_GETARG_TEXT_PP(0);
1665 164 : text *arg2 = PG_GETARG_TEXT_PP(1);
1666 :
1667 164 : result = (text_cmp(arg1, arg2, collid) == 0);
1668 :
1669 164 : PG_FREE_IF_COPY(arg1, 0);
1670 164 : PG_FREE_IF_COPY(arg2, 1);
1671 : }
1672 :
1673 6101420 : PG_RETURN_BOOL(result);
1674 : }
1675 :
1676 : Datum
1677 20156 : textne(PG_FUNCTION_ARGS)
1678 : {
1679 20156 : Oid collid = PG_GET_COLLATION();
1680 20156 : bool locale_is_c = false;
1681 20156 : pg_locale_t mylocale = 0;
1682 : bool result;
1683 :
1684 20156 : check_collation_set(collid);
1685 :
1686 20156 : if (lc_collate_is_c(collid))
1687 14500 : locale_is_c = true;
1688 : else
1689 5656 : mylocale = pg_newlocale_from_collation(collid);
1690 :
1691 20156 : if (locale_is_c || pg_locale_deterministic(mylocale))
1692 20148 : {
1693 20148 : Datum arg1 = PG_GETARG_DATUM(0);
1694 20148 : Datum arg2 = PG_GETARG_DATUM(1);
1695 : Size len1,
1696 : len2;
1697 :
1698 : /* See comment in texteq() */
1699 20148 : len1 = toast_raw_datum_size(arg1);
1700 20148 : len2 = toast_raw_datum_size(arg2);
1701 20148 : if (len1 != len2)
1702 2560 : result = true;
1703 : else
1704 : {
1705 17588 : text *targ1 = DatumGetTextPP(arg1);
1706 17588 : text *targ2 = DatumGetTextPP(arg2);
1707 :
1708 17588 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1709 : len1 - VARHDRSZ) != 0);
1710 :
1711 17588 : PG_FREE_IF_COPY(targ1, 0);
1712 17588 : PG_FREE_IF_COPY(targ2, 1);
1713 : }
1714 : }
1715 : else
1716 : {
1717 8 : text *arg1 = PG_GETARG_TEXT_PP(0);
1718 8 : text *arg2 = PG_GETARG_TEXT_PP(1);
1719 :
1720 8 : result = (text_cmp(arg1, arg2, collid) != 0);
1721 :
1722 8 : PG_FREE_IF_COPY(arg1, 0);
1723 8 : PG_FREE_IF_COPY(arg2, 1);
1724 : }
1725 :
1726 20156 : PG_RETURN_BOOL(result);
1727 : }
1728 :
1729 : Datum
1730 183438 : text_lt(PG_FUNCTION_ARGS)
1731 : {
1732 183438 : text *arg1 = PG_GETARG_TEXT_PP(0);
1733 183438 : text *arg2 = PG_GETARG_TEXT_PP(1);
1734 : bool result;
1735 :
1736 183438 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1737 :
1738 183428 : PG_FREE_IF_COPY(arg1, 0);
1739 183428 : PG_FREE_IF_COPY(arg2, 1);
1740 :
1741 183428 : PG_RETURN_BOOL(result);
1742 : }
1743 :
1744 : Datum
1745 315836 : text_le(PG_FUNCTION_ARGS)
1746 : {
1747 315836 : text *arg1 = PG_GETARG_TEXT_PP(0);
1748 315836 : text *arg2 = PG_GETARG_TEXT_PP(1);
1749 : bool result;
1750 :
1751 315836 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1752 :
1753 315836 : PG_FREE_IF_COPY(arg1, 0);
1754 315836 : PG_FREE_IF_COPY(arg2, 1);
1755 :
1756 315836 : PG_RETURN_BOOL(result);
1757 : }
1758 :
1759 : Datum
1760 173610 : text_gt(PG_FUNCTION_ARGS)
1761 : {
1762 173610 : text *arg1 = PG_GETARG_TEXT_PP(0);
1763 173610 : text *arg2 = PG_GETARG_TEXT_PP(1);
1764 : bool result;
1765 :
1766 173610 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1767 :
1768 173610 : PG_FREE_IF_COPY(arg1, 0);
1769 173610 : PG_FREE_IF_COPY(arg2, 1);
1770 :
1771 173610 : PG_RETURN_BOOL(result);
1772 : }
1773 :
1774 : Datum
1775 175328 : text_ge(PG_FUNCTION_ARGS)
1776 : {
1777 175328 : text *arg1 = PG_GETARG_TEXT_PP(0);
1778 175328 : text *arg2 = PG_GETARG_TEXT_PP(1);
1779 : bool result;
1780 :
1781 175328 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1782 :
1783 175328 : PG_FREE_IF_COPY(arg1, 0);
1784 175328 : PG_FREE_IF_COPY(arg2, 1);
1785 :
1786 175328 : PG_RETURN_BOOL(result);
1787 : }
1788 :
1789 : Datum
1790 37914 : text_starts_with(PG_FUNCTION_ARGS)
1791 : {
1792 37914 : Datum arg1 = PG_GETARG_DATUM(0);
1793 37914 : Datum arg2 = PG_GETARG_DATUM(1);
1794 37914 : Oid collid = PG_GET_COLLATION();
1795 37914 : pg_locale_t mylocale = 0;
1796 : bool result;
1797 : Size len1,
1798 : len2;
1799 :
1800 37914 : check_collation_set(collid);
1801 :
1802 37914 : if (!lc_collate_is_c(collid))
1803 12638 : mylocale = pg_newlocale_from_collation(collid);
1804 :
1805 37914 : if (!pg_locale_deterministic(mylocale))
1806 0 : ereport(ERROR,
1807 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1808 : errmsg("nondeterministic collations are not supported for substring searches")));
1809 :
1810 37914 : len1 = toast_raw_datum_size(arg1);
1811 37914 : len2 = toast_raw_datum_size(arg2);
1812 37914 : if (len2 > len1)
1813 0 : result = false;
1814 : else
1815 : {
1816 37914 : text *targ1 = text_substring(arg1, 1, len2, false);
1817 37914 : text *targ2 = DatumGetTextPP(arg2);
1818 :
1819 37914 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1820 37914 : VARSIZE_ANY_EXHDR(targ2)) == 0);
1821 :
1822 37914 : PG_FREE_IF_COPY(targ1, 0);
1823 37914 : PG_FREE_IF_COPY(targ2, 1);
1824 : }
1825 :
1826 37914 : PG_RETURN_BOOL(result);
1827 : }
1828 :
1829 : Datum
1830 3305046 : bttextcmp(PG_FUNCTION_ARGS)
1831 : {
1832 3305046 : text *arg1 = PG_GETARG_TEXT_PP(0);
1833 3305046 : text *arg2 = PG_GETARG_TEXT_PP(1);
1834 : int32 result;
1835 :
1836 3305046 : result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1837 :
1838 3305046 : PG_FREE_IF_COPY(arg1, 0);
1839 3305046 : PG_FREE_IF_COPY(arg2, 1);
1840 :
1841 3305046 : PG_RETURN_INT32(result);
1842 : }
1843 :
1844 : Datum
1845 61884 : bttextsortsupport(PG_FUNCTION_ARGS)
1846 : {
1847 61884 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1848 61884 : Oid collid = ssup->ssup_collation;
1849 : MemoryContext oldcontext;
1850 :
1851 61884 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1852 :
1853 : /* Use generic string SortSupport */
1854 61884 : varstr_sortsupport(ssup, TEXTOID, collid);
1855 :
1856 61876 : MemoryContextSwitchTo(oldcontext);
1857 :
1858 61876 : PG_RETURN_VOID();
1859 : }
1860 :
1861 : /*
1862 : * Generic sortsupport interface for character type's operator classes.
1863 : * Includes locale support, and support for BpChar semantics (i.e. removing
1864 : * trailing spaces before comparison).
1865 : *
1866 : * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1867 : * same representation. Callers that always use the C collation (e.g.
1868 : * non-collatable type callers like bytea) may have NUL bytes in their strings;
1869 : * this will not work with any other collation, though.
1870 : */
1871 : void
1872 93658 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
1873 : {
1874 93658 : bool abbreviate = ssup->abbreviate;
1875 93658 : bool collate_c = false;
1876 : VarStringSortSupport *sss;
1877 93658 : pg_locale_t locale = 0;
1878 :
1879 93658 : check_collation_set(collid);
1880 :
1881 : /*
1882 : * If possible, set ssup->comparator to a function which can be used to
1883 : * directly compare two datums. If we can do this, we'll avoid the
1884 : * overhead of a trip through the fmgr layer for every comparison, which
1885 : * can be substantial.
1886 : *
1887 : * Most typically, we'll set the comparator to varlenafastcmp_locale,
1888 : * which uses strcoll() to perform comparisons. We use that for the
1889 : * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1890 : * LC_COLLATE = C, we can make things quite a bit faster with
1891 : * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1892 : * memcmp() rather than strcoll().
1893 : */
1894 93650 : if (lc_collate_is_c(collid))
1895 : {
1896 71354 : if (typid == BPCHAROID)
1897 678 : ssup->comparator = bpcharfastcmp_c;
1898 70676 : else if (typid == NAMEOID)
1899 : {
1900 30668 : ssup->comparator = namefastcmp_c;
1901 : /* Not supporting abbreviation with type NAME, for now */
1902 30668 : abbreviate = false;
1903 : }
1904 : else
1905 40008 : ssup->comparator = varstrfastcmp_c;
1906 :
1907 71354 : collate_c = true;
1908 : }
1909 : else
1910 : {
1911 : /*
1912 : * We need a collation-sensitive comparison. To make things faster,
1913 : * we'll figure out the collation based on the locale id and cache the
1914 : * result.
1915 : */
1916 22296 : locale = pg_newlocale_from_collation(collid);
1917 :
1918 : /*
1919 : * We use varlenafastcmp_locale except for type NAME.
1920 : */
1921 22296 : if (typid == NAMEOID)
1922 : {
1923 0 : ssup->comparator = namefastcmp_locale;
1924 : /* Not supporting abbreviation with type NAME, for now */
1925 0 : abbreviate = false;
1926 : }
1927 : else
1928 22296 : ssup->comparator = varlenafastcmp_locale;
1929 : }
1930 :
1931 : /*
1932 : * Unfortunately, it seems that abbreviation for non-C collations is
1933 : * broken on many common platforms; see pg_strxfrm_enabled().
1934 : *
1935 : * Even apart from the risk of broken locales, it's possible that there
1936 : * are platforms where the use of abbreviated keys should be disabled at
1937 : * compile time. Having only 4 byte datums could make worst-case
1938 : * performance drastically more likely, for example. Moreover, macOS's
1939 : * strxfrm() implementation is known to not effectively concentrate a
1940 : * significant amount of entropy from the original string in earlier
1941 : * transformed blobs. It's possible that other supported platforms are
1942 : * similarly encumbered. So, if we ever get past disabling this
1943 : * categorically, we may still want or need to disable it for particular
1944 : * platforms.
1945 : */
1946 93650 : if (!collate_c && !pg_strxfrm_enabled(locale))
1947 2 : abbreviate = false;
1948 :
1949 : /*
1950 : * If we're using abbreviated keys, or if we're using a locale-aware
1951 : * comparison, we need to initialize a VarStringSortSupport object. Both
1952 : * cases will make use of the temporary buffers we initialize here for
1953 : * scratch space (and to detect requirement for BpChar semantics from
1954 : * caller), and the abbreviation case requires additional state.
1955 : */
1956 93650 : if (abbreviate || !collate_c)
1957 : {
1958 51606 : sss = palloc(sizeof(VarStringSortSupport));
1959 51606 : sss->buf1 = palloc(TEXTBUFLEN);
1960 51606 : sss->buflen1 = TEXTBUFLEN;
1961 51606 : sss->buf2 = palloc(TEXTBUFLEN);
1962 51606 : sss->buflen2 = TEXTBUFLEN;
1963 : /* Start with invalid values */
1964 51606 : sss->last_len1 = -1;
1965 51606 : sss->last_len2 = -1;
1966 : /* Initialize */
1967 51606 : sss->last_returned = 0;
1968 51606 : sss->locale = locale;
1969 :
1970 : /*
1971 : * To avoid somehow confusing a strxfrm() blob and an original string,
1972 : * constantly keep track of the variety of data that buf1 and buf2
1973 : * currently contain.
1974 : *
1975 : * Comparisons may be interleaved with conversion calls. Frequently,
1976 : * conversions and comparisons are batched into two distinct phases,
1977 : * but the correctness of caching cannot hinge upon this. For
1978 : * comparison caching, buffer state is only trusted if cache_blob is
1979 : * found set to false, whereas strxfrm() caching only trusts the state
1980 : * when cache_blob is found set to true.
1981 : *
1982 : * Arbitrarily initialize cache_blob to true.
1983 : */
1984 51606 : sss->cache_blob = true;
1985 51606 : sss->collate_c = collate_c;
1986 51606 : sss->typid = typid;
1987 51606 : ssup->ssup_extra = sss;
1988 :
1989 : /*
1990 : * If possible, plan to use the abbreviated keys optimization. The
1991 : * core code may switch back to authoritative comparator should
1992 : * abbreviation be aborted.
1993 : */
1994 51606 : if (abbreviate)
1995 : {
1996 49206 : sss->prop_card = 0.20;
1997 49206 : initHyperLogLog(&sss->abbr_card, 10);
1998 49206 : initHyperLogLog(&sss->full_card, 10);
1999 49206 : ssup->abbrev_full_comparator = ssup->comparator;
2000 49206 : ssup->comparator = ssup_datum_unsigned_cmp;
2001 49206 : ssup->abbrev_converter = varstr_abbrev_convert;
2002 49206 : ssup->abbrev_abort = varstr_abbrev_abort;
2003 : }
2004 : }
2005 93650 : }
2006 :
2007 : /*
2008 : * sortsupport comparison func (for C locale case)
2009 : */
2010 : static int
2011 41719146 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2012 : {
2013 41719146 : VarString *arg1 = DatumGetVarStringPP(x);
2014 41719146 : VarString *arg2 = DatumGetVarStringPP(y);
2015 : char *a1p,
2016 : *a2p;
2017 : int len1,
2018 : len2,
2019 : result;
2020 :
2021 41719146 : a1p = VARDATA_ANY(arg1);
2022 41719146 : a2p = VARDATA_ANY(arg2);
2023 :
2024 41719146 : len1 = VARSIZE_ANY_EXHDR(arg1);
2025 41719146 : len2 = VARSIZE_ANY_EXHDR(arg2);
2026 :
2027 41719146 : result = memcmp(a1p, a2p, Min(len1, len2));
2028 41719146 : if ((result == 0) && (len1 != len2))
2029 1212774 : result = (len1 < len2) ? -1 : 1;
2030 :
2031 : /* We can't afford to leak memory here. */
2032 41719146 : if (PointerGetDatum(arg1) != x)
2033 4 : pfree(arg1);
2034 41719146 : if (PointerGetDatum(arg2) != y)
2035 4 : pfree(arg2);
2036 :
2037 41719146 : return result;
2038 : }
2039 :
2040 : /*
2041 : * sortsupport comparison func (for BpChar C locale case)
2042 : *
2043 : * BpChar outsources its sortsupport to this module. Specialization for the
2044 : * varstr_sortsupport BpChar case, modeled on
2045 : * internal_bpchar_pattern_compare().
2046 : */
2047 : static int
2048 146950 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2049 : {
2050 146950 : BpChar *arg1 = DatumGetBpCharPP(x);
2051 146950 : BpChar *arg2 = DatumGetBpCharPP(y);
2052 : char *a1p,
2053 : *a2p;
2054 : int len1,
2055 : len2,
2056 : result;
2057 :
2058 146950 : a1p = VARDATA_ANY(arg1);
2059 146950 : a2p = VARDATA_ANY(arg2);
2060 :
2061 146950 : len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2062 146950 : len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2063 :
2064 146950 : result = memcmp(a1p, a2p, Min(len1, len2));
2065 146950 : if ((result == 0) && (len1 != len2))
2066 0 : result = (len1 < len2) ? -1 : 1;
2067 :
2068 : /* We can't afford to leak memory here. */
2069 146950 : if (PointerGetDatum(arg1) != x)
2070 0 : pfree(arg1);
2071 146950 : if (PointerGetDatum(arg2) != y)
2072 0 : pfree(arg2);
2073 :
2074 146950 : return result;
2075 : }
2076 :
2077 : /*
2078 : * sortsupport comparison func (for NAME C locale case)
2079 : */
2080 : static int
2081 29496884 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2082 : {
2083 29496884 : Name arg1 = DatumGetName(x);
2084 29496884 : Name arg2 = DatumGetName(y);
2085 :
2086 29496884 : return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2087 : }
2088 :
2089 : /*
2090 : * sortsupport comparison func (for locale case with all varlena types)
2091 : */
2092 : static int
2093 10961922 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2094 : {
2095 10961922 : VarString *arg1 = DatumGetVarStringPP(x);
2096 10961922 : VarString *arg2 = DatumGetVarStringPP(y);
2097 : char *a1p,
2098 : *a2p;
2099 : int len1,
2100 : len2,
2101 : result;
2102 :
2103 10961922 : a1p = VARDATA_ANY(arg1);
2104 10961922 : a2p = VARDATA_ANY(arg2);
2105 :
2106 10961922 : len1 = VARSIZE_ANY_EXHDR(arg1);
2107 10961922 : len2 = VARSIZE_ANY_EXHDR(arg2);
2108 :
2109 10961922 : result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2110 :
2111 : /* We can't afford to leak memory here. */
2112 10961922 : if (PointerGetDatum(arg1) != x)
2113 2 : pfree(arg1);
2114 10961922 : if (PointerGetDatum(arg2) != y)
2115 2 : pfree(arg2);
2116 :
2117 10961922 : return result;
2118 : }
2119 :
2120 : /*
2121 : * sortsupport comparison func (for locale case with NAME type)
2122 : */
2123 : static int
2124 0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2125 : {
2126 0 : Name arg1 = DatumGetName(x);
2127 0 : Name arg2 = DatumGetName(y);
2128 :
2129 0 : return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2130 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2131 : ssup);
2132 : }
2133 :
2134 : /*
2135 : * sortsupport comparison func for locale cases
2136 : */
2137 : static int
2138 10961922 : varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2139 : {
2140 10961922 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2141 : int result;
2142 : bool arg1_match;
2143 :
2144 : /* Fast pre-check for equality, as discussed in varstr_cmp() */
2145 10961922 : if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2146 : {
2147 : /*
2148 : * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2149 : * last_len2. Existing contents of buffers might still be used by
2150 : * next call.
2151 : *
2152 : * It's fine to allow the comparison of BpChar padding bytes here,
2153 : * even though that implies that the memcmp() will usually be
2154 : * performed for BpChar callers (though multibyte characters could
2155 : * still prevent that from occurring). The memcmp() is still very
2156 : * cheap, and BpChar's funny semantics have us remove trailing spaces
2157 : * (not limited to padding), so we need make no distinction between
2158 : * padding space characters and "real" space characters.
2159 : */
2160 4439592 : return 0;
2161 : }
2162 :
2163 6522330 : if (sss->typid == BPCHAROID)
2164 : {
2165 : /* Get true number of bytes, ignoring trailing spaces */
2166 2522 : len1 = bpchartruelen(a1p, len1);
2167 2522 : len2 = bpchartruelen(a2p, len2);
2168 : }
2169 :
2170 6522330 : if (len1 >= sss->buflen1)
2171 : {
2172 0 : sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2173 0 : sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2174 : }
2175 6522330 : if (len2 >= sss->buflen2)
2176 : {
2177 2 : sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2178 2 : sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2179 : }
2180 :
2181 : /*
2182 : * We're likely to be asked to compare the same strings repeatedly, and
2183 : * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2184 : * comparisons, even though in general there is no reason to think that
2185 : * that will work out (every string datum may be unique). Caching does
2186 : * not slow things down measurably when it doesn't work out, and can speed
2187 : * things up by rather a lot when it does. In part, this is because the
2188 : * memcmp() compares data from cachelines that are needed in L1 cache even
2189 : * when the last comparison's result cannot be reused.
2190 : */
2191 6522330 : arg1_match = true;
2192 6522330 : if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2193 : {
2194 5724538 : arg1_match = false;
2195 5724538 : memcpy(sss->buf1, a1p, len1);
2196 5724538 : sss->buf1[len1] = '\0';
2197 5724538 : sss->last_len1 = len1;
2198 : }
2199 :
2200 : /*
2201 : * If we're comparing the same two strings as last time, we can return the
2202 : * same answer without calling strcoll() again. This is more likely than
2203 : * it seems (at least with moderate to low cardinality sets), because
2204 : * quicksort compares the same pivot against many values.
2205 : */
2206 6522330 : if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2207 : {
2208 1068194 : memcpy(sss->buf2, a2p, len2);
2209 1068194 : sss->buf2[len2] = '\0';
2210 1068194 : sss->last_len2 = len2;
2211 : }
2212 5454136 : else if (arg1_match && !sss->cache_blob)
2213 : {
2214 : /* Use result cached following last actual strcoll() call */
2215 699086 : return sss->last_returned;
2216 : }
2217 :
2218 5823244 : result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
2219 :
2220 : /* Break tie if necessary. */
2221 5823244 : if (result == 0 && pg_locale_deterministic(sss->locale))
2222 0 : result = strcmp(sss->buf1, sss->buf2);
2223 :
2224 : /* Cache result, perhaps saving an expensive strcoll() call next time */
2225 5823244 : sss->cache_blob = false;
2226 5823244 : sss->last_returned = result;
2227 5823244 : return result;
2228 : }
2229 :
2230 : /*
2231 : * Conversion routine for sortsupport. Converts original to abbreviated key
2232 : * representation. Our encoding strategy is simple -- pack the first 8 bytes
2233 : * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2234 : * stored in reverse order), and treat it as an unsigned integer. When the "C"
2235 : * locale is used, or in case of bytea, just memcpy() from original instead.
2236 : */
2237 : static Datum
2238 1600176 : varstr_abbrev_convert(Datum original, SortSupport ssup)
2239 : {
2240 1600176 : const size_t max_prefix_bytes = sizeof(Datum);
2241 1600176 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2242 1600176 : VarString *authoritative = DatumGetVarStringPP(original);
2243 1600176 : char *authoritative_data = VARDATA_ANY(authoritative);
2244 :
2245 : /* working state */
2246 : Datum res;
2247 : char *pres;
2248 : int len;
2249 : uint32 hash;
2250 :
2251 1600176 : pres = (char *) &res;
2252 : /* memset(), so any non-overwritten bytes are NUL */
2253 1600176 : memset(pres, 0, max_prefix_bytes);
2254 1600176 : len = VARSIZE_ANY_EXHDR(authoritative);
2255 :
2256 : /* Get number of bytes, ignoring trailing spaces */
2257 1600176 : if (sss->typid == BPCHAROID)
2258 3076 : len = bpchartruelen(authoritative_data, len);
2259 :
2260 : /*
2261 : * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2262 : * abbreviate keys. The full comparator for the C locale is always
2263 : * memcmp(). It would be incorrect to allow bytea callers (callers that
2264 : * always force the C collation -- bytea isn't a collatable type, but this
2265 : * approach is convenient) to use strxfrm(). This is because bytea
2266 : * strings may contain NUL bytes. Besides, this should be faster, too.
2267 : *
2268 : * More generally, it's okay that bytea callers can have NUL bytes in
2269 : * strings because abbreviated cmp need not make a distinction between
2270 : * terminating NUL bytes, and NUL bytes representing actual NULs in the
2271 : * authoritative representation. Hopefully a comparison at or past one
2272 : * abbreviated key's terminating NUL byte will resolve the comparison
2273 : * without consulting the authoritative representation; specifically, some
2274 : * later non-NUL byte in the longer string can resolve the comparison
2275 : * against a subsequent terminating NUL in the shorter string. There will
2276 : * usually be what is effectively a "length-wise" resolution there and
2277 : * then.
2278 : *
2279 : * If that doesn't work out -- if all bytes in the longer string
2280 : * positioned at or past the offset of the smaller string's (first)
2281 : * terminating NUL are actually representative of NUL bytes in the
2282 : * authoritative binary string (perhaps with some *terminating* NUL bytes
2283 : * towards the end of the longer string iff it happens to still be small)
2284 : * -- then an authoritative tie-breaker will happen, and do the right
2285 : * thing: explicitly consider string length.
2286 : */
2287 1600176 : if (sss->collate_c)
2288 1300368 : memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
2289 : else
2290 : {
2291 : Size bsize;
2292 :
2293 : /*
2294 : * We're not using the C collation, so fall back on strxfrm or ICU
2295 : * analogs.
2296 : */
2297 :
2298 : /* By convention, we use buffer 1 to store and NUL-terminate */
2299 299808 : if (len >= sss->buflen1)
2300 : {
2301 8 : sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2302 8 : sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2303 : }
2304 :
2305 : /* Might be able to reuse strxfrm() blob from last call */
2306 299808 : if (sss->last_len1 == len && sss->cache_blob &&
2307 297344 : memcmp(sss->buf1, authoritative_data, len) == 0)
2308 : {
2309 62612 : memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
2310 : /* No change affecting cardinality, so no hashing required */
2311 62612 : goto done;
2312 : }
2313 :
2314 237196 : memcpy(sss->buf1, authoritative_data, len);
2315 :
2316 : /*
2317 : * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
2318 : */
2319 237196 : sss->buf1[len] = '\0';
2320 237196 : sss->last_len1 = len;
2321 :
2322 237196 : if (pg_strxfrm_prefix_enabled(sss->locale))
2323 : {
2324 237196 : if (sss->buflen2 < max_prefix_bytes)
2325 : {
2326 0 : sss->buflen2 = Max(max_prefix_bytes,
2327 : Min(sss->buflen2 * 2, MaxAllocSize));
2328 0 : sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2329 : }
2330 :
2331 237196 : bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
2332 : max_prefix_bytes, sss->locale);
2333 237196 : sss->last_len2 = bsize;
2334 : }
2335 : else
2336 : {
2337 : /*
2338 : * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
2339 : * again. The pg_strxfrm() function leaves the result buffer
2340 : * content undefined if the result did not fit, so we need to
2341 : * retry until everything fits, even though we only need the first
2342 : * few bytes in the end.
2343 : */
2344 : for (;;)
2345 : {
2346 0 : bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
2347 : sss->locale);
2348 :
2349 0 : sss->last_len2 = bsize;
2350 0 : if (bsize < sss->buflen2)
2351 0 : break;
2352 :
2353 : /*
2354 : * Grow buffer and retry.
2355 : */
2356 0 : sss->buflen2 = Max(bsize + 1,
2357 : Min(sss->buflen2 * 2, MaxAllocSize));
2358 0 : sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2359 : }
2360 : }
2361 :
2362 : /*
2363 : * Every Datum byte is always compared. This is safe because the
2364 : * strxfrm() blob is itself NUL terminated, leaving no danger of
2365 : * misinterpreting any NUL bytes not intended to be interpreted as
2366 : * logically representing termination.
2367 : *
2368 : * (Actually, even if there were NUL bytes in the blob it would be
2369 : * okay. See remarks on bytea case above.)
2370 : */
2371 237196 : memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
2372 : }
2373 :
2374 : /*
2375 : * Maintain approximate cardinality of both abbreviated keys and original,
2376 : * authoritative keys using HyperLogLog. Used as cheap insurance against
2377 : * the worst case, where we do many string transformations for no saving
2378 : * in full strcoll()-based comparisons. These statistics are used by
2379 : * varstr_abbrev_abort().
2380 : *
2381 : * First, Hash key proper, or a significant fraction of it. Mix in length
2382 : * in order to compensate for cases where differences are past
2383 : * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2384 : */
2385 1537564 : hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2386 : Min(len, PG_CACHE_LINE_SIZE)));
2387 :
2388 1537564 : if (len > PG_CACHE_LINE_SIZE)
2389 290 : hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2390 :
2391 1537564 : addHyperLogLog(&sss->full_card, hash);
2392 :
2393 : /* Hash abbreviated key */
2394 : #if SIZEOF_DATUM == 8
2395 : {
2396 : uint32 lohalf,
2397 : hihalf;
2398 :
2399 1537564 : lohalf = (uint32) res;
2400 1537564 : hihalf = (uint32) (res >> 32);
2401 1537564 : hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2402 : }
2403 : #else /* SIZEOF_DATUM != 8 */
2404 : hash = DatumGetUInt32(hash_uint32((uint32) res));
2405 : #endif
2406 :
2407 1537564 : addHyperLogLog(&sss->abbr_card, hash);
2408 :
2409 : /* Cache result, perhaps saving an expensive strxfrm() call next time */
2410 1537564 : sss->cache_blob = true;
2411 1600176 : done:
2412 :
2413 : /*
2414 : * Byteswap on little-endian machines.
2415 : *
2416 : * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2417 : * 3-way comparator) works correctly on all platforms. If we didn't do
2418 : * this, the comparator would have to call memcmp() with a pair of
2419 : * pointers to the first byte of each abbreviated key, which is slower.
2420 : */
2421 1600176 : res = DatumBigEndianToNative(res);
2422 :
2423 : /* Don't leak memory here */
2424 1600176 : if (PointerGetDatum(authoritative) != original)
2425 12 : pfree(authoritative);
2426 :
2427 1600176 : return res;
2428 : }
2429 :
2430 : /*
2431 : * Callback for estimating effectiveness of abbreviated key optimization, using
2432 : * heuristic rules. Returns value indicating if the abbreviation optimization
2433 : * should be aborted, based on its projected effectiveness.
2434 : */
2435 : static bool
2436 3970 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2437 : {
2438 3970 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2439 : double abbrev_distinct,
2440 : key_distinct;
2441 :
2442 : Assert(ssup->abbreviate);
2443 :
2444 : /* Have a little patience */
2445 3970 : if (memtupcount < 100)
2446 2340 : return false;
2447 :
2448 1630 : abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2449 1630 : key_distinct = estimateHyperLogLog(&sss->full_card);
2450 :
2451 : /*
2452 : * Clamp cardinality estimates to at least one distinct value. While
2453 : * NULLs are generally disregarded, if only NULL values were seen so far,
2454 : * that might misrepresent costs if we failed to clamp.
2455 : */
2456 1630 : if (abbrev_distinct <= 1.0)
2457 0 : abbrev_distinct = 1.0;
2458 :
2459 1630 : if (key_distinct <= 1.0)
2460 0 : key_distinct = 1.0;
2461 :
2462 : /*
2463 : * In the worst case all abbreviated keys are identical, while at the same
2464 : * time there are differences within full key strings not captured in
2465 : * abbreviations.
2466 : */
2467 : #ifdef TRACE_SORT
2468 1630 : if (trace_sort)
2469 : {
2470 0 : double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2471 :
2472 0 : elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2473 : "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2474 : memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2475 : sss->prop_card);
2476 : }
2477 : #endif
2478 :
2479 : /*
2480 : * If the number of distinct abbreviated keys approximately matches the
2481 : * number of distinct authoritative original keys, that's reason enough to
2482 : * proceed. We can win even with a very low cardinality set if most
2483 : * tie-breakers only memcmp(). This is by far the most important
2484 : * consideration.
2485 : *
2486 : * While comparisons that are resolved at the abbreviated key level are
2487 : * considerably cheaper than tie-breakers resolved with memcmp(), both of
2488 : * those two outcomes are so much cheaper than a full strcoll() once
2489 : * sorting is underway that it doesn't seem worth it to weigh abbreviated
2490 : * cardinality against the overall size of the set in order to more
2491 : * accurately model costs. Assume that an abbreviated comparison, and an
2492 : * abbreviated comparison with a cheap memcmp()-based authoritative
2493 : * resolution are equivalent.
2494 : */
2495 1630 : if (abbrev_distinct > key_distinct * sss->prop_card)
2496 : {
2497 : /*
2498 : * When we have exceeded 10,000 tuples, decay required cardinality
2499 : * aggressively for next call.
2500 : *
2501 : * This is useful because the number of comparisons required on
2502 : * average increases at a linearithmic rate, and at roughly 10,000
2503 : * tuples that factor will start to dominate over the linear costs of
2504 : * string transformation (this is a conservative estimate). The decay
2505 : * rate is chosen to be a little less aggressive than halving -- which
2506 : * (since we're called at points at which memtupcount has doubled)
2507 : * would never see the cost model actually abort past the first call
2508 : * following a decay. This decay rate is mostly a precaution against
2509 : * a sudden, violent swing in how well abbreviated cardinality tracks
2510 : * full key cardinality. The decay also serves to prevent a marginal
2511 : * case from being aborted too late, when too much has already been
2512 : * invested in string transformation.
2513 : *
2514 : * It's possible for sets of several million distinct strings with
2515 : * mere tens of thousands of distinct abbreviated keys to still
2516 : * benefit very significantly. This will generally occur provided
2517 : * each abbreviated key is a proxy for a roughly uniform number of the
2518 : * set's full keys. If it isn't so, we hope to catch that early and
2519 : * abort. If it isn't caught early, by the time the problem is
2520 : * apparent it's probably not worth aborting.
2521 : */
2522 1630 : if (memtupcount > 10000)
2523 14 : sss->prop_card *= 0.65;
2524 :
2525 1630 : return false;
2526 : }
2527 :
2528 : /*
2529 : * Abort abbreviation strategy.
2530 : *
2531 : * The worst case, where all abbreviated keys are identical while all
2532 : * original strings differ will typically only see a regression of about
2533 : * 10% in execution time for small to medium sized lists of strings.
2534 : * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2535 : * often expect very large improvements, particularly with sets of strings
2536 : * of moderately high to high abbreviated cardinality. There is little to
2537 : * lose but much to gain, which our strategy reflects.
2538 : */
2539 : #ifdef TRACE_SORT
2540 0 : if (trace_sort)
2541 0 : elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2542 : "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2543 : memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2544 : #endif
2545 :
2546 0 : return true;
2547 : }
2548 :
2549 : /*
2550 : * Generic equalimage support function for character type's operator classes.
2551 : * Disables the use of deduplication with nondeterministic collations.
2552 : */
2553 : Datum
2554 6860 : btvarstrequalimage(PG_FUNCTION_ARGS)
2555 : {
2556 : /* Oid opcintype = PG_GETARG_OID(0); */
2557 6860 : Oid collid = PG_GET_COLLATION();
2558 :
2559 6860 : check_collation_set(collid);
2560 :
2561 6860 : if (lc_collate_is_c(collid) ||
2562 16 : collid == DEFAULT_COLLATION_OID ||
2563 16 : get_collation_isdeterministic(collid))
2564 6852 : PG_RETURN_BOOL(true);
2565 : else
2566 8 : PG_RETURN_BOOL(false);
2567 : }
2568 :
2569 : Datum
2570 229524 : text_larger(PG_FUNCTION_ARGS)
2571 : {
2572 229524 : text *arg1 = PG_GETARG_TEXT_PP(0);
2573 229524 : text *arg2 = PG_GETARG_TEXT_PP(1);
2574 : text *result;
2575 :
2576 229524 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2577 :
2578 229524 : PG_RETURN_TEXT_P(result);
2579 : }
2580 :
2581 : Datum
2582 86040 : text_smaller(PG_FUNCTION_ARGS)
2583 : {
2584 86040 : text *arg1 = PG_GETARG_TEXT_PP(0);
2585 86040 : text *arg2 = PG_GETARG_TEXT_PP(1);
2586 : text *result;
2587 :
2588 86040 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2589 :
2590 86040 : PG_RETURN_TEXT_P(result);
2591 : }
2592 :
2593 :
2594 : /*
2595 : * Cross-type comparison functions for types text and name.
2596 : */
2597 :
2598 : Datum
2599 198876 : nameeqtext(PG_FUNCTION_ARGS)
2600 : {
2601 198876 : Name arg1 = PG_GETARG_NAME(0);
2602 198876 : text *arg2 = PG_GETARG_TEXT_PP(1);
2603 198876 : size_t len1 = strlen(NameStr(*arg1));
2604 198876 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2605 198876 : Oid collid = PG_GET_COLLATION();
2606 : bool result;
2607 :
2608 198876 : check_collation_set(collid);
2609 :
2610 198876 : if (collid == C_COLLATION_OID)
2611 299160 : result = (len1 == len2 &&
2612 130720 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2613 : else
2614 30436 : result = (varstr_cmp(NameStr(*arg1), len1,
2615 30436 : VARDATA_ANY(arg2), len2,
2616 : collid) == 0);
2617 :
2618 198876 : PG_FREE_IF_COPY(arg2, 1);
2619 :
2620 198876 : PG_RETURN_BOOL(result);
2621 : }
2622 :
2623 : Datum
2624 2638 : texteqname(PG_FUNCTION_ARGS)
2625 : {
2626 2638 : text *arg1 = PG_GETARG_TEXT_PP(0);
2627 2638 : Name arg2 = PG_GETARG_NAME(1);
2628 2638 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2629 2638 : size_t len2 = strlen(NameStr(*arg2));
2630 2638 : Oid collid = PG_GET_COLLATION();
2631 : bool result;
2632 :
2633 2638 : check_collation_set(collid);
2634 :
2635 2638 : if (collid == C_COLLATION_OID)
2636 564 : result = (len1 == len2 &&
2637 180 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2638 : else
2639 2254 : result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2640 2254 : NameStr(*arg2), len2,
2641 : collid) == 0);
2642 :
2643 2638 : PG_FREE_IF_COPY(arg1, 0);
2644 :
2645 2638 : PG_RETURN_BOOL(result);
2646 : }
2647 :
2648 : Datum
2649 24 : namenetext(PG_FUNCTION_ARGS)
2650 : {
2651 24 : Name arg1 = PG_GETARG_NAME(0);
2652 24 : text *arg2 = PG_GETARG_TEXT_PP(1);
2653 24 : size_t len1 = strlen(NameStr(*arg1));
2654 24 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2655 24 : Oid collid = PG_GET_COLLATION();
2656 : bool result;
2657 :
2658 24 : check_collation_set(collid);
2659 :
2660 24 : if (collid == C_COLLATION_OID)
2661 18 : result = !(len1 == len2 &&
2662 0 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2663 : else
2664 6 : result = !(varstr_cmp(NameStr(*arg1), len1,
2665 6 : VARDATA_ANY(arg2), len2,
2666 : collid) == 0);
2667 :
2668 24 : PG_FREE_IF_COPY(arg2, 1);
2669 :
2670 24 : PG_RETURN_BOOL(result);
2671 : }
2672 :
2673 : Datum
2674 6 : textnename(PG_FUNCTION_ARGS)
2675 : {
2676 6 : text *arg1 = PG_GETARG_TEXT_PP(0);
2677 6 : Name arg2 = PG_GETARG_NAME(1);
2678 6 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2679 6 : size_t len2 = strlen(NameStr(*arg2));
2680 6 : Oid collid = PG_GET_COLLATION();
2681 : bool result;
2682 :
2683 6 : check_collation_set(collid);
2684 :
2685 6 : if (collid == C_COLLATION_OID)
2686 0 : result = !(len1 == len2 &&
2687 0 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2688 : else
2689 6 : result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2690 6 : NameStr(*arg2), len2,
2691 : collid) == 0);
2692 :
2693 6 : PG_FREE_IF_COPY(arg1, 0);
2694 :
2695 6 : PG_RETURN_BOOL(result);
2696 : }
2697 :
2698 : Datum
2699 133336 : btnametextcmp(PG_FUNCTION_ARGS)
2700 : {
2701 133336 : Name arg1 = PG_GETARG_NAME(0);
2702 133336 : text *arg2 = PG_GETARG_TEXT_PP(1);
2703 : int32 result;
2704 :
2705 266672 : result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2706 266672 : VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2707 : PG_GET_COLLATION());
2708 :
2709 133336 : PG_FREE_IF_COPY(arg2, 1);
2710 :
2711 133336 : PG_RETURN_INT32(result);
2712 : }
2713 :
2714 : Datum
2715 0 : bttextnamecmp(PG_FUNCTION_ARGS)
2716 : {
2717 0 : text *arg1 = PG_GETARG_TEXT_PP(0);
2718 0 : Name arg2 = PG_GETARG_NAME(1);
2719 : int32 result;
2720 :
2721 0 : result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2722 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2723 : PG_GET_COLLATION());
2724 :
2725 0 : PG_FREE_IF_COPY(arg1, 0);
2726 :
2727 0 : PG_RETURN_INT32(result);
2728 : }
2729 :
2730 : #define CmpCall(cmpfunc) \
2731 : DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2732 : PG_GET_COLLATION(), \
2733 : PG_GETARG_DATUM(0), \
2734 : PG_GETARG_DATUM(1)))
2735 :
2736 : Datum
2737 47296 : namelttext(PG_FUNCTION_ARGS)
2738 : {
2739 47296 : PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2740 : }
2741 :
2742 : Datum
2743 0 : nameletext(PG_FUNCTION_ARGS)
2744 : {
2745 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
2746 : }
2747 :
2748 : Datum
2749 0 : namegttext(PG_FUNCTION_ARGS)
2750 : {
2751 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
2752 : }
2753 :
2754 : Datum
2755 27868 : namegetext(PG_FUNCTION_ARGS)
2756 : {
2757 27868 : PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
2758 : }
2759 :
2760 : Datum
2761 0 : textltname(PG_FUNCTION_ARGS)
2762 : {
2763 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
2764 : }
2765 :
2766 : Datum
2767 0 : textlename(PG_FUNCTION_ARGS)
2768 : {
2769 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
2770 : }
2771 :
2772 : Datum
2773 0 : textgtname(PG_FUNCTION_ARGS)
2774 : {
2775 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
2776 : }
2777 :
2778 : Datum
2779 0 : textgename(PG_FUNCTION_ARGS)
2780 : {
2781 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
2782 : }
2783 :
2784 : #undef CmpCall
2785 :
2786 :
2787 : /*
2788 : * The following operators support character-by-character comparison
2789 : * of text datums, to allow building indexes suitable for LIKE clauses.
2790 : * Note that the regular texteq/textne comparison operators, and regular
2791 : * support functions 1 and 2 with "C" collation are assumed to be
2792 : * compatible with these!
2793 : */
2794 :
2795 : static int
2796 152098 : internal_text_pattern_compare(text *arg1, text *arg2)
2797 : {
2798 : int result;
2799 : int len1,
2800 : len2;
2801 :
2802 152098 : len1 = VARSIZE_ANY_EXHDR(arg1);
2803 152098 : len2 = VARSIZE_ANY_EXHDR(arg2);
2804 :
2805 152098 : result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2806 152098 : if (result != 0)
2807 152044 : return result;
2808 54 : else if (len1 < len2)
2809 0 : return -1;
2810 54 : else if (len1 > len2)
2811 18 : return 1;
2812 : else
2813 36 : return 0;
2814 : }
2815 :
2816 :
2817 : Datum
2818 39556 : text_pattern_lt(PG_FUNCTION_ARGS)
2819 : {
2820 39556 : text *arg1 = PG_GETARG_TEXT_PP(0);
2821 39556 : text *arg2 = PG_GETARG_TEXT_PP(1);
2822 : int result;
2823 :
2824 39556 : result = internal_text_pattern_compare(arg1, arg2);
2825 :
2826 39556 : PG_FREE_IF_COPY(arg1, 0);
2827 39556 : PG_FREE_IF_COPY(arg2, 1);
2828 :
2829 39556 : PG_RETURN_BOOL(result < 0);
2830 : }
2831 :
2832 :
2833 : Datum
2834 37510 : text_pattern_le(PG_FUNCTION_ARGS)
2835 : {
2836 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2837 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2838 : int result;
2839 :
2840 37510 : result = internal_text_pattern_compare(arg1, arg2);
2841 :
2842 37510 : PG_FREE_IF_COPY(arg1, 0);
2843 37510 : PG_FREE_IF_COPY(arg2, 1);
2844 :
2845 37510 : PG_RETURN_BOOL(result <= 0);
2846 : }
2847 :
2848 :
2849 : Datum
2850 37510 : text_pattern_ge(PG_FUNCTION_ARGS)
2851 : {
2852 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2853 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2854 : int result;
2855 :
2856 37510 : result = internal_text_pattern_compare(arg1, arg2);
2857 :
2858 37510 : PG_FREE_IF_COPY(arg1, 0);
2859 37510 : PG_FREE_IF_COPY(arg2, 1);
2860 :
2861 37510 : PG_RETURN_BOOL(result >= 0);
2862 : }
2863 :
2864 :
2865 : Datum
2866 37510 : text_pattern_gt(PG_FUNCTION_ARGS)
2867 : {
2868 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2869 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2870 : int result;
2871 :
2872 37510 : result = internal_text_pattern_compare(arg1, arg2);
2873 :
2874 37510 : PG_FREE_IF_COPY(arg1, 0);
2875 37510 : PG_FREE_IF_COPY(arg2, 1);
2876 :
2877 37510 : PG_RETURN_BOOL(result > 0);
2878 : }
2879 :
2880 :
2881 : Datum
2882 12 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
2883 : {
2884 12 : text *arg1 = PG_GETARG_TEXT_PP(0);
2885 12 : text *arg2 = PG_GETARG_TEXT_PP(1);
2886 : int result;
2887 :
2888 12 : result = internal_text_pattern_compare(arg1, arg2);
2889 :
2890 12 : PG_FREE_IF_COPY(arg1, 0);
2891 12 : PG_FREE_IF_COPY(arg2, 1);
2892 :
2893 12 : PG_RETURN_INT32(result);
2894 : }
2895 :
2896 :
2897 : Datum
2898 116 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2899 : {
2900 116 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2901 : MemoryContext oldcontext;
2902 :
2903 116 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2904 :
2905 : /* Use generic string SortSupport, forcing "C" collation */
2906 116 : varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
2907 :
2908 116 : MemoryContextSwitchTo(oldcontext);
2909 :
2910 116 : PG_RETURN_VOID();
2911 : }
2912 :
2913 :
2914 : /*-------------------------------------------------------------
2915 : * byteaoctetlen
2916 : *
2917 : * get the number of bytes contained in an instance of type 'bytea'
2918 : *-------------------------------------------------------------
2919 : */
2920 : Datum
2921 314 : byteaoctetlen(PG_FUNCTION_ARGS)
2922 : {
2923 314 : Datum str = PG_GETARG_DATUM(0);
2924 :
2925 : /* We need not detoast the input at all */
2926 314 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2927 : }
2928 :
2929 : /*
2930 : * byteacat -
2931 : * takes two bytea* and returns a bytea* that is the concatenation of
2932 : * the two.
2933 : *
2934 : * Cloned from textcat and modified as required.
2935 : */
2936 : Datum
2937 1520 : byteacat(PG_FUNCTION_ARGS)
2938 : {
2939 1520 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
2940 1520 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
2941 :
2942 1520 : PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2943 : }
2944 :
2945 : /*
2946 : * bytea_catenate
2947 : * Guts of byteacat(), broken out so it can be used by other functions
2948 : *
2949 : * Arguments can be in short-header form, but not compressed or out-of-line
2950 : */
2951 : static bytea *
2952 1556 : bytea_catenate(bytea *t1, bytea *t2)
2953 : {
2954 : bytea *result;
2955 : int len1,
2956 : len2,
2957 : len;
2958 : char *ptr;
2959 :
2960 1556 : len1 = VARSIZE_ANY_EXHDR(t1);
2961 1556 : len2 = VARSIZE_ANY_EXHDR(t2);
2962 :
2963 : /* paranoia ... probably should throw error instead? */
2964 1556 : if (len1 < 0)
2965 0 : len1 = 0;
2966 1556 : if (len2 < 0)
2967 0 : len2 = 0;
2968 :
2969 1556 : len = len1 + len2 + VARHDRSZ;
2970 1556 : result = (bytea *) palloc(len);
2971 :
2972 : /* Set size of result string... */
2973 1556 : SET_VARSIZE(result, len);
2974 :
2975 : /* Fill data field of result string... */
2976 1556 : ptr = VARDATA(result);
2977 1556 : if (len1 > 0)
2978 1556 : memcpy(ptr, VARDATA_ANY(t1), len1);
2979 1556 : if (len2 > 0)
2980 1538 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2981 :
2982 1556 : return result;
2983 : }
2984 :
/*
 * PG_STR_GET_BYTEA
 *		Turn a C string into a bytea pointer by passing it through byteain.
 */
#define PG_STR_GET_BYTEA(cstr_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(cstr_)))
2987 :
2988 : /*
2989 : * bytea_substr()
2990 : * Return a substring starting at the specified position.
2991 : * Cloned from text_substr and modified as required.
2992 : *
2993 : * Input:
2994 : * - string
2995 : * - starting position (is one-based)
2996 : * - string length (optional)
2997 : *
2998 : * If the starting position is zero or less, then return from the start of the string
2999 : * adjusting the length to be consistent with the "negative start" per SQL.
3000 : * If the length is less than zero, an ERROR is thrown. If no third argument
3001 : * (length) is provided, the length to the end of the string is assumed.
3002 : */
3003 : Datum
3004 86 : bytea_substr(PG_FUNCTION_ARGS)
3005 : {
3006 86 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3007 : PG_GETARG_INT32(1),
3008 : PG_GETARG_INT32(2),
3009 : false));
3010 : }
3011 :
3012 : /*
3013 : * bytea_substr_no_len -
3014 : * Wrapper to avoid opr_sanity failure due to
3015 : * one function accepting a different number of args.
3016 : */
3017 : Datum
3018 3900 : bytea_substr_no_len(PG_FUNCTION_ARGS)
3019 : {
3020 3900 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3021 : PG_GETARG_INT32(1),
3022 : -1,
3023 : true));
3024 : }
3025 :
3026 : static bytea *
3027 4022 : bytea_substring(Datum str,
3028 : int S,
3029 : int L,
3030 : bool length_not_specified)
3031 : {
3032 : int32 S1; /* adjusted start position */
3033 : int32 L1; /* adjusted substring length */
3034 : int32 E; /* end position */
3035 :
3036 : /*
3037 : * The logic here should generally match text_substring().
3038 : */
3039 4022 : S1 = Max(S, 1);
3040 :
3041 4022 : if (length_not_specified)
3042 : {
3043 : /*
3044 : * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3045 : * end of the string if we pass it a negative value for length.
3046 : */
3047 3918 : L1 = -1;
3048 : }
3049 104 : else if (L < 0)
3050 : {
3051 : /* SQL99 says to throw an error for E < S, i.e., negative length */
3052 12 : ereport(ERROR,
3053 : (errcode(ERRCODE_SUBSTRING_ERROR),
3054 : errmsg("negative substring length not allowed")));
3055 : L1 = -1; /* silence stupider compilers */
3056 : }
3057 92 : else if (pg_add_s32_overflow(S, L, &E))
3058 : {
3059 : /*
3060 : * L could be large enough for S + L to overflow, in which case the
3061 : * substring must run to end of string.
3062 : */
3063 6 : L1 = -1;
3064 : }
3065 : else
3066 : {
3067 : /*
3068 : * A zero or negative value for the end position can happen if the
3069 : * start was negative or one. SQL99 says to return a zero-length
3070 : * string.
3071 : */
3072 86 : if (E < 1)
3073 0 : return PG_STR_GET_BYTEA("");
3074 :
3075 86 : L1 = E - S1;
3076 : }
3077 :
3078 : /*
3079 : * If the start position is past the end of the string, SQL99 says to
3080 : * return a zero-length string -- DatumGetByteaPSlice() will do that for
3081 : * us. We need only convert S1 to zero-based starting position.
3082 : */
3083 4010 : return DatumGetByteaPSlice(str, S1 - 1, L1);
3084 : }
3085 :
3086 : /*
3087 : * byteaoverlay
3088 : * Replace specified substring of first string with second
3089 : *
3090 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3091 : * This code is a direct implementation of what the standard says.
3092 : */
3093 : Datum
3094 6 : byteaoverlay(PG_FUNCTION_ARGS)
3095 : {
3096 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3097 6 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3098 6 : int sp = PG_GETARG_INT32(2); /* substring start position */
3099 6 : int sl = PG_GETARG_INT32(3); /* substring length */
3100 :
3101 6 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3102 : }
3103 :
3104 : Datum
3105 12 : byteaoverlay_no_len(PG_FUNCTION_ARGS)
3106 : {
3107 12 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3108 12 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3109 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
3110 : int sl;
3111 :
3112 12 : sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3113 12 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3114 : }
3115 :
3116 : static bytea *
3117 18 : bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3118 : {
3119 : bytea *result;
3120 : bytea *s1;
3121 : bytea *s2;
3122 : int sp_pl_sl;
3123 :
3124 : /*
3125 : * Check for possible integer-overflow cases. For negative sp, throw a
3126 : * "substring length" error because that's what should be expected
3127 : * according to the spec's definition of OVERLAY().
3128 : */
3129 18 : if (sp <= 0)
3130 0 : ereport(ERROR,
3131 : (errcode(ERRCODE_SUBSTRING_ERROR),
3132 : errmsg("negative substring length not allowed")));
3133 18 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3134 0 : ereport(ERROR,
3135 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3136 : errmsg("integer out of range")));
3137 :
3138 18 : s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3139 18 : s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3140 18 : result = bytea_catenate(s1, t2);
3141 18 : result = bytea_catenate(result, s2);
3142 :
3143 18 : return result;
3144 : }
3145 :
3146 : /*
3147 : * bit_count
3148 : */
3149 : Datum
3150 6 : bytea_bit_count(PG_FUNCTION_ARGS)
3151 : {
3152 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3153 :
3154 6 : PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3155 : }
3156 :
3157 : /*
3158 : * byteapos -
3159 : * Return the position of the specified substring.
3160 : * Implements the SQL POSITION() function.
3161 : * Cloned from textpos and modified as required.
3162 : */
3163 : Datum
3164 0 : byteapos(PG_FUNCTION_ARGS)
3165 : {
3166 0 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3167 0 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3168 : int pos;
3169 : int px,
3170 : p;
3171 : int len1,
3172 : len2;
3173 : char *p1,
3174 : *p2;
3175 :
3176 0 : len1 = VARSIZE_ANY_EXHDR(t1);
3177 0 : len2 = VARSIZE_ANY_EXHDR(t2);
3178 :
3179 0 : if (len2 <= 0)
3180 0 : PG_RETURN_INT32(1); /* result for empty pattern */
3181 :
3182 0 : p1 = VARDATA_ANY(t1);
3183 0 : p2 = VARDATA_ANY(t2);
3184 :
3185 0 : pos = 0;
3186 0 : px = (len1 - len2);
3187 0 : for (p = 0; p <= px; p++)
3188 : {
3189 0 : if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3190 : {
3191 0 : pos = p + 1;
3192 0 : break;
3193 : };
3194 0 : p1++;
3195 : };
3196 :
3197 0 : PG_RETURN_INT32(pos);
3198 : }
3199 :
3200 : /*-------------------------------------------------------------
3201 : * byteaGetByte
3202 : *
3203 : * this routine treats "bytea" as an array of bytes.
3204 : * It returns the Nth byte (a number between 0 and 255).
3205 : *-------------------------------------------------------------
3206 : */
3207 : Datum
3208 60 : byteaGetByte(PG_FUNCTION_ARGS)
3209 : {
3210 60 : bytea *v = PG_GETARG_BYTEA_PP(0);
3211 60 : int32 n = PG_GETARG_INT32(1);
3212 : int len;
3213 : int byte;
3214 :
3215 60 : len = VARSIZE_ANY_EXHDR(v);
3216 :
3217 60 : if (n < 0 || n >= len)
3218 6 : ereport(ERROR,
3219 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3220 : errmsg("index %d out of valid range, 0..%d",
3221 : n, len - 1)));
3222 :
3223 54 : byte = ((unsigned char *) VARDATA_ANY(v))[n];
3224 :
3225 54 : PG_RETURN_INT32(byte);
3226 : }
3227 :
3228 : /*-------------------------------------------------------------
3229 : * byteaGetBit
3230 : *
3231 : * This routine treats a "bytea" type like an array of bits.
3232 : * It returns the value of the Nth bit (0 or 1).
3233 : *
3234 : *-------------------------------------------------------------
3235 : */
3236 : Datum
3237 12 : byteaGetBit(PG_FUNCTION_ARGS)
3238 : {
3239 12 : bytea *v = PG_GETARG_BYTEA_PP(0);
3240 12 : int64 n = PG_GETARG_INT64(1);
3241 : int byteNo,
3242 : bitNo;
3243 : int len;
3244 : int byte;
3245 :
3246 12 : len = VARSIZE_ANY_EXHDR(v);
3247 :
3248 12 : if (n < 0 || n >= (int64) len * 8)
3249 6 : ereport(ERROR,
3250 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3251 : errmsg("index %lld out of valid range, 0..%lld",
3252 : (long long) n, (long long) len * 8 - 1)));
3253 :
3254 : /* n/8 is now known < len, so safe to cast to int */
3255 6 : byteNo = (int) (n / 8);
3256 6 : bitNo = (int) (n % 8);
3257 :
3258 6 : byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3259 :
3260 6 : if (byte & (1 << bitNo))
3261 6 : PG_RETURN_INT32(1);
3262 : else
3263 0 : PG_RETURN_INT32(0);
3264 : }
3265 :
3266 : /*-------------------------------------------------------------
3267 : * byteaSetByte
3268 : *
3269 : * Given an instance of type 'bytea' creates a new one with
3270 : * the Nth byte set to the given value.
3271 : *
3272 : *-------------------------------------------------------------
3273 : */
3274 : Datum
3275 12 : byteaSetByte(PG_FUNCTION_ARGS)
3276 : {
3277 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3278 12 : int32 n = PG_GETARG_INT32(1);
3279 12 : int32 newByte = PG_GETARG_INT32(2);
3280 : int len;
3281 :
3282 12 : len = VARSIZE(res) - VARHDRSZ;
3283 :
3284 12 : if (n < 0 || n >= len)
3285 6 : ereport(ERROR,
3286 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3287 : errmsg("index %d out of valid range, 0..%d",
3288 : n, len - 1)));
3289 :
3290 : /*
3291 : * Now set the byte.
3292 : */
3293 6 : ((unsigned char *) VARDATA(res))[n] = newByte;
3294 :
3295 6 : PG_RETURN_BYTEA_P(res);
3296 : }
3297 :
3298 : /*-------------------------------------------------------------
3299 : * byteaSetBit
3300 : *
3301 : * Given an instance of type 'bytea' creates a new one with
3302 : * the Nth bit set to the given value.
3303 : *
3304 : *-------------------------------------------------------------
3305 : */
3306 : Datum
3307 12 : byteaSetBit(PG_FUNCTION_ARGS)
3308 : {
3309 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3310 12 : int64 n = PG_GETARG_INT64(1);
3311 12 : int32 newBit = PG_GETARG_INT32(2);
3312 : int len;
3313 : int oldByte,
3314 : newByte;
3315 : int byteNo,
3316 : bitNo;
3317 :
3318 12 : len = VARSIZE(res) - VARHDRSZ;
3319 :
3320 12 : if (n < 0 || n >= (int64) len * 8)
3321 6 : ereport(ERROR,
3322 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3323 : errmsg("index %lld out of valid range, 0..%lld",
3324 : (long long) n, (long long) len * 8 - 1)));
3325 :
3326 : /* n/8 is now known < len, so safe to cast to int */
3327 6 : byteNo = (int) (n / 8);
3328 6 : bitNo = (int) (n % 8);
3329 :
3330 : /*
3331 : * sanity check!
3332 : */
3333 6 : if (newBit != 0 && newBit != 1)
3334 0 : ereport(ERROR,
3335 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3336 : errmsg("new bit must be 0 or 1")));
3337 :
3338 : /*
3339 : * Update the byte.
3340 : */
3341 6 : oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3342 :
3343 6 : if (newBit == 0)
3344 6 : newByte = oldByte & (~(1 << bitNo));
3345 : else
3346 0 : newByte = oldByte | (1 << bitNo);
3347 :
3348 6 : ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3349 :
3350 6 : PG_RETURN_BYTEA_P(res);
3351 : }
3352 :
3353 :
3354 : /* text_name()
3355 : * Converts a text type to a Name type.
3356 : */
3357 : Datum
3358 30566 : text_name(PG_FUNCTION_ARGS)
3359 : {
3360 30566 : text *s = PG_GETARG_TEXT_PP(0);
3361 : Name result;
3362 : int len;
3363 :
3364 30566 : len = VARSIZE_ANY_EXHDR(s);
3365 :
3366 : /* Truncate oversize input */
3367 30566 : if (len >= NAMEDATALEN)
3368 6 : len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3369 :
3370 : /* We use palloc0 here to ensure result is zero-padded */
3371 30566 : result = (Name) palloc0(NAMEDATALEN);
3372 30566 : memcpy(NameStr(*result), VARDATA_ANY(s), len);
3373 :
3374 30566 : PG_RETURN_NAME(result);
3375 : }
3376 :
3377 : /* name_text()
3378 : * Converts a Name type to a text type.
3379 : */
3380 : Datum
3381 618152 : name_text(PG_FUNCTION_ARGS)
3382 : {
3383 618152 : Name s = PG_GETARG_NAME(0);
3384 :
3385 618152 : PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3386 : }
3387 :
3388 :
3389 : /*
3390 : * textToQualifiedNameList - convert a text object to list of names
3391 : *
3392 : * This implements the input parsing needed by nextval() and other
3393 : * functions that take a text parameter representing a qualified name.
3394 : * We split the name at dots, downcase if not double-quoted, and
3395 : * truncate names if they're too long.
3396 : */
3397 : List *
3398 1398 : textToQualifiedNameList(text *textval)
3399 : {
3400 : char *rawname;
3401 1398 : List *result = NIL;
3402 : List *namelist;
3403 : ListCell *l;
3404 :
3405 : /* Convert to C string (handles possible detoasting). */
3406 : /* Note we rely on being able to modify rawname below. */
3407 1398 : rawname = text_to_cstring(textval);
3408 :
3409 1398 : if (!SplitIdentifierString(rawname, '.', &namelist))
3410 0 : ereport(ERROR,
3411 : (errcode(ERRCODE_INVALID_NAME),
3412 : errmsg("invalid name syntax")));
3413 :
3414 1398 : if (namelist == NIL)
3415 0 : ereport(ERROR,
3416 : (errcode(ERRCODE_INVALID_NAME),
3417 : errmsg("invalid name syntax")));
3418 :
3419 2906 : foreach(l, namelist)
3420 : {
3421 1508 : char *curname = (char *) lfirst(l);
3422 :
3423 1508 : result = lappend(result, makeString(pstrdup(curname)));
3424 : }
3425 :
3426 1398 : pfree(rawname);
3427 1398 : list_free(namelist);
3428 :
3429 1398 : return result;
3430 : }
3431 :
3432 : /*
3433 : * SplitIdentifierString --- parse a string containing identifiers
3434 : *
3435 : * This is the guts of textToQualifiedNameList, and is exported for use in
3436 : * other situations such as parsing GUC variables. In the GUC case, it's
3437 : * important to avoid memory leaks, so the API is designed to minimize the
3438 : * amount of stuff that needs to be allocated and freed.
3439 : *
3440 : * Inputs:
3441 : * rawstring: the input string; must be overwritable! On return, it's
3442 : * been modified to contain the separated identifiers.
3443 : * separator: the separator punctuation expected between identifiers
3444 : * (typically '.' or ','). Whitespace may also appear around
3445 : * identifiers.
3446 : * Outputs:
3447 : * namelist: filled with a palloc'd list of pointers to identifiers within
3448 : * rawstring. Caller should list_free() this even on error return.
3449 : *
3450 : * Returns true if okay, false if there is a syntax error in the string.
3451 : *
3452 : * Note that an empty string is considered okay here, though not in
3453 : * textToQualifiedNameList.
3454 : */
3455 : bool
3456 101890 : SplitIdentifierString(char *rawstring, char separator,
3457 : List **namelist)
3458 : {
3459 101890 : char *nextp = rawstring;
3460 101890 : bool done = false;
3461 :
3462 101890 : *namelist = NIL;
3463 :
3464 101896 : while (scanner_isspace(*nextp))
3465 6 : nextp++; /* skip leading whitespace */
3466 :
3467 101890 : if (*nextp == '\0')
3468 13742 : return true; /* allow empty string */
3469 :
3470 : /* At the top of the loop, we are at start of a new identifier. */
3471 : do
3472 : {
3473 : char *curname;
3474 : char *endp;
3475 :
3476 145124 : if (*nextp == '"')
3477 : {
3478 : /* Quoted name --- collapse quote-quote pairs, no downcasing */
3479 30460 : curname = nextp + 1;
3480 : for (;;)
3481 : {
3482 30464 : endp = strchr(nextp + 1, '"');
3483 30462 : if (endp == NULL)
3484 0 : return false; /* mismatched quotes */
3485 30462 : if (endp[1] != '"')
3486 30460 : break; /* found end of quoted name */
3487 : /* Collapse adjacent quotes into one quote, and look again */
3488 2 : memmove(endp, endp + 1, strlen(endp));
3489 2 : nextp = endp;
3490 : }
3491 : /* endp now points at the terminating quote */
3492 30460 : nextp = endp + 1;
3493 : }
3494 : else
3495 : {
3496 : /* Unquoted name --- extends to separator or whitespace */
3497 : char *downname;
3498 : int len;
3499 :
3500 114664 : curname = nextp;
3501 921274 : while (*nextp && *nextp != separator &&
3502 806612 : !scanner_isspace(*nextp))
3503 806610 : nextp++;
3504 114664 : endp = nextp;
3505 114664 : if (curname == nextp)
3506 0 : return false; /* empty unquoted name not allowed */
3507 :
3508 : /*
3509 : * Downcase the identifier, using same code as main lexer does.
3510 : *
3511 : * XXX because we want to overwrite the input in-place, we cannot
3512 : * support a downcasing transformation that increases the string
3513 : * length. This is not a problem given the current implementation
3514 : * of downcase_truncate_identifier, but we'll probably have to do
3515 : * something about this someday.
3516 : */
3517 114664 : len = endp - curname;
3518 114664 : downname = downcase_truncate_identifier(curname, len, false);
3519 : Assert(strlen(downname) <= len);
3520 114664 : strncpy(curname, downname, len); /* strncpy is required here */
3521 114664 : pfree(downname);
3522 : }
3523 :
3524 145126 : while (scanner_isspace(*nextp))
3525 2 : nextp++; /* skip trailing whitespace */
3526 :
3527 145124 : if (*nextp == separator)
3528 : {
3529 56976 : nextp++;
3530 96676 : while (scanner_isspace(*nextp))
3531 39700 : nextp++; /* skip leading whitespace for next */
3532 : /* we expect another name, so done remains false */
3533 : }
3534 88148 : else if (*nextp == '\0')
3535 88146 : done = true;
3536 : else
3537 2 : return false; /* invalid syntax */
3538 :
3539 : /* Now safe to overwrite separator with a null */
3540 145122 : *endp = '\0';
3541 :
3542 : /* Truncate name if it's overlength */
3543 145122 : truncate_identifier(curname, strlen(curname), false);
3544 :
3545 : /*
3546 : * Finished isolating current name --- add it to list
3547 : */
3548 145122 : *namelist = lappend(*namelist, curname);
3549 :
3550 : /* Loop back if we didn't reach end of string */
3551 145122 : } while (!done);
3552 :
3553 88146 : return true;
3554 : }
3555 :
3556 :
3557 : /*
3558 : * SplitDirectoriesString --- parse a string containing file/directory names
3559 : *
3560 : * This works fine on file names too; the function name is historical.
3561 : *
3562 : * This is similar to SplitIdentifierString, except that the parsing
3563 : * rules are meant to handle pathnames instead of identifiers: there is
3564 : * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3565 : * and we apply canonicalize_path() to each extracted string. Because of the
3566 : * last, the returned strings are separately palloc'd rather than being
3567 : * pointers into rawstring --- but we still scribble on rawstring.
3568 : *
3569 : * Inputs:
3570 : * rawstring: the input string; must be modifiable!
3571 : * separator: the separator punctuation expected between directories
3572 : * (typically ',' or ';'). Whitespace may also appear around
3573 : * directories.
3574 : * Outputs:
3575 : * namelist: filled with a palloc'd list of directory names.
3576 : * Caller should list_free_deep() this even on error return.
3577 : *
3578 : * Returns true if okay, false if there is a syntax error in the string.
3579 : *
3580 : * Note that an empty string is considered okay here.
3581 : */
3582 : bool
3583 1336 : SplitDirectoriesString(char *rawstring, char separator,
3584 : List **namelist)
3585 : {
3586 1336 : char *nextp = rawstring;
3587 1336 : bool done = false;
3588 :
3589 1336 : *namelist = NIL;
3590 :
3591 1336 : while (scanner_isspace(*nextp))
3592 0 : nextp++; /* skip leading whitespace */
3593 :
3594 1336 : if (*nextp == '\0')
3595 2 : return true; /* allow empty string */
3596 :
3597 : /* At the top of the loop, we are at start of a new directory. */
3598 : do
3599 : {
3600 : char *curname;
3601 : char *endp;
3602 :
3603 1334 : if (*nextp == '"')
3604 : {
3605 : /* Quoted name --- collapse quote-quote pairs */
3606 0 : curname = nextp + 1;
3607 : for (;;)
3608 : {
3609 0 : endp = strchr(nextp + 1, '"');
3610 0 : if (endp == NULL)
3611 0 : return false; /* mismatched quotes */
3612 0 : if (endp[1] != '"')
3613 0 : break; /* found end of quoted name */
3614 : /* Collapse adjacent quotes into one quote, and look again */
3615 0 : memmove(endp, endp + 1, strlen(endp));
3616 0 : nextp = endp;
3617 : }
3618 : /* endp now points at the terminating quote */
3619 0 : nextp = endp + 1;
3620 : }
3621 : else
3622 : {
3623 : /* Unquoted name --- extends to separator or end of string */
3624 1334 : curname = endp = nextp;
3625 22452 : while (*nextp && *nextp != separator)
3626 : {
3627 : /* trailing whitespace should not be included in name */
3628 21118 : if (!scanner_isspace(*nextp))
3629 21118 : endp = nextp + 1;
3630 21118 : nextp++;
3631 : }
3632 1334 : if (curname == endp)
3633 0 : return false; /* empty unquoted name not allowed */
3634 : }
3635 :
3636 1334 : while (scanner_isspace(*nextp))
3637 0 : nextp++; /* skip trailing whitespace */
3638 :
3639 1334 : if (*nextp == separator)
3640 : {
3641 0 : nextp++;
3642 0 : while (scanner_isspace(*nextp))
3643 0 : nextp++; /* skip leading whitespace for next */
3644 : /* we expect another name, so done remains false */
3645 : }
3646 1334 : else if (*nextp == '\0')
3647 1334 : done = true;
3648 : else
3649 0 : return false; /* invalid syntax */
3650 :
3651 : /* Now safe to overwrite separator with a null */
3652 1334 : *endp = '\0';
3653 :
3654 : /* Truncate path if it's overlength */
3655 1334 : if (strlen(curname) >= MAXPGPATH)
3656 0 : curname[MAXPGPATH - 1] = '\0';
3657 :
3658 : /*
3659 : * Finished isolating current name --- add it to list
3660 : */
3661 1334 : curname = pstrdup(curname);
3662 1334 : canonicalize_path(curname);
3663 1334 : *namelist = lappend(*namelist, curname);
3664 :
3665 : /* Loop back if we didn't reach end of string */
3666 1334 : } while (!done);
3667 :
3668 1334 : return true;
3669 : }
3670 :
3671 :
3672 : /*
3673 : * SplitGUCList --- parse a string containing identifiers or file names
3674 : *
3675 : * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3676 : * presuming whether the elements will be taken as identifiers or file names.
3677 : * We assume the input has already been through flatten_set_variable_args(),
3678 : * so that we need never downcase (if appropriate, that was done already).
3679 : * Nor do we ever truncate, since we don't know the correct max length.
3680 : * We disallow embedded whitespace for simplicity (it shouldn't matter,
3681 : * because any embedded whitespace should have led to double-quoting).
3682 : * Otherwise the API is identical to SplitIdentifierString.
3683 : *
3684 : * XXX it's annoying to have so many copies of this string-splitting logic.
3685 : * However, it's not clear that having one function with a bunch of option
3686 : * flags would be much better.
3687 : *
3688 : * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3689 : * Be sure to update that if you have to change this.
3690 : *
3691 : * Inputs:
3692 : * rawstring: the input string; must be overwritable! On return, it's
3693 : * been modified to contain the separated identifiers.
3694 : * separator: the separator punctuation expected between identifiers
3695 : * (typically '.' or ','). Whitespace may also appear around
3696 : * identifiers.
3697 : * Outputs:
3698 : * namelist: filled with a palloc'd list of pointers to identifiers within
3699 : * rawstring. Caller should list_free() this even on error return.
3700 : *
3701 : * Returns true if okay, false if there is a syntax error in the string.
3702 : */
3703 : bool
3704 2920 : SplitGUCList(char *rawstring, char separator,
3705 : List **namelist)
3706 : {
3707 2920 : char *nextp = rawstring;
3708 2920 : bool done = false;
3709 :
3710 2920 : *namelist = NIL;
3711 :
3712 2920 : while (scanner_isspace(*nextp))
3713 0 : nextp++; /* skip leading whitespace */
3714 :
3715 2920 : if (*nextp == '\0')
3716 2854 : return true; /* allow empty string */
3717 :
3718 : /* At the top of the loop, we are at start of a new identifier. */
3719 : do
3720 : {
3721 : char *curname;
3722 : char *endp;
3723 :
3724 92 : if (*nextp == '"')
3725 : {
3726 : /* Quoted name --- collapse quote-quote pairs */
3727 24 : curname = nextp + 1;
3728 : for (;;)
3729 : {
3730 36 : endp = strchr(nextp + 1, '"');
3731 30 : if (endp == NULL)
3732 0 : return false; /* mismatched quotes */
3733 30 : if (endp[1] != '"')
3734 24 : break; /* found end of quoted name */
3735 : /* Collapse adjacent quotes into one quote, and look again */
3736 6 : memmove(endp, endp + 1, strlen(endp));
3737 6 : nextp = endp;
3738 : }
3739 : /* endp now points at the terminating quote */
3740 24 : nextp = endp + 1;
3741 : }
3742 : else
3743 : {
3744 : /* Unquoted name --- extends to separator or whitespace */
3745 68 : curname = nextp;
3746 638 : while (*nextp && *nextp != separator &&
3747 570 : !scanner_isspace(*nextp))
3748 570 : nextp++;
3749 68 : endp = nextp;
3750 68 : if (curname == nextp)
3751 0 : return false; /* empty unquoted name not allowed */
3752 : }
3753 :
3754 92 : while (scanner_isspace(*nextp))
3755 0 : nextp++; /* skip trailing whitespace */
3756 :
3757 92 : if (*nextp == separator)
3758 : {
3759 26 : nextp++;
3760 44 : while (scanner_isspace(*nextp))
3761 18 : nextp++; /* skip leading whitespace for next */
3762 : /* we expect another name, so done remains false */
3763 : }
3764 66 : else if (*nextp == '\0')
3765 66 : done = true;
3766 : else
3767 0 : return false; /* invalid syntax */
3768 :
3769 : /* Now safe to overwrite separator with a null */
3770 92 : *endp = '\0';
3771 :
3772 : /*
3773 : * Finished isolating current name --- add it to list
3774 : */
3775 92 : *namelist = lappend(*namelist, curname);
3776 :
3777 : /* Loop back if we didn't reach end of string */
3778 92 : } while (!done);
3779 :
3780 66 : return true;
3781 : }
3782 :
3783 :
3784 : /*****************************************************************************
3785 : * Comparison Functions used for bytea
3786 : *
3787 : * Note: btree indexes need these routines not to leak memory; therefore,
3788 : * be careful to free working copies of toasted datums. Most places don't
3789 : * need to be so careful.
3790 : *****************************************************************************/
3791 :
3792 : Datum
3793 10390 : byteaeq(PG_FUNCTION_ARGS)
3794 : {
3795 10390 : Datum arg1 = PG_GETARG_DATUM(0);
3796 10390 : Datum arg2 = PG_GETARG_DATUM(1);
3797 : bool result;
3798 : Size len1,
3799 : len2;
3800 :
3801 : /*
3802 : * We can use a fast path for unequal lengths, which might save us from
3803 : * having to detoast one or both values.
3804 : */
3805 10390 : len1 = toast_raw_datum_size(arg1);
3806 10390 : len2 = toast_raw_datum_size(arg2);
3807 10390 : if (len1 != len2)
3808 4316 : result = false;
3809 : else
3810 : {
3811 6074 : bytea *barg1 = DatumGetByteaPP(arg1);
3812 6074 : bytea *barg2 = DatumGetByteaPP(arg2);
3813 :
3814 6074 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3815 : len1 - VARHDRSZ) == 0);
3816 :
3817 6074 : PG_FREE_IF_COPY(barg1, 0);
3818 6074 : PG_FREE_IF_COPY(barg2, 1);
3819 : }
3820 :
3821 10390 : PG_RETURN_BOOL(result);
3822 : }
3823 :
3824 : Datum
3825 768 : byteane(PG_FUNCTION_ARGS)
3826 : {
3827 768 : Datum arg1 = PG_GETARG_DATUM(0);
3828 768 : Datum arg2 = PG_GETARG_DATUM(1);
3829 : bool result;
3830 : Size len1,
3831 : len2;
3832 :
3833 : /*
3834 : * We can use a fast path for unequal lengths, which might save us from
3835 : * having to detoast one or both values.
3836 : */
3837 768 : len1 = toast_raw_datum_size(arg1);
3838 768 : len2 = toast_raw_datum_size(arg2);
3839 768 : if (len1 != len2)
3840 0 : result = true;
3841 : else
3842 : {
3843 768 : bytea *barg1 = DatumGetByteaPP(arg1);
3844 768 : bytea *barg2 = DatumGetByteaPP(arg2);
3845 :
3846 768 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3847 : len1 - VARHDRSZ) != 0);
3848 :
3849 768 : PG_FREE_IF_COPY(barg1, 0);
3850 768 : PG_FREE_IF_COPY(barg2, 1);
3851 : }
3852 :
3853 768 : PG_RETURN_BOOL(result);
3854 : }
3855 :
3856 : Datum
3857 8316 : bytealt(PG_FUNCTION_ARGS)
3858 : {
3859 8316 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3860 8316 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3861 : int len1,
3862 : len2;
3863 : int cmp;
3864 :
3865 8316 : len1 = VARSIZE_ANY_EXHDR(arg1);
3866 8316 : len2 = VARSIZE_ANY_EXHDR(arg2);
3867 :
3868 8316 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3869 :
3870 8316 : PG_FREE_IF_COPY(arg1, 0);
3871 8316 : PG_FREE_IF_COPY(arg2, 1);
3872 :
3873 8316 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3874 : }
3875 :
3876 : Datum
3877 6356 : byteale(PG_FUNCTION_ARGS)
3878 : {
3879 6356 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3880 6356 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3881 : int len1,
3882 : len2;
3883 : int cmp;
3884 :
3885 6356 : len1 = VARSIZE_ANY_EXHDR(arg1);
3886 6356 : len2 = VARSIZE_ANY_EXHDR(arg2);
3887 :
3888 6356 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3889 :
3890 6356 : PG_FREE_IF_COPY(arg1, 0);
3891 6356 : PG_FREE_IF_COPY(arg2, 1);
3892 :
3893 6356 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3894 : }
3895 :
3896 : Datum
3897 6228 : byteagt(PG_FUNCTION_ARGS)
3898 : {
3899 6228 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3900 6228 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3901 : int len1,
3902 : len2;
3903 : int cmp;
3904 :
3905 6228 : len1 = VARSIZE_ANY_EXHDR(arg1);
3906 6228 : len2 = VARSIZE_ANY_EXHDR(arg2);
3907 :
3908 6228 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3909 :
3910 6228 : PG_FREE_IF_COPY(arg1, 0);
3911 6228 : PG_FREE_IF_COPY(arg2, 1);
3912 :
3913 6228 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3914 : }
3915 :
3916 : Datum
3917 5010 : byteage(PG_FUNCTION_ARGS)
3918 : {
3919 5010 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3920 5010 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3921 : int len1,
3922 : len2;
3923 : int cmp;
3924 :
3925 5010 : len1 = VARSIZE_ANY_EXHDR(arg1);
3926 5010 : len2 = VARSIZE_ANY_EXHDR(arg2);
3927 :
3928 5010 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3929 :
3930 5010 : PG_FREE_IF_COPY(arg1, 0);
3931 5010 : PG_FREE_IF_COPY(arg2, 1);
3932 :
3933 5010 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3934 : }
3935 :
3936 : Datum
3937 87446 : byteacmp(PG_FUNCTION_ARGS)
3938 : {
3939 87446 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3940 87446 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3941 : int len1,
3942 : len2;
3943 : int cmp;
3944 :
3945 87446 : len1 = VARSIZE_ANY_EXHDR(arg1);
3946 87446 : len2 = VARSIZE_ANY_EXHDR(arg2);
3947 :
3948 87446 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3949 87446 : if ((cmp == 0) && (len1 != len2))
3950 14550 : cmp = (len1 < len2) ? -1 : 1;
3951 :
3952 87446 : PG_FREE_IF_COPY(arg1, 0);
3953 87446 : PG_FREE_IF_COPY(arg2, 1);
3954 :
3955 87446 : PG_RETURN_INT32(cmp);
3956 : }
3957 :
3958 : Datum
3959 40 : bytea_sortsupport(PG_FUNCTION_ARGS)
3960 : {
3961 40 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3962 : MemoryContext oldcontext;
3963 :
3964 40 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3965 :
3966 : /* Use generic string SortSupport, forcing "C" collation */
3967 40 : varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
3968 :
3969 40 : MemoryContextSwitchTo(oldcontext);
3970 :
3971 40 : PG_RETURN_VOID();
3972 : }
3973 :
3974 : /*
3975 : * appendStringInfoText
3976 : *
3977 : * Append a text to str.
3978 : * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3979 : */
3980 : static void
3981 1684298 : appendStringInfoText(StringInfo str, const text *t)
3982 : {
3983 1684298 : appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
3984 1684298 : }
3985 :
3986 : /*
3987 : * replace_text
3988 : * replace all occurrences of 'old_sub_str' in 'orig_str'
3989 : * with 'new_sub_str' to form 'new_str'
3990 : *
3991 : * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3992 : * otherwise returns 'new_str'
3993 : */
3994 : Datum
3995 984 : replace_text(PG_FUNCTION_ARGS)
3996 : {
3997 984 : text *src_text = PG_GETARG_TEXT_PP(0);
3998 984 : text *from_sub_text = PG_GETARG_TEXT_PP(1);
3999 984 : text *to_sub_text = PG_GETARG_TEXT_PP(2);
4000 : int src_text_len;
4001 : int from_sub_text_len;
4002 : TextPositionState state;
4003 : text *ret_text;
4004 : int chunk_len;
4005 : char *curr_ptr;
4006 : char *start_ptr;
4007 : StringInfoData str;
4008 : bool found;
4009 :
4010 984 : src_text_len = VARSIZE_ANY_EXHDR(src_text);
4011 984 : from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4012 :
4013 : /* Return unmodified source string if empty source or pattern */
4014 984 : if (src_text_len < 1 || from_sub_text_len < 1)
4015 : {
4016 0 : PG_RETURN_TEXT_P(src_text);
4017 : }
4018 :
4019 984 : text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4020 :
4021 984 : found = text_position_next(&state);
4022 :
4023 : /* When the from_sub_text is not found, there is nothing to do. */
4024 984 : if (!found)
4025 : {
4026 238 : text_position_cleanup(&state);
4027 238 : PG_RETURN_TEXT_P(src_text);
4028 : }
4029 746 : curr_ptr = text_position_get_match_ptr(&state);
4030 746 : start_ptr = VARDATA_ANY(src_text);
4031 :
4032 746 : initStringInfo(&str);
4033 :
4034 : do
4035 : {
4036 4226 : CHECK_FOR_INTERRUPTS();
4037 :
4038 : /* copy the data skipped over by last text_position_next() */
4039 4226 : chunk_len = curr_ptr - start_ptr;
4040 4226 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4041 :
4042 4226 : appendStringInfoText(&str, to_sub_text);
4043 :
4044 4226 : start_ptr = curr_ptr + from_sub_text_len;
4045 :
4046 4226 : found = text_position_next(&state);
4047 4226 : if (found)
4048 3480 : curr_ptr = text_position_get_match_ptr(&state);
4049 : }
4050 4226 : while (found);
4051 :
4052 : /* copy trailing data */
4053 746 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4054 746 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4055 :
4056 746 : text_position_cleanup(&state);
4057 :
4058 746 : ret_text = cstring_to_text_with_len(str.data, str.len);
4059 746 : pfree(str.data);
4060 :
4061 746 : PG_RETURN_TEXT_P(ret_text);
4062 : }
4063 :
4064 : /*
4065 : * check_replace_text_has_escape
4066 : *
4067 : * Returns 0 if text contains no backslashes that need processing.
4068 : * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4069 : * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4070 : */
4071 : static int
4072 9968 : check_replace_text_has_escape(const text *replace_text)
4073 : {
4074 9968 : int result = 0;
4075 9968 : const char *p = VARDATA_ANY(replace_text);
4076 9968 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4077 :
4078 10012 : while (p < p_end)
4079 : {
4080 : /* Find next escape char, if any. */
4081 9182 : p = memchr(p, '\\', p_end - p);
4082 9182 : if (p == NULL)
4083 8504 : break;
4084 678 : p++;
4085 : /* Note: a backslash at the end doesn't require extra processing. */
4086 678 : if (p < p_end)
4087 : {
4088 678 : if (*p >= '1' && *p <= '9')
4089 634 : return 2; /* Found a submatch specifier, so done */
4090 44 : result = 1; /* Found some other sequence, keep looking */
4091 44 : p++;
4092 : }
4093 : }
4094 9334 : return result;
4095 : }
4096 :
4097 : /*
4098 : * appendStringInfoRegexpSubstr
4099 : *
4100 : * Append replace_text to str, substituting regexp back references for
4101 : * \n escapes. start_ptr is the start of the match in the source string,
4102 : * at logical character position data_pos.
4103 : */
4104 : static void
4105 212 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4106 : regmatch_t *pmatch,
4107 : char *start_ptr, int data_pos)
4108 : {
4109 212 : const char *p = VARDATA_ANY(replace_text);
4110 212 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4111 :
4112 526 : while (p < p_end)
4113 : {
4114 470 : const char *chunk_start = p;
4115 : int so;
4116 : int eo;
4117 :
4118 : /* Find next escape char, if any. */
4119 470 : p = memchr(p, '\\', p_end - p);
4120 470 : if (p == NULL)
4121 150 : p = p_end;
4122 :
4123 : /* Copy the text we just scanned over, if any. */
4124 470 : if (p > chunk_start)
4125 294 : appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4126 :
4127 : /* Done if at end of string, else advance over escape char. */
4128 470 : if (p >= p_end)
4129 150 : break;
4130 320 : p++;
4131 :
4132 320 : if (p >= p_end)
4133 : {
4134 : /* Escape at very end of input. Treat same as unexpected char */
4135 6 : appendStringInfoChar(str, '\\');
4136 6 : break;
4137 : }
4138 :
4139 314 : if (*p >= '1' && *p <= '9')
4140 254 : {
4141 : /* Use the back reference of regexp. */
4142 254 : int idx = *p - '0';
4143 :
4144 254 : so = pmatch[idx].rm_so;
4145 254 : eo = pmatch[idx].rm_eo;
4146 254 : p++;
4147 : }
4148 60 : else if (*p == '&')
4149 : {
4150 : /* Use the entire matched string. */
4151 18 : so = pmatch[0].rm_so;
4152 18 : eo = pmatch[0].rm_eo;
4153 18 : p++;
4154 : }
4155 42 : else if (*p == '\\')
4156 : {
4157 : /* \\ means transfer one \ to output. */
4158 36 : appendStringInfoChar(str, '\\');
4159 36 : p++;
4160 36 : continue;
4161 : }
4162 : else
4163 : {
4164 : /*
4165 : * If escape char is not followed by any expected char, just treat
4166 : * it as ordinary data to copy. (XXX would it be better to throw
4167 : * an error?)
4168 : */
4169 6 : appendStringInfoChar(str, '\\');
4170 6 : continue;
4171 : }
4172 :
4173 272 : if (so >= 0 && eo >= 0)
4174 : {
4175 : /*
4176 : * Copy the text that is back reference of regexp. Note so and eo
4177 : * are counted in characters not bytes.
4178 : */
4179 : char *chunk_start;
4180 : int chunk_len;
4181 :
4182 : Assert(so >= data_pos);
4183 272 : chunk_start = start_ptr;
4184 272 : chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4185 272 : chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4186 272 : appendBinaryStringInfo(str, chunk_start, chunk_len);
4187 : }
4188 : }
4189 212 : }
4190 :
4191 : /*
4192 : * replace_text_regexp
4193 : *
4194 : * replace substring(s) in src_text that match pattern with replace_text.
4195 : * The replace_text can contain backslash markers to substitute
4196 : * (parts of) the matched text.
4197 : *
4198 : * cflags: regexp compile flags.
4199 : * collation: collation to use.
4200 : * search_start: the character (not byte) offset in src_text at which to
4201 : * begin searching.
4202 : * n: if 0, replace all matches; if > 0, replace only the N'th match.
4203 : */
4204 : text *
4205 9968 : replace_text_regexp(text *src_text, text *pattern_text,
4206 : text *replace_text,
4207 : int cflags, Oid collation,
4208 : int search_start, int n)
4209 : {
4210 : text *ret_text;
4211 : regex_t *re;
4212 9968 : int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4213 9968 : int nmatches = 0;
4214 : StringInfoData buf;
4215 : regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4216 9968 : int nmatch = lengthof(pmatch);
4217 : pg_wchar *data;
4218 : size_t data_len;
4219 : int data_pos;
4220 : char *start_ptr;
4221 : int escape_status;
4222 :
4223 9968 : initStringInfo(&buf);
4224 :
4225 : /* Convert data string to wide characters. */
4226 9968 : data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4227 9968 : data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4228 :
4229 : /* Check whether replace_text has escapes, especially regexp submatches. */
4230 9968 : escape_status = check_replace_text_has_escape(replace_text);
4231 :
4232 : /* If no regexp submatches, we can use REG_NOSUB. */
4233 9968 : if (escape_status < 2)
4234 : {
4235 9334 : cflags |= REG_NOSUB;
4236 : /* Also tell pg_regexec we only want the whole-match location. */
4237 9334 : nmatch = 1;
4238 : }
4239 :
4240 : /* Prepare the regexp. */
4241 9968 : re = RE_compile_and_cache(pattern_text, cflags, collation);
4242 :
4243 : /* start_ptr points to the data_pos'th character of src_text */
4244 9968 : start_ptr = (char *) VARDATA_ANY(src_text);
4245 9968 : data_pos = 0;
4246 :
4247 14068 : while (search_start <= data_len)
4248 : {
4249 : int regexec_result;
4250 :
4251 14062 : CHECK_FOR_INTERRUPTS();
4252 :
4253 14062 : regexec_result = pg_regexec(re,
4254 : data,
4255 : data_len,
4256 : search_start,
4257 : NULL, /* no details */
4258 : nmatch,
4259 : pmatch,
4260 : 0);
4261 :
4262 14062 : if (regexec_result == REG_NOMATCH)
4263 8618 : break;
4264 :
4265 5444 : if (regexec_result != REG_OKAY)
4266 : {
4267 : char errMsg[100];
4268 :
4269 0 : pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4270 0 : ereport(ERROR,
4271 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4272 : errmsg("regular expression failed: %s", errMsg)));
4273 : }
4274 :
4275 : /*
4276 : * Count matches, and decide whether to replace this match.
4277 : */
4278 5444 : nmatches++;
4279 5444 : if (n > 0 && nmatches != n)
4280 : {
4281 : /*
4282 : * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4283 : * we treat the matched text as if it weren't matched, and copy it
4284 : * to the output later.)
4285 : */
4286 60 : search_start = pmatch[0].rm_eo;
4287 60 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4288 0 : search_start++;
4289 60 : continue;
4290 : }
4291 :
4292 : /*
4293 : * Copy the text to the left of the match position. Note we are given
4294 : * character not byte indexes.
4295 : */
4296 5384 : if (pmatch[0].rm_so - data_pos > 0)
4297 : {
4298 : int chunk_len;
4299 :
4300 5212 : chunk_len = charlen_to_bytelen(start_ptr,
4301 5212 : pmatch[0].rm_so - data_pos);
4302 5212 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4303 :
4304 : /*
4305 : * Advance start_ptr over that text, to avoid multiple rescans of
4306 : * it if the replace_text contains multiple back-references.
4307 : */
4308 5212 : start_ptr += chunk_len;
4309 5212 : data_pos = pmatch[0].rm_so;
4310 : }
4311 :
4312 : /*
4313 : * Copy the replace_text, processing escapes if any are present.
4314 : */
4315 5384 : if (escape_status > 0)
4316 212 : appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4317 : start_ptr, data_pos);
4318 : else
4319 5172 : appendStringInfoText(&buf, replace_text);
4320 :
4321 : /* Advance start_ptr and data_pos over the matched text. */
4322 10768 : start_ptr += charlen_to_bytelen(start_ptr,
4323 5384 : pmatch[0].rm_eo - data_pos);
4324 5384 : data_pos = pmatch[0].rm_eo;
4325 :
4326 : /*
4327 : * If we only want to replace one occurrence, we're done.
4328 : */
4329 5384 : if (n > 0)
4330 1344 : break;
4331 :
4332 : /*
4333 : * Advance search position. Normally we start the next search at the
4334 : * end of the previous match; but if the match was of zero length, we
4335 : * have to advance by one character, or we'd just find the same match
4336 : * again.
4337 : */
4338 4040 : search_start = data_pos;
4339 4040 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4340 12 : search_start++;
4341 : }
4342 :
4343 : /*
4344 : * Copy the text to the right of the last match.
4345 : */
4346 9968 : if (data_pos < data_len)
4347 : {
4348 : int chunk_len;
4349 :
4350 9510 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4351 9510 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4352 : }
4353 :
4354 9968 : ret_text = cstring_to_text_with_len(buf.data, buf.len);
4355 9968 : pfree(buf.data);
4356 9968 : pfree(data);
4357 :
4358 9968 : return ret_text;
4359 : }
4360 :
4361 : /*
4362 : * split_part
4363 : * parse input string based on provided field separator
4364 : * return N'th item (1 based, negative counts from end)
4365 : */
4366 : Datum
4367 102 : split_part(PG_FUNCTION_ARGS)
4368 : {
4369 102 : text *inputstring = PG_GETARG_TEXT_PP(0);
4370 102 : text *fldsep = PG_GETARG_TEXT_PP(1);
4371 102 : int fldnum = PG_GETARG_INT32(2);
4372 : int inputstring_len;
4373 : int fldsep_len;
4374 : TextPositionState state;
4375 : char *start_ptr;
4376 : char *end_ptr;
4377 : text *result_text;
4378 : bool found;
4379 :
4380 : /* field number is 1 based */
4381 102 : if (fldnum == 0)
4382 6 : ereport(ERROR,
4383 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4384 : errmsg("field position must not be zero")));
4385 :
4386 96 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4387 96 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4388 :
4389 : /* return empty string for empty input string */
4390 96 : if (inputstring_len < 1)
4391 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4392 :
4393 : /* handle empty field separator */
4394 84 : if (fldsep_len < 1)
4395 : {
4396 : /* if first or last field, return input string, else empty string */
4397 24 : if (fldnum == 1 || fldnum == -1)
4398 12 : PG_RETURN_TEXT_P(inputstring);
4399 : else
4400 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4401 : }
4402 :
4403 : /* find the first field separator */
4404 60 : text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4405 :
4406 60 : found = text_position_next(&state);
4407 :
4408 : /* special case if fldsep not found at all */
4409 60 : if (!found)
4410 : {
4411 12 : text_position_cleanup(&state);
4412 : /* if first or last field, return input string, else empty string */
4413 12 : if (fldnum == 1 || fldnum == -1)
4414 6 : PG_RETURN_TEXT_P(inputstring);
4415 : else
4416 6 : PG_RETURN_TEXT_P(cstring_to_text(""));
4417 : }
4418 :
4419 : /*
4420 : * take care of a negative field number (i.e. count from the right) by
4421 : * converting to a positive field number; we need total number of fields
4422 : */
4423 48 : if (fldnum < 0)
4424 : {
4425 : /* we found a fldsep, so there are at least two fields */
4426 24 : int numfields = 2;
4427 :
4428 36 : while (text_position_next(&state))
4429 12 : numfields++;
4430 :
4431 : /* special case of last field does not require an extra pass */
4432 24 : if (fldnum == -1)
4433 : {
4434 6 : start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4435 6 : end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4436 6 : text_position_cleanup(&state);
4437 6 : PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4438 : end_ptr - start_ptr));
4439 : }
4440 :
4441 : /* else, convert fldnum to positive notation */
4442 18 : fldnum += numfields + 1;
4443 :
4444 : /* if nonexistent field, return empty string */
4445 18 : if (fldnum <= 0)
4446 : {
4447 6 : text_position_cleanup(&state);
4448 6 : PG_RETURN_TEXT_P(cstring_to_text(""));
4449 : }
4450 :
4451 : /* reset to pointing at first match, but now with positive fldnum */
4452 12 : text_position_reset(&state);
4453 12 : found = text_position_next(&state);
4454 : Assert(found);
4455 : }
4456 :
4457 : /* identify bounds of first field */
4458 36 : start_ptr = VARDATA_ANY(inputstring);
4459 36 : end_ptr = text_position_get_match_ptr(&state);
4460 :
4461 66 : while (found && --fldnum > 0)
4462 : {
4463 : /* identify bounds of next field */
4464 30 : start_ptr = end_ptr + fldsep_len;
4465 30 : found = text_position_next(&state);
4466 30 : if (found)
4467 18 : end_ptr = text_position_get_match_ptr(&state);
4468 : }
4469 :
4470 36 : text_position_cleanup(&state);
4471 :
4472 36 : if (fldnum > 0)
4473 : {
4474 : /* N'th field separator not found */
4475 : /* if last field requested, return it, else empty string */
4476 12 : if (fldnum == 1)
4477 : {
4478 6 : int last_len = start_ptr - VARDATA_ANY(inputstring);
4479 :
4480 6 : result_text = cstring_to_text_with_len(start_ptr,
4481 : inputstring_len - last_len);
4482 : }
4483 : else
4484 6 : result_text = cstring_to_text("");
4485 : }
4486 : else
4487 : {
4488 : /* non-last field requested */
4489 24 : result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4490 : }
4491 :
4492 36 : PG_RETURN_TEXT_P(result_text);
4493 : }
4494 :
4495 : /*
4496 : * Convenience function to return true when two text params are equal.
4497 : */
4498 : static bool
4499 228 : text_isequal(text *txt1, text *txt2, Oid collid)
4500 : {
4501 228 : return DatumGetBool(DirectFunctionCall2Coll(texteq,
4502 : collid,
4503 : PointerGetDatum(txt1),
4504 : PointerGetDatum(txt2)));
4505 : }
4506 :
4507 : /*
4508 : * text_to_array
4509 : * parse input string and return text array of elements,
4510 : * based on provided field separator
4511 : */
4512 : Datum
4513 122 : text_to_array(PG_FUNCTION_ARGS)
4514 : {
4515 : SplitTextOutputData tstate;
4516 :
4517 : /* For array output, tstate should start as all zeroes */
4518 122 : memset(&tstate, 0, sizeof(tstate));
4519 :
4520 122 : if (!split_text(fcinfo, &tstate))
4521 6 : PG_RETURN_NULL();
4522 :
4523 112 : if (tstate.astate == NULL)
4524 6 : PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4525 :
4526 106 : PG_RETURN_DATUM(makeArrayResult(tstate.astate,
4527 : CurrentMemoryContext));
4528 : }
4529 :
4530 : /*
4531 : * text_to_array_null
4532 : * parse input string and return text array of elements,
4533 : * based on provided field separator and null string
4534 : *
4535 : * This is a separate entry point only to prevent the regression tests from
4536 : * complaining about different argument sets for the same internal function.
4537 : */
4538 : Datum
4539 36 : text_to_array_null(PG_FUNCTION_ARGS)
4540 : {
4541 36 : return text_to_array(fcinfo);
4542 : }
4543 :
4544 : /*
4545 : * text_to_table
4546 : * parse input string and return table of elements,
4547 : * based on provided field separator
4548 : */
4549 : Datum
4550 84 : text_to_table(PG_FUNCTION_ARGS)
4551 : {
4552 84 : ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4553 : SplitTextOutputData tstate;
4554 :
4555 84 : tstate.astate = NULL;
4556 84 : InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
4557 84 : tstate.tupstore = rsi->setResult;
4558 84 : tstate.tupdesc = rsi->setDesc;
4559 :
4560 84 : (void) split_text(fcinfo, &tstate);
4561 :
4562 84 : return (Datum) 0;
4563 : }
4564 :
4565 : /*
4566 : * text_to_table_null
4567 : * parse input string and return table of elements,
4568 : * based on provided field separator and null string
4569 : *
4570 : * This is a separate entry point only to prevent the regression tests from
4571 : * complaining about different argument sets for the same internal function.
4572 : */
4573 : Datum
4574 24 : text_to_table_null(PG_FUNCTION_ARGS)
4575 : {
4576 24 : return text_to_table(fcinfo);
4577 : }
4578 :
4579 : /*
4580 : * Common code for text_to_array, text_to_array_null, text_to_table
4581 : * and text_to_table_null functions.
4582 : *
4583 : * These are not strict so we have to test for null inputs explicitly.
4584 : * Returns false if result is to be null, else returns true.
4585 : *
4586 : * Note that if the result is valid but empty (zero elements), we return
4587 : * without changing *tstate --- caller must handle that case, too.
4588 : */
4589 : static bool
4590 206 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4591 : {
4592 : text *inputstring;
4593 : text *fldsep;
4594 : text *null_string;
4595 206 : Oid collation = PG_GET_COLLATION();
4596 : int inputstring_len;
4597 : int fldsep_len;
4598 : char *start_ptr;
4599 : text *result_text;
4600 :
4601 : /* when input string is NULL, then result is NULL too */
4602 206 : if (PG_ARGISNULL(0))
4603 12 : return false;
4604 :
4605 194 : inputstring = PG_GETARG_TEXT_PP(0);
4606 :
4607 : /* fldsep can be NULL */
4608 194 : if (!PG_ARGISNULL(1))
4609 176 : fldsep = PG_GETARG_TEXT_PP(1);
4610 : else
4611 18 : fldsep = NULL;
4612 :
4613 : /* null_string can be NULL or omitted */
4614 194 : if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4615 60 : null_string = PG_GETARG_TEXT_PP(2);
4616 : else
4617 134 : null_string = NULL;
4618 :
4619 194 : if (fldsep != NULL)
4620 : {
4621 : /*
4622 : * Normal case with non-null fldsep. Use the text_position machinery
4623 : * to search for occurrences of fldsep.
4624 : */
4625 : TextPositionState state;
4626 :
4627 176 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4628 176 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4629 :
4630 : /* return empty set for empty input string */
4631 176 : if (inputstring_len < 1)
4632 60 : return true;
4633 :
4634 : /* empty field separator: return input string as a one-element set */
4635 164 : if (fldsep_len < 1)
4636 : {
4637 48 : split_text_accum_result(tstate, inputstring,
4638 : null_string, collation);
4639 48 : return true;
4640 : }
4641 :
4642 116 : text_position_setup(inputstring, fldsep, collation, &state);
4643 :
4644 112 : start_ptr = VARDATA_ANY(inputstring);
4645 :
4646 : for (;;)
4647 456 : {
4648 : bool found;
4649 : char *end_ptr;
4650 : int chunk_len;
4651 :
4652 568 : CHECK_FOR_INTERRUPTS();
4653 :
4654 568 : found = text_position_next(&state);
4655 568 : if (!found)
4656 : {
4657 : /* fetch last field */
4658 112 : chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4659 112 : end_ptr = NULL; /* not used, but some compilers complain */
4660 : }
4661 : else
4662 : {
4663 : /* fetch non-last field */
4664 456 : end_ptr = text_position_get_match_ptr(&state);
4665 456 : chunk_len = end_ptr - start_ptr;
4666 : }
4667 :
4668 : /* build a temp text datum to pass to split_text_accum_result */
4669 568 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4670 :
4671 : /* stash away this field */
4672 568 : split_text_accum_result(tstate, result_text,
4673 : null_string, collation);
4674 :
4675 568 : pfree(result_text);
4676 :
4677 568 : if (!found)
4678 112 : break;
4679 :
4680 456 : start_ptr = end_ptr + fldsep_len;
4681 : }
4682 :
4683 112 : text_position_cleanup(&state);
4684 : }
4685 : else
4686 : {
4687 : /*
4688 : * When fldsep is NULL, each character in the input string becomes a
4689 : * separate element in the result set. The separator is effectively
4690 : * the space between characters.
4691 : */
4692 18 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4693 :
4694 18 : start_ptr = VARDATA_ANY(inputstring);
4695 :
4696 132 : while (inputstring_len > 0)
4697 : {
4698 114 : int chunk_len = pg_mblen(start_ptr);
4699 :
4700 114 : CHECK_FOR_INTERRUPTS();
4701 :
4702 : /* build a temp text datum to pass to split_text_accum_result */
4703 114 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4704 :
4705 : /* stash away this field */
4706 114 : split_text_accum_result(tstate, result_text,
4707 : null_string, collation);
4708 :
4709 114 : pfree(result_text);
4710 :
4711 114 : start_ptr += chunk_len;
4712 114 : inputstring_len -= chunk_len;
4713 : }
4714 : }
4715 :
4716 130 : return true;
4717 : }
4718 :
4719 : /*
4720 : * Add text item to result set (table or array).
4721 : *
4722 : * This is also responsible for checking to see if the item matches
4723 : * the null_string, in which case we should emit NULL instead.
4724 : */
4725 : static void
4726 730 : split_text_accum_result(SplitTextOutputData *tstate,
4727 : text *field_value,
4728 : text *null_string,
4729 : Oid collation)
4730 : {
4731 730 : bool is_null = false;
4732 :
4733 730 : if (null_string && text_isequal(field_value, null_string, collation))
4734 52 : is_null = true;
4735 :
4736 730 : if (tstate->tupstore)
4737 : {
4738 : Datum values[1];
4739 : bool nulls[1];
4740 :
4741 228 : values[0] = PointerGetDatum(field_value);
4742 228 : nulls[0] = is_null;
4743 :
4744 228 : tuplestore_putvalues(tstate->tupstore,
4745 : tstate->tupdesc,
4746 : values,
4747 : nulls);
4748 : }
4749 : else
4750 : {
4751 502 : tstate->astate = accumArrayResult(tstate->astate,
4752 : PointerGetDatum(field_value),
4753 : is_null,
4754 : TEXTOID,
4755 : CurrentMemoryContext);
4756 : }
4757 730 : }
4758 :
4759 : /*
4760 : * array_to_text
4761 : * concatenate Cstring representation of input array elements
4762 : * using provided field separator
4763 : */
4764 : Datum
4765 63976 : array_to_text(PG_FUNCTION_ARGS)
4766 : {
4767 63976 : ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
4768 63976 : char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4769 :
4770 63976 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4771 : }
4772 :
4773 : /*
4774 : * array_to_text_null
4775 : * concatenate Cstring representation of input array elements
4776 : * using provided field separator and null string
4777 : *
4778 : * This version is not strict so we have to test for null inputs explicitly.
4779 : */
4780 : Datum
4781 12 : array_to_text_null(PG_FUNCTION_ARGS)
4782 : {
4783 : ArrayType *v;
4784 : char *fldsep;
4785 : char *null_string;
4786 :
4787 : /* returns NULL when first or second parameter is NULL */
4788 12 : if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4789 0 : PG_RETURN_NULL();
4790 :
4791 12 : v = PG_GETARG_ARRAYTYPE_P(0);
4792 12 : fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4793 :
4794 : /* NULL null string is passed through as a null pointer */
4795 12 : if (!PG_ARGISNULL(2))
4796 6 : null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4797 : else
4798 6 : null_string = NULL;
4799 :
4800 12 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4801 : }
4802 :
4803 : /*
4804 : * common code for array_to_text and array_to_text_null functions
4805 : */
4806 : static text *
4807 64006 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4808 : const char *fldsep, const char *null_string)
4809 : {
4810 : text *result;
4811 : int nitems,
4812 : *dims,
4813 : ndims;
4814 : Oid element_type;
4815 : int typlen;
4816 : bool typbyval;
4817 : char typalign;
4818 : StringInfoData buf;
4819 64006 : bool printed = false;
4820 : char *p;
4821 : bits8 *bitmap;
4822 : int bitmask;
4823 : int i;
4824 : ArrayMetaState *my_extra;
4825 :
4826 64006 : ndims = ARR_NDIM(v);
4827 64006 : dims = ARR_DIMS(v);
4828 64006 : nitems = ArrayGetNItems(ndims, dims);
4829 :
4830 : /* if there are no elements, return an empty string */
4831 64006 : if (nitems == 0)
4832 39984 : return cstring_to_text_with_len("", 0);
4833 :
4834 24022 : element_type = ARR_ELEMTYPE(v);
4835 24022 : initStringInfo(&buf);
4836 :
4837 : /*
4838 : * We arrange to look up info about element type, including its output
4839 : * conversion proc, only once per series of calls, assuming the element
4840 : * type doesn't change underneath us.
4841 : */
4842 24022 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4843 24022 : if (my_extra == NULL)
4844 : {
4845 1364 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4846 : sizeof(ArrayMetaState));
4847 1364 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4848 1364 : my_extra->element_type = ~element_type;
4849 : }
4850 :
4851 24022 : if (my_extra->element_type != element_type)
4852 : {
4853 : /*
4854 : * Get info about element type, including its output conversion proc
4855 : */
4856 1364 : get_type_io_data(element_type, IOFunc_output,
4857 : &my_extra->typlen, &my_extra->typbyval,
4858 : &my_extra->typalign, &my_extra->typdelim,
4859 : &my_extra->typioparam, &my_extra->typiofunc);
4860 1364 : fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4861 1364 : fcinfo->flinfo->fn_mcxt);
4862 1364 : my_extra->element_type = element_type;
4863 : }
4864 24022 : typlen = my_extra->typlen;
4865 24022 : typbyval = my_extra->typbyval;
4866 24022 : typalign = my_extra->typalign;
4867 :
4868 24022 : p = ARR_DATA_PTR(v);
4869 24022 : bitmap = ARR_NULLBITMAP(v);
4870 24022 : bitmask = 1;
4871 :
4872 81792 : for (i = 0; i < nitems; i++)
4873 : {
4874 : Datum itemvalue;
4875 : char *value;
4876 :
4877 : /* Get source element, checking for NULL */
4878 57770 : if (bitmap && (*bitmap & bitmask) == 0)
4879 : {
4880 : /* if null_string is NULL, we just ignore null elements */
4881 18 : if (null_string != NULL)
4882 : {
4883 6 : if (printed)
4884 6 : appendStringInfo(&buf, "%s%s", fldsep, null_string);
4885 : else
4886 0 : appendStringInfoString(&buf, null_string);
4887 6 : printed = true;
4888 : }
4889 : }
4890 : else
4891 : {
4892 57752 : itemvalue = fetch_att(p, typbyval, typlen);
4893 :
4894 57752 : value = OutputFunctionCall(&my_extra->proc, itemvalue);
4895 :
4896 57752 : if (printed)
4897 33730 : appendStringInfo(&buf, "%s%s", fldsep, value);
4898 : else
4899 24022 : appendStringInfoString(&buf, value);
4900 57752 : printed = true;
4901 :
4902 57752 : p = att_addlength_pointer(p, typlen, p);
4903 57752 : p = (char *) att_align_nominal(p, typalign);
4904 : }
4905 :
4906 : /* advance bitmap pointer if any */
4907 57770 : if (bitmap)
4908 : {
4909 108 : bitmask <<= 1;
4910 108 : if (bitmask == 0x100)
4911 : {
4912 0 : bitmap++;
4913 0 : bitmask = 1;
4914 : }
4915 : }
4916 : }
4917 :
4918 24022 : result = cstring_to_text_with_len(buf.data, buf.len);
4919 24022 : pfree(buf.data);
4920 :
4921 24022 : return result;
4922 : }
4923 :
4924 : /*
4925 : * Workhorse for to_bin, to_oct, and to_hex. Note that base must be > 1 and <=
4926 : * 16.
4927 : */
4928 : static inline text *
4929 38750 : convert_to_base(uint64 value, int base)
4930 : {
4931 38750 : const char *digits = "0123456789abcdef";
4932 :
4933 : /* We size the buffer for to_bin's longest possible return value. */
4934 : char buf[sizeof(uint64) * BITS_PER_BYTE];
4935 38750 : char *const end = buf + sizeof(buf);
4936 38750 : char *ptr = end;
4937 :
4938 : Assert(base > 1);
4939 : Assert(base <= 16);
4940 :
4941 : do
4942 : {
4943 75974 : *--ptr = digits[value % base];
4944 75974 : value /= base;
4945 75974 : } while (ptr > buf && value);
4946 :
4947 38750 : return cstring_to_text_with_len(ptr, end - ptr);
4948 : }
4949 :
4950 : /*
4951 : * Convert an integer to a string containing a base-2 (binary) representation
4952 : * of the number.
4953 : */
4954 : Datum
4955 12 : to_bin32(PG_FUNCTION_ARGS)
4956 : {
4957 12 : uint64 value = (uint32) PG_GETARG_INT32(0);
4958 :
4959 12 : PG_RETURN_TEXT_P(convert_to_base(value, 2));
4960 : }
4961 : Datum
4962 12 : to_bin64(PG_FUNCTION_ARGS)
4963 : {
4964 12 : uint64 value = (uint64) PG_GETARG_INT64(0);
4965 :
4966 12 : PG_RETURN_TEXT_P(convert_to_base(value, 2));
4967 : }
4968 :
4969 : /*
4970 : * Convert an integer to a string containing a base-8 (oct) representation of
4971 : * the number.
4972 : */
4973 : Datum
4974 12 : to_oct32(PG_FUNCTION_ARGS)
4975 : {
4976 12 : uint64 value = (uint32) PG_GETARG_INT32(0);
4977 :
4978 12 : PG_RETURN_TEXT_P(convert_to_base(value, 8));
4979 : }
4980 : Datum
4981 12 : to_oct64(PG_FUNCTION_ARGS)
4982 : {
4983 12 : uint64 value = (uint64) PG_GETARG_INT64(0);
4984 :
4985 12 : PG_RETURN_TEXT_P(convert_to_base(value, 8));
4986 : }
4987 :
4988 : /*
4989 : * Convert an integer to a string containing a base-16 (hex) representation of
4990 : * the number.
4991 : */
4992 : Datum
4993 38690 : to_hex32(PG_FUNCTION_ARGS)
4994 : {
4995 38690 : uint64 value = (uint32) PG_GETARG_INT32(0);
4996 :
4997 38690 : PG_RETURN_TEXT_P(convert_to_base(value, 16));
4998 : }
4999 : Datum
5000 12 : to_hex64(PG_FUNCTION_ARGS)
5001 : {
5002 12 : uint64 value = (uint64) PG_GETARG_INT64(0);
5003 :
5004 12 : PG_RETURN_TEXT_P(convert_to_base(value, 16));
5005 : }
5006 :
5007 : /*
5008 : * Return the size of a datum, possibly compressed
5009 : *
5010 : * Works on any data type
5011 : */
5012 : Datum
5013 122 : pg_column_size(PG_FUNCTION_ARGS)
5014 : {
5015 122 : Datum value = PG_GETARG_DATUM(0);
5016 : int32 result;
5017 : int typlen;
5018 :
5019 : /* On first call, get the input type's typlen, and save at *fn_extra */
5020 122 : if (fcinfo->flinfo->fn_extra == NULL)
5021 : {
5022 : /* Lookup the datatype of the supplied argument */
5023 122 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5024 :
5025 122 : typlen = get_typlen(argtypeid);
5026 122 : if (typlen == 0) /* should not happen */
5027 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5028 :
5029 122 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5030 : sizeof(int));
5031 122 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5032 : }
5033 : else
5034 0 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5035 :
5036 122 : if (typlen == -1)
5037 : {
5038 : /* varlena type, possibly toasted */
5039 122 : result = toast_datum_size(value);
5040 : }
5041 0 : else if (typlen == -2)
5042 : {
5043 : /* cstring */
5044 0 : result = strlen(DatumGetCString(value)) + 1;
5045 : }
5046 : else
5047 : {
5048 : /* ordinary fixed-width type */
5049 0 : result = typlen;
5050 : }
5051 :
5052 122 : PG_RETURN_INT32(result);
5053 : }
5054 :
5055 : /*
5056 : * Return the compression method stored in the compressed attribute. Return
5057 : * NULL for non varlena type or uncompressed data.
5058 : */
5059 : Datum
5060 162 : pg_column_compression(PG_FUNCTION_ARGS)
5061 : {
5062 : int typlen;
5063 : char *result;
5064 : ToastCompressionId cmid;
5065 :
5066 : /* On first call, get the input type's typlen, and save at *fn_extra */
5067 162 : if (fcinfo->flinfo->fn_extra == NULL)
5068 : {
5069 : /* Lookup the datatype of the supplied argument */
5070 108 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5071 :
5072 108 : typlen = get_typlen(argtypeid);
5073 108 : if (typlen == 0) /* should not happen */
5074 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5075 :
5076 108 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5077 : sizeof(int));
5078 108 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5079 : }
5080 : else
5081 54 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5082 :
5083 162 : if (typlen != -1)
5084 0 : PG_RETURN_NULL();
5085 :
5086 : /* get the compression method id stored in the compressed varlena */
5087 162 : cmid = toast_get_compression_id((struct varlena *)
5088 162 : DatumGetPointer(PG_GETARG_DATUM(0)));
5089 162 : if (cmid == TOAST_INVALID_COMPRESSION_ID)
5090 6 : PG_RETURN_NULL();
5091 :
5092 : /* convert compression method id to compression method name */
5093 156 : switch (cmid)
5094 : {
5095 66 : case TOAST_PGLZ_COMPRESSION_ID:
5096 66 : result = "pglz";
5097 66 : break;
5098 90 : case TOAST_LZ4_COMPRESSION_ID:
5099 90 : result = "lz4";
5100 90 : break;
5101 0 : default:
5102 0 : elog(ERROR, "invalid compression method id %d", cmid);
5103 : }
5104 :
5105 156 : PG_RETURN_TEXT_P(cstring_to_text(result));
5106 : }
5107 :
5108 : /*
5109 : * string_agg - Concatenates values and returns string.
5110 : *
5111 : * Syntax: string_agg(value text, delimiter text) RETURNS text
5112 : *
5113 : * Note: Any NULL values are ignored. The first-call delimiter isn't
5114 : * actually used at all, and on subsequent calls the delimiter precedes
5115 : * the associated value.
5116 : */
5117 :
5118 : /* subroutine to initialize state */
5119 : static StringInfo
5120 2290 : makeStringAggState(FunctionCallInfo fcinfo)
5121 : {
5122 : StringInfo state;
5123 : MemoryContext aggcontext;
5124 : MemoryContext oldcontext;
5125 :
5126 2290 : if (!AggCheckCallContext(fcinfo, &aggcontext))
5127 : {
5128 : /* cannot be called directly because of internal-type argument */
5129 0 : elog(ERROR, "string_agg_transfn called in non-aggregate context");
5130 : }
5131 :
5132 : /*
5133 : * Create state in aggregate context. It'll stay there across subsequent
5134 : * calls.
5135 : */
5136 2290 : oldcontext = MemoryContextSwitchTo(aggcontext);
5137 2290 : state = makeStringInfo();
5138 2290 : MemoryContextSwitchTo(oldcontext);
5139 :
5140 2290 : return state;
5141 : }
5142 :
5143 : Datum
5144 852498 : string_agg_transfn(PG_FUNCTION_ARGS)
5145 : {
5146 : StringInfo state;
5147 :
5148 852498 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5149 :
5150 : /* Append the value unless null, preceding it with the delimiter. */
5151 852498 : if (!PG_ARGISNULL(1))
5152 : {
5153 837450 : text *value = PG_GETARG_TEXT_PP(1);
5154 837450 : bool isfirst = false;
5155 :
5156 : /*
5157 : * You might think we can just throw away the first delimiter, however
5158 : * we must keep it as we may be a parallel worker doing partial
5159 : * aggregation building a state to send to the main process. We need
5160 : * to keep the delimiter of every aggregation so that the combine
5161 : * function can properly join up the strings of two separately
5162 : * partially aggregated results. The first delimiter is only stripped
5163 : * off in the final function. To know how much to strip off the front
5164 : * of the string, we store the length of the first delimiter in the
5165 : * StringInfo's cursor field, which we don't otherwise need here.
5166 : */
5167 837450 : if (state == NULL)
5168 : {
5169 1902 : state = makeStringAggState(fcinfo);
5170 1902 : isfirst = true;
5171 : }
5172 :
5173 837450 : if (!PG_ARGISNULL(2))
5174 : {
5175 837450 : text *delim = PG_GETARG_TEXT_PP(2);
5176 :
5177 837450 : appendStringInfoText(state, delim);
5178 837450 : if (isfirst)
5179 1902 : state->cursor = VARSIZE_ANY_EXHDR(delim);
5180 : }
5181 :
5182 837450 : appendStringInfoText(state, value);
5183 : }
5184 :
5185 : /*
5186 : * The transition type for string_agg() is declared to be "internal",
5187 : * which is a pass-by-value type the same size as a pointer.
5188 : */
5189 852498 : if (state)
5190 852420 : PG_RETURN_POINTER(state);
5191 78 : PG_RETURN_NULL();
5192 : }
5193 :
5194 : /*
5195 : * string_agg_combine
5196 : * Aggregate combine function for string_agg(text) and string_agg(bytea)
5197 : */
5198 : Datum
5199 120 : string_agg_combine(PG_FUNCTION_ARGS)
5200 : {
5201 : StringInfo state1;
5202 : StringInfo state2;
5203 : MemoryContext agg_context;
5204 :
5205 120 : if (!AggCheckCallContext(fcinfo, &agg_context))
5206 0 : elog(ERROR, "aggregate function called in non-aggregate context");
5207 :
5208 120 : state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5209 120 : state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
5210 :
5211 120 : if (state2 == NULL)
5212 : {
5213 : /*
5214 : * NULL state2 is easy, just return state1, which we know is already
5215 : * in the agg_context
5216 : */
5217 0 : if (state1 == NULL)
5218 0 : PG_RETURN_NULL();
5219 0 : PG_RETURN_POINTER(state1);
5220 : }
5221 :
5222 120 : if (state1 == NULL)
5223 : {
5224 : /* We must copy state2's data into the agg_context */
5225 : MemoryContext old_context;
5226 :
5227 120 : old_context = MemoryContextSwitchTo(agg_context);
5228 120 : state1 = makeStringAggState(fcinfo);
5229 120 : appendBinaryStringInfo(state1, state2->data, state2->len);
5230 120 : state1->cursor = state2->cursor;
5231 120 : MemoryContextSwitchTo(old_context);
5232 : }
5233 0 : else if (state2->len > 0)
5234 : {
5235 : /* Combine ... state1->cursor does not change in this case */
5236 0 : appendBinaryStringInfo(state1, state2->data, state2->len);
5237 : }
5238 :
5239 120 : PG_RETURN_POINTER(state1);
5240 : }
5241 :
5242 : /*
5243 : * string_agg_serialize
5244 : * Aggregate serialize function for string_agg(text) and string_agg(bytea)
5245 : *
5246 : * This is strict, so we need not handle NULL input
5247 : */
5248 : Datum
5249 120 : string_agg_serialize(PG_FUNCTION_ARGS)
5250 : {
5251 : StringInfo state;
5252 : StringInfoData buf;
5253 : bytea *result;
5254 :
5255 : /* cannot be called directly because of internal-type argument */
5256 : Assert(AggCheckCallContext(fcinfo, NULL));
5257 :
5258 120 : state = (StringInfo) PG_GETARG_POINTER(0);
5259 :
5260 120 : pq_begintypsend(&buf);
5261 :
5262 : /* cursor */
5263 120 : pq_sendint(&buf, state->cursor, 4);
5264 :
5265 : /* data */
5266 120 : pq_sendbytes(&buf, state->data, state->len);
5267 :
5268 120 : result = pq_endtypsend(&buf);
5269 :
5270 120 : PG_RETURN_BYTEA_P(result);
5271 : }
5272 :
5273 : /*
5274 : * string_agg_deserialize
5275 : * Aggregate deserial function for string_agg(text) and string_agg(bytea)
5276 : *
5277 : * This is strict, so we need not handle NULL input
5278 : */
5279 : Datum
5280 120 : string_agg_deserialize(PG_FUNCTION_ARGS)
5281 : {
5282 : bytea *sstate;
5283 : StringInfo result;
5284 : StringInfoData buf;
5285 : char *data;
5286 : int datalen;
5287 :
5288 : /* cannot be called directly because of internal-type argument */
5289 : Assert(AggCheckCallContext(fcinfo, NULL));
5290 :
5291 120 : sstate = PG_GETARG_BYTEA_PP(0);
5292 :
5293 : /*
5294 : * Initialize a StringInfo so that we can "receive" it using the standard
5295 : * recv-function infrastructure.
5296 : */
5297 120 : initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
5298 120 : VARSIZE_ANY_EXHDR(sstate));
5299 :
5300 120 : result = makeStringAggState(fcinfo);
5301 :
5302 : /* cursor */
5303 120 : result->cursor = pq_getmsgint(&buf, 4);
5304 :
5305 : /* data */
5306 120 : datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
5307 120 : data = (char *) pq_getmsgbytes(&buf, datalen);
5308 120 : appendBinaryStringInfo(result, data, datalen);
5309 :
5310 120 : pq_getmsgend(&buf);
5311 :
5312 120 : PG_RETURN_POINTER(result);
5313 : }
5314 :
5315 : Datum
5316 1974 : string_agg_finalfn(PG_FUNCTION_ARGS)
5317 : {
5318 : StringInfo state;
5319 :
5320 : /* cannot be called directly because of internal-type argument */
5321 : Assert(AggCheckCallContext(fcinfo, NULL));
5322 :
5323 1974 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5324 :
5325 1974 : if (state != NULL)
5326 : {
5327 : /* As per comment in transfn, strip data before the cursor position */
5328 1902 : PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
5329 : state->len - state->cursor));
5330 : }
5331 : else
5332 72 : PG_RETURN_NULL();
5333 : }
5334 :
5335 : /*
5336 : * Prepare cache with fmgr info for the output functions of the datatypes of
5337 : * the arguments of a concat-like function, beginning with argument "argidx".
5338 : * (Arguments before that will have corresponding slots in the resulting
5339 : * FmgrInfo array, but we don't fill those slots.)
5340 : */
5341 : static FmgrInfo *
5342 46 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5343 : {
5344 : FmgrInfo *foutcache;
5345 : int i;
5346 :
5347 : /* We keep the info in fn_mcxt so it survives across calls */
5348 46 : foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5349 46 : PG_NARGS() * sizeof(FmgrInfo));
5350 :
5351 220 : for (i = argidx; i < PG_NARGS(); i++)
5352 : {
5353 : Oid valtype;
5354 : Oid typOutput;
5355 : bool typIsVarlena;
5356 :
5357 174 : valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5358 174 : if (!OidIsValid(valtype))
5359 0 : elog(ERROR, "could not determine data type of concat() input");
5360 :
5361 174 : getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5362 174 : fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5363 : }
5364 :
5365 46 : fcinfo->flinfo->fn_extra = foutcache;
5366 :
5367 46 : return foutcache;
5368 : }
5369 :
5370 : /*
5371 : * Implementation of both concat() and concat_ws().
5372 : *
5373 : * sepstr is the separator string to place between values.
5374 : * argidx identifies the first argument to concatenate (counting from zero);
5375 : * note that this must be constant across any one series of calls.
5376 : *
5377 : * Returns NULL if result should be NULL, else text value.
5378 : */
5379 : static text *
5380 114 : concat_internal(const char *sepstr, int argidx,
5381 : FunctionCallInfo fcinfo)
5382 : {
5383 : text *result;
5384 : StringInfoData str;
5385 : FmgrInfo *foutcache;
5386 114 : bool first_arg = true;
5387 : int i;
5388 :
5389 : /*
5390 : * concat(VARIADIC some-array) is essentially equivalent to
5391 : * array_to_text(), ie concat the array elements with the given separator.
5392 : * So we just pass the case off to that code.
5393 : */
5394 114 : if (get_fn_expr_variadic(fcinfo->flinfo))
5395 : {
5396 : ArrayType *arr;
5397 :
5398 : /* Should have just the one argument */
5399 : Assert(argidx == PG_NARGS() - 1);
5400 :
5401 : /* concat(VARIADIC NULL) is defined as NULL */
5402 30 : if (PG_ARGISNULL(argidx))
5403 12 : return NULL;
5404 :
5405 : /*
5406 : * Non-null argument had better be an array. We assume that any call
5407 : * context that could let get_fn_expr_variadic return true will have
5408 : * checked that a VARIADIC-labeled parameter actually is an array. So
5409 : * it should be okay to just Assert that it's an array rather than
5410 : * doing a full-fledged error check.
5411 : */
5412 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5413 :
5414 : /* OK, safe to fetch the array value */
5415 18 : arr = PG_GETARG_ARRAYTYPE_P(argidx);
5416 :
5417 : /*
5418 : * And serialize the array. We tell array_to_text to ignore null
5419 : * elements, which matches the behavior of the loop below.
5420 : */
5421 18 : return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5422 : }
5423 :
5424 : /* Normal case without explicit VARIADIC marker */
5425 84 : initStringInfo(&str);
5426 :
5427 : /* Get output function info, building it if first time through */
5428 84 : foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5429 84 : if (foutcache == NULL)
5430 46 : foutcache = build_concat_foutcache(fcinfo, argidx);
5431 :
5432 372 : for (i = argidx; i < PG_NARGS(); i++)
5433 : {
5434 288 : if (!PG_ARGISNULL(i))
5435 : {
5436 210 : Datum value = PG_GETARG_DATUM(i);
5437 :
5438 : /* add separator if appropriate */
5439 210 : if (first_arg)
5440 78 : first_arg = false;
5441 : else
5442 132 : appendStringInfoString(&str, sepstr);
5443 :
5444 : /* call the appropriate type output function, append the result */
5445 210 : appendStringInfoString(&str,
5446 210 : OutputFunctionCall(&foutcache[i], value));
5447 : }
5448 : }
5449 :
5450 84 : result = cstring_to_text_with_len(str.data, str.len);
5451 84 : pfree(str.data);
5452 :
5453 84 : return result;
5454 : }
5455 :
5456 : /*
5457 : * Concatenate all arguments. NULL arguments are ignored.
5458 : */
5459 : Datum
5460 36 : text_concat(PG_FUNCTION_ARGS)
5461 : {
5462 : text *result;
5463 :
5464 36 : result = concat_internal("", 0, fcinfo);
5465 36 : if (result == NULL)
5466 6 : PG_RETURN_NULL();
5467 30 : PG_RETURN_TEXT_P(result);
5468 : }
5469 :
5470 : /*
5471 : * Concatenate all but first argument value with separators. The first
5472 : * parameter is used as the separator. NULL arguments are ignored.
5473 : */
5474 : Datum
5475 84 : text_concat_ws(PG_FUNCTION_ARGS)
5476 : {
5477 : char *sep;
5478 : text *result;
5479 :
5480 : /* return NULL when separator is NULL */
5481 84 : if (PG_ARGISNULL(0))
5482 6 : PG_RETURN_NULL();
5483 78 : sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5484 :
5485 78 : result = concat_internal(sep, 1, fcinfo);
5486 78 : if (result == NULL)
5487 6 : PG_RETURN_NULL();
5488 72 : PG_RETURN_TEXT_P(result);
5489 : }
5490 :
5491 : /*
5492 : * Return first n characters in the string. When n is negative,
5493 : * return all but last |n| characters.
5494 : */
5495 : Datum
5496 1884 : text_left(PG_FUNCTION_ARGS)
5497 : {
5498 1884 : int n = PG_GETARG_INT32(1);
5499 :
5500 1884 : if (n < 0)
5501 : {
5502 30 : text *str = PG_GETARG_TEXT_PP(0);
5503 30 : const char *p = VARDATA_ANY(str);
5504 30 : int len = VARSIZE_ANY_EXHDR(str);
5505 : int rlen;
5506 :
5507 30 : n = pg_mbstrlen_with_len(p, len) + n;
5508 30 : rlen = pg_mbcharcliplen(p, len, n);
5509 30 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5510 : }
5511 : else
5512 1854 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5513 : }
5514 :
5515 : /*
5516 : * Return last n characters in the string. When n is negative,
5517 : * return all but first |n| characters.
5518 : */
5519 : Datum
5520 66 : text_right(PG_FUNCTION_ARGS)
5521 : {
5522 66 : text *str = PG_GETARG_TEXT_PP(0);
5523 66 : const char *p = VARDATA_ANY(str);
5524 66 : int len = VARSIZE_ANY_EXHDR(str);
5525 66 : int n = PG_GETARG_INT32(1);
5526 : int off;
5527 :
5528 66 : if (n < 0)
5529 30 : n = -n;
5530 : else
5531 36 : n = pg_mbstrlen_with_len(p, len) - n;
5532 66 : off = pg_mbcharcliplen(p, len, n);
5533 :
5534 66 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5535 : }
5536 :
5537 : /*
5538 : * Return reversed string
5539 : */
5540 : Datum
5541 6 : text_reverse(PG_FUNCTION_ARGS)
5542 : {
5543 6 : text *str = PG_GETARG_TEXT_PP(0);
5544 6 : const char *p = VARDATA_ANY(str);
5545 6 : int len = VARSIZE_ANY_EXHDR(str);
5546 6 : const char *endp = p + len;
5547 : text *result;
5548 : char *dst;
5549 :
5550 6 : result = palloc(len + VARHDRSZ);
5551 6 : dst = (char *) VARDATA(result) + len;
5552 6 : SET_VARSIZE(result, len + VARHDRSZ);
5553 :
5554 6 : if (pg_database_encoding_max_length() > 1)
5555 : {
5556 : /* multibyte version */
5557 12 : while (p < endp)
5558 : {
5559 : int sz;
5560 :
5561 10 : sz = pg_mblen(p);
5562 10 : dst -= sz;
5563 10 : memcpy(dst, p, sz);
5564 10 : p += sz;
5565 : }
5566 : }
5567 : else
5568 : {
5569 : /* single byte version */
5570 24 : while (p < endp)
5571 20 : *(--dst) = *p++;
5572 : }
5573 :
5574 6 : PG_RETURN_TEXT_P(result);
5575 : }
5576 :
5577 :
/*
 * Support macros for text_format()
 */

/* Bit in the parsed "flags" word: set when a '-' flag is present. */
#define TEXT_FORMAT_FLAG_MINUS	0x0001

/*
 * Move "ptr" forward by one character.  If that brings it to (or past)
 * "end_ptr", the format string ended in the middle of a specifier, so raise
 * an error.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if ((end_ptr) <= ++(ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5591 :
5592 : /*
5593 : * Returns a formatted string
5594 : */
5595 : Datum
5596 25398 : text_format(PG_FUNCTION_ARGS)
5597 : {
5598 : text *fmt;
5599 : StringInfoData str;
5600 : const char *cp;
5601 : const char *start_ptr;
5602 : const char *end_ptr;
5603 : text *result;
5604 : int arg;
5605 : bool funcvariadic;
5606 : int nargs;
5607 25398 : Datum *elements = NULL;
5608 25398 : bool *nulls = NULL;
5609 25398 : Oid element_type = InvalidOid;
5610 25398 : Oid prev_type = InvalidOid;
5611 25398 : Oid prev_width_type = InvalidOid;
5612 : FmgrInfo typoutputfinfo;
5613 : FmgrInfo typoutputinfo_width;
5614 :
5615 : /* When format string is null, immediately return null */
5616 25398 : if (PG_ARGISNULL(0))
5617 6 : PG_RETURN_NULL();
5618 :
5619 : /* If argument is marked VARIADIC, expand array into elements */
5620 25392 : if (get_fn_expr_variadic(fcinfo->flinfo))
5621 : {
5622 : ArrayType *arr;
5623 : int16 elmlen;
5624 : bool elmbyval;
5625 : char elmalign;
5626 : int nitems;
5627 :
5628 : /* Should have just the one argument */
5629 : Assert(PG_NARGS() == 2);
5630 :
5631 : /* If argument is NULL, we treat it as zero-length array */
5632 48 : if (PG_ARGISNULL(1))
5633 6 : nitems = 0;
5634 : else
5635 : {
5636 : /*
5637 : * Non-null argument had better be an array. We assume that any
5638 : * call context that could let get_fn_expr_variadic return true
5639 : * will have checked that a VARIADIC-labeled parameter actually is
5640 : * an array. So it should be okay to just Assert that it's an
5641 : * array rather than doing a full-fledged error check.
5642 : */
5643 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5644 :
5645 : /* OK, safe to fetch the array value */
5646 42 : arr = PG_GETARG_ARRAYTYPE_P(1);
5647 :
5648 : /* Get info about array element type */
5649 42 : element_type = ARR_ELEMTYPE(arr);
5650 42 : get_typlenbyvalalign(element_type,
5651 : &elmlen, &elmbyval, &elmalign);
5652 :
5653 : /* Extract all array elements */
5654 42 : deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5655 : &elements, &nulls, &nitems);
5656 : }
5657 :
5658 48 : nargs = nitems + 1;
5659 48 : funcvariadic = true;
5660 : }
5661 : else
5662 : {
5663 : /* Non-variadic case, we'll process the arguments individually */
5664 25344 : nargs = PG_NARGS();
5665 25344 : funcvariadic = false;
5666 : }
5667 :
5668 : /* Setup for main loop. */
5669 25392 : fmt = PG_GETARG_TEXT_PP(0);
5670 25392 : start_ptr = VARDATA_ANY(fmt);
5671 25392 : end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5672 25392 : initStringInfo(&str);
5673 25392 : arg = 1; /* next argument position to print */
5674 :
5675 : /* Scan format string, looking for conversion specifiers. */
5676 735936 : for (cp = start_ptr; cp < end_ptr; cp++)
5677 : {
5678 : int argpos;
5679 : int widthpos;
5680 : int flags;
5681 : int width;
5682 : Datum value;
5683 : bool isNull;
5684 : Oid typid;
5685 :
5686 : /*
5687 : * If it's not the start of a conversion specifier, just copy it to
5688 : * the output buffer.
5689 : */
5690 710604 : if (*cp != '%')
5691 : {
5692 650868 : appendStringInfoCharMacro(&str, *cp);
5693 650886 : continue;
5694 : }
5695 :
5696 59736 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5697 :
5698 : /* Easy case: %% outputs a single % */
5699 59736 : if (*cp == '%')
5700 : {
5701 18 : appendStringInfoCharMacro(&str, *cp);
5702 18 : continue;
5703 : }
5704 :
5705 : /* Parse the optional portions of the format specifier */
5706 59718 : cp = text_format_parse_format(cp, end_ptr,
5707 : &argpos, &widthpos,
5708 : &flags, &width);
5709 :
5710 : /*
5711 : * Next we should see the main conversion specifier. Whether or not
5712 : * an argument position was present, it's known that at least one
5713 : * character remains in the string at this point. Experience suggests
5714 : * that it's worth checking that that character is one of the expected
5715 : * ones before we try to fetch arguments, so as to produce the least
5716 : * confusing response to a mis-formatted specifier.
5717 : */
5718 59694 : if (strchr("sIL", *cp) == NULL)
5719 6 : ereport(ERROR,
5720 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5721 : errmsg("unrecognized format() type specifier \"%.*s\"",
5722 : pg_mblen(cp), cp),
5723 : errhint("For a single \"%%\" use \"%%%%\".")));
5724 :
5725 : /* If indirect width was specified, get its value */
5726 59688 : if (widthpos >= 0)
5727 : {
5728 : /* Collect the specified or next argument position */
5729 42 : if (widthpos > 0)
5730 36 : arg = widthpos;
5731 42 : if (arg >= nargs)
5732 0 : ereport(ERROR,
5733 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5734 : errmsg("too few arguments for format()")));
5735 :
5736 : /* Get the value and type of the selected argument */
5737 42 : if (!funcvariadic)
5738 : {
5739 42 : value = PG_GETARG_DATUM(arg);
5740 42 : isNull = PG_ARGISNULL(arg);
5741 42 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5742 : }
5743 : else
5744 : {
5745 0 : value = elements[arg - 1];
5746 0 : isNull = nulls[arg - 1];
5747 0 : typid = element_type;
5748 : }
5749 42 : if (!OidIsValid(typid))
5750 0 : elog(ERROR, "could not determine data type of format() input");
5751 :
5752 42 : arg++;
5753 :
5754 : /* We can treat NULL width the same as zero */
5755 42 : if (isNull)
5756 6 : width = 0;
5757 36 : else if (typid == INT4OID)
5758 36 : width = DatumGetInt32(value);
5759 0 : else if (typid == INT2OID)
5760 0 : width = DatumGetInt16(value);
5761 : else
5762 : {
5763 : /* For less-usual datatypes, convert to text then to int */
5764 : char *str;
5765 :
5766 0 : if (typid != prev_width_type)
5767 : {
5768 : Oid typoutputfunc;
5769 : bool typIsVarlena;
5770 :
5771 0 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5772 0 : fmgr_info(typoutputfunc, &typoutputinfo_width);
5773 0 : prev_width_type = typid;
5774 : }
5775 :
5776 0 : str = OutputFunctionCall(&typoutputinfo_width, value);
5777 :
5778 : /* pg_strtoint32 will complain about bad data or overflow */
5779 0 : width = pg_strtoint32(str);
5780 :
5781 0 : pfree(str);
5782 : }
5783 : }
5784 :
5785 : /* Collect the specified or next argument position */
5786 59688 : if (argpos > 0)
5787 132 : arg = argpos;
5788 59688 : if (arg >= nargs)
5789 24 : ereport(ERROR,
5790 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5791 : errmsg("too few arguments for format()")));
5792 :
5793 : /* Get the value and type of the selected argument */
5794 59664 : if (!funcvariadic)
5795 : {
5796 58392 : value = PG_GETARG_DATUM(arg);
5797 58392 : isNull = PG_ARGISNULL(arg);
5798 58392 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5799 : }
5800 : else
5801 : {
5802 1272 : value = elements[arg - 1];
5803 1272 : isNull = nulls[arg - 1];
5804 1272 : typid = element_type;
5805 : }
5806 59664 : if (!OidIsValid(typid))
5807 0 : elog(ERROR, "could not determine data type of format() input");
5808 :
5809 59664 : arg++;
5810 :
5811 : /*
5812 : * Get the appropriate typOutput function, reusing previous one if
5813 : * same type as previous argument. That's particularly useful in the
5814 : * variadic-array case, but often saves work even for ordinary calls.
5815 : */
5816 59664 : if (typid != prev_type)
5817 : {
5818 : Oid typoutputfunc;
5819 : bool typIsVarlena;
5820 :
5821 28506 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5822 28506 : fmgr_info(typoutputfunc, &typoutputfinfo);
5823 28506 : prev_type = typid;
5824 : }
5825 :
5826 : /*
5827 : * And now we can format the value.
5828 : */
5829 59664 : switch (*cp)
5830 : {
5831 59664 : case 's':
5832 : case 'I':
5833 : case 'L':
5834 59664 : text_format_string_conversion(&str, *cp, &typoutputfinfo,
5835 : value, isNull,
5836 : flags, width);
5837 59658 : break;
5838 0 : default:
5839 : /* should not get here, because of previous check */
5840 0 : ereport(ERROR,
5841 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5842 : errmsg("unrecognized format() type specifier \"%.*s\"",
5843 : pg_mblen(cp), cp),
5844 : errhint("For a single \"%%\" use \"%%%%\".")));
5845 : break;
5846 : }
5847 : }
5848 :
5849 : /* Don't need deconstruct_array results anymore. */
5850 25332 : if (elements != NULL)
5851 42 : pfree(elements);
5852 25332 : if (nulls != NULL)
5853 42 : pfree(nulls);
5854 :
5855 : /* Generate results. */
5856 25332 : result = cstring_to_text_with_len(str.data, str.len);
5857 25332 : pfree(str.data);
5858 :
5859 25332 : PG_RETURN_TEXT_P(result);
5860 : }
5861 :
5862 : /*
5863 : * Parse contiguous digits as a decimal number.
5864 : *
5865 : * Returns true if some digits could be parsed.
5866 : * The value is returned into *value, and *ptr is advanced to the next
5867 : * character to be parsed.
5868 : *
5869 : * Note parsing invariant: at least one character is known available before
5870 : * string end (end_ptr) at entry, and this is still true at exit.
5871 : */
5872 : static bool
5873 119400 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5874 : {
5875 119400 : bool found = false;
5876 119400 : const char *cp = *ptr;
5877 119400 : int val = 0;
5878 :
5879 119712 : while (*cp >= '0' && *cp <= '9')
5880 : {
5881 318 : int8 digit = (*cp - '0');
5882 :
5883 318 : if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5884 318 : unlikely(pg_add_s32_overflow(val, digit, &val)))
5885 0 : ereport(ERROR,
5886 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5887 : errmsg("number is out of range")));
5888 318 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5889 312 : found = true;
5890 : }
5891 :
5892 119394 : *ptr = cp;
5893 119394 : *value = val;
5894 :
5895 119394 : return found;
5896 : }
5897 :
5898 : /*
5899 : * Parse a format specifier (generally following the SUS printf spec).
5900 : *
5901 : * We have already advanced over the initial '%', and we are looking for
5902 : * [argpos][flags][width]type (but the type character is not consumed here).
5903 : *
5904 : * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5905 : * Output parameters:
5906 : * argpos: argument position for value to be printed. -1 means unspecified.
5907 : * widthpos: argument position for width. Zero means the argument position
5908 : * was unspecified (ie, take the next arg) and -1 means no width
5909 : * argument (width was omitted or specified as a constant).
5910 : * flags: bitmask of flags.
5911 : * width: directly-specified width value. Zero means the width was omitted
5912 : * (note it's not necessary to distinguish this case from an explicit
5913 : * zero width value).
5914 : *
5915 : * The function result is the next character position to be parsed, ie, the
5916 : * location where the type character is/should be.
5917 : *
5918 : * Note parsing invariant: at least one character is known available before
5919 : * string end (end_ptr) at entry, and this is still true at exit.
5920 : */
5921 : static const char *
5922 59718 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
5923 : int *argpos, int *widthpos,
5924 : int *flags, int *width)
5925 : {
5926 59718 : const char *cp = start_ptr;
5927 : int n;
5928 :
5929 : /* set defaults for output parameters */
5930 59718 : *argpos = -1;
5931 59718 : *widthpos = -1;
5932 59718 : *flags = 0;
5933 59718 : *width = 0;
5934 :
5935 : /* try to identify first number */
5936 59718 : if (text_format_parse_digits(&cp, end_ptr, &n))
5937 : {
5938 174 : if (*cp != '$')
5939 : {
5940 : /* Must be just a width and a type, so we're done */
5941 24 : *width = n;
5942 24 : return cp;
5943 : }
5944 : /* The number was argument position */
5945 150 : *argpos = n;
5946 : /* Explicit 0 for argument index is immediately refused */
5947 150 : if (n == 0)
5948 6 : ereport(ERROR,
5949 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5950 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
5951 144 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5952 : }
5953 :
5954 : /* Handle flags (only minus is supported now) */
5955 59712 : while (*cp == '-')
5956 : {
5957 30 : *flags |= TEXT_FORMAT_FLAG_MINUS;
5958 30 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5959 : }
5960 :
5961 59682 : if (*cp == '*')
5962 : {
5963 : /* Handle indirect width */
5964 48 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5965 48 : if (text_format_parse_digits(&cp, end_ptr, &n))
5966 : {
5967 : /* number in this position must be closed by $ */
5968 42 : if (*cp != '$')
5969 0 : ereport(ERROR,
5970 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5971 : errmsg("width argument position must be ended by \"$\"")));
5972 : /* The number was width argument position */
5973 42 : *widthpos = n;
5974 : /* Explicit 0 for argument index is immediately refused */
5975 42 : if (n == 0)
5976 6 : ereport(ERROR,
5977 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5978 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
5979 36 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5980 : }
5981 : else
5982 6 : *widthpos = 0; /* width's argument position is unspecified */
5983 : }
5984 : else
5985 : {
5986 : /* Check for direct width specification */
5987 59634 : if (text_format_parse_digits(&cp, end_ptr, &n))
5988 30 : *width = n;
5989 : }
5990 :
5991 : /* cp should now be pointing at type character */
5992 59670 : return cp;
5993 : }
5994 :
5995 : /*
5996 : * Format a %s, %I, or %L conversion
5997 : */
5998 : static void
5999 59664 : text_format_string_conversion(StringInfo buf, char conversion,
6000 : FmgrInfo *typOutputInfo,
6001 : Datum value, bool isNull,
6002 : int flags, int width)
6003 : {
6004 : char *str;
6005 :
6006 : /* Handle NULL arguments before trying to stringify the value. */
6007 59664 : if (isNull)
6008 : {
6009 306 : if (conversion == 's')
6010 234 : text_format_append_string(buf, "", flags, width);
6011 72 : else if (conversion == 'L')
6012 66 : text_format_append_string(buf, "NULL", flags, width);
6013 6 : else if (conversion == 'I')
6014 6 : ereport(ERROR,
6015 : (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6016 : errmsg("null values cannot be formatted as an SQL identifier")));
6017 300 : return;
6018 : }
6019 :
6020 : /* Stringify. */
6021 59358 : str = OutputFunctionCall(typOutputInfo, value);
6022 :
6023 : /* Escape. */
6024 59358 : if (conversion == 'I')
6025 : {
6026 : /* quote_identifier may or may not allocate a new string. */
6027 3106 : text_format_append_string(buf, quote_identifier(str), flags, width);
6028 : }
6029 56252 : else if (conversion == 'L')
6030 : {
6031 3232 : char *qstr = quote_literal_cstr(str);
6032 :
6033 3232 : text_format_append_string(buf, qstr, flags, width);
6034 : /* quote_literal_cstr() always allocates a new string */
6035 3232 : pfree(qstr);
6036 : }
6037 : else
6038 53020 : text_format_append_string(buf, str, flags, width);
6039 :
6040 : /* Cleanup. */
6041 59358 : pfree(str);
6042 : }
6043 :
6044 : /*
6045 : * Append str to buf, padding as directed by flags/width
6046 : */
6047 : static void
6048 59658 : text_format_append_string(StringInfo buf, const char *str,
6049 : int flags, int width)
6050 : {
6051 59658 : bool align_to_left = false;
6052 : int len;
6053 :
6054 : /* fast path for typical easy case */
6055 59658 : if (width == 0)
6056 : {
6057 59574 : appendStringInfoString(buf, str);
6058 59574 : return;
6059 : }
6060 :
6061 84 : if (width < 0)
6062 : {
6063 : /* Negative width: implicit '-' flag, then take absolute value */
6064 6 : align_to_left = true;
6065 : /* -INT_MIN is undefined */
6066 6 : if (width <= INT_MIN)
6067 0 : ereport(ERROR,
6068 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6069 : errmsg("number is out of range")));
6070 6 : width = -width;
6071 : }
6072 78 : else if (flags & TEXT_FORMAT_FLAG_MINUS)
6073 24 : align_to_left = true;
6074 :
6075 84 : len = pg_mbstrlen(str);
6076 84 : if (align_to_left)
6077 : {
6078 : /* left justify */
6079 30 : appendStringInfoString(buf, str);
6080 30 : if (len < width)
6081 30 : appendStringInfoSpaces(buf, width - len);
6082 : }
6083 : else
6084 : {
6085 : /* right justify */
6086 54 : if (len < width)
6087 54 : appendStringInfoSpaces(buf, width - len);
6088 54 : appendStringInfoString(buf, str);
6089 : }
6090 : }
6091 :
6092 : /*
6093 : * text_format_nv - nonvariadic wrapper for text_format function.
6094 : *
6095 : * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6096 : * which checks that all built-in functions that share the implementing C
6097 : * function take the same number of arguments.
6098 : */
6099 : Datum
6100 30 : text_format_nv(PG_FUNCTION_ARGS)
6101 : {
6102 30 : return text_format(fcinfo);
6103 : }
6104 :
/*
 * Report whether the first len bytes of s1 and s2 are identical, comparing
 * from the last byte backwards.  Used by the Levenshtein distance code,
 * where this is faster than memcmp().  A len of zero or less compares
 * nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6120 :
6121 : /* Expand each Levenshtein distance variant */
6122 : #include "levenshtein.c"
6123 : #define LEVENSHTEIN_LESS_EQUAL
6124 : #include "levenshtein.c"
6125 :
6126 :
6127 : /*
6128 : * The following *ClosestMatch() functions can be used to determine whether a
6129 : * user-provided string resembles any known valid values, which is useful for
6130 : * providing hints in log messages, among other things. Use these functions
6131 : * like so:
6132 : *
6133 : * initClosestMatch(&state, source_string, max_distance);
6134 : *
6135 : * for (int i = 0; i < num_valid_strings; i++)
6136 : * updateClosestMatch(&state, valid_strings[i]);
6137 : *
6138 : * closestMatch = getClosestMatch(&state);
6139 : */
6140 :
6141 : /*
6142 : * Initialize the given state with the source string and maximum Levenshtein
6143 : * distance to consider.
6144 : */
6145 : void
6146 56 : initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
6147 : {
6148 : Assert(state);
6149 : Assert(max_d >= 0);
6150 :
6151 56 : state->source = source;
6152 56 : state->min_d = -1;
6153 56 : state->max_d = max_d;
6154 56 : state->match = NULL;
6155 56 : }
6156 :
6157 : /*
6158 : * If the candidate string is a closer match than the current one saved (or
6159 : * there is no match saved), save it as the closest match.
6160 : *
6161 : * If the source or candidate string is NULL, empty, or too long, this function
6162 : * takes no action. Likewise, if the Levenshtein distance exceeds the maximum
6163 : * allowed or more than half the characters are different, no action is taken.
6164 : */
6165 : void
6166 334 : updateClosestMatch(ClosestMatchState *state, const char *candidate)
6167 : {
6168 : int dist;
6169 :
6170 : Assert(state);
6171 :
6172 334 : if (state->source == NULL || state->source[0] == '\0' ||
6173 334 : candidate == NULL || candidate[0] == '\0')
6174 0 : return;
6175 :
6176 : /*
6177 : * To avoid ERROR-ing, we check the lengths here instead of setting
6178 : * 'trusted' to false in the call to varstr_levenshtein_less_equal().
6179 : */
6180 334 : if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
6181 334 : strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
6182 0 : return;
6183 :
6184 334 : dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
6185 334 : candidate, strlen(candidate), 1, 1, 1,
6186 : state->max_d, true);
6187 334 : if (dist <= state->max_d &&
6188 56 : dist <= strlen(state->source) / 2 &&
6189 14 : (state->min_d == -1 || dist < state->min_d))
6190 : {
6191 14 : state->min_d = dist;
6192 14 : state->match = candidate;
6193 : }
6194 : }
6195 :
6196 : /*
6197 : * Return the closest match. If no suitable candidates were provided via
6198 : * updateClosestMatch(), return NULL.
6199 : */
6200 : const char *
6201 56 : getClosestMatch(ClosestMatchState *state)
6202 : {
6203 : Assert(state);
6204 :
6205 56 : return state->match;
6206 : }
6207 :
6208 :
6209 : /*
6210 : * Unicode support
6211 : */
6212 :
6213 : static UnicodeNormalizationForm
6214 62 : unicode_norm_form_from_string(const char *formstr)
6215 : {
6216 62 : UnicodeNormalizationForm form = -1;
6217 :
6218 : /*
6219 : * Might as well check this while we're here.
6220 : */
6221 62 : if (GetDatabaseEncoding() != PG_UTF8)
6222 0 : ereport(ERROR,
6223 : (errcode(ERRCODE_SYNTAX_ERROR),
6224 : errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6225 :
6226 62 : if (pg_strcasecmp(formstr, "NFC") == 0)
6227 22 : form = UNICODE_NFC;
6228 40 : else if (pg_strcasecmp(formstr, "NFD") == 0)
6229 12 : form = UNICODE_NFD;
6230 28 : else if (pg_strcasecmp(formstr, "NFKC") == 0)
6231 12 : form = UNICODE_NFKC;
6232 16 : else if (pg_strcasecmp(formstr, "NFKD") == 0)
6233 12 : form = UNICODE_NFKD;
6234 : else
6235 4 : ereport(ERROR,
6236 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6237 : errmsg("invalid normalization form: %s", formstr)));
6238 :
6239 58 : return form;
6240 : }
6241 :
6242 : /*
6243 : * Returns version of Unicode used by Postgres in "major.minor" format (the
6244 : * same format as the Unicode version reported by ICU). The third component
6245 : * ("update version") never involves additions to the character repertoire and
6246 : * is unimportant for most purposes.
6247 : *
6248 : * See: https://unicode.org/versions/
6249 : */
6250 : Datum
6251 2 : unicode_version(PG_FUNCTION_ARGS)
6252 : {
6253 2 : PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
6254 : }
6255 :
6256 : /*
6257 : * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
6258 : */
6259 : Datum
6260 2 : icu_unicode_version(PG_FUNCTION_ARGS)
6261 : {
6262 : #ifdef USE_ICU
6263 2 : PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
6264 : #else
6265 : PG_RETURN_NULL();
6266 : #endif
6267 : }
6268 :
6269 : /*
6270 : * Check whether the string contains only assigned Unicode code
6271 : * points. Requires that the database encoding is UTF-8.
6272 : */
6273 : Datum
6274 4 : unicode_assigned(PG_FUNCTION_ARGS)
6275 : {
6276 4 : text *input = PG_GETARG_TEXT_PP(0);
6277 : unsigned char *p;
6278 : int size;
6279 :
6280 4 : if (GetDatabaseEncoding() != PG_UTF8)
6281 0 : ereport(ERROR,
6282 : (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
6283 :
6284 : /* convert to pg_wchar */
6285 4 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6286 4 : p = (unsigned char *) VARDATA_ANY(input);
6287 16 : for (int i = 0; i < size; i++)
6288 : {
6289 14 : pg_wchar uchar = utf8_to_unicode(p);
6290 14 : int category = unicode_category(uchar);
6291 :
6292 14 : if (category == PG_U_UNASSIGNED)
6293 2 : PG_RETURN_BOOL(false);
6294 :
6295 12 : p += pg_utf_mblen(p);
6296 : }
6297 :
6298 2 : PG_RETURN_BOOL(true);
6299 : }
6300 :
6301 : Datum
6302 16 : unicode_normalize_func(PG_FUNCTION_ARGS)
6303 : {
6304 16 : text *input = PG_GETARG_TEXT_PP(0);
6305 16 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6306 : UnicodeNormalizationForm form;
6307 : int size;
6308 : pg_wchar *input_chars;
6309 : pg_wchar *output_chars;
6310 : unsigned char *p;
6311 : text *result;
6312 : int i;
6313 :
6314 16 : form = unicode_norm_form_from_string(formstr);
6315 :
6316 : /* convert to pg_wchar */
6317 14 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6318 14 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6319 14 : p = (unsigned char *) VARDATA_ANY(input);
6320 56 : for (i = 0; i < size; i++)
6321 : {
6322 42 : input_chars[i] = utf8_to_unicode(p);
6323 42 : p += pg_utf_mblen(p);
6324 : }
6325 14 : input_chars[i] = (pg_wchar) '\0';
6326 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6327 :
6328 : /* action */
6329 14 : output_chars = unicode_normalize(form, input_chars);
6330 :
6331 : /* convert back to UTF-8 string */
6332 14 : size = 0;
6333 54 : for (pg_wchar *wp = output_chars; *wp; wp++)
6334 : {
6335 : unsigned char buf[4];
6336 :
6337 40 : unicode_to_utf8(*wp, buf);
6338 40 : size += pg_utf_mblen(buf);
6339 : }
6340 :
6341 14 : result = palloc(size + VARHDRSZ);
6342 14 : SET_VARSIZE(result, size + VARHDRSZ);
6343 :
6344 14 : p = (unsigned char *) VARDATA_ANY(result);
6345 54 : for (pg_wchar *wp = output_chars; *wp; wp++)
6346 : {
6347 40 : unicode_to_utf8(*wp, p);
6348 40 : p += pg_utf_mblen(p);
6349 : }
6350 : Assert((char *) p == (char *) result + size + VARHDRSZ);
6351 :
6352 14 : PG_RETURN_TEXT_P(result);
6353 : }
6354 :
6355 : /*
6356 : * Check whether the string is in the specified Unicode normalization form.
6357 : *
6358 : * This is done by converting the string to the specified normal form and then
6359 : * comparing that to the original string. To speed that up, we also apply the
6360 : * "quick check" algorithm specified in UAX #15, which can give a yes or no
6361 : * answer for many strings by just scanning the string once.
6362 : *
6363 : * This function should generally be optimized for the case where the string
6364 : * is in fact normalized. In that case, we'll end up looking at the entire
6365 : * string, so it's probably not worth doing any incremental conversion etc.
6366 : */
6367 : Datum
6368 46 : unicode_is_normalized(PG_FUNCTION_ARGS)
6369 : {
6370 46 : text *input = PG_GETARG_TEXT_PP(0);
6371 46 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6372 : UnicodeNormalizationForm form;
6373 : int size;
6374 : pg_wchar *input_chars;
6375 : pg_wchar *output_chars;
6376 : unsigned char *p;
6377 : int i;
6378 : UnicodeNormalizationQC quickcheck;
6379 : int output_size;
6380 : bool result;
6381 :
6382 46 : form = unicode_norm_form_from_string(formstr);
6383 :
6384 : /* convert to pg_wchar */
6385 44 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6386 44 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6387 44 : p = (unsigned char *) VARDATA_ANY(input);
6388 168 : for (i = 0; i < size; i++)
6389 : {
6390 124 : input_chars[i] = utf8_to_unicode(p);
6391 124 : p += pg_utf_mblen(p);
6392 : }
6393 44 : input_chars[i] = (pg_wchar) '\0';
6394 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6395 :
6396 : /* quick check (see UAX #15) */
6397 44 : quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6398 44 : if (quickcheck == UNICODE_NORM_QC_YES)
6399 14 : PG_RETURN_BOOL(true);
6400 30 : else if (quickcheck == UNICODE_NORM_QC_NO)
6401 4 : PG_RETURN_BOOL(false);
6402 :
6403 : /* normalize and compare with original */
6404 26 : output_chars = unicode_normalize(form, input_chars);
6405 :
6406 26 : output_size = 0;
6407 108 : for (pg_wchar *wp = output_chars; *wp; wp++)
6408 82 : output_size++;
6409 :
6410 38 : result = (size == output_size) &&
6411 12 : (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6412 :
6413 26 : PG_RETURN_BOOL(result);
6414 : }
6415 :
/*
 * Return true if each of the first n bytes at instr is a hexadecimal digit
 * according to isxdigit().  Scanning stops at the first byte that is not,
 * and n == 0 yields true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	while (n > 0)
	{
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
		n--;
	}

	return true;
}
6428 :
6429 : static unsigned int
6430 504 : hexval(unsigned char c)
6431 : {
6432 504 : if (c >= '0' && c <= '9')
6433 384 : return c - '0';
6434 120 : if (c >= 'a' && c <= 'f')
6435 60 : return c - 'a' + 0xA;
6436 60 : if (c >= 'A' && c <= 'F')
6437 60 : return c - 'A' + 0xA;
6438 0 : elog(ERROR, "invalid hexadecimal digit");
6439 : return 0; /* not reached */
6440 : }
6441 :
/*
 * Convert the first n characters of instr, read as a big-endian string of
 * hexadecimal digits, to an unsigned number.  Each character goes through
 * hexval(), so a non-hex character raises an error.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	size_t		pos = 0;

	/* Most significant digit first: shift in four bits per character. */
	while (pos < n)
	{
		value = (value << 4) + hexval(instr[pos]);
		pos++;
	}

	return value;
}
6455 :
6456 : /*
6457 : * Replaces Unicode escape sequences by Unicode characters
6458 : */
6459 : Datum
6460 66 : unistr(PG_FUNCTION_ARGS)
6461 : {
6462 66 : text *input_text = PG_GETARG_TEXT_PP(0);
6463 : char *instr;
6464 : int len;
6465 : StringInfoData str;
6466 : text *result;
6467 66 : pg_wchar pair_first = 0;
6468 : char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6469 :
6470 66 : instr = VARDATA_ANY(input_text);
6471 66 : len = VARSIZE_ANY_EXHDR(input_text);
6472 :
6473 66 : initStringInfo(&str);
6474 :
6475 510 : while (len > 0)
6476 : {
6477 486 : if (instr[0] == '\\')
6478 : {
6479 102 : if (len >= 2 &&
6480 102 : instr[1] == '\\')
6481 : {
6482 6 : if (pair_first)
6483 0 : goto invalid_pair;
6484 6 : appendStringInfoChar(&str, '\\');
6485 6 : instr += 2;
6486 6 : len -= 2;
6487 : }
6488 96 : else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6489 66 : (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6490 30 : {
6491 : pg_wchar unicode;
6492 42 : int offset = instr[1] == 'u' ? 2 : 1;
6493 :
6494 42 : unicode = hexval_n(instr + offset, 4);
6495 :
6496 42 : if (!is_valid_unicode_codepoint(unicode))
6497 0 : ereport(ERROR,
6498 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6499 : errmsg("invalid Unicode code point: %04X", unicode));
6500 :
6501 42 : if (pair_first)
6502 : {
6503 12 : if (is_utf16_surrogate_second(unicode))
6504 : {
6505 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6506 0 : pair_first = 0;
6507 : }
6508 : else
6509 12 : goto invalid_pair;
6510 : }
6511 30 : else if (is_utf16_surrogate_second(unicode))
6512 0 : goto invalid_pair;
6513 :
6514 30 : if (is_utf16_surrogate_first(unicode))
6515 18 : pair_first = unicode;
6516 : else
6517 : {
6518 12 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6519 12 : appendStringInfoString(&str, cbuf);
6520 : }
6521 :
6522 30 : instr += 4 + offset;
6523 30 : len -= 4 + offset;
6524 : }
6525 54 : else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6526 12 : {
6527 : pg_wchar unicode;
6528 :
6529 24 : unicode = hexval_n(instr + 2, 6);
6530 :
6531 24 : if (!is_valid_unicode_codepoint(unicode))
6532 6 : ereport(ERROR,
6533 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6534 : errmsg("invalid Unicode code point: %04X", unicode));
6535 :
6536 18 : if (pair_first)
6537 : {
6538 6 : if (is_utf16_surrogate_second(unicode))
6539 : {
6540 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6541 0 : pair_first = 0;
6542 : }
6543 : else
6544 6 : goto invalid_pair;
6545 : }
6546 12 : else if (is_utf16_surrogate_second(unicode))
6547 0 : goto invalid_pair;
6548 :
6549 12 : if (is_utf16_surrogate_first(unicode))
6550 6 : pair_first = unicode;
6551 : else
6552 : {
6553 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6554 6 : appendStringInfoString(&str, cbuf);
6555 : }
6556 :
6557 12 : instr += 8;
6558 12 : len -= 8;
6559 : }
6560 30 : else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6561 12 : {
6562 : pg_wchar unicode;
6563 :
6564 24 : unicode = hexval_n(instr + 2, 8);
6565 :
6566 24 : if (!is_valid_unicode_codepoint(unicode))
6567 6 : ereport(ERROR,
6568 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6569 : errmsg("invalid Unicode code point: %04X", unicode));
6570 :
6571 18 : if (pair_first)
6572 : {
6573 6 : if (is_utf16_surrogate_second(unicode))
6574 : {
6575 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6576 0 : pair_first = 0;
6577 : }
6578 : else
6579 6 : goto invalid_pair;
6580 : }
6581 12 : else if (is_utf16_surrogate_second(unicode))
6582 0 : goto invalid_pair;
6583 :
6584 12 : if (is_utf16_surrogate_first(unicode))
6585 6 : pair_first = unicode;
6586 : else
6587 : {
6588 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6589 6 : appendStringInfoString(&str, cbuf);
6590 : }
6591 :
6592 12 : instr += 10;
6593 12 : len -= 10;
6594 : }
6595 : else
6596 6 : ereport(ERROR,
6597 : (errcode(ERRCODE_SYNTAX_ERROR),
6598 : errmsg("invalid Unicode escape"),
6599 : errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6600 : }
6601 : else
6602 : {
6603 384 : if (pair_first)
6604 0 : goto invalid_pair;
6605 :
6606 384 : appendStringInfoChar(&str, *instr++);
6607 384 : len--;
6608 : }
6609 : }
6610 :
6611 : /* unfinished surrogate pair? */
6612 24 : if (pair_first)
6613 6 : goto invalid_pair;
6614 :
6615 18 : result = cstring_to_text_with_len(str.data, str.len);
6616 18 : pfree(str.data);
6617 :
6618 18 : PG_RETURN_TEXT_P(result);
6619 :
6620 30 : invalid_pair:
6621 30 : ereport(ERROR,
6622 : (errcode(ERRCODE_SYNTAX_ERROR),
6623 : errmsg("invalid Unicode surrogate pair")));
6624 : PG_RETURN_NULL(); /* keep compiler quiet */
6625 : }
|