Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * varlena.c
4 : * Functions for the variable-length built-in types.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/varlena.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include <ctype.h>
18 : #include <limits.h>
19 :
20 : #include "access/detoast.h"
21 : #include "access/toast_compression.h"
22 : #include "catalog/pg_collation.h"
23 : #include "catalog/pg_type.h"
24 : #include "common/hashfn.h"
25 : #include "common/int.h"
26 : #include "common/unicode_norm.h"
27 : #include "funcapi.h"
28 : #include "lib/hyperloglog.h"
29 : #include "libpq/pqformat.h"
30 : #include "miscadmin.h"
31 : #include "nodes/execnodes.h"
32 : #include "parser/scansup.h"
33 : #include "port/pg_bswap.h"
34 : #include "regex/regex.h"
35 : #include "utils/builtins.h"
36 : #include "utils/bytea.h"
37 : #include "utils/guc.h"
38 : #include "utils/lsyscache.h"
39 : #include "utils/memutils.h"
40 : #include "utils/pg_locale.h"
41 : #include "utils/sortsupport.h"
42 : #include "utils/varlena.h"
43 :
44 :
45 : /* GUC variable: selects the output format used by byteaout(); hex by default */
46 : int bytea_output = BYTEA_OUTPUT_HEX;
47 :
48 : typedef struct varlena VarString; /* alias used by the DatumGetVarString* macros */
49 :
50 : /*
51 : * State for text_position_* functions.
/*   It is passed by pointer to text_position_setup(), text_position_next(),
 *   text_position_get_match_ptr()/_pos() and text_position_cleanup(). */
52 : */
53 : typedef struct
54 : {
55 : bool is_multibyte_char_in_char; /* need to check char boundaries? */
56 :
57 : char *str1; /* haystack string */
58 : char *str2; /* needle string */
59 : int len1; /* string lengths in bytes (len1 presumably for str1) */
60 : int len2; /* presumably the byte length of str2 */
61 :
62 : /* Skip table for Boyer-Moore-Horspool search algorithm: */
63 : int skiptablemask; /* mask for ANDing with skiptable subscripts */
64 : int skiptable[256]; /* skip distance for given mismatched char */
65 :
66 : char *last_match; /* pointer to last match in 'str1' */
67 :
68 : /*
69 : * Sometimes we need to convert the byte position of a match to a
70 : * character position. These store the last position that was converted,
71 : * so that on the next call, we can continue from that point, rather than
72 : * count characters from the very beginning.
73 : */
74 : char *refpoint; /* pointer within original haystack string */
75 : int refpos; /* 0-based character offset of the same point */
76 : } TextPositionState;
77 :
/*
 * Working state for sorting variable-length strings.
 * NOTE(review): presumably the private state used by the *fastcmp_* and
 * varstr_abbrev_* routines declared below -- confirm against their
 * definitions, which are outside this part of the file.
 */
78 : typedef struct
79 : {
80 : char *buf1; /* 1st string, or abbreviation original string
81 : * buf */
82 : char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
83 : int buflen1; /* Allocated length of buf1 */
84 : int buflen2; /* Allocated length of buf2 */
85 : int last_len1; /* Length of last buf1 string/strxfrm() input */
86 : int last_len2; /* Length of last buf2 string/strxfrm() blob */
87 : int last_returned; /* Last comparison result (cache) */
88 : bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
89 : bool collate_c; /* presumably: is the sort collation "C"? -- TODO confirm */
90 : Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
91 : hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
92 : hyperLogLogState full_card; /* Full key cardinality state */
93 : double prop_card; /* Required cardinality proportion */
94 : pg_locale_t locale; /* presumably the locale used for comparisons -- TODO confirm */
95 : } VarStringSortSupport;
96 :
97 : /*
98 : * Output data for split_text(): we output either to an array or a table.
99 : * tupstore and tupdesc must be set up in advance to output to a table.
100 : */
101 : typedef struct
102 : {
103 : ArrayBuildState *astate; /* presumably the accumulator for array output -- confirm in split_text() */
104 : Tuplestorestate *tupstore; /* destination tuplestore for table output */
105 : TupleDesc tupdesc; /* tuple descriptor for table output */
106 : } SplitTextOutputData;
107 :
108 : /*
109 : * This should be large enough that most strings will fit, but small enough
110 : * that we feel comfortable putting it on the stack
111 : */
112 : #define TEXTBUFLEN 1024
113 :
/*
 * Fetch a Datum as a VarString pointer.  The P form goes through
 * PG_DETOAST_DATUM, the PP form through PG_DETOAST_DATUM_PACKED.
 */
114 : #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
115 : #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
116 :
/* Forward declarations of the file-local (static) helper routines. */
117 : static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
118 : static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
119 : static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
120 : static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
121 : static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
122 : static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
123 : static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
124 : static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
125 : static int32 text_length(Datum str);
126 : static text *text_catenate(text *t1, text *t2);
127 : static text *text_substring(Datum str,
128 : int32 start,
129 : int32 length,
130 : bool length_not_specified);
131 : static text *text_overlay(text *t1, text *t2, int sp, int sl);
132 : static int text_position(text *t1, text *t2, Oid collid);
133 : static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
134 : static bool text_position_next(TextPositionState *state);
135 : static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
136 : static char *text_position_get_match_ptr(TextPositionState *state);
137 : static int text_position_get_match_pos(TextPositionState *state);
138 : static void text_position_cleanup(TextPositionState *state);
139 : static void check_collation_set(Oid collid);
140 : static int text_cmp(text *arg1, text *arg2, Oid collid);
141 : static bytea *bytea_catenate(bytea *t1, bytea *t2);
142 : static bytea *bytea_substring(Datum str,
143 : int S,
144 : int L,
145 : bool length_not_specified);
146 : static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
147 : static void appendStringInfoText(StringInfo str, const text *t);
148 : static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
149 : static void split_text_accum_result(SplitTextOutputData *tstate,
150 : text *field_value,
151 : text *null_string,
152 : Oid collation);
153 : static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
154 : const char *fldsep, const char *null_string);
155 : static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
156 : static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
157 : int *value);
158 : static const char *text_format_parse_format(const char *start_ptr,
159 : const char *end_ptr,
160 : int *argpos, int *widthpos,
161 : int *flags, int *width);
162 : static void text_format_string_conversion(StringInfo buf, char conversion,
163 : FmgrInfo *typOutputInfo,
164 : Datum value, bool isNull,
165 : int flags, int width);
166 : static void text_format_append_string(StringInfo buf, const char *str,
167 : int flags, int width);
168 :
169 :
170 : /*****************************************************************************
171 : * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
172 : *****************************************************************************/
173 :
174 : /*
175 : * cstring_to_text
176 : *
177 : * Create a text value from a null-terminated C string.
178 : *
179 : * The new text value is freshly palloc'd with a full-size VARHDR.
180 : */
181 : text *
182 23804648 : cstring_to_text(const char *s)
183 : {
184 23804648 : return cstring_to_text_with_len(s, strlen(s));
185 : }
186 :
187 : /*
188 : * cstring_to_text_with_len
189 : *
190 : * Same as cstring_to_text except the caller specifies the string length;
191 : * the string need not be null_terminated.
192 : */
193 : text *
194 28873258 : cstring_to_text_with_len(const char *s, int len)
195 : {
196 28873258 : text *result = (text *) palloc(len + VARHDRSZ);
197 :
198 28873258 : SET_VARSIZE(result, len + VARHDRSZ);
199 28873258 : memcpy(VARDATA(result), s, len);
200 :
201 28873258 : return result;
202 : }
203 :
204 : /*
205 : * text_to_cstring
206 : *
207 : * Create a palloc'd, null-terminated C string from a text value.
208 : *
209 : * We support being passed a compressed or toasted text value.
210 : * This is a bit bogus since such values shouldn't really be referred to as
211 : * "text *", but it seems useful for robustness. If we didn't handle that
212 : * case here, we'd need another routine that did, anyway.
213 : */
214 : char *
215 18129666 : text_to_cstring(const text *t)
216 : {
217 : /* must cast away the const, unfortunately */
218 18129666 : text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
219 18129666 : int len = VARSIZE_ANY_EXHDR(tunpacked);
220 : char *result;
221 :
222 18129666 : result = (char *) palloc(len + 1);
223 18129666 : memcpy(result, VARDATA_ANY(tunpacked), len);
224 18129666 : result[len] = '\0';
225 :
226 18129666 : if (tunpacked != t)
227 128472 : pfree(tunpacked);
228 :
229 18129666 : return result;
230 : }
231 :
232 : /*
233 : * text_to_cstring_buffer
234 : *
235 : * Copy a text value into a caller-supplied buffer of size dst_len.
236 : *
237 : * The text string is truncated if necessary to fit. The result is
238 : * guaranteed null-terminated (unless dst_len == 0).
239 : *
240 : * We support being passed a compressed or toasted text value.
241 : * This is a bit bogus since such values shouldn't really be referred to as
242 : * "text *", but it seems useful for robustness. If we didn't handle that
243 : * case here, we'd need another routine that did, anyway.
244 : */
245 : void
246 640 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
247 : {
248 : /* must cast away the const, unfortunately */
249 640 : text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
250 640 : size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
251 :
252 640 : if (dst_len > 0)
253 : {
254 640 : dst_len--;
255 640 : if (dst_len >= src_len)
256 640 : dst_len = src_len;
257 : else /* ensure truncation is encoding-safe */
258 0 : dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
259 640 : memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
260 640 : dst[dst_len] = '\0';
261 : }
262 :
263 640 : if (srcunpacked != src)
264 0 : pfree(srcunpacked);
265 640 : }
266 :
267 :
268 : /*****************************************************************************
269 : * USER I/O ROUTINES *
270 : *****************************************************************************/
271 :
272 :
273 : #define VAL(CH) ((CH) - '0') /* digit character -> its numeric value */
274 : #define DIG(VAL) ((VAL) + '0') /* numeric value -> its digit character */
275 :
276 : /*
277 : * byteain - converts from printable representation of byte array
278 : *
279 : * Non-printable characters must be passed as '\nnn' (octal) and are
280 : * converted to internal form. '\' must be passed as '\\'.
281 : * ereport(ERROR, ...) if bad form.
282 : *
283 : * BUGS:
284 : * The input is scanned twice.
285 : * The error checking of input is minimal.
286 : */
287 : Datum
288 262846 : byteain(PG_FUNCTION_ARGS)
289 : {
290 262846 : char *inputText = PG_GETARG_CSTRING(0);
291 262846 : Node *escontext = fcinfo->context;
292 : char *tp;
293 : char *rp;
294 : int bc;
295 : bytea *result;
296 :
297 : /* Recognize hex input */
298 262846 : if (inputText[0] == '\\' && inputText[1] == 'x')
299 : {
300 810 : size_t len = strlen(inputText);
301 :
302 810 : bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
303 810 : result = palloc(bc);
304 810 : bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
305 : escontext);
306 798 : SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
307 :
308 798 : PG_RETURN_BYTEA_P(result);
309 : }
310 :
311 : /* Else, it's the traditional escaped style */
312 4354570 : for (bc = 0, tp = inputText; *tp != '\0'; bc++)
313 : {
314 4092546 : if (tp[0] != '\\')
315 4091530 : tp++;
316 1016 : else if ((tp[0] == '\\') &&
317 1016 : (tp[1] >= '0' && tp[1] <= '3') &&
318 1004 : (tp[2] >= '0' && tp[2] <= '7') &&
319 1004 : (tp[3] >= '0' && tp[3] <= '7'))
320 1004 : tp += 4;
321 12 : else if ((tp[0] == '\\') &&
322 12 : (tp[1] == '\\'))
323 0 : tp += 2;
324 : else
325 : {
326 : /*
327 : * one backslash, not followed by another or ### valid octal
328 : */
329 12 : ereturn(escontext, (Datum) 0,
330 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
331 : errmsg("invalid input syntax for type %s", "bytea")));
332 : }
333 : }
334 :
335 262024 : bc += VARHDRSZ;
336 :
337 262024 : result = (bytea *) palloc(bc);
338 262024 : SET_VARSIZE(result, bc);
339 :
340 262024 : tp = inputText;
341 262024 : rp = VARDATA(result);
342 4354528 : while (*tp != '\0')
343 : {
344 4092504 : if (tp[0] != '\\')
345 4091500 : *rp++ = *tp++;
346 1004 : else if ((tp[0] == '\\') &&
347 1004 : (tp[1] >= '0' && tp[1] <= '3') &&
348 1004 : (tp[2] >= '0' && tp[2] <= '7') &&
349 1004 : (tp[3] >= '0' && tp[3] <= '7'))
350 : {
351 1004 : bc = VAL(tp[1]);
352 1004 : bc <<= 3;
353 1004 : bc += VAL(tp[2]);
354 1004 : bc <<= 3;
355 1004 : *rp++ = bc + VAL(tp[3]);
356 :
357 1004 : tp += 4;
358 : }
359 0 : else if ((tp[0] == '\\') &&
360 0 : (tp[1] == '\\'))
361 : {
362 0 : *rp++ = '\\';
363 0 : tp += 2;
364 : }
365 : else
366 : {
367 : /*
368 : * We should never get here. The first pass should not allow it.
369 : */
370 0 : ereturn(escontext, (Datum) 0,
371 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
372 : errmsg("invalid input syntax for type %s", "bytea")));
373 : }
374 : }
375 :
376 262024 : PG_RETURN_BYTEA_P(result);
377 : }
378 :
379 : /*
380 : * byteaout - converts to printable representation of byte array
381 : *
382 : * In the traditional escaped format, non-printable characters are
383 : * printed as '\nnn' (octal) and '\' as '\\'.
384 : */
385 : Datum
386 13432 : byteaout(PG_FUNCTION_ARGS)
387 : {
388 13432 : bytea *vlena = PG_GETARG_BYTEA_PP(0);
389 : char *result;
390 : char *rp;
391 :
392 13432 : if (bytea_output == BYTEA_OUTPUT_HEX)
393 : {
394 : /* Print hex format */
395 13048 : rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
396 13048 : *rp++ = '\\';
397 13048 : *rp++ = 'x';
398 13048 : rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
399 : }
400 384 : else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
401 : {
402 : /* Print traditional escaped format */
403 : char *vp;
404 : uint64 len;
405 : int i;
406 :
407 384 : len = 1; /* empty string has 1 char */
408 384 : vp = VARDATA_ANY(vlena);
409 217660 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
410 : {
411 217276 : if (*vp == '\\')
412 0 : len += 2;
413 217276 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
414 498 : len += 4;
415 : else
416 216778 : len++;
417 : }
418 :
419 : /*
420 : * In principle len can't overflow uint32 if the input fit in 1GB, but
421 : * for safety let's check rather than relying on palloc's internal
422 : * check.
423 : */
424 384 : if (len > MaxAllocSize)
425 0 : ereport(ERROR,
426 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
427 : errmsg_internal("result of bytea output conversion is too large")));
428 384 : rp = result = (char *) palloc(len);
429 :
430 384 : vp = VARDATA_ANY(vlena);
431 217660 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
432 : {
433 217276 : if (*vp == '\\')
434 : {
435 0 : *rp++ = '\\';
436 0 : *rp++ = '\\';
437 : }
438 217276 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
439 498 : {
440 : int val; /* holds unprintable chars */
441 :
442 498 : val = *vp;
443 498 : rp[0] = '\\';
444 498 : rp[3] = DIG(val & 07);
445 498 : val >>= 3;
446 498 : rp[2] = DIG(val & 07);
447 498 : val >>= 3;
448 498 : rp[1] = DIG(val & 03);
449 498 : rp += 4;
450 : }
451 : else
452 216778 : *rp++ = *vp;
453 : }
454 : }
455 : else
456 : {
457 0 : elog(ERROR, "unrecognized bytea_output setting: %d",
458 : bytea_output);
459 : rp = result = NULL; /* keep compiler quiet */
460 : }
461 13432 : *rp = '\0';
462 13432 : PG_RETURN_CSTRING(result);
463 : }
464 :
465 : /*
466 : * bytearecv - converts external binary format to bytea
467 : */
468 : Datum
469 1038 : bytearecv(PG_FUNCTION_ARGS)
470 : {
471 1038 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
472 : bytea *result;
473 : int nbytes;
474 :
475 1038 : nbytes = buf->len - buf->cursor;
476 1038 : result = (bytea *) palloc(nbytes + VARHDRSZ);
477 1038 : SET_VARSIZE(result, nbytes + VARHDRSZ);
478 1038 : pq_copymsgbytes(buf, VARDATA(result), nbytes);
479 1038 : PG_RETURN_BYTEA_P(result);
480 : }
481 :
482 : /*
483 : * byteasend - converts bytea to binary format
484 : *
485 : * This is a special case: just copy the input...
486 : */
487 : Datum
488 5604 : byteasend(PG_FUNCTION_ARGS)
489 : {
490 5604 : bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
491 :
492 5604 : PG_RETURN_BYTEA_P(vlena);
493 : }
494 :
495 : Datum
496 92774 : bytea_string_agg_transfn(PG_FUNCTION_ARGS)
497 : {
498 : StringInfo state;
499 :
500 92774 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
501 :
502 : /* Append the value unless null, preceding it with the delimiter. */
503 92774 : if (!PG_ARGISNULL(1))
504 : {
505 77774 : bytea *value = PG_GETARG_BYTEA_PP(1);
506 77774 : bool isfirst = false;
507 :
508 : /*
509 : * You might think we can just throw away the first delimiter, however
510 : * we must keep it as we may be a parallel worker doing partial
511 : * aggregation building a state to send to the main process. We need
512 : * to keep the delimiter of every aggregation so that the combine
513 : * function can properly join up the strings of two separately
514 : * partially aggregated results. The first delimiter is only stripped
515 : * off in the final function. To know how much to strip off the front
516 : * of the string, we store the length of the first delimiter in the
517 : * StringInfo's cursor field, which we don't otherwise need here.
518 : */
519 77774 : if (state == NULL)
520 : {
521 166 : state = makeStringAggState(fcinfo);
522 166 : isfirst = true;
523 : }
524 :
525 77774 : if (!PG_ARGISNULL(2))
526 : {
527 77762 : bytea *delim = PG_GETARG_BYTEA_PP(2);
528 :
529 77762 : appendBinaryStringInfo(state, VARDATA_ANY(delim),
530 77762 : VARSIZE_ANY_EXHDR(delim));
531 77762 : if (isfirst)
532 160 : state->cursor = VARSIZE_ANY_EXHDR(delim);
533 : }
534 :
535 77774 : appendBinaryStringInfo(state, VARDATA_ANY(value),
536 77774 : VARSIZE_ANY_EXHDR(value));
537 : }
538 :
539 : /*
540 : * The transition type for string_agg() is declared to be "internal",
541 : * which is a pass-by-value type the same size as a pointer.
542 : */
543 92774 : if (state)
544 92732 : PG_RETURN_POINTER(state);
545 42 : PG_RETURN_NULL();
546 : }
547 :
548 : Datum
549 152 : bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
550 : {
551 : StringInfo state;
552 :
553 : /* cannot be called directly because of internal-type argument */
554 : Assert(AggCheckCallContext(fcinfo, NULL));
555 :
556 152 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
557 :
558 152 : if (state != NULL)
559 : {
560 : /* As per comment in transfn, strip data before the cursor position */
561 : bytea *result;
562 146 : int strippedlen = state->len - state->cursor;
563 :
564 146 : result = (bytea *) palloc(strippedlen + VARHDRSZ);
565 146 : SET_VARSIZE(result, strippedlen + VARHDRSZ);
566 146 : memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
567 146 : PG_RETURN_BYTEA_P(result);
568 : }
569 : else
570 6 : PG_RETURN_NULL();
571 : }
572 :
573 : /*
574 : * textin - converts "..." to internal representation
575 : */
576 : Datum
577 18095270 : textin(PG_FUNCTION_ARGS)
578 : {
579 18095270 : char *inputText = PG_GETARG_CSTRING(0);
580 :
581 18095270 : PG_RETURN_TEXT_P(cstring_to_text(inputText));
582 : }
583 :
584 : /*
585 : * textout - converts internal representation to "..."
586 : */
587 : Datum
588 8300560 : textout(PG_FUNCTION_ARGS)
589 : {
590 8300560 : Datum txt = PG_GETARG_DATUM(0);
591 :
592 8300560 : PG_RETURN_CSTRING(TextDatumGetCString(txt));
593 : }
594 :
595 : /*
596 : * textrecv - converts external binary format to text
597 : */
598 : Datum
599 106720 : textrecv(PG_FUNCTION_ARGS)
600 : {
601 106720 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
602 : text *result;
603 : char *str;
604 : int nbytes;
605 :
606 106720 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
607 :
608 106720 : result = cstring_to_text_with_len(str, nbytes);
609 106720 : pfree(str);
610 106720 : PG_RETURN_TEXT_P(result);
611 : }
612 :
613 : /*
614 : * textsend - converts text to binary format
615 : */
616 : Datum
617 68228 : textsend(PG_FUNCTION_ARGS)
618 : {
619 68228 : text *t = PG_GETARG_TEXT_PP(0);
620 : StringInfoData buf;
621 :
622 68228 : pq_begintypsend(&buf);
623 68228 : pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
624 68228 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
625 : }
626 :
627 :
628 : /*
629 : * unknownin - converts "..." to internal representation
630 : */
631 : Datum
632 0 : unknownin(PG_FUNCTION_ARGS)
633 : {
634 0 : char *str = PG_GETARG_CSTRING(0);
635 :
636 : /* representation is same as cstring */
637 0 : PG_RETURN_CSTRING(pstrdup(str));
638 : }
639 :
640 : /*
641 : * unknownout - converts internal representation to "..."
642 : */
643 : Datum
644 682 : unknownout(PG_FUNCTION_ARGS)
645 : {
646 : /* representation is same as cstring */
647 682 : char *str = PG_GETARG_CSTRING(0);
648 :
649 682 : PG_RETURN_CSTRING(pstrdup(str));
650 : }
651 :
652 : /*
653 : * unknownrecv - converts external binary format to unknown
654 : */
655 : Datum
656 0 : unknownrecv(PG_FUNCTION_ARGS)
657 : {
658 0 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
659 : char *str;
660 : int nbytes;
661 :
662 0 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
663 : /* representation is same as cstring */
664 0 : PG_RETURN_CSTRING(str);
665 : }
666 :
667 : /*
668 : * unknownsend - converts unknown to binary format
669 : */
670 : Datum
671 0 : unknownsend(PG_FUNCTION_ARGS)
672 : {
673 : /* representation is same as cstring */
674 0 : char *str = PG_GETARG_CSTRING(0);
675 : StringInfoData buf;
676 :
677 0 : pq_begintypsend(&buf);
678 0 : pq_sendtext(&buf, str, strlen(str));
679 0 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
680 : }
681 :
682 :
683 : /* ========== PUBLIC ROUTINES ========== */
684 :
685 : /*
686 : * textlen -
687 : * returns the logical length of a text*
688 : * (which is less than the VARSIZE of the text*)
689 : */
690 : Datum
691 430540 : textlen(PG_FUNCTION_ARGS)
692 : {
693 430540 : Datum str = PG_GETARG_DATUM(0);
694 :
695 : /* try to avoid decompressing argument */
696 430540 : PG_RETURN_INT32(text_length(str));
697 : }
698 :
699 : /*
700 : * text_length -
701 : * Does the real work for textlen()
702 : *
703 : * This is broken out so it can be called directly by other string processing
704 : * functions. Note that the argument is passed as a Datum, to indicate that
705 : * it may still be in compressed form. We can avoid decompressing it at all
706 : * in some cases.
707 : */
708 : static int32
709 430552 : text_length(Datum str)
710 : {
711 : /* fastpath when max encoding length is one */
712 430552 : if (pg_database_encoding_max_length() == 1)
713 32 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
714 : else
715 : {
716 430520 : text *t = DatumGetTextPP(str);
717 :
718 430520 : PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
719 : VARSIZE_ANY_EXHDR(t)));
720 : }
721 : }
722 :
723 : /*
724 : * textoctetlen -
725 : * returns the physical length of a text*
726 : * (which is less than the VARSIZE of the text*)
727 : */
728 : Datum
729 70 : textoctetlen(PG_FUNCTION_ARGS)
730 : {
731 70 : Datum str = PG_GETARG_DATUM(0);
732 :
733 : /* We need not detoast the input at all */
734 70 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
735 : }
736 :
737 : /*
738 : * textcat -
739 : * takes two text* and returns a text* that is the concatenation of
740 : * the two.
741 : *
742 : * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
743 : * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
744 : * Allocate space for output in all cases.
745 : * XXX - thomas 1997-07-10
746 : */
747 : Datum
748 2664476 : textcat(PG_FUNCTION_ARGS)
749 : {
750 2664476 : text *t1 = PG_GETARG_TEXT_PP(0);
751 2664476 : text *t2 = PG_GETARG_TEXT_PP(1);
752 :
753 2664476 : PG_RETURN_TEXT_P(text_catenate(t1, t2));
754 : }
755 :
756 : /*
757 : * text_catenate
758 : * Guts of textcat(), broken out so it can be used by other functions
759 : *
760 : * Arguments can be in short-header form, but not compressed or out-of-line
761 : */
762 : static text *
763 2664556 : text_catenate(text *t1, text *t2)
764 : {
765 : text *result;
766 : int len1,
767 : len2,
768 : len;
769 : char *ptr;
770 :
771 2664556 : len1 = VARSIZE_ANY_EXHDR(t1);
772 2664556 : len2 = VARSIZE_ANY_EXHDR(t2);
773 :
774 : /* paranoia ... probably should throw error instead? */
775 2664556 : if (len1 < 0)
776 0 : len1 = 0;
777 2664556 : if (len2 < 0)
778 0 : len2 = 0;
779 :
780 2664556 : len = len1 + len2 + VARHDRSZ;
781 2664556 : result = (text *) palloc(len);
782 :
783 : /* Set size of result string... */
784 2664556 : SET_VARSIZE(result, len);
785 :
786 : /* Fill data field of result string... */
787 2664556 : ptr = VARDATA(result);
788 2664556 : if (len1 > 0)
789 2661300 : memcpy(ptr, VARDATA_ANY(t1), len1);
790 2664556 : if (len2 > 0)
791 2664346 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
792 :
793 2664556 : return result;
794 : }
795 :
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller must make sure that n characters really are present; the
 * string does not have to be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cur = p;

	/* In a single-byte encoding, characters and bytes are the same thing */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* Otherwise step over n characters, one pg_mblen() at a time */
	while (n > 0)
	{
		cur += pg_mblen(cur);
		n--;
	}

	return cur - p;
}
821 :
822 : /*
823 : * text_substr()
824 : * Return a substring starting at the specified position.
825 : * - thomas 1997-12-31
826 : *
827 : * Input:
828 : * - string
829 : * - starting position (is one-based)
830 : * - string length
831 : *
832 : * If the starting position is zero or less, then return from the start of the string
833 : * adjusting the length to be consistent with the "negative start" per SQL.
834 : * If the length is less than zero, return the remaining string.
835 : *
836 : * Added multibyte support.
837 : * - Tatsuo Ishii 1998-4-21
838 : * Changed behavior if starting position is less than one to conform to SQL behavior.
839 : * Formerly returned the entire string; now returns a portion.
840 : * - Thomas Lockhart 1998-12-10
841 : * Now uses faster TOAST-slicing interface
842 : * - John Gray 2002-02-22
843 : * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
844 : * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
845 : * error; if E < 1, return '', not entire string). Fixed MB related bug when
846 : * S > LC and < LC + 4 sometimes garbage characters are returned.
847 : * - Joe Conway 2002-08-10
848 : */
849 : Datum
850 749534 : text_substr(PG_FUNCTION_ARGS)
851 : {
852 749534 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
853 : PG_GETARG_INT32(1),
854 : PG_GETARG_INT32(2),
855 : false));
856 : }
857 :
858 : /*
859 : * text_substr_no_len -
860 : * Wrapper to avoid opr_sanity failure due to
861 : * one function accepting a different number of args.
862 : */
863 : Datum
864 36 : text_substr_no_len(PG_FUNCTION_ARGS)
865 : {
866 36 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
867 : PG_GETARG_INT32(1),
868 : -1, true));
869 : }
870 :
871 : /*
872 : * text_substring -
873 : * Does the real work for text_substr() and text_substr_no_len()
874 : *
875 : * This is broken out so it can be called directly by other string processing
876 : * functions. Note that the argument is passed as a Datum, to indicate that
877 : * it may still be in compressed/toasted form. We can avoid detoasting all
878 : * of it in some cases.
879 : *
880 : * The result is always a freshly palloc'd datum.
881 : */
882 : static text *
883 789418 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
884 : {
885 789418 : int32 eml = pg_database_encoding_max_length();
886 789418 : int32 S = start; /* start position */
887 : int32 S1; /* adjusted start position */
888 : int32 L1; /* adjusted substring length */
889 : int32 E; /* end position */
890 :
891 : /*
892 : * SQL99 says S can be zero or negative, but we still must fetch from the
893 : * start of the string.
894 : */
895 789418 : S1 = Max(S, 1);
896 :
897 : /* life is easy if the encoding max length is 1 */
898 789418 : if (eml == 1)
899 : {
900 18 : if (length_not_specified) /* special case - get length to end of
901 : * string */
902 0 : L1 = -1;
903 18 : else if (length < 0)
904 : {
905 : /* SQL99 says to throw an error for E < S, i.e., negative length */
906 0 : ereport(ERROR,
907 : (errcode(ERRCODE_SUBSTRING_ERROR),
908 : errmsg("negative substring length not allowed")));
909 : L1 = -1; /* silence stupider compilers */
910 : }
911 18 : else if (pg_add_s32_overflow(S, length, &E))
912 : {
913 : /*
914 : * L could be large enough for S + L to overflow, in which case
915 : * the substring must run to end of string.
916 : */
917 0 : L1 = -1;
918 : }
919 : else
920 : {
921 : /*
922 : * A zero or negative value for the end position can happen if the
923 : * start was negative or one. SQL99 says to return a zero-length
924 : * string.
925 : */
926 18 : if (E < 1)
927 0 : return cstring_to_text("");
928 :
929 18 : L1 = E - S1;
930 : }
931 :
932 : /*
933 : * If the start position is past the end of the string, SQL99 says to
934 : * return a zero-length string -- DatumGetTextPSlice() will do that
935 : * for us. We need only convert S1 to zero-based starting position.
936 : */
937 18 : return DatumGetTextPSlice(str, S1 - 1, L1);
938 : }
939 789400 : else if (eml > 1)
940 : {
941 : /*
942 : * When encoding max length is > 1, we can't get LC without
943 : * detoasting, so we'll grab a conservatively large slice now and go
944 : * back later to do the right thing
945 : */
946 : int32 slice_start;
947 : int32 slice_size;
948 : int32 slice_strlen;
949 : text *slice;
950 : int32 E1;
951 : int32 i;
952 : char *p;
953 : char *s;
954 : text *ret;
955 :
956 : /*
957 : * We need to start at position zero because there is no way to know
958 : * in advance which byte offset corresponds to the supplied start
959 : * position.
960 : */
961 789400 : slice_start = 0;
962 :
963 789400 : if (length_not_specified) /* special case - get length to end of
964 : * string */
965 76 : slice_size = L1 = -1;
966 789324 : else if (length < 0)
967 : {
968 : /* SQL99 says to throw an error for E < S, i.e., negative length */
969 12 : ereport(ERROR,
970 : (errcode(ERRCODE_SUBSTRING_ERROR),
971 : errmsg("negative substring length not allowed")));
972 : slice_size = L1 = -1; /* silence stupider compilers */
973 : }
974 789312 : else if (pg_add_s32_overflow(S, length, &E))
975 : {
976 : /*
977 : * L could be large enough for S + L to overflow, in which case
978 : * the substring must run to end of string.
979 : */
980 6 : slice_size = L1 = -1;
981 : }
982 : else
983 : {
984 : /*
985 : * A zero or negative value for the end position can happen if the
986 : * start was negative or one. SQL99 says to return a zero-length
987 : * string.
988 : */
989 789306 : if (E < 1)
990 0 : return cstring_to_text("");
991 :
992 : /*
993 : * if E is past the end of the string, the tuple toaster will
994 : * truncate the length for us
995 : */
996 789306 : L1 = E - S1;
997 :
998 : /*
999 : * Total slice size in bytes can't be any longer than the start
1000 : * position plus substring length times the encoding max length.
1001 : * If that overflows, we can just use -1.
1002 : */
1003 789306 : if (pg_mul_s32_overflow(E, eml, &slice_size))
1004 6 : slice_size = -1;
1005 : }
1006 :
1007 : /*
1008 : * If we're working with an untoasted source, no need to do an extra
1009 : * copying step.
1010 : */
1011 789388 : if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
1012 789334 : VARATT_IS_EXTERNAL(DatumGetPointer(str)))
1013 324 : slice = DatumGetTextPSlice(str, slice_start, slice_size);
1014 : else
1015 789064 : slice = (text *) DatumGetPointer(str);
1016 :
1017 : /* see if we got back an empty string */
1018 789388 : if (VARSIZE_ANY_EXHDR(slice) == 0)
1019 : {
1020 0 : if (slice != (text *) DatumGetPointer(str))
1021 0 : pfree(slice);
1022 0 : return cstring_to_text("");
1023 : }
1024 :
1025 : /* Now we can get the actual length of the slice in MB characters */
1026 789388 : slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1027 789388 : VARSIZE_ANY_EXHDR(slice));
1028 :
1029 : /*
1030 : * Check that the start position wasn't > slice_strlen. If so, SQL99
1031 : * says to return a zero-length string.
1032 : */
1033 789388 : if (S1 > slice_strlen)
1034 : {
1035 22 : if (slice != (text *) DatumGetPointer(str))
1036 0 : pfree(slice);
1037 22 : return cstring_to_text("");
1038 : }
1039 :
1040 : /*
1041 : * Adjust L1 and E1 now that we know the slice string length. Again
1042 : * remember that S1 is one based, and slice_start is zero based.
1043 : */
1044 789366 : if (L1 > -1)
1045 789306 : E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1046 : else
1047 60 : E1 = slice_start + 1 + slice_strlen;
1048 :
1049 : /*
1050 : * Find the start position in the slice; remember S1 is not zero based
1051 : */
1052 789366 : p = VARDATA_ANY(slice);
1053 5610148 : for (i = 0; i < S1 - 1; i++)
1054 4820782 : p += pg_mblen(p);
1055 :
1056 : /* hang onto a pointer to our start position */
1057 789366 : s = p;
1058 :
1059 : /*
1060 : * Count the actual bytes used by the substring of the requested
1061 : * length.
1062 : */
1063 9707316 : for (i = S1; i < E1; i++)
1064 8917950 : p += pg_mblen(p);
1065 :
1066 789366 : ret = (text *) palloc(VARHDRSZ + (p - s));
1067 789366 : SET_VARSIZE(ret, VARHDRSZ + (p - s));
1068 789366 : memcpy(VARDATA(ret), s, (p - s));
1069 :
1070 789366 : if (slice != (text *) DatumGetPointer(str))
1071 324 : pfree(slice);
1072 :
1073 789366 : return ret;
1074 : }
1075 : else
1076 0 : elog(ERROR, "invalid backend encoding: encoding max length < 1");
1077 :
1078 : /* not reached: suppress compiler warning */
1079 : return NULL;
1080 : }
1081 :
1082 : /*
1083 : * textoverlay
1084 : * Replace specified substring of first string with second
1085 : *
1086 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1087 : * This code is a direct implementation of what the standard says.
1088 : */
1089 : Datum
1090 28 : textoverlay(PG_FUNCTION_ARGS)
1091 : {
1092 28 : text *t1 = PG_GETARG_TEXT_PP(0);
1093 28 : text *t2 = PG_GETARG_TEXT_PP(1);
1094 28 : int sp = PG_GETARG_INT32(2); /* substring start position */
1095 28 : int sl = PG_GETARG_INT32(3); /* substring length */
1096 :
1097 28 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1098 : }
1099 :
1100 : Datum
1101 12 : textoverlay_no_len(PG_FUNCTION_ARGS)
1102 : {
1103 12 : text *t1 = PG_GETARG_TEXT_PP(0);
1104 12 : text *t2 = PG_GETARG_TEXT_PP(1);
1105 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
1106 : int sl;
1107 :
1108 12 : sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1109 12 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1110 : }
1111 :
1112 : static text *
1113 40 : text_overlay(text *t1, text *t2, int sp, int sl)
1114 : {
1115 : text *result;
1116 : text *s1;
1117 : text *s2;
1118 : int sp_pl_sl;
1119 :
1120 : /*
1121 : * Check for possible integer-overflow cases. For negative sp, throw a
1122 : * "substring length" error because that's what should be expected
1123 : * according to the spec's definition of OVERLAY().
1124 : */
1125 40 : if (sp <= 0)
1126 0 : ereport(ERROR,
1127 : (errcode(ERRCODE_SUBSTRING_ERROR),
1128 : errmsg("negative substring length not allowed")));
1129 40 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1130 0 : ereport(ERROR,
1131 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1132 : errmsg("integer out of range")));
1133 :
1134 40 : s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1135 40 : s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1136 40 : result = text_catenate(s1, t2);
1137 40 : result = text_catenate(result, s2);
1138 :
1139 40 : return result;
1140 : }
1141 :
1142 : /*
1143 : * textpos -
1144 : * Return the position of the specified substring.
1145 : * Implements the SQL POSITION() function.
1146 : * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1147 : * - thomas 1997-07-27
1148 : */
1149 : Datum
1150 106 : textpos(PG_FUNCTION_ARGS)
1151 : {
1152 106 : text *str = PG_GETARG_TEXT_PP(0);
1153 106 : text *search_str = PG_GETARG_TEXT_PP(1);
1154 :
1155 106 : PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1156 : }
1157 :
1158 : /*
1159 : * text_position -
1160 : * Does the real work for textpos()
1161 : *
1162 : * Inputs:
1163 : * t1 - string to be searched
1164 : * t2 - pattern to match within t1
1165 : * Result:
1166 : * Character index of the first matched char, starting from 1,
1167 : * or 0 if no match.
1168 : *
1169 : * This is broken out so it can be called directly by other string processing
1170 : * functions.
1171 : */
1172 : static int
1173 106 : text_position(text *t1, text *t2, Oid collid)
1174 : {
1175 : TextPositionState state;
1176 : int result;
1177 :
1178 : /* Empty needle always matches at position 1 */
1179 106 : if (VARSIZE_ANY_EXHDR(t2) < 1)
1180 12 : return 1;
1181 :
1182 : /* Otherwise, can't match if haystack is shorter than needle */
1183 94 : if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1184 22 : return 0;
1185 :
1186 72 : text_position_setup(t1, t2, collid, &state);
1187 72 : if (!text_position_next(&state))
1188 24 : result = 0;
1189 : else
1190 48 : result = text_position_get_match_pos(&state);
1191 72 : text_position_cleanup(&state);
1192 72 : return result;
1193 : }
1194 :
1195 :
1196 : /*
1197 : * text_position_setup, text_position_next, text_position_cleanup -
1198 : * Component steps of text_position()
1199 : *
1200 : * These are broken out so that a string can be efficiently searched for
1201 : * multiple occurrences of the same pattern. text_position_next may be
1202 : * called multiple times, and it advances to the next match on each call.
1203 : * text_position_get_match_ptr() and text_position_get_match_pos() return
1204 : * a pointer or 1-based character position of the last match, respectively.
1205 : *
1206 : * The "state" variable is normally just a local variable in the caller.
1207 : *
1208 : * NOTE: text_position_next skips over the matched portion. For example,
1209 : * searching for "xx" in "xxx" returns only one match, not two.
1210 : */
1211 :
1212 : static void
1213 2800 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1214 : {
1215 2800 : int len1 = VARSIZE_ANY_EXHDR(t1);
1216 2800 : int len2 = VARSIZE_ANY_EXHDR(t2);
1217 2800 : pg_locale_t mylocale = 0;
1218 :
1219 2800 : check_collation_set(collid);
1220 :
1221 2800 : if (!lc_collate_is_c(collid))
1222 280 : mylocale = pg_newlocale_from_collation(collid);
1223 :
1224 2800 : if (!pg_locale_deterministic(mylocale))
1225 12 : ereport(ERROR,
1226 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1227 : errmsg("nondeterministic collations are not supported for substring searches")));
1228 :
1229 : Assert(len1 > 0);
1230 : Assert(len2 > 0);
1231 :
1232 : /*
1233 : * Even with a multi-byte encoding, we perform the search using the raw
1234 : * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1235 : * because in UTF-8 the byte sequence of one character cannot contain
1236 : * another character. For other multi-byte encodings, we do the search
1237 : * initially as a simple byte search, ignoring multibyte issues, but
1238 : * verify afterwards that the match we found is at a character boundary,
1239 : * and continue the search if it was a false match.
1240 : */
1241 2788 : if (pg_database_encoding_max_length() == 1)
1242 86 : state->is_multibyte_char_in_char = false;
1243 2702 : else if (GetDatabaseEncoding() == PG_UTF8)
1244 2702 : state->is_multibyte_char_in_char = false;
1245 : else
1246 0 : state->is_multibyte_char_in_char = true;
1247 :
1248 2788 : state->str1 = VARDATA_ANY(t1);
1249 2788 : state->str2 = VARDATA_ANY(t2);
1250 2788 : state->len1 = len1;
1251 2788 : state->len2 = len2;
1252 2788 : state->last_match = NULL;
1253 2788 : state->refpoint = state->str1;
1254 2788 : state->refpos = 0;
1255 :
1256 : /*
1257 : * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1258 : * notes we use the terminology that the "haystack" is the string to be
1259 : * searched (t1) and the "needle" is the pattern being sought (t2).
1260 : *
1261 : * If the needle is empty or bigger than the haystack then there is no
1262 : * point in wasting cycles initializing the table. We also choose not to
1263 : * use B-M-H for needles of length 1, since the skip table can't possibly
1264 : * save anything in that case.
1265 : */
1266 2788 : if (len1 >= len2 && len2 > 1)
1267 : {
1268 2580 : int searchlength = len1 - len2;
1269 : int skiptablemask;
1270 : int last;
1271 : int i;
1272 2580 : const char *str2 = state->str2;
1273 :
1274 : /*
1275 : * First we must determine how much of the skip table to use. The
1276 : * declaration of TextPositionState allows up to 256 elements, but for
1277 : * short search problems we don't really want to have to initialize so
1278 : * many elements --- it would take too long in comparison to the
1279 : * actual search time. So we choose a useful skip table size based on
1280 : * the haystack length minus the needle length. The closer the needle
1281 : * length is to the haystack length the less useful skipping becomes.
1282 : *
1283 : * Note: since we use bit-masking to select table elements, the skip
1284 : * table size MUST be a power of 2, and so the mask must be 2^N-1.
1285 : */
1286 2580 : if (searchlength < 16)
1287 54 : skiptablemask = 3;
1288 2526 : else if (searchlength < 64)
1289 16 : skiptablemask = 7;
1290 2510 : else if (searchlength < 128)
1291 14 : skiptablemask = 15;
1292 2496 : else if (searchlength < 512)
1293 190 : skiptablemask = 31;
1294 2306 : else if (searchlength < 2048)
1295 2178 : skiptablemask = 63;
1296 128 : else if (searchlength < 4096)
1297 58 : skiptablemask = 127;
1298 : else
1299 70 : skiptablemask = 255;
1300 2580 : state->skiptablemask = skiptablemask;
1301 :
1302 : /*
1303 : * Initialize the skip table. We set all elements to the needle
1304 : * length, since this is the correct skip distance for any character
1305 : * not found in the needle.
1306 : */
1307 173964 : for (i = 0; i <= skiptablemask; i++)
1308 171384 : state->skiptable[i] = len2;
1309 :
1310 : /*
1311 : * Now examine the needle. For each character except the last one,
1312 : * set the corresponding table element to the appropriate skip
1313 : * distance. Note that when two characters share the same skip table
1314 : * entry, the one later in the needle must determine the skip
1315 : * distance.
1316 : */
1317 2580 : last = len2 - 1;
1318 :
1319 32412 : for (i = 0; i < last; i++)
1320 29832 : state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1321 : }
1322 2788 : }
1323 :
1324 : /*
1325 : * Advance to the next match, starting from the end of the previous match
1326 : * (or the beginning of the string, on first call). Returns true if a match
1327 : * is found.
1328 : *
1329 : * Note that this refuses to match an empty-string needle. Most callers
1330 : * will have handled that case specially and we'll never see it here.
1331 : */
1332 : static bool
1333 9598 : text_position_next(TextPositionState *state)
1334 : {
1335 9598 : int needle_len = state->len2;
1336 : char *start_ptr;
1337 : char *matchptr;
1338 :
1339 9598 : if (needle_len <= 0)
1340 0 : return false; /* result for empty pattern */
1341 :
1342 : /* Start from the point right after the previous match. */
1343 9598 : if (state->last_match)
1344 6798 : start_ptr = state->last_match + needle_len;
1345 : else
1346 2800 : start_ptr = state->str1;
1347 :
1348 9598 : retry:
1349 9598 : matchptr = text_position_next_internal(start_ptr, state);
1350 :
1351 9598 : if (!matchptr)
1352 2728 : return false;
1353 :
1354 : /*
1355 : * Found a match for the byte sequence. If this is a multibyte encoding,
1356 : * where one character's byte sequence can appear inside a longer
1357 : * multi-byte character, we need to verify that the match was at a
1358 : * character boundary, not in the middle of a multi-byte character.
1359 : */
1360 6870 : if (state->is_multibyte_char_in_char)
1361 : {
1362 : /* Walk one character at a time, until we reach the match. */
1363 :
1364 : /* the search should never move backwards. */
1365 : Assert(state->refpoint <= matchptr);
1366 :
1367 0 : while (state->refpoint < matchptr)
1368 : {
1369 : /* step to next character. */
1370 0 : state->refpoint += pg_mblen(state->refpoint);
1371 0 : state->refpos++;
1372 :
1373 : /*
1374 : * If we stepped over the match's start position, then it was a
1375 : * false positive, where the byte sequence appeared in the middle
1376 : * of a multi-byte character. Skip it, and continue the search at
1377 : * the next character boundary.
1378 : */
1379 0 : if (state->refpoint > matchptr)
1380 : {
1381 0 : start_ptr = state->refpoint;
1382 0 : goto retry;
1383 : }
1384 : }
1385 : }
1386 :
1387 6870 : state->last_match = matchptr;
1388 6870 : return true;
1389 : }
1390 :
1391 : /*
1392 : * Subroutine of text_position_next(). This searches for the raw byte
1393 : * sequence, ignoring any multi-byte encoding issues. Returns the first
1394 : * match starting at 'start_ptr', or NULL if no match is found.
1395 : */
1396 : static char *
1397 9598 : text_position_next_internal(char *start_ptr, TextPositionState *state)
1398 : {
1399 9598 : int haystack_len = state->len1;
1400 9598 : int needle_len = state->len2;
1401 9598 : int skiptablemask = state->skiptablemask;
1402 9598 : const char *haystack = state->str1;
1403 9598 : const char *needle = state->str2;
1404 9598 : const char *haystack_end = &haystack[haystack_len];
1405 : const char *hptr;
1406 :
1407 : Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1408 :
1409 9598 : if (needle_len == 1)
1410 : {
1411 : /* No point in using B-M-H for a one-character needle */
1412 754 : char nchar = *needle;
1413 :
1414 754 : hptr = start_ptr;
1415 5758 : while (hptr < haystack_end)
1416 : {
1417 5592 : if (*hptr == nchar)
1418 588 : return (char *) hptr;
1419 5004 : hptr++;
1420 : }
1421 : }
1422 : else
1423 : {
1424 8844 : const char *needle_last = &needle[needle_len - 1];
1425 :
1426 : /* Start at startpos plus the length of the needle */
1427 8844 : hptr = start_ptr + needle_len - 1;
1428 232916 : while (hptr < haystack_end)
1429 : {
1430 : /* Match the needle scanning *backward* */
1431 : const char *nptr;
1432 : const char *p;
1433 :
1434 230354 : nptr = needle_last;
1435 230354 : p = hptr;
1436 320786 : while (*nptr == *p)
1437 : {
1438 : /* Matched it all? If so, return 1-based position */
1439 96714 : if (nptr == needle)
1440 6282 : return (char *) p;
1441 90432 : nptr--, p--;
1442 : }
1443 :
1444 : /*
1445 : * No match, so use the haystack char at hptr to decide how far to
1446 : * advance. If the needle had any occurrence of that character
1447 : * (or more precisely, one sharing the same skiptable entry)
1448 : * before its last character, then we advance far enough to align
1449 : * the last such needle character with that haystack position.
1450 : * Otherwise we can advance by the whole needle length.
1451 : */
1452 224072 : hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1453 : }
1454 : }
1455 :
1456 2728 : return 0; /* not found */
1457 : }
1458 :
1459 : /*
1460 : * Return a pointer to the current match.
1461 : *
1462 : * The returned pointer points into the original haystack string.
1463 : */
1464 : static char *
1465 6792 : text_position_get_match_ptr(TextPositionState *state)
1466 : {
1467 6792 : return state->last_match;
1468 : }
1469 :
1470 : /*
1471 : * Return the offset of the current match.
1472 : *
1473 : * The offset is in characters, 1-based.
1474 : */
1475 : static int
1476 48 : text_position_get_match_pos(TextPositionState *state)
1477 : {
1478 : /* Convert the byte position to char position. */
1479 96 : state->refpos += pg_mbstrlen_with_len(state->refpoint,
1480 48 : state->last_match - state->refpoint);
1481 48 : state->refpoint = state->last_match;
1482 48 : return state->refpos + 1;
1483 : }
1484 :
1485 : /*
1486 : * Reset search state to the initial state installed by text_position_setup.
1487 : *
1488 : * The next call to text_position_next will search from the beginning
1489 : * of the string.
1490 : */
1491 : static void
1492 12 : text_position_reset(TextPositionState *state)
1493 : {
1494 12 : state->last_match = NULL;
1495 12 : state->refpoint = state->str1;
1496 12 : state->refpos = 0;
1497 12 : }
1498 :
1499 : static void
1500 2788 : text_position_cleanup(TextPositionState *state)
1501 : {
1502 : /* no cleanup needed */
1503 2788 : }
1504 :
1505 :
1506 : static void
1507 16557106 : check_collation_set(Oid collid)
1508 : {
1509 16557106 : if (!OidIsValid(collid))
1510 : {
1511 : /*
1512 : * This typically means that the parser could not resolve a conflict
1513 : * of implicit collations, so report it that way.
1514 : */
1515 30 : ereport(ERROR,
1516 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
1517 : errmsg("could not determine which collation to use for string comparison"),
1518 : errhint("Use the COLLATE clause to set the collation explicitly.")));
1519 : }
1520 16557076 : }
1521 :
1522 : /* varstr_cmp()
1523 : * Comparison function for text strings with given lengths.
1524 : * Includes locale support, but must copy strings to temporary memory
1525 : * to allow null-termination for inputs to strcoll().
1526 : * Returns an integer less than, equal to, or greater than zero, indicating
1527 : * whether arg1 is less than, equal to, or greater than arg2.
1528 : *
1529 : * Note: many functions that depend on this are marked leakproof; therefore,
1530 : * avoid reporting the actual contents of the input when throwing errors.
1531 : * All errors herein should be things that can't happen except on corrupt
1532 : * data, anyway; otherwise we will have trouble with indexing strings that
1533 : * would cause them.
1534 : */
1535 : int
1536 9273010 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1537 : {
1538 : int result;
1539 :
1540 9273010 : check_collation_set(collid);
1541 :
1542 : /*
1543 : * Unfortunately, there is no strncoll(), so in the non-C locale case we
1544 : * have to do some memory copying. This turns out to be significantly
1545 : * slower, so we optimize the case where LC_COLLATE is C. We also try to
1546 : * optimize relatively-short strings by avoiding palloc/pfree overhead.
1547 : */
1548 9272992 : if (lc_collate_is_c(collid))
1549 : {
1550 4921398 : result = memcmp(arg1, arg2, Min(len1, len2));
1551 4921398 : if ((result == 0) && (len1 != len2))
1552 151984 : result = (len1 < len2) ? -1 : 1;
1553 : }
1554 : else
1555 : {
1556 : pg_locale_t mylocale;
1557 :
1558 4351594 : mylocale = pg_newlocale_from_collation(collid);
1559 :
1560 : /*
1561 : * memcmp() can't tell us which of two unequal strings sorts first,
1562 : * but it's a cheap way to tell if they're equal. Testing shows that
1563 : * memcmp() followed by strcoll() is only trivially slower than
1564 : * strcoll() by itself, so we don't lose much if this doesn't work out
1565 : * very often, and if it does - for example, because there are many
1566 : * equal strings in the input - then we win big by avoiding expensive
1567 : * collation-aware comparisons.
1568 : */
1569 4351594 : if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1570 1723046 : return 0;
1571 :
1572 2628548 : result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
1573 :
1574 : /* Break tie if necessary. */
1575 2628548 : if (result == 0 && pg_locale_deterministic(mylocale))
1576 : {
1577 0 : result = memcmp(arg1, arg2, Min(len1, len2));
1578 0 : if ((result == 0) && (len1 != len2))
1579 0 : result = (len1 < len2) ? -1 : 1;
1580 : }
1581 : }
1582 :
1583 7549946 : return result;
1584 : }
1585 :
1586 : /* text_cmp()
1587 : * Internal comparison function for text strings.
1588 : * Returns -1, 0 or 1
1589 : */
1590 : static int
1591 7628144 : text_cmp(text *arg1, text *arg2, Oid collid)
1592 : {
1593 : char *a1p,
1594 : *a2p;
1595 : int len1,
1596 : len2;
1597 :
1598 7628144 : a1p = VARDATA_ANY(arg1);
1599 7628144 : a2p = VARDATA_ANY(arg2);
1600 :
1601 7628144 : len1 = VARSIZE_ANY_EXHDR(arg1);
1602 7628144 : len2 = VARSIZE_ANY_EXHDR(arg2);
1603 :
1604 7628144 : return varstr_cmp(a1p, len1, a2p, len2, collid);
1605 : }
1606 :
1607 : /*
1608 : * Comparison functions for text strings.
1609 : *
1610 : * Note: btree indexes need these routines not to leak memory; therefore,
1611 : * be careful to free working copies of toasted datums. Most places don't
1612 : * need to be so careful.
1613 : */
1614 :
1615 : Datum
1616 6835294 : texteq(PG_FUNCTION_ARGS)
1617 : {
1618 6835294 : Oid collid = PG_GET_COLLATION();
1619 6835294 : bool locale_is_c = false;
1620 6835294 : pg_locale_t mylocale = 0;
1621 : bool result;
1622 :
1623 6835294 : check_collation_set(collid);
1624 :
1625 6835294 : if (lc_collate_is_c(collid))
1626 160932 : locale_is_c = true;
1627 : else
1628 6674362 : mylocale = pg_newlocale_from_collation(collid);
1629 :
1630 6835294 : if (locale_is_c || pg_locale_deterministic(mylocale))
1631 6834810 : {
1632 6834810 : Datum arg1 = PG_GETARG_DATUM(0);
1633 6834810 : Datum arg2 = PG_GETARG_DATUM(1);
1634 : Size len1,
1635 : len2;
1636 :
1637 : /*
1638 : * Since we only care about equality or not-equality, we can avoid all
1639 : * the expense of strcoll() here, and just do bitwise comparison. In
1640 : * fact, we don't even have to do a bitwise comparison if we can show
1641 : * the lengths of the strings are unequal; which might save us from
1642 : * having to detoast one or both values.
1643 : */
1644 6834810 : len1 = toast_raw_datum_size(arg1);
1645 6834810 : len2 = toast_raw_datum_size(arg2);
1646 6834810 : if (len1 != len2)
1647 2501730 : result = false;
1648 : else
1649 : {
1650 4333080 : text *targ1 = DatumGetTextPP(arg1);
1651 4333080 : text *targ2 = DatumGetTextPP(arg2);
1652 :
1653 4333080 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1654 : len1 - VARHDRSZ) == 0);
1655 :
1656 4333080 : PG_FREE_IF_COPY(targ1, 0);
1657 4333080 : PG_FREE_IF_COPY(targ2, 1);
1658 : }
1659 : }
1660 : else
1661 : {
1662 484 : text *arg1 = PG_GETARG_TEXT_PP(0);
1663 484 : text *arg2 = PG_GETARG_TEXT_PP(1);
1664 :
1665 484 : result = (text_cmp(arg1, arg2, collid) == 0);
1666 :
1667 484 : PG_FREE_IF_COPY(arg1, 0);
1668 484 : PG_FREE_IF_COPY(arg2, 1);
1669 : }
1670 :
1671 6835294 : PG_RETURN_BOOL(result);
1672 : }
1673 :
1674 : Datum
1675 19416 : textne(PG_FUNCTION_ARGS)
1676 : {
1677 19416 : Oid collid = PG_GET_COLLATION();
1678 19416 : bool locale_is_c = false;
1679 19416 : pg_locale_t mylocale = 0;
1680 : bool result;
1681 :
1682 19416 : check_collation_set(collid);
1683 :
1684 19416 : if (lc_collate_is_c(collid))
1685 18 : locale_is_c = true;
1686 : else
1687 19398 : mylocale = pg_newlocale_from_collation(collid);
1688 :
1689 19416 : if (locale_is_c || pg_locale_deterministic(mylocale))
1690 19392 : {
1691 19392 : Datum arg1 = PG_GETARG_DATUM(0);
1692 19392 : Datum arg2 = PG_GETARG_DATUM(1);
1693 : Size len1,
1694 : len2;
1695 :
1696 : /* See comment in texteq() */
1697 19392 : len1 = toast_raw_datum_size(arg1);
1698 19392 : len2 = toast_raw_datum_size(arg2);
1699 19392 : if (len1 != len2)
1700 1818 : result = true;
1701 : else
1702 : {
1703 17574 : text *targ1 = DatumGetTextPP(arg1);
1704 17574 : text *targ2 = DatumGetTextPP(arg2);
1705 :
1706 17574 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1707 : len1 - VARHDRSZ) != 0);
1708 :
1709 17574 : PG_FREE_IF_COPY(targ1, 0);
1710 17574 : PG_FREE_IF_COPY(targ2, 1);
1711 : }
1712 : }
1713 : else
1714 : {
1715 24 : text *arg1 = PG_GETARG_TEXT_PP(0);
1716 24 : text *arg2 = PG_GETARG_TEXT_PP(1);
1717 :
1718 24 : result = (text_cmp(arg1, arg2, collid) != 0);
1719 :
1720 24 : PG_FREE_IF_COPY(arg1, 0);
1721 24 : PG_FREE_IF_COPY(arg2, 1);
1722 : }
1723 :
1724 19416 : PG_RETURN_BOOL(result);
1725 : }
1726 :
1727 : Datum
1728 123412 : text_lt(PG_FUNCTION_ARGS)
1729 : {
1730 123412 : text *arg1 = PG_GETARG_TEXT_PP(0);
1731 123412 : text *arg2 = PG_GETARG_TEXT_PP(1);
1732 : bool result;
1733 :
1734 123412 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1735 :
1736 123394 : PG_FREE_IF_COPY(arg1, 0);
1737 123394 : PG_FREE_IF_COPY(arg2, 1);
1738 :
1739 123394 : PG_RETURN_BOOL(result);
1740 : }
1741 :
1742 : Datum
1743 324354 : text_le(PG_FUNCTION_ARGS)
1744 : {
1745 324354 : text *arg1 = PG_GETARG_TEXT_PP(0);
1746 324354 : text *arg2 = PG_GETARG_TEXT_PP(1);
1747 : bool result;
1748 :
1749 324354 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1750 :
1751 324354 : PG_FREE_IF_COPY(arg1, 0);
1752 324354 : PG_FREE_IF_COPY(arg2, 1);
1753 :
1754 324354 : PG_RETURN_BOOL(result);
1755 : }
1756 :
1757 : Datum
1758 113646 : text_gt(PG_FUNCTION_ARGS)
1759 : {
1760 113646 : text *arg1 = PG_GETARG_TEXT_PP(0);
1761 113646 : text *arg2 = PG_GETARG_TEXT_PP(1);
1762 : bool result;
1763 :
1764 113646 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1765 :
1766 113646 : PG_FREE_IF_COPY(arg1, 0);
1767 113646 : PG_FREE_IF_COPY(arg2, 1);
1768 :
1769 113646 : PG_RETURN_BOOL(result);
1770 : }
1771 :
1772 : Datum
1773 184724 : text_ge(PG_FUNCTION_ARGS)
1774 : {
1775 184724 : text *arg1 = PG_GETARG_TEXT_PP(0);
1776 184724 : text *arg2 = PG_GETARG_TEXT_PP(1);
1777 : bool result;
1778 :
1779 184724 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1780 :
1781 184724 : PG_FREE_IF_COPY(arg1, 0);
1782 184724 : PG_FREE_IF_COPY(arg2, 1);
1783 :
1784 184724 : PG_RETURN_BOOL(result);
1785 : }
1786 :
1787 : Datum
1788 37914 : text_starts_with(PG_FUNCTION_ARGS)
1789 : {
1790 37914 : Datum arg1 = PG_GETARG_DATUM(0);
1791 37914 : Datum arg2 = PG_GETARG_DATUM(1);
1792 37914 : Oid collid = PG_GET_COLLATION();
1793 37914 : pg_locale_t mylocale = 0;
1794 : bool result;
1795 : Size len1,
1796 : len2;
1797 :
1798 37914 : check_collation_set(collid);
1799 :
1800 37914 : if (!lc_collate_is_c(collid))
1801 37914 : mylocale = pg_newlocale_from_collation(collid);
1802 :
1803 37914 : if (!pg_locale_deterministic(mylocale))
1804 0 : ereport(ERROR,
1805 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1806 : errmsg("nondeterministic collations are not supported for substring searches")));
1807 :
1808 37914 : len1 = toast_raw_datum_size(arg1);
1809 37914 : len2 = toast_raw_datum_size(arg2);
1810 37914 : if (len2 > len1)
1811 0 : result = false;
1812 : else
1813 : {
1814 37914 : text *targ1 = text_substring(arg1, 1, len2, false);
1815 37914 : text *targ2 = DatumGetTextPP(arg2);
1816 :
1817 37914 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1818 37914 : VARSIZE_ANY_EXHDR(targ2)) == 0);
1819 :
1820 37914 : PG_FREE_IF_COPY(targ1, 0);
1821 37914 : PG_FREE_IF_COPY(targ2, 1);
1822 : }
1823 :
1824 37914 : PG_RETURN_BOOL(result);
1825 : }
1826 :
1827 : Datum
1828 6565864 : bttextcmp(PG_FUNCTION_ARGS)
1829 : {
1830 6565864 : text *arg1 = PG_GETARG_TEXT_PP(0);
1831 6565864 : text *arg2 = PG_GETARG_TEXT_PP(1);
1832 : int32 result;
1833 :
1834 6565864 : result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1835 :
1836 6565864 : PG_FREE_IF_COPY(arg1, 0);
1837 6565864 : PG_FREE_IF_COPY(arg2, 1);
1838 :
1839 6565864 : PG_RETURN_INT32(result);
1840 : }
1841 :
1842 : Datum
1843 87410 : bttextsortsupport(PG_FUNCTION_ARGS)
1844 : {
1845 87410 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1846 87410 : Oid collid = ssup->ssup_collation;
1847 : MemoryContext oldcontext;
1848 :
1849 87410 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1850 :
1851 : /* Use generic string SortSupport */
1852 87410 : varstr_sortsupport(ssup, TEXTOID, collid);
1853 :
1854 87398 : MemoryContextSwitchTo(oldcontext);
1855 :
1856 87398 : PG_RETURN_VOID();
1857 : }
1858 :
1859 : /*
1860 : * Generic sortsupport interface for character type's operator classes.
1861 : * Includes locale support, and support for BpChar semantics (i.e. removing
1862 : * trailing spaces before comparison).
1863 : *
1864 : * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1865 : * same representation. Callers that always use the C collation (e.g.
1866 : * non-collatable type callers like bytea) may have NUL bytes in their strings;
1867 : * this will not work with any other collation, though.
1868 : */
1869 : void
1870 163398 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
1871 : {
1872 163398 : bool abbreviate = ssup->abbreviate;
1873 163398 : bool collate_c = false;
1874 : VarStringSortSupport *sss;
1875 163398 : pg_locale_t locale = 0;
1876 :
1877 163398 : check_collation_set(collid);
1878 :
1879 : /*
1880 : * If possible, set ssup->comparator to a function which can be used to
1881 : * directly compare two datums. If we can do this, we'll avoid the
1882 : * overhead of a trip through the fmgr layer for every comparison, which
1883 : * can be substantial.
1884 : *
1885 : * Most typically, we'll set the comparator to varlenafastcmp_locale,
1886 : * which uses strcoll() to perform comparisons. We use that for the
1887 : * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1888 : * LC_COLLATE = C, we can make things quite a bit faster with
1889 : * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1890 : * memcmp() rather than strcoll().
1891 : */
1892 163386 : if (lc_collate_is_c(collid))
1893 : {
1894 112072 : if (typid == BPCHAROID)
1895 22 : ssup->comparator = bpcharfastcmp_c;
1896 112050 : else if (typid == NAMEOID)
1897 : {
1898 75038 : ssup->comparator = namefastcmp_c;
1899 : /* Not supporting abbreviation with type NAME, for now */
1900 75038 : abbreviate = false;
1901 : }
1902 : else
1903 37012 : ssup->comparator = varstrfastcmp_c;
1904 :
1905 112072 : collate_c = true;
1906 : }
1907 : else
1908 : {
1909 : /*
1910 : * We need a collation-sensitive comparison. To make things faster,
1911 : * we'll figure out the collation based on the locale id and cache the
1912 : * result.
1913 : */
1914 51314 : locale = pg_newlocale_from_collation(collid);
1915 :
1916 : /*
1917 : * We use varlenafastcmp_locale except for type NAME.
1918 : */
1919 51314 : if (typid == NAMEOID)
1920 : {
1921 0 : ssup->comparator = namefastcmp_locale;
1922 : /* Not supporting abbreviation with type NAME, for now */
1923 0 : abbreviate = false;
1924 : }
1925 : else
1926 51314 : ssup->comparator = varlenafastcmp_locale;
1927 : }
1928 :
1929 : /*
1930 : * Unfortunately, it seems that abbreviation for non-C collations is
1931 : * broken on many common platforms; see pg_strxfrm_enabled().
1932 : *
1933 : * Even apart from the risk of broken locales, it's possible that there
1934 : * are platforms where the use of abbreviated keys should be disabled at
1935 : * compile time. Having only 4 byte datums could make worst-case
1936 : * performance drastically more likely, for example. Moreover, macOS's
1937 : * strxfrm() implementation is known to not effectively concentrate a
1938 : * significant amount of entropy from the original string in earlier
1939 : * transformed blobs. It's possible that other supported platforms are
1940 : * similarly encumbered. So, if we ever get past disabling this
1941 : * categorically, we may still want or need to disable it for particular
1942 : * platforms.
1943 : */
1944 163386 : if (!collate_c && !pg_strxfrm_enabled(locale))
1945 0 : abbreviate = false;
1946 :
1947 : /*
1948 : * If we're using abbreviated keys, or if we're using a locale-aware
1949 : * comparison, we need to initialize a VarStringSortSupport object. Both
1950 : * cases will make use of the temporary buffers we initialize here for
1951 : * scratch space (and to detect requirement for BpChar semantics from
1952 : * caller), and the abbreviation case requires additional state.
1953 : */
1954 163386 : if (abbreviate || !collate_c)
1955 : {
1956 54858 : sss = palloc(sizeof(VarStringSortSupport));
1957 54858 : sss->buf1 = palloc(TEXTBUFLEN);
1958 54858 : sss->buflen1 = TEXTBUFLEN;
1959 54858 : sss->buf2 = palloc(TEXTBUFLEN);
1960 54858 : sss->buflen2 = TEXTBUFLEN;
1961 : /* Start with invalid values */
1962 54858 : sss->last_len1 = -1;
1963 54858 : sss->last_len2 = -1;
1964 : /* Initialize */
1965 54858 : sss->last_returned = 0;
1966 54858 : sss->locale = locale;
1967 :
1968 : /*
1969 : * To avoid somehow confusing a strxfrm() blob and an original string,
1970 : * constantly keep track of the variety of data that buf1 and buf2
1971 : * currently contain.
1972 : *
1973 : * Comparisons may be interleaved with conversion calls. Frequently,
1974 : * conversions and comparisons are batched into two distinct phases,
1975 : * but the correctness of caching cannot hinge upon this. For
1976 : * comparison caching, buffer state is only trusted if cache_blob is
1977 : * found set to false, whereas strxfrm() caching only trusts the state
1978 : * when cache_blob is found set to true.
1979 : *
1980 : * Arbitrarily initialize cache_blob to true.
1981 : */
1982 54858 : sss->cache_blob = true;
1983 54858 : sss->collate_c = collate_c;
1984 54858 : sss->typid = typid;
1985 54858 : ssup->ssup_extra = sss;
1986 :
1987 : /*
1988 : * If possible, plan to use the abbreviated keys optimization. The
1989 : * core code may switch back to authoritative comparator should
1990 : * abbreviation be aborted.
1991 : */
1992 54858 : if (abbreviate)
1993 : {
1994 47492 : sss->prop_card = 0.20;
1995 47492 : initHyperLogLog(&sss->abbr_card, 10);
1996 47492 : initHyperLogLog(&sss->full_card, 10);
1997 47492 : ssup->abbrev_full_comparator = ssup->comparator;
1998 47492 : ssup->comparator = ssup_datum_unsigned_cmp;
1999 47492 : ssup->abbrev_converter = varstr_abbrev_convert;
2000 47492 : ssup->abbrev_abort = varstr_abbrev_abort;
2001 : }
2002 : }
2003 163386 : }
2004 :
2005 : /*
2006 : * sortsupport comparison func (for C locale case)
2007 : */
2008 : static int
2009 114424040 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2010 : {
2011 114424040 : VarString *arg1 = DatumGetVarStringPP(x);
2012 114424040 : VarString *arg2 = DatumGetVarStringPP(y);
2013 : char *a1p,
2014 : *a2p;
2015 : int len1,
2016 : len2,
2017 : result;
2018 :
2019 114424040 : a1p = VARDATA_ANY(arg1);
2020 114424040 : a2p = VARDATA_ANY(arg2);
2021 :
2022 114424040 : len1 = VARSIZE_ANY_EXHDR(arg1);
2023 114424040 : len2 = VARSIZE_ANY_EXHDR(arg2);
2024 :
2025 114424040 : result = memcmp(a1p, a2p, Min(len1, len2));
2026 114424040 : if ((result == 0) && (len1 != len2))
2027 2763736 : result = (len1 < len2) ? -1 : 1;
2028 :
2029 : /* We can't afford to leak memory here. */
2030 114424040 : if (PointerGetDatum(arg1) != x)
2031 0 : pfree(arg1);
2032 114424040 : if (PointerGetDatum(arg2) != y)
2033 0 : pfree(arg2);
2034 :
2035 114424040 : return result;
2036 : }
2037 :
2038 : /*
2039 : * sortsupport comparison func (for BpChar C locale case)
2040 : *
2041 : * BpChar outsources its sortsupport to this module. Specialization for the
2042 : * varstr_sortsupport BpChar case, modeled on
2043 : * internal_bpchar_pattern_compare().
2044 : */
2045 : static int
2046 16 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2047 : {
2048 16 : BpChar *arg1 = DatumGetBpCharPP(x);
2049 16 : BpChar *arg2 = DatumGetBpCharPP(y);
2050 : char *a1p,
2051 : *a2p;
2052 : int len1,
2053 : len2,
2054 : result;
2055 :
2056 16 : a1p = VARDATA_ANY(arg1);
2057 16 : a2p = VARDATA_ANY(arg2);
2058 :
2059 16 : len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2060 16 : len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2061 :
2062 16 : result = memcmp(a1p, a2p, Min(len1, len2));
2063 16 : if ((result == 0) && (len1 != len2))
2064 0 : result = (len1 < len2) ? -1 : 1;
2065 :
2066 : /* We can't afford to leak memory here. */
2067 16 : if (PointerGetDatum(arg1) != x)
2068 0 : pfree(arg1);
2069 16 : if (PointerGetDatum(arg2) != y)
2070 0 : pfree(arg2);
2071 :
2072 16 : return result;
2073 : }
2074 :
2075 : /*
2076 : * sortsupport comparison func (for NAME C locale case)
2077 : */
2078 : static int
2079 118524664 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2080 : {
2081 118524664 : Name arg1 = DatumGetName(x);
2082 118524664 : Name arg2 = DatumGetName(y);
2083 :
2084 118524664 : return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2085 : }
2086 :
2087 : /*
2088 : * sortsupport comparison func (for locale case with all varlena types)
2089 : */
2090 : static int
2091 37242482 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2092 : {
2093 37242482 : VarString *arg1 = DatumGetVarStringPP(x);
2094 37242482 : VarString *arg2 = DatumGetVarStringPP(y);
2095 : char *a1p,
2096 : *a2p;
2097 : int len1,
2098 : len2,
2099 : result;
2100 :
2101 37242482 : a1p = VARDATA_ANY(arg1);
2102 37242482 : a2p = VARDATA_ANY(arg2);
2103 :
2104 37242482 : len1 = VARSIZE_ANY_EXHDR(arg1);
2105 37242482 : len2 = VARSIZE_ANY_EXHDR(arg2);
2106 :
2107 37242482 : result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2108 :
2109 : /* We can't afford to leak memory here. */
2110 37242482 : if (PointerGetDatum(arg1) != x)
2111 6 : pfree(arg1);
2112 37242482 : if (PointerGetDatum(arg2) != y)
2113 6 : pfree(arg2);
2114 :
2115 37242482 : return result;
2116 : }
2117 :
2118 : /*
2119 : * sortsupport comparison func (for locale case with NAME type)
2120 : */
2121 : static int
2122 0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2123 : {
2124 0 : Name arg1 = DatumGetName(x);
2125 0 : Name arg2 = DatumGetName(y);
2126 :
2127 0 : return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2128 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2129 : ssup);
2130 : }
2131 :
/*
 * sortsupport comparison func for locale cases
 *
 * a1p/len1 and a2p/len2 describe the two strings to compare; they need not
 * be NUL-terminated.  ssup->ssup_extra must point to the VarStringSortSupport
 * state, whose buf1/buf2 scratch buffers are used to hold NUL-terminated
 * copies of the inputs for pg_strcoll().
 *
 * The buffers double as a one-entry cache: if both inputs are byte-identical
 * to what the buffers already hold (and the buffers hold original strings
 * rather than a strxfrm() blob, i.e. cache_blob is false), the previously
 * returned result is handed back without calling pg_strcoll() again.
 *
 * Returns <0, 0 or >0.
 */
static int
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
{
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	int			result;
	bool		arg1_match;

	/* Fast pre-check for equality, as discussed in varstr_cmp() */
	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
	{
		/*
		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
		 * last_len2.  Existing contents of buffers might still be used by
		 * next call.
		 *
		 * It's fine to allow the comparison of BpChar padding bytes here,
		 * even though that implies that the memcmp() will usually be
		 * performed for BpChar callers (though multibyte characters could
		 * still prevent that from occurring).  The memcmp() is still very
		 * cheap, and BpChar's funny semantics have us remove trailing spaces
		 * (not limited to padding), so we need make no distinction between
		 * padding space characters and "real" space characters.
		 */
		return 0;
	}

	if (sss->typid == BPCHAROID)
	{
		/* Get true number of bytes, ignoring trailing spaces */
		len1 = bpchartruelen(a1p, len1);
		len2 = bpchartruelen(a2p, len2);
	}

	/*
	 * Make sure each scratch buffer can hold its string plus a terminating
	 * NUL byte (hence ">=" and "len + 1"), growing it if necessary.
	 */
	if (len1 >= sss->buflen1)
	{
		sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
		sss->buf1 = repalloc(sss->buf1, sss->buflen1);
	}
	if (len2 >= sss->buflen2)
	{
		sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
		sss->buf2 = repalloc(sss->buf2, sss->buflen2);
	}

	/*
	 * We're likely to be asked to compare the same strings repeatedly, and
	 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
	 * comparisons, even though in general there is no reason to think that
	 * that will work out (every string datum may be unique).  Caching does
	 * not slow things down measurably when it doesn't work out, and can speed
	 * things up by rather a lot when it does.  In part, this is because the
	 * memcmp() compares data from cachelines that are needed in L1 cache even
	 * when the last comparison's result cannot be reused.
	 */
	arg1_match = true;
	if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
	{
		/* buf1 does not already hold this string: copy it in */
		arg1_match = false;
		memcpy(sss->buf1, a1p, len1);
		sss->buf1[len1] = '\0';
		sss->last_len1 = len1;
	}

	/*
	 * If we're comparing the same two strings as last time, we can return the
	 * same answer without calling strcoll() again.  This is more likely than
	 * it seems (at least with moderate to low cardinality sets), because
	 * quicksort compares the same pivot against many values.
	 */
	if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
	{
		/* buf2 does not already hold this string: copy it in */
		memcpy(sss->buf2, a2p, len2);
		sss->buf2[len2] = '\0';
		sss->last_len2 = len2;
	}
	else if (arg1_match && !sss->cache_blob)
	{
		/* Use result cached following last actual strcoll() call */
		return sss->last_returned;
	}

	result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);

	/* Break tie if necessary. */
	if (result == 0 && pg_locale_deterministic(sss->locale))
		result = strcmp(sss->buf1, sss->buf2);

	/* Cache result, perhaps saving an expensive strcoll() call next time */
	sss->cache_blob = false;
	sss->last_returned = result;
	return result;
}
2227 :
/*
 * Conversion routine for sortsupport.  Converts original to abbreviated key
 * representation.  Our encoding strategy is simple -- pack the first 8 bytes
 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
 * stored in reverse order), and treat it as an unsigned integer.  When the "C"
 * locale is used, or in case of bytea, just memcpy() from original instead.
 *
 * "original" is the authoritative datum; ssup->ssup_extra must point to the
 * VarStringSortSupport state.  Besides producing the abbreviated key, this
 * feeds hashes of both the original and the abbreviated key into the
 * HyperLogLog counters that varstr_abbrev_abort() consults.
 *
 * In the non-C case buf1 holds a NUL-terminated copy of the original and
 * buf2 holds the transformed blob, with cache_blob set to true so that a
 * following call on an identical string can reuse the blob.
 */
static Datum
varstr_abbrev_convert(Datum original, SortSupport ssup)
{
	const size_t max_prefix_bytes = sizeof(Datum);
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	VarString  *authoritative = DatumGetVarStringPP(original);
	char	   *authoritative_data = VARDATA_ANY(authoritative);

	/* working state */
	Datum		res;
	char	   *pres;
	int			len;
	uint32		hash;

	/* "pres" lets us fill the result Datum bytewise */
	pres = (char *) &res;
	/* memset(), so any non-overwritten bytes are NUL */
	memset(pres, 0, max_prefix_bytes);
	len = VARSIZE_ANY_EXHDR(authoritative);

	/* Get number of bytes, ignoring trailing spaces */
	if (sss->typid == BPCHAROID)
		len = bpchartruelen(authoritative_data, len);

	/*
	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
	 * abbreviate keys.  The full comparator for the C locale is always
	 * memcmp().  It would be incorrect to allow bytea callers (callers that
	 * always force the C collation -- bytea isn't a collatable type, but this
	 * approach is convenient) to use strxfrm().  This is because bytea
	 * strings may contain NUL bytes.  Besides, this should be faster, too.
	 *
	 * More generally, it's okay that bytea callers can have NUL bytes in
	 * strings because abbreviated cmp need not make a distinction between
	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
	 * authoritative representation.  Hopefully a comparison at or past one
	 * abbreviated key's terminating NUL byte will resolve the comparison
	 * without consulting the authoritative representation; specifically, some
	 * later non-NUL byte in the longer string can resolve the comparison
	 * against a subsequent terminating NUL in the shorter string.  There will
	 * usually be what is effectively a "length-wise" resolution there and
	 * then.
	 *
	 * If that doesn't work out -- if all bytes in the longer string
	 * positioned at or past the offset of the smaller string's (first)
	 * terminating NUL are actually representative of NUL bytes in the
	 * authoritative binary string (perhaps with some *terminating* NUL bytes
	 * towards the end of the longer string iff it happens to still be small)
	 * -- then an authoritative tie-breaker will happen, and do the right
	 * thing: explicitly consider string length.
	 */
	if (sss->collate_c)
		memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
	else
	{
		Size		bsize;

		/*
		 * We're not using the C collation, so fall back on strxfrm or ICU
		 * analogs.
		 */

		/* By convention, we use buffer 1 to store and NUL-terminate */
		if (len >= sss->buflen1)
		{
			sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
			sss->buf1 = repalloc(sss->buf1, sss->buflen1);
		}

		/* Might be able to reuse strxfrm() blob from last call */
		if (sss->last_len1 == len && sss->cache_blob &&
			memcmp(sss->buf1, authoritative_data, len) == 0)
		{
			/* buf2 still holds the blob; last_len2 is its length */
			memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
			/* No change affecting cardinality, so no hashing required */
			goto done;
		}

		memcpy(sss->buf1, authoritative_data, len);

		/*
		 * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
		 */
		sss->buf1[len] = '\0';
		sss->last_len1 = len;

		if (pg_strxfrm_prefix_enabled(sss->locale))
		{
			/* Only max_prefix_bytes of output are requested here */
			if (sss->buflen2 < max_prefix_bytes)
			{
				sss->buflen2 = Max(max_prefix_bytes,
								   Min(sss->buflen2 * 2, MaxAllocSize));
				sss->buf2 = repalloc(sss->buf2, sss->buflen2);
			}

			bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
									  max_prefix_bytes, sss->locale);
			sss->last_len2 = bsize;
		}
		else
		{
			/*
			 * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
			 * again.  The pg_strxfrm() function leaves the result buffer
			 * content undefined if the result did not fit, so we need to
			 * retry until everything fits, even though we only need the first
			 * few bytes in the end.
			 */
			for (;;)
			{
				bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
								   sss->locale);

				sss->last_len2 = bsize;
				if (bsize < sss->buflen2)
					break;

				/*
				 * Grow buffer and retry.
				 */
				sss->buflen2 = Max(bsize + 1,
								   Min(sss->buflen2 * 2, MaxAllocSize));
				sss->buf2 = repalloc(sss->buf2, sss->buflen2);
			}
		}

		/*
		 * Every Datum byte is always compared.  This is safe because the
		 * strxfrm() blob is itself NUL terminated, leaving no danger of
		 * misinterpreting any NUL bytes not intended to be interpreted as
		 * logically representing termination.
		 *
		 * (Actually, even if there were NUL bytes in the blob it would be
		 * okay.  See remarks on bytea case above.)
		 */
		memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
	}

	/*
	 * Maintain approximate cardinality of both abbreviated keys and original,
	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
	 * the worst case, where we do many string transformations for no saving
	 * in full strcoll()-based comparisons.  These statistics are used by
	 * varstr_abbrev_abort().
	 *
	 * First, Hash key proper, or a significant fraction of it.  Mix in length
	 * in order to compensate for cases where differences are past
	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
	 */
	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
								   Min(len, PG_CACHE_LINE_SIZE)));

	if (len > PG_CACHE_LINE_SIZE)
		hash ^= DatumGetUInt32(hash_uint32((uint32) len));

	addHyperLogLog(&sss->full_card, hash);

	/* Hash abbreviated key */
#if SIZEOF_DATUM == 8
	{
		uint32		lohalf,
					hihalf;

		/* Fold the two 32-bit halves together before hashing */
		lohalf = (uint32) res;
		hihalf = (uint32) (res >> 32);
		hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
	}
#else							/* SIZEOF_DATUM != 8 */
	hash = DatumGetUInt32(hash_uint32((uint32) res));
#endif

	addHyperLogLog(&sss->abbr_card, hash);

	/* Cache result, perhaps saving an expensive strxfrm() call next time */
	sss->cache_blob = true;
done:

	/*
	 * Byteswap on little-endian machines.
	 *
	 * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
	 * 3-way comparator) works correctly on all platforms.  If we didn't do
	 * this, the comparator would have to call memcmp() with a pair of
	 * pointers to the first byte of each abbreviated key, which is slower.
	 */
	res = DatumBigEndianToNative(res);

	/* Don't leak memory here */
	if (PointerGetDatum(authoritative) != original)
		pfree(authoritative);

	return res;
}
2427 :
2428 : /*
2429 : * Callback for estimating effectiveness of abbreviated key optimization, using
2430 : * heuristic rules. Returns value indicating if the abbreviation optimization
2431 : * should be aborted, based on its projected effectiveness.
2432 : */
2433 : static bool
2434 3744 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2435 : {
2436 3744 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2437 : double abbrev_distinct,
2438 : key_distinct;
2439 :
2440 : Assert(ssup->abbreviate);
2441 :
2442 : /* Have a little patience */
2443 3744 : if (memtupcount < 100)
2444 2300 : return false;
2445 :
2446 1444 : abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2447 1444 : key_distinct = estimateHyperLogLog(&sss->full_card);
2448 :
2449 : /*
2450 : * Clamp cardinality estimates to at least one distinct value. While
2451 : * NULLs are generally disregarded, if only NULL values were seen so far,
2452 : * that might misrepresent costs if we failed to clamp.
2453 : */
2454 1444 : if (abbrev_distinct <= 1.0)
2455 0 : abbrev_distinct = 1.0;
2456 :
2457 1444 : if (key_distinct <= 1.0)
2458 0 : key_distinct = 1.0;
2459 :
2460 : /*
2461 : * In the worst case all abbreviated keys are identical, while at the same
2462 : * time there are differences within full key strings not captured in
2463 : * abbreviations.
2464 : */
2465 : #ifdef TRACE_SORT
2466 1444 : if (trace_sort)
2467 : {
2468 0 : double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2469 :
2470 0 : elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2471 : "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2472 : memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2473 : sss->prop_card);
2474 : }
2475 : #endif
2476 :
2477 : /*
2478 : * If the number of distinct abbreviated keys approximately matches the
2479 : * number of distinct authoritative original keys, that's reason enough to
2480 : * proceed. We can win even with a very low cardinality set if most
2481 : * tie-breakers only memcmp(). This is by far the most important
2482 : * consideration.
2483 : *
2484 : * While comparisons that are resolved at the abbreviated key level are
2485 : * considerably cheaper than tie-breakers resolved with memcmp(), both of
2486 : * those two outcomes are so much cheaper than a full strcoll() once
2487 : * sorting is underway that it doesn't seem worth it to weigh abbreviated
2488 : * cardinality against the overall size of the set in order to more
2489 : * accurately model costs. Assume that an abbreviated comparison, and an
2490 : * abbreviated comparison with a cheap memcmp()-based authoritative
2491 : * resolution are equivalent.
2492 : */
2493 1444 : if (abbrev_distinct > key_distinct * sss->prop_card)
2494 : {
2495 : /*
2496 : * When we have exceeded 10,000 tuples, decay required cardinality
2497 : * aggressively for next call.
2498 : *
2499 : * This is useful because the number of comparisons required on
2500 : * average increases at a linearithmic rate, and at roughly 10,000
2501 : * tuples that factor will start to dominate over the linear costs of
2502 : * string transformation (this is a conservative estimate). The decay
2503 : * rate is chosen to be a little less aggressive than halving -- which
2504 : * (since we're called at points at which memtupcount has doubled)
2505 : * would never see the cost model actually abort past the first call
2506 : * following a decay. This decay rate is mostly a precaution against
2507 : * a sudden, violent swing in how well abbreviated cardinality tracks
2508 : * full key cardinality. The decay also serves to prevent a marginal
2509 : * case from being aborted too late, when too much has already been
2510 : * invested in string transformation.
2511 : *
2512 : * It's possible for sets of several million distinct strings with
2513 : * mere tens of thousands of distinct abbreviated keys to still
2514 : * benefit very significantly. This will generally occur provided
2515 : * each abbreviated key is a proxy for a roughly uniform number of the
2516 : * set's full keys. If it isn't so, we hope to catch that early and
2517 : * abort. If it isn't caught early, by the time the problem is
2518 : * apparent it's probably not worth aborting.
2519 : */
2520 1386 : if (memtupcount > 10000)
2521 6 : sss->prop_card *= 0.65;
2522 :
2523 1386 : return false;
2524 : }
2525 :
2526 : /*
2527 : * Abort abbreviation strategy.
2528 : *
2529 : * The worst case, where all abbreviated keys are identical while all
2530 : * original strings differ will typically only see a regression of about
2531 : * 10% in execution time for small to medium sized lists of strings.
2532 : * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2533 : * often expect very large improvements, particularly with sets of strings
2534 : * of moderately high to high abbreviated cardinality. There is little to
2535 : * lose but much to gain, which our strategy reflects.
2536 : */
2537 : #ifdef TRACE_SORT
2538 58 : if (trace_sort)
2539 0 : elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2540 : "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2541 : memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2542 : #endif
2543 :
2544 58 : return true;
2545 : }
2546 :
2547 : /*
2548 : * Generic equalimage support function for character type's operator classes.
2549 : * Disables the use of deduplication with nondeterministic collations.
2550 : */
2551 : Datum
2552 26052 : btvarstrequalimage(PG_FUNCTION_ARGS)
2553 : {
2554 : /* Oid opcintype = PG_GETARG_OID(0); */
2555 26052 : Oid collid = PG_GET_COLLATION();
2556 :
2557 26052 : check_collation_set(collid);
2558 :
2559 26052 : if (lc_collate_is_c(collid) ||
2560 44 : collid == DEFAULT_COLLATION_OID ||
2561 44 : get_collation_isdeterministic(collid))
2562 26032 : PG_RETURN_BOOL(true);
2563 : else
2564 20 : PG_RETURN_BOOL(false);
2565 : }
2566 :
2567 : Datum
2568 229560 : text_larger(PG_FUNCTION_ARGS)
2569 : {
2570 229560 : text *arg1 = PG_GETARG_TEXT_PP(0);
2571 229560 : text *arg2 = PG_GETARG_TEXT_PP(1);
2572 : text *result;
2573 :
2574 229560 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2575 :
2576 229560 : PG_RETURN_TEXT_P(result);
2577 : }
2578 :
2579 : Datum
2580 86076 : text_smaller(PG_FUNCTION_ARGS)
2581 : {
2582 86076 : text *arg1 = PG_GETARG_TEXT_PP(0);
2583 86076 : text *arg2 = PG_GETARG_TEXT_PP(1);
2584 : text *result;
2585 :
2586 86076 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2587 :
2588 86076 : PG_RETURN_TEXT_P(result);
2589 : }
2590 :
2591 :
2592 : /*
2593 : * Cross-type comparison functions for types text and name.
2594 : */
2595 :
2596 : Datum
2597 192382 : nameeqtext(PG_FUNCTION_ARGS)
2598 : {
2599 192382 : Name arg1 = PG_GETARG_NAME(0);
2600 192382 : text *arg2 = PG_GETARG_TEXT_PP(1);
2601 192382 : size_t len1 = strlen(NameStr(*arg1));
2602 192382 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2603 192382 : Oid collid = PG_GET_COLLATION();
2604 : bool result;
2605 :
2606 192382 : check_collation_set(collid);
2607 :
2608 192382 : if (collid == C_COLLATION_OID)
2609 291990 : result = (len1 == len2 &&
2610 129508 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2611 : else
2612 29900 : result = (varstr_cmp(NameStr(*arg1), len1,
2613 29900 : VARDATA_ANY(arg2), len2,
2614 : collid) == 0);
2615 :
2616 192382 : PG_FREE_IF_COPY(arg2, 1);
2617 :
2618 192382 : PG_RETURN_BOOL(result);
2619 : }
2620 :
2621 : Datum
2622 6786 : texteqname(PG_FUNCTION_ARGS)
2623 : {
2624 6786 : text *arg1 = PG_GETARG_TEXT_PP(0);
2625 6786 : Name arg2 = PG_GETARG_NAME(1);
2626 6786 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2627 6786 : size_t len2 = strlen(NameStr(*arg2));
2628 6786 : Oid collid = PG_GET_COLLATION();
2629 : bool result;
2630 :
2631 6786 : check_collation_set(collid);
2632 :
2633 6786 : if (collid == C_COLLATION_OID)
2634 564 : result = (len1 == len2 &&
2635 180 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2636 : else
2637 6402 : result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2638 6402 : NameStr(*arg2), len2,
2639 : collid) == 0);
2640 :
2641 6786 : PG_FREE_IF_COPY(arg1, 0);
2642 :
2643 6786 : PG_RETURN_BOOL(result);
2644 : }
2645 :
2646 : Datum
2647 36 : namenetext(PG_FUNCTION_ARGS)
2648 : {
2649 36 : Name arg1 = PG_GETARG_NAME(0);
2650 36 : text *arg2 = PG_GETARG_TEXT_PP(1);
2651 36 : size_t len1 = strlen(NameStr(*arg1));
2652 36 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2653 36 : Oid collid = PG_GET_COLLATION();
2654 : bool result;
2655 :
2656 36 : check_collation_set(collid);
2657 :
2658 36 : if (collid == C_COLLATION_OID)
2659 18 : result = !(len1 == len2 &&
2660 0 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2661 : else
2662 18 : result = !(varstr_cmp(NameStr(*arg1), len1,
2663 18 : VARDATA_ANY(arg2), len2,
2664 : collid) == 0);
2665 :
2666 36 : PG_FREE_IF_COPY(arg2, 1);
2667 :
2668 36 : PG_RETURN_BOOL(result);
2669 : }
2670 :
2671 : Datum
2672 18 : textnename(PG_FUNCTION_ARGS)
2673 : {
2674 18 : text *arg1 = PG_GETARG_TEXT_PP(0);
2675 18 : Name arg2 = PG_GETARG_NAME(1);
2676 18 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2677 18 : size_t len2 = strlen(NameStr(*arg2));
2678 18 : Oid collid = PG_GET_COLLATION();
2679 : bool result;
2680 :
2681 18 : check_collation_set(collid);
2682 :
2683 18 : if (collid == C_COLLATION_OID)
2684 0 : result = !(len1 == len2 &&
2685 0 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2686 : else
2687 18 : result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2688 18 : NameStr(*arg2), len2,
2689 : collid) == 0);
2690 :
2691 18 : PG_FREE_IF_COPY(arg1, 0);
2692 :
2693 18 : PG_RETURN_BOOL(result);
2694 : }
2695 :
2696 : Datum
2697 148574 : btnametextcmp(PG_FUNCTION_ARGS)
2698 : {
2699 148574 : Name arg1 = PG_GETARG_NAME(0);
2700 148574 : text *arg2 = PG_GETARG_TEXT_PP(1);
2701 : int32 result;
2702 :
2703 297148 : result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2704 297148 : VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2705 : PG_GET_COLLATION());
2706 :
2707 148574 : PG_FREE_IF_COPY(arg2, 1);
2708 :
2709 148574 : PG_RETURN_INT32(result);
2710 : }
2711 :
2712 : Datum
2713 0 : bttextnamecmp(PG_FUNCTION_ARGS)
2714 : {
2715 0 : text *arg1 = PG_GETARG_TEXT_PP(0);
2716 0 : Name arg2 = PG_GETARG_NAME(1);
2717 : int32 result;
2718 :
2719 0 : result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2720 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2721 : PG_GET_COLLATION());
2722 :
2723 0 : PG_FREE_IF_COPY(arg1, 0);
2724 :
2725 0 : PG_RETURN_INT32(result);
2726 : }
2727 :
/*
 * CmpCall(cmpfunc) calls the comparison function "cmpfunc" through
 * DirectFunctionCall2Coll(), forwarding the current call's collation and its
 * first two argument Datums, and converts the result with DatumGetInt32().
 * It expands to PG_GET_COLLATION()/PG_GETARG_DATUM(), so it is only
 * meaningful inside a function declared with PG_FUNCTION_ARGS.
 */
#define CmpCall(cmpfunc) \
	DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
										  PG_GET_COLLATION(), \
										  PG_GETARG_DATUM(0), \
										  PG_GETARG_DATUM(1)))
2733 :
2734 : Datum
2735 47414 : namelttext(PG_FUNCTION_ARGS)
2736 : {
2737 47414 : PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2738 : }
2739 :
2740 : Datum
2741 0 : nameletext(PG_FUNCTION_ARGS)
2742 : {
2743 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
2744 : }
2745 :
2746 : Datum
2747 0 : namegttext(PG_FUNCTION_ARGS)
2748 : {
2749 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
2750 : }
2751 :
2752 : Datum
2753 45790 : namegetext(PG_FUNCTION_ARGS)
2754 : {
2755 45790 : PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
2756 : }
2757 :
2758 : Datum
2759 0 : textltname(PG_FUNCTION_ARGS)
2760 : {
2761 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
2762 : }
2763 :
2764 : Datum
2765 0 : textlename(PG_FUNCTION_ARGS)
2766 : {
2767 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
2768 : }
2769 :
2770 : Datum
2771 0 : textgtname(PG_FUNCTION_ARGS)
2772 : {
2773 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
2774 : }
2775 :
2776 : Datum
2777 0 : textgename(PG_FUNCTION_ARGS)
2778 : {
2779 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
2780 : }
2781 :
2782 : #undef CmpCall
2783 :
2784 :
2785 : /*
2786 : * The following operators support character-by-character comparison
2787 : * of text datums, to allow building indexes suitable for LIKE clauses.
2788 : * Note that the regular texteq/textne comparison operators, and regular
2789 : * support functions 1 and 2 with "C" collation are assumed to be
2790 : * compatible with these!
2791 : */
2792 :
2793 : static int
2794 152080 : internal_text_pattern_compare(text *arg1, text *arg2)
2795 : {
2796 : int result;
2797 : int len1,
2798 : len2;
2799 :
2800 152080 : len1 = VARSIZE_ANY_EXHDR(arg1);
2801 152080 : len2 = VARSIZE_ANY_EXHDR(arg2);
2802 :
2803 152080 : result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2804 152080 : if (result != 0)
2805 152026 : return result;
2806 54 : else if (len1 < len2)
2807 0 : return -1;
2808 54 : else if (len1 > len2)
2809 18 : return 1;
2810 : else
2811 36 : return 0;
2812 : }
2813 :
2814 :
2815 : Datum
2816 39538 : text_pattern_lt(PG_FUNCTION_ARGS)
2817 : {
2818 39538 : text *arg1 = PG_GETARG_TEXT_PP(0);
2819 39538 : text *arg2 = PG_GETARG_TEXT_PP(1);
2820 : int result;
2821 :
2822 39538 : result = internal_text_pattern_compare(arg1, arg2);
2823 :
2824 39538 : PG_FREE_IF_COPY(arg1, 0);
2825 39538 : PG_FREE_IF_COPY(arg2, 1);
2826 :
2827 39538 : PG_RETURN_BOOL(result < 0);
2828 : }
2829 :
2830 :
2831 : Datum
2832 37510 : text_pattern_le(PG_FUNCTION_ARGS)
2833 : {
2834 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2835 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2836 : int result;
2837 :
2838 37510 : result = internal_text_pattern_compare(arg1, arg2);
2839 :
2840 37510 : PG_FREE_IF_COPY(arg1, 0);
2841 37510 : PG_FREE_IF_COPY(arg2, 1);
2842 :
2843 37510 : PG_RETURN_BOOL(result <= 0);
2844 : }
2845 :
2846 :
2847 : Datum
2848 37510 : text_pattern_ge(PG_FUNCTION_ARGS)
2849 : {
2850 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2851 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2852 : int result;
2853 :
2854 37510 : result = internal_text_pattern_compare(arg1, arg2);
2855 :
2856 37510 : PG_FREE_IF_COPY(arg1, 0);
2857 37510 : PG_FREE_IF_COPY(arg2, 1);
2858 :
2859 37510 : PG_RETURN_BOOL(result >= 0);
2860 : }
2861 :
2862 :
2863 : Datum
2864 37510 : text_pattern_gt(PG_FUNCTION_ARGS)
2865 : {
2866 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
2867 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
2868 : int result;
2869 :
2870 37510 : result = internal_text_pattern_compare(arg1, arg2);
2871 :
2872 37510 : PG_FREE_IF_COPY(arg1, 0);
2873 37510 : PG_FREE_IF_COPY(arg2, 1);
2874 :
2875 37510 : PG_RETURN_BOOL(result > 0);
2876 : }
2877 :
2878 :
2879 : Datum
2880 12 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
2881 : {
2882 12 : text *arg1 = PG_GETARG_TEXT_PP(0);
2883 12 : text *arg2 = PG_GETARG_TEXT_PP(1);
2884 : int result;
2885 :
2886 12 : result = internal_text_pattern_compare(arg1, arg2);
2887 :
2888 12 : PG_FREE_IF_COPY(arg1, 0);
2889 12 : PG_FREE_IF_COPY(arg2, 1);
2890 :
2891 12 : PG_RETURN_INT32(result);
2892 : }
2893 :
2894 :
2895 : Datum
2896 116 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2897 : {
2898 116 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2899 : MemoryContext oldcontext;
2900 :
2901 116 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2902 :
2903 : /* Use generic string SortSupport, forcing "C" collation */
2904 116 : varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
2905 :
2906 116 : MemoryContextSwitchTo(oldcontext);
2907 :
2908 116 : PG_RETURN_VOID();
2909 : }
2910 :
2911 :
2912 : /*-------------------------------------------------------------
2913 : * byteaoctetlen
2914 : *
2915 : * get the number of bytes contained in an instance of type 'bytea'
2916 : *-------------------------------------------------------------
2917 : */
2918 : Datum
2919 314 : byteaoctetlen(PG_FUNCTION_ARGS)
2920 : {
2921 314 : Datum str = PG_GETARG_DATUM(0);
2922 :
2923 : /* We need not detoast the input at all */
2924 314 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2925 : }
2926 :
2927 : /*
2928 : * byteacat -
2929 : * takes two bytea* and returns a bytea* that is the concatenation of
2930 : * the two.
2931 : *
2932 : * Cloned from textcat and modified as required.
2933 : */
2934 : Datum
2935 1520 : byteacat(PG_FUNCTION_ARGS)
2936 : {
2937 1520 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
2938 1520 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
2939 :
2940 1520 : PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2941 : }
2942 :
2943 : /*
2944 : * bytea_catenate
2945 : * Guts of byteacat(), broken out so it can be used by other functions
2946 : *
2947 : * Arguments can be in short-header form, but not compressed or out-of-line
2948 : */
2949 : static bytea *
2950 1556 : bytea_catenate(bytea *t1, bytea *t2)
2951 : {
2952 : bytea *result;
2953 : int len1,
2954 : len2,
2955 : len;
2956 : char *ptr;
2957 :
2958 1556 : len1 = VARSIZE_ANY_EXHDR(t1);
2959 1556 : len2 = VARSIZE_ANY_EXHDR(t2);
2960 :
2961 : /* paranoia ... probably should throw error instead? */
2962 1556 : if (len1 < 0)
2963 0 : len1 = 0;
2964 1556 : if (len2 < 0)
2965 0 : len2 = 0;
2966 :
2967 1556 : len = len1 + len2 + VARHDRSZ;
2968 1556 : result = (bytea *) palloc(len);
2969 :
2970 : /* Set size of result string... */
2971 1556 : SET_VARSIZE(result, len);
2972 :
2973 : /* Fill data field of result string... */
2974 1556 : ptr = VARDATA(result);
2975 1556 : if (len1 > 0)
2976 1556 : memcpy(ptr, VARDATA_ANY(t1), len1);
2977 1556 : if (len2 > 0)
2978 1538 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2979 :
2980 1556 : return result;
2981 : }
2982 :
/*
 * PG_STR_GET_BYTEA
 *		Turn a C string into a bytea pointer by calling byteain() on it.
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, \
										CStringGetDatum(str_)))
2985 :
2986 : /*
2987 : * bytea_substr()
2988 : * Return a substring starting at the specified position.
2989 : * Cloned from text_substr and modified as required.
2990 : *
2991 : * Input:
2992 : * - string
2993 : * - starting position (is one-based)
2994 : * - string length (optional)
2995 : *
2996 : * If the starting position is zero or less, then return from the start of the string
2997 : * adjusting the length to be consistent with the "negative start" per SQL.
2998 : * If the length is less than zero, an ERROR is thrown. If no third argument
2999 : * (length) is provided, the length to the end of the string is assumed.
3000 : */
3001 : Datum
3002 86 : bytea_substr(PG_FUNCTION_ARGS)
3003 : {
3004 86 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3005 : PG_GETARG_INT32(1),
3006 : PG_GETARG_INT32(2),
3007 : false));
3008 : }
3009 :
3010 : /*
3011 : * bytea_substr_no_len -
3012 : * Wrapper to avoid opr_sanity failure due to
3013 : * one function accepting a different number of args.
3014 : */
3015 : Datum
3016 3900 : bytea_substr_no_len(PG_FUNCTION_ARGS)
3017 : {
3018 3900 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3019 : PG_GETARG_INT32(1),
3020 : -1,
3021 : true));
3022 : }
3023 :
3024 : static bytea *
3025 4022 : bytea_substring(Datum str,
3026 : int S,
3027 : int L,
3028 : bool length_not_specified)
3029 : {
3030 : int32 S1; /* adjusted start position */
3031 : int32 L1; /* adjusted substring length */
3032 : int32 E; /* end position */
3033 :
3034 : /*
3035 : * The logic here should generally match text_substring().
3036 : */
3037 4022 : S1 = Max(S, 1);
3038 :
3039 4022 : if (length_not_specified)
3040 : {
3041 : /*
3042 : * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3043 : * end of the string if we pass it a negative value for length.
3044 : */
3045 3918 : L1 = -1;
3046 : }
3047 104 : else if (L < 0)
3048 : {
3049 : /* SQL99 says to throw an error for E < S, i.e., negative length */
3050 12 : ereport(ERROR,
3051 : (errcode(ERRCODE_SUBSTRING_ERROR),
3052 : errmsg("negative substring length not allowed")));
3053 : L1 = -1; /* silence stupider compilers */
3054 : }
3055 92 : else if (pg_add_s32_overflow(S, L, &E))
3056 : {
3057 : /*
3058 : * L could be large enough for S + L to overflow, in which case the
3059 : * substring must run to end of string.
3060 : */
3061 6 : L1 = -1;
3062 : }
3063 : else
3064 : {
3065 : /*
3066 : * A zero or negative value for the end position can happen if the
3067 : * start was negative or one. SQL99 says to return a zero-length
3068 : * string.
3069 : */
3070 86 : if (E < 1)
3071 0 : return PG_STR_GET_BYTEA("");
3072 :
3073 86 : L1 = E - S1;
3074 : }
3075 :
3076 : /*
3077 : * If the start position is past the end of the string, SQL99 says to
3078 : * return a zero-length string -- DatumGetByteaPSlice() will do that for
3079 : * us. We need only convert S1 to zero-based starting position.
3080 : */
3081 4010 : return DatumGetByteaPSlice(str, S1 - 1, L1);
3082 : }
3083 :
3084 : /*
3085 : * byteaoverlay
3086 : * Replace specified substring of first string with second
3087 : *
3088 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3089 : * This code is a direct implementation of what the standard says.
3090 : */
3091 : Datum
3092 6 : byteaoverlay(PG_FUNCTION_ARGS)
3093 : {
3094 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3095 6 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3096 6 : int sp = PG_GETARG_INT32(2); /* substring start position */
3097 6 : int sl = PG_GETARG_INT32(3); /* substring length */
3098 :
3099 6 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3100 : }
3101 :
3102 : Datum
3103 12 : byteaoverlay_no_len(PG_FUNCTION_ARGS)
3104 : {
3105 12 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3106 12 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3107 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
3108 : int sl;
3109 :
3110 12 : sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3111 12 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3112 : }
3113 :
3114 : static bytea *
3115 18 : bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3116 : {
3117 : bytea *result;
3118 : bytea *s1;
3119 : bytea *s2;
3120 : int sp_pl_sl;
3121 :
3122 : /*
3123 : * Check for possible integer-overflow cases. For negative sp, throw a
3124 : * "substring length" error because that's what should be expected
3125 : * according to the spec's definition of OVERLAY().
3126 : */
3127 18 : if (sp <= 0)
3128 0 : ereport(ERROR,
3129 : (errcode(ERRCODE_SUBSTRING_ERROR),
3130 : errmsg("negative substring length not allowed")));
3131 18 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3132 0 : ereport(ERROR,
3133 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3134 : errmsg("integer out of range")));
3135 :
3136 18 : s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3137 18 : s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3138 18 : result = bytea_catenate(s1, t2);
3139 18 : result = bytea_catenate(result, s2);
3140 :
3141 18 : return result;
3142 : }
3143 :
3144 : /*
3145 : * bit_count
3146 : */
3147 : Datum
3148 6 : bytea_bit_count(PG_FUNCTION_ARGS)
3149 : {
3150 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3151 :
3152 6 : PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3153 : }
3154 :
3155 : /*
3156 : * byteapos -
3157 : * Return the position of the specified substring.
3158 : * Implements the SQL POSITION() function.
3159 : * Cloned from textpos and modified as required.
3160 : */
3161 : Datum
3162 0 : byteapos(PG_FUNCTION_ARGS)
3163 : {
3164 0 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3165 0 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3166 : int pos;
3167 : int px,
3168 : p;
3169 : int len1,
3170 : len2;
3171 : char *p1,
3172 : *p2;
3173 :
3174 0 : len1 = VARSIZE_ANY_EXHDR(t1);
3175 0 : len2 = VARSIZE_ANY_EXHDR(t2);
3176 :
3177 0 : if (len2 <= 0)
3178 0 : PG_RETURN_INT32(1); /* result for empty pattern */
3179 :
3180 0 : p1 = VARDATA_ANY(t1);
3181 0 : p2 = VARDATA_ANY(t2);
3182 :
3183 0 : pos = 0;
3184 0 : px = (len1 - len2);
3185 0 : for (p = 0; p <= px; p++)
3186 : {
3187 0 : if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3188 : {
3189 0 : pos = p + 1;
3190 0 : break;
3191 : };
3192 0 : p1++;
3193 : };
3194 :
3195 0 : PG_RETURN_INT32(pos);
3196 : }
3197 :
3198 : /*-------------------------------------------------------------
3199 : * byteaGetByte
3200 : *
3201 : * this routine treats "bytea" as an array of bytes.
3202 : * It returns the Nth byte (a number between 0 and 255).
3203 : *-------------------------------------------------------------
3204 : */
3205 : Datum
3206 60 : byteaGetByte(PG_FUNCTION_ARGS)
3207 : {
3208 60 : bytea *v = PG_GETARG_BYTEA_PP(0);
3209 60 : int32 n = PG_GETARG_INT32(1);
3210 : int len;
3211 : int byte;
3212 :
3213 60 : len = VARSIZE_ANY_EXHDR(v);
3214 :
3215 60 : if (n < 0 || n >= len)
3216 6 : ereport(ERROR,
3217 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3218 : errmsg("index %d out of valid range, 0..%d",
3219 : n, len - 1)));
3220 :
3221 54 : byte = ((unsigned char *) VARDATA_ANY(v))[n];
3222 :
3223 54 : PG_RETURN_INT32(byte);
3224 : }
3225 :
3226 : /*-------------------------------------------------------------
3227 : * byteaGetBit
3228 : *
3229 : * This routine treats a "bytea" type like an array of bits.
3230 : * It returns the value of the Nth bit (0 or 1).
3231 : *
3232 : *-------------------------------------------------------------
3233 : */
3234 : Datum
3235 12 : byteaGetBit(PG_FUNCTION_ARGS)
3236 : {
3237 12 : bytea *v = PG_GETARG_BYTEA_PP(0);
3238 12 : int64 n = PG_GETARG_INT64(1);
3239 : int byteNo,
3240 : bitNo;
3241 : int len;
3242 : int byte;
3243 :
3244 12 : len = VARSIZE_ANY_EXHDR(v);
3245 :
3246 12 : if (n < 0 || n >= (int64) len * 8)
3247 6 : ereport(ERROR,
3248 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3249 : errmsg("index %lld out of valid range, 0..%lld",
3250 : (long long) n, (long long) len * 8 - 1)));
3251 :
3252 : /* n/8 is now known < len, so safe to cast to int */
3253 6 : byteNo = (int) (n / 8);
3254 6 : bitNo = (int) (n % 8);
3255 :
3256 6 : byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3257 :
3258 6 : if (byte & (1 << bitNo))
3259 6 : PG_RETURN_INT32(1);
3260 : else
3261 0 : PG_RETURN_INT32(0);
3262 : }
3263 :
3264 : /*-------------------------------------------------------------
3265 : * byteaSetByte
3266 : *
3267 : * Given an instance of type 'bytea' creates a new one with
3268 : * the Nth byte set to the given value.
3269 : *
3270 : *-------------------------------------------------------------
3271 : */
3272 : Datum
3273 12 : byteaSetByte(PG_FUNCTION_ARGS)
3274 : {
3275 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3276 12 : int32 n = PG_GETARG_INT32(1);
3277 12 : int32 newByte = PG_GETARG_INT32(2);
3278 : int len;
3279 :
3280 12 : len = VARSIZE(res) - VARHDRSZ;
3281 :
3282 12 : if (n < 0 || n >= len)
3283 6 : ereport(ERROR,
3284 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3285 : errmsg("index %d out of valid range, 0..%d",
3286 : n, len - 1)));
3287 :
3288 : /*
3289 : * Now set the byte.
3290 : */
3291 6 : ((unsigned char *) VARDATA(res))[n] = newByte;
3292 :
3293 6 : PG_RETURN_BYTEA_P(res);
3294 : }
3295 :
3296 : /*-------------------------------------------------------------
3297 : * byteaSetBit
3298 : *
3299 : * Given an instance of type 'bytea' creates a new one with
3300 : * the Nth bit set to the given value.
3301 : *
3302 : *-------------------------------------------------------------
3303 : */
3304 : Datum
3305 12 : byteaSetBit(PG_FUNCTION_ARGS)
3306 : {
3307 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3308 12 : int64 n = PG_GETARG_INT64(1);
3309 12 : int32 newBit = PG_GETARG_INT32(2);
3310 : int len;
3311 : int oldByte,
3312 : newByte;
3313 : int byteNo,
3314 : bitNo;
3315 :
3316 12 : len = VARSIZE(res) - VARHDRSZ;
3317 :
3318 12 : if (n < 0 || n >= (int64) len * 8)
3319 6 : ereport(ERROR,
3320 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3321 : errmsg("index %lld out of valid range, 0..%lld",
3322 : (long long) n, (long long) len * 8 - 1)));
3323 :
3324 : /* n/8 is now known < len, so safe to cast to int */
3325 6 : byteNo = (int) (n / 8);
3326 6 : bitNo = (int) (n % 8);
3327 :
3328 : /*
3329 : * sanity check!
3330 : */
3331 6 : if (newBit != 0 && newBit != 1)
3332 0 : ereport(ERROR,
3333 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3334 : errmsg("new bit must be 0 or 1")));
3335 :
3336 : /*
3337 : * Update the byte.
3338 : */
3339 6 : oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3340 :
3341 6 : if (newBit == 0)
3342 6 : newByte = oldByte & (~(1 << bitNo));
3343 : else
3344 0 : newByte = oldByte | (1 << bitNo);
3345 :
3346 6 : ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3347 :
3348 6 : PG_RETURN_BYTEA_P(res);
3349 : }
3350 :
3351 :
3352 : /* text_name()
3353 : * Converts a text type to a Name type.
3354 : */
3355 : Datum
3356 30522 : text_name(PG_FUNCTION_ARGS)
3357 : {
3358 30522 : text *s = PG_GETARG_TEXT_PP(0);
3359 : Name result;
3360 : int len;
3361 :
3362 30522 : len = VARSIZE_ANY_EXHDR(s);
3363 :
3364 : /* Truncate oversize input */
3365 30522 : if (len >= NAMEDATALEN)
3366 6 : len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3367 :
3368 : /* We use palloc0 here to ensure result is zero-padded */
3369 30522 : result = (Name) palloc0(NAMEDATALEN);
3370 30522 : memcpy(NameStr(*result), VARDATA_ANY(s), len);
3371 :
3372 30522 : PG_RETURN_NAME(result);
3373 : }
3374 :
3375 : /* name_text()
3376 : * Converts a Name type to a text type.
3377 : */
3378 : Datum
3379 1042242 : name_text(PG_FUNCTION_ARGS)
3380 : {
3381 1042242 : Name s = PG_GETARG_NAME(0);
3382 :
3383 1042242 : PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3384 : }
3385 :
3386 :
3387 : /*
3388 : * textToQualifiedNameList - convert a text object to list of names
3389 : *
3390 : * This implements the input parsing needed by nextval() and other
3391 : * functions that take a text parameter representing a qualified name.
3392 : * We split the name at dots, downcase if not double-quoted, and
3393 : * truncate names if they're too long.
3394 : */
3395 : List *
3396 1374 : textToQualifiedNameList(text *textval)
3397 : {
3398 : char *rawname;
3399 1374 : List *result = NIL;
3400 : List *namelist;
3401 : ListCell *l;
3402 :
3403 : /* Convert to C string (handles possible detoasting). */
3404 : /* Note we rely on being able to modify rawname below. */
3405 1374 : rawname = text_to_cstring(textval);
3406 :
3407 1374 : if (!SplitIdentifierString(rawname, '.', &namelist))
3408 0 : ereport(ERROR,
3409 : (errcode(ERRCODE_INVALID_NAME),
3410 : errmsg("invalid name syntax")));
3411 :
3412 1374 : if (namelist == NIL)
3413 0 : ereport(ERROR,
3414 : (errcode(ERRCODE_INVALID_NAME),
3415 : errmsg("invalid name syntax")));
3416 :
3417 2858 : foreach(l, namelist)
3418 : {
3419 1484 : char *curname = (char *) lfirst(l);
3420 :
3421 1484 : result = lappend(result, makeString(pstrdup(curname)));
3422 : }
3423 :
3424 1374 : pfree(rawname);
3425 1374 : list_free(namelist);
3426 :
3427 1374 : return result;
3428 : }
3429 :
3430 : /*
3431 : * SplitIdentifierString --- parse a string containing identifiers
3432 : *
3433 : * This is the guts of textToQualifiedNameList, and is exported for use in
3434 : * other situations such as parsing GUC variables. In the GUC case, it's
3435 : * important to avoid memory leaks, so the API is designed to minimize the
3436 : * amount of stuff that needs to be allocated and freed.
3437 : *
3438 : * Inputs:
3439 : * rawstring: the input string; must be overwritable! On return, it's
3440 : * been modified to contain the separated identifiers.
3441 : * separator: the separator punctuation expected between identifiers
3442 : * (typically '.' or ','). Whitespace may also appear around
3443 : * identifiers.
3444 : * Outputs:
3445 : * namelist: filled with a palloc'd list of pointers to identifiers within
3446 : * rawstring. Caller should list_free() this even on error return.
3447 : *
3448 : * Returns true if okay, false if there is a syntax error in the string.
3449 : *
3450 : * Note that an empty string is considered okay here, though not in
3451 : * textToQualifiedNameList.
3452 : */
3453 : bool
3454 144856 : SplitIdentifierString(char *rawstring, char separator,
3455 : List **namelist)
3456 : {
3457 144856 : char *nextp = rawstring;
3458 144856 : bool done = false;
3459 :
3460 144856 : *namelist = NIL;
3461 :
3462 144862 : while (scanner_isspace(*nextp))
3463 6 : nextp++; /* skip leading whitespace */
3464 :
3465 144856 : if (*nextp == '\0')
3466 19692 : return true; /* allow empty string */
3467 :
3468 : /* At the top of the loop, we are at start of a new identifier. */
3469 : do
3470 : {
3471 : char *curname;
3472 : char *endp;
3473 :
3474 200614 : if (*nextp == '"')
3475 : {
3476 : /* Quoted name --- collapse quote-quote pairs, no downcasing */
3477 30764 : curname = nextp + 1;
3478 : for (;;)
3479 : {
3480 30768 : endp = strchr(nextp + 1, '"');
3481 30766 : if (endp == NULL)
3482 0 : return false; /* mismatched quotes */
3483 30766 : if (endp[1] != '"')
3484 30764 : break; /* found end of quoted name */
3485 : /* Collapse adjacent quotes into one quote, and look again */
3486 2 : memmove(endp, endp + 1, strlen(endp));
3487 2 : nextp = endp;
3488 : }
3489 : /* endp now points at the terminating quote */
3490 30764 : nextp = endp + 1;
3491 : }
3492 : else
3493 : {
3494 : /* Unquoted name --- extends to separator or whitespace */
3495 : char *downname;
3496 : int len;
3497 :
3498 169850 : curname = nextp;
3499 1474152 : while (*nextp && *nextp != separator &&
3500 1304304 : !scanner_isspace(*nextp))
3501 1304302 : nextp++;
3502 169850 : endp = nextp;
3503 169850 : if (curname == nextp)
3504 0 : return false; /* empty unquoted name not allowed */
3505 :
3506 : /*
3507 : * Downcase the identifier, using same code as main lexer does.
3508 : *
3509 : * XXX because we want to overwrite the input in-place, we cannot
3510 : * support a downcasing transformation that increases the string
3511 : * length. This is not a problem given the current implementation
3512 : * of downcase_truncate_identifier, but we'll probably have to do
3513 : * something about this someday.
3514 : */
3515 169850 : len = endp - curname;
3516 169850 : downname = downcase_truncate_identifier(curname, len, false);
3517 : Assert(strlen(downname) <= len);
3518 169850 : strncpy(curname, downname, len); /* strncpy is required here */
3519 169850 : pfree(downname);
3520 : }
3521 :
3522 200616 : while (scanner_isspace(*nextp))
3523 2 : nextp++; /* skip trailing whitespace */
3524 :
3525 200614 : if (*nextp == separator)
3526 : {
3527 75450 : nextp++;
3528 121802 : while (scanner_isspace(*nextp))
3529 46352 : nextp++; /* skip leading whitespace for next */
3530 : /* we expect another name, so done remains false */
3531 : }
3532 125164 : else if (*nextp == '\0')
3533 125162 : done = true;
3534 : else
3535 2 : return false; /* invalid syntax */
3536 :
3537 : /* Now safe to overwrite separator with a null */
3538 200612 : *endp = '\0';
3539 :
3540 : /* Truncate name if it's overlength */
3541 200612 : truncate_identifier(curname, strlen(curname), false);
3542 :
3543 : /*
3544 : * Finished isolating current name --- add it to list
3545 : */
3546 200612 : *namelist = lappend(*namelist, curname);
3547 :
3548 : /* Loop back if we didn't reach end of string */
3549 200612 : } while (!done);
3550 :
3551 125162 : return true;
3552 : }
3553 :
3554 :
3555 : /*
3556 : * SplitDirectoriesString --- parse a string containing file/directory names
3557 : *
3558 : * This works fine on file names too; the function name is historical.
3559 : *
3560 : * This is similar to SplitIdentifierString, except that the parsing
3561 : * rules are meant to handle pathnames instead of identifiers: there is
3562 : * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3563 : * and we apply canonicalize_path() to each extracted string. Because of the
3564 : * last, the returned strings are separately palloc'd rather than being
3565 : * pointers into rawstring --- but we still scribble on rawstring.
3566 : *
3567 : * Inputs:
3568 : * rawstring: the input string; must be modifiable!
3569 : * separator: the separator punctuation expected between directories
3570 : * (typically ',' or ';'). Whitespace may also appear around
3571 : * directories.
3572 : * Outputs:
3573 : * namelist: filled with a palloc'd list of directory names.
3574 : * Caller should list_free_deep() this even on error return.
3575 : *
3576 : * Returns true if okay, false if there is a syntax error in the string.
3577 : *
3578 : * Note that an empty string is considered okay here.
3579 : */
3580 : bool
3581 1242 : SplitDirectoriesString(char *rawstring, char separator,
3582 : List **namelist)
3583 : {
3584 1242 : char *nextp = rawstring;
3585 1242 : bool done = false;
3586 :
3587 1242 : *namelist = NIL;
3588 :
3589 1242 : while (scanner_isspace(*nextp))
3590 0 : nextp++; /* skip leading whitespace */
3591 :
3592 1242 : if (*nextp == '\0')
3593 2 : return true; /* allow empty string */
3594 :
3595 : /* At the top of the loop, we are at start of a new directory. */
3596 : do
3597 : {
3598 : char *curname;
3599 : char *endp;
3600 :
3601 1240 : if (*nextp == '"')
3602 : {
3603 : /* Quoted name --- collapse quote-quote pairs */
3604 0 : curname = nextp + 1;
3605 : for (;;)
3606 : {
3607 0 : endp = strchr(nextp + 1, '"');
3608 0 : if (endp == NULL)
3609 0 : return false; /* mismatched quotes */
3610 0 : if (endp[1] != '"')
3611 0 : break; /* found end of quoted name */
3612 : /* Collapse adjacent quotes into one quote, and look again */
3613 0 : memmove(endp, endp + 1, strlen(endp));
3614 0 : nextp = endp;
3615 : }
3616 : /* endp now points at the terminating quote */
3617 0 : nextp = endp + 1;
3618 : }
3619 : else
3620 : {
3621 : /* Unquoted name --- extends to separator or end of string */
3622 1240 : curname = endp = nextp;
3623 20948 : while (*nextp && *nextp != separator)
3624 : {
3625 : /* trailing whitespace should not be included in name */
3626 19708 : if (!scanner_isspace(*nextp))
3627 19708 : endp = nextp + 1;
3628 19708 : nextp++;
3629 : }
3630 1240 : if (curname == endp)
3631 0 : return false; /* empty unquoted name not allowed */
3632 : }
3633 :
3634 1240 : while (scanner_isspace(*nextp))
3635 0 : nextp++; /* skip trailing whitespace */
3636 :
3637 1240 : if (*nextp == separator)
3638 : {
3639 0 : nextp++;
3640 0 : while (scanner_isspace(*nextp))
3641 0 : nextp++; /* skip leading whitespace for next */
3642 : /* we expect another name, so done remains false */
3643 : }
3644 1240 : else if (*nextp == '\0')
3645 1240 : done = true;
3646 : else
3647 0 : return false; /* invalid syntax */
3648 :
3649 : /* Now safe to overwrite separator with a null */
3650 1240 : *endp = '\0';
3651 :
3652 : /* Truncate path if it's overlength */
3653 1240 : if (strlen(curname) >= MAXPGPATH)
3654 0 : curname[MAXPGPATH - 1] = '\0';
3655 :
3656 : /*
3657 : * Finished isolating current name --- add it to list
3658 : */
3659 1240 : curname = pstrdup(curname);
3660 1240 : canonicalize_path(curname);
3661 1240 : *namelist = lappend(*namelist, curname);
3662 :
3663 : /* Loop back if we didn't reach end of string */
3664 1240 : } while (!done);
3665 :
3666 1240 : return true;
3667 : }
3668 :
3669 :
3670 : /*
3671 : * SplitGUCList --- parse a string containing identifiers or file names
3672 : *
3673 : * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3674 : * presuming whether the elements will be taken as identifiers or file names.
3675 : * We assume the input has already been through flatten_set_variable_args(),
3676 : * so that we need never downcase (if appropriate, that was done already).
3677 : * Nor do we ever truncate, since we don't know the correct max length.
3678 : * We disallow embedded whitespace for simplicity (it shouldn't matter,
3679 : * because any embedded whitespace should have led to double-quoting).
3680 : * Otherwise the API is identical to SplitIdentifierString.
3681 : *
3682 : * XXX it's annoying to have so many copies of this string-splitting logic.
3683 : * However, it's not clear that having one function with a bunch of option
3684 : * flags would be much better.
3685 : *
3686 : * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3687 : * Be sure to update that if you have to change this.
3688 : *
3689 : * Inputs:
3690 : * rawstring: the input string; must be overwritable! On return, it's
3691 : * been modified to contain the separated identifiers.
3692 : * separator: the separator punctuation expected between identifiers
3693 : * (typically '.' or ','). Whitespace may also appear around
3694 : * identifiers.
3695 : * Outputs:
3696 : * namelist: filled with a palloc'd list of pointers to identifiers within
3697 : * rawstring. Caller should list_free() this even on error return.
3698 : *
3699 : * Returns true if okay, false if there is a syntax error in the string.
3700 : */
3701 : bool
3702 4900 : SplitGUCList(char *rawstring, char separator,
3703 : List **namelist)
3704 : {
3705 4900 : char *nextp = rawstring;
3706 4900 : bool done = false;
3707 :
3708 4900 : *namelist = NIL;
3709 :
3710 4900 : while (scanner_isspace(*nextp))
3711 0 : nextp++; /* skip leading whitespace */
3712 :
3713 4900 : if (*nextp == '\0')
3714 4834 : return true; /* allow empty string */
3715 :
3716 : /* At the top of the loop, we are at start of a new identifier. */
3717 : do
3718 : {
3719 : char *curname;
3720 : char *endp;
3721 :
3722 92 : if (*nextp == '"')
3723 : {
3724 : /* Quoted name --- collapse quote-quote pairs */
3725 24 : curname = nextp + 1;
3726 : for (;;)
3727 : {
3728 36 : endp = strchr(nextp + 1, '"');
3729 30 : if (endp == NULL)
3730 0 : return false; /* mismatched quotes */
3731 30 : if (endp[1] != '"')
3732 24 : break; /* found end of quoted name */
3733 : /* Collapse adjacent quotes into one quote, and look again */
3734 6 : memmove(endp, endp + 1, strlen(endp));
3735 6 : nextp = endp;
3736 : }
3737 : /* endp now points at the terminating quote */
3738 24 : nextp = endp + 1;
3739 : }
3740 : else
3741 : {
3742 : /* Unquoted name --- extends to separator or whitespace */
3743 68 : curname = nextp;
3744 638 : while (*nextp && *nextp != separator &&
3745 570 : !scanner_isspace(*nextp))
3746 570 : nextp++;
3747 68 : endp = nextp;
3748 68 : if (curname == nextp)
3749 0 : return false; /* empty unquoted name not allowed */
3750 : }
3751 :
3752 92 : while (scanner_isspace(*nextp))
3753 0 : nextp++; /* skip trailing whitespace */
3754 :
3755 92 : if (*nextp == separator)
3756 : {
3757 26 : nextp++;
3758 44 : while (scanner_isspace(*nextp))
3759 18 : nextp++; /* skip leading whitespace for next */
3760 : /* we expect another name, so done remains false */
3761 : }
3762 66 : else if (*nextp == '\0')
3763 66 : done = true;
3764 : else
3765 0 : return false; /* invalid syntax */
3766 :
3767 : /* Now safe to overwrite separator with a null */
3768 92 : *endp = '\0';
3769 :
3770 : /*
3771 : * Finished isolating current name --- add it to list
3772 : */
3773 92 : *namelist = lappend(*namelist, curname);
3774 :
3775 : /* Loop back if we didn't reach end of string */
3776 92 : } while (!done);
3777 :
3778 66 : return true;
3779 : }
3780 :
3781 :
3782 : /*****************************************************************************
3783 : * Comparison Functions used for bytea
3784 : *
3785 : * Note: btree indexes need these routines not to leak memory; therefore,
3786 : * be careful to free working copies of toasted datums. Most places don't
3787 : * need to be so careful.
3788 : *****************************************************************************/
3789 :
3790 : Datum
3791 10378 : byteaeq(PG_FUNCTION_ARGS)
3792 : {
3793 10378 : Datum arg1 = PG_GETARG_DATUM(0);
3794 10378 : Datum arg2 = PG_GETARG_DATUM(1);
3795 : bool result;
3796 : Size len1,
3797 : len2;
3798 :
3799 : /*
3800 : * We can use a fast path for unequal lengths, which might save us from
3801 : * having to detoast one or both values.
3802 : */
3803 10378 : len1 = toast_raw_datum_size(arg1);
3804 10378 : len2 = toast_raw_datum_size(arg2);
3805 10378 : if (len1 != len2)
3806 4308 : result = false;
3807 : else
3808 : {
3809 6070 : bytea *barg1 = DatumGetByteaPP(arg1);
3810 6070 : bytea *barg2 = DatumGetByteaPP(arg2);
3811 :
3812 6070 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3813 : len1 - VARHDRSZ) == 0);
3814 :
3815 6070 : PG_FREE_IF_COPY(barg1, 0);
3816 6070 : PG_FREE_IF_COPY(barg2, 1);
3817 : }
3818 :
3819 10378 : PG_RETURN_BOOL(result);
3820 : }
3821 :
3822 : Datum
3823 768 : byteane(PG_FUNCTION_ARGS)
3824 : {
3825 768 : Datum arg1 = PG_GETARG_DATUM(0);
3826 768 : Datum arg2 = PG_GETARG_DATUM(1);
3827 : bool result;
3828 : Size len1,
3829 : len2;
3830 :
3831 : /*
3832 : * We can use a fast path for unequal lengths, which might save us from
3833 : * having to detoast one or both values.
3834 : */
3835 768 : len1 = toast_raw_datum_size(arg1);
3836 768 : len2 = toast_raw_datum_size(arg2);
3837 768 : if (len1 != len2)
3838 0 : result = true;
3839 : else
3840 : {
3841 768 : bytea *barg1 = DatumGetByteaPP(arg1);
3842 768 : bytea *barg2 = DatumGetByteaPP(arg2);
3843 :
3844 768 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3845 : len1 - VARHDRSZ) != 0);
3846 :
3847 768 : PG_FREE_IF_COPY(barg1, 0);
3848 768 : PG_FREE_IF_COPY(barg2, 1);
3849 : }
3850 :
3851 768 : PG_RETURN_BOOL(result);
3852 : }
3853 :
3854 : Datum
3855 8316 : bytealt(PG_FUNCTION_ARGS)
3856 : {
3857 8316 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3858 8316 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3859 : int len1,
3860 : len2;
3861 : int cmp;
3862 :
3863 8316 : len1 = VARSIZE_ANY_EXHDR(arg1);
3864 8316 : len2 = VARSIZE_ANY_EXHDR(arg2);
3865 :
3866 8316 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3867 :
3868 8316 : PG_FREE_IF_COPY(arg1, 0);
3869 8316 : PG_FREE_IF_COPY(arg2, 1);
3870 :
3871 8316 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3872 : }
3873 :
3874 : Datum
3875 6356 : byteale(PG_FUNCTION_ARGS)
3876 : {
3877 6356 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3878 6356 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3879 : int len1,
3880 : len2;
3881 : int cmp;
3882 :
3883 6356 : len1 = VARSIZE_ANY_EXHDR(arg1);
3884 6356 : len2 = VARSIZE_ANY_EXHDR(arg2);
3885 :
3886 6356 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3887 :
3888 6356 : PG_FREE_IF_COPY(arg1, 0);
3889 6356 : PG_FREE_IF_COPY(arg2, 1);
3890 :
3891 6356 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3892 : }
3893 :
3894 : Datum
3895 6228 : byteagt(PG_FUNCTION_ARGS)
3896 : {
3897 6228 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3898 6228 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3899 : int len1,
3900 : len2;
3901 : int cmp;
3902 :
3903 6228 : len1 = VARSIZE_ANY_EXHDR(arg1);
3904 6228 : len2 = VARSIZE_ANY_EXHDR(arg2);
3905 :
3906 6228 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3907 :
3908 6228 : PG_FREE_IF_COPY(arg1, 0);
3909 6228 : PG_FREE_IF_COPY(arg2, 1);
3910 :
3911 6228 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3912 : }
3913 :
3914 : Datum
3915 5010 : byteage(PG_FUNCTION_ARGS)
3916 : {
3917 5010 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3918 5010 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3919 : int len1,
3920 : len2;
3921 : int cmp;
3922 :
3923 5010 : len1 = VARSIZE_ANY_EXHDR(arg1);
3924 5010 : len2 = VARSIZE_ANY_EXHDR(arg2);
3925 :
3926 5010 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3927 :
3928 5010 : PG_FREE_IF_COPY(arg1, 0);
3929 5010 : PG_FREE_IF_COPY(arg2, 1);
3930 :
3931 5010 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3932 : }
3933 :
3934 : Datum
3935 87600 : byteacmp(PG_FUNCTION_ARGS)
3936 : {
3937 87600 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3938 87600 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3939 : int len1,
3940 : len2;
3941 : int cmp;
3942 :
3943 87600 : len1 = VARSIZE_ANY_EXHDR(arg1);
3944 87600 : len2 = VARSIZE_ANY_EXHDR(arg2);
3945 :
3946 87600 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3947 87600 : if ((cmp == 0) && (len1 != len2))
3948 14708 : cmp = (len1 < len2) ? -1 : 1;
3949 :
3950 87600 : PG_FREE_IF_COPY(arg1, 0);
3951 87600 : PG_FREE_IF_COPY(arg2, 1);
3952 :
3953 87600 : PG_RETURN_INT32(cmp);
3954 : }
3955 :
3956 : Datum
3957 40 : bytea_sortsupport(PG_FUNCTION_ARGS)
3958 : {
3959 40 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3960 : MemoryContext oldcontext;
3961 :
3962 40 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3963 :
3964 : /* Use generic string SortSupport, forcing "C" collation */
3965 40 : varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
3966 :
3967 40 : MemoryContextSwitchTo(oldcontext);
3968 :
3969 40 : PG_RETURN_VOID();
3970 : }
3971 :
3972 : /*
3973 : * appendStringInfoText
3974 : *
3975 : * Append a text to str.
3976 : * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3977 : */
3978 : static void
3979 1681666 : appendStringInfoText(StringInfo str, const text *t)
3980 : {
3981 1681666 : appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
3982 1681666 : }
3983 :
3984 : /*
3985 : * replace_text
3986 : * replace all occurrences of 'old_sub_str' in 'orig_str'
3987 : * with 'new_sub_str' to form 'new_str'
3988 : *
3989 : * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3990 : * otherwise returns 'new_str'
3991 : */
3992 : Datum
3993 2540 : replace_text(PG_FUNCTION_ARGS)
3994 : {
3995 2540 : text *src_text = PG_GETARG_TEXT_PP(0);
3996 2540 : text *from_sub_text = PG_GETARG_TEXT_PP(1);
3997 2540 : text *to_sub_text = PG_GETARG_TEXT_PP(2);
3998 : int src_text_len;
3999 : int from_sub_text_len;
4000 : TextPositionState state;
4001 : text *ret_text;
4002 : int chunk_len;
4003 : char *curr_ptr;
4004 : char *start_ptr;
4005 : StringInfoData str;
4006 : bool found;
4007 :
4008 2540 : src_text_len = VARSIZE_ANY_EXHDR(src_text);
4009 2540 : from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4010 :
4011 : /* Return unmodified source string if empty source or pattern */
4012 2540 : if (src_text_len < 1 || from_sub_text_len < 1)
4013 : {
4014 0 : PG_RETURN_TEXT_P(src_text);
4015 : }
4016 :
4017 2540 : text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4018 :
4019 2540 : found = text_position_next(&state);
4020 :
4021 : /* When the from_sub_text is not found, there is nothing to do. */
4022 2540 : if (!found)
4023 : {
4024 780 : text_position_cleanup(&state);
4025 780 : PG_RETURN_TEXT_P(src_text);
4026 : }
4027 1760 : curr_ptr = text_position_get_match_ptr(&state);
4028 1760 : start_ptr = VARDATA_ANY(src_text);
4029 :
4030 1760 : initStringInfo(&str);
4031 :
4032 : do
4033 : {
4034 6268 : CHECK_FOR_INTERRUPTS();
4035 :
4036 : /* copy the data skipped over by last text_position_next() */
4037 6268 : chunk_len = curr_ptr - start_ptr;
4038 6268 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4039 :
4040 6268 : appendStringInfoText(&str, to_sub_text);
4041 :
4042 6268 : start_ptr = curr_ptr + from_sub_text_len;
4043 :
4044 6268 : found = text_position_next(&state);
4045 6268 : if (found)
4046 4508 : curr_ptr = text_position_get_match_ptr(&state);
4047 : }
4048 6268 : while (found);
4049 :
4050 : /* copy trailing data */
4051 1760 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4052 1760 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4053 :
4054 1760 : text_position_cleanup(&state);
4055 :
4056 1760 : ret_text = cstring_to_text_with_len(str.data, str.len);
4057 1760 : pfree(str.data);
4058 :
4059 1760 : PG_RETURN_TEXT_P(ret_text);
4060 : }
4061 :
4062 : /*
4063 : * check_replace_text_has_escape
4064 : *
4065 : * Returns 0 if text contains no backslashes that need processing.
4066 : * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4067 : * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4068 : */
4069 : static int
4070 10526 : check_replace_text_has_escape(const text *replace_text)
4071 : {
4072 10526 : int result = 0;
4073 10526 : const char *p = VARDATA_ANY(replace_text);
4074 10526 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4075 :
4076 10570 : while (p < p_end)
4077 : {
4078 : /* Find next escape char, if any. */
4079 9272 : p = memchr(p, '\\', p_end - p);
4080 9272 : if (p == NULL)
4081 8594 : break;
4082 678 : p++;
4083 : /* Note: a backslash at the end doesn't require extra processing. */
4084 678 : if (p < p_end)
4085 : {
4086 678 : if (*p >= '1' && *p <= '9')
4087 634 : return 2; /* Found a submatch specifier, so done */
4088 44 : result = 1; /* Found some other sequence, keep looking */
4089 44 : p++;
4090 : }
4091 : }
4092 9892 : return result;
4093 : }
4094 :
4095 : /*
4096 : * appendStringInfoRegexpSubstr
4097 : *
4098 : * Append replace_text to str, substituting regexp back references for
4099 : * \n escapes. start_ptr is the start of the match in the source string,
4100 : * at logical character position data_pos.
4101 : */
4102 : static void
4103 212 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4104 : regmatch_t *pmatch,
4105 : char *start_ptr, int data_pos)
4106 : {
4107 212 : const char *p = VARDATA_ANY(replace_text);
4108 212 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4109 :
4110 526 : while (p < p_end)
4111 : {
4112 470 : const char *chunk_start = p;
4113 : int so;
4114 : int eo;
4115 :
4116 : /* Find next escape char, if any. */
4117 470 : p = memchr(p, '\\', p_end - p);
4118 470 : if (p == NULL)
4119 150 : p = p_end;
4120 :
4121 : /* Copy the text we just scanned over, if any. */
4122 470 : if (p > chunk_start)
4123 294 : appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4124 :
4125 : /* Done if at end of string, else advance over escape char. */
4126 470 : if (p >= p_end)
4127 150 : break;
4128 320 : p++;
4129 :
4130 320 : if (p >= p_end)
4131 : {
4132 : /* Escape at very end of input. Treat same as unexpected char */
4133 6 : appendStringInfoChar(str, '\\');
4134 6 : break;
4135 : }
4136 :
4137 314 : if (*p >= '1' && *p <= '9')
4138 254 : {
4139 : /* Use the back reference of regexp. */
4140 254 : int idx = *p - '0';
4141 :
4142 254 : so = pmatch[idx].rm_so;
4143 254 : eo = pmatch[idx].rm_eo;
4144 254 : p++;
4145 : }
4146 60 : else if (*p == '&')
4147 : {
4148 : /* Use the entire matched string. */
4149 18 : so = pmatch[0].rm_so;
4150 18 : eo = pmatch[0].rm_eo;
4151 18 : p++;
4152 : }
4153 42 : else if (*p == '\\')
4154 : {
4155 : /* \\ means transfer one \ to output. */
4156 36 : appendStringInfoChar(str, '\\');
4157 36 : p++;
4158 36 : continue;
4159 : }
4160 : else
4161 : {
4162 : /*
4163 : * If escape char is not followed by any expected char, just treat
4164 : * it as ordinary data to copy. (XXX would it be better to throw
4165 : * an error?)
4166 : */
4167 6 : appendStringInfoChar(str, '\\');
4168 6 : continue;
4169 : }
4170 :
4171 272 : if (so >= 0 && eo >= 0)
4172 : {
4173 : /*
4174 : * Copy the text that is back reference of regexp. Note so and eo
4175 : * are counted in characters not bytes.
4176 : */
4177 : char *chunk_start;
4178 : int chunk_len;
4179 :
4180 : Assert(so >= data_pos);
4181 272 : chunk_start = start_ptr;
4182 272 : chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4183 272 : chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4184 272 : appendBinaryStringInfo(str, chunk_start, chunk_len);
4185 : }
4186 : }
4187 212 : }
4188 :
4189 : /*
4190 : * replace_text_regexp
4191 : *
4192 : * replace substring(s) in src_text that match pattern with replace_text.
4193 : * The replace_text can contain backslash markers to substitute
4194 : * (parts of) the matched text.
4195 : *
4196 : * cflags: regexp compile flags.
4197 : * collation: collation to use.
4198 : * search_start: the character (not byte) offset in src_text at which to
4199 : * begin searching.
4200 : * n: if 0, replace all matches; if > 0, replace only the N'th match.
4201 : */
4202 : text *
4203 10526 : replace_text_regexp(text *src_text, text *pattern_text,
4204 : text *replace_text,
4205 : int cflags, Oid collation,
4206 : int search_start, int n)
4207 : {
4208 : text *ret_text;
4209 : regex_t *re;
4210 10526 : int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4211 10526 : int nmatches = 0;
4212 : StringInfoData buf;
4213 : regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4214 10526 : int nmatch = lengthof(pmatch);
4215 : pg_wchar *data;
4216 : size_t data_len;
4217 : int data_pos;
4218 : char *start_ptr;
4219 : int escape_status;
4220 :
4221 10526 : initStringInfo(&buf);
4222 :
4223 : /* Convert data string to wide characters. */
4224 10526 : data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4225 10526 : data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4226 :
4227 : /* Check whether replace_text has escapes, especially regexp submatches. */
4228 10526 : escape_status = check_replace_text_has_escape(replace_text);
4229 :
4230 : /* If no regexp submatches, we can use REG_NOSUB. */
4231 10526 : if (escape_status < 2)
4232 : {
4233 9892 : cflags |= REG_NOSUB;
4234 : /* Also tell pg_regexec we only want the whole-match location. */
4235 9892 : nmatch = 1;
4236 : }
4237 :
4238 : /* Prepare the regexp. */
4239 10526 : re = RE_compile_and_cache(pattern_text, cflags, collation);
4240 :
4241 : /* start_ptr points to the data_pos'th character of src_text */
4242 10526 : start_ptr = (char *) VARDATA_ANY(src_text);
4243 10526 : data_pos = 0;
4244 :
4245 14504 : while (search_start <= data_len)
4246 : {
4247 : int regexec_result;
4248 :
4249 14498 : CHECK_FOR_INTERRUPTS();
4250 :
4251 14498 : regexec_result = pg_regexec(re,
4252 : data,
4253 : data_len,
4254 : search_start,
4255 : NULL, /* no details */
4256 : nmatch,
4257 : pmatch,
4258 : 0);
4259 :
4260 14498 : if (regexec_result == REG_NOMATCH)
4261 9164 : break;
4262 :
4263 5334 : if (regexec_result != REG_OKAY)
4264 : {
4265 : char errMsg[100];
4266 :
4267 0 : pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4268 0 : ereport(ERROR,
4269 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4270 : errmsg("regular expression failed: %s", errMsg)));
4271 : }
4272 :
4273 : /*
4274 : * Count matches, and decide whether to replace this match.
4275 : */
4276 5334 : nmatches++;
4277 5334 : if (n > 0 && nmatches != n)
4278 : {
4279 : /*
4280 : * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4281 : * we treat the matched text as if it weren't matched, and copy it
4282 : * to the output later.)
4283 : */
4284 60 : search_start = pmatch[0].rm_eo;
4285 60 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4286 0 : search_start++;
4287 60 : continue;
4288 : }
4289 :
4290 : /*
4291 : * Copy the text to the left of the match position. Note we are given
4292 : * character not byte indexes.
4293 : */
4294 5274 : if (pmatch[0].rm_so - data_pos > 0)
4295 : {
4296 : int chunk_len;
4297 :
4298 5102 : chunk_len = charlen_to_bytelen(start_ptr,
4299 5102 : pmatch[0].rm_so - data_pos);
4300 5102 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4301 :
4302 : /*
4303 : * Advance start_ptr over that text, to avoid multiple rescans of
4304 : * it if the replace_text contains multiple back-references.
4305 : */
4306 5102 : start_ptr += chunk_len;
4307 5102 : data_pos = pmatch[0].rm_so;
4308 : }
4309 :
4310 : /*
4311 : * Copy the replace_text, processing escapes if any are present.
4312 : */
4313 5274 : if (escape_status > 0)
4314 212 : appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4315 : start_ptr, data_pos);
4316 : else
4317 5062 : appendStringInfoText(&buf, replace_text);
4318 :
4319 : /* Advance start_ptr and data_pos over the matched text. */
4320 10548 : start_ptr += charlen_to_bytelen(start_ptr,
4321 5274 : pmatch[0].rm_eo - data_pos);
4322 5274 : data_pos = pmatch[0].rm_eo;
4323 :
4324 : /*
4325 : * If we only want to replace one occurrence, we're done.
4326 : */
4327 5274 : if (n > 0)
4328 1356 : break;
4329 :
4330 : /*
4331 : * Advance search position. Normally we start the next search at the
4332 : * end of the previous match; but if the match was of zero length, we
4333 : * have to advance by one character, or we'd just find the same match
4334 : * again.
4335 : */
4336 3918 : search_start = data_pos;
4337 3918 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4338 12 : search_start++;
4339 : }
4340 :
4341 : /*
4342 : * Copy the text to the right of the last match.
4343 : */
4344 10526 : if (data_pos < data_len)
4345 : {
4346 : int chunk_len;
4347 :
4348 10062 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4349 10062 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4350 : }
4351 :
4352 10526 : ret_text = cstring_to_text_with_len(buf.data, buf.len);
4353 10526 : pfree(buf.data);
4354 10526 : pfree(data);
4355 :
4356 10526 : return ret_text;
4357 : }
4358 :
4359 : /*
4360 : * split_part
4361 : * parse input string based on provided field separator
4362 : * return N'th item (1 based, negative counts from end)
4363 : */
4364 : Datum
4365 102 : split_part(PG_FUNCTION_ARGS)
4366 : {
4367 102 : text *inputstring = PG_GETARG_TEXT_PP(0);
4368 102 : text *fldsep = PG_GETARG_TEXT_PP(1);
4369 102 : int fldnum = PG_GETARG_INT32(2);
4370 : int inputstring_len;
4371 : int fldsep_len;
4372 : TextPositionState state;
4373 : char *start_ptr;
4374 : char *end_ptr;
4375 : text *result_text;
4376 : bool found;
4377 :
4378 : /* field number is 1 based */
4379 102 : if (fldnum == 0)
4380 6 : ereport(ERROR,
4381 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4382 : errmsg("field position must not be zero")));
4383 :
4384 96 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4385 96 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4386 :
4387 : /* return empty string for empty input string */
4388 96 : if (inputstring_len < 1)
4389 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4390 :
4391 : /* handle empty field separator */
4392 84 : if (fldsep_len < 1)
4393 : {
4394 : /* if first or last field, return input string, else empty string */
4395 24 : if (fldnum == 1 || fldnum == -1)
4396 12 : PG_RETURN_TEXT_P(inputstring);
4397 : else
4398 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4399 : }
4400 :
4401 : /* find the first field separator */
4402 60 : text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4403 :
4404 60 : found = text_position_next(&state);
4405 :
4406 : /* special case if fldsep not found at all */
4407 60 : if (!found)
4408 : {
4409 12 : text_position_cleanup(&state);
4410 : /* if first or last field, return input string, else empty string */
4411 12 : if (fldnum == 1 || fldnum == -1)
4412 6 : PG_RETURN_TEXT_P(inputstring);
4413 : else
4414 6 : PG_RETURN_TEXT_P(cstring_to_text(""));
4415 : }
4416 :
4417 : /*
4418 : * take care of a negative field number (i.e. count from the right) by
4419 : * converting to a positive field number; we need total number of fields
4420 : */
4421 48 : if (fldnum < 0)
4422 : {
4423 : /* we found a fldsep, so there are at least two fields */
4424 24 : int numfields = 2;
4425 :
4426 36 : while (text_position_next(&state))
4427 12 : numfields++;
4428 :
4429 : /* special case of last field does not require an extra pass */
4430 24 : if (fldnum == -1)
4431 : {
4432 6 : start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4433 6 : end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4434 6 : text_position_cleanup(&state);
4435 6 : PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4436 : end_ptr - start_ptr));
4437 : }
4438 :
4439 : /* else, convert fldnum to positive notation */
4440 18 : fldnum += numfields + 1;
4441 :
4442 : /* if nonexistent field, return empty string */
4443 18 : if (fldnum <= 0)
4444 : {
4445 6 : text_position_cleanup(&state);
4446 6 : PG_RETURN_TEXT_P(cstring_to_text(""));
4447 : }
4448 :
4449 : /* reset to pointing at first match, but now with positive fldnum */
4450 12 : text_position_reset(&state);
4451 12 : found = text_position_next(&state);
4452 : Assert(found);
4453 : }
4454 :
4455 : /* identify bounds of first field */
4456 36 : start_ptr = VARDATA_ANY(inputstring);
4457 36 : end_ptr = text_position_get_match_ptr(&state);
4458 :
4459 66 : while (found && --fldnum > 0)
4460 : {
4461 : /* identify bounds of next field */
4462 30 : start_ptr = end_ptr + fldsep_len;
4463 30 : found = text_position_next(&state);
4464 30 : if (found)
4465 18 : end_ptr = text_position_get_match_ptr(&state);
4466 : }
4467 :
4468 36 : text_position_cleanup(&state);
4469 :
4470 36 : if (fldnum > 0)
4471 : {
4472 : /* N'th field separator not found */
4473 : /* if last field requested, return it, else empty string */
4474 12 : if (fldnum == 1)
4475 : {
4476 6 : int last_len = start_ptr - VARDATA_ANY(inputstring);
4477 :
4478 6 : result_text = cstring_to_text_with_len(start_ptr,
4479 : inputstring_len - last_len);
4480 : }
4481 : else
4482 6 : result_text = cstring_to_text("");
4483 : }
4484 : else
4485 : {
4486 : /* non-last field requested */
4487 24 : result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4488 : }
4489 :
4490 36 : PG_RETURN_TEXT_P(result_text);
4491 : }
4492 :
4493 : /*
4494 : * Convenience function to return true when two text params are equal.
4495 : */
4496 : static bool
4497 348 : text_isequal(text *txt1, text *txt2, Oid collid)
4498 : {
4499 348 : return DatumGetBool(DirectFunctionCall2Coll(texteq,
4500 : collid,
4501 : PointerGetDatum(txt1),
4502 : PointerGetDatum(txt2)));
4503 : }
4504 :
4505 : /*
4506 : * text_to_array
4507 : * parse input string and return text array of elements,
4508 : * based on provided field separator
4509 : */
4510 : Datum
4511 146 : text_to_array(PG_FUNCTION_ARGS)
4512 : {
4513 : SplitTextOutputData tstate;
4514 :
4515 : /* For array output, tstate should start as all zeroes */
4516 146 : memset(&tstate, 0, sizeof(tstate));
4517 :
4518 146 : if (!split_text(fcinfo, &tstate))
4519 6 : PG_RETURN_NULL();
4520 :
4521 128 : if (tstate.astate == NULL)
4522 6 : PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4523 :
4524 122 : PG_RETURN_DATUM(makeArrayResult(tstate.astate,
4525 : CurrentMemoryContext));
4526 : }
4527 :
4528 : /*
4529 : * text_to_array_null
4530 : * parse input string and return text array of elements,
4531 : * based on provided field separator and null string
4532 : *
4533 : * This is a separate entry point only to prevent the regression tests from
4534 : * complaining about different argument sets for the same internal function.
4535 : */
4536 : Datum
4537 60 : text_to_array_null(PG_FUNCTION_ARGS)
4538 : {
4539 60 : return text_to_array(fcinfo);
4540 : }
4541 :
4542 : /*
4543 : * text_to_table
4544 : * parse input string and return table of elements,
4545 : * based on provided field separator
4546 : */
4547 : Datum
4548 84 : text_to_table(PG_FUNCTION_ARGS)
4549 : {
4550 84 : ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4551 : SplitTextOutputData tstate;
4552 :
4553 84 : tstate.astate = NULL;
4554 84 : InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
4555 84 : tstate.tupstore = rsi->setResult;
4556 84 : tstate.tupdesc = rsi->setDesc;
4557 :
4558 84 : (void) split_text(fcinfo, &tstate);
4559 :
4560 84 : return (Datum) 0;
4561 : }
4562 :
4563 : /*
4564 : * text_to_table_null
4565 : * parse input string and return table of elements,
4566 : * based on provided field separator and null string
4567 : *
4568 : * This is a separate entry point only to prevent the regression tests from
4569 : * complaining about different argument sets for the same internal function.
4570 : */
4571 : Datum
4572 24 : text_to_table_null(PG_FUNCTION_ARGS)
4573 : {
4574 24 : return text_to_table(fcinfo);
4575 : }
4576 :
4577 : /*
4578 : * Common code for text_to_array, text_to_array_null, text_to_table
4579 : * and text_to_table_null functions.
4580 : *
4581 : * These are not strict so we have to test for null inputs explicitly.
4582 : * Returns false if result is to be null, else returns true.
4583 : *
4584 : * Note that if the result is valid but empty (zero elements), we return
4585 : * without changing *tstate --- caller must handle that case, too.
4586 : */
4587 : static bool
4588 230 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4589 : {
4590 : text *inputstring;
4591 : text *fldsep;
4592 : text *null_string;
4593 230 : Oid collation = PG_GET_COLLATION();
4594 : int inputstring_len;
4595 : int fldsep_len;
4596 : char *start_ptr;
4597 : text *result_text;
4598 :
4599 : /* when input string is NULL, then result is NULL too */
4600 230 : if (PG_ARGISNULL(0))
4601 12 : return false;
4602 :
4603 218 : inputstring = PG_GETARG_TEXT_PP(0);
4604 :
4605 : /* fldsep can be NULL */
4606 218 : if (!PG_ARGISNULL(1))
4607 188 : fldsep = PG_GETARG_TEXT_PP(1);
4608 : else
4609 30 : fldsep = NULL;
4610 :
4611 : /* null_string can be NULL or omitted */
4612 218 : if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4613 84 : null_string = PG_GETARG_TEXT_PP(2);
4614 : else
4615 134 : null_string = NULL;
4616 :
4617 218 : if (fldsep != NULL)
4618 : {
4619 : /*
4620 : * Normal case with non-null fldsep. Use the text_position machinery
4621 : * to search for occurrences of fldsep.
4622 : */
4623 : TextPositionState state;
4624 :
4625 188 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4626 188 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4627 :
4628 : /* return empty set for empty input string */
4629 188 : if (inputstring_len < 1)
4630 60 : return true;
4631 :
4632 : /* empty field separator: return input string as a one-element set */
4633 176 : if (fldsep_len < 1)
4634 : {
4635 48 : split_text_accum_result(tstate, inputstring,
4636 : null_string, collation);
4637 48 : return true;
4638 : }
4639 :
4640 128 : text_position_setup(inputstring, fldsep, collation, &state);
4641 :
4642 116 : start_ptr = VARDATA_ANY(inputstring);
4643 :
4644 : for (;;)
4645 464 : {
4646 : bool found;
4647 : char *end_ptr;
4648 : int chunk_len;
4649 :
4650 580 : CHECK_FOR_INTERRUPTS();
4651 :
4652 580 : found = text_position_next(&state);
4653 580 : if (!found)
4654 : {
4655 : /* fetch last field */
4656 116 : chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4657 116 : end_ptr = NULL; /* not used, but some compilers complain */
4658 : }
4659 : else
4660 : {
4661 : /* fetch non-last field */
4662 464 : end_ptr = text_position_get_match_ptr(&state);
4663 464 : chunk_len = end_ptr - start_ptr;
4664 : }
4665 :
4666 : /* build a temp text datum to pass to split_text_accum_result */
4667 580 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4668 :
4669 : /* stash away this field */
4670 580 : split_text_accum_result(tstate, result_text,
4671 : null_string, collation);
4672 :
4673 580 : pfree(result_text);
4674 :
4675 580 : if (!found)
4676 116 : break;
4677 :
4678 464 : start_ptr = end_ptr + fldsep_len;
4679 : }
4680 :
4681 116 : text_position_cleanup(&state);
4682 : }
4683 : else
4684 : {
4685 : /*
4686 : * When fldsep is NULL, each character in the input string becomes a
4687 : * separate element in the result set. The separator is effectively
4688 : * the space between characters.
4689 : */
4690 30 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4691 :
4692 30 : start_ptr = VARDATA_ANY(inputstring);
4693 :
4694 252 : while (inputstring_len > 0)
4695 : {
4696 222 : int chunk_len = pg_mblen(start_ptr);
4697 :
4698 222 : CHECK_FOR_INTERRUPTS();
4699 :
4700 : /* build a temp text datum to pass to split_text_accum_result */
4701 222 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4702 :
4703 : /* stash away this field */
4704 222 : split_text_accum_result(tstate, result_text,
4705 : null_string, collation);
4706 :
4707 222 : pfree(result_text);
4708 :
4709 222 : start_ptr += chunk_len;
4710 222 : inputstring_len -= chunk_len;
4711 : }
4712 : }
4713 :
4714 146 : return true;
4715 : }
4716 :
4717 : /*
4718 : * Add text item to result set (table or array).
4719 : *
4720 : * This is also responsible for checking to see if the item matches
4721 : * the null_string, in which case we should emit NULL instead.
4722 : */
4723 : static void
4724 850 : split_text_accum_result(SplitTextOutputData *tstate,
4725 : text *field_value,
4726 : text *null_string,
4727 : Oid collation)
4728 : {
4729 850 : bool is_null = false;
4730 :
4731 850 : if (null_string && text_isequal(field_value, null_string, collation))
4732 60 : is_null = true;
4733 :
4734 850 : if (tstate->tupstore)
4735 : {
4736 : Datum values[1];
4737 : bool nulls[1];
4738 :
4739 228 : values[0] = PointerGetDatum(field_value);
4740 228 : nulls[0] = is_null;
4741 :
4742 228 : tuplestore_putvalues(tstate->tupstore,
4743 : tstate->tupdesc,
4744 : values,
4745 : nulls);
4746 : }
4747 : else
4748 : {
4749 622 : tstate->astate = accumArrayResult(tstate->astate,
4750 : PointerGetDatum(field_value),
4751 : is_null,
4752 : TEXTOID,
4753 : CurrentMemoryContext);
4754 : }
4755 850 : }
4756 :
4757 : /*
4758 : * array_to_text
4759 : * concatenate Cstring representation of input array elements
4760 : * using provided field separator
4761 : */
4762 : Datum
4763 59664 : array_to_text(PG_FUNCTION_ARGS)
4764 : {
4765 59664 : ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
4766 59664 : char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4767 :
4768 59664 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4769 : }
4770 :
4771 : /*
4772 : * array_to_text_null
4773 : * concatenate Cstring representation of input array elements
4774 : * using provided field separator and null string
4775 : *
4776 : * This version is not strict so we have to test for null inputs explicitly.
4777 : */
4778 : Datum
4779 12 : array_to_text_null(PG_FUNCTION_ARGS)
4780 : {
4781 : ArrayType *v;
4782 : char *fldsep;
4783 : char *null_string;
4784 :
4785 : /* returns NULL when first or second parameter is NULL */
4786 12 : if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4787 0 : PG_RETURN_NULL();
4788 :
4789 12 : v = PG_GETARG_ARRAYTYPE_P(0);
4790 12 : fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4791 :
4792 : /* NULL null string is passed through as a null pointer */
4793 12 : if (!PG_ARGISNULL(2))
4794 6 : null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4795 : else
4796 6 : null_string = NULL;
4797 :
4798 12 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4799 : }
4800 :
4801 : /*
4802 : * common code for array_to_text and array_to_text_null functions
4803 : */
4804 : static text *
4805 59694 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4806 : const char *fldsep, const char *null_string)
4807 : {
4808 : text *result;
4809 : int nitems,
4810 : *dims,
4811 : ndims;
4812 : Oid element_type;
4813 : int typlen;
4814 : bool typbyval;
4815 : char typalign;
4816 : StringInfoData buf;
4817 59694 : bool printed = false;
4818 : char *p;
4819 : bits8 *bitmap;
4820 : int bitmask;
4821 : int i;
4822 : ArrayMetaState *my_extra;
4823 :
4824 59694 : ndims = ARR_NDIM(v);
4825 59694 : dims = ARR_DIMS(v);
4826 59694 : nitems = ArrayGetNItems(ndims, dims);
4827 :
4828 : /* if there are no elements, return an empty string */
4829 59694 : if (nitems == 0)
4830 36048 : return cstring_to_text_with_len("", 0);
4831 :
4832 23646 : element_type = ARR_ELEMTYPE(v);
4833 23646 : initStringInfo(&buf);
4834 :
4835 : /*
4836 : * We arrange to look up info about element type, including its output
4837 : * conversion proc, only once per series of calls, assuming the element
4838 : * type doesn't change underneath us.
4839 : */
4840 23646 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4841 23646 : if (my_extra == NULL)
4842 : {
4843 1354 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4844 : sizeof(ArrayMetaState));
4845 1354 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4846 1354 : my_extra->element_type = ~element_type;
4847 : }
4848 :
4849 23646 : if (my_extra->element_type != element_type)
4850 : {
4851 : /*
4852 : * Get info about element type, including its output conversion proc
4853 : */
4854 1354 : get_type_io_data(element_type, IOFunc_output,
4855 : &my_extra->typlen, &my_extra->typbyval,
4856 : &my_extra->typalign, &my_extra->typdelim,
4857 : &my_extra->typioparam, &my_extra->typiofunc);
4858 1354 : fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4859 1354 : fcinfo->flinfo->fn_mcxt);
4860 1354 : my_extra->element_type = element_type;
4861 : }
4862 23646 : typlen = my_extra->typlen;
4863 23646 : typbyval = my_extra->typbyval;
4864 23646 : typalign = my_extra->typalign;
4865 :
4866 23646 : p = ARR_DATA_PTR(v);
4867 23646 : bitmap = ARR_NULLBITMAP(v);
4868 23646 : bitmask = 1;
4869 :
4870 80826 : for (i = 0; i < nitems; i++)
4871 : {
4872 : Datum itemvalue;
4873 : char *value;
4874 :
4875 : /* Get source element, checking for NULL */
4876 57180 : if (bitmap && (*bitmap & bitmask) == 0)
4877 : {
4878 : /* if null_string is NULL, we just ignore null elements */
4879 18 : if (null_string != NULL)
4880 : {
4881 6 : if (printed)
4882 6 : appendStringInfo(&buf, "%s%s", fldsep, null_string);
4883 : else
4884 0 : appendStringInfoString(&buf, null_string);
4885 6 : printed = true;
4886 : }
4887 : }
4888 : else
4889 : {
4890 57162 : itemvalue = fetch_att(p, typbyval, typlen);
4891 :
4892 57162 : value = OutputFunctionCall(&my_extra->proc, itemvalue);
4893 :
4894 57162 : if (printed)
4895 33516 : appendStringInfo(&buf, "%s%s", fldsep, value);
4896 : else
4897 23646 : appendStringInfoString(&buf, value);
4898 57162 : printed = true;
4899 :
4900 57162 : p = att_addlength_pointer(p, typlen, p);
4901 57162 : p = (char *) att_align_nominal(p, typalign);
4902 : }
4903 :
4904 : /* advance bitmap pointer if any */
4905 57180 : if (bitmap)
4906 : {
4907 108 : bitmask <<= 1;
4908 108 : if (bitmask == 0x100)
4909 : {
4910 0 : bitmap++;
4911 0 : bitmask = 1;
4912 : }
4913 : }
4914 : }
4915 :
4916 23646 : result = cstring_to_text_with_len(buf.data, buf.len);
4917 23646 : pfree(buf.data);
4918 :
4919 23646 : return result;
4920 : }
4921 :
4922 : #define HEXBASE 16
4923 : /*
4924 : * Convert an int32 to a string containing a base 16 (hex) representation of
4925 : * the number.
4926 : */
4927 : Datum
4928 38684 : to_hex32(PG_FUNCTION_ARGS)
4929 : {
4930 38684 : uint32 value = (uint32) PG_GETARG_INT32(0);
4931 : char *ptr;
4932 38684 : const char *digits = "0123456789abcdef";
4933 : char buf[32]; /* bigger than needed, but reasonable */
4934 :
4935 38684 : ptr = buf + sizeof(buf) - 1;
4936 38684 : *ptr = '\0';
4937 :
4938 : do
4939 : {
4940 74558 : *--ptr = digits[value % HEXBASE];
4941 74558 : value /= HEXBASE;
4942 74558 : } while (ptr > buf && value);
4943 :
4944 38684 : PG_RETURN_TEXT_P(cstring_to_text(ptr));
4945 : }
4946 :
4947 : /*
4948 : * Convert an int64 to a string containing a base 16 (hex) representation of
4949 : * the number.
4950 : */
4951 : Datum
4952 6 : to_hex64(PG_FUNCTION_ARGS)
4953 : {
4954 6 : uint64 value = (uint64) PG_GETARG_INT64(0);
4955 : char *ptr;
4956 6 : const char *digits = "0123456789abcdef";
4957 : char buf[32]; /* bigger than needed, but reasonable */
4958 :
4959 6 : ptr = buf + sizeof(buf) - 1;
4960 6 : *ptr = '\0';
4961 :
4962 : do
4963 : {
4964 48 : *--ptr = digits[value % HEXBASE];
4965 48 : value /= HEXBASE;
4966 48 : } while (ptr > buf && value);
4967 :
4968 6 : PG_RETURN_TEXT_P(cstring_to_text(ptr));
4969 : }
4970 :
4971 : /*
4972 : * Return the size of a datum, possibly compressed
4973 : *
4974 : * Works on any data type
4975 : */
4976 : Datum
4977 122 : pg_column_size(PG_FUNCTION_ARGS)
4978 : {
4979 122 : Datum value = PG_GETARG_DATUM(0);
4980 : int32 result;
4981 : int typlen;
4982 :
4983 : /* On first call, get the input type's typlen, and save at *fn_extra */
4984 122 : if (fcinfo->flinfo->fn_extra == NULL)
4985 : {
4986 : /* Lookup the datatype of the supplied argument */
4987 122 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
4988 :
4989 122 : typlen = get_typlen(argtypeid);
4990 122 : if (typlen == 0) /* should not happen */
4991 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
4992 :
4993 122 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4994 : sizeof(int));
4995 122 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
4996 : }
4997 : else
4998 0 : typlen = *((int *) fcinfo->flinfo->fn_extra);
4999 :
5000 122 : if (typlen == -1)
5001 : {
5002 : /* varlena type, possibly toasted */
5003 122 : result = toast_datum_size(value);
5004 : }
5005 0 : else if (typlen == -2)
5006 : {
5007 : /* cstring */
5008 0 : result = strlen(DatumGetCString(value)) + 1;
5009 : }
5010 : else
5011 : {
5012 : /* ordinary fixed-width type */
5013 0 : result = typlen;
5014 : }
5015 :
5016 122 : PG_RETURN_INT32(result);
5017 : }
5018 :
5019 : /*
5020 : * Return the compression method stored in the compressed attribute. Return
5021 : * NULL for non varlena type or uncompressed data.
5022 : */
5023 : Datum
5024 162 : pg_column_compression(PG_FUNCTION_ARGS)
5025 : {
5026 : int typlen;
5027 : char *result;
5028 : ToastCompressionId cmid;
5029 :
5030 : /* On first call, get the input type's typlen, and save at *fn_extra */
5031 162 : if (fcinfo->flinfo->fn_extra == NULL)
5032 : {
5033 : /* Lookup the datatype of the supplied argument */
5034 108 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5035 :
5036 108 : typlen = get_typlen(argtypeid);
5037 108 : if (typlen == 0) /* should not happen */
5038 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5039 :
5040 108 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5041 : sizeof(int));
5042 108 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5043 : }
5044 : else
5045 54 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5046 :
5047 162 : if (typlen != -1)
5048 0 : PG_RETURN_NULL();
5049 :
5050 : /* get the compression method id stored in the compressed varlena */
5051 162 : cmid = toast_get_compression_id((struct varlena *)
5052 162 : DatumGetPointer(PG_GETARG_DATUM(0)));
5053 162 : if (cmid == TOAST_INVALID_COMPRESSION_ID)
5054 6 : PG_RETURN_NULL();
5055 :
5056 : /* convert compression method id to compression method name */
5057 156 : switch (cmid)
5058 : {
5059 66 : case TOAST_PGLZ_COMPRESSION_ID:
5060 66 : result = "pglz";
5061 66 : break;
5062 90 : case TOAST_LZ4_COMPRESSION_ID:
5063 90 : result = "lz4";
5064 90 : break;
5065 0 : default:
5066 0 : elog(ERROR, "invalid compression method id %d", cmid);
5067 : }
5068 :
5069 156 : PG_RETURN_TEXT_P(cstring_to_text(result));
5070 : }
5071 :
5072 : /*
5073 : * string_agg - Concatenates values and returns string.
5074 : *
5075 : * Syntax: string_agg(value text, delimiter text) RETURNS text
5076 : *
5077 : * Note: Any NULL values are ignored. The first-call delimiter isn't
5078 : * actually used at all, and on subsequent calls the delimiter precedes
5079 : * the associated value.
5080 : */
5081 :
5082 : /* subroutine to initialize state */
5083 : static StringInfo
5084 1974 : makeStringAggState(FunctionCallInfo fcinfo)
5085 : {
5086 : StringInfo state;
5087 : MemoryContext aggcontext;
5088 : MemoryContext oldcontext;
5089 :
5090 1974 : if (!AggCheckCallContext(fcinfo, &aggcontext))
5091 : {
5092 : /* cannot be called directly because of internal-type argument */
5093 0 : elog(ERROR, "string_agg_transfn called in non-aggregate context");
5094 : }
5095 :
5096 : /*
5097 : * Create state in aggregate context. It'll stay there across subsequent
5098 : * calls.
5099 : */
5100 1974 : oldcontext = MemoryContextSwitchTo(aggcontext);
5101 1974 : state = makeStringInfo();
5102 1974 : MemoryContextSwitchTo(oldcontext);
5103 :
5104 1974 : return state;
5105 : }
5106 :
5107 : Datum
5108 850216 : string_agg_transfn(PG_FUNCTION_ARGS)
5109 : {
5110 : StringInfo state;
5111 :
5112 850216 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5113 :
5114 : /* Append the value unless null, preceding it with the delimiter. */
5115 850216 : if (!PG_ARGISNULL(1))
5116 : {
5117 835168 : text *value = PG_GETARG_TEXT_PP(1);
5118 835168 : bool isfirst = false;
5119 :
5120 : /*
5121 : * You might think we can just throw away the first delimiter, however
5122 : * we must keep it as we may be a parallel worker doing partial
5123 : * aggregation building a state to send to the main process. We need
5124 : * to keep the delimiter of every aggregation so that the combine
5125 : * function can properly join up the strings of two separately
5126 : * partially aggregated results. The first delimiter is only stripped
5127 : * off in the final function. To know how much to strip off the front
5128 : * of the string, we store the length of the first delimiter in the
5129 : * StringInfo's cursor field, which we don't otherwise need here.
5130 : */
5131 835168 : if (state == NULL)
5132 : {
5133 1528 : state = makeStringAggState(fcinfo);
5134 1528 : isfirst = true;
5135 : }
5136 :
5137 835168 : if (!PG_ARGISNULL(2))
5138 : {
5139 835168 : text *delim = PG_GETARG_TEXT_PP(2);
5140 :
5141 835168 : appendStringInfoText(state, delim);
5142 835168 : if (isfirst)
5143 1528 : state->cursor = VARSIZE_ANY_EXHDR(delim);
5144 : }
5145 :
5146 835168 : appendStringInfoText(state, value);
5147 : }
5148 :
5149 : /*
5150 : * The transition type for string_agg() is declared to be "internal",
5151 : * which is a pass-by-value type the same size as a pointer.
5152 : */
5153 850216 : if (state)
5154 850132 : PG_RETURN_POINTER(state);
5155 84 : PG_RETURN_NULL();
5156 : }
5157 :
5158 : /*
5159 : * string_agg_combine
5160 : * Aggregate combine function for string_agg(text) and string_agg(bytea)
5161 : */
5162 : Datum
5163 160 : string_agg_combine(PG_FUNCTION_ARGS)
5164 : {
5165 : StringInfo state1;
5166 : StringInfo state2;
5167 : MemoryContext agg_context;
5168 :
5169 160 : if (!AggCheckCallContext(fcinfo, &agg_context))
5170 0 : elog(ERROR, "aggregate function called in non-aggregate context");
5171 :
5172 160 : state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5173 160 : state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
5174 :
5175 160 : if (state2 == NULL)
5176 : {
5177 : /*
5178 : * NULL state2 is easy, just return state1, which we know is already
5179 : * in the agg_context
5180 : */
5181 0 : if (state1 == NULL)
5182 0 : PG_RETURN_NULL();
5183 0 : PG_RETURN_POINTER(state1);
5184 : }
5185 :
5186 160 : if (state1 == NULL)
5187 : {
5188 : /* We must copy state2's data into the agg_context */
5189 : MemoryContext old_context;
5190 :
5191 120 : old_context = MemoryContextSwitchTo(agg_context);
5192 120 : state1 = makeStringAggState(fcinfo);
5193 120 : appendBinaryStringInfo(state1, state2->data, state2->len);
5194 120 : state1->cursor = state2->cursor;
5195 120 : MemoryContextSwitchTo(old_context);
5196 : }
5197 40 : else if (state2->len > 0)
5198 : {
5199 : /* Combine ... state1->cursor does not change in this case */
5200 40 : appendBinaryStringInfo(state1, state2->data, state2->len);
5201 : }
5202 :
5203 160 : PG_RETURN_POINTER(state1);
5204 : }
5205 :
5206 : /*
5207 : * string_agg_serialize
5208 : * Aggregate serialize function for string_agg(text) and string_agg(bytea)
5209 : *
5210 : * This is strict, so we need not handle NULL input
5211 : */
5212 : Datum
5213 160 : string_agg_serialize(PG_FUNCTION_ARGS)
5214 : {
5215 : StringInfo state;
5216 : StringInfoData buf;
5217 : bytea *result;
5218 :
5219 : /* cannot be called directly because of internal-type argument */
5220 : Assert(AggCheckCallContext(fcinfo, NULL));
5221 :
5222 160 : state = (StringInfo) PG_GETARG_POINTER(0);
5223 :
5224 160 : pq_begintypsend(&buf);
5225 :
5226 : /* cursor */
5227 160 : pq_sendint(&buf, state->cursor, 4);
5228 :
5229 : /* data */
5230 160 : pq_sendbytes(&buf, state->data, state->len);
5231 :
5232 160 : result = pq_endtypsend(&buf);
5233 :
5234 160 : PG_RETURN_BYTEA_P(result);
5235 : }
5236 :
5237 : /*
5238 : * string_agg_deserialize
5239 : * Aggregate deserial function for string_agg(text) and string_agg(bytea)
5240 : *
5241 : * This is strict, so we need not handle NULL input
5242 : */
5243 : Datum
5244 160 : string_agg_deserialize(PG_FUNCTION_ARGS)
5245 : {
5246 : bytea *sstate;
5247 : StringInfo result;
5248 : StringInfoData buf;
5249 : char *data;
5250 : int datalen;
5251 :
5252 : /* cannot be called directly because of internal-type argument */
5253 : Assert(AggCheckCallContext(fcinfo, NULL));
5254 :
5255 160 : sstate = PG_GETARG_BYTEA_PP(0);
5256 :
5257 : /*
5258 : * Copy the bytea into a StringInfo so that we can "receive" it using the
5259 : * standard recv-function infrastructure.
5260 : */
5261 160 : initStringInfo(&buf);
5262 320 : appendBinaryStringInfo(&buf,
5263 320 : VARDATA_ANY(sstate), VARSIZE_ANY_EXHDR(sstate));
5264 :
5265 160 : result = makeStringAggState(fcinfo);
5266 :
5267 : /* cursor */
5268 160 : result->cursor = pq_getmsgint(&buf, 4);
5269 :
5270 : /* data */
5271 160 : datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
5272 160 : data = (char *) pq_getmsgbytes(&buf, datalen);
5273 160 : appendBinaryStringInfo(result, data, datalen);
5274 :
5275 160 : pq_getmsgend(&buf);
5276 160 : pfree(buf.data);
5277 :
5278 160 : PG_RETURN_POINTER(result);
5279 : }
5280 :
5281 : Datum
5282 1580 : string_agg_finalfn(PG_FUNCTION_ARGS)
5283 : {
5284 : StringInfo state;
5285 :
5286 : /* cannot be called directly because of internal-type argument */
5287 : Assert(AggCheckCallContext(fcinfo, NULL));
5288 :
5289 1580 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5290 :
5291 1580 : if (state != NULL)
5292 : {
5293 : /* As per comment in transfn, strip data before the cursor position */
5294 1508 : PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
5295 : state->len - state->cursor));
5296 : }
5297 : else
5298 72 : PG_RETURN_NULL();
5299 : }
5300 :
5301 : /*
5302 : * Prepare cache with fmgr info for the output functions of the datatypes of
5303 : * the arguments of a concat-like function, beginning with argument "argidx".
5304 : * (Arguments before that will have corresponding slots in the resulting
5305 : * FmgrInfo array, but we don't fill those slots.)
5306 : */
5307 : static FmgrInfo *
5308 40 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5309 : {
5310 : FmgrInfo *foutcache;
5311 : int i;
5312 :
5313 : /* We keep the info in fn_mcxt so it survives across calls */
5314 40 : foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5315 40 : PG_NARGS() * sizeof(FmgrInfo));
5316 :
5317 196 : for (i = argidx; i < PG_NARGS(); i++)
5318 : {
5319 : Oid valtype;
5320 : Oid typOutput;
5321 : bool typIsVarlena;
5322 :
5323 156 : valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5324 156 : if (!OidIsValid(valtype))
5325 0 : elog(ERROR, "could not determine data type of concat() input");
5326 :
5327 156 : getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5328 156 : fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5329 : }
5330 :
5331 40 : fcinfo->flinfo->fn_extra = foutcache;
5332 :
5333 40 : return foutcache;
5334 : }
5335 :
5336 : /*
5337 : * Implementation of both concat() and concat_ws().
5338 : *
5339 : * sepstr is the separator string to place between values.
5340 : * argidx identifies the first argument to concatenate (counting from zero);
5341 : * note that this must be constant across any one series of calls.
5342 : *
5343 : * Returns NULL if result should be NULL, else text value.
5344 : */
5345 : static text *
5346 72 : concat_internal(const char *sepstr, int argidx,
5347 : FunctionCallInfo fcinfo)
5348 : {
5349 : text *result;
5350 : StringInfoData str;
5351 : FmgrInfo *foutcache;
5352 72 : bool first_arg = true;
5353 : int i;
5354 :
5355 : /*
5356 : * concat(VARIADIC some-array) is essentially equivalent to
5357 : * array_to_text(), ie concat the array elements with the given separator.
5358 : * So we just pass the case off to that code.
5359 : */
5360 72 : if (get_fn_expr_variadic(fcinfo->flinfo))
5361 : {
5362 : ArrayType *arr;
5363 :
5364 : /* Should have just the one argument */
5365 : Assert(argidx == PG_NARGS() - 1);
5366 :
5367 : /* concat(VARIADIC NULL) is defined as NULL */
5368 30 : if (PG_ARGISNULL(argidx))
5369 12 : return NULL;
5370 :
5371 : /*
5372 : * Non-null argument had better be an array. We assume that any call
5373 : * context that could let get_fn_expr_variadic return true will have
5374 : * checked that a VARIADIC-labeled parameter actually is an array. So
5375 : * it should be okay to just Assert that it's an array rather than
5376 : * doing a full-fledged error check.
5377 : */
5378 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5379 :
5380 : /* OK, safe to fetch the array value */
5381 18 : arr = PG_GETARG_ARRAYTYPE_P(argidx);
5382 :
5383 : /*
5384 : * And serialize the array. We tell array_to_text to ignore null
5385 : * elements, which matches the behavior of the loop below.
5386 : */
5387 18 : return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5388 : }
5389 :
5390 : /* Normal case without explicit VARIADIC marker */
5391 42 : initStringInfo(&str);
5392 :
5393 : /* Get output function info, building it if first time through */
5394 42 : foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5395 42 : if (foutcache == NULL)
5396 40 : foutcache = build_concat_foutcache(fcinfo, argidx);
5397 :
5398 204 : for (i = argidx; i < PG_NARGS(); i++)
5399 : {
5400 162 : if (!PG_ARGISNULL(i))
5401 : {
5402 150 : Datum value = PG_GETARG_DATUM(i);
5403 :
5404 : /* add separator if appropriate */
5405 150 : if (first_arg)
5406 42 : first_arg = false;
5407 : else
5408 108 : appendStringInfoString(&str, sepstr);
5409 :
5410 : /* call the appropriate type output function, append the result */
5411 150 : appendStringInfoString(&str,
5412 150 : OutputFunctionCall(&foutcache[i], value));
5413 : }
5414 : }
5415 :
5416 42 : result = cstring_to_text_with_len(str.data, str.len);
5417 42 : pfree(str.data);
5418 :
5419 42 : return result;
5420 : }
5421 :
5422 : /*
5423 : * Concatenate all arguments. NULL arguments are ignored.
5424 : */
5425 : Datum
5426 36 : text_concat(PG_FUNCTION_ARGS)
5427 : {
5428 : text *result;
5429 :
5430 36 : result = concat_internal("", 0, fcinfo);
5431 36 : if (result == NULL)
5432 6 : PG_RETURN_NULL();
5433 30 : PG_RETURN_TEXT_P(result);
5434 : }
5435 :
5436 : /*
5437 : * Concatenate all but first argument value with separators. The first
5438 : * parameter is used as the separator. NULL arguments are ignored.
5439 : */
5440 : Datum
5441 42 : text_concat_ws(PG_FUNCTION_ARGS)
5442 : {
5443 : char *sep;
5444 : text *result;
5445 :
5446 : /* return NULL when separator is NULL */
5447 42 : if (PG_ARGISNULL(0))
5448 6 : PG_RETURN_NULL();
5449 36 : sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5450 :
5451 36 : result = concat_internal(sep, 1, fcinfo);
5452 36 : if (result == NULL)
5453 6 : PG_RETURN_NULL();
5454 30 : PG_RETURN_TEXT_P(result);
5455 : }
5456 :
5457 : /*
5458 : * Return first n characters in the string. When n is negative,
5459 : * return all but last |n| characters.
5460 : */
5461 : Datum
5462 1884 : text_left(PG_FUNCTION_ARGS)
5463 : {
5464 1884 : int n = PG_GETARG_INT32(1);
5465 :
5466 1884 : if (n < 0)
5467 : {
5468 30 : text *str = PG_GETARG_TEXT_PP(0);
5469 30 : const char *p = VARDATA_ANY(str);
5470 30 : int len = VARSIZE_ANY_EXHDR(str);
5471 : int rlen;
5472 :
5473 30 : n = pg_mbstrlen_with_len(p, len) + n;
5474 30 : rlen = pg_mbcharcliplen(p, len, n);
5475 30 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5476 : }
5477 : else
5478 1854 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5479 : }
5480 :
5481 : /*
5482 : * Return last n characters in the string. When n is negative,
5483 : * return all but first |n| characters.
5484 : */
5485 : Datum
5486 66 : text_right(PG_FUNCTION_ARGS)
5487 : {
5488 66 : text *str = PG_GETARG_TEXT_PP(0);
5489 66 : const char *p = VARDATA_ANY(str);
5490 66 : int len = VARSIZE_ANY_EXHDR(str);
5491 66 : int n = PG_GETARG_INT32(1);
5492 : int off;
5493 :
5494 66 : if (n < 0)
5495 30 : n = -n;
5496 : else
5497 36 : n = pg_mbstrlen_with_len(p, len) - n;
5498 66 : off = pg_mbcharcliplen(p, len, n);
5499 :
5500 66 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5501 : }
5502 :
5503 : /*
5504 : * Return reversed string
5505 : */
5506 : Datum
5507 6 : text_reverse(PG_FUNCTION_ARGS)
5508 : {
5509 6 : text *str = PG_GETARG_TEXT_PP(0);
5510 6 : const char *p = VARDATA_ANY(str);
5511 6 : int len = VARSIZE_ANY_EXHDR(str);
5512 6 : const char *endp = p + len;
5513 : text *result;
5514 : char *dst;
5515 :
5516 6 : result = palloc(len + VARHDRSZ);
5517 6 : dst = (char *) VARDATA(result) + len;
5518 6 : SET_VARSIZE(result, len + VARHDRSZ);
5519 :
5520 6 : if (pg_database_encoding_max_length() > 1)
5521 : {
5522 : /* multibyte version */
5523 36 : while (p < endp)
5524 : {
5525 : int sz;
5526 :
5527 30 : sz = pg_mblen(p);
5528 30 : dst -= sz;
5529 30 : memcpy(dst, p, sz);
5530 30 : p += sz;
5531 : }
5532 : }
5533 : else
5534 : {
5535 : /* single byte version */
5536 0 : while (p < endp)
5537 0 : *(--dst) = *p++;
5538 : }
5539 :
5540 6 : PG_RETURN_TEXT_P(result);
5541 : }
5542 :
5543 :
/*
 * Support macros for text_format()
 *
 * TEXT_FORMAT_FLAG_MINUS is the flag bit recording that a '-' flag was
 * present in a format specifier.
 *
 * ADVANCE_PARSE_POINTER steps "ptr" forward by one and raises an error if
 * that reaches or passes "end_ptr", ie if the format string ends in the
 * middle of a specifier.
 */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5557 :
5558 : /*
5559 : * Returns a formatted string
5560 : */
5561 : Datum
5562 25068 : text_format(PG_FUNCTION_ARGS)
5563 : {
5564 : text *fmt;
5565 : StringInfoData str;
5566 : const char *cp;
5567 : const char *start_ptr;
5568 : const char *end_ptr;
5569 : text *result;
5570 : int arg;
5571 : bool funcvariadic;
5572 : int nargs;
5573 25068 : Datum *elements = NULL;
5574 25068 : bool *nulls = NULL;
5575 25068 : Oid element_type = InvalidOid;
5576 25068 : Oid prev_type = InvalidOid;
5577 25068 : Oid prev_width_type = InvalidOid;
5578 : FmgrInfo typoutputfinfo;
5579 : FmgrInfo typoutputinfo_width;
5580 :
5581 : /* When format string is null, immediately return null */
5582 25068 : if (PG_ARGISNULL(0))
5583 6 : PG_RETURN_NULL();
5584 :
5585 : /* If argument is marked VARIADIC, expand array into elements */
5586 25062 : if (get_fn_expr_variadic(fcinfo->flinfo))
5587 : {
5588 : ArrayType *arr;
5589 : int16 elmlen;
5590 : bool elmbyval;
5591 : char elmalign;
5592 : int nitems;
5593 :
5594 : /* Should have just the one argument */
5595 : Assert(PG_NARGS() == 2);
5596 :
5597 : /* If argument is NULL, we treat it as zero-length array */
5598 48 : if (PG_ARGISNULL(1))
5599 6 : nitems = 0;
5600 : else
5601 : {
5602 : /*
5603 : * Non-null argument had better be an array. We assume that any
5604 : * call context that could let get_fn_expr_variadic return true
5605 : * will have checked that a VARIADIC-labeled parameter actually is
5606 : * an array. So it should be okay to just Assert that it's an
5607 : * array rather than doing a full-fledged error check.
5608 : */
5609 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5610 :
5611 : /* OK, safe to fetch the array value */
5612 42 : arr = PG_GETARG_ARRAYTYPE_P(1);
5613 :
5614 : /* Get info about array element type */
5615 42 : element_type = ARR_ELEMTYPE(arr);
5616 42 : get_typlenbyvalalign(element_type,
5617 : &elmlen, &elmbyval, &elmalign);
5618 :
5619 : /* Extract all array elements */
5620 42 : deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5621 : &elements, &nulls, &nitems);
5622 : }
5623 :
5624 48 : nargs = nitems + 1;
5625 48 : funcvariadic = true;
5626 : }
5627 : else
5628 : {
5629 : /* Non-variadic case, we'll process the arguments individually */
5630 25014 : nargs = PG_NARGS();
5631 25014 : funcvariadic = false;
5632 : }
5633 :
5634 : /* Setup for main loop. */
5635 25062 : fmt = PG_GETARG_TEXT_PP(0);
5636 25062 : start_ptr = VARDATA_ANY(fmt);
5637 25062 : end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5638 25062 : initStringInfo(&str);
5639 25062 : arg = 1; /* next argument position to print */
5640 :
5641 : /* Scan format string, looking for conversion specifiers. */
5642 727698 : for (cp = start_ptr; cp < end_ptr; cp++)
5643 : {
5644 : int argpos;
5645 : int widthpos;
5646 : int flags;
5647 : int width;
5648 : Datum value;
5649 : bool isNull;
5650 : Oid typid;
5651 :
5652 : /*
5653 : * If it's not the start of a conversion specifier, just copy it to
5654 : * the output buffer.
5655 : */
5656 702696 : if (*cp != '%')
5657 : {
5658 643614 : appendStringInfoCharMacro(&str, *cp);
5659 643632 : continue;
5660 : }
5661 :
5662 59082 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5663 :
5664 : /* Easy case: %% outputs a single % */
5665 59082 : if (*cp == '%')
5666 : {
5667 18 : appendStringInfoCharMacro(&str, *cp);
5668 18 : continue;
5669 : }
5670 :
5671 : /* Parse the optional portions of the format specifier */
5672 59064 : cp = text_format_parse_format(cp, end_ptr,
5673 : &argpos, &widthpos,
5674 : &flags, &width);
5675 :
5676 : /*
5677 : * Next we should see the main conversion specifier. Whether or not
5678 : * an argument position was present, it's known that at least one
5679 : * character remains in the string at this point. Experience suggests
5680 : * that it's worth checking that that character is one of the expected
5681 : * ones before we try to fetch arguments, so as to produce the least
5682 : * confusing response to a mis-formatted specifier.
5683 : */
5684 59040 : if (strchr("sIL", *cp) == NULL)
5685 6 : ereport(ERROR,
5686 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5687 : errmsg("unrecognized format() type specifier \"%.*s\"",
5688 : pg_mblen(cp), cp),
5689 : errhint("For a single \"%%\" use \"%%%%\".")));
5690 :
5691 : /* If indirect width was specified, get its value */
5692 59034 : if (widthpos >= 0)
5693 : {
5694 : /* Collect the specified or next argument position */
5695 42 : if (widthpos > 0)
5696 36 : arg = widthpos;
5697 42 : if (arg >= nargs)
5698 0 : ereport(ERROR,
5699 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5700 : errmsg("too few arguments for format()")));
5701 :
5702 : /* Get the value and type of the selected argument */
5703 42 : if (!funcvariadic)
5704 : {
5705 42 : value = PG_GETARG_DATUM(arg);
5706 42 : isNull = PG_ARGISNULL(arg);
5707 42 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5708 : }
5709 : else
5710 : {
5711 0 : value = elements[arg - 1];
5712 0 : isNull = nulls[arg - 1];
5713 0 : typid = element_type;
5714 : }
5715 42 : if (!OidIsValid(typid))
5716 0 : elog(ERROR, "could not determine data type of format() input");
5717 :
5718 42 : arg++;
5719 :
5720 : /* We can treat NULL width the same as zero */
5721 42 : if (isNull)
5722 6 : width = 0;
5723 36 : else if (typid == INT4OID)
5724 36 : width = DatumGetInt32(value);
5725 0 : else if (typid == INT2OID)
5726 0 : width = DatumGetInt16(value);
5727 : else
5728 : {
5729 : /* For less-usual datatypes, convert to text then to int */
5730 : char *str;
5731 :
5732 0 : if (typid != prev_width_type)
5733 : {
5734 : Oid typoutputfunc;
5735 : bool typIsVarlena;
5736 :
5737 0 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5738 0 : fmgr_info(typoutputfunc, &typoutputinfo_width);
5739 0 : prev_width_type = typid;
5740 : }
5741 :
5742 0 : str = OutputFunctionCall(&typoutputinfo_width, value);
5743 :
5744 : /* pg_strtoint32 will complain about bad data or overflow */
5745 0 : width = pg_strtoint32(str);
5746 :
5747 0 : pfree(str);
5748 : }
5749 : }
5750 :
5751 : /* Collect the specified or next argument position */
5752 59034 : if (argpos > 0)
5753 132 : arg = argpos;
5754 59034 : if (arg >= nargs)
5755 24 : ereport(ERROR,
5756 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5757 : errmsg("too few arguments for format()")));
5758 :
5759 : /* Get the value and type of the selected argument */
5760 59010 : if (!funcvariadic)
5761 : {
5762 57738 : value = PG_GETARG_DATUM(arg);
5763 57738 : isNull = PG_ARGISNULL(arg);
5764 57738 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5765 : }
5766 : else
5767 : {
5768 1272 : value = elements[arg - 1];
5769 1272 : isNull = nulls[arg - 1];
5770 1272 : typid = element_type;
5771 : }
5772 59010 : if (!OidIsValid(typid))
5773 0 : elog(ERROR, "could not determine data type of format() input");
5774 :
5775 59010 : arg++;
5776 :
5777 : /*
5778 : * Get the appropriate typOutput function, reusing previous one if
5779 : * same type as previous argument. That's particularly useful in the
5780 : * variadic-array case, but often saves work even for ordinary calls.
5781 : */
5782 59010 : if (typid != prev_type)
5783 : {
5784 : Oid typoutputfunc;
5785 : bool typIsVarlena;
5786 :
5787 27960 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5788 27960 : fmgr_info(typoutputfunc, &typoutputfinfo);
5789 27960 : prev_type = typid;
5790 : }
5791 :
5792 : /*
5793 : * And now we can format the value.
5794 : */
5795 59010 : switch (*cp)
5796 : {
5797 59010 : case 's':
5798 : case 'I':
5799 : case 'L':
5800 59010 : text_format_string_conversion(&str, *cp, &typoutputfinfo,
5801 : value, isNull,
5802 : flags, width);
5803 59004 : break;
5804 0 : default:
5805 : /* should not get here, because of previous check */
5806 0 : ereport(ERROR,
5807 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5808 : errmsg("unrecognized format() type specifier \"%.*s\"",
5809 : pg_mblen(cp), cp),
5810 : errhint("For a single \"%%\" use \"%%%%\".")));
5811 : break;
5812 : }
5813 : }
5814 :
5815 : /* Don't need deconstruct_array results anymore. */
5816 25002 : if (elements != NULL)
5817 42 : pfree(elements);
5818 25002 : if (nulls != NULL)
5819 42 : pfree(nulls);
5820 :
5821 : /* Generate results. */
5822 25002 : result = cstring_to_text_with_len(str.data, str.len);
5823 25002 : pfree(str.data);
5824 :
5825 25002 : PG_RETURN_TEXT_P(result);
5826 : }
5827 :
5828 : /*
5829 : * Parse contiguous digits as a decimal number.
5830 : *
5831 : * Returns true if some digits could be parsed.
5832 : * The value is returned into *value, and *ptr is advanced to the next
5833 : * character to be parsed.
5834 : *
5835 : * Note parsing invariant: at least one character is known available before
5836 : * string end (end_ptr) at entry, and this is still true at exit.
5837 : */
5838 : static bool
5839 118092 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5840 : {
5841 118092 : bool found = false;
5842 118092 : const char *cp = *ptr;
5843 118092 : int val = 0;
5844 :
5845 118404 : while (*cp >= '0' && *cp <= '9')
5846 : {
5847 318 : int8 digit = (*cp - '0');
5848 :
5849 318 : if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5850 318 : unlikely(pg_add_s32_overflow(val, digit, &val)))
5851 0 : ereport(ERROR,
5852 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5853 : errmsg("number is out of range")));
5854 318 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5855 312 : found = true;
5856 : }
5857 :
5858 118086 : *ptr = cp;
5859 118086 : *value = val;
5860 :
5861 118086 : return found;
5862 : }
5863 :
5864 : /*
5865 : * Parse a format specifier (generally following the SUS printf spec).
5866 : *
5867 : * We have already advanced over the initial '%', and we are looking for
5868 : * [argpos][flags][width]type (but the type character is not consumed here).
5869 : *
5870 : * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5871 : * Output parameters:
5872 : * argpos: argument position for value to be printed. -1 means unspecified.
5873 : * widthpos: argument position for width. Zero means the argument position
5874 : * was unspecified (ie, take the next arg) and -1 means no width
5875 : * argument (width was omitted or specified as a constant).
5876 : * flags: bitmask of flags.
5877 : * width: directly-specified width value. Zero means the width was omitted
5878 : * (note it's not necessary to distinguish this case from an explicit
5879 : * zero width value).
5880 : *
5881 : * The function result is the next character position to be parsed, ie, the
5882 : * location where the type character is/should be.
5883 : *
5884 : * Note parsing invariant: at least one character is known available before
5885 : * string end (end_ptr) at entry, and this is still true at exit.
5886 : */
5887 : static const char *
5888 59064 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
5889 : int *argpos, int *widthpos,
5890 : int *flags, int *width)
5891 : {
5892 59064 : const char *cp = start_ptr;
5893 : int n;
5894 :
5895 : /* set defaults for output parameters */
5896 59064 : *argpos = -1;
5897 59064 : *widthpos = -1;
5898 59064 : *flags = 0;
5899 59064 : *width = 0;
5900 :
5901 : /* try to identify first number */
5902 59064 : if (text_format_parse_digits(&cp, end_ptr, &n))
5903 : {
5904 174 : if (*cp != '$')
5905 : {
5906 : /* Must be just a width and a type, so we're done */
5907 24 : *width = n;
5908 24 : return cp;
5909 : }
5910 : /* The number was argument position */
5911 150 : *argpos = n;
5912 : /* Explicit 0 for argument index is immediately refused */
5913 150 : if (n == 0)
5914 6 : ereport(ERROR,
5915 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5916 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
5917 144 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5918 : }
5919 :
5920 : /* Handle flags (only minus is supported now) */
5921 59058 : while (*cp == '-')
5922 : {
5923 30 : *flags |= TEXT_FORMAT_FLAG_MINUS;
5924 30 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5925 : }
5926 :
5927 59028 : if (*cp == '*')
5928 : {
5929 : /* Handle indirect width */
5930 48 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5931 48 : if (text_format_parse_digits(&cp, end_ptr, &n))
5932 : {
5933 : /* number in this position must be closed by $ */
5934 42 : if (*cp != '$')
5935 0 : ereport(ERROR,
5936 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5937 : errmsg("width argument position must be ended by \"$\"")));
5938 : /* The number was width argument position */
5939 42 : *widthpos = n;
5940 : /* Explicit 0 for argument index is immediately refused */
5941 42 : if (n == 0)
5942 6 : ereport(ERROR,
5943 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5944 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
5945 36 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5946 : }
5947 : else
5948 6 : *widthpos = 0; /* width's argument position is unspecified */
5949 : }
5950 : else
5951 : {
5952 : /* Check for direct width specification */
5953 58980 : if (text_format_parse_digits(&cp, end_ptr, &n))
5954 30 : *width = n;
5955 : }
5956 :
5957 : /* cp should now be pointing at type character */
5958 59016 : return cp;
5959 : }
5960 :
5961 : /*
5962 : * Format a %s, %I, or %L conversion
5963 : */
5964 : static void
5965 59010 : text_format_string_conversion(StringInfo buf, char conversion,
5966 : FmgrInfo *typOutputInfo,
5967 : Datum value, bool isNull,
5968 : int flags, int width)
5969 : {
5970 : char *str;
5971 :
5972 : /* Handle NULL arguments before trying to stringify the value. */
5973 59010 : if (isNull)
5974 : {
5975 306 : if (conversion == 's')
5976 234 : text_format_append_string(buf, "", flags, width);
5977 72 : else if (conversion == 'L')
5978 66 : text_format_append_string(buf, "NULL", flags, width);
5979 6 : else if (conversion == 'I')
5980 6 : ereport(ERROR,
5981 : (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5982 : errmsg("null values cannot be formatted as an SQL identifier")));
5983 300 : return;
5984 : }
5985 :
5986 : /* Stringify. */
5987 58704 : str = OutputFunctionCall(typOutputInfo, value);
5988 :
5989 : /* Escape. */
5990 58704 : if (conversion == 'I')
5991 : {
5992 : /* quote_identifier may or may not allocate a new string. */
5993 3106 : text_format_append_string(buf, quote_identifier(str), flags, width);
5994 : }
5995 55598 : else if (conversion == 'L')
5996 : {
5997 2578 : char *qstr = quote_literal_cstr(str);
5998 :
5999 2578 : text_format_append_string(buf, qstr, flags, width);
6000 : /* quote_literal_cstr() always allocates a new string */
6001 2578 : pfree(qstr);
6002 : }
6003 : else
6004 53020 : text_format_append_string(buf, str, flags, width);
6005 :
6006 : /* Cleanup. */
6007 58704 : pfree(str);
6008 : }
6009 :
6010 : /*
6011 : * Append str to buf, padding as directed by flags/width
6012 : */
6013 : static void
6014 59004 : text_format_append_string(StringInfo buf, const char *str,
6015 : int flags, int width)
6016 : {
6017 59004 : bool align_to_left = false;
6018 : int len;
6019 :
6020 : /* fast path for typical easy case */
6021 59004 : if (width == 0)
6022 : {
6023 58920 : appendStringInfoString(buf, str);
6024 58920 : return;
6025 : }
6026 :
6027 84 : if (width < 0)
6028 : {
6029 : /* Negative width: implicit '-' flag, then take absolute value */
6030 6 : align_to_left = true;
6031 : /* -INT_MIN is undefined */
6032 6 : if (width <= INT_MIN)
6033 0 : ereport(ERROR,
6034 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6035 : errmsg("number is out of range")));
6036 6 : width = -width;
6037 : }
6038 78 : else if (flags & TEXT_FORMAT_FLAG_MINUS)
6039 24 : align_to_left = true;
6040 :
6041 84 : len = pg_mbstrlen(str);
6042 84 : if (align_to_left)
6043 : {
6044 : /* left justify */
6045 30 : appendStringInfoString(buf, str);
6046 30 : if (len < width)
6047 30 : appendStringInfoSpaces(buf, width - len);
6048 : }
6049 : else
6050 : {
6051 : /* right justify */
6052 54 : if (len < width)
6053 54 : appendStringInfoSpaces(buf, width - len);
6054 54 : appendStringInfoString(buf, str);
6055 : }
6056 : }
6057 :
6058 : /*
6059 : * text_format_nv - nonvariadic wrapper for text_format function.
6060 : *
6061 : * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6062 : * which checks that all built-in functions that share the implementing C
6063 : * function take the same number of arguments.
6064 : */
6065 : Datum
6066 30 : text_format_nv(PG_FUNCTION_ARGS)
6067 : {
6068 30 : return text_format(fcinfo);
6069 : }
6070 :
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * len bytes of s1 and s2 are identical.  The bytes are compared from the
 * last one back to the first.  Faster than memcmp() for this use case.
 *
 * Returns true when len <= 0.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6086 :
6087 : /* Expand each Levenshtein distance variant */
6088 : #include "levenshtein.c"
6089 : #define LEVENSHTEIN_LESS_EQUAL
6090 : #include "levenshtein.c"
6091 :
6092 :
6093 : /*
6094 : * The following *ClosestMatch() functions can be used to determine whether a
6095 : * user-provided string resembles any known valid values, which is useful for
6096 : * providing hints in log messages, among other things. Use these functions
6097 : * like so:
6098 : *
6099 : * initClosestMatch(&state, source_string, max_distance);
6100 : *
6101 : * for (int i = 0; i < num_valid_strings; i++)
6102 : * updateClosestMatch(&state, valid_strings[i]);
6103 : *
6104 : * closestMatch = getClosestMatch(&state);
6105 : */
6106 :
6107 : /*
6108 : * Initialize the given state with the source string and maximum Levenshtein
6109 : * distance to consider.
6110 : */
6111 : void
6112 56 : initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
6113 : {
6114 : Assert(state);
6115 : Assert(max_d >= 0);
6116 :
6117 56 : state->source = source;
6118 56 : state->min_d = -1;
6119 56 : state->max_d = max_d;
6120 56 : state->match = NULL;
6121 56 : }
6122 :
6123 : /*
6124 : * If the candidate string is a closer match than the current one saved (or
6125 : * there is no match saved), save it as the closest match.
6126 : *
6127 : * If the source or candidate string is NULL, empty, or too long, this function
6128 : * takes no action. Likewise, if the Levenshtein distance exceeds the maximum
6129 : * allowed or more than half the characters are different, no action is taken.
6130 : */
6131 : void
6132 334 : updateClosestMatch(ClosestMatchState *state, const char *candidate)
6133 : {
6134 : int dist;
6135 :
6136 : Assert(state);
6137 :
6138 334 : if (state->source == NULL || state->source[0] == '\0' ||
6139 334 : candidate == NULL || candidate[0] == '\0')
6140 0 : return;
6141 :
6142 : /*
6143 : * To avoid ERROR-ing, we check the lengths here instead of setting
6144 : * 'trusted' to false in the call to varstr_levenshtein_less_equal().
6145 : */
6146 334 : if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
6147 334 : strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
6148 0 : return;
6149 :
6150 334 : dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
6151 334 : candidate, strlen(candidate), 1, 1, 1,
6152 : state->max_d, true);
6153 334 : if (dist <= state->max_d &&
6154 56 : dist <= strlen(state->source) / 2 &&
6155 14 : (state->min_d == -1 || dist < state->min_d))
6156 : {
6157 14 : state->min_d = dist;
6158 14 : state->match = candidate;
6159 : }
6160 : }
6161 :
6162 : /*
6163 : * Return the closest match. If no suitable candidates were provided via
6164 : * updateClosestMatch(), return NULL.
6165 : */
6166 : const char *
6167 56 : getClosestMatch(ClosestMatchState *state)
6168 : {
6169 : Assert(state);
6170 :
6171 56 : return state->match;
6172 : }
6173 :
6174 :
6175 : /*
6176 : * Unicode support
6177 : */
6178 :
6179 : static UnicodeNormalizationForm
6180 186 : unicode_norm_form_from_string(const char *formstr)
6181 : {
6182 186 : UnicodeNormalizationForm form = -1;
6183 :
6184 : /*
6185 : * Might as well check this while we're here.
6186 : */
6187 186 : if (GetDatabaseEncoding() != PG_UTF8)
6188 0 : ereport(ERROR,
6189 : (errcode(ERRCODE_SYNTAX_ERROR),
6190 : errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6191 :
6192 186 : if (pg_strcasecmp(formstr, "NFC") == 0)
6193 66 : form = UNICODE_NFC;
6194 120 : else if (pg_strcasecmp(formstr, "NFD") == 0)
6195 36 : form = UNICODE_NFD;
6196 84 : else if (pg_strcasecmp(formstr, "NFKC") == 0)
6197 36 : form = UNICODE_NFKC;
6198 48 : else if (pg_strcasecmp(formstr, "NFKD") == 0)
6199 36 : form = UNICODE_NFKD;
6200 : else
6201 12 : ereport(ERROR,
6202 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6203 : errmsg("invalid normalization form: %s", formstr)));
6204 :
6205 174 : return form;
6206 : }
6207 :
6208 : Datum
6209 48 : unicode_normalize_func(PG_FUNCTION_ARGS)
6210 : {
6211 48 : text *input = PG_GETARG_TEXT_PP(0);
6212 48 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6213 : UnicodeNormalizationForm form;
6214 : int size;
6215 : pg_wchar *input_chars;
6216 : pg_wchar *output_chars;
6217 : unsigned char *p;
6218 : text *result;
6219 : int i;
6220 :
6221 48 : form = unicode_norm_form_from_string(formstr);
6222 :
6223 : /* convert to pg_wchar */
6224 42 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6225 42 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6226 42 : p = (unsigned char *) VARDATA_ANY(input);
6227 168 : for (i = 0; i < size; i++)
6228 : {
6229 126 : input_chars[i] = utf8_to_unicode(p);
6230 126 : p += pg_utf_mblen(p);
6231 : }
6232 42 : input_chars[i] = (pg_wchar) '\0';
6233 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6234 :
6235 : /* action */
6236 42 : output_chars = unicode_normalize(form, input_chars);
6237 :
6238 : /* convert back to UTF-8 string */
6239 42 : size = 0;
6240 162 : for (pg_wchar *wp = output_chars; *wp; wp++)
6241 : {
6242 : unsigned char buf[4];
6243 :
6244 120 : unicode_to_utf8(*wp, buf);
6245 120 : size += pg_utf_mblen(buf);
6246 : }
6247 :
6248 42 : result = palloc(size + VARHDRSZ);
6249 42 : SET_VARSIZE(result, size + VARHDRSZ);
6250 :
6251 42 : p = (unsigned char *) VARDATA_ANY(result);
6252 162 : for (pg_wchar *wp = output_chars; *wp; wp++)
6253 : {
6254 120 : unicode_to_utf8(*wp, p);
6255 120 : p += pg_utf_mblen(p);
6256 : }
6257 : Assert((char *) p == (char *) result + size + VARHDRSZ);
6258 :
6259 42 : PG_RETURN_TEXT_P(result);
6260 : }
6261 :
6262 : /*
6263 : * Check whether the string is in the specified Unicode normalization form.
6264 : *
6265 : * This is done by converting the string to the specified normal form and then
6266 : * comparing that to the original string. To speed that up, we also apply the
6267 : * "quick check" algorithm specified in UAX #15, which can give a yes or no
6268 : * answer for many strings by just scanning the string once.
6269 : *
6270 : * This function should generally be optimized for the case where the string
6271 : * is in fact normalized. In that case, we'll end up looking at the entire
6272 : * string, so it's probably not worth doing any incremental conversion etc.
6273 : */
6274 : Datum
6275 138 : unicode_is_normalized(PG_FUNCTION_ARGS)
6276 : {
6277 138 : text *input = PG_GETARG_TEXT_PP(0);
6278 138 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6279 : UnicodeNormalizationForm form;
6280 : int size;
6281 : pg_wchar *input_chars;
6282 : pg_wchar *output_chars;
6283 : unsigned char *p;
6284 : int i;
6285 : UnicodeNormalizationQC quickcheck;
6286 : int output_size;
6287 : bool result;
6288 :
6289 138 : form = unicode_norm_form_from_string(formstr);
6290 :
6291 : /* convert to pg_wchar */
6292 132 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6293 132 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6294 132 : p = (unsigned char *) VARDATA_ANY(input);
6295 504 : for (i = 0; i < size; i++)
6296 : {
6297 372 : input_chars[i] = utf8_to_unicode(p);
6298 372 : p += pg_utf_mblen(p);
6299 : }
6300 132 : input_chars[i] = (pg_wchar) '\0';
6301 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6302 :
6303 : /* quick check (see UAX #15) */
6304 132 : quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6305 132 : if (quickcheck == UNICODE_NORM_QC_YES)
6306 42 : PG_RETURN_BOOL(true);
6307 90 : else if (quickcheck == UNICODE_NORM_QC_NO)
6308 12 : PG_RETURN_BOOL(false);
6309 :
6310 : /* normalize and compare with original */
6311 78 : output_chars = unicode_normalize(form, input_chars);
6312 :
6313 78 : output_size = 0;
6314 324 : for (pg_wchar *wp = output_chars; *wp; wp++)
6315 246 : output_size++;
6316 :
6317 114 : result = (size == output_size) &&
6318 36 : (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6319 :
6320 78 : PG_RETURN_BOOL(result);
6321 : }
6322 :
/*
 * Report whether each of the first n bytes of instr is a hexadecimal digit
 * according to isxdigit().  Returns true when n is zero.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;

	while (instr < stop)
	{
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
	}

	return true;
}
6335 :
6336 : static unsigned int
6337 504 : hexval(unsigned char c)
6338 : {
6339 504 : if (c >= '0' && c <= '9')
6340 384 : return c - '0';
6341 120 : if (c >= 'a' && c <= 'f')
6342 60 : return c - 'a' + 0xA;
6343 60 : if (c >= 'A' && c <= 'F')
6344 60 : return c - 'A' + 0xA;
6345 0 : elog(ERROR, "invalid hexadecimal digit");
6346 : return 0; /* not reached */
6347 : }
6348 :
/*
 * Convert the first n bytes of instr, read as hexadecimal digits with the
 * most significant digit first, into a number.  Each digit is decoded by
 * hexval().
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	const char *stop = instr + n;

	/* shift in one 4-bit digit at a time, most significant first */
	for (const char *digit = instr; digit < stop; digit++)
		value = (value << 4) | hexval((unsigned char) *digit);

	return value;
}
6362 :
6363 : /*
6364 : * Replaces Unicode escape sequences by Unicode characters
6365 : */
6366 : Datum
6367 66 : unistr(PG_FUNCTION_ARGS)
6368 : {
6369 66 : text *input_text = PG_GETARG_TEXT_PP(0);
6370 : char *instr;
6371 : int len;
6372 : StringInfoData str;
6373 : text *result;
6374 66 : pg_wchar pair_first = 0;
6375 : char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6376 :
6377 66 : instr = VARDATA_ANY(input_text);
6378 66 : len = VARSIZE_ANY_EXHDR(input_text);
6379 :
6380 66 : initStringInfo(&str);
6381 :
6382 510 : while (len > 0)
6383 : {
6384 486 : if (instr[0] == '\\')
6385 : {
6386 102 : if (len >= 2 &&
6387 102 : instr[1] == '\\')
6388 : {
6389 6 : if (pair_first)
6390 0 : goto invalid_pair;
6391 6 : appendStringInfoChar(&str, '\\');
6392 6 : instr += 2;
6393 6 : len -= 2;
6394 : }
6395 96 : else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6396 66 : (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6397 30 : {
6398 : pg_wchar unicode;
6399 42 : int offset = instr[1] == 'u' ? 2 : 1;
6400 :
6401 42 : unicode = hexval_n(instr + offset, 4);
6402 :
6403 42 : if (!is_valid_unicode_codepoint(unicode))
6404 0 : ereport(ERROR,
6405 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6406 : errmsg("invalid Unicode code point: %04X", unicode));
6407 :
6408 42 : if (pair_first)
6409 : {
6410 12 : if (is_utf16_surrogate_second(unicode))
6411 : {
6412 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6413 0 : pair_first = 0;
6414 : }
6415 : else
6416 12 : goto invalid_pair;
6417 : }
6418 30 : else if (is_utf16_surrogate_second(unicode))
6419 0 : goto invalid_pair;
6420 :
6421 30 : if (is_utf16_surrogate_first(unicode))
6422 18 : pair_first = unicode;
6423 : else
6424 : {
6425 12 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6426 12 : appendStringInfoString(&str, cbuf);
6427 : }
6428 :
6429 30 : instr += 4 + offset;
6430 30 : len -= 4 + offset;
6431 : }
6432 54 : else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6433 12 : {
6434 : pg_wchar unicode;
6435 :
6436 24 : unicode = hexval_n(instr + 2, 6);
6437 :
6438 24 : if (!is_valid_unicode_codepoint(unicode))
6439 6 : ereport(ERROR,
6440 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6441 : errmsg("invalid Unicode code point: %04X", unicode));
6442 :
6443 18 : if (pair_first)
6444 : {
6445 6 : if (is_utf16_surrogate_second(unicode))
6446 : {
6447 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6448 0 : pair_first = 0;
6449 : }
6450 : else
6451 6 : goto invalid_pair;
6452 : }
6453 12 : else if (is_utf16_surrogate_second(unicode))
6454 0 : goto invalid_pair;
6455 :
6456 12 : if (is_utf16_surrogate_first(unicode))
6457 6 : pair_first = unicode;
6458 : else
6459 : {
6460 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6461 6 : appendStringInfoString(&str, cbuf);
6462 : }
6463 :
6464 12 : instr += 8;
6465 12 : len -= 8;
6466 : }
6467 30 : else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6468 12 : {
6469 : pg_wchar unicode;
6470 :
6471 24 : unicode = hexval_n(instr + 2, 8);
6472 :
6473 24 : if (!is_valid_unicode_codepoint(unicode))
6474 6 : ereport(ERROR,
6475 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6476 : errmsg("invalid Unicode code point: %04X", unicode));
6477 :
6478 18 : if (pair_first)
6479 : {
6480 6 : if (is_utf16_surrogate_second(unicode))
6481 : {
6482 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6483 0 : pair_first = 0;
6484 : }
6485 : else
6486 6 : goto invalid_pair;
6487 : }
6488 12 : else if (is_utf16_surrogate_second(unicode))
6489 0 : goto invalid_pair;
6490 :
6491 12 : if (is_utf16_surrogate_first(unicode))
6492 6 : pair_first = unicode;
6493 : else
6494 : {
6495 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6496 6 : appendStringInfoString(&str, cbuf);
6497 : }
6498 :
6499 12 : instr += 10;
6500 12 : len -= 10;
6501 : }
6502 : else
6503 6 : ereport(ERROR,
6504 : (errcode(ERRCODE_SYNTAX_ERROR),
6505 : errmsg("invalid Unicode escape"),
6506 : errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6507 : }
6508 : else
6509 : {
6510 384 : if (pair_first)
6511 0 : goto invalid_pair;
6512 :
6513 384 : appendStringInfoChar(&str, *instr++);
6514 384 : len--;
6515 : }
6516 : }
6517 :
6518 : /* unfinished surrogate pair? */
6519 24 : if (pair_first)
6520 6 : goto invalid_pair;
6521 :
6522 18 : result = cstring_to_text_with_len(str.data, str.len);
6523 18 : pfree(str.data);
6524 :
6525 18 : PG_RETURN_TEXT_P(result);
6526 :
6527 30 : invalid_pair:
6528 30 : ereport(ERROR,
6529 : (errcode(ERRCODE_SYNTAX_ERROR),
6530 : errmsg("invalid Unicode surrogate pair")));
6531 : PG_RETURN_NULL(); /* keep compiler quiet */
6532 : }
|