Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * varlena.c
4 : * Functions for the variable-length built-in types.
5 : *
6 : * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/varlena.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include <ctype.h>
18 : #include <limits.h>
19 :
20 : #include "access/detoast.h"
21 : #include "access/toast_compression.h"
22 : #include "catalog/pg_collation.h"
23 : #include "catalog/pg_type.h"
24 : #include "common/hashfn.h"
25 : #include "common/int.h"
26 : #include "common/unicode_norm.h"
27 : #include "funcapi.h"
28 : #include "lib/hyperloglog.h"
29 : #include "libpq/pqformat.h"
30 : #include "miscadmin.h"
31 : #include "nodes/execnodes.h"
32 : #include "parser/scansup.h"
33 : #include "port/pg_bswap.h"
34 : #include "regex/regex.h"
35 : #include "utils/builtins.h"
36 : #include "utils/bytea.h"
37 : #include "utils/lsyscache.h"
38 : #include "utils/memutils.h"
39 : #include "utils/pg_locale.h"
40 : #include "utils/sortsupport.h"
41 : #include "utils/varlena.h"
42 :
43 :
/* GUC variable: selects which textual format byteaout() emits */
int			bytea_output = BYTEA_OUTPUT_HEX;

/* Both names are plain aliases for the generic struct varlena */
typedef struct varlena unknown;
typedef struct varlena VarString;
49 :
/*
 * State for text_position_* functions.
 *
 * Holds the haystack/needle pair, the Boyer-Moore-Horspool skip table, and
 * the bookkeeping that is carried from one match to the next.
 */
typedef struct
{
	bool		is_multibyte_char_in_char;	/* need to check char boundaries? */

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
77 :
/*
 * Working state for the varstr/bpchar/name sort-support comparators and the
 * abbreviated-key routines declared further down in this file.
 */
typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;		/* NOTE(review): presumably allocated size of
								 * buf1 -- confirm */
	int			buflen2;		/* NOTE(review): presumably allocated size of
								 * buf2 -- confirm */
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;		/* NOTE(review): presumably true when the "C"
								 * collation applies -- confirm */
	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
	pg_locale_t locale;			/* NOTE(review): presumably the locale used
								 * for non-C comparisons -- confirm */
} VarStringSortSupport;
96 :
/*
 * Output data for split_text(): we output either to an array or a table.
 * tupstore and tupdesc must be set up in advance to output to a table.
 */
typedef struct
{
	ArrayBuildState *astate;	/* accumulator used for array output */
	Tuplestorestate *tupstore;	/* destination used for table output */
	TupleDesc	tupdesc;		/* row descriptor that goes with tupstore */
} SplitTextOutputData;
107 :
/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024

/* Detoasting accessors and fmgr argument/return helpers for "unknown" */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/* Detoasting accessors for VarString; the PP form uses the _PACKED variant */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
122 :
/*
 * Forward declarations for the file-local helpers.
 */

/* sort support: comparators and abbreviated-key routines */
static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);

/* text length, concatenation, substring and overlay */
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
							int32 start,
							int32 length,
							bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);

/* substring search (see TextPositionState) */
static int	text_position(text *t1, text *t2, Oid collid);
static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
static bool text_position_next(TextPositionState *state);
static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
static char *text_position_get_match_ptr(TextPositionState *state);
static int	text_position_get_match_pos(TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);

/* collation checking and comparison */
static void check_collation_set(Oid collid);
static int	text_cmp(text *arg1, text *arg2, Oid collid);

/* bytea counterparts of the text helpers */
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
							  int S,
							  int L,
							  bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);

/* string building, splitting and aggregation */
static void appendStringInfoText(StringInfo str, const text *t);
static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
static void split_text_accum_result(SplitTextOutputData *tstate,
									text *field_value,
									text *null_string,
									Oid collation);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
									const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);

/* format() support */
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
									 int *value);
static const char *text_format_parse_format(const char *start_ptr,
											const char *end_ptr,
											int *argpos, int *widthpos,
											int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
										  FmgrInfo *typOutputInfo,
										  Datum value, bool isNull,
										  int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
									  int flags, int width);
174 :
175 :
176 : /*****************************************************************************
177 : * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
178 : *****************************************************************************/
179 :
180 : /*
181 : * cstring_to_text
182 : *
183 : * Create a text value from a null-terminated C string.
184 : *
185 : * The new text value is freshly palloc'd with a full-size VARHDR.
186 : */
187 : text *
188 21610340 : cstring_to_text(const char *s)
189 : {
190 21610340 : return cstring_to_text_with_len(s, strlen(s));
191 : }
192 :
193 : /*
194 : * cstring_to_text_with_len
195 : *
196 : * Same as cstring_to_text except the caller specifies the string length;
197 : * the string need not be null_terminated.
198 : */
199 : text *
200 26124082 : cstring_to_text_with_len(const char *s, int len)
201 : {
202 26124082 : text *result = (text *) palloc(len + VARHDRSZ);
203 :
204 26124082 : SET_VARSIZE(result, len + VARHDRSZ);
205 26124082 : memcpy(VARDATA(result), s, len);
206 :
207 26124082 : return result;
208 : }
209 :
210 : /*
211 : * text_to_cstring
212 : *
213 : * Create a palloc'd, null-terminated C string from a text value.
214 : *
215 : * We support being passed a compressed or toasted text value.
216 : * This is a bit bogus since such values shouldn't really be referred to as
217 : * "text *", but it seems useful for robustness. If we didn't handle that
218 : * case here, we'd need another routine that did, anyway.
219 : */
220 : char *
221 13649410 : text_to_cstring(const text *t)
222 : {
223 : /* must cast away the const, unfortunately */
224 13649410 : text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
225 13649410 : int len = VARSIZE_ANY_EXHDR(tunpacked);
226 : char *result;
227 :
228 13649410 : result = (char *) palloc(len + 1);
229 13649410 : memcpy(result, VARDATA_ANY(tunpacked), len);
230 13649410 : result[len] = '\0';
231 :
232 13649410 : if (tunpacked != t)
233 119696 : pfree(tunpacked);
234 :
235 13649410 : return result;
236 : }
237 :
238 : /*
239 : * text_to_cstring_buffer
240 : *
241 : * Copy a text value into a caller-supplied buffer of size dst_len.
242 : *
243 : * The text string is truncated if necessary to fit. The result is
244 : * guaranteed null-terminated (unless dst_len == 0).
245 : *
246 : * We support being passed a compressed or toasted text value.
247 : * This is a bit bogus since such values shouldn't really be referred to as
248 : * "text *", but it seems useful for robustness. If we didn't handle that
249 : * case here, we'd need another routine that did, anyway.
250 : */
251 : void
252 622 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
253 : {
254 : /* must cast away the const, unfortunately */
255 622 : text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
256 622 : size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
257 :
258 622 : if (dst_len > 0)
259 : {
260 622 : dst_len--;
261 622 : if (dst_len >= src_len)
262 622 : dst_len = src_len;
263 : else /* ensure truncation is encoding-safe */
264 0 : dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
265 622 : memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
266 622 : dst[dst_len] = '\0';
267 : }
268 :
269 622 : if (srcunpacked != src)
270 0 : pfree(srcunpacked);
271 622 : }
272 :
273 :
274 : /*****************************************************************************
275 : * USER I/O ROUTINES *
276 : *****************************************************************************/
277 :
278 :
/* VAL: numeric value of an ASCII digit character */
#define VAL(CH)			((CH) - '0')
/* DIG: ASCII digit character for a small numeric value */
#define DIG(VAL)		((VAL) + '0')
281 :
282 : /*
283 : * byteain - converts from printable representation of byte array
284 : *
285 : * Non-printable characters must be passed as '\nnn' (octal) and are
286 : * converted to internal form. '\' must be passed as '\\'.
287 : * ereport(ERROR, ...) if bad form.
288 : *
289 : * BUGS:
290 : * The input is scanned twice.
291 : * The error checking of input is minimal.
292 : */
293 : Datum
294 19118 : byteain(PG_FUNCTION_ARGS)
295 : {
296 19118 : char *inputText = PG_GETARG_CSTRING(0);
297 : char *tp;
298 : char *rp;
299 : int bc;
300 : bytea *result;
301 :
302 : /* Recognize hex input */
303 19118 : if (inputText[0] == '\\' && inputText[1] == 'x')
304 : {
305 792 : size_t len = strlen(inputText);
306 :
307 792 : bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
308 792 : result = palloc(bc);
309 792 : bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
310 780 : SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
311 :
312 780 : PG_RETURN_BYTEA_P(result);
313 : }
314 :
315 : /* Else, it's the traditional escaped style */
316 298968 : for (bc = 0, tp = inputText; *tp != '\0'; bc++)
317 : {
318 280648 : if (tp[0] != '\\')
319 279640 : tp++;
320 1008 : else if ((tp[0] == '\\') &&
321 1008 : (tp[1] >= '0' && tp[1] <= '3') &&
322 1002 : (tp[2] >= '0' && tp[2] <= '7') &&
323 1002 : (tp[3] >= '0' && tp[3] <= '7'))
324 1002 : tp += 4;
325 6 : else if ((tp[0] == '\\') &&
326 6 : (tp[1] == '\\'))
327 0 : tp += 2;
328 : else
329 : {
330 : /*
331 : * one backslash, not followed by another or ### valid octal
332 : */
333 6 : ereport(ERROR,
334 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
335 : errmsg("invalid input syntax for type %s", "bytea")));
336 : }
337 : }
338 :
339 18320 : bc += VARHDRSZ;
340 :
341 18320 : result = (bytea *) palloc(bc);
342 18320 : SET_VARSIZE(result, bc);
343 :
344 18320 : tp = inputText;
345 18320 : rp = VARDATA(result);
346 298950 : while (*tp != '\0')
347 : {
348 280630 : if (tp[0] != '\\')
349 279628 : *rp++ = *tp++;
350 1002 : else if ((tp[0] == '\\') &&
351 1002 : (tp[1] >= '0' && tp[1] <= '3') &&
352 1002 : (tp[2] >= '0' && tp[2] <= '7') &&
353 1002 : (tp[3] >= '0' && tp[3] <= '7'))
354 : {
355 1002 : bc = VAL(tp[1]);
356 1002 : bc <<= 3;
357 1002 : bc += VAL(tp[2]);
358 1002 : bc <<= 3;
359 1002 : *rp++ = bc + VAL(tp[3]);
360 :
361 1002 : tp += 4;
362 : }
363 0 : else if ((tp[0] == '\\') &&
364 0 : (tp[1] == '\\'))
365 : {
366 0 : *rp++ = '\\';
367 0 : tp += 2;
368 : }
369 : else
370 : {
371 : /*
372 : * We should never get here. The first pass should not allow it.
373 : */
374 0 : ereport(ERROR,
375 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
376 : errmsg("invalid input syntax for type %s", "bytea")));
377 : }
378 : }
379 :
380 18320 : PG_RETURN_BYTEA_P(result);
381 : }
382 :
383 : /*
384 : * byteaout - converts to printable representation of byte array
385 : *
386 : * In the traditional escaped format, non-printable characters are
387 : * printed as '\nnn' (octal) and '\' as '\\'.
388 : */
389 : Datum
390 13360 : byteaout(PG_FUNCTION_ARGS)
391 : {
392 13360 : bytea *vlena = PG_GETARG_BYTEA_PP(0);
393 : char *result;
394 : char *rp;
395 :
396 13360 : if (bytea_output == BYTEA_OUTPUT_HEX)
397 : {
398 : /* Print hex format */
399 13096 : rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
400 13096 : *rp++ = '\\';
401 13096 : *rp++ = 'x';
402 13096 : rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
403 : }
404 264 : else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
405 : {
406 : /* Print traditional escaped format */
407 : char *vp;
408 : uint64 len;
409 : int i;
410 :
411 264 : len = 1; /* empty string has 1 char */
412 264 : vp = VARDATA_ANY(vlena);
413 2608 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
414 : {
415 2344 : if (*vp == '\\')
416 0 : len += 2;
417 2344 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
418 498 : len += 4;
419 : else
420 1846 : len++;
421 : }
422 :
423 : /*
424 : * In principle len can't overflow uint32 if the input fit in 1GB, but
425 : * for safety let's check rather than relying on palloc's internal
426 : * check.
427 : */
428 264 : if (len > MaxAllocSize)
429 0 : ereport(ERROR,
430 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
431 : errmsg_internal("result of bytea output conversion is too large")));
432 264 : rp = result = (char *) palloc(len);
433 :
434 264 : vp = VARDATA_ANY(vlena);
435 2608 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
436 : {
437 2344 : if (*vp == '\\')
438 : {
439 0 : *rp++ = '\\';
440 0 : *rp++ = '\\';
441 : }
442 2344 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
443 498 : {
444 : int val; /* holds unprintable chars */
445 :
446 498 : val = *vp;
447 498 : rp[0] = '\\';
448 498 : rp[3] = DIG(val & 07);
449 498 : val >>= 3;
450 498 : rp[2] = DIG(val & 07);
451 498 : val >>= 3;
452 498 : rp[1] = DIG(val & 03);
453 498 : rp += 4;
454 : }
455 : else
456 1846 : *rp++ = *vp;
457 : }
458 : }
459 : else
460 : {
461 0 : elog(ERROR, "unrecognized bytea_output setting: %d",
462 : bytea_output);
463 : rp = result = NULL; /* keep compiler quiet */
464 : }
465 13360 : *rp = '\0';
466 13360 : PG_RETURN_CSTRING(result);
467 : }
468 :
469 : /*
470 : * bytearecv - converts external binary format to bytea
471 : */
472 : Datum
473 1038 : bytearecv(PG_FUNCTION_ARGS)
474 : {
475 1038 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
476 : bytea *result;
477 : int nbytes;
478 :
479 1038 : nbytes = buf->len - buf->cursor;
480 1038 : result = (bytea *) palloc(nbytes + VARHDRSZ);
481 1038 : SET_VARSIZE(result, nbytes + VARHDRSZ);
482 1038 : pq_copymsgbytes(buf, VARDATA(result), nbytes);
483 1038 : PG_RETURN_BYTEA_P(result);
484 : }
485 :
486 : /*
487 : * byteasend - converts bytea to binary format
488 : *
489 : * This is a special case: just copy the input...
490 : */
491 : Datum
492 5516 : byteasend(PG_FUNCTION_ARGS)
493 : {
494 5516 : bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
495 :
496 5516 : PG_RETURN_BYTEA_P(vlena);
497 : }
498 :
499 : Datum
500 32774 : bytea_string_agg_transfn(PG_FUNCTION_ARGS)
501 : {
502 : StringInfo state;
503 :
504 32774 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
505 :
506 : /* Append the value unless null. */
507 32774 : if (!PG_ARGISNULL(1))
508 : {
509 32774 : bytea *value = PG_GETARG_BYTEA_PP(1);
510 :
511 : /* On the first time through, we ignore the delimiter. */
512 32774 : if (state == NULL)
513 26 : state = makeStringAggState(fcinfo);
514 32748 : else if (!PG_ARGISNULL(2))
515 : {
516 32742 : bytea *delim = PG_GETARG_BYTEA_PP(2);
517 :
518 32742 : appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
519 : }
520 :
521 32774 : appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
522 : }
523 :
524 : /*
525 : * The transition type for string_agg() is declared to be "internal",
526 : * which is a pass-by-value type the same size as a pointer.
527 : */
528 32774 : PG_RETURN_POINTER(state);
529 : }
530 :
531 : Datum
532 32 : bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
533 : {
534 : StringInfo state;
535 :
536 : /* cannot be called directly because of internal-type argument */
537 : Assert(AggCheckCallContext(fcinfo, NULL));
538 :
539 32 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
540 :
541 32 : if (state != NULL)
542 : {
543 : bytea *result;
544 :
545 26 : result = (bytea *) palloc(state->len + VARHDRSZ);
546 26 : SET_VARSIZE(result, state->len + VARHDRSZ);
547 26 : memcpy(VARDATA(result), state->data, state->len);
548 26 : PG_RETURN_BYTEA_P(result);
549 : }
550 : else
551 6 : PG_RETURN_NULL();
552 : }
553 :
554 : /*
555 : * textin - converts "..." to internal representation
556 : */
557 : Datum
558 17761384 : textin(PG_FUNCTION_ARGS)
559 : {
560 17761384 : char *inputText = PG_GETARG_CSTRING(0);
561 :
562 17761384 : PG_RETURN_TEXT_P(cstring_to_text(inputText));
563 : }
564 :
565 : /*
566 : * textout - converts internal representation to "..."
567 : */
568 : Datum
569 8036942 : textout(PG_FUNCTION_ARGS)
570 : {
571 8036942 : Datum txt = PG_GETARG_DATUM(0);
572 :
573 8036942 : PG_RETURN_CSTRING(TextDatumGetCString(txt));
574 : }
575 :
576 : /*
577 : * textrecv - converts external binary format to text
578 : */
579 : Datum
580 53372 : textrecv(PG_FUNCTION_ARGS)
581 : {
582 53372 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
583 : text *result;
584 : char *str;
585 : int nbytes;
586 :
587 53372 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
588 :
589 53372 : result = cstring_to_text_with_len(str, nbytes);
590 53372 : pfree(str);
591 53372 : PG_RETURN_TEXT_P(result);
592 : }
593 :
594 : /*
595 : * textsend - converts text to binary format
596 : */
597 : Datum
598 36540 : textsend(PG_FUNCTION_ARGS)
599 : {
600 36540 : text *t = PG_GETARG_TEXT_PP(0);
601 : StringInfoData buf;
602 :
603 36540 : pq_begintypsend(&buf);
604 36540 : pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
605 36540 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
606 : }
607 :
608 :
609 : /*
610 : * unknownin - converts "..." to internal representation
611 : */
612 : Datum
613 0 : unknownin(PG_FUNCTION_ARGS)
614 : {
615 0 : char *str = PG_GETARG_CSTRING(0);
616 :
617 : /* representation is same as cstring */
618 0 : PG_RETURN_CSTRING(pstrdup(str));
619 : }
620 :
621 : /*
622 : * unknownout - converts internal representation to "..."
623 : */
624 : Datum
625 678 : unknownout(PG_FUNCTION_ARGS)
626 : {
627 : /* representation is same as cstring */
628 678 : char *str = PG_GETARG_CSTRING(0);
629 :
630 678 : PG_RETURN_CSTRING(pstrdup(str));
631 : }
632 :
633 : /*
634 : * unknownrecv - converts external binary format to unknown
635 : */
636 : Datum
637 0 : unknownrecv(PG_FUNCTION_ARGS)
638 : {
639 0 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
640 : char *str;
641 : int nbytes;
642 :
643 0 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
644 : /* representation is same as cstring */
645 0 : PG_RETURN_CSTRING(str);
646 : }
647 :
648 : /*
649 : * unknownsend - converts unknown to binary format
650 : */
651 : Datum
652 0 : unknownsend(PG_FUNCTION_ARGS)
653 : {
654 : /* representation is same as cstring */
655 0 : char *str = PG_GETARG_CSTRING(0);
656 : StringInfoData buf;
657 :
658 0 : pq_begintypsend(&buf);
659 0 : pq_sendtext(&buf, str, strlen(str));
660 0 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
661 : }
662 :
663 :
664 : /* ========== PUBLIC ROUTINES ========== */
665 :
666 : /*
667 : * textlen -
668 : * returns the logical length of a text*
669 : * (which is less than the VARSIZE of the text*)
670 : */
671 : Datum
672 430414 : textlen(PG_FUNCTION_ARGS)
673 : {
674 430414 : Datum str = PG_GETARG_DATUM(0);
675 :
676 : /* try to avoid decompressing argument */
677 430414 : PG_RETURN_INT32(text_length(str));
678 : }
679 :
680 : /*
681 : * text_length -
682 : * Does the real work for textlen()
683 : *
684 : * This is broken out so it can be called directly by other string processing
685 : * functions. Note that the argument is passed as a Datum, to indicate that
686 : * it may still be in compressed form. We can avoid decompressing it at all
687 : * in some cases.
688 : */
689 : static int32
690 430426 : text_length(Datum str)
691 : {
692 : /* fastpath when max encoding length is one */
693 430426 : if (pg_database_encoding_max_length() == 1)
694 32 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
695 : else
696 : {
697 430394 : text *t = DatumGetTextPP(str);
698 :
699 430394 : PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
700 : VARSIZE_ANY_EXHDR(t)));
701 : }
702 : }
703 :
704 : /*
705 : * textoctetlen -
706 : * returns the physical length of a text*
707 : * (which is less than the VARSIZE of the text*)
708 : */
709 : Datum
710 70 : textoctetlen(PG_FUNCTION_ARGS)
711 : {
712 70 : Datum str = PG_GETARG_DATUM(0);
713 :
714 : /* We need not detoast the input at all */
715 70 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
716 : }
717 :
718 : /*
719 : * textcat -
720 : * takes two text* and returns a text* that is the concatenation of
721 : * the two.
722 : *
723 : * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
724 : * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
725 : * Allocate space for output in all cases.
726 : * XXX - thomas 1997-07-10
727 : */
728 : Datum
729 2571020 : textcat(PG_FUNCTION_ARGS)
730 : {
731 2571020 : text *t1 = PG_GETARG_TEXT_PP(0);
732 2571020 : text *t2 = PG_GETARG_TEXT_PP(1);
733 :
734 2571020 : PG_RETURN_TEXT_P(text_catenate(t1, t2));
735 : }
736 :
737 : /*
738 : * text_catenate
739 : * Guts of textcat(), broken out so it can be used by other functions
740 : *
741 : * Arguments can be in short-header form, but not compressed or out-of-line
742 : */
743 : static text *
744 2571100 : text_catenate(text *t1, text *t2)
745 : {
746 : text *result;
747 : int len1,
748 : len2,
749 : len;
750 : char *ptr;
751 :
752 2571100 : len1 = VARSIZE_ANY_EXHDR(t1);
753 2571100 : len2 = VARSIZE_ANY_EXHDR(t2);
754 :
755 : /* paranoia ... probably should throw error instead? */
756 2571100 : if (len1 < 0)
757 0 : len1 = 0;
758 2571100 : if (len2 < 0)
759 0 : len2 = 0;
760 :
761 2571100 : len = len1 + len2 + VARHDRSZ;
762 2571100 : result = (text *) palloc(len);
763 :
764 : /* Set size of result string... */
765 2571100 : SET_VARSIZE(result, len);
766 :
767 : /* Fill data field of result string... */
768 2571100 : ptr = VARDATA(result);
769 2571100 : if (len1 > 0)
770 2567892 : memcpy(ptr, VARDATA_ANY(t1), len1);
771 2571100 : if (len2 > 0)
772 2570878 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
773 :
774 2571100 : return result;
775 : }
776 :
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller must make sure there really are n characters at p; the string
 * need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* single-byte encodings: one byte per character, nothing to scan */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* otherwise step over the characters one at a time */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}
802 :
803 : /*
804 : * text_substr()
805 : * Return a substring starting at the specified position.
806 : * - thomas 1997-12-31
807 : *
808 : * Input:
809 : * - string
810 : * - starting position (is one-based)
811 : * - string length
812 : *
813 : * If the starting position is zero or less, then return from the start of the string
814 : * adjusting the length to be consistent with the "negative start" per SQL.
815 : * If the length is less than zero, return the remaining string.
816 : *
817 : * Added multibyte support.
818 : * - Tatsuo Ishii 1998-4-21
819 : * Changed behavior if starting position is less than one to conform to SQL behavior.
820 : * Formerly returned the entire string; now returns a portion.
821 : * - Thomas Lockhart 1998-12-10
822 : * Now uses faster TOAST-slicing interface
823 : * - John Gray 2002-02-22
824 : * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
825 : * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
826 : * error; if E < 1, return '', not entire string). Fixed MB related bug when
827 : * S > LC and < LC + 4 sometimes garbage characters are returned.
828 : * - Joe Conway 2002-08-10
829 : */
830 : Datum
831 474808 : text_substr(PG_FUNCTION_ARGS)
832 : {
833 474808 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
834 : PG_GETARG_INT32(1),
835 : PG_GETARG_INT32(2),
836 : false));
837 : }
838 :
839 : /*
840 : * text_substr_no_len -
841 : * Wrapper to avoid opr_sanity failure due to
842 : * one function accepting a different number of args.
843 : */
844 : Datum
845 72 : text_substr_no_len(PG_FUNCTION_ARGS)
846 : {
847 72 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
848 : PG_GETARG_INT32(1),
849 : -1, true));
850 : }
851 :
/*
 * text_substring -
 *	Does the real work for text_substr() and text_substr_no_len()
 *
 *	This is broken out so it can be called directly by other string processing
 *	functions.  Note that the argument is passed as a Datum, to indicate that
 *	it may still be in compressed/toasted form.  We can avoid detoasting all
 *	of it in some cases.
 *
 *	Arguments:
 *	  str					- source value, possibly compressed or external
 *	  start					- one-based start position; zero or negative
 *							  values are allowed
 *	  length				- requested substring length; only consulted
 *							  when length_not_specified is false
 *	  length_not_specified	- true means "run to the end of the string"
 *
 *	Errors: a negative length raises ERRCODE_SUBSTRING_ERROR; an encoding
 *	max length below 1 is reported with elog(ERROR).
 *
 *	The result is always a freshly palloc'd datum.
 */
static text *
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
{
	int32		eml = pg_database_encoding_max_length();
	int32		S = start;		/* start position */
	int32		S1;				/* adjusted start position */
	int32		L1;				/* adjusted substring length */
	int32		E;				/* end position */

	/*
	 * SQL99 says S can be zero or negative, but we still must fetch from the
	 * start of the string.
	 */
	S1 = Max(S, 1);

	/* life is easy if the encoding max length is 1 */
	if (eml == 1)
	{
		/* characters and bytes coincide here, so L1 is a byte count too */
		if (length_not_specified)	/* special case - get length to end of
									 * string */
			L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			L1 = -1;			/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			L1 = -1;
		}
		else
		{
			/*
			 * A zero or negative value for the end position can happen if the
			 * start was negative or one. SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/* length measured from the clamped start position */
			L1 = E - S1;
		}

		/*
		 * If the start position is past the end of the string, SQL99 says to
		 * return a zero-length string -- DatumGetTextPSlice() will do that
		 * for us.  We need only convert S1 to zero-based starting position.
		 */
		return DatumGetTextPSlice(str, S1 - 1, L1);
	}
	else if (eml > 1)
	{
		/*
		 * When encoding max length is > 1, we can't get LC without
		 * detoasting, so we'll grab a conservatively large slice now and go
		 * back later to do the right thing
		 */
		int32		slice_start;
		int32		slice_size;
		int32		slice_strlen;
		text	   *slice;
		int32		E1;
		int32		i;
		char	   *p;
		char	   *s;
		text	   *ret;

		/*
		 * We need to start at position zero because there is no way to know
		 * in advance which byte offset corresponds to the supplied start
		 * position.
		 */
		slice_start = 0;

		/* a slice_size of -1 asks for everything up to the end */
		if (length_not_specified)	/* special case - get length to end of
									 * string */
			slice_size = L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			slice_size = L1 = -1;	/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			slice_size = L1 = -1;
		}
		else
		{
			/*
			 * A zero or negative value for the end position can happen if the
			 * start was negative or one. SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/*
			 * if E is past the end of the string, the tuple toaster will
			 * truncate the length for us
			 */
			L1 = E - S1;

			/*
			 * Total slice size in bytes can't be any longer than the start
			 * position plus substring length times the encoding max length.
			 * If that overflows, we can just use -1.
			 */
			if (pg_mul_s32_overflow(E, eml, &slice_size))
				slice_size = -1;
		}

		/*
		 * If we're working with an untoasted source, no need to do an extra
		 * copying step.
		 */
		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
			slice = DatumGetTextPSlice(str, slice_start, slice_size);
		else
			slice = (text *) DatumGetPointer(str);

		/* see if we got back an empty string */
		if (VARSIZE_ANY_EXHDR(slice) == 0)
		{
			/* free the slice only if it is a separate copy of the input */
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/* Now we can get the actual length of the slice in MB characters */
		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
											VARSIZE_ANY_EXHDR(slice));

		/*
		 * Check that the start position wasn't > slice_strlen. If so, SQL99
		 * says to return a zero-length string.
		 */
		if (S1 > slice_strlen)
		{
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/*
		 * Adjust L1 and E1 now that we know the slice string length. Again
		 * remember that S1 is one based, and slice_start is zero based.
		 */
		if (L1 > -1)
			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
		else
			E1 = slice_start + 1 + slice_strlen;

		/*
		 * Find the start position in the slice; remember S1 is not zero based
		 */
		p = VARDATA_ANY(slice);
		for (i = 0; i < S1 - 1; i++)
			p += pg_mblen(p);

		/* hang onto a pointer to our start position */
		s = p;

		/*
		 * Count the actual bytes used by the substring of the requested
		 * length.
		 */
		for (i = S1; i < E1; i++)
			p += pg_mblen(p);

		/* copy the bytes between s and p into a new text value */
		ret = (text *) palloc(VARHDRSZ + (p - s));
		SET_VARSIZE(ret, VARHDRSZ + (p - s));
		memcpy(VARDATA(ret), s, (p - s));

		if (slice != (text *) DatumGetPointer(str))
			pfree(slice);

		return ret;
	}
	else
		elog(ERROR, "invalid backend encoding: encoding max length < 1");

	/* not reached: suppress compiler warning */
	return NULL;
}
1062 :
1063 : /*
1064 : * textoverlay
1065 : * Replace specified substring of first string with second
1066 : *
1067 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1068 : * This code is a direct implementation of what the standard says.
1069 : */
1070 : Datum
1071 28 : textoverlay(PG_FUNCTION_ARGS)
1072 : {
1073 28 : text *t1 = PG_GETARG_TEXT_PP(0);
1074 28 : text *t2 = PG_GETARG_TEXT_PP(1);
1075 28 : int sp = PG_GETARG_INT32(2); /* substring start position */
1076 28 : int sl = PG_GETARG_INT32(3); /* substring length */
1077 :
1078 28 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1079 : }
1080 :
1081 : Datum
1082 12 : textoverlay_no_len(PG_FUNCTION_ARGS)
1083 : {
1084 12 : text *t1 = PG_GETARG_TEXT_PP(0);
1085 12 : text *t2 = PG_GETARG_TEXT_PP(1);
1086 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
1087 : int sl;
1088 :
1089 12 : sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1090 12 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1091 : }
1092 :
1093 : static text *
1094 40 : text_overlay(text *t1, text *t2, int sp, int sl)
1095 : {
1096 : text *result;
1097 : text *s1;
1098 : text *s2;
1099 : int sp_pl_sl;
1100 :
1101 : /*
1102 : * Check for possible integer-overflow cases. For negative sp, throw a
1103 : * "substring length" error because that's what should be expected
1104 : * according to the spec's definition of OVERLAY().
1105 : */
1106 40 : if (sp <= 0)
1107 0 : ereport(ERROR,
1108 : (errcode(ERRCODE_SUBSTRING_ERROR),
1109 : errmsg("negative substring length not allowed")));
1110 40 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1111 0 : ereport(ERROR,
1112 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1113 : errmsg("integer out of range")));
1114 :
1115 40 : s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1116 40 : s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1117 40 : result = text_catenate(s1, t2);
1118 40 : result = text_catenate(result, s2);
1119 :
1120 40 : return result;
1121 : }
1122 :
1123 : /*
1124 : * textpos -
1125 : * Return the position of the specified substring.
1126 : * Implements the SQL POSITION() function.
1127 : * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1128 : * - thomas 1997-07-27
1129 : */
1130 : Datum
1131 106 : textpos(PG_FUNCTION_ARGS)
1132 : {
1133 106 : text *str = PG_GETARG_TEXT_PP(0);
1134 106 : text *search_str = PG_GETARG_TEXT_PP(1);
1135 :
1136 106 : PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1137 : }
1138 :
1139 : /*
1140 : * text_position -
1141 : * Does the real work for textpos()
1142 : *
1143 : * Inputs:
1144 : * t1 - string to be searched
1145 : * t2 - pattern to match within t1
1146 : * Result:
1147 : * Character index of the first matched char, starting from 1,
1148 : * or 0 if no match.
1149 : *
1150 : * This is broken out so it can be called directly by other string processing
1151 : * functions.
1152 : */
1153 : static int
1154 106 : text_position(text *t1, text *t2, Oid collid)
1155 : {
1156 : TextPositionState state;
1157 : int result;
1158 :
1159 : /* Empty needle always matches at position 1 */
1160 106 : if (VARSIZE_ANY_EXHDR(t2) < 1)
1161 12 : return 1;
1162 :
1163 : /* Otherwise, can't match if haystack is shorter than needle */
1164 94 : if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1165 22 : return 0;
1166 :
1167 72 : text_position_setup(t1, t2, collid, &state);
1168 72 : if (!text_position_next(&state))
1169 24 : result = 0;
1170 : else
1171 48 : result = text_position_get_match_pos(&state);
1172 72 : text_position_cleanup(&state);
1173 72 : return result;
1174 : }
1175 :
1176 :
1177 : /*
1178 : * text_position_setup, text_position_next, text_position_cleanup -
1179 : * Component steps of text_position()
1180 : *
1181 : * These are broken out so that a string can be efficiently searched for
1182 : * multiple occurrences of the same pattern. text_position_next may be
1183 : * called multiple times, and it advances to the next match on each call.
1184 : * text_position_get_match_ptr() and text_position_get_match_pos() return
1185 : * a pointer or 1-based character position of the last match, respectively.
1186 : *
1187 : * The "state" variable is normally just a local variable in the caller.
1188 : *
1189 : * NOTE: text_position_next skips over the matched portion. For example,
1190 : * searching for "xx" in "xxx" returns only one match, not two.
1191 : */
1192 :
1193 : static void
1194 2422 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1195 : {
1196 2422 : int len1 = VARSIZE_ANY_EXHDR(t1);
1197 2422 : int len2 = VARSIZE_ANY_EXHDR(t2);
1198 2422 : pg_locale_t mylocale = 0;
1199 :
1200 2422 : check_collation_set(collid);
1201 :
1202 2422 : if (!lc_collate_is_c(collid))
1203 260 : mylocale = pg_newlocale_from_collation(collid);
1204 :
1205 2422 : if (mylocale && !mylocale->deterministic)
1206 0 : ereport(ERROR,
1207 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1208 : errmsg("nondeterministic collations are not supported for substring searches")));
1209 :
1210 : Assert(len1 > 0);
1211 : Assert(len2 > 0);
1212 :
1213 : /*
1214 : * Even with a multi-byte encoding, we perform the search using the raw
1215 : * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1216 : * because in UTF-8 the byte sequence of one character cannot contain
1217 : * another character. For other multi-byte encodings, we do the search
1218 : * initially as a simple byte search, ignoring multibyte issues, but
1219 : * verify afterwards that the match we found is at a character boundary,
1220 : * and continue the search if it was a false match.
1221 : */
1222 2422 : if (pg_database_encoding_max_length() == 1)
1223 48 : state->is_multibyte_char_in_char = false;
1224 2374 : else if (GetDatabaseEncoding() == PG_UTF8)
1225 2374 : state->is_multibyte_char_in_char = false;
1226 : else
1227 0 : state->is_multibyte_char_in_char = true;
1228 :
1229 2422 : state->str1 = VARDATA_ANY(t1);
1230 2422 : state->str2 = VARDATA_ANY(t2);
1231 2422 : state->len1 = len1;
1232 2422 : state->len2 = len2;
1233 2422 : state->last_match = NULL;
1234 2422 : state->refpoint = state->str1;
1235 2422 : state->refpos = 0;
1236 :
1237 : /*
1238 : * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1239 : * notes we use the terminology that the "haystack" is the string to be
1240 : * searched (t1) and the "needle" is the pattern being sought (t2).
1241 : *
1242 : * If the needle is empty or bigger than the haystack then there is no
1243 : * point in wasting cycles initializing the table. We also choose not to
1244 : * use B-M-H for needles of length 1, since the skip table can't possibly
1245 : * save anything in that case.
1246 : */
1247 2422 : if (len1 >= len2 && len2 > 1)
1248 : {
1249 2222 : int searchlength = len1 - len2;
1250 : int skiptablemask;
1251 : int last;
1252 : int i;
1253 2222 : const char *str2 = state->str2;
1254 :
1255 : /*
1256 : * First we must determine how much of the skip table to use. The
1257 : * declaration of TextPositionState allows up to 256 elements, but for
1258 : * short search problems we don't really want to have to initialize so
1259 : * many elements --- it would take too long in comparison to the
1260 : * actual search time. So we choose a useful skip table size based on
1261 : * the haystack length minus the needle length. The closer the needle
1262 : * length is to the haystack length the less useful skipping becomes.
1263 : *
1264 : * Note: since we use bit-masking to select table elements, the skip
1265 : * table size MUST be a power of 2, and so the mask must be 2^N-1.
1266 : */
1267 2222 : if (searchlength < 16)
1268 54 : skiptablemask = 3;
1269 2168 : else if (searchlength < 64)
1270 16 : skiptablemask = 7;
1271 2152 : else if (searchlength < 128)
1272 2 : skiptablemask = 15;
1273 2150 : else if (searchlength < 512)
1274 146 : skiptablemask = 31;
1275 2004 : else if (searchlength < 2048)
1276 1898 : skiptablemask = 63;
1277 106 : else if (searchlength < 4096)
1278 42 : skiptablemask = 127;
1279 : else
1280 64 : skiptablemask = 255;
1281 2222 : state->skiptablemask = skiptablemask;
1282 :
1283 : /*
1284 : * Initialize the skip table. We set all elements to the needle
1285 : * length, since this is the correct skip distance for any character
1286 : * not found in the needle.
1287 : */
1288 150502 : for (i = 0; i <= skiptablemask; i++)
1289 148280 : state->skiptable[i] = len2;
1290 :
1291 : /*
1292 : * Now examine the needle. For each character except the last one,
1293 : * set the corresponding table element to the appropriate skip
1294 : * distance. Note that when two characters share the same skip table
1295 : * entry, the one later in the needle must determine the skip
1296 : * distance.
1297 : */
1298 2222 : last = len2 - 1;
1299 :
1300 27386 : for (i = 0; i < last; i++)
1301 25164 : state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1302 : }
1303 2422 : }
1304 :
1305 : /*
1306 : * Advance to the next match, starting from the end of the previous match
1307 : * (or the beginning of the string, on first call). Returns true if a match
1308 : * is found.
1309 : *
1310 : * Note that this refuses to match an empty-string needle. Most callers
1311 : * will have handled that case specially and we'll never see it here.
1312 : */
1313 : static bool
1314 8638 : text_position_next(TextPositionState *state)
1315 : {
1316 8638 : int needle_len = state->len2;
1317 : char *start_ptr;
1318 : char *matchptr;
1319 :
1320 8638 : if (needle_len <= 0)
1321 0 : return false; /* result for empty pattern */
1322 :
1323 : /* Start from the point right after the previous match. */
1324 8638 : if (state->last_match)
1325 6204 : start_ptr = state->last_match + needle_len;
1326 : else
1327 2434 : start_ptr = state->str1;
1328 :
1329 8638 : retry:
1330 8638 : matchptr = text_position_next_internal(start_ptr, state);
1331 :
1332 8638 : if (!matchptr)
1333 2362 : return false;
1334 :
1335 : /*
1336 : * Found a match for the byte sequence. If this is a multibyte encoding,
1337 : * where one character's byte sequence can appear inside a longer
1338 : * multi-byte character, we need to verify that the match was at a
1339 : * character boundary, not in the middle of a multi-byte character.
1340 : */
1341 6276 : if (state->is_multibyte_char_in_char)
1342 : {
1343 : /* Walk one character at a time, until we reach the match. */
1344 :
1345 : /* the search should never move backwards. */
1346 : Assert(state->refpoint <= matchptr);
1347 :
1348 0 : while (state->refpoint < matchptr)
1349 : {
1350 : /* step to next character. */
1351 0 : state->refpoint += pg_mblen(state->refpoint);
1352 0 : state->refpos++;
1353 :
1354 : /*
1355 : * If we stepped over the match's start position, then it was a
1356 : * false positive, where the byte sequence appeared in the middle
1357 : * of a multi-byte character. Skip it, and continue the search at
1358 : * the next character boundary.
1359 : */
1360 0 : if (state->refpoint > matchptr)
1361 : {
1362 0 : start_ptr = state->refpoint;
1363 0 : goto retry;
1364 : }
1365 : }
1366 : }
1367 :
1368 6276 : state->last_match = matchptr;
1369 6276 : return true;
1370 : }
1371 :
1372 : /*
1373 : * Subroutine of text_position_next(). This searches for the raw byte
1374 : * sequence, ignoring any multi-byte encoding issues. Returns the first
1375 : * match starting at 'start_ptr', or NULL if no match is found.
1376 : */
1377 : static char *
1378 8638 : text_position_next_internal(char *start_ptr, TextPositionState *state)
1379 : {
1380 8638 : int haystack_len = state->len1;
1381 8638 : int needle_len = state->len2;
1382 8638 : int skiptablemask = state->skiptablemask;
1383 8638 : const char *haystack = state->str1;
1384 8638 : const char *needle = state->str2;
1385 8638 : const char *haystack_end = &haystack[haystack_len];
1386 : const char *hptr;
1387 :
1388 : Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1389 :
1390 8638 : if (needle_len == 1)
1391 : {
1392 : /* No point in using B-M-H for a one-character needle */
1393 726 : char nchar = *needle;
1394 :
1395 726 : hptr = start_ptr;
1396 5538 : while (hptr < haystack_end)
1397 : {
1398 5380 : if (*hptr == nchar)
1399 568 : return (char *) hptr;
1400 4812 : hptr++;
1401 : }
1402 : }
1403 : else
1404 : {
1405 7912 : const char *needle_last = &needle[needle_len - 1];
1406 :
1407 : /* Start at startpos plus the length of the needle */
1408 7912 : hptr = start_ptr + needle_len - 1;
1409 204456 : while (hptr < haystack_end)
1410 : {
1411 : /* Match the needle scanning *backward* */
1412 : const char *nptr;
1413 : const char *p;
1414 :
1415 202252 : nptr = needle_last;
1416 202252 : p = hptr;
1417 284180 : while (*nptr == *p)
1418 : {
1419 : /* Matched it all? If so, return 1-based position */
1420 87636 : if (nptr == needle)
1421 5708 : return (char *) p;
1422 81928 : nptr--, p--;
1423 : }
1424 :
1425 : /*
1426 : * No match, so use the haystack char at hptr to decide how far to
1427 : * advance. If the needle had any occurrence of that character
1428 : * (or more precisely, one sharing the same skiptable entry)
1429 : * before its last character, then we advance far enough to align
1430 : * the last such needle character with that haystack position.
1431 : * Otherwise we can advance by the whole needle length.
1432 : */
1433 196544 : hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1434 : }
1435 : }
1436 :
1437 2362 : return 0; /* not found */
1438 : }
1439 :
1440 : /*
1441 : * Return a pointer to the current match.
1442 : *
1443 : * The returned pointer points into the original haystack string.
1444 : */
1445 : static char *
1446 6198 : text_position_get_match_ptr(TextPositionState *state)
1447 : {
1448 6198 : return state->last_match;
1449 : }
1450 :
1451 : /*
1452 : * Return the offset of the current match.
1453 : *
1454 : * The offset is in characters, 1-based.
1455 : */
1456 : static int
1457 48 : text_position_get_match_pos(TextPositionState *state)
1458 : {
1459 : /* Convert the byte position to char position. */
1460 96 : state->refpos += pg_mbstrlen_with_len(state->refpoint,
1461 48 : state->last_match - state->refpoint);
1462 48 : state->refpoint = state->last_match;
1463 48 : return state->refpos + 1;
1464 : }
1465 :
1466 : /*
1467 : * Reset search state to the initial state installed by text_position_setup.
1468 : *
1469 : * The next call to text_position_next will search from the beginning
1470 : * of the string.
1471 : */
1472 : static void
1473 12 : text_position_reset(TextPositionState *state)
1474 : {
1475 12 : state->last_match = NULL;
1476 12 : state->refpoint = state->str1;
1477 12 : state->refpos = 0;
1478 12 : }
1479 :
1480 : static void
1481 2422 : text_position_cleanup(TextPositionState *state)
1482 : {
1483 : /* no cleanup needed */
1484 2422 : }
1485 :
1486 :
1487 : static void
1488 15719526 : check_collation_set(Oid collid)
1489 : {
1490 15719526 : if (!OidIsValid(collid))
1491 : {
1492 : /*
1493 : * This typically means that the parser could not resolve a conflict
1494 : * of implicit collations, so report it that way.
1495 : */
1496 12 : ereport(ERROR,
1497 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
1498 : errmsg("could not determine which collation to use for string comparison"),
1499 : errhint("Use the COLLATE clause to set the collation explicitly.")));
1500 : }
1501 15719514 : }
1502 :
1503 : /* varstr_cmp()
1504 : * Comparison function for text strings with given lengths.
1505 : * Includes locale support, but must copy strings to temporary memory
1506 : * to allow null-termination for inputs to strcoll().
1507 : * Returns an integer less than, equal to, or greater than zero, indicating
1508 : * whether arg1 is less than, equal to, or greater than arg2.
1509 : *
1510 : * Note: many functions that depend on this are marked leakproof; therefore,
1511 : * avoid reporting the actual contents of the input when throwing errors.
1512 : * All errors herein should be things that can't happen except on corrupt
1513 : * data, anyway; otherwise we will have trouble with indexing strings that
1514 : * would cause them.
1515 : */
1516 : int
1517 8480582 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1518 : {
1519 : int result;
1520 :
1521 8480582 : check_collation_set(collid);
1522 :
1523 : /*
1524 : * Unfortunately, there is no strncoll(), so in the non-C locale case we
1525 : * have to do some memory copying. This turns out to be significantly
1526 : * slower, so we optimize the case where LC_COLLATE is C. We also try to
1527 : * optimize relatively-short strings by avoiding palloc/pfree overhead.
1528 : */
1529 8480576 : if (lc_collate_is_c(collid))
1530 : {
1531 4260780 : result = memcmp(arg1, arg2, Min(len1, len2));
1532 4260780 : if ((result == 0) && (len1 != len2))
1533 130692 : result = (len1 < len2) ? -1 : 1;
1534 : }
1535 : else
1536 : {
1537 : char a1buf[TEXTBUFLEN];
1538 : char a2buf[TEXTBUFLEN];
1539 : char *a1p,
1540 : *a2p;
1541 : pg_locale_t mylocale;
1542 :
1543 4219796 : mylocale = pg_newlocale_from_collation(collid);
1544 :
1545 : /*
1546 : * memcmp() can't tell us which of two unequal strings sorts first,
1547 : * but it's a cheap way to tell if they're equal. Testing shows that
1548 : * memcmp() followed by strcoll() is only trivially slower than
1549 : * strcoll() by itself, so we don't lose much if this doesn't work out
1550 : * very often, and if it does - for example, because there are many
1551 : * equal strings in the input - then we win big by avoiding expensive
1552 : * collation-aware comparisons.
1553 : */
1554 4219796 : if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1555 1692454 : return 0;
1556 :
1557 : #ifdef WIN32
1558 : /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1559 : if (GetDatabaseEncoding() == PG_UTF8
1560 : && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1561 : {
1562 : int a1len;
1563 : int a2len;
1564 : int r;
1565 :
1566 : if (len1 >= TEXTBUFLEN / 2)
1567 : {
1568 : a1len = len1 * 2 + 2;
1569 : a1p = palloc(a1len);
1570 : }
1571 : else
1572 : {
1573 : a1len = TEXTBUFLEN;
1574 : a1p = a1buf;
1575 : }
1576 : if (len2 >= TEXTBUFLEN / 2)
1577 : {
1578 : a2len = len2 * 2 + 2;
1579 : a2p = palloc(a2len);
1580 : }
1581 : else
1582 : {
1583 : a2len = TEXTBUFLEN;
1584 : a2p = a2buf;
1585 : }
1586 :
1587 : /* stupid Microsloth API does not work for zero-length input */
1588 : if (len1 == 0)
1589 : r = 0;
1590 : else
1591 : {
1592 : r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1593 : (LPWSTR) a1p, a1len / 2);
1594 : if (!r)
1595 : ereport(ERROR,
1596 : (errmsg("could not convert string to UTF-16: error code %lu",
1597 : GetLastError())));
1598 : }
1599 : ((LPWSTR) a1p)[r] = 0;
1600 :
1601 : if (len2 == 0)
1602 : r = 0;
1603 : else
1604 : {
1605 : r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1606 : (LPWSTR) a2p, a2len / 2);
1607 : if (!r)
1608 : ereport(ERROR,
1609 : (errmsg("could not convert string to UTF-16: error code %lu",
1610 : GetLastError())));
1611 : }
1612 : ((LPWSTR) a2p)[r] = 0;
1613 :
1614 : errno = 0;
1615 : #ifdef HAVE_LOCALE_T
1616 : if (mylocale)
1617 : result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1618 : else
1619 : #endif
1620 : result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1621 : if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1622 : * headers */
1623 : ereport(ERROR,
1624 : (errmsg("could not compare Unicode strings: %m")));
1625 :
1626 : /* Break tie if necessary. */
1627 : if (result == 0 &&
1628 : (!mylocale || mylocale->deterministic))
1629 : {
1630 : result = memcmp(arg1, arg2, Min(len1, len2));
1631 : if ((result == 0) && (len1 != len2))
1632 : result = (len1 < len2) ? -1 : 1;
1633 : }
1634 :
1635 : if (a1p != a1buf)
1636 : pfree(a1p);
1637 : if (a2p != a2buf)
1638 : pfree(a2p);
1639 :
1640 : return result;
1641 : }
1642 : #endif /* WIN32 */
1643 :
1644 2527342 : if (len1 >= TEXTBUFLEN)
1645 260 : a1p = (char *) palloc(len1 + 1);
1646 : else
1647 2527082 : a1p = a1buf;
1648 2527342 : if (len2 >= TEXTBUFLEN)
1649 132 : a2p = (char *) palloc(len2 + 1);
1650 : else
1651 2527210 : a2p = a2buf;
1652 :
1653 2527342 : memcpy(a1p, arg1, len1);
1654 2527342 : a1p[len1] = '\0';
1655 2527342 : memcpy(a2p, arg2, len2);
1656 2527342 : a2p[len2] = '\0';
1657 :
1658 2527342 : if (mylocale)
1659 : {
1660 0 : if (mylocale->provider == COLLPROVIDER_ICU)
1661 : {
1662 : #ifdef USE_ICU
1663 : #ifdef HAVE_UCOL_STRCOLLUTF8
1664 : if (GetDatabaseEncoding() == PG_UTF8)
1665 : {
1666 : UErrorCode status;
1667 :
1668 : status = U_ZERO_ERROR;
1669 : result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1670 : arg1, len1,
1671 : arg2, len2,
1672 : &status);
1673 : if (U_FAILURE(status))
1674 : ereport(ERROR,
1675 : (errmsg("collation failed: %s", u_errorName(status))));
1676 : }
1677 : else
1678 : #endif
1679 : {
1680 : int32_t ulen1,
1681 : ulen2;
1682 : UChar *uchar1,
1683 : *uchar2;
1684 :
1685 : ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1686 : ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1687 :
1688 : result = ucol_strcoll(mylocale->info.icu.ucol,
1689 : uchar1, ulen1,
1690 : uchar2, ulen2);
1691 :
1692 : pfree(uchar1);
1693 : pfree(uchar2);
1694 : }
1695 : #else /* not USE_ICU */
1696 : /* shouldn't happen */
1697 0 : elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1698 : #endif /* not USE_ICU */
1699 : }
1700 : else
1701 : {
1702 : #ifdef HAVE_LOCALE_T
1703 0 : result = strcoll_l(a1p, a2p, mylocale->info.lt);
1704 : #else
1705 : /* shouldn't happen */
1706 : elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1707 : #endif
1708 : }
1709 : }
1710 : else
1711 2527342 : result = strcoll(a1p, a2p);
1712 :
1713 : /* Break tie if necessary. */
1714 2527342 : if (result == 0 &&
1715 0 : (!mylocale || mylocale->deterministic))
1716 0 : result = strcmp(a1p, a2p);
1717 :
1718 2527342 : if (a1p != a1buf)
1719 260 : pfree(a1p);
1720 2527342 : if (a2p != a2buf)
1721 132 : pfree(a2p);
1722 : }
1723 :
1724 6788122 : return result;
1725 : }
1726 :
1727 : /* text_cmp()
1728 : * Internal comparison function for text strings.
1729 : * Returns -1, 0 or 1
1730 : */
1731 : static int
1732 6844108 : text_cmp(text *arg1, text *arg2, Oid collid)
1733 : {
1734 : char *a1p,
1735 : *a2p;
1736 : int len1,
1737 : len2;
1738 :
1739 6844108 : a1p = VARDATA_ANY(arg1);
1740 6844108 : a2p = VARDATA_ANY(arg2);
1741 :
1742 6844108 : len1 = VARSIZE_ANY_EXHDR(arg1);
1743 6844108 : len2 = VARSIZE_ANY_EXHDR(arg2);
1744 :
1745 6844108 : return varstr_cmp(a1p, len1, a2p, len2, collid);
1746 : }
1747 :
1748 : /*
1749 : * Comparison functions for text strings.
1750 : *
1751 : * Note: btree indexes need these routines not to leak memory; therefore,
1752 : * be careful to free working copies of toasted datums. Most places don't
1753 : * need to be so careful.
1754 : */
1755 :
1756 : Datum
1757 6815200 : texteq(PG_FUNCTION_ARGS)
1758 : {
1759 6815200 : Oid collid = PG_GET_COLLATION();
1760 6815200 : bool locale_is_c = false;
1761 6815200 : pg_locale_t mylocale = 0;
1762 : bool result;
1763 :
1764 6815200 : check_collation_set(collid);
1765 :
1766 6815200 : if (lc_collate_is_c(collid))
1767 543010 : locale_is_c = true;
1768 : else
1769 6272190 : mylocale = pg_newlocale_from_collation(collid);
1770 :
1771 6815200 : if (locale_is_c || !mylocale || mylocale->deterministic)
1772 6815200 : {
1773 6815200 : Datum arg1 = PG_GETARG_DATUM(0);
1774 6815200 : Datum arg2 = PG_GETARG_DATUM(1);
1775 : Size len1,
1776 : len2;
1777 :
1778 : /*
1779 : * Since we only care about equality or not-equality, we can avoid all
1780 : * the expense of strcoll() here, and just do bitwise comparison. In
1781 : * fact, we don't even have to do a bitwise comparison if we can show
1782 : * the lengths of the strings are unequal; which might save us from
1783 : * having to detoast one or both values.
1784 : */
1785 6815200 : len1 = toast_raw_datum_size(arg1);
1786 6815200 : len2 = toast_raw_datum_size(arg2);
1787 6815200 : if (len1 != len2)
1788 2451528 : result = false;
1789 : else
1790 : {
1791 4363672 : text *targ1 = DatumGetTextPP(arg1);
1792 4363672 : text *targ2 = DatumGetTextPP(arg2);
1793 :
1794 4363672 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1795 : len1 - VARHDRSZ) == 0);
1796 :
1797 4363672 : PG_FREE_IF_COPY(targ1, 0);
1798 4363672 : PG_FREE_IF_COPY(targ2, 1);
1799 : }
1800 : }
1801 : else
1802 : {
1803 0 : text *arg1 = PG_GETARG_TEXT_PP(0);
1804 0 : text *arg2 = PG_GETARG_TEXT_PP(1);
1805 :
1806 0 : result = (text_cmp(arg1, arg2, collid) == 0);
1807 :
1808 0 : PG_FREE_IF_COPY(arg1, 0);
1809 0 : PG_FREE_IF_COPY(arg2, 1);
1810 : }
1811 :
1812 6815200 : PG_RETURN_BOOL(result);
1813 : }
1814 :
1815 : Datum
1816 19258 : textne(PG_FUNCTION_ARGS)
1817 : {
1818 19258 : Oid collid = PG_GET_COLLATION();
1819 19258 : bool locale_is_c = false;
1820 19258 : pg_locale_t mylocale = 0;
1821 : bool result;
1822 :
1823 19258 : check_collation_set(collid);
1824 :
1825 19258 : if (lc_collate_is_c(collid))
1826 18 : locale_is_c = true;
1827 : else
1828 19240 : mylocale = pg_newlocale_from_collation(collid);
1829 :
1830 19258 : if (locale_is_c || !mylocale || mylocale->deterministic)
1831 19258 : {
1832 19258 : Datum arg1 = PG_GETARG_DATUM(0);
1833 19258 : Datum arg2 = PG_GETARG_DATUM(1);
1834 : Size len1,
1835 : len2;
1836 :
1837 : /* See comment in texteq() */
1838 19258 : len1 = toast_raw_datum_size(arg1);
1839 19258 : len2 = toast_raw_datum_size(arg2);
1840 19258 : if (len1 != len2)
1841 1780 : result = true;
1842 : else
1843 : {
1844 17478 : text *targ1 = DatumGetTextPP(arg1);
1845 17478 : text *targ2 = DatumGetTextPP(arg2);
1846 :
1847 17478 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1848 : len1 - VARHDRSZ) != 0);
1849 :
1850 17478 : PG_FREE_IF_COPY(targ1, 0);
1851 17478 : PG_FREE_IF_COPY(targ2, 1);
1852 : }
1853 : }
1854 : else
1855 : {
1856 0 : text *arg1 = PG_GETARG_TEXT_PP(0);
1857 0 : text *arg2 = PG_GETARG_TEXT_PP(1);
1858 :
1859 0 : result = (text_cmp(arg1, arg2, collid) != 0);
1860 :
1861 0 : PG_FREE_IF_COPY(arg1, 0);
1862 0 : PG_FREE_IF_COPY(arg2, 1);
1863 : }
1864 :
1865 19258 : PG_RETURN_BOOL(result);
1866 : }
1867 :
1868 : Datum
1869 123554 : text_lt(PG_FUNCTION_ARGS)
1870 : {
1871 123554 : text *arg1 = PG_GETARG_TEXT_PP(0);
1872 123554 : text *arg2 = PG_GETARG_TEXT_PP(1);
1873 : bool result;
1874 :
1875 123554 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1876 :
1877 123548 : PG_FREE_IF_COPY(arg1, 0);
1878 123548 : PG_FREE_IF_COPY(arg2, 1);
1879 :
1880 123548 : PG_RETURN_BOOL(result);
1881 : }
1882 :
1883 : Datum
1884 325034 : text_le(PG_FUNCTION_ARGS)
1885 : {
1886 325034 : text *arg1 = PG_GETARG_TEXT_PP(0);
1887 325034 : text *arg2 = PG_GETARG_TEXT_PP(1);
1888 : bool result;
1889 :
1890 325034 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1891 :
1892 325034 : PG_FREE_IF_COPY(arg1, 0);
1893 325034 : PG_FREE_IF_COPY(arg2, 1);
1894 :
1895 325034 : PG_RETURN_BOOL(result);
1896 : }
1897 :
1898 : Datum
1899 112336 : text_gt(PG_FUNCTION_ARGS)
1900 : {
1901 112336 : text *arg1 = PG_GETARG_TEXT_PP(0);
1902 112336 : text *arg2 = PG_GETARG_TEXT_PP(1);
1903 : bool result;
1904 :
1905 112336 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1906 :
1907 112336 : PG_FREE_IF_COPY(arg1, 0);
1908 112336 : PG_FREE_IF_COPY(arg2, 1);
1909 :
1910 112336 : PG_RETURN_BOOL(result);
1911 : }
1912 :
1913 : Datum
1914 184534 : text_ge(PG_FUNCTION_ARGS)
1915 : {
1916 184534 : text *arg1 = PG_GETARG_TEXT_PP(0);
1917 184534 : text *arg2 = PG_GETARG_TEXT_PP(1);
1918 : bool result;
1919 :
1920 184534 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1921 :
1922 184534 : PG_FREE_IF_COPY(arg1, 0);
1923 184534 : PG_FREE_IF_COPY(arg2, 1);
1924 :
1925 184534 : PG_RETURN_BOOL(result);
1926 : }
1927 :
1928 : Datum
1929 37914 : text_starts_with(PG_FUNCTION_ARGS)
1930 : {
1931 37914 : Datum arg1 = PG_GETARG_DATUM(0);
1932 37914 : Datum arg2 = PG_GETARG_DATUM(1);
1933 37914 : Oid collid = PG_GET_COLLATION();
1934 37914 : pg_locale_t mylocale = 0;
1935 : bool result;
1936 : Size len1,
1937 : len2;
1938 :
1939 37914 : check_collation_set(collid);
1940 :
1941 37914 : if (!lc_collate_is_c(collid))
1942 37914 : mylocale = pg_newlocale_from_collation(collid);
1943 :
1944 37914 : if (mylocale && !mylocale->deterministic)
1945 0 : ereport(ERROR,
1946 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1947 : errmsg("nondeterministic collations are not supported for substring searches")));
1948 :
1949 37914 : len1 = toast_raw_datum_size(arg1);
1950 37914 : len2 = toast_raw_datum_size(arg2);
1951 37914 : if (len2 > len1)
1952 0 : result = false;
1953 : else
1954 : {
1955 37914 : text *targ1 = text_substring(arg1, 1, len2, false);
1956 37914 : text *targ2 = DatumGetTextPP(arg2);
1957 :
1958 37914 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1959 37914 : VARSIZE_ANY_EXHDR(targ2)) == 0);
1960 :
1961 37914 : PG_FREE_IF_COPY(targ1, 0);
1962 37914 : PG_FREE_IF_COPY(targ2, 1);
1963 : }
1964 :
1965 37914 : PG_RETURN_BOOL(result);
1966 : }
1967 :
1968 : Datum
1969 5872882 : bttextcmp(PG_FUNCTION_ARGS)
1970 : {
1971 5872882 : text *arg1 = PG_GETARG_TEXT_PP(0);
1972 5872882 : text *arg2 = PG_GETARG_TEXT_PP(1);
1973 : int32 result;
1974 :
1975 5872882 : result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1976 :
1977 5872882 : PG_FREE_IF_COPY(arg1, 0);
1978 5872882 : PG_FREE_IF_COPY(arg2, 1);
1979 :
1980 5872882 : PG_RETURN_INT32(result);
1981 : }
1982 :
1983 : Datum
1984 79892 : bttextsortsupport(PG_FUNCTION_ARGS)
1985 : {
1986 79892 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1987 79892 : Oid collid = ssup->ssup_collation;
1988 : MemoryContext oldcontext;
1989 :
1990 79892 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1991 :
1992 : /* Use generic string SortSupport */
1993 79892 : varstr_sortsupport(ssup, TEXTOID, collid);
1994 :
1995 79886 : MemoryContextSwitchTo(oldcontext);
1996 :
1997 79886 : PG_RETURN_VOID();
1998 : }
1999 :
2000 : /*
2001 : * Generic sortsupport interface for character type's operator classes.
2002 : * Includes locale support, and support for BpChar semantics (i.e. removing
2003 : * trailing spaces before comparison).
2004 : *
2005 : * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2006 : * same representation. Callers that always use the C collation (e.g.
2007 : * non-collatable type callers like bytea) may have NUL bytes in their strings;
2008 : * this will not work with any other collation, though.
2009 : */
2010 : void
2011 150670 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
2012 : {
2013 150670 : bool abbreviate = ssup->abbreviate;
2014 150670 : bool collate_c = false;
2015 : VarStringSortSupport *sss;
2016 150670 : pg_locale_t locale = 0;
2017 :
2018 150670 : check_collation_set(collid);
2019 :
2020 : /*
2021 : * If possible, set ssup->comparator to a function which can be used to
2022 : * directly compare two datums. If we can do this, we'll avoid the
2023 : * overhead of a trip through the fmgr layer for every comparison, which
2024 : * can be substantial.
2025 : *
2026 : * Most typically, we'll set the comparator to varlenafastcmp_locale,
2027 : * which uses strcoll() to perform comparisons. We use that for the
2028 : * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2029 : * LC_COLLATE = C, we can make things quite a bit faster with
2030 : * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2031 : * memcmp() rather than strcoll().
2032 : */
2033 150664 : if (lc_collate_is_c(collid))
2034 : {
2035 101914 : if (typid == BPCHAROID)
2036 28 : ssup->comparator = bpcharfastcmp_c;
2037 101886 : else if (typid == NAMEOID)
2038 : {
2039 69934 : ssup->comparator = namefastcmp_c;
2040 : /* Not supporting abbreviation with type NAME, for now */
2041 69934 : abbreviate = false;
2042 : }
2043 : else
2044 31952 : ssup->comparator = varstrfastcmp_c;
2045 :
2046 101914 : collate_c = true;
2047 : }
2048 : else
2049 : {
2050 : /*
2051 : * We need a collation-sensitive comparison. To make things faster,
2052 : * we'll figure out the collation based on the locale id and cache the
2053 : * result.
2054 : */
2055 48750 : locale = pg_newlocale_from_collation(collid);
2056 :
2057 : /*
2058 : * There is a further exception on Windows. When the database
2059 : * encoding is UTF-8 and we are not using the C collation, complex
2060 : * hacks are required. We don't currently have a comparator that
2061 : * handles that case, so we fall back on the slow method of having the
2062 : * sort code invoke bttextcmp() (in the case of text) via the fmgr
2063 : * trampoline. ICU locales work just the same on Windows, however.
2064 : */
2065 : #ifdef WIN32
2066 : if (GetDatabaseEncoding() == PG_UTF8 &&
2067 : !(locale && locale->provider == COLLPROVIDER_ICU))
2068 : return;
2069 : #endif
2070 :
2071 : /*
2072 : * We use varlenafastcmp_locale except for type NAME.
2073 : */
2074 48750 : if (typid == NAMEOID)
2075 : {
2076 0 : ssup->comparator = namefastcmp_locale;
2077 : /* Not supporting abbreviation with type NAME, for now */
2078 0 : abbreviate = false;
2079 : }
2080 : else
2081 48750 : ssup->comparator = varlenafastcmp_locale;
2082 : }
2083 :
2084 : /*
2085 : * Unfortunately, it seems that abbreviation for non-C collations is
2086 : * broken on many common platforms; testing of multiple versions of glibc
2087 : * reveals that, for many locales, strcoll() and strxfrm() do not return
2088 : * consistent results, which is fatal to this optimization. While no
2089 : * other libc other than Cygwin has so far been shown to have a problem,
2090 : * we take the conservative course of action for right now and disable
2091 : * this categorically. (Users who are certain this isn't a problem on
2092 : * their system can define TRUST_STRXFRM.)
2093 : *
2094 : * Even apart from the risk of broken locales, it's possible that there
2095 : * are platforms where the use of abbreviated keys should be disabled at
2096 : * compile time. Having only 4 byte datums could make worst-case
2097 : * performance drastically more likely, for example. Moreover, macOS's
2098 : * strxfrm() implementation is known to not effectively concentrate a
2099 : * significant amount of entropy from the original string in earlier
2100 : * transformed blobs. It's possible that other supported platforms are
2101 : * similarly encumbered. So, if we ever get past disabling this
2102 : * categorically, we may still want or need to disable it for particular
2103 : * platforms.
2104 : */
2105 : #ifndef TRUST_STRXFRM
2106 150664 : if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2107 48750 : abbreviate = false;
2108 : #endif
2109 :
2110 : /*
2111 : * If we're using abbreviated keys, or if we're using a locale-aware
2112 : * comparison, we need to initialize a VarStringSortSupport object. Both
2113 : * cases will make use of the temporary buffers we initialize here for
2114 : * scratch space (and to detect requirement for BpChar semantics from
2115 : * caller), and the abbreviation case requires additional state.
2116 : */
2117 150664 : if (abbreviate || !collate_c)
2118 : {
2119 52030 : sss = palloc(sizeof(VarStringSortSupport));
2120 52030 : sss->buf1 = palloc(TEXTBUFLEN);
2121 52030 : sss->buflen1 = TEXTBUFLEN;
2122 52030 : sss->buf2 = palloc(TEXTBUFLEN);
2123 52030 : sss->buflen2 = TEXTBUFLEN;
2124 : /* Start with invalid values */
2125 52030 : sss->last_len1 = -1;
2126 52030 : sss->last_len2 = -1;
2127 : /* Initialize */
2128 52030 : sss->last_returned = 0;
2129 52030 : sss->locale = locale;
2130 :
2131 : /*
2132 : * To avoid somehow confusing a strxfrm() blob and an original string,
2133 : * constantly keep track of the variety of data that buf1 and buf2
2134 : * currently contain.
2135 : *
2136 : * Comparisons may be interleaved with conversion calls. Frequently,
2137 : * conversions and comparisons are batched into two distinct phases,
2138 : * but the correctness of caching cannot hinge upon this. For
2139 : * comparison caching, buffer state is only trusted if cache_blob is
2140 : * found set to false, whereas strxfrm() caching only trusts the state
2141 : * when cache_blob is found set to true.
2142 : *
2143 : * Arbitrarily initialize cache_blob to true.
2144 : */
2145 52030 : sss->cache_blob = true;
2146 52030 : sss->collate_c = collate_c;
2147 52030 : sss->typid = typid;
2148 52030 : ssup->ssup_extra = sss;
2149 :
2150 : /*
2151 : * If possible, plan to use the abbreviated keys optimization. The
2152 : * core code may switch back to authoritative comparator should
2153 : * abbreviation be aborted.
2154 : */
2155 52030 : if (abbreviate)
2156 : {
2157 3280 : sss->prop_card = 0.20;
2158 3280 : initHyperLogLog(&sss->abbr_card, 10);
2159 3280 : initHyperLogLog(&sss->full_card, 10);
2160 3280 : ssup->abbrev_full_comparator = ssup->comparator;
2161 3280 : ssup->comparator = ssup_datum_unsigned_cmp;
2162 3280 : ssup->abbrev_converter = varstr_abbrev_convert;
2163 3280 : ssup->abbrev_abort = varstr_abbrev_abort;
2164 : }
2165 : }
2166 150664 : }
2167 :
2168 : /*
2169 : * sortsupport comparison func (for C locale case)
2170 : */
2171 : static int
2172 81080548 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2173 : {
2174 81080548 : VarString *arg1 = DatumGetVarStringPP(x);
2175 81080548 : VarString *arg2 = DatumGetVarStringPP(y);
2176 : char *a1p,
2177 : *a2p;
2178 : int len1,
2179 : len2,
2180 : result;
2181 :
2182 81080548 : a1p = VARDATA_ANY(arg1);
2183 81080548 : a2p = VARDATA_ANY(arg2);
2184 :
2185 81080548 : len1 = VARSIZE_ANY_EXHDR(arg1);
2186 81080548 : len2 = VARSIZE_ANY_EXHDR(arg2);
2187 :
2188 81080548 : result = memcmp(a1p, a2p, Min(len1, len2));
2189 81080548 : if ((result == 0) && (len1 != len2))
2190 1850424 : result = (len1 < len2) ? -1 : 1;
2191 :
2192 : /* We can't afford to leak memory here. */
2193 81080548 : if (PointerGetDatum(arg1) != x)
2194 0 : pfree(arg1);
2195 81080548 : if (PointerGetDatum(arg2) != y)
2196 0 : pfree(arg2);
2197 :
2198 81080548 : return result;
2199 : }
2200 :
2201 : /*
2202 : * sortsupport comparison func (for BpChar C locale case)
2203 : *
2204 : * BpChar outsources its sortsupport to this module. Specialization for the
2205 : * varstr_sortsupport BpChar case, modeled on
2206 : * internal_bpchar_pattern_compare().
2207 : */
2208 : static int
2209 60014 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2210 : {
2211 60014 : BpChar *arg1 = DatumGetBpCharPP(x);
2212 60014 : BpChar *arg2 = DatumGetBpCharPP(y);
2213 : char *a1p,
2214 : *a2p;
2215 : int len1,
2216 : len2,
2217 : result;
2218 :
2219 60014 : a1p = VARDATA_ANY(arg1);
2220 60014 : a2p = VARDATA_ANY(arg2);
2221 :
2222 60014 : len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2223 60014 : len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2224 :
2225 60014 : result = memcmp(a1p, a2p, Min(len1, len2));
2226 60014 : if ((result == 0) && (len1 != len2))
2227 0 : result = (len1 < len2) ? -1 : 1;
2228 :
2229 : /* We can't afford to leak memory here. */
2230 60014 : if (PointerGetDatum(arg1) != x)
2231 0 : pfree(arg1);
2232 60014 : if (PointerGetDatum(arg2) != y)
2233 0 : pfree(arg2);
2234 :
2235 60014 : return result;
2236 : }
2237 :
2238 : /*
2239 : * sortsupport comparison func (for NAME C locale case)
2240 : */
2241 : static int
2242 103647056 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2243 : {
2244 103647056 : Name arg1 = DatumGetName(x);
2245 103647056 : Name arg2 = DatumGetName(y);
2246 :
2247 103647056 : return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2248 : }
2249 :
2250 : /*
2251 : * sortsupport comparison func (for locale case with all varlena types)
2252 : */
2253 : static int
2254 40193606 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2255 : {
2256 40193606 : VarString *arg1 = DatumGetVarStringPP(x);
2257 40193606 : VarString *arg2 = DatumGetVarStringPP(y);
2258 : char *a1p,
2259 : *a2p;
2260 : int len1,
2261 : len2,
2262 : result;
2263 :
2264 40193606 : a1p = VARDATA_ANY(arg1);
2265 40193606 : a2p = VARDATA_ANY(arg2);
2266 :
2267 40193606 : len1 = VARSIZE_ANY_EXHDR(arg1);
2268 40193606 : len2 = VARSIZE_ANY_EXHDR(arg2);
2269 :
2270 40193606 : result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2271 :
2272 : /* We can't afford to leak memory here. */
2273 40193606 : if (PointerGetDatum(arg1) != x)
2274 6 : pfree(arg1);
2275 40193606 : if (PointerGetDatum(arg2) != y)
2276 6 : pfree(arg2);
2277 :
2278 40193606 : return result;
2279 : }
2280 :
2281 : /*
2282 : * sortsupport comparison func (for locale case with NAME type)
2283 : */
2284 : static int
2285 0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2286 : {
2287 0 : Name arg1 = DatumGetName(x);
2288 0 : Name arg2 = DatumGetName(y);
2289 :
2290 0 : return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2291 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2292 : ssup);
2293 : }
2294 :
2295 : /*
2296 : * sortsupport comparison func for locale cases
2297 : */
2298 : static int
2299 40193606 : varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2300 : {
2301 40193606 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2302 : int result;
2303 : bool arg1_match;
2304 :
2305 : /* Fast pre-check for equality, as discussed in varstr_cmp() */
2306 40193606 : if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2307 : {
2308 : /*
2309 : * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2310 : * last_len2. Existing contents of buffers might still be used by
2311 : * next call.
2312 : *
2313 : * It's fine to allow the comparison of BpChar padding bytes here,
2314 : * even though that implies that the memcmp() will usually be
2315 : * performed for BpChar callers (though multibyte characters could
2316 : * still prevent that from occurring). The memcmp() is still very
2317 : * cheap, and BpChar's funny semantics have us remove trailing spaces
2318 : * (not limited to padding), so we need make no distinction between
2319 : * padding space characters and "real" space characters.
2320 : */
2321 15092018 : return 0;
2322 : }
2323 :
2324 25101588 : if (sss->typid == BPCHAROID)
2325 : {
2326 : /* Get true number of bytes, ignoring trailing spaces */
2327 40196 : len1 = bpchartruelen(a1p, len1);
2328 40196 : len2 = bpchartruelen(a2p, len2);
2329 : }
2330 :
2331 25101588 : if (len1 >= sss->buflen1)
2332 : {
2333 6 : pfree(sss->buf1);
2334 6 : sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2335 6 : sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2336 : }
2337 25101588 : if (len2 >= sss->buflen2)
2338 : {
2339 6 : pfree(sss->buf2);
2340 6 : sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2341 6 : sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2342 : }
2343 :
2344 : /*
2345 : * We're likely to be asked to compare the same strings repeatedly, and
2346 : * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2347 : * comparisons, even though in general there is no reason to think that
2348 : * that will work out (every string datum may be unique). Caching does
2349 : * not slow things down measurably when it doesn't work out, and can speed
2350 : * things up by rather a lot when it does. In part, this is because the
2351 : * memcmp() compares data from cachelines that are needed in L1 cache even
2352 : * when the last comparison's result cannot be reused.
2353 : */
2354 25101588 : arg1_match = true;
2355 25101588 : if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2356 : {
2357 22409654 : arg1_match = false;
2358 22409654 : memcpy(sss->buf1, a1p, len1);
2359 22409654 : sss->buf1[len1] = '\0';
2360 22409654 : sss->last_len1 = len1;
2361 : }
2362 :
2363 : /*
2364 : * If we're comparing the same two strings as last time, we can return the
2365 : * same answer without calling strcoll() again. This is more likely than
2366 : * it seems (at least with moderate to low cardinality sets), because
2367 : * quicksort compares the same pivot against many values.
2368 : */
2369 25101588 : if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2370 : {
2371 4091966 : memcpy(sss->buf2, a2p, len2);
2372 4091966 : sss->buf2[len2] = '\0';
2373 4091966 : sss->last_len2 = len2;
2374 : }
2375 21009622 : else if (arg1_match && !sss->cache_blob)
2376 : {
2377 : /* Use result cached following last actual strcoll() call */
2378 2304198 : return sss->last_returned;
2379 : }
2380 :
2381 22797390 : if (sss->locale)
2382 : {
2383 0 : if (sss->locale->provider == COLLPROVIDER_ICU)
2384 : {
2385 : #ifdef USE_ICU
2386 : #ifdef HAVE_UCOL_STRCOLLUTF8
2387 : if (GetDatabaseEncoding() == PG_UTF8)
2388 : {
2389 : UErrorCode status;
2390 :
2391 : status = U_ZERO_ERROR;
2392 : result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2393 : a1p, len1,
2394 : a2p, len2,
2395 : &status);
2396 : if (U_FAILURE(status))
2397 : ereport(ERROR,
2398 : (errmsg("collation failed: %s", u_errorName(status))));
2399 : }
2400 : else
2401 : #endif
2402 : {
2403 : int32_t ulen1,
2404 : ulen2;
2405 : UChar *uchar1,
2406 : *uchar2;
2407 :
2408 : ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2409 : ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2410 :
2411 : result = ucol_strcoll(sss->locale->info.icu.ucol,
2412 : uchar1, ulen1,
2413 : uchar2, ulen2);
2414 :
2415 : pfree(uchar1);
2416 : pfree(uchar2);
2417 : }
2418 : #else /* not USE_ICU */
2419 : /* shouldn't happen */
2420 0 : elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2421 : #endif /* not USE_ICU */
2422 : }
2423 : else
2424 : {
2425 : #ifdef HAVE_LOCALE_T
2426 0 : result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2427 : #else
2428 : /* shouldn't happen */
2429 : elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2430 : #endif
2431 : }
2432 : }
2433 : else
2434 22797390 : result = strcoll(sss->buf1, sss->buf2);
2435 :
2436 : /* Break tie if necessary. */
2437 22797390 : if (result == 0 &&
2438 0 : (!sss->locale || sss->locale->deterministic))
2439 0 : result = strcmp(sss->buf1, sss->buf2);
2440 :
2441 : /* Cache result, perhaps saving an expensive strcoll() call next time */
2442 22797390 : sss->cache_blob = false;
2443 22797390 : sss->last_returned = result;
2444 22797390 : return result;
2445 : }
2446 :
2447 : /*
2448 : * Conversion routine for sortsupport. Converts original to abbreviated key
2449 : * representation. Our encoding strategy is simple -- pack the first 8 bytes
2450 : * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2451 : * stored in reverse order), and treat it as an unsigned integer. When the "C"
2452 : * locale is used, or in case of bytea, just memcpy() from original instead.
2453 : */
2454 : static Datum
2455 149038 : varstr_abbrev_convert(Datum original, SortSupport ssup)
2456 : {
2457 149038 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2458 149038 : VarString *authoritative = DatumGetVarStringPP(original);
2459 149038 : char *authoritative_data = VARDATA_ANY(authoritative);
2460 :
2461 : /* working state */
2462 : Datum res;
2463 : char *pres;
2464 : int len;
2465 : uint32 hash;
2466 :
2467 149038 : pres = (char *) &res;
2468 : /* memset(), so any non-overwritten bytes are NUL */
2469 149038 : memset(pres, 0, sizeof(Datum));
2470 149038 : len = VARSIZE_ANY_EXHDR(authoritative);
2471 :
2472 : /* Get number of bytes, ignoring trailing spaces */
2473 149038 : if (sss->typid == BPCHAROID)
2474 0 : len = bpchartruelen(authoritative_data, len);
2475 :
2476 : /*
2477 : * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2478 : * abbreviate keys. The full comparator for the C locale is always
2479 : * memcmp(). It would be incorrect to allow bytea callers (callers that
2480 : * always force the C collation -- bytea isn't a collatable type, but this
2481 : * approach is convenient) to use strxfrm(). This is because bytea
2482 : * strings may contain NUL bytes. Besides, this should be faster, too.
2483 : *
2484 : * More generally, it's okay that bytea callers can have NUL bytes in
2485 : * strings because abbreviated cmp need not make a distinction between
2486 : * terminating NUL bytes, and NUL bytes representing actual NULs in the
2487 : * authoritative representation. Hopefully a comparison at or past one
2488 : * abbreviated key's terminating NUL byte will resolve the comparison
2489 : * without consulting the authoritative representation; specifically, some
2490 : * later non-NUL byte in the longer string can resolve the comparison
2491 : * against a subsequent terminating NUL in the shorter string. There will
2492 : * usually be what is effectively a "length-wise" resolution there and
2493 : * then.
2494 : *
2495 : * If that doesn't work out -- if all bytes in the longer string
2496 : * positioned at or past the offset of the smaller string's (first)
2497 : * terminating NUL are actually representative of NUL bytes in the
2498 : * authoritative binary string (perhaps with some *terminating* NUL bytes
2499 : * towards the end of the longer string iff it happens to still be small)
2500 : * -- then an authoritative tie-breaker will happen, and do the right
2501 : * thing: explicitly consider string length.
2502 : */
2503 149038 : if (sss->collate_c)
2504 149038 : memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2505 : else
2506 : {
2507 : Size bsize;
2508 : #ifdef USE_ICU
2509 : int32_t ulen = -1;
2510 : UChar *uchar = NULL;
2511 : #endif
2512 :
2513 : /*
2514 : * We're not using the C collation, so fall back on strxfrm or ICU
2515 : * analogs.
2516 : */
2517 :
2518 : /* By convention, we use buffer 1 to store and NUL-terminate */
2519 0 : if (len >= sss->buflen1)
2520 : {
2521 0 : pfree(sss->buf1);
2522 0 : sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2523 0 : sss->buf1 = palloc(sss->buflen1);
2524 : }
2525 :
2526 : /* Might be able to reuse strxfrm() blob from last call */
2527 0 : if (sss->last_len1 == len && sss->cache_blob &&
2528 0 : memcmp(sss->buf1, authoritative_data, len) == 0)
2529 : {
2530 0 : memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2531 : /* No change affecting cardinality, so no hashing required */
2532 0 : goto done;
2533 : }
2534 :
2535 0 : memcpy(sss->buf1, authoritative_data, len);
2536 :
2537 : /*
2538 : * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2539 : * necessary for ICU, but doesn't hurt.
2540 : */
2541 0 : sss->buf1[len] = '\0';
2542 0 : sss->last_len1 = len;
2543 :
2544 : #ifdef USE_ICU
2545 : /* When using ICU and not UTF8, convert string to UChar. */
2546 : if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2547 : GetDatabaseEncoding() != PG_UTF8)
2548 : ulen = icu_to_uchar(&uchar, sss->buf1, len);
2549 : #endif
2550 :
2551 : /*
2552 : * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2553 : * and try again. Both of these functions have the result buffer
2554 : * content undefined if the result did not fit, so we need to retry
2555 : * until everything fits, even though we only need the first few bytes
2556 : * in the end. When using ucol_nextSortKeyPart(), however, we only
2557 : * ask for as many bytes as we actually need.
2558 : */
2559 : for (;;)
2560 : {
2561 : #ifdef USE_ICU
2562 : if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2563 : {
2564 : /*
2565 : * When using UTF8, use the iteration interface so we only
2566 : * need to produce as many bytes as we actually need.
2567 : */
2568 : if (GetDatabaseEncoding() == PG_UTF8)
2569 : {
2570 : UCharIterator iter;
2571 : uint32_t state[2];
2572 : UErrorCode status;
2573 :
2574 : uiter_setUTF8(&iter, sss->buf1, len);
2575 : state[0] = state[1] = 0; /* won't need that again */
2576 : status = U_ZERO_ERROR;
2577 : bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2578 : &iter,
2579 : state,
2580 : (uint8_t *) sss->buf2,
2581 : Min(sizeof(Datum), sss->buflen2),
2582 : &status);
2583 : if (U_FAILURE(status))
2584 : ereport(ERROR,
2585 : (errmsg("sort key generation failed: %s",
2586 : u_errorName(status))));
2587 : }
2588 : else
2589 : bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2590 : uchar, ulen,
2591 : (uint8_t *) sss->buf2, sss->buflen2);
2592 : }
2593 : else
2594 : #endif
2595 : #ifdef HAVE_LOCALE_T
2596 0 : if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2597 0 : bsize = strxfrm_l(sss->buf2, sss->buf1,
2598 0 : sss->buflen2, sss->locale->info.lt);
2599 : else
2600 : #endif
2601 0 : bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2602 :
2603 0 : sss->last_len2 = bsize;
2604 0 : if (bsize < sss->buflen2)
2605 0 : break;
2606 :
2607 : /*
2608 : * Grow buffer and retry.
2609 : */
2610 0 : pfree(sss->buf2);
2611 0 : sss->buflen2 = Max(bsize + 1,
2612 : Min(sss->buflen2 * 2, MaxAllocSize));
2613 0 : sss->buf2 = palloc(sss->buflen2);
2614 : }
2615 :
2616 : /*
2617 : * Every Datum byte is always compared. This is safe because the
2618 : * strxfrm() blob is itself NUL terminated, leaving no danger of
2619 : * misinterpreting any NUL bytes not intended to be interpreted as
2620 : * logically representing termination.
2621 : *
2622 : * (Actually, even if there were NUL bytes in the blob it would be
2623 : * okay. See remarks on bytea case above.)
2624 : */
2625 0 : memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2626 :
2627 : #ifdef USE_ICU
2628 : if (uchar)
2629 : pfree(uchar);
2630 : #endif
2631 : }
2632 :
2633 : /*
2634 : * Maintain approximate cardinality of both abbreviated keys and original,
2635 : * authoritative keys using HyperLogLog. Used as cheap insurance against
2636 : * the worst case, where we do many string transformations for no saving
2637 : * in full strcoll()-based comparisons. These statistics are used by
2638 : * varstr_abbrev_abort().
2639 : *
2640 : * First, Hash key proper, or a significant fraction of it. Mix in length
2641 : * in order to compensate for cases where differences are past
2642 : * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2643 : */
2644 149038 : hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2645 : Min(len, PG_CACHE_LINE_SIZE)));
2646 :
2647 149038 : if (len > PG_CACHE_LINE_SIZE)
2648 10 : hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2649 :
2650 149038 : addHyperLogLog(&sss->full_card, hash);
2651 :
2652 : /* Hash abbreviated key */
2653 : #if SIZEOF_DATUM == 8
2654 : {
2655 : uint32 lohalf,
2656 : hihalf;
2657 :
2658 149038 : lohalf = (uint32) res;
2659 149038 : hihalf = (uint32) (res >> 32);
2660 149038 : hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2661 : }
2662 : #else /* SIZEOF_DATUM != 8 */
2663 : hash = DatumGetUInt32(hash_uint32((uint32) res));
2664 : #endif
2665 :
2666 149038 : addHyperLogLog(&sss->abbr_card, hash);
2667 :
2668 : /* Cache result, perhaps saving an expensive strxfrm() call next time */
2669 149038 : sss->cache_blob = true;
2670 149038 : done:
2671 :
2672 : /*
2673 : * Byteswap on little-endian machines.
2674 : *
2675 : * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2676 : * 3-way comparator) works correctly on all platforms. If we didn't do
2677 : * this, the comparator would have to call memcmp() with a pair of
2678 : * pointers to the first byte of each abbreviated key, which is slower.
2679 : */
2680 149038 : res = DatumBigEndianToNative(res);
2681 :
2682 : /* Don't leak memory here */
2683 149038 : if (PointerGetDatum(authoritative) != original)
2684 0 : pfree(authoritative);
2685 :
2686 149038 : return res;
2687 : }
2688 :
2689 : /*
2690 : * Callback for estimating effectiveness of abbreviated key optimization, using
2691 : * heuristic rules. Returns value indicating if the abbreviation optimization
2692 : * should be aborted, based on its projected effectiveness.
2693 : */
2694 : static bool
2695 304 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2696 : {
2697 304 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2698 : double abbrev_distinct,
2699 : key_distinct;
2700 :
2701 : Assert(ssup->abbreviate);
2702 :
2703 : /* Have a little patience */
2704 304 : if (memtupcount < 100)
2705 186 : return false;
2706 :
2707 118 : abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2708 118 : key_distinct = estimateHyperLogLog(&sss->full_card);
2709 :
2710 : /*
2711 : * Clamp cardinality estimates to at least one distinct value. While
2712 : * NULLs are generally disregarded, if only NULL values were seen so far,
2713 : * that might misrepresent costs if we failed to clamp.
2714 : */
2715 118 : if (abbrev_distinct <= 1.0)
2716 0 : abbrev_distinct = 1.0;
2717 :
2718 118 : if (key_distinct <= 1.0)
2719 0 : key_distinct = 1.0;
2720 :
2721 : /*
2722 : * In the worst case all abbreviated keys are identical, while at the same
2723 : * time there are differences within full key strings not captured in
2724 : * abbreviations.
2725 : */
2726 : #ifdef TRACE_SORT
2727 118 : if (trace_sort)
2728 : {
2729 0 : double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2730 :
2731 0 : elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2732 : "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2733 : memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2734 : sss->prop_card);
2735 : }
2736 : #endif
2737 :
2738 : /*
2739 : * If the number of distinct abbreviated keys approximately matches the
2740 : * number of distinct authoritative original keys, that's reason enough to
2741 : * proceed. We can win even with a very low cardinality set if most
2742 : * tie-breakers only memcmp(). This is by far the most important
2743 : * consideration.
2744 : *
2745 : * While comparisons that are resolved at the abbreviated key level are
2746 : * considerably cheaper than tie-breakers resolved with memcmp(), both of
2747 : * those two outcomes are so much cheaper than a full strcoll() once
2748 : * sorting is underway that it doesn't seem worth it to weigh abbreviated
2749 : * cardinality against the overall size of the set in order to more
2750 : * accurately model costs. Assume that an abbreviated comparison, and an
2751 : * abbreviated comparison with a cheap memcmp()-based authoritative
2752 : * resolution are equivalent.
2753 : */
2754 118 : if (abbrev_distinct > key_distinct * sss->prop_card)
2755 : {
2756 : /*
2757 : * When we have exceeded 10,000 tuples, decay required cardinality
2758 : * aggressively for next call.
2759 : *
2760 : * This is useful because the number of comparisons required on
2761 : * average increases at a linearithmic rate, and at roughly 10,000
2762 : * tuples that factor will start to dominate over the linear costs of
2763 : * string transformation (this is a conservative estimate). The decay
2764 : * rate is chosen to be a little less aggressive than halving -- which
2765 : * (since we're called at points at which memtupcount has doubled)
2766 : * would never see the cost model actually abort past the first call
2767 : * following a decay. This decay rate is mostly a precaution against
2768 : * a sudden, violent swing in how well abbreviated cardinality tracks
2769 : * full key cardinality. The decay also serves to prevent a marginal
2770 : * case from being aborted too late, when too much has already been
2771 : * invested in string transformation.
2772 : *
2773 : * It's possible for sets of several million distinct strings with
2774 : * mere tens of thousands of distinct abbreviated keys to still
2775 : * benefit very significantly. This will generally occur provided
2776 : * each abbreviated key is a proxy for a roughly uniform number of the
2777 : * set's full keys. If it isn't so, we hope to catch that early and
2778 : * abort. If it isn't caught early, by the time the problem is
2779 : * apparent it's probably not worth aborting.
2780 : */
2781 118 : if (memtupcount > 10000)
2782 0 : sss->prop_card *= 0.65;
2783 :
2784 118 : return false;
2785 : }
2786 :
2787 : /*
2788 : * Abort abbreviation strategy.
2789 : *
2790 : * The worst case, where all abbreviated keys are identical while all
2791 : * original strings differ will typically only see a regression of about
2792 : * 10% in execution time for small to medium sized lists of strings.
2793 : * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2794 : * often expect very large improvements, particularly with sets of strings
2795 : * of moderately high to high abbreviated cardinality. There is little to
2796 : * lose but much to gain, which our strategy reflects.
2797 : */
2798 : #ifdef TRACE_SORT
2799 0 : if (trace_sort)
2800 0 : elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2801 : "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2802 : memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2803 : #endif
2804 :
2805 0 : return true;
2806 : }
2807 :
2808 : /*
2809 : * Generic equalimage support function for character type's operator classes.
2810 : * Disables the use of deduplication with nondeterministic collations.
2811 : */
2812 : Datum
2813 24128 : btvarstrequalimage(PG_FUNCTION_ARGS)
2814 : {
2815 : /* Oid opcintype = PG_GETARG_OID(0); */
2816 24128 : Oid collid = PG_GET_COLLATION();
2817 :
2818 24128 : check_collation_set(collid);
2819 :
2820 24128 : if (lc_collate_is_c(collid) ||
2821 0 : collid == DEFAULT_COLLATION_OID ||
2822 0 : get_collation_isdeterministic(collid))
2823 24128 : PG_RETURN_BOOL(true);
2824 : else
2825 0 : PG_RETURN_BOOL(false);
2826 : }
2827 :
2828 : Datum
2829 184626 : text_larger(PG_FUNCTION_ARGS)
2830 : {
2831 184626 : text *arg1 = PG_GETARG_TEXT_PP(0);
2832 184626 : text *arg2 = PG_GETARG_TEXT_PP(1);
2833 : text *result;
2834 :
2835 184626 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2836 :
2837 184626 : PG_RETURN_TEXT_P(result);
2838 : }
2839 :
2840 : Datum
2841 41142 : text_smaller(PG_FUNCTION_ARGS)
2842 : {
2843 41142 : text *arg1 = PG_GETARG_TEXT_PP(0);
2844 41142 : text *arg2 = PG_GETARG_TEXT_PP(1);
2845 : text *result;
2846 :
2847 41142 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2848 :
2849 41142 : PG_RETURN_TEXT_P(result);
2850 : }
2851 :
2852 :
2853 : /*
2854 : * Cross-type comparison functions for types text and name.
2855 : */
2856 :
2857 : Datum
2858 188950 : nameeqtext(PG_FUNCTION_ARGS)
2859 : {
2860 188950 : Name arg1 = PG_GETARG_NAME(0);
2861 188950 : text *arg2 = PG_GETARG_TEXT_PP(1);
2862 188950 : size_t len1 = strlen(NameStr(*arg1));
2863 188950 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2864 188950 : Oid collid = PG_GET_COLLATION();
2865 : bool result;
2866 :
2867 188950 : check_collation_set(collid);
2868 :
2869 188950 : if (collid == C_COLLATION_OID)
2870 293376 : result = (len1 == len2 &&
2871 129564 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2872 : else
2873 25138 : result = (varstr_cmp(NameStr(*arg1), len1,
2874 25138 : VARDATA_ANY(arg2), len2,
2875 : collid) == 0);
2876 :
2877 188950 : PG_FREE_IF_COPY(arg2, 1);
2878 :
2879 188950 : PG_RETURN_BOOL(result);
2880 : }
2881 :
2882 : Datum
2883 384 : texteqname(PG_FUNCTION_ARGS)
2884 : {
2885 384 : text *arg1 = PG_GETARG_TEXT_PP(0);
2886 384 : Name arg2 = PG_GETARG_NAME(1);
2887 384 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2888 384 : size_t len2 = strlen(NameStr(*arg2));
2889 384 : Oid collid = PG_GET_COLLATION();
2890 : bool result;
2891 :
2892 384 : check_collation_set(collid);
2893 :
2894 384 : if (collid == C_COLLATION_OID)
2895 564 : result = (len1 == len2 &&
2896 180 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2897 : else
2898 0 : result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2899 0 : NameStr(*arg2), len2,
2900 : collid) == 0);
2901 :
2902 384 : PG_FREE_IF_COPY(arg1, 0);
2903 :
2904 384 : PG_RETURN_BOOL(result);
2905 : }
2906 :
2907 : Datum
2908 18 : namenetext(PG_FUNCTION_ARGS)
2909 : {
2910 18 : Name arg1 = PG_GETARG_NAME(0);
2911 18 : text *arg2 = PG_GETARG_TEXT_PP(1);
2912 18 : size_t len1 = strlen(NameStr(*arg1));
2913 18 : size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2914 18 : Oid collid = PG_GET_COLLATION();
2915 : bool result;
2916 :
2917 18 : check_collation_set(collid);
2918 :
2919 18 : if (collid == C_COLLATION_OID)
2920 18 : result = !(len1 == len2 &&
2921 0 : memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2922 : else
2923 0 : result = !(varstr_cmp(NameStr(*arg1), len1,
2924 0 : VARDATA_ANY(arg2), len2,
2925 : collid) == 0);
2926 :
2927 18 : PG_FREE_IF_COPY(arg2, 1);
2928 :
2929 18 : PG_RETURN_BOOL(result);
2930 : }
2931 :
2932 : Datum
2933 0 : textnename(PG_FUNCTION_ARGS)
2934 : {
2935 0 : text *arg1 = PG_GETARG_TEXT_PP(0);
2936 0 : Name arg2 = PG_GETARG_NAME(1);
2937 0 : size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2938 0 : size_t len2 = strlen(NameStr(*arg2));
2939 0 : Oid collid = PG_GET_COLLATION();
2940 : bool result;
2941 :
2942 0 : check_collation_set(collid);
2943 :
2944 0 : if (collid == C_COLLATION_OID)
2945 0 : result = !(len1 == len2 &&
2946 0 : memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2947 : else
2948 0 : result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2949 0 : NameStr(*arg2), len2,
2950 : collid) == 0);
2951 :
2952 0 : PG_FREE_IF_COPY(arg1, 0);
2953 :
2954 0 : PG_RETURN_BOOL(result);
2955 : }
2956 :
2957 : Datum
2958 139650 : btnametextcmp(PG_FUNCTION_ARGS)
2959 : {
2960 139650 : Name arg1 = PG_GETARG_NAME(0);
2961 139650 : text *arg2 = PG_GETARG_TEXT_PP(1);
2962 : int32 result;
2963 :
2964 279300 : result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2965 279300 : VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2966 : PG_GET_COLLATION());
2967 :
2968 139650 : PG_FREE_IF_COPY(arg2, 1);
2969 :
2970 139650 : PG_RETURN_INT32(result);
2971 : }
2972 :
2973 : Datum
2974 0 : bttextnamecmp(PG_FUNCTION_ARGS)
2975 : {
2976 0 : text *arg1 = PG_GETARG_TEXT_PP(0);
2977 0 : Name arg2 = PG_GETARG_NAME(1);
2978 : int32 result;
2979 :
2980 0 : result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2981 0 : NameStr(*arg2), strlen(NameStr(*arg2)),
2982 : PG_GET_COLLATION());
2983 :
2984 0 : PG_FREE_IF_COPY(arg1, 0);
2985 :
2986 0 : PG_RETURN_INT32(result);
2987 : }
2988 :
2989 : #define CmpCall(cmpfunc) \
2990 : DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2991 : PG_GET_COLLATION(), \
2992 : PG_GETARG_DATUM(0), \
2993 : PG_GETARG_DATUM(1)))
2994 :
2995 : Datum
2996 44776 : namelttext(PG_FUNCTION_ARGS)
2997 : {
2998 44776 : PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2999 : }
3000 :
3001 : Datum
3002 0 : nameletext(PG_FUNCTION_ARGS)
3003 : {
3004 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
3005 : }
3006 :
3007 : Datum
3008 0 : namegttext(PG_FUNCTION_ARGS)
3009 : {
3010 0 : PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
3011 : }
3012 :
3013 : Datum
3014 43292 : namegetext(PG_FUNCTION_ARGS)
3015 : {
3016 43292 : PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
3017 : }
3018 :
3019 : Datum
3020 0 : textltname(PG_FUNCTION_ARGS)
3021 : {
3022 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
3023 : }
3024 :
3025 : Datum
3026 0 : textlename(PG_FUNCTION_ARGS)
3027 : {
3028 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
3029 : }
3030 :
3031 : Datum
3032 0 : textgtname(PG_FUNCTION_ARGS)
3033 : {
3034 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3035 : }
3036 :
3037 : Datum
3038 0 : textgename(PG_FUNCTION_ARGS)
3039 : {
3040 0 : PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3041 : }
3042 :
3043 : #undef CmpCall
3044 :
3045 :
3046 : /*
3047 : * The following operators support character-by-character comparison
3048 : * of text datums, to allow building indexes suitable for LIKE clauses.
3049 : * Note that the regular texteq/textne comparison operators, and regular
3050 : * support functions 1 and 2 with "C" collation are assumed to be
3051 : * compatible with these!
3052 : */
3053 :
3054 : static int
3055 152080 : internal_text_pattern_compare(text *arg1, text *arg2)
3056 : {
3057 : int result;
3058 : int len1,
3059 : len2;
3060 :
3061 152080 : len1 = VARSIZE_ANY_EXHDR(arg1);
3062 152080 : len2 = VARSIZE_ANY_EXHDR(arg2);
3063 :
3064 152080 : result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3065 152080 : if (result != 0)
3066 152026 : return result;
3067 54 : else if (len1 < len2)
3068 0 : return -1;
3069 54 : else if (len1 > len2)
3070 18 : return 1;
3071 : else
3072 36 : return 0;
3073 : }
3074 :
3075 :
3076 : Datum
3077 39538 : text_pattern_lt(PG_FUNCTION_ARGS)
3078 : {
3079 39538 : text *arg1 = PG_GETARG_TEXT_PP(0);
3080 39538 : text *arg2 = PG_GETARG_TEXT_PP(1);
3081 : int result;
3082 :
3083 39538 : result = internal_text_pattern_compare(arg1, arg2);
3084 :
3085 39538 : PG_FREE_IF_COPY(arg1, 0);
3086 39538 : PG_FREE_IF_COPY(arg2, 1);
3087 :
3088 39538 : PG_RETURN_BOOL(result < 0);
3089 : }
3090 :
3091 :
3092 : Datum
3093 37510 : text_pattern_le(PG_FUNCTION_ARGS)
3094 : {
3095 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
3096 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
3097 : int result;
3098 :
3099 37510 : result = internal_text_pattern_compare(arg1, arg2);
3100 :
3101 37510 : PG_FREE_IF_COPY(arg1, 0);
3102 37510 : PG_FREE_IF_COPY(arg2, 1);
3103 :
3104 37510 : PG_RETURN_BOOL(result <= 0);
3105 : }
3106 :
3107 :
3108 : Datum
3109 37510 : text_pattern_ge(PG_FUNCTION_ARGS)
3110 : {
3111 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
3112 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
3113 : int result;
3114 :
3115 37510 : result = internal_text_pattern_compare(arg1, arg2);
3116 :
3117 37510 : PG_FREE_IF_COPY(arg1, 0);
3118 37510 : PG_FREE_IF_COPY(arg2, 1);
3119 :
3120 37510 : PG_RETURN_BOOL(result >= 0);
3121 : }
3122 :
3123 :
3124 : Datum
3125 37510 : text_pattern_gt(PG_FUNCTION_ARGS)
3126 : {
3127 37510 : text *arg1 = PG_GETARG_TEXT_PP(0);
3128 37510 : text *arg2 = PG_GETARG_TEXT_PP(1);
3129 : int result;
3130 :
3131 37510 : result = internal_text_pattern_compare(arg1, arg2);
3132 :
3133 37510 : PG_FREE_IF_COPY(arg1, 0);
3134 37510 : PG_FREE_IF_COPY(arg2, 1);
3135 :
3136 37510 : PG_RETURN_BOOL(result > 0);
3137 : }
3138 :
3139 :
3140 : Datum
3141 12 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
3142 : {
3143 12 : text *arg1 = PG_GETARG_TEXT_PP(0);
3144 12 : text *arg2 = PG_GETARG_TEXT_PP(1);
3145 : int result;
3146 :
3147 12 : result = internal_text_pattern_compare(arg1, arg2);
3148 :
3149 12 : PG_FREE_IF_COPY(arg1, 0);
3150 12 : PG_FREE_IF_COPY(arg2, 1);
3151 :
3152 12 : PG_RETURN_INT32(result);
3153 : }
3154 :
3155 :
3156 : Datum
3157 116 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3158 : {
3159 116 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3160 : MemoryContext oldcontext;
3161 :
3162 116 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3163 :
3164 : /* Use generic string SortSupport, forcing "C" collation */
3165 116 : varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3166 :
3167 116 : MemoryContextSwitchTo(oldcontext);
3168 :
3169 116 : PG_RETURN_VOID();
3170 : }
3171 :
3172 :
3173 : /*-------------------------------------------------------------
3174 : * byteaoctetlen
3175 : *
3176 : * get the number of bytes contained in an instance of type 'bytea'
3177 : *-------------------------------------------------------------
3178 : */
3179 : Datum
3180 302 : byteaoctetlen(PG_FUNCTION_ARGS)
3181 : {
3182 302 : Datum str = PG_GETARG_DATUM(0);
3183 :
3184 : /* We need not detoast the input at all */
3185 302 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3186 : }
3187 :
3188 : /*
3189 : * byteacat -
3190 : * takes two bytea* and returns a bytea* that is the concatenation of
3191 : * the two.
3192 : *
3193 : * Cloned from textcat and modified as required.
3194 : */
3195 : Datum
3196 1520 : byteacat(PG_FUNCTION_ARGS)
3197 : {
3198 1520 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3199 1520 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3200 :
3201 1520 : PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3202 : }
3203 :
3204 : /*
3205 : * bytea_catenate
3206 : * Guts of byteacat(), broken out so it can be used by other functions
3207 : *
3208 : * Arguments can be in short-header form, but not compressed or out-of-line
3209 : */
3210 : static bytea *
3211 1556 : bytea_catenate(bytea *t1, bytea *t2)
3212 : {
3213 : bytea *result;
3214 : int len1,
3215 : len2,
3216 : len;
3217 : char *ptr;
3218 :
3219 1556 : len1 = VARSIZE_ANY_EXHDR(t1);
3220 1556 : len2 = VARSIZE_ANY_EXHDR(t2);
3221 :
3222 : /* paranoia ... probably should throw error instead? */
3223 1556 : if (len1 < 0)
3224 0 : len1 = 0;
3225 1556 : if (len2 < 0)
3226 0 : len2 = 0;
3227 :
3228 1556 : len = len1 + len2 + VARHDRSZ;
3229 1556 : result = (bytea *) palloc(len);
3230 :
3231 : /* Set size of result string... */
3232 1556 : SET_VARSIZE(result, len);
3233 :
3234 : /* Fill data field of result string... */
3235 1556 : ptr = VARDATA(result);
3236 1556 : if (len1 > 0)
3237 1556 : memcpy(ptr, VARDATA_ANY(t1), len1);
3238 1556 : if (len2 > 0)
3239 1538 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3240 :
3241 1556 : return result;
3242 : }
3243 :
/* Build a bytea from a C string by running it through byteain(). */
#define PG_STR_GET_BYTEA(cstr_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(cstr_)))
3246 :
3247 : /*
3248 : * bytea_substr()
3249 : * Return a substring starting at the specified position.
3250 : * Cloned from text_substr and modified as required.
3251 : *
3252 : * Input:
3253 : * - string
3254 : * - starting position (is one-based)
3255 : * - string length (optional)
3256 : *
3257 : * If the starting position is zero or less, then return from the start of the string
3258 : * adjusting the length to be consistent with the "negative start" per SQL.
3259 : * If the length is less than zero, an ERROR is thrown. If no third argument
3260 : * (length) is provided, the length to the end of the string is assumed.
3261 : */
3262 : Datum
3263 82 : bytea_substr(PG_FUNCTION_ARGS)
3264 : {
3265 82 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3266 : PG_GETARG_INT32(1),
3267 : PG_GETARG_INT32(2),
3268 : false));
3269 : }
3270 :
3271 : /*
3272 : * bytea_substr_no_len -
3273 : * Wrapper to avoid opr_sanity failure due to
3274 : * one function accepting a different number of args.
3275 : */
3276 : Datum
3277 3900 : bytea_substr_no_len(PG_FUNCTION_ARGS)
3278 : {
3279 3900 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3280 : PG_GETARG_INT32(1),
3281 : -1,
3282 : true));
3283 : }
3284 :
3285 : static bytea *
3286 4018 : bytea_substring(Datum str,
3287 : int S,
3288 : int L,
3289 : bool length_not_specified)
3290 : {
3291 : int32 S1; /* adjusted start position */
3292 : int32 L1; /* adjusted substring length */
3293 : int32 E; /* end position */
3294 :
3295 : /*
3296 : * The logic here should generally match text_substring().
3297 : */
3298 4018 : S1 = Max(S, 1);
3299 :
3300 4018 : if (length_not_specified)
3301 : {
3302 : /*
3303 : * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3304 : * end of the string if we pass it a negative value for length.
3305 : */
3306 3918 : L1 = -1;
3307 : }
3308 100 : else if (L < 0)
3309 : {
3310 : /* SQL99 says to throw an error for E < S, i.e., negative length */
3311 12 : ereport(ERROR,
3312 : (errcode(ERRCODE_SUBSTRING_ERROR),
3313 : errmsg("negative substring length not allowed")));
3314 : L1 = -1; /* silence stupider compilers */
3315 : }
3316 88 : else if (pg_add_s32_overflow(S, L, &E))
3317 : {
3318 : /*
3319 : * L could be large enough for S + L to overflow, in which case the
3320 : * substring must run to end of string.
3321 : */
3322 6 : L1 = -1;
3323 : }
3324 : else
3325 : {
3326 : /*
3327 : * A zero or negative value for the end position can happen if the
3328 : * start was negative or one. SQL99 says to return a zero-length
3329 : * string.
3330 : */
3331 82 : if (E < 1)
3332 0 : return PG_STR_GET_BYTEA("");
3333 :
3334 82 : L1 = E - S1;
3335 : }
3336 :
3337 : /*
3338 : * If the start position is past the end of the string, SQL99 says to
3339 : * return a zero-length string -- DatumGetByteaPSlice() will do that for
3340 : * us. We need only convert S1 to zero-based starting position.
3341 : */
3342 4006 : return DatumGetByteaPSlice(str, S1 - 1, L1);
3343 : }
3344 :
3345 : /*
3346 : * byteaoverlay
3347 : * Replace specified substring of first string with second
3348 : *
3349 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3350 : * This code is a direct implementation of what the standard says.
3351 : */
3352 : Datum
3353 6 : byteaoverlay(PG_FUNCTION_ARGS)
3354 : {
3355 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3356 6 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3357 6 : int sp = PG_GETARG_INT32(2); /* substring start position */
3358 6 : int sl = PG_GETARG_INT32(3); /* substring length */
3359 :
3360 6 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3361 : }
3362 :
3363 : Datum
3364 12 : byteaoverlay_no_len(PG_FUNCTION_ARGS)
3365 : {
3366 12 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3367 12 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3368 12 : int sp = PG_GETARG_INT32(2); /* substring start position */
3369 : int sl;
3370 :
3371 12 : sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3372 12 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3373 : }
3374 :
3375 : static bytea *
3376 18 : bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3377 : {
3378 : bytea *result;
3379 : bytea *s1;
3380 : bytea *s2;
3381 : int sp_pl_sl;
3382 :
3383 : /*
3384 : * Check for possible integer-overflow cases. For negative sp, throw a
3385 : * "substring length" error because that's what should be expected
3386 : * according to the spec's definition of OVERLAY().
3387 : */
3388 18 : if (sp <= 0)
3389 0 : ereport(ERROR,
3390 : (errcode(ERRCODE_SUBSTRING_ERROR),
3391 : errmsg("negative substring length not allowed")));
3392 18 : if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3393 0 : ereport(ERROR,
3394 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3395 : errmsg("integer out of range")));
3396 :
3397 18 : s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3398 18 : s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3399 18 : result = bytea_catenate(s1, t2);
3400 18 : result = bytea_catenate(result, s2);
3401 :
3402 18 : return result;
3403 : }
3404 :
3405 : /*
3406 : * bit_count
3407 : */
3408 : Datum
3409 6 : bytea_bit_count(PG_FUNCTION_ARGS)
3410 : {
3411 6 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3412 :
3413 6 : PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3414 : }
3415 :
3416 : /*
3417 : * byteapos -
3418 : * Return the position of the specified substring.
3419 : * Implements the SQL POSITION() function.
3420 : * Cloned from textpos and modified as required.
3421 : */
3422 : Datum
3423 0 : byteapos(PG_FUNCTION_ARGS)
3424 : {
3425 0 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
3426 0 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
3427 : int pos;
3428 : int px,
3429 : p;
3430 : int len1,
3431 : len2;
3432 : char *p1,
3433 : *p2;
3434 :
3435 0 : len1 = VARSIZE_ANY_EXHDR(t1);
3436 0 : len2 = VARSIZE_ANY_EXHDR(t2);
3437 :
3438 0 : if (len2 <= 0)
3439 0 : PG_RETURN_INT32(1); /* result for empty pattern */
3440 :
3441 0 : p1 = VARDATA_ANY(t1);
3442 0 : p2 = VARDATA_ANY(t2);
3443 :
3444 0 : pos = 0;
3445 0 : px = (len1 - len2);
3446 0 : for (p = 0; p <= px; p++)
3447 : {
3448 0 : if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3449 : {
3450 0 : pos = p + 1;
3451 0 : break;
3452 : };
3453 0 : p1++;
3454 : };
3455 :
3456 0 : PG_RETURN_INT32(pos);
3457 : }
3458 :
3459 : /*-------------------------------------------------------------
3460 : * byteaGetByte
3461 : *
3462 : * this routine treats "bytea" as an array of bytes.
3463 : * It returns the Nth byte (a number between 0 and 255).
3464 : *-------------------------------------------------------------
3465 : */
3466 : Datum
3467 34 : byteaGetByte(PG_FUNCTION_ARGS)
3468 : {
3469 34 : bytea *v = PG_GETARG_BYTEA_PP(0);
3470 34 : int32 n = PG_GETARG_INT32(1);
3471 : int len;
3472 : int byte;
3473 :
3474 34 : len = VARSIZE_ANY_EXHDR(v);
3475 :
3476 34 : if (n < 0 || n >= len)
3477 6 : ereport(ERROR,
3478 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3479 : errmsg("index %d out of valid range, 0..%d",
3480 : n, len - 1)));
3481 :
3482 28 : byte = ((unsigned char *) VARDATA_ANY(v))[n];
3483 :
3484 28 : PG_RETURN_INT32(byte);
3485 : }
3486 :
3487 : /*-------------------------------------------------------------
3488 : * byteaGetBit
3489 : *
3490 : * This routine treats a "bytea" type like an array of bits.
3491 : * It returns the value of the Nth bit (0 or 1).
3492 : *
3493 : *-------------------------------------------------------------
3494 : */
3495 : Datum
3496 12 : byteaGetBit(PG_FUNCTION_ARGS)
3497 : {
3498 12 : bytea *v = PG_GETARG_BYTEA_PP(0);
3499 12 : int64 n = PG_GETARG_INT64(1);
3500 : int byteNo,
3501 : bitNo;
3502 : int len;
3503 : int byte;
3504 :
3505 12 : len = VARSIZE_ANY_EXHDR(v);
3506 :
3507 12 : if (n < 0 || n >= (int64) len * 8)
3508 6 : ereport(ERROR,
3509 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3510 : errmsg("index %lld out of valid range, 0..%lld",
3511 : (long long) n, (long long) len * 8 - 1)));
3512 :
3513 : /* n/8 is now known < len, so safe to cast to int */
3514 6 : byteNo = (int) (n / 8);
3515 6 : bitNo = (int) (n % 8);
3516 :
3517 6 : byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3518 :
3519 6 : if (byte & (1 << bitNo))
3520 6 : PG_RETURN_INT32(1);
3521 : else
3522 0 : PG_RETURN_INT32(0);
3523 : }
3524 :
3525 : /*-------------------------------------------------------------
3526 : * byteaSetByte
3527 : *
3528 : * Given an instance of type 'bytea' creates a new one with
3529 : * the Nth byte set to the given value.
3530 : *
3531 : *-------------------------------------------------------------
3532 : */
3533 : Datum
3534 12 : byteaSetByte(PG_FUNCTION_ARGS)
3535 : {
3536 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3537 12 : int32 n = PG_GETARG_INT32(1);
3538 12 : int32 newByte = PG_GETARG_INT32(2);
3539 : int len;
3540 :
3541 12 : len = VARSIZE(res) - VARHDRSZ;
3542 :
3543 12 : if (n < 0 || n >= len)
3544 6 : ereport(ERROR,
3545 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3546 : errmsg("index %d out of valid range, 0..%d",
3547 : n, len - 1)));
3548 :
3549 : /*
3550 : * Now set the byte.
3551 : */
3552 6 : ((unsigned char *) VARDATA(res))[n] = newByte;
3553 :
3554 6 : PG_RETURN_BYTEA_P(res);
3555 : }
3556 :
3557 : /*-------------------------------------------------------------
3558 : * byteaSetBit
3559 : *
3560 : * Given an instance of type 'bytea' creates a new one with
3561 : * the Nth bit set to the given value.
3562 : *
3563 : *-------------------------------------------------------------
3564 : */
3565 : Datum
3566 12 : byteaSetBit(PG_FUNCTION_ARGS)
3567 : {
3568 12 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3569 12 : int64 n = PG_GETARG_INT64(1);
3570 12 : int32 newBit = PG_GETARG_INT32(2);
3571 : int len;
3572 : int oldByte,
3573 : newByte;
3574 : int byteNo,
3575 : bitNo;
3576 :
3577 12 : len = VARSIZE(res) - VARHDRSZ;
3578 :
3579 12 : if (n < 0 || n >= (int64) len * 8)
3580 6 : ereport(ERROR,
3581 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3582 : errmsg("index %lld out of valid range, 0..%lld",
3583 : (long long) n, (long long) len * 8 - 1)));
3584 :
3585 : /* n/8 is now known < len, so safe to cast to int */
3586 6 : byteNo = (int) (n / 8);
3587 6 : bitNo = (int) (n % 8);
3588 :
3589 : /*
3590 : * sanity check!
3591 : */
3592 6 : if (newBit != 0 && newBit != 1)
3593 0 : ereport(ERROR,
3594 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3595 : errmsg("new bit must be 0 or 1")));
3596 :
3597 : /*
3598 : * Update the byte.
3599 : */
3600 6 : oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3601 :
3602 6 : if (newBit == 0)
3603 6 : newByte = oldByte & (~(1 << bitNo));
3604 : else
3605 0 : newByte = oldByte | (1 << bitNo);
3606 :
3607 6 : ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3608 :
3609 6 : PG_RETURN_BYTEA_P(res);
3610 : }
3611 :
3612 :
3613 : /* text_name()
3614 : * Converts a text type to a Name type.
3615 : */
3616 : Datum
3617 33972 : text_name(PG_FUNCTION_ARGS)
3618 : {
3619 33972 : text *s = PG_GETARG_TEXT_PP(0);
3620 : Name result;
3621 : int len;
3622 :
3623 33972 : len = VARSIZE_ANY_EXHDR(s);
3624 :
3625 : /* Truncate oversize input */
3626 33972 : if (len >= NAMEDATALEN)
3627 6 : len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3628 :
3629 : /* We use palloc0 here to ensure result is zero-padded */
3630 33972 : result = (Name) palloc0(NAMEDATALEN);
3631 33972 : memcpy(NameStr(*result), VARDATA_ANY(s), len);
3632 :
3633 33972 : PG_RETURN_NAME(result);
3634 : }
3635 :
3636 : /* name_text()
3637 : * Converts a Name type to a text type.
3638 : */
3639 : Datum
3640 977918 : name_text(PG_FUNCTION_ARGS)
3641 : {
3642 977918 : Name s = PG_GETARG_NAME(0);
3643 :
3644 977918 : PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3645 : }
3646 :
3647 :
3648 : /*
3649 : * textToQualifiedNameList - convert a text object to list of names
3650 : *
3651 : * This implements the input parsing needed by nextval() and other
3652 : * functions that take a text parameter representing a qualified name.
3653 : * We split the name at dots, downcase if not double-quoted, and
3654 : * truncate names if they're too long.
3655 : */
3656 : List *
3657 1334 : textToQualifiedNameList(text *textval)
3658 : {
3659 : char *rawname;
3660 1334 : List *result = NIL;
3661 : List *namelist;
3662 : ListCell *l;
3663 :
3664 : /* Convert to C string (handles possible detoasting). */
3665 : /* Note we rely on being able to modify rawname below. */
3666 1334 : rawname = text_to_cstring(textval);
3667 :
3668 1334 : if (!SplitIdentifierString(rawname, '.', &namelist))
3669 0 : ereport(ERROR,
3670 : (errcode(ERRCODE_INVALID_NAME),
3671 : errmsg("invalid name syntax")));
3672 :
3673 1334 : if (namelist == NIL)
3674 0 : ereport(ERROR,
3675 : (errcode(ERRCODE_INVALID_NAME),
3676 : errmsg("invalid name syntax")));
3677 :
3678 2778 : foreach(l, namelist)
3679 : {
3680 1444 : char *curname = (char *) lfirst(l);
3681 :
3682 1444 : result = lappend(result, makeString(pstrdup(curname)));
3683 : }
3684 :
3685 1334 : pfree(rawname);
3686 1334 : list_free(namelist);
3687 :
3688 1334 : return result;
3689 : }
3690 :
3691 : /*
3692 : * SplitIdentifierString --- parse a string containing identifiers
3693 : *
3694 : * This is the guts of textToQualifiedNameList, and is exported for use in
3695 : * other situations such as parsing GUC variables. In the GUC case, it's
3696 : * important to avoid memory leaks, so the API is designed to minimize the
3697 : * amount of stuff that needs to be allocated and freed.
3698 : *
3699 : * Inputs:
3700 : * rawstring: the input string; must be overwritable! On return, it's
3701 : * been modified to contain the separated identifiers.
3702 : * separator: the separator punctuation expected between identifiers
3703 : * (typically '.' or ','). Whitespace may also appear around
3704 : * identifiers.
3705 : * Outputs:
3706 : * namelist: filled with a palloc'd list of pointers to identifiers within
3707 : * rawstring. Caller should list_free() this even on error return.
3708 : *
3709 : * Returns true if okay, false if there is a syntax error in the string.
3710 : *
3711 : * Note that an empty string is considered okay here, though not in
3712 : * textToQualifiedNameList.
3713 : */
3714 : bool
3715 121256 : SplitIdentifierString(char *rawstring, char separator,
3716 : List **namelist)
3717 : {
3718 121256 : char *nextp = rawstring;
3719 121256 : bool done = false;
3720 :
3721 121256 : *namelist = NIL;
3722 :
3723 121256 : while (scanner_isspace(*nextp))
3724 0 : nextp++; /* skip leading whitespace */
3725 :
3726 121256 : if (*nextp == '\0')
3727 14042 : return true; /* allow empty string */
3728 :
3729 : /* At the top of the loop, we are at start of a new identifier. */
3730 : do
3731 : {
3732 : char *curname;
3733 : char *endp;
3734 :
3735 173396 : if (*nextp == '"')
3736 : {
3737 : /* Quoted name --- collapse quote-quote pairs, no downcasing */
3738 25680 : curname = nextp + 1;
3739 : for (;;)
3740 : {
3741 25684 : endp = strchr(nextp + 1, '"');
3742 25682 : if (endp == NULL)
3743 0 : return false; /* mismatched quotes */
3744 25682 : if (endp[1] != '"')
3745 25680 : break; /* found end of quoted name */
3746 : /* Collapse adjacent quotes into one quote, and look again */
3747 2 : memmove(endp, endp + 1, strlen(endp));
3748 2 : nextp = endp;
3749 : }
3750 : /* endp now points at the terminating quote */
3751 25680 : nextp = endp + 1;
3752 : }
3753 : else
3754 : {
3755 : /* Unquoted name --- extends to separator or whitespace */
3756 : char *downname;
3757 : int len;
3758 :
3759 147716 : curname = nextp;
3760 1277074 : while (*nextp && *nextp != separator &&
3761 1129360 : !scanner_isspace(*nextp))
3762 1129358 : nextp++;
3763 147716 : endp = nextp;
3764 147716 : if (curname == nextp)
3765 0 : return false; /* empty unquoted name not allowed */
3766 :
3767 : /*
3768 : * Downcase the identifier, using same code as main lexer does.
3769 : *
3770 : * XXX because we want to overwrite the input in-place, we cannot
3771 : * support a downcasing transformation that increases the string
3772 : * length. This is not a problem given the current implementation
3773 : * of downcase_truncate_identifier, but we'll probably have to do
3774 : * something about this someday.
3775 : */
3776 147716 : len = endp - curname;
3777 147716 : downname = downcase_truncate_identifier(curname, len, false);
3778 : Assert(strlen(downname) <= len);
3779 147716 : strncpy(curname, downname, len); /* strncpy is required here */
3780 147716 : pfree(downname);
3781 : }
3782 :
3783 173398 : while (scanner_isspace(*nextp))
3784 2 : nextp++; /* skip trailing whitespace */
3785 :
3786 173396 : if (*nextp == separator)
3787 : {
3788 66182 : nextp++;
3789 104482 : while (scanner_isspace(*nextp))
3790 38300 : nextp++; /* skip leading whitespace for next */
3791 : /* we expect another name, so done remains false */
3792 : }
3793 107214 : else if (*nextp == '\0')
3794 107212 : done = true;
3795 : else
3796 2 : return false; /* invalid syntax */
3797 :
3798 : /* Now safe to overwrite separator with a null */
3799 173394 : *endp = '\0';
3800 :
3801 : /* Truncate name if it's overlength */
3802 173394 : truncate_identifier(curname, strlen(curname), false);
3803 :
3804 : /*
3805 : * Finished isolating current name --- add it to list
3806 : */
3807 173394 : *namelist = lappend(*namelist, curname);
3808 :
3809 : /* Loop back if we didn't reach end of string */
3810 173394 : } while (!done);
3811 :
3812 107212 : return true;
3813 : }
3814 :
3815 :
3816 : /*
3817 : * SplitDirectoriesString --- parse a string containing file/directory names
3818 : *
3819 : * This works fine on file names too; the function name is historical.
3820 : *
3821 : * This is similar to SplitIdentifierString, except that the parsing
3822 : * rules are meant to handle pathnames instead of identifiers: there is
3823 : * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3824 : * and we apply canonicalize_path() to each extracted string. Because of the
3825 : * last, the returned strings are separately palloc'd rather than being
3826 : * pointers into rawstring --- but we still scribble on rawstring.
3827 : *
3828 : * Inputs:
3829 : * rawstring: the input string; must be modifiable!
3830 : * separator: the separator punctuation expected between directories
3831 : * (typically ',' or ';'). Whitespace may also appear around
3832 : * directories.
3833 : * Outputs:
3834 : * namelist: filled with a palloc'd list of directory names.
3835 : * Caller should list_free_deep() this even on error return.
3836 : *
3837 : * Returns true if okay, false if there is a syntax error in the string.
3838 : *
3839 : * Note that an empty string is considered okay here.
3840 : */
3841 : bool
3842 1058 : SplitDirectoriesString(char *rawstring, char separator,
3843 : List **namelist)
3844 : {
3845 1058 : char *nextp = rawstring;
3846 1058 : bool done = false;
3847 :
3848 1058 : *namelist = NIL;
3849 :
3850 1058 : while (scanner_isspace(*nextp))
3851 0 : nextp++; /* skip leading whitespace */
3852 :
3853 1058 : if (*nextp == '\0')
3854 2 : return true; /* allow empty string */
3855 :
3856 : /* At the top of the loop, we are at start of a new directory. */
3857 : do
3858 : {
3859 : char *curname;
3860 : char *endp;
3861 :
3862 1056 : if (*nextp == '"')
3863 : {
3864 : /* Quoted name --- collapse quote-quote pairs */
3865 0 : curname = nextp + 1;
3866 : for (;;)
3867 : {
3868 0 : endp = strchr(nextp + 1, '"');
3869 0 : if (endp == NULL)
3870 0 : return false; /* mismatched quotes */
3871 0 : if (endp[1] != '"')
3872 0 : break; /* found end of quoted name */
3873 : /* Collapse adjacent quotes into one quote, and look again */
3874 0 : memmove(endp, endp + 1, strlen(endp));
3875 0 : nextp = endp;
3876 : }
3877 : /* endp now points at the terminating quote */
3878 0 : nextp = endp + 1;
3879 : }
3880 : else
3881 : {
3882 : /* Unquoted name --- extends to separator or end of string */
3883 1056 : curname = endp = nextp;
3884 18194 : while (*nextp && *nextp != separator)
3885 : {
3886 : /* trailing whitespace should not be included in name */
3887 17138 : if (!scanner_isspace(*nextp))
3888 17138 : endp = nextp + 1;
3889 17138 : nextp++;
3890 : }
3891 1056 : if (curname == endp)
3892 0 : return false; /* empty unquoted name not allowed */
3893 : }
3894 :
3895 1056 : while (scanner_isspace(*nextp))
3896 0 : nextp++; /* skip trailing whitespace */
3897 :
3898 1056 : if (*nextp == separator)
3899 : {
3900 0 : nextp++;
3901 0 : while (scanner_isspace(*nextp))
3902 0 : nextp++; /* skip leading whitespace for next */
3903 : /* we expect another name, so done remains false */
3904 : }
3905 1056 : else if (*nextp == '\0')
3906 1056 : done = true;
3907 : else
3908 0 : return false; /* invalid syntax */
3909 :
3910 : /* Now safe to overwrite separator with a null */
3911 1056 : *endp = '\0';
3912 :
3913 : /* Truncate path if it's overlength */
3914 1056 : if (strlen(curname) >= MAXPGPATH)
3915 0 : curname[MAXPGPATH - 1] = '\0';
3916 :
3917 : /*
3918 : * Finished isolating current name --- add it to list
3919 : */
3920 1056 : curname = pstrdup(curname);
3921 1056 : canonicalize_path(curname);
3922 1056 : *namelist = lappend(*namelist, curname);
3923 :
3924 : /* Loop back if we didn't reach end of string */
3925 1056 : } while (!done);
3926 :
3927 1056 : return true;
3928 : }
3929 :
3930 :
3931 : /*
3932 : * SplitGUCList --- parse a string containing identifiers or file names
3933 : *
3934 : * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3935 : * presuming whether the elements will be taken as identifiers or file names.
3936 : * We assume the input has already been through flatten_set_variable_args(),
3937 : * so that we need never downcase (if appropriate, that was done already).
3938 : * Nor do we ever truncate, since we don't know the correct max length.
3939 : * We disallow embedded whitespace for simplicity (it shouldn't matter,
3940 : * because any embedded whitespace should have led to double-quoting).
3941 : * Otherwise the API is identical to SplitIdentifierString.
3942 : *
3943 : * XXX it's annoying to have so many copies of this string-splitting logic.
3944 : * However, it's not clear that having one function with a bunch of option
3945 : * flags would be much better.
3946 : *
3947 : * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3948 : * Be sure to update that if you have to change this.
3949 : *
3950 : * Inputs:
3951 : * rawstring: the input string; must be overwritable! On return, it's
3952 : * been modified to contain the separated identifiers.
3953 : * separator: the separator punctuation expected between identifiers
3954 : * (typically '.' or ','). Whitespace may also appear around
3955 : * identifiers.
3956 : * Outputs:
3957 : * namelist: filled with a palloc'd list of pointers to identifiers within
3958 : * rawstring. Caller should list_free() this even on error return.
3959 : *
3960 : * Returns true if okay, false if there is a syntax error in the string.
3961 : */
3962 : bool
3963 1056 : SplitGUCList(char *rawstring, char separator,
3964 : List **namelist)
3965 : {
3966 1056 : char *nextp = rawstring;
3967 1056 : bool done = false;
3968 :
3969 1056 : *namelist = NIL;
3970 :
3971 1056 : while (scanner_isspace(*nextp))
3972 0 : nextp++; /* skip leading whitespace */
3973 :
3974 1056 : if (*nextp == '\0')
3975 1000 : return true; /* allow empty string */
3976 :
3977 : /* At the top of the loop, we are at start of a new identifier. */
3978 : do
3979 : {
3980 : char *curname;
3981 : char *endp;
3982 :
3983 74 : if (*nextp == '"')
3984 : {
3985 : /* Quoted name --- collapse quote-quote pairs */
3986 24 : curname = nextp + 1;
3987 : for (;;)
3988 : {
3989 36 : endp = strchr(nextp + 1, '"');
3990 30 : if (endp == NULL)
3991 0 : return false; /* mismatched quotes */
3992 30 : if (endp[1] != '"')
3993 24 : break; /* found end of quoted name */
3994 : /* Collapse adjacent quotes into one quote, and look again */
3995 6 : memmove(endp, endp + 1, strlen(endp));
3996 6 : nextp = endp;
3997 : }
3998 : /* endp now points at the terminating quote */
3999 24 : nextp = endp + 1;
4000 : }
4001 : else
4002 : {
4003 : /* Unquoted name --- extends to separator or whitespace */
4004 50 : curname = nextp;
4005 506 : while (*nextp && *nextp != separator &&
4006 456 : !scanner_isspace(*nextp))
4007 456 : nextp++;
4008 50 : endp = nextp;
4009 50 : if (curname == nextp)
4010 0 : return false; /* empty unquoted name not allowed */
4011 : }
4012 :
4013 74 : while (scanner_isspace(*nextp))
4014 0 : nextp++; /* skip trailing whitespace */
4015 :
4016 74 : if (*nextp == separator)
4017 : {
4018 18 : nextp++;
4019 36 : while (scanner_isspace(*nextp))
4020 18 : nextp++; /* skip leading whitespace for next */
4021 : /* we expect another name, so done remains false */
4022 : }
4023 56 : else if (*nextp == '\0')
4024 56 : done = true;
4025 : else
4026 0 : return false; /* invalid syntax */
4027 :
4028 : /* Now safe to overwrite separator with a null */
4029 74 : *endp = '\0';
4030 :
4031 : /*
4032 : * Finished isolating current name --- add it to list
4033 : */
4034 74 : *namelist = lappend(*namelist, curname);
4035 :
4036 : /* Loop back if we didn't reach end of string */
4037 74 : } while (!done);
4038 :
4039 56 : return true;
4040 : }
4041 :
4042 :
4043 : /*****************************************************************************
4044 : * Comparison Functions used for bytea
4045 : *
4046 : * Note: btree indexes need these routines not to leak memory; therefore,
4047 : * be careful to free working copies of toasted datums. Most places don't
4048 : * need to be so careful.
4049 : *****************************************************************************/
4050 :
4051 : Datum
4052 10370 : byteaeq(PG_FUNCTION_ARGS)
4053 : {
4054 10370 : Datum arg1 = PG_GETARG_DATUM(0);
4055 10370 : Datum arg2 = PG_GETARG_DATUM(1);
4056 : bool result;
4057 : Size len1,
4058 : len2;
4059 :
4060 : /*
4061 : * We can use a fast path for unequal lengths, which might save us from
4062 : * having to detoast one or both values.
4063 : */
4064 10370 : len1 = toast_raw_datum_size(arg1);
4065 10370 : len2 = toast_raw_datum_size(arg2);
4066 10370 : if (len1 != len2)
4067 4304 : result = false;
4068 : else
4069 : {
4070 6066 : bytea *barg1 = DatumGetByteaPP(arg1);
4071 6066 : bytea *barg2 = DatumGetByteaPP(arg2);
4072 :
4073 6066 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4074 : len1 - VARHDRSZ) == 0);
4075 :
4076 6066 : PG_FREE_IF_COPY(barg1, 0);
4077 6066 : PG_FREE_IF_COPY(barg2, 1);
4078 : }
4079 :
4080 10370 : PG_RETURN_BOOL(result);
4081 : }
4082 :
4083 : Datum
4084 768 : byteane(PG_FUNCTION_ARGS)
4085 : {
4086 768 : Datum arg1 = PG_GETARG_DATUM(0);
4087 768 : Datum arg2 = PG_GETARG_DATUM(1);
4088 : bool result;
4089 : Size len1,
4090 : len2;
4091 :
4092 : /*
4093 : * We can use a fast path for unequal lengths, which might save us from
4094 : * having to detoast one or both values.
4095 : */
4096 768 : len1 = toast_raw_datum_size(arg1);
4097 768 : len2 = toast_raw_datum_size(arg2);
4098 768 : if (len1 != len2)
4099 0 : result = true;
4100 : else
4101 : {
4102 768 : bytea *barg1 = DatumGetByteaPP(arg1);
4103 768 : bytea *barg2 = DatumGetByteaPP(arg2);
4104 :
4105 768 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4106 : len1 - VARHDRSZ) != 0);
4107 :
4108 768 : PG_FREE_IF_COPY(barg1, 0);
4109 768 : PG_FREE_IF_COPY(barg2, 1);
4110 : }
4111 :
4112 768 : PG_RETURN_BOOL(result);
4113 : }
4114 :
4115 : Datum
4116 7052 : bytealt(PG_FUNCTION_ARGS)
4117 : {
4118 7052 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4119 7052 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4120 : int len1,
4121 : len2;
4122 : int cmp;
4123 :
4124 7052 : len1 = VARSIZE_ANY_EXHDR(arg1);
4125 7052 : len2 = VARSIZE_ANY_EXHDR(arg2);
4126 :
4127 7052 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4128 :
4129 7052 : PG_FREE_IF_COPY(arg1, 0);
4130 7052 : PG_FREE_IF_COPY(arg2, 1);
4131 :
4132 7052 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4133 : }
4134 :
4135 : Datum
4136 6356 : byteale(PG_FUNCTION_ARGS)
4137 : {
4138 6356 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4139 6356 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4140 : int len1,
4141 : len2;
4142 : int cmp;
4143 :
4144 6356 : len1 = VARSIZE_ANY_EXHDR(arg1);
4145 6356 : len2 = VARSIZE_ANY_EXHDR(arg2);
4146 :
4147 6356 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4148 :
4149 6356 : PG_FREE_IF_COPY(arg1, 0);
4150 6356 : PG_FREE_IF_COPY(arg2, 1);
4151 :
4152 6356 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4153 : }
4154 :
4155 : Datum
4156 4966 : byteagt(PG_FUNCTION_ARGS)
4157 : {
4158 4966 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4159 4966 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4160 : int len1,
4161 : len2;
4162 : int cmp;
4163 :
4164 4966 : len1 = VARSIZE_ANY_EXHDR(arg1);
4165 4966 : len2 = VARSIZE_ANY_EXHDR(arg2);
4166 :
4167 4966 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4168 :
4169 4966 : PG_FREE_IF_COPY(arg1, 0);
4170 4966 : PG_FREE_IF_COPY(arg2, 1);
4171 :
4172 4966 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4173 : }
4174 :
4175 : Datum
4176 5008 : byteage(PG_FUNCTION_ARGS)
4177 : {
4178 5008 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4179 5008 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4180 : int len1,
4181 : len2;
4182 : int cmp;
4183 :
4184 5008 : len1 = VARSIZE_ANY_EXHDR(arg1);
4185 5008 : len2 = VARSIZE_ANY_EXHDR(arg2);
4186 :
4187 5008 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4188 :
4189 5008 : PG_FREE_IF_COPY(arg1, 0);
4190 5008 : PG_FREE_IF_COPY(arg2, 1);
4191 :
4192 5008 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4193 : }
4194 :
4195 : Datum
4196 87498 : byteacmp(PG_FUNCTION_ARGS)
4197 : {
4198 87498 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4199 87498 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4200 : int len1,
4201 : len2;
4202 : int cmp;
4203 :
4204 87498 : len1 = VARSIZE_ANY_EXHDR(arg1);
4205 87498 : len2 = VARSIZE_ANY_EXHDR(arg2);
4206 :
4207 87498 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4208 87498 : if ((cmp == 0) && (len1 != len2))
4209 14604 : cmp = (len1 < len2) ? -1 : 1;
4210 :
4211 87498 : PG_FREE_IF_COPY(arg1, 0);
4212 87498 : PG_FREE_IF_COPY(arg2, 1);
4213 :
4214 87498 : PG_RETURN_INT32(cmp);
4215 : }
4216 :
4217 : Datum
4218 30 : bytea_sortsupport(PG_FUNCTION_ARGS)
4219 : {
4220 30 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4221 : MemoryContext oldcontext;
4222 :
4223 30 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4224 :
4225 : /* Use generic string SortSupport, forcing "C" collation */
4226 30 : varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4227 :
4228 30 : MemoryContextSwitchTo(oldcontext);
4229 :
4230 30 : PG_RETURN_VOID();
4231 : }
4232 :
4233 : /*
4234 : * appendStringInfoText
4235 : *
4236 : * Append a text to str.
4237 : * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4238 : */
4239 : static void
4240 1570906 : appendStringInfoText(StringInfo str, const text *t)
4241 : {
4242 1570906 : appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4243 1570906 : }
4244 :
4245 : /*
4246 : * replace_text
4247 : * replace all occurrences of 'old_sub_str' in 'orig_str'
4248 : * with 'new_sub_str' to form 'new_str'
4249 : *
4250 : * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4251 : * otherwise returns 'new_str'
4252 : */
4253 : Datum
4254 2182 : replace_text(PG_FUNCTION_ARGS)
4255 : {
4256 2182 : text *src_text = PG_GETARG_TEXT_PP(0);
4257 2182 : text *from_sub_text = PG_GETARG_TEXT_PP(1);
4258 2182 : text *to_sub_text = PG_GETARG_TEXT_PP(2);
4259 : int src_text_len;
4260 : int from_sub_text_len;
4261 : TextPositionState state;
4262 : text *ret_text;
4263 : int chunk_len;
4264 : char *curr_ptr;
4265 : char *start_ptr;
4266 : StringInfoData str;
4267 : bool found;
4268 :
4269 2182 : src_text_len = VARSIZE_ANY_EXHDR(src_text);
4270 2182 : from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4271 :
4272 : /* Return unmodified source string if empty source or pattern */
4273 2182 : if (src_text_len < 1 || from_sub_text_len < 1)
4274 : {
4275 0 : PG_RETURN_TEXT_P(src_text);
4276 : }
4277 :
4278 2182 : text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4279 :
4280 2182 : found = text_position_next(&state);
4281 :
4282 : /* When the from_sub_text is not found, there is nothing to do. */
4283 2182 : if (!found)
4284 : {
4285 644 : text_position_cleanup(&state);
4286 644 : PG_RETURN_TEXT_P(src_text);
4287 : }
4288 1538 : curr_ptr = text_position_get_match_ptr(&state);
4289 1538 : start_ptr = VARDATA_ANY(src_text);
4290 :
4291 1538 : initStringInfo(&str);
4292 :
4293 : do
4294 : {
4295 5694 : CHECK_FOR_INTERRUPTS();
4296 :
4297 : /* copy the data skipped over by last text_position_next() */
4298 5694 : chunk_len = curr_ptr - start_ptr;
4299 5694 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4300 :
4301 5694 : appendStringInfoText(&str, to_sub_text);
4302 :
4303 5694 : start_ptr = curr_ptr + from_sub_text_len;
4304 :
4305 5694 : found = text_position_next(&state);
4306 5694 : if (found)
4307 4156 : curr_ptr = text_position_get_match_ptr(&state);
4308 : }
4309 5694 : while (found);
4310 :
4311 : /* copy trailing data */
4312 1538 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4313 1538 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
4314 :
4315 1538 : text_position_cleanup(&state);
4316 :
4317 1538 : ret_text = cstring_to_text_with_len(str.data, str.len);
4318 1538 : pfree(str.data);
4319 :
4320 1538 : PG_RETURN_TEXT_P(ret_text);
4321 : }
4322 :
4323 : /*
4324 : * check_replace_text_has_escape
4325 : *
4326 : * Returns 0 if text contains no backslashes that need processing.
4327 : * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4328 : * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4329 : */
4330 : static int
4331 9118 : check_replace_text_has_escape(const text *replace_text)
4332 : {
4333 9118 : int result = 0;
4334 9118 : const char *p = VARDATA_ANY(replace_text);
4335 9118 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4336 :
4337 9162 : while (p < p_end)
4338 : {
4339 : /* Find next escape char, if any. */
4340 8100 : p = memchr(p, '\\', p_end - p);
4341 8100 : if (p == NULL)
4342 7884 : break;
4343 216 : p++;
4344 : /* Note: a backslash at the end doesn't require extra processing. */
4345 216 : if (p < p_end)
4346 : {
4347 216 : if (*p >= '1' && *p <= '9')
4348 172 : return 2; /* Found a submatch specifier, so done */
4349 44 : result = 1; /* Found some other sequence, keep looking */
4350 44 : p++;
4351 : }
4352 : }
4353 8946 : return result;
4354 : }
4355 :
4356 : /*
4357 : * appendStringInfoRegexpSubstr
4358 : *
4359 : * Append replace_text to str, substituting regexp back references for
4360 : * \n escapes. start_ptr is the start of the match in the source string,
4361 : * at logical character position data_pos.
4362 : */
4363 : static void
4364 128 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4365 : regmatch_t *pmatch,
4366 : char *start_ptr, int data_pos)
4367 : {
4368 128 : const char *p = VARDATA_ANY(replace_text);
4369 128 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4370 :
4371 340 : while (p < p_end)
4372 : {
4373 296 : const char *chunk_start = p;
4374 : int so;
4375 : int eo;
4376 :
4377 : /* Find next escape char, if any. */
4378 296 : p = memchr(p, '\\', p_end - p);
4379 296 : if (p == NULL)
4380 78 : p = p_end;
4381 :
4382 : /* Copy the text we just scanned over, if any. */
4383 296 : if (p > chunk_start)
4384 204 : appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4385 :
4386 : /* Done if at end of string, else advance over escape char. */
4387 296 : if (p >= p_end)
4388 78 : break;
4389 218 : p++;
4390 :
4391 218 : if (p >= p_end)
4392 : {
4393 : /* Escape at very end of input. Treat same as unexpected char */
4394 6 : appendStringInfoChar(str, '\\');
4395 6 : break;
4396 : }
4397 :
4398 212 : if (*p >= '1' && *p <= '9')
4399 152 : {
4400 : /* Use the back reference of regexp. */
4401 152 : int idx = *p - '0';
4402 :
4403 152 : so = pmatch[idx].rm_so;
4404 152 : eo = pmatch[idx].rm_eo;
4405 152 : p++;
4406 : }
4407 60 : else if (*p == '&')
4408 : {
4409 : /* Use the entire matched string. */
4410 18 : so = pmatch[0].rm_so;
4411 18 : eo = pmatch[0].rm_eo;
4412 18 : p++;
4413 : }
4414 42 : else if (*p == '\\')
4415 : {
4416 : /* \\ means transfer one \ to output. */
4417 36 : appendStringInfoChar(str, '\\');
4418 36 : p++;
4419 36 : continue;
4420 : }
4421 : else
4422 : {
4423 : /*
4424 : * If escape char is not followed by any expected char, just treat
4425 : * it as ordinary data to copy. (XXX would it be better to throw
4426 : * an error?)
4427 : */
4428 6 : appendStringInfoChar(str, '\\');
4429 6 : continue;
4430 : }
4431 :
4432 170 : if (so >= 0 && eo >= 0)
4433 : {
4434 : /*
4435 : * Copy the text that is back reference of regexp. Note so and eo
4436 : * are counted in characters not bytes.
4437 : */
4438 : char *chunk_start;
4439 : int chunk_len;
4440 :
4441 : Assert(so >= data_pos);
4442 170 : chunk_start = start_ptr;
4443 170 : chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4444 170 : chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4445 170 : appendBinaryStringInfo(str, chunk_start, chunk_len);
4446 : }
4447 : }
4448 128 : }
4449 :
4450 : /*
4451 : * replace_text_regexp
4452 : *
4453 : * replace substring(s) in src_text that match pattern with replace_text.
4454 : * The replace_text can contain backslash markers to substitute
4455 : * (parts of) the matched text.
4456 : *
4457 : * cflags: regexp compile flags.
4458 : * collation: collation to use.
4459 : * search_start: the character (not byte) offset in src_text at which to
4460 : * begin searching.
4461 : * n: if 0, replace all matches; if > 0, replace only the N'th match.
4462 : */
4463 : text *
4464 9118 : replace_text_regexp(text *src_text, text *pattern_text,
4465 : text *replace_text,
4466 : int cflags, Oid collation,
4467 : int search_start, int n)
4468 : {
4469 : text *ret_text;
4470 : regex_t *re;
4471 9118 : int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4472 9118 : int nmatches = 0;
4473 : StringInfoData buf;
4474 : regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4475 9118 : int nmatch = lengthof(pmatch);
4476 : pg_wchar *data;
4477 : size_t data_len;
4478 : int data_pos;
4479 : char *start_ptr;
4480 : int escape_status;
4481 :
4482 9118 : initStringInfo(&buf);
4483 :
4484 : /* Convert data string to wide characters. */
4485 9118 : data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4486 9118 : data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4487 :
4488 : /* Check whether replace_text has escapes, especially regexp submatches. */
4489 9118 : escape_status = check_replace_text_has_escape(replace_text);
4490 :
4491 : /* If no regexp submatches, we can use REG_NOSUB. */
4492 9118 : if (escape_status < 2)
4493 : {
4494 8946 : cflags |= REG_NOSUB;
4495 : /* Also tell pg_regexec we only want the whole-match location. */
4496 8946 : nmatch = 1;
4497 : }
4498 :
4499 : /* Prepare the regexp. */
4500 9118 : re = RE_compile_and_cache(pattern_text, cflags, collation);
4501 :
4502 : /* start_ptr points to the data_pos'th character of src_text */
4503 9118 : start_ptr = (char *) VARDATA_ANY(src_text);
4504 9118 : data_pos = 0;
4505 :
4506 12684 : while (search_start <= data_len)
4507 : {
4508 : int regexec_result;
4509 :
4510 12678 : CHECK_FOR_INTERRUPTS();
4511 :
4512 12678 : regexec_result = pg_regexec(re,
4513 : data,
4514 : data_len,
4515 : search_start,
4516 : NULL, /* no details */
4517 : nmatch,
4518 : pmatch,
4519 : 0);
4520 :
4521 12678 : if (regexec_result == REG_NOMATCH)
4522 7920 : break;
4523 :
4524 4758 : if (regexec_result != REG_OKAY)
4525 : {
4526 : char errMsg[100];
4527 :
4528 0 : CHECK_FOR_INTERRUPTS();
4529 0 : pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4530 0 : ereport(ERROR,
4531 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4532 : errmsg("regular expression failed: %s", errMsg)));
4533 : }
4534 :
4535 : /*
4536 : * Count matches, and decide whether to replace this match.
4537 : */
4538 4758 : nmatches++;
4539 4758 : if (n > 0 && nmatches != n)
4540 : {
4541 : /*
4542 : * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4543 : * we treat the matched text as if it weren't matched, and copy it
4544 : * to the output later.)
4545 : */
4546 60 : search_start = pmatch[0].rm_eo;
4547 60 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4548 0 : search_start++;
4549 60 : continue;
4550 : }
4551 :
4552 : /*
4553 : * Copy the text to the left of the match position. Note we are given
4554 : * character not byte indexes.
4555 : */
4556 4698 : if (pmatch[0].rm_so - data_pos > 0)
4557 : {
4558 : int chunk_len;
4559 :
4560 4582 : chunk_len = charlen_to_bytelen(start_ptr,
4561 4582 : pmatch[0].rm_so - data_pos);
4562 4582 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4563 :
4564 : /*
4565 : * Advance start_ptr over that text, to avoid multiple rescans of
4566 : * it if the replace_text contains multiple back-references.
4567 : */
4568 4582 : start_ptr += chunk_len;
4569 4582 : data_pos = pmatch[0].rm_so;
4570 : }
4571 :
4572 : /*
4573 : * Copy the replace_text, processing escapes if any are present.
4574 : */
4575 4698 : if (escape_status > 0)
4576 128 : appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4577 : start_ptr, data_pos);
4578 : else
4579 4570 : appendStringInfoText(&buf, replace_text);
4580 :
4581 : /* Advance start_ptr and data_pos over the matched text. */
4582 9396 : start_ptr += charlen_to_bytelen(start_ptr,
4583 4698 : pmatch[0].rm_eo - data_pos);
4584 4698 : data_pos = pmatch[0].rm_eo;
4585 :
4586 : /*
4587 : * If we only want to replace one occurrence, we're done.
4588 : */
4589 4698 : if (n > 0)
4590 1192 : break;
4591 :
4592 : /*
4593 : * Advance search position. Normally we start the next search at the
4594 : * end of the previous match; but if the match was of zero length, we
4595 : * have to advance by one character, or we'd just find the same match
4596 : * again.
4597 : */
4598 3506 : search_start = data_pos;
4599 3506 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4600 12 : search_start++;
4601 : }
4602 :
4603 : /*
4604 : * Copy the text to the right of the last match.
4605 : */
4606 9118 : if (data_pos < data_len)
4607 : {
4608 : int chunk_len;
4609 :
4610 8722 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4611 8722 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4612 : }
4613 :
4614 9118 : ret_text = cstring_to_text_with_len(buf.data, buf.len);
4615 9118 : pfree(buf.data);
4616 9118 : pfree(data);
4617 :
4618 9118 : return ret_text;
4619 : }
4620 :
4621 : /*
4622 : * split_part
4623 : * parse input string based on provided field separator
4624 : * return N'th item (1 based, negative counts from end)
4625 : */
4626 : Datum
4627 102 : split_part(PG_FUNCTION_ARGS)
4628 : {
4629 102 : text *inputstring = PG_GETARG_TEXT_PP(0);
4630 102 : text *fldsep = PG_GETARG_TEXT_PP(1);
4631 102 : int fldnum = PG_GETARG_INT32(2);
4632 : int inputstring_len;
4633 : int fldsep_len;
4634 : TextPositionState state;
4635 : char *start_ptr;
4636 : char *end_ptr;
4637 : text *result_text;
4638 : bool found;
4639 :
4640 : /* field number is 1 based */
4641 102 : if (fldnum == 0)
4642 6 : ereport(ERROR,
4643 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4644 : errmsg("field position must not be zero")));
4645 :
4646 96 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4647 96 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4648 :
4649 : /* return empty string for empty input string */
4650 96 : if (inputstring_len < 1)
4651 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4652 :
4653 : /* handle empty field separator */
4654 84 : if (fldsep_len < 1)
4655 : {
4656 : /* if first or last field, return input string, else empty string */
4657 24 : if (fldnum == 1 || fldnum == -1)
4658 12 : PG_RETURN_TEXT_P(inputstring);
4659 : else
4660 12 : PG_RETURN_TEXT_P(cstring_to_text(""));
4661 : }
4662 :
4663 : /* find the first field separator */
4664 60 : text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4665 :
4666 60 : found = text_position_next(&state);
4667 :
4668 : /* special case if fldsep not found at all */
4669 60 : if (!found)
4670 : {
4671 12 : text_position_cleanup(&state);
4672 : /* if first or last field, return input string, else empty string */
4673 12 : if (fldnum == 1 || fldnum == -1)
4674 6 : PG_RETURN_TEXT_P(inputstring);
4675 : else
4676 6 : PG_RETURN_TEXT_P(cstring_to_text(""));
4677 : }
4678 :
4679 : /*
4680 : * take care of a negative field number (i.e. count from the right) by
4681 : * converting to a positive field number; we need total number of fields
4682 : */
4683 48 : if (fldnum < 0)
4684 : {
4685 : /* we found a fldsep, so there are at least two fields */
4686 24 : int numfields = 2;
4687 :
4688 36 : while (text_position_next(&state))
4689 12 : numfields++;
4690 :
4691 : /* special case of last field does not require an extra pass */
4692 24 : if (fldnum == -1)
4693 : {
4694 6 : start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4695 6 : end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4696 6 : text_position_cleanup(&state);
4697 6 : PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4698 : end_ptr - start_ptr));
4699 : }
4700 :
4701 : /* else, convert fldnum to positive notation */
4702 18 : fldnum += numfields + 1;
4703 :
4704 : /* if nonexistent field, return empty string */
4705 18 : if (fldnum <= 0)
4706 : {
4707 6 : text_position_cleanup(&state);
4708 6 : PG_RETURN_TEXT_P(cstring_to_text(""));
4709 : }
4710 :
4711 : /* reset to pointing at first match, but now with positive fldnum */
4712 12 : text_position_reset(&state);
4713 12 : found = text_position_next(&state);
4714 : Assert(found);
4715 : }
4716 :
4717 : /* identify bounds of first field */
4718 36 : start_ptr = VARDATA_ANY(inputstring);
4719 36 : end_ptr = text_position_get_match_ptr(&state);
4720 :
4721 66 : while (found && --fldnum > 0)
4722 : {
4723 : /* identify bounds of next field */
4724 30 : start_ptr = end_ptr + fldsep_len;
4725 30 : found = text_position_next(&state);
4726 30 : if (found)
4727 18 : end_ptr = text_position_get_match_ptr(&state);
4728 : }
4729 :
4730 36 : text_position_cleanup(&state);
4731 :
4732 36 : if (fldnum > 0)
4733 : {
4734 : /* N'th field separator not found */
4735 : /* if last field requested, return it, else empty string */
4736 12 : if (fldnum == 1)
4737 : {
4738 6 : int last_len = start_ptr - VARDATA_ANY(inputstring);
4739 :
4740 6 : result_text = cstring_to_text_with_len(start_ptr,
4741 : inputstring_len - last_len);
4742 : }
4743 : else
4744 6 : result_text = cstring_to_text("");
4745 : }
4746 : else
4747 : {
4748 : /* non-last field requested */
4749 24 : result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4750 : }
4751 :
4752 36 : PG_RETURN_TEXT_P(result_text);
4753 : }
4754 :
4755 : /*
4756 : * Convenience function to return true when two text params are equal.
4757 : */
4758 : static bool
4759 168 : text_isequal(text *txt1, text *txt2, Oid collid)
4760 : {
4761 168 : return DatumGetBool(DirectFunctionCall2Coll(texteq,
4762 : collid,
4763 : PointerGetDatum(txt1),
4764 : PointerGetDatum(txt2)));
4765 : }
4766 :
4767 : /*
4768 : * text_to_array
4769 : * parse input string and return text array of elements,
4770 : * based on provided field separator
4771 : */
4772 : Datum
4773 108 : text_to_array(PG_FUNCTION_ARGS)
4774 : {
4775 : SplitTextOutputData tstate;
4776 :
4777 : /* For array output, tstate should start as all zeroes */
4778 108 : memset(&tstate, 0, sizeof(tstate));
4779 :
4780 108 : if (!split_text(fcinfo, &tstate))
4781 6 : PG_RETURN_NULL();
4782 :
4783 102 : if (tstate.astate == NULL)
4784 6 : PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4785 :
4786 96 : PG_RETURN_ARRAYTYPE_P(makeArrayResult(tstate.astate,
4787 : CurrentMemoryContext));
4788 : }
4789 :
4790 : /*
4791 : * text_to_array_null
4792 : * parse input string and return text array of elements,
4793 : * based on provided field separator and null string
4794 : *
4795 : * This is a separate entry point only to prevent the regression tests from
4796 : * complaining about different argument sets for the same internal function.
4797 : */
4798 : Datum
4799 24 : text_to_array_null(PG_FUNCTION_ARGS)
4800 : {
4801 24 : return text_to_array(fcinfo);
4802 : }
4803 :
4804 : /*
4805 : * text_to_table
4806 : * parse input string and return table of elements,
4807 : * based on provided field separator
4808 : */
4809 : Datum
4810 84 : text_to_table(PG_FUNCTION_ARGS)
4811 : {
4812 84 : ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4813 : SplitTextOutputData tstate;
4814 :
4815 84 : tstate.astate = NULL;
4816 84 : SetSingleFuncCall(fcinfo, SRF_SINGLE_USE_EXPECTED);
4817 84 : tstate.tupstore = rsi->setResult;
4818 84 : tstate.tupdesc = rsi->setDesc;
4819 :
4820 84 : (void) split_text(fcinfo, &tstate);
4821 :
4822 84 : return (Datum) 0;
4823 : }
4824 :
4825 : /*
4826 : * text_to_table_null
4827 : * parse input string and return table of elements,
4828 : * based on provided field separator and null string
4829 : *
4830 : * This is a separate entry point only to prevent the regression tests from
4831 : * complaining about different argument sets for the same internal function.
4832 : */
4833 : Datum
4834 24 : text_to_table_null(PG_FUNCTION_ARGS)
4835 : {
4836 24 : return text_to_table(fcinfo);
4837 : }
4838 :
4839 : /*
4840 : * Common code for text_to_array, text_to_array_null, text_to_table
4841 : * and text_to_table_null functions.
4842 : *
4843 : * These are not strict so we have to test for null inputs explicitly.
4844 : * Returns false if result is to be null, else returns true.
4845 : *
4846 : * Note that if the result is valid but empty (zero elements), we return
4847 : * without changing *tstate --- caller must handle that case, too.
4848 : */
4849 : static bool
4850 192 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4851 : {
4852 : text *inputstring;
4853 : text *fldsep;
4854 : text *null_string;
4855 192 : Oid collation = PG_GET_COLLATION();
4856 : int inputstring_len;
4857 : int fldsep_len;
4858 : char *start_ptr;
4859 : text *result_text;
4860 :
4861 : /* when input string is NULL, then result is NULL too */
4862 192 : if (PG_ARGISNULL(0))
4863 12 : return false;
4864 :
4865 180 : inputstring = PG_GETARG_TEXT_PP(0);
4866 :
4867 : /* fldsep can be NULL */
4868 180 : if (!PG_ARGISNULL(1))
4869 168 : fldsep = PG_GETARG_TEXT_PP(1);
4870 : else
4871 12 : fldsep = NULL;
4872 :
4873 : /* null_string can be NULL or omitted */
4874 180 : if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4875 48 : null_string = PG_GETARG_TEXT_PP(2);
4876 : else
4877 132 : null_string = NULL;
4878 :
4879 180 : if (fldsep != NULL)
4880 : {
4881 : /*
4882 : * Normal case with non-null fldsep. Use the text_position machinery
4883 : * to search for occurrences of fldsep.
4884 : */
4885 : TextPositionState state;
4886 :
4887 168 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4888 168 : fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4889 :
4890 : /* return empty set for empty input string */
4891 168 : if (inputstring_len < 1)
4892 60 : return true;
4893 :
4894 : /* empty field separator: return input string as a one-element set */
4895 156 : if (fldsep_len < 1)
4896 : {
4897 48 : split_text_accum_result(tstate, inputstring,
4898 : null_string, collation);
4899 48 : return true;
4900 : }
4901 :
4902 108 : text_position_setup(inputstring, fldsep, collation, &state);
4903 :
4904 108 : start_ptr = VARDATA_ANY(inputstring);
4905 :
4906 : for (;;)
4907 444 : {
4908 : bool found;
4909 : char *end_ptr;
4910 : int chunk_len;
4911 :
4912 552 : CHECK_FOR_INTERRUPTS();
4913 :
4914 552 : found = text_position_next(&state);
4915 552 : if (!found)
4916 : {
4917 : /* fetch last field */
4918 108 : chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4919 108 : end_ptr = NULL; /* not used, but some compilers complain */
4920 : }
4921 : else
4922 : {
4923 : /* fetch non-last field */
4924 444 : end_ptr = text_position_get_match_ptr(&state);
4925 444 : chunk_len = end_ptr - start_ptr;
4926 : }
4927 :
4928 : /* build a temp text datum to pass to split_text_accum_result */
4929 552 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4930 :
4931 : /* stash away this field */
4932 552 : split_text_accum_result(tstate, result_text,
4933 : null_string, collation);
4934 :
4935 552 : pfree(result_text);
4936 :
4937 552 : if (!found)
4938 108 : break;
4939 :
4940 444 : start_ptr = end_ptr + fldsep_len;
4941 : }
4942 :
4943 108 : text_position_cleanup(&state);
4944 : }
4945 : else
4946 : {
4947 : /*
4948 : * When fldsep is NULL, each character in the input string becomes a
4949 : * separate element in the result set. The separator is effectively
4950 : * the space between characters.
4951 : */
4952 12 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4953 :
4954 12 : start_ptr = VARDATA_ANY(inputstring);
4955 :
4956 72 : while (inputstring_len > 0)
4957 : {
4958 60 : int chunk_len = pg_mblen(start_ptr);
4959 :
4960 60 : CHECK_FOR_INTERRUPTS();
4961 :
4962 : /* build a temp text datum to pass to split_text_accum_result */
4963 60 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4964 :
4965 : /* stash away this field */
4966 60 : split_text_accum_result(tstate, result_text,
4967 : null_string, collation);
4968 :
4969 60 : pfree(result_text);
4970 :
4971 60 : start_ptr += chunk_len;
4972 60 : inputstring_len -= chunk_len;
4973 : }
4974 : }
4975 :
4976 120 : return true;
4977 : }
4978 :
4979 : /*
4980 : * Add text item to result set (table or array).
4981 : *
4982 : * This is also responsible for checking to see if the item matches
4983 : * the null_string, in which case we should emit NULL instead.
4984 : */
4985 : static void
4986 660 : split_text_accum_result(SplitTextOutputData *tstate,
4987 : text *field_value,
4988 : text *null_string,
4989 : Oid collation)
4990 : {
4991 660 : bool is_null = false;
4992 :
4993 660 : if (null_string && text_isequal(field_value, null_string, collation))
4994 48 : is_null = true;
4995 :
4996 660 : if (tstate->tupstore)
4997 : {
4998 : Datum values[1];
4999 : bool nulls[1];
5000 :
5001 228 : values[0] = PointerGetDatum(field_value);
5002 228 : nulls[0] = is_null;
5003 :
5004 228 : tuplestore_putvalues(tstate->tupstore,
5005 : tstate->tupdesc,
5006 : values,
5007 : nulls);
5008 : }
5009 : else
5010 : {
5011 432 : tstate->astate = accumArrayResult(tstate->astate,
5012 : PointerGetDatum(field_value),
5013 : is_null,
5014 : TEXTOID,
5015 : CurrentMemoryContext);
5016 : }
5017 660 : }
5018 :
5019 : /*
5020 : * array_to_text
5021 : * concatenate Cstring representation of input array elements
5022 : * using provided field separator
5023 : */
5024 : Datum
5025 58306 : array_to_text(PG_FUNCTION_ARGS)
5026 : {
5027 58306 : ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
5028 58306 : char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5029 :
5030 58306 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5031 : }
5032 :
5033 : /*
5034 : * array_to_text_null
5035 : * concatenate Cstring representation of input array elements
5036 : * using provided field separator and null string
5037 : *
5038 : * This version is not strict so we have to test for null inputs explicitly.
5039 : */
5040 : Datum
5041 12 : array_to_text_null(PG_FUNCTION_ARGS)
5042 : {
5043 : ArrayType *v;
5044 : char *fldsep;
5045 : char *null_string;
5046 :
5047 : /* returns NULL when first or second parameter is NULL */
5048 12 : if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5049 0 : PG_RETURN_NULL();
5050 :
5051 12 : v = PG_GETARG_ARRAYTYPE_P(0);
5052 12 : fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5053 :
5054 : /* NULL null string is passed through as a null pointer */
5055 12 : if (!PG_ARGISNULL(2))
5056 6 : null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5057 : else
5058 6 : null_string = NULL;
5059 :
5060 12 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5061 : }
5062 :
5063 : /*
5064 : * common code for array_to_text and array_to_text_null functions
5065 : */
5066 : static text *
5067 58336 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
5068 : const char *fldsep, const char *null_string)
5069 : {
5070 : text *result;
5071 : int nitems,
5072 : *dims,
5073 : ndims;
5074 : Oid element_type;
5075 : int typlen;
5076 : bool typbyval;
5077 : char typalign;
5078 : StringInfoData buf;
5079 58336 : bool printed = false;
5080 : char *p;
5081 : bits8 *bitmap;
5082 : int bitmask;
5083 : int i;
5084 : ArrayMetaState *my_extra;
5085 :
5086 58336 : ndims = ARR_NDIM(v);
5087 58336 : dims = ARR_DIMS(v);
5088 58336 : nitems = ArrayGetNItems(ndims, dims);
5089 :
5090 : /* if there are no elements, return an empty string */
5091 58336 : if (nitems == 0)
5092 34850 : return cstring_to_text_with_len("", 0);
5093 :
5094 23486 : element_type = ARR_ELEMTYPE(v);
5095 23486 : initStringInfo(&buf);
5096 :
5097 : /*
5098 : * We arrange to look up info about element type, including its output
5099 : * conversion proc, only once per series of calls, assuming the element
5100 : * type doesn't change underneath us.
5101 : */
5102 23486 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5103 23486 : if (my_extra == NULL)
5104 : {
5105 1314 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5106 : sizeof(ArrayMetaState));
5107 1314 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5108 1314 : my_extra->element_type = ~element_type;
5109 : }
5110 :
5111 23486 : if (my_extra->element_type != element_type)
5112 : {
5113 : /*
5114 : * Get info about element type, including its output conversion proc
5115 : */
5116 1314 : get_type_io_data(element_type, IOFunc_output,
5117 : &my_extra->typlen, &my_extra->typbyval,
5118 : &my_extra->typalign, &my_extra->typdelim,
5119 : &my_extra->typioparam, &my_extra->typiofunc);
5120 1314 : fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5121 1314 : fcinfo->flinfo->fn_mcxt);
5122 1314 : my_extra->element_type = element_type;
5123 : }
5124 23486 : typlen = my_extra->typlen;
5125 23486 : typbyval = my_extra->typbyval;
5126 23486 : typalign = my_extra->typalign;
5127 :
5128 23486 : p = ARR_DATA_PTR(v);
5129 23486 : bitmap = ARR_NULLBITMAP(v);
5130 23486 : bitmask = 1;
5131 :
5132 80206 : for (i = 0; i < nitems; i++)
5133 : {
5134 : Datum itemvalue;
5135 : char *value;
5136 :
5137 : /* Get source element, checking for NULL */
5138 56720 : if (bitmap && (*bitmap & bitmask) == 0)
5139 : {
5140 : /* if null_string is NULL, we just ignore null elements */
5141 18 : if (null_string != NULL)
5142 : {
5143 6 : if (printed)
5144 6 : appendStringInfo(&buf, "%s%s", fldsep, null_string);
5145 : else
5146 0 : appendStringInfoString(&buf, null_string);
5147 6 : printed = true;
5148 : }
5149 : }
5150 : else
5151 : {
5152 56702 : itemvalue = fetch_att(p, typbyval, typlen);
5153 :
5154 56702 : value = OutputFunctionCall(&my_extra->proc, itemvalue);
5155 :
5156 56702 : if (printed)
5157 33216 : appendStringInfo(&buf, "%s%s", fldsep, value);
5158 : else
5159 23486 : appendStringInfoString(&buf, value);
5160 56702 : printed = true;
5161 :
5162 56702 : p = att_addlength_pointer(p, typlen, p);
5163 56702 : p = (char *) att_align_nominal(p, typalign);
5164 : }
5165 :
5166 : /* advance bitmap pointer if any */
5167 56720 : if (bitmap)
5168 : {
5169 108 : bitmask <<= 1;
5170 108 : if (bitmask == 0x100)
5171 : {
5172 0 : bitmap++;
5173 0 : bitmask = 1;
5174 : }
5175 : }
5176 : }
5177 :
5178 23486 : result = cstring_to_text_with_len(buf.data, buf.len);
5179 23486 : pfree(buf.data);
5180 :
5181 23486 : return result;
5182 : }
5183 :
5184 : #define HEXBASE 16
5185 : /*
5186 : * Convert an int32 to a string containing a base 16 (hex) representation of
5187 : * the number.
5188 : */
5189 : Datum
5190 38708 : to_hex32(PG_FUNCTION_ARGS)
5191 : {
5192 38708 : uint32 value = (uint32) PG_GETARG_INT32(0);
5193 : char *ptr;
5194 38708 : const char *digits = "0123456789abcdef";
5195 : char buf[32]; /* bigger than needed, but reasonable */
5196 :
5197 38708 : ptr = buf + sizeof(buf) - 1;
5198 38708 : *ptr = '\0';
5199 :
5200 : do
5201 : {
5202 74726 : *--ptr = digits[value % HEXBASE];
5203 74726 : value /= HEXBASE;
5204 74726 : } while (ptr > buf && value);
5205 :
5206 38708 : PG_RETURN_TEXT_P(cstring_to_text(ptr));
5207 : }
5208 :
5209 : /*
5210 : * Convert an int64 to a string containing a base 16 (hex) representation of
5211 : * the number.
5212 : */
5213 : Datum
5214 6 : to_hex64(PG_FUNCTION_ARGS)
5215 : {
5216 6 : uint64 value = (uint64) PG_GETARG_INT64(0);
5217 : char *ptr;
5218 6 : const char *digits = "0123456789abcdef";
5219 : char buf[32]; /* bigger than needed, but reasonable */
5220 :
5221 6 : ptr = buf + sizeof(buf) - 1;
5222 6 : *ptr = '\0';
5223 :
5224 : do
5225 : {
5226 48 : *--ptr = digits[value % HEXBASE];
5227 48 : value /= HEXBASE;
5228 48 : } while (ptr > buf && value);
5229 :
5230 6 : PG_RETURN_TEXT_P(cstring_to_text(ptr));
5231 : }
5232 :
5233 : /*
5234 : * Return the size of a datum, possibly compressed
5235 : *
5236 : * Works on any data type
5237 : */
5238 : Datum
5239 122 : pg_column_size(PG_FUNCTION_ARGS)
5240 : {
5241 122 : Datum value = PG_GETARG_DATUM(0);
5242 : int32 result;
5243 : int typlen;
5244 :
5245 : /* On first call, get the input type's typlen, and save at *fn_extra */
5246 122 : if (fcinfo->flinfo->fn_extra == NULL)
5247 : {
5248 : /* Lookup the datatype of the supplied argument */
5249 122 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5250 :
5251 122 : typlen = get_typlen(argtypeid);
5252 122 : if (typlen == 0) /* should not happen */
5253 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5254 :
5255 122 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5256 : sizeof(int));
5257 122 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5258 : }
5259 : else
5260 0 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5261 :
5262 122 : if (typlen == -1)
5263 : {
5264 : /* varlena type, possibly toasted */
5265 122 : result = toast_datum_size(value);
5266 : }
5267 0 : else if (typlen == -2)
5268 : {
5269 : /* cstring */
5270 0 : result = strlen(DatumGetCString(value)) + 1;
5271 : }
5272 : else
5273 : {
5274 : /* ordinary fixed-width type */
5275 0 : result = typlen;
5276 : }
5277 :
5278 122 : PG_RETURN_INT32(result);
5279 : }
5280 :
5281 : /*
5282 : * Return the compression method stored in the compressed attribute. Return
5283 : * NULL for non varlena type or uncompressed data.
5284 : */
5285 : Datum
5286 162 : pg_column_compression(PG_FUNCTION_ARGS)
5287 : {
5288 : int typlen;
5289 : char *result;
5290 : ToastCompressionId cmid;
5291 :
5292 : /* On first call, get the input type's typlen, and save at *fn_extra */
5293 162 : if (fcinfo->flinfo->fn_extra == NULL)
5294 : {
5295 : /* Lookup the datatype of the supplied argument */
5296 108 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5297 :
5298 108 : typlen = get_typlen(argtypeid);
5299 108 : if (typlen == 0) /* should not happen */
5300 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
5301 :
5302 108 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5303 : sizeof(int));
5304 108 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
5305 : }
5306 : else
5307 54 : typlen = *((int *) fcinfo->flinfo->fn_extra);
5308 :
5309 162 : if (typlen != -1)
5310 0 : PG_RETURN_NULL();
5311 :
5312 : /* get the compression method id stored in the compressed varlena */
5313 162 : cmid = toast_get_compression_id((struct varlena *)
5314 162 : DatumGetPointer(PG_GETARG_DATUM(0)));
5315 162 : if (cmid == TOAST_INVALID_COMPRESSION_ID)
5316 6 : PG_RETURN_NULL();
5317 :
5318 : /* convert compression method id to compression method name */
5319 156 : switch (cmid)
5320 : {
5321 66 : case TOAST_PGLZ_COMPRESSION_ID:
5322 66 : result = "pglz";
5323 66 : break;
5324 90 : case TOAST_LZ4_COMPRESSION_ID:
5325 90 : result = "lz4";
5326 90 : break;
5327 0 : default:
5328 0 : elog(ERROR, "invalid compression method id %d", cmid);
5329 : }
5330 :
5331 156 : PG_RETURN_TEXT_P(cstring_to_text(result));
5332 : }
5333 :
5334 : /*
5335 : * string_agg - Concatenates values and returns string.
5336 : *
5337 : * Syntax: string_agg(value text, delimiter text) RETURNS text
5338 : *
5339 : * Note: Any NULL values are ignored. The first-call delimiter isn't
5340 : * actually used at all, and on subsequent calls the delimiter precedes
5341 : * the associated value.
5342 : */
5343 :
5344 : /* subroutine to initialize state */
5345 : static StringInfo
5346 1412 : makeStringAggState(FunctionCallInfo fcinfo)
5347 : {
5348 : StringInfo state;
5349 : MemoryContext aggcontext;
5350 : MemoryContext oldcontext;
5351 :
5352 1412 : if (!AggCheckCallContext(fcinfo, &aggcontext))
5353 : {
5354 : /* cannot be called directly because of internal-type argument */
5355 0 : elog(ERROR, "string_agg_transfn called in non-aggregate context");
5356 : }
5357 :
5358 : /*
5359 : * Create state in aggregate context. It'll stay there across subsequent
5360 : * calls.
5361 : */
5362 1412 : oldcontext = MemoryContextSwitchTo(aggcontext);
5363 1412 : state = makeStringInfo();
5364 1412 : MemoryContextSwitchTo(oldcontext);
5365 :
5366 1412 : return state;
5367 : }
5368 :
5369 : Datum
5370 781062 : string_agg_transfn(PG_FUNCTION_ARGS)
5371 : {
5372 : StringInfo state;
5373 :
5374 781062 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5375 :
5376 : /* Append the value unless null. */
5377 781062 : if (!PG_ARGISNULL(1))
5378 : {
5379 : /* On the first time through, we ignore the delimiter. */
5380 781014 : if (state == NULL)
5381 1386 : state = makeStringAggState(fcinfo);
5382 779628 : else if (!PG_ARGISNULL(2))
5383 779628 : appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5384 :
5385 781014 : appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5386 : }
5387 :
5388 : /*
5389 : * The transition type for string_agg() is declared to be "internal",
5390 : * which is a pass-by-value type the same size as a pointer.
5391 : */
5392 781062 : PG_RETURN_POINTER(state);
5393 : }
5394 :
5395 : Datum
5396 1458 : string_agg_finalfn(PG_FUNCTION_ARGS)
5397 : {
5398 : StringInfo state;
5399 :
5400 : /* cannot be called directly because of internal-type argument */
5401 : Assert(AggCheckCallContext(fcinfo, NULL));
5402 :
5403 1458 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5404 :
5405 1458 : if (state != NULL)
5406 1386 : PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5407 : else
5408 72 : PG_RETURN_NULL();
5409 : }
5410 :
5411 : /*
5412 : * Prepare cache with fmgr info for the output functions of the datatypes of
5413 : * the arguments of a concat-like function, beginning with argument "argidx".
5414 : * (Arguments before that will have corresponding slots in the resulting
5415 : * FmgrInfo array, but we don't fill those slots.)
5416 : */
5417 : static FmgrInfo *
5418 40 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5419 : {
5420 : FmgrInfo *foutcache;
5421 : int i;
5422 :
5423 : /* We keep the info in fn_mcxt so it survives across calls */
5424 40 : foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5425 40 : PG_NARGS() * sizeof(FmgrInfo));
5426 :
5427 196 : for (i = argidx; i < PG_NARGS(); i++)
5428 : {
5429 : Oid valtype;
5430 : Oid typOutput;
5431 : bool typIsVarlena;
5432 :
5433 156 : valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5434 156 : if (!OidIsValid(valtype))
5435 0 : elog(ERROR, "could not determine data type of concat() input");
5436 :
5437 156 : getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5438 156 : fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5439 : }
5440 :
5441 40 : fcinfo->flinfo->fn_extra = foutcache;
5442 :
5443 40 : return foutcache;
5444 : }
5445 :
5446 : /*
5447 : * Implementation of both concat() and concat_ws().
5448 : *
5449 : * sepstr is the separator string to place between values.
5450 : * argidx identifies the first argument to concatenate (counting from zero);
5451 : * note that this must be constant across any one series of calls.
5452 : *
5453 : * Returns NULL if result should be NULL, else text value.
5454 : */
5455 : static text *
5456 72 : concat_internal(const char *sepstr, int argidx,
5457 : FunctionCallInfo fcinfo)
5458 : {
5459 : text *result;
5460 : StringInfoData str;
5461 : FmgrInfo *foutcache;
5462 72 : bool first_arg = true;
5463 : int i;
5464 :
5465 : /*
5466 : * concat(VARIADIC some-array) is essentially equivalent to
5467 : * array_to_text(), ie concat the array elements with the given separator.
5468 : * So we just pass the case off to that code.
5469 : */
5470 72 : if (get_fn_expr_variadic(fcinfo->flinfo))
5471 : {
5472 : ArrayType *arr;
5473 :
5474 : /* Should have just the one argument */
5475 : Assert(argidx == PG_NARGS() - 1);
5476 :
5477 : /* concat(VARIADIC NULL) is defined as NULL */
5478 30 : if (PG_ARGISNULL(argidx))
5479 12 : return NULL;
5480 :
5481 : /*
5482 : * Non-null argument had better be an array. We assume that any call
5483 : * context that could let get_fn_expr_variadic return true will have
5484 : * checked that a VARIADIC-labeled parameter actually is an array. So
5485 : * it should be okay to just Assert that it's an array rather than
5486 : * doing a full-fledged error check.
5487 : */
5488 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5489 :
5490 : /* OK, safe to fetch the array value */
5491 18 : arr = PG_GETARG_ARRAYTYPE_P(argidx);
5492 :
5493 : /*
5494 : * And serialize the array. We tell array_to_text to ignore null
5495 : * elements, which matches the behavior of the loop below.
5496 : */
5497 18 : return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5498 : }
5499 :
5500 : /* Normal case without explicit VARIADIC marker */
5501 42 : initStringInfo(&str);
5502 :
5503 : /* Get output function info, building it if first time through */
5504 42 : foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5505 42 : if (foutcache == NULL)
5506 40 : foutcache = build_concat_foutcache(fcinfo, argidx);
5507 :
5508 204 : for (i = argidx; i < PG_NARGS(); i++)
5509 : {
5510 162 : if (!PG_ARGISNULL(i))
5511 : {
5512 150 : Datum value = PG_GETARG_DATUM(i);
5513 :
5514 : /* add separator if appropriate */
5515 150 : if (first_arg)
5516 42 : first_arg = false;
5517 : else
5518 108 : appendStringInfoString(&str, sepstr);
5519 :
5520 : /* call the appropriate type output function, append the result */
5521 150 : appendStringInfoString(&str,
5522 150 : OutputFunctionCall(&foutcache[i], value));
5523 : }
5524 : }
5525 :
5526 42 : result = cstring_to_text_with_len(str.data, str.len);
5527 42 : pfree(str.data);
5528 :
5529 42 : return result;
5530 : }
5531 :
5532 : /*
5533 : * Concatenate all arguments. NULL arguments are ignored.
5534 : */
5535 : Datum
5536 36 : text_concat(PG_FUNCTION_ARGS)
5537 : {
5538 : text *result;
5539 :
5540 36 : result = concat_internal("", 0, fcinfo);
5541 36 : if (result == NULL)
5542 6 : PG_RETURN_NULL();
5543 30 : PG_RETURN_TEXT_P(result);
5544 : }
5545 :
5546 : /*
5547 : * Concatenate all but first argument value with separators. The first
5548 : * parameter is used as the separator. NULL arguments are ignored.
5549 : */
5550 : Datum
5551 42 : text_concat_ws(PG_FUNCTION_ARGS)
5552 : {
5553 : char *sep;
5554 : text *result;
5555 :
5556 : /* return NULL when separator is NULL */
5557 42 : if (PG_ARGISNULL(0))
5558 6 : PG_RETURN_NULL();
5559 36 : sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5560 :
5561 36 : result = concat_internal(sep, 1, fcinfo);
5562 36 : if (result == NULL)
5563 6 : PG_RETURN_NULL();
5564 30 : PG_RETURN_TEXT_P(result);
5565 : }
5566 :
5567 : /*
5568 : * Return first n characters in the string. When n is negative,
5569 : * return all but last |n| characters.
5570 : */
5571 : Datum
5572 1884 : text_left(PG_FUNCTION_ARGS)
5573 : {
5574 1884 : int n = PG_GETARG_INT32(1);
5575 :
5576 1884 : if (n < 0)
5577 : {
5578 30 : text *str = PG_GETARG_TEXT_PP(0);
5579 30 : const char *p = VARDATA_ANY(str);
5580 30 : int len = VARSIZE_ANY_EXHDR(str);
5581 : int rlen;
5582 :
5583 30 : n = pg_mbstrlen_with_len(p, len) + n;
5584 30 : rlen = pg_mbcharcliplen(p, len, n);
5585 30 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5586 : }
5587 : else
5588 1854 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5589 : }
5590 :
5591 : /*
5592 : * Return last n characters in the string. When n is negative,
5593 : * return all but first |n| characters.
5594 : */
5595 : Datum
5596 66 : text_right(PG_FUNCTION_ARGS)
5597 : {
5598 66 : text *str = PG_GETARG_TEXT_PP(0);
5599 66 : const char *p = VARDATA_ANY(str);
5600 66 : int len = VARSIZE_ANY_EXHDR(str);
5601 66 : int n = PG_GETARG_INT32(1);
5602 : int off;
5603 :
5604 66 : if (n < 0)
5605 30 : n = -n;
5606 : else
5607 36 : n = pg_mbstrlen_with_len(p, len) - n;
5608 66 : off = pg_mbcharcliplen(p, len, n);
5609 :
5610 66 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5611 : }
5612 :
5613 : /*
5614 : * Return reversed string
5615 : */
5616 : Datum
5617 6 : text_reverse(PG_FUNCTION_ARGS)
5618 : {
5619 6 : text *str = PG_GETARG_TEXT_PP(0);
5620 6 : const char *p = VARDATA_ANY(str);
5621 6 : int len = VARSIZE_ANY_EXHDR(str);
5622 6 : const char *endp = p + len;
5623 : text *result;
5624 : char *dst;
5625 :
5626 6 : result = palloc(len + VARHDRSZ);
5627 6 : dst = (char *) VARDATA(result) + len;
5628 6 : SET_VARSIZE(result, len + VARHDRSZ);
5629 :
5630 6 : if (pg_database_encoding_max_length() > 1)
5631 : {
5632 : /* multibyte version */
5633 36 : while (p < endp)
5634 : {
5635 : int sz;
5636 :
5637 30 : sz = pg_mblen(p);
5638 30 : dst -= sz;
5639 30 : memcpy(dst, p, sz);
5640 30 : p += sz;
5641 : }
5642 : }
5643 : else
5644 : {
5645 : /* single byte version */
5646 0 : while (p < endp)
5647 0 : *(--dst) = *p++;
5648 : }
5649 :
5650 6 : PG_RETURN_TEXT_P(result);
5651 : }
5652 :
5653 :
/*
 * Support macros for text_format()
 */

/* Flag bit: the '-' flag was given in the format specifier */
#define TEXT_FORMAT_FLAG_MINUS	0x0001

/*
 * Step ptr forward by one byte, raising an error if that leaves nothing
 * before end_ptr.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (!(++(ptr) < (end_ptr))) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5667 :
5668 : /*
5669 : * Returns a formatted string
5670 : */
5671 : Datum
5672 85012 : text_format(PG_FUNCTION_ARGS)
5673 : {
5674 : text *fmt;
5675 : StringInfoData str;
5676 : const char *cp;
5677 : const char *start_ptr;
5678 : const char *end_ptr;
5679 : text *result;
5680 : int arg;
5681 : bool funcvariadic;
5682 : int nargs;
5683 85012 : Datum *elements = NULL;
5684 85012 : bool *nulls = NULL;
5685 85012 : Oid element_type = InvalidOid;
5686 85012 : Oid prev_type = InvalidOid;
5687 85012 : Oid prev_width_type = InvalidOid;
5688 : FmgrInfo typoutputfinfo;
5689 : FmgrInfo typoutputinfo_width;
5690 :
5691 : /* When format string is null, immediately return null */
5692 85012 : if (PG_ARGISNULL(0))
5693 6 : PG_RETURN_NULL();
5694 :
5695 : /* If argument is marked VARIADIC, expand array into elements */
5696 85006 : if (get_fn_expr_variadic(fcinfo->flinfo))
5697 : {
5698 : ArrayType *arr;
5699 : int16 elmlen;
5700 : bool elmbyval;
5701 : char elmalign;
5702 : int nitems;
5703 :
5704 : /* Should have just the one argument */
5705 : Assert(PG_NARGS() == 2);
5706 :
5707 : /* If argument is NULL, we treat it as zero-length array */
5708 48 : if (PG_ARGISNULL(1))
5709 6 : nitems = 0;
5710 : else
5711 : {
5712 : /*
5713 : * Non-null argument had better be an array. We assume that any
5714 : * call context that could let get_fn_expr_variadic return true
5715 : * will have checked that a VARIADIC-labeled parameter actually is
5716 : * an array. So it should be okay to just Assert that it's an
5717 : * array rather than doing a full-fledged error check.
5718 : */
5719 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5720 :
5721 : /* OK, safe to fetch the array value */
5722 42 : arr = PG_GETARG_ARRAYTYPE_P(1);
5723 :
5724 : /* Get info about array element type */
5725 42 : element_type = ARR_ELEMTYPE(arr);
5726 42 : get_typlenbyvalalign(element_type,
5727 : &elmlen, &elmbyval, &elmalign);
5728 :
5729 : /* Extract all array elements */
5730 42 : deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5731 : &elements, &nulls, &nitems);
5732 : }
5733 :
5734 48 : nargs = nitems + 1;
5735 48 : funcvariadic = true;
5736 : }
5737 : else
5738 : {
5739 : /* Non-variadic case, we'll process the arguments individually */
5740 84958 : nargs = PG_NARGS();
5741 84958 : funcvariadic = false;
5742 : }
5743 :
5744 : /* Setup for main loop. */
5745 85006 : fmt = PG_GETARG_TEXT_PP(0);
5746 85006 : start_ptr = VARDATA_ANY(fmt);
5747 85006 : end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5748 85006 : initStringInfo(&str);
5749 85006 : arg = 1; /* next argument position to print */
5750 :
5751 : /* Scan format string, looking for conversion specifiers. */
5752 846406 : for (cp = start_ptr; cp < end_ptr; cp++)
5753 : {
5754 : int argpos;
5755 : int widthpos;
5756 : int flags;
5757 : int width;
5758 : Datum value;
5759 : bool isNull;
5760 : Oid typid;
5761 :
5762 : /*
5763 : * If it's not the start of a conversion specifier, just copy it to
5764 : * the output buffer.
5765 : */
5766 761460 : if (*cp != '%')
5767 : {
5768 642456 : appendStringInfoCharMacro(&str, *cp);
5769 642474 : continue;
5770 : }
5771 :
5772 119004 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5773 :
5774 : /* Easy case: %% outputs a single % */
5775 119004 : if (*cp == '%')
5776 : {
5777 18 : appendStringInfoCharMacro(&str, *cp);
5778 18 : continue;
5779 : }
5780 :
5781 : /* Parse the optional portions of the format specifier */
5782 118986 : cp = text_format_parse_format(cp, end_ptr,
5783 : &argpos, &widthpos,
5784 : &flags, &width);
5785 :
5786 : /*
5787 : * Next we should see the main conversion specifier. Whether or not
5788 : * an argument position was present, it's known that at least one
5789 : * character remains in the string at this point. Experience suggests
5790 : * that it's worth checking that that character is one of the expected
5791 : * ones before we try to fetch arguments, so as to produce the least
5792 : * confusing response to a mis-formatted specifier.
5793 : */
5794 118962 : if (strchr("sIL", *cp) == NULL)
5795 6 : ereport(ERROR,
5796 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5797 : errmsg("unrecognized format() type specifier \"%.*s\"",
5798 : pg_mblen(cp), cp),
5799 : errhint("For a single \"%%\" use \"%%%%\".")));
5800 :
5801 : /* If indirect width was specified, get its value */
5802 118956 : if (widthpos >= 0)
5803 : {
5804 : /* Collect the specified or next argument position */
5805 42 : if (widthpos > 0)
5806 36 : arg = widthpos;
5807 42 : if (arg >= nargs)
5808 0 : ereport(ERROR,
5809 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5810 : errmsg("too few arguments for format()")));
5811 :
5812 : /* Get the value and type of the selected argument */
5813 42 : if (!funcvariadic)
5814 : {
5815 42 : value = PG_GETARG_DATUM(arg);
5816 42 : isNull = PG_ARGISNULL(arg);
5817 42 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5818 : }
5819 : else
5820 : {
5821 0 : value = elements[arg - 1];
5822 0 : isNull = nulls[arg - 1];
5823 0 : typid = element_type;
5824 : }
5825 42 : if (!OidIsValid(typid))
5826 0 : elog(ERROR, "could not determine data type of format() input");
5827 :
5828 42 : arg++;
5829 :
5830 : /* We can treat NULL width the same as zero */
5831 42 : if (isNull)
5832 6 : width = 0;
5833 36 : else if (typid == INT4OID)
5834 36 : width = DatumGetInt32(value);
5835 0 : else if (typid == INT2OID)
5836 0 : width = DatumGetInt16(value);
5837 : else
5838 : {
5839 : /* For less-usual datatypes, convert to text then to int */
5840 : char *str;
5841 :
5842 0 : if (typid != prev_width_type)
5843 : {
5844 : Oid typoutputfunc;
5845 : bool typIsVarlena;
5846 :
5847 0 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5848 0 : fmgr_info(typoutputfunc, &typoutputinfo_width);
5849 0 : prev_width_type = typid;
5850 : }
5851 :
5852 0 : str = OutputFunctionCall(&typoutputinfo_width, value);
5853 :
5854 : /* pg_strtoint32 will complain about bad data or overflow */
5855 0 : width = pg_strtoint32(str);
5856 :
5857 0 : pfree(str);
5858 : }
5859 : }
5860 :
5861 : /* Collect the specified or next argument position */
5862 118956 : if (argpos > 0)
5863 132 : arg = argpos;
5864 118956 : if (arg >= nargs)
5865 24 : ereport(ERROR,
5866 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5867 : errmsg("too few arguments for format()")));
5868 :
5869 : /* Get the value and type of the selected argument */
5870 118932 : if (!funcvariadic)
5871 : {
5872 117660 : value = PG_GETARG_DATUM(arg);
5873 117660 : isNull = PG_ARGISNULL(arg);
5874 117660 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5875 : }
5876 : else
5877 : {
5878 1272 : value = elements[arg - 1];
5879 1272 : isNull = nulls[arg - 1];
5880 1272 : typid = element_type;
5881 : }
5882 118932 : if (!OidIsValid(typid))
5883 0 : elog(ERROR, "could not determine data type of format() input");
5884 :
5885 118932 : arg++;
5886 :
5887 : /*
5888 : * Get the appropriate typOutput function, reusing previous one if
5889 : * same type as previous argument. That's particularly useful in the
5890 : * variadic-array case, but often saves work even for ordinary calls.
5891 : */
5892 118932 : if (typid != prev_type)
5893 : {
5894 : Oid typoutputfunc;
5895 : bool typIsVarlena;
5896 :
5897 87886 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5898 87886 : fmgr_info(typoutputfunc, &typoutputfinfo);
5899 87886 : prev_type = typid;
5900 : }
5901 :
5902 : /*
5903 : * And now we can format the value.
5904 : */
5905 118932 : switch (*cp)
5906 : {
5907 118932 : case 's':
5908 : case 'I':
5909 : case 'L':
5910 118932 : text_format_string_conversion(&str, *cp, &typoutputfinfo,
5911 : value, isNull,
5912 : flags, width);
5913 118926 : break;
5914 0 : default:
5915 : /* should not get here, because of previous check */
5916 0 : ereport(ERROR,
5917 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5918 : errmsg("unrecognized format() type specifier \"%.*s\"",
5919 : pg_mblen(cp), cp),
5920 : errhint("For a single \"%%\" use \"%%%%\".")));
5921 : break;
5922 : }
5923 : }
5924 :
5925 : /* Don't need deconstruct_array results anymore. */
5926 84946 : if (elements != NULL)
5927 42 : pfree(elements);
5928 84946 : if (nulls != NULL)
5929 42 : pfree(nulls);
5930 :
5931 : /* Generate results. */
5932 84946 : result = cstring_to_text_with_len(str.data, str.len);
5933 84946 : pfree(str.data);
5934 :
5935 84946 : PG_RETURN_TEXT_P(result);
5936 : }
5937 :
5938 : /*
5939 : * Parse contiguous digits as a decimal number.
5940 : *
5941 : * Returns true if some digits could be parsed.
5942 : * The value is returned into *value, and *ptr is advanced to the next
5943 : * character to be parsed.
5944 : *
5945 : * Note parsing invariant: at least one character is known available before
5946 : * string end (end_ptr) at entry, and this is still true at exit.
5947 : */
5948 : static bool
5949 177936 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5950 : {
5951 177936 : bool found = false;
5952 177936 : const char *cp = *ptr;
5953 177936 : int val = 0;
5954 :
5955 298248 : while (*cp >= '0' && *cp <= '9')
5956 : {
5957 120318 : int8 digit = (*cp - '0');
5958 :
5959 120318 : if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5960 120318 : unlikely(pg_add_s32_overflow(val, digit, &val)))
5961 0 : ereport(ERROR,
5962 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5963 : errmsg("number is out of range")));
5964 120318 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5965 120312 : found = true;
5966 : }
5967 :
5968 177930 : *ptr = cp;
5969 177930 : *value = val;
5970 :
5971 177930 : return found;
5972 : }
5973 :
5974 : /*
5975 : * Parse a format specifier (generally following the SUS printf spec).
5976 : *
5977 : * We have already advanced over the initial '%', and we are looking for
5978 : * [argpos][flags][width]type (but the type character is not consumed here).
5979 : *
5980 : * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5981 : * Output parameters:
5982 : * argpos: argument position for value to be printed. -1 means unspecified.
5983 : * widthpos: argument position for width. Zero means the argument position
5984 : * was unspecified (ie, take the next arg) and -1 means no width
5985 : * argument (width was omitted or specified as a constant).
5986 : * flags: bitmask of flags.
5987 : * width: directly-specified width value. Zero means the width was omitted
5988 : * (note it's not necessary to distinguish this case from an explicit
5989 : * zero width value).
5990 : *
5991 : * The function result is the next character position to be parsed, ie, the
5992 : * location where the type character is/should be.
5993 : *
5994 : * Note parsing invariant: at least one character is known available before
5995 : * string end (end_ptr) at entry, and this is still true at exit.
5996 : */
5997 : static const char *
5998 118986 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
5999 : int *argpos, int *widthpos,
6000 : int *flags, int *width)
6001 : {
6002 118986 : const char *cp = start_ptr;
6003 : int n;
6004 :
6005 : /* set defaults for output parameters */
6006 118986 : *argpos = -1;
6007 118986 : *widthpos = -1;
6008 118986 : *flags = 0;
6009 118986 : *width = 0;
6010 :
6011 : /* try to identify first number */
6012 118986 : if (text_format_parse_digits(&cp, end_ptr, &n))
6013 : {
6014 60174 : if (*cp != '$')
6015 : {
6016 : /* Must be just a width and a type, so we're done */
6017 60024 : *width = n;
6018 60024 : return cp;
6019 : }
6020 : /* The number was argument position */
6021 150 : *argpos = n;
6022 : /* Explicit 0 for argument index is immediately refused */
6023 150 : if (n == 0)
6024 6 : ereport(ERROR,
6025 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6026 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
6027 144 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6028 : }
6029 :
6030 : /* Handle flags (only minus is supported now) */
6031 58980 : while (*cp == '-')
6032 : {
6033 30 : *flags |= TEXT_FORMAT_FLAG_MINUS;
6034 30 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6035 : }
6036 :
6037 58950 : if (*cp == '*')
6038 : {
6039 : /* Handle indirect width */
6040 48 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6041 48 : if (text_format_parse_digits(&cp, end_ptr, &n))
6042 : {
6043 : /* number in this position must be closed by $ */
6044 42 : if (*cp != '$')
6045 0 : ereport(ERROR,
6046 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6047 : errmsg("width argument position must be ended by \"$\"")));
6048 : /* The number was width argument position */
6049 42 : *widthpos = n;
6050 : /* Explicit 0 for argument index is immediately refused */
6051 42 : if (n == 0)
6052 6 : ereport(ERROR,
6053 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6054 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
6055 36 : ADVANCE_PARSE_POINTER(cp, end_ptr);
6056 : }
6057 : else
6058 6 : *widthpos = 0; /* width's argument position is unspecified */
6059 : }
6060 : else
6061 : {
6062 : /* Check for direct width specification */
6063 58902 : if (text_format_parse_digits(&cp, end_ptr, &n))
6064 30 : *width = n;
6065 : }
6066 :
6067 : /* cp should now be pointing at type character */
6068 58938 : return cp;
6069 : }
6070 :
6071 : /*
6072 : * Format a %s, %I, or %L conversion
6073 : */
6074 : static void
6075 118932 : text_format_string_conversion(StringInfo buf, char conversion,
6076 : FmgrInfo *typOutputInfo,
6077 : Datum value, bool isNull,
6078 : int flags, int width)
6079 : {
6080 : char *str;
6081 :
6082 : /* Handle NULL arguments before trying to stringify the value. */
6083 118932 : if (isNull)
6084 : {
6085 306 : if (conversion == 's')
6086 234 : text_format_append_string(buf, "", flags, width);
6087 72 : else if (conversion == 'L')
6088 66 : text_format_append_string(buf, "NULL", flags, width);
6089 6 : else if (conversion == 'I')
6090 6 : ereport(ERROR,
6091 : (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6092 : errmsg("null values cannot be formatted as an SQL identifier")));
6093 300 : return;
6094 : }
6095 :
6096 : /* Stringify. */
6097 118626 : str = OutputFunctionCall(typOutputInfo, value);
6098 :
6099 : /* Escape. */
6100 118626 : if (conversion == 'I')
6101 : {
6102 : /* quote_identifier may or may not allocate a new string. */
6103 3106 : text_format_append_string(buf, quote_identifier(str), flags, width);
6104 : }
6105 115520 : else if (conversion == 'L')
6106 : {
6107 2578 : char *qstr = quote_literal_cstr(str);
6108 :
6109 2578 : text_format_append_string(buf, qstr, flags, width);
6110 : /* quote_literal_cstr() always allocates a new string */
6111 2578 : pfree(qstr);
6112 : }
6113 : else
6114 112942 : text_format_append_string(buf, str, flags, width);
6115 :
6116 : /* Cleanup. */
6117 118626 : pfree(str);
6118 : }
6119 :
6120 : /*
6121 : * Append str to buf, padding as directed by flags/width
6122 : */
6123 : static void
6124 118926 : text_format_append_string(StringInfo buf, const char *str,
6125 : int flags, int width)
6126 : {
6127 118926 : bool align_to_left = false;
6128 : int len;
6129 :
6130 : /* fast path for typical easy case */
6131 118926 : if (width == 0)
6132 : {
6133 58842 : appendStringInfoString(buf, str);
6134 58842 : return;
6135 : }
6136 :
6137 60084 : if (width < 0)
6138 : {
6139 : /* Negative width: implicit '-' flag, then take absolute value */
6140 6 : align_to_left = true;
6141 : /* -INT_MIN is undefined */
6142 6 : if (width <= INT_MIN)
6143 0 : ereport(ERROR,
6144 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6145 : errmsg("number is out of range")));
6146 6 : width = -width;
6147 : }
6148 60078 : else if (flags & TEXT_FORMAT_FLAG_MINUS)
6149 24 : align_to_left = true;
6150 :
6151 60084 : len = pg_mbstrlen(str);
6152 60084 : if (align_to_left)
6153 : {
6154 : /* left justify */
6155 30 : appendStringInfoString(buf, str);
6156 30 : if (len < width)
6157 30 : appendStringInfoSpaces(buf, width - len);
6158 : }
6159 : else
6160 : {
6161 : /* right justify */
6162 60054 : if (len < width)
6163 60054 : appendStringInfoSpaces(buf, width - len);
6164 60054 : appendStringInfoString(buf, str);
6165 : }
6166 : }
6167 :
6168 : /*
6169 : * text_format_nv - nonvariadic wrapper for text_format function.
6170 : *
6171 : * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6172 : * which checks that all built-in functions that share the implementing C
6173 : * function take the same number of arguments.
6174 : */
6175 : Datum
6176 30 : text_format_nv(PG_FUNCTION_ARGS)
6177 : {
6178 30 : return text_format(fcinfo);
6179 : }
6180 :
/*
 * Helper function for Levenshtein distance functions: report whether the
 * first len bytes of s1 and s2 are identical.  Faster than memcmp(), for
 * this use case.
 *
 * A len of zero or less compares nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			i;

	/* compare from the last byte backward, stopping at the first mismatch */
	for (i = len; i > 0; i--)
	{
		if (s1[i - 1] != s2[i - 1])
			return false;
	}

	return true;
}
6196 :
6197 : /* Expand each Levenshtein distance variant */
6198 : #include "levenshtein.c"
6199 : #define LEVENSHTEIN_LESS_EQUAL
6200 : #include "levenshtein.c"
6201 :
6202 :
6203 : /*
6204 : * Unicode support
6205 : */
6206 :
6207 : static UnicodeNormalizationForm
6208 186 : unicode_norm_form_from_string(const char *formstr)
6209 : {
6210 186 : UnicodeNormalizationForm form = -1;
6211 :
6212 : /*
6213 : * Might as well check this while we're here.
6214 : */
6215 186 : if (GetDatabaseEncoding() != PG_UTF8)
6216 0 : ereport(ERROR,
6217 : (errcode(ERRCODE_SYNTAX_ERROR),
6218 : errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6219 :
6220 186 : if (pg_strcasecmp(formstr, "NFC") == 0)
6221 66 : form = UNICODE_NFC;
6222 120 : else if (pg_strcasecmp(formstr, "NFD") == 0)
6223 36 : form = UNICODE_NFD;
6224 84 : else if (pg_strcasecmp(formstr, "NFKC") == 0)
6225 36 : form = UNICODE_NFKC;
6226 48 : else if (pg_strcasecmp(formstr, "NFKD") == 0)
6227 36 : form = UNICODE_NFKD;
6228 : else
6229 12 : ereport(ERROR,
6230 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6231 : errmsg("invalid normalization form: %s", formstr)));
6232 :
6233 174 : return form;
6234 : }
6235 :
6236 : Datum
6237 48 : unicode_normalize_func(PG_FUNCTION_ARGS)
6238 : {
6239 48 : text *input = PG_GETARG_TEXT_PP(0);
6240 48 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6241 : UnicodeNormalizationForm form;
6242 : int size;
6243 : pg_wchar *input_chars;
6244 : pg_wchar *output_chars;
6245 : unsigned char *p;
6246 : text *result;
6247 : int i;
6248 :
6249 48 : form = unicode_norm_form_from_string(formstr);
6250 :
6251 : /* convert to pg_wchar */
6252 42 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6253 42 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6254 42 : p = (unsigned char *) VARDATA_ANY(input);
6255 168 : for (i = 0; i < size; i++)
6256 : {
6257 126 : input_chars[i] = utf8_to_unicode(p);
6258 126 : p += pg_utf_mblen(p);
6259 : }
6260 42 : input_chars[i] = (pg_wchar) '\0';
6261 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6262 :
6263 : /* action */
6264 42 : output_chars = unicode_normalize(form, input_chars);
6265 :
6266 : /* convert back to UTF-8 string */
6267 42 : size = 0;
6268 162 : for (pg_wchar *wp = output_chars; *wp; wp++)
6269 : {
6270 : unsigned char buf[4];
6271 :
6272 120 : unicode_to_utf8(*wp, buf);
6273 120 : size += pg_utf_mblen(buf);
6274 : }
6275 :
6276 42 : result = palloc(size + VARHDRSZ);
6277 42 : SET_VARSIZE(result, size + VARHDRSZ);
6278 :
6279 42 : p = (unsigned char *) VARDATA_ANY(result);
6280 162 : for (pg_wchar *wp = output_chars; *wp; wp++)
6281 : {
6282 120 : unicode_to_utf8(*wp, p);
6283 120 : p += pg_utf_mblen(p);
6284 : }
6285 : Assert((char *) p == (char *) result + size + VARHDRSZ);
6286 :
6287 42 : PG_RETURN_TEXT_P(result);
6288 : }
6289 :
6290 : /*
6291 : * Check whether the string is in the specified Unicode normalization form.
6292 : *
6293 : * This is done by converting the string to the specified normal form and then
6294 : * comparing that to the original string. To speed that up, we also apply the
6295 : * "quick check" algorithm specified in UAX #15, which can give a yes or no
6296 : * answer for many strings by just scanning the string once.
6297 : *
6298 : * This function should generally be optimized for the case where the string
6299 : * is in fact normalized. In that case, we'll end up looking at the entire
6300 : * string, so it's probably not worth doing any incremental conversion etc.
6301 : */
6302 : Datum
6303 138 : unicode_is_normalized(PG_FUNCTION_ARGS)
6304 : {
6305 138 : text *input = PG_GETARG_TEXT_PP(0);
6306 138 : char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6307 : UnicodeNormalizationForm form;
6308 : int size;
6309 : pg_wchar *input_chars;
6310 : pg_wchar *output_chars;
6311 : unsigned char *p;
6312 : int i;
6313 : UnicodeNormalizationQC quickcheck;
6314 : int output_size;
6315 : bool result;
6316 :
6317 138 : form = unicode_norm_form_from_string(formstr);
6318 :
6319 : /* convert to pg_wchar */
6320 132 : size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6321 132 : input_chars = palloc((size + 1) * sizeof(pg_wchar));
6322 132 : p = (unsigned char *) VARDATA_ANY(input);
6323 504 : for (i = 0; i < size; i++)
6324 : {
6325 372 : input_chars[i] = utf8_to_unicode(p);
6326 372 : p += pg_utf_mblen(p);
6327 : }
6328 132 : input_chars[i] = (pg_wchar) '\0';
6329 : Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6330 :
6331 : /* quick check (see UAX #15) */
6332 132 : quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6333 132 : if (quickcheck == UNICODE_NORM_QC_YES)
6334 42 : PG_RETURN_BOOL(true);
6335 90 : else if (quickcheck == UNICODE_NORM_QC_NO)
6336 12 : PG_RETURN_BOOL(false);
6337 :
6338 : /* normalize and compare with original */
6339 78 : output_chars = unicode_normalize(form, input_chars);
6340 :
6341 78 : output_size = 0;
6342 324 : for (pg_wchar *wp = output_chars; *wp; wp++)
6343 246 : output_size++;
6344 :
6345 114 : result = (size == output_size) &&
6346 36 : (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6347 :
6348 78 : PG_RETURN_BOOL(result);
6349 : }
6350 :
/*
 * Report whether the first n chars of instr are all hexadecimal digits.
 *
 * Scanning stops at the first non-hex character; n == 0 yields true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	size_t		remaining = n;

	while (remaining > 0)
	{
		/* cast keeps isxdigit() away from negative char values */
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
		remaining--;
	}

	return true;
}
6363 :
6364 : static unsigned int
6365 504 : hexval(unsigned char c)
6366 : {
6367 504 : if (c >= '0' && c <= '9')
6368 384 : return c - '0';
6369 120 : if (c >= 'a' && c <= 'f')
6370 60 : return c - 'a' + 0xA;
6371 60 : if (c >= 'A' && c <= 'F')
6372 60 : return c - 'A' + 0xA;
6373 0 : elog(ERROR, "invalid hexadecimal digit");
6374 : return 0; /* not reached */
6375 : }
6376 :
/*
 * Translate the first n hexadecimal digits of instr to a number.
 *
 * The leftmost digit is the most significant: digit i contributes its value
 * shifted left by 4 * (n - i - 1) bits.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int total = 0;
	size_t		shift = 4 * n;
	const char *digit = instr;

	while (shift > 0)
	{
		/* drop to this digit's bit position, then add it in */
		shift -= 4;
		total += hexval(*digit) << shift;
		digit++;
	}

	return total;
}
6390 :
6391 : /*
6392 : * Replaces Unicode escape sequences by Unicode characters
6393 : */
6394 : Datum
6395 66 : unistr(PG_FUNCTION_ARGS)
6396 : {
6397 66 : text *input_text = PG_GETARG_TEXT_PP(0);
6398 : char *instr;
6399 : int len;
6400 : StringInfoData str;
6401 : text *result;
6402 66 : pg_wchar pair_first = 0;
6403 : char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6404 :
6405 66 : instr = VARDATA_ANY(input_text);
6406 66 : len = VARSIZE_ANY_EXHDR(input_text);
6407 :
6408 66 : initStringInfo(&str);
6409 :
6410 510 : while (len > 0)
6411 : {
6412 486 : if (instr[0] == '\\')
6413 : {
6414 102 : if (len >= 2 &&
6415 102 : instr[1] == '\\')
6416 : {
6417 6 : if (pair_first)
6418 0 : goto invalid_pair;
6419 6 : appendStringInfoChar(&str, '\\');
6420 6 : instr += 2;
6421 6 : len -= 2;
6422 : }
6423 96 : else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6424 66 : (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6425 30 : {
6426 : pg_wchar unicode;
6427 42 : int offset = instr[1] == 'u' ? 2 : 1;
6428 :
6429 42 : unicode = hexval_n(instr + offset, 4);
6430 :
6431 42 : if (!is_valid_unicode_codepoint(unicode))
6432 0 : ereport(ERROR,
6433 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6434 : errmsg("invalid Unicode code point: %04X", unicode));
6435 :
6436 42 : if (pair_first)
6437 : {
6438 12 : if (is_utf16_surrogate_second(unicode))
6439 : {
6440 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6441 0 : pair_first = 0;
6442 : }
6443 : else
6444 12 : goto invalid_pair;
6445 : }
6446 30 : else if (is_utf16_surrogate_second(unicode))
6447 0 : goto invalid_pair;
6448 :
6449 30 : if (is_utf16_surrogate_first(unicode))
6450 18 : pair_first = unicode;
6451 : else
6452 : {
6453 12 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6454 12 : appendStringInfoString(&str, cbuf);
6455 : }
6456 :
6457 30 : instr += 4 + offset;
6458 30 : len -= 4 + offset;
6459 : }
6460 54 : else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6461 12 : {
6462 : pg_wchar unicode;
6463 :
6464 24 : unicode = hexval_n(instr + 2, 6);
6465 :
6466 24 : if (!is_valid_unicode_codepoint(unicode))
6467 6 : ereport(ERROR,
6468 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6469 : errmsg("invalid Unicode code point: %04X", unicode));
6470 :
6471 18 : if (pair_first)
6472 : {
6473 6 : if (is_utf16_surrogate_second(unicode))
6474 : {
6475 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6476 0 : pair_first = 0;
6477 : }
6478 : else
6479 6 : goto invalid_pair;
6480 : }
6481 12 : else if (is_utf16_surrogate_second(unicode))
6482 0 : goto invalid_pair;
6483 :
6484 12 : if (is_utf16_surrogate_first(unicode))
6485 6 : pair_first = unicode;
6486 : else
6487 : {
6488 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6489 6 : appendStringInfoString(&str, cbuf);
6490 : }
6491 :
6492 12 : instr += 8;
6493 12 : len -= 8;
6494 : }
6495 30 : else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6496 12 : {
6497 : pg_wchar unicode;
6498 :
6499 24 : unicode = hexval_n(instr + 2, 8);
6500 :
6501 24 : if (!is_valid_unicode_codepoint(unicode))
6502 6 : ereport(ERROR,
6503 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6504 : errmsg("invalid Unicode code point: %04X", unicode));
6505 :
6506 18 : if (pair_first)
6507 : {
6508 6 : if (is_utf16_surrogate_second(unicode))
6509 : {
6510 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6511 0 : pair_first = 0;
6512 : }
6513 : else
6514 6 : goto invalid_pair;
6515 : }
6516 12 : else if (is_utf16_surrogate_second(unicode))
6517 0 : goto invalid_pair;
6518 :
6519 12 : if (is_utf16_surrogate_first(unicode))
6520 6 : pair_first = unicode;
6521 : else
6522 : {
6523 6 : pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6524 6 : appendStringInfoString(&str, cbuf);
6525 : }
6526 :
6527 12 : instr += 10;
6528 12 : len -= 10;
6529 : }
6530 : else
6531 6 : ereport(ERROR,
6532 : (errcode(ERRCODE_SYNTAX_ERROR),
6533 : errmsg("invalid Unicode escape"),
6534 : errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6535 : }
6536 : else
6537 : {
6538 384 : if (pair_first)
6539 0 : goto invalid_pair;
6540 :
6541 384 : appendStringInfoChar(&str, *instr++);
6542 384 : len--;
6543 : }
6544 : }
6545 :
6546 : /* unfinished surrogate pair? */
6547 24 : if (pair_first)
6548 6 : goto invalid_pair;
6549 :
6550 18 : result = cstring_to_text_with_len(str.data, str.len);
6551 18 : pfree(str.data);
6552 :
6553 18 : PG_RETURN_TEXT_P(result);
6554 :
6555 30 : invalid_pair:
6556 30 : ereport(ERROR,
6557 : (errcode(ERRCODE_SYNTAX_ERROR),
6558 : errmsg("invalid Unicode surrogate pair")));
6559 : PG_RETURN_NULL(); /* keep compiler quiet */
6560 : }
|