Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wchar.c
4 : * Functions for working with multibyte characters in various encodings.
5 : *
6 : * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/common/wchar.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #include <limits.h>
16 :
17 : #include "mb/pg_wchar.h"
18 : #include "utils/ascii.h"
19 :
20 :
21 : /*
22 : * In today's multibyte encodings other than UTF8, this two-byte sequence
23 : * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
24 : *
25 : * For historical reasons, several verifychar implementations opt to reject
26 : * this pair specifically. Byte pair range constraints, in encoding
27 : * originator documentation, always excluded this pair. No core conversion
28 : * could translate it. However, longstanding verifychar implementations
29 : * accepted any non-NUL byte. big5_to_euc_tw even translates pairs not
30 : * valid per encoding originator documentation. To avoid tightening core
31 : * or non-core conversions in a security patch, we sought this one pair.
32 : *
33 : * PQescapeString() historically used spaces for BYTE1; many other values
34 : * could suffice for BYTE1.
35 : */
36 : #define NONUTF8_INVALID_BYTE0 (0x8d)
37 : #define NONUTF8_INVALID_BYTE1 (' ')
38 :
39 :
40 : /*
41 : * Operations on multi-byte encodings are driven by a table of helper
42 : * functions.
43 : *
44 : * To add support for an encoding, define mblen(), dsplen(), verifychar() and
45 : * verifystr() for it. For server encodings, also define mb2wchar()
46 : * and wchar2mb() conversion functions.
47 : *
48 : * These functions generally assume that their input is validly formed.
49 : * The "verifier" functions, further down in the file, have to be more
50 : * paranoid.
51 : *
52 : * We expect that mblen() does not need to examine more than the first byte
53 : * of the character to discover the correct length. GB18030 is an exception
54 : * to that rule, though, as it also looks at second byte. But even that
55 : * behaves in a predictable way, if you only pass the first byte: it will
56 : * treat 4-byte encoded characters as two 2-byte encoded characters, which is
57 : * good enough for all current uses.
58 : *
59 : * Note: for the display output of psql to work properly, the return values
60 : * of the dsplen functions must conform to the Unicode standard. In particular
61 : * the NUL character is zero width and control characters are generally
62 : * width -1. It is recommended that non-ASCII encodings refer their ASCII
63 : * subset to the ASCII routines to ensure consistency.
64 : */
65 :
/*
 * No error-reporting facility.  Ignore incomplete trailing byte sequence.
 *
 * This expands to a bare "if (...) break", so it must be used as a statement
 * directly inside the loop that is to be abandoned when fewer than "need"
 * bytes remain.
 */
#define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break
68 :
69 : /*
70 : * SQL/ASCII
71 : */
72 : static int
73 433 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
74 : {
75 433 : int cnt = 0;
76 :
77 33279 : while (len > 0 && *from)
78 : {
79 32846 : *to++ = *from++;
80 32846 : len--;
81 32846 : cnt++;
82 : }
83 433 : *to = 0;
84 433 : return cnt;
85 : }
86 :
/* Byte length of a character: always 1, whatever "s" points at. */
static int
pg_ascii_mblen(const unsigned char *s)
{
	return 1;					/* the input byte is never examined */
}
92 :
/*
 * Display width of the single byte at "s": 0 for NUL, -1 for bytes below
 * 0x20 and for 0x7f, and 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
103 :
104 : /*
105 : * EUC
106 : */
107 : static int
108 32 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
109 : {
110 32 : int cnt = 0;
111 :
112 48 : while (len > 0 && *from)
113 : {
114 32 : if (*from == SS2) /* JIS X 0201 (so called "1 byte KANA") */
115 : {
116 8 : MB2CHAR_NEED_AT_LEAST(len, 2);
117 4 : from++;
118 4 : *to = (SS2 << 8) | *from++;
119 4 : len -= 2;
120 : }
121 24 : else if (*from == SS3) /* JIS X 0212 KANJI */
122 : {
123 12 : MB2CHAR_NEED_AT_LEAST(len, 3);
124 4 : from++;
125 4 : *to = (SS3 << 16) | (*from++ << 8);
126 4 : *to |= *from++;
127 4 : len -= 3;
128 : }
129 12 : else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
130 : {
131 8 : MB2CHAR_NEED_AT_LEAST(len, 2);
132 4 : *to = *from++ << 8;
133 4 : *to |= *from++;
134 4 : len -= 2;
135 : }
136 : else /* must be ASCII */
137 : {
138 4 : *to = *from++;
139 4 : len--;
140 : }
141 16 : to++;
142 16 : cnt++;
143 : }
144 32 : *to = 0;
145 32 : return cnt;
146 : }
147 :
148 : static inline int
149 156 : pg_euc_mblen(const unsigned char *s)
150 : {
151 : int len;
152 :
153 156 : if (*s == SS2)
154 0 : len = 2;
155 156 : else if (*s == SS3)
156 0 : len = 3;
157 156 : else if (IS_HIGHBIT_SET(*s))
158 108 : len = 2;
159 : else
160 48 : len = 1;
161 156 : return len;
162 : }
163 :
164 : static inline int
165 0 : pg_euc_dsplen(const unsigned char *s)
166 : {
167 : int len;
168 :
169 0 : if (*s == SS2)
170 0 : len = 2;
171 0 : else if (*s == SS3)
172 0 : len = 2;
173 0 : else if (IS_HIGHBIT_SET(*s))
174 0 : len = 2;
175 : else
176 0 : len = pg_ascii_dsplen(s);
177 0 : return len;
178 : }
179 :
180 : /*
181 : * EUC_JP
182 : */
183 : static int
184 32 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
185 : {
186 32 : return pg_euc2wchar_with_len(from, to, len);
187 : }
188 :
/* EUC_JP character byte length: same rule as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
194 :
195 : static int
196 0 : pg_eucjp_dsplen(const unsigned char *s)
197 : {
198 : int len;
199 :
200 0 : if (*s == SS2)
201 0 : len = 1;
202 0 : else if (*s == SS3)
203 0 : len = 2;
204 0 : else if (IS_HIGHBIT_SET(*s))
205 0 : len = 2;
206 : else
207 0 : len = pg_ascii_dsplen(s);
208 0 : return len;
209 : }
210 :
211 : /*
212 : * EUC_KR
213 : */
214 : static int
215 0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
216 : {
217 0 : return pg_euc2wchar_with_len(from, to, len);
218 : }
219 :
/* EUC_KR character byte length: same rule as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
225 :
/* EUC_KR display width: same rule as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
231 :
232 : /*
233 : * EUC_CN
234 : *
235 : */
236 : static int
237 36 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
238 : {
239 36 : int cnt = 0;
240 :
241 52 : while (len > 0 && *from)
242 : {
243 36 : if (*from == SS2) /* code set 2 (unused?) */
244 : {
245 12 : MB2CHAR_NEED_AT_LEAST(len, 3);
246 4 : from++;
247 4 : *to = (SS2 << 16) | (*from++ << 8);
248 4 : *to |= *from++;
249 4 : len -= 3;
250 : }
251 24 : else if (*from == SS3) /* code set 3 (unused ?) */
252 : {
253 12 : MB2CHAR_NEED_AT_LEAST(len, 3);
254 4 : from++;
255 4 : *to = (SS3 << 16) | (*from++ << 8);
256 4 : *to |= *from++;
257 4 : len -= 3;
258 : }
259 12 : else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
260 : {
261 8 : MB2CHAR_NEED_AT_LEAST(len, 2);
262 4 : *to = *from++ << 8;
263 4 : *to |= *from++;
264 4 : len -= 2;
265 : }
266 : else
267 : {
268 4 : *to = *from++;
269 4 : len--;
270 : }
271 16 : to++;
272 16 : cnt++;
273 : }
274 36 : *to = 0;
275 36 : return cnt;
276 : }
277 :
278 : /*
279 : * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
280 : * EUC_CN), but mb2wchar_with_len does. Tell a coherent story for code that
281 : * relies on agreement between mb2wchar_with_len and mblen. Invalid text
282 : * datums (e.g. from shared catalogs) reach this.
283 : */
284 : static int
285 4 : pg_euccn_mblen(const unsigned char *s)
286 : {
287 : int len;
288 :
289 4 : if (*s == SS2)
290 0 : len = 3;
291 4 : else if (*s == SS3)
292 0 : len = 3;
293 4 : else if (IS_HIGHBIT_SET(*s))
294 4 : len = 2;
295 : else
296 0 : len = 1;
297 4 : return len;
298 : }
299 :
/* EUC_CN display width: 2 for any high-bit lead byte, else the ASCII rule. */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
311 :
312 : /*
313 : * EUC_TW
314 : *
315 : */
316 : static int
317 40 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
318 : {
319 40 : int cnt = 0;
320 :
321 56 : while (len > 0 && *from)
322 : {
323 40 : if (*from == SS2) /* code set 2 */
324 : {
325 16 : MB2CHAR_NEED_AT_LEAST(len, 4);
326 4 : from++;
327 4 : *to = (((uint32) SS2) << 24) | (*from++ << 16);
328 4 : *to |= *from++ << 8;
329 4 : *to |= *from++;
330 4 : len -= 4;
331 : }
332 24 : else if (*from == SS3) /* code set 3 (unused?) */
333 : {
334 12 : MB2CHAR_NEED_AT_LEAST(len, 3);
335 4 : from++;
336 4 : *to = (SS3 << 16) | (*from++ << 8);
337 4 : *to |= *from++;
338 4 : len -= 3;
339 : }
340 12 : else if (IS_HIGHBIT_SET(*from)) /* code set 2 */
341 : {
342 8 : MB2CHAR_NEED_AT_LEAST(len, 2);
343 4 : *to = *from++ << 8;
344 4 : *to |= *from++;
345 4 : len -= 2;
346 : }
347 : else
348 : {
349 4 : *to = *from++;
350 4 : len--;
351 : }
352 16 : to++;
353 16 : cnt++;
354 : }
355 40 : *to = 0;
356 40 : return cnt;
357 : }
358 :
359 : static int
360 4 : pg_euctw_mblen(const unsigned char *s)
361 : {
362 : int len;
363 :
364 4 : if (*s == SS2)
365 0 : len = 4;
366 4 : else if (*s == SS3)
367 0 : len = 3;
368 4 : else if (IS_HIGHBIT_SET(*s))
369 4 : len = 2;
370 : else
371 0 : len = 1;
372 4 : return len;
373 : }
374 :
375 : static int
376 0 : pg_euctw_dsplen(const unsigned char *s)
377 : {
378 : int len;
379 :
380 0 : if (*s == SS2)
381 0 : len = 2;
382 0 : else if (*s == SS3)
383 0 : len = 2;
384 0 : else if (IS_HIGHBIT_SET(*s))
385 0 : len = 2;
386 : else
387 0 : len = pg_ascii_dsplen(s);
388 0 : return len;
389 : }
390 :
391 : /*
392 : * Convert pg_wchar to EUC_* encoding.
393 : * caller must allocate enough space for "to", including a trailing zero!
394 : * len: length of from.
395 : * "from" not necessarily null terminated.
396 : */
397 : static int
398 48 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
399 : {
400 48 : int cnt = 0;
401 :
402 96 : while (len > 0 && *from)
403 : {
404 : unsigned char c;
405 :
406 48 : if ((c = (*from >> 24)))
407 : {
408 4 : *to++ = c;
409 4 : *to++ = (*from >> 16) & 0xff;
410 4 : *to++ = (*from >> 8) & 0xff;
411 4 : *to++ = *from & 0xff;
412 4 : cnt += 4;
413 : }
414 44 : else if ((c = (*from >> 16)))
415 : {
416 16 : *to++ = c;
417 16 : *to++ = (*from >> 8) & 0xff;
418 16 : *to++ = *from & 0xff;
419 16 : cnt += 3;
420 : }
421 28 : else if ((c = (*from >> 8)))
422 : {
423 16 : *to++ = c;
424 16 : *to++ = *from & 0xff;
425 16 : cnt += 2;
426 : }
427 : else
428 : {
429 12 : *to++ = *from;
430 12 : cnt++;
431 : }
432 48 : from++;
433 48 : len--;
434 : }
435 48 : *to = 0;
436 48 : return cnt;
437 : }
438 :
439 :
440 : /*
441 : * JOHAB
442 : */
/* JOHAB character byte length: uses the generic EUC rule. */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
448 :
/* JOHAB display width: uses the generic EUC rule. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
454 :
455 : /*
456 : * convert UTF8 string to pg_wchar (UCS-4)
457 : * caller must allocate enough space for "to", including a trailing zero!
458 : * len: length of from.
459 : * "from" not necessarily null terminated.
460 : */
461 : static int
462 6879946 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
463 : {
464 6879946 : int cnt = 0;
465 : uint32 c1,
466 : c2,
467 : c3,
468 : c4;
469 :
470 106927058 : while (len > 0 && *from)
471 : {
472 100047140 : if ((*from & 0x80) == 0)
473 : {
474 100046461 : *to = *from++;
475 100046461 : len--;
476 : }
477 679 : else if ((*from & 0xe0) == 0xc0)
478 : {
479 345 : MB2CHAR_NEED_AT_LEAST(len, 2);
480 337 : c1 = *from++ & 0x1f;
481 337 : c2 = *from++ & 0x3f;
482 337 : *to = (c1 << 6) | c2;
483 337 : len -= 2;
484 : }
485 334 : else if ((*from & 0xf0) == 0xe0)
486 : {
487 174 : MB2CHAR_NEED_AT_LEAST(len, 3);
488 166 : c1 = *from++ & 0x0f;
489 166 : c2 = *from++ & 0x3f;
490 166 : c3 = *from++ & 0x3f;
491 166 : *to = (c1 << 12) | (c2 << 6) | c3;
492 166 : len -= 3;
493 : }
494 160 : else if ((*from & 0xf8) == 0xf0)
495 : {
496 16 : MB2CHAR_NEED_AT_LEAST(len, 4);
497 4 : c1 = *from++ & 0x07;
498 4 : c2 = *from++ & 0x3f;
499 4 : c3 = *from++ & 0x3f;
500 4 : c4 = *from++ & 0x3f;
501 4 : *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
502 4 : len -= 4;
503 : }
504 : else
505 : {
506 : /* treat a bogus char as length 1; not ours to raise error */
507 144 : *to = *from++;
508 144 : len--;
509 : }
510 100047112 : to++;
511 100047112 : cnt++;
512 : }
513 6879946 : *to = 0;
514 6879946 : return cnt;
515 : }
516 :
517 :
518 : /*
519 : * Trivial conversion from pg_wchar to UTF-8.
520 : * caller should allocate enough space for "to"
521 : * len: length of from.
522 : * "from" not necessarily null terminated.
523 : */
524 : static int
525 579537 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
526 : {
527 579537 : int cnt = 0;
528 :
529 8542733 : while (len > 0 && *from)
530 : {
531 : int char_len;
532 :
533 7963196 : unicode_to_utf8(*from, to);
534 7963196 : char_len = pg_utf_mblen(to);
535 7963196 : cnt += char_len;
536 7963196 : to += char_len;
537 7963196 : from++;
538 7963196 : len--;
539 : }
540 579537 : *to = 0;
541 579537 : return cnt;
542 : }
543 :
/*
 * Return the byte length of a UTF8 character pointed to by s
 *
 * Only the first byte is examined.
 *
 * Note: in the current implementation we do not support UTF8 sequences
 * of more than 4 bytes; hence do NOT return a value larger than 4.
 * We return "1" for any leading byte that is either flat-out illegal or
 * indicates a length larger than we support.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	/* illegal or unsupported lead byte */
	return 1;
}
578 :
579 : /*
580 : * This is an implementation of wcwidth() and wcswidth() as defined in
581 : * "The Single UNIX Specification, Version 2, The Open Group, 1997"
582 : * <http://www.unix.org/online.html>
583 : *
584 : * Markus Kuhn -- 2001-09-08 -- public domain
585 : *
586 : * customised for PostgreSQL
587 : *
588 : * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
589 : */
590 :
/* One closed range [first, last] of code points in a width lookup table. */
struct mbinterval
{
	unsigned int first;			/* lowest code point in the range */
	unsigned int last;			/* highest code point in the range */
};
596 :
597 : /* auxiliary function for binary search in interval table */
598 : static int
599 60018000 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
600 : {
601 60018000 : int min = 0;
602 : int mid;
603 :
604 60018000 : if (ucs < table[0].first || ucs > table[max].last)
605 60012354 : return 0;
606 49149 : while (max >= min)
607 : {
608 43983 : mid = (min + max) / 2;
609 43983 : if (ucs > table[mid].last)
610 9747 : min = mid + 1;
611 34236 : else if (ucs < table[mid].first)
612 33756 : max = mid - 1;
613 : else
614 480 : return 1;
615 : }
616 :
617 5166 : return 0;
618 : }
619 :
620 :
621 : /*
622 : * The following functions define the column width of an ISO 10646
623 : * character as follows:
624 : *
625 : * - The null character (U+0000) has a column width of 0.
626 : *
627 : * - Other C0/C1 control characters and DEL will lead to a return
628 : * value of -1.
629 : *
630 : * - Non-spacing and enclosing combining characters (general
631 : * category code Mn, Me or Cf in the Unicode database) have a
632 : * column width of 0.
633 : *
634 : * - Spacing characters in the East Asian Wide (W) or East Asian
635 : * FullWidth (F) category as defined in Unicode Technical
636 : * Report #11 have a column width of 2.
637 : *
638 : * - All remaining characters (including all printable
639 : * ISO 8859-1 and WGL4 characters, Unicode control characters,
640 : * etc.) have a column width of 1.
641 : *
642 : * This implementation assumes that wchar_t characters are encoded
643 : * in ISO 10646.
644 : */
645 :
646 : static int
647 30038587 : ucs_wcwidth(pg_wchar ucs)
648 : {
649 : #include "common/unicode_nonspacing_table.h"
650 : #include "common/unicode_east_asian_fw_table.h"
651 :
652 : /* test for 8-bit control characters */
653 30038587 : if (ucs == 0)
654 0 : return 0;
655 :
656 30038587 : if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
657 29425 : return -1;
658 :
659 : /*
660 : * binary search in table of non-spacing characters
661 : *
662 : * XXX: In the official Unicode sources, it is possible for a character to
663 : * be described as both non-spacing and wide at the same time. As of
664 : * Unicode 13.0, treating the non-spacing property as the determining
665 : * factor for display width leads to the correct behavior, so do that
666 : * search first.
667 : */
668 30009162 : if (mbbisearch(ucs, nonspacing,
669 : sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
670 324 : return 0;
671 :
672 : /* binary search in table of wide characters */
673 30008838 : if (mbbisearch(ucs, east_asian_fw,
674 : sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
675 156 : return 2;
676 :
677 30008682 : return 1;
678 : }
679 :
680 : static int
681 30038587 : pg_utf_dsplen(const unsigned char *s)
682 : {
683 30038587 : return ucs_wcwidth(utf8_to_unicode(s));
684 : }
685 :
686 : /*
687 : * ISO8859-1
688 : */
689 : static int
690 468 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
691 : {
692 468 : int cnt = 0;
693 :
694 13377 : while (len > 0 && *from)
695 : {
696 12909 : *to++ = *from++;
697 12909 : len--;
698 12909 : cnt++;
699 : }
700 468 : *to = 0;
701 468 : return cnt;
702 : }
703 :
704 : /*
705 : * Trivial conversion from pg_wchar to single byte encoding. Just ignores
706 : * high bits.
707 : * caller should allocate enough space for "to"
708 : * len: length of from.
709 : * "from" not necessarily null terminated.
710 : */
711 : static int
712 79 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
713 : {
714 79 : int cnt = 0;
715 :
716 678 : while (len > 0 && *from)
717 : {
718 599 : *to++ = *from++;
719 599 : len--;
720 599 : cnt++;
721 : }
722 79 : *to = 0;
723 79 : return cnt;
724 : }
725 :
/* Byte length of a character: always 1, whatever "s" points at. */
static int
pg_latin1_mblen(const unsigned char *s)
{
	return 1;					/* the input byte is never examined */
}
731 :
/* Display width of a single-byte character: defer to the ASCII rule. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
737 :
738 : /*
739 : * SJIS
740 : */
/*
 * Byte length of an SJIS character, judged from its first byte only:
 * 1 for 0xa1..0xdf, 2 for any other high-bit byte, otherwise 1.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;				/* 1 byte kana? */

	/* kanji if the high bit is set; should be ASCII otherwise */
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
754 :
/*
 * Display width of an SJIS character, judged from its first byte only:
 * 1 for 0xa1..0xdf, 2 for any other high-bit byte, otherwise the ASCII rule.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;				/* 1 byte kana? */

	/* kanji if the high bit is set; should be ASCII otherwise */
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
768 :
769 : /*
770 : * Big5
771 : */
/* Big5 character byte length: 2 if the first byte has its high bit set, else 1. */
static int
pg_big5_mblen(const unsigned char *s)
{
	/* kanji if the high bit is set; should be ASCII otherwise */
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
783 :
/* Big5 display width: 2 for a high-bit first byte, else the ASCII rule. */
static int
pg_big5_dsplen(const unsigned char *s)
{
	/* kanji if the high bit is set; should be ASCII otherwise */
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
795 :
796 : /*
797 : * GBK
798 : */
/* GBK character byte length: 2 if the first byte has its high bit set, else 1. */
static int
pg_gbk_mblen(const unsigned char *s)
{
	/* kanji if the high bit is set; should be ASCII otherwise */
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
810 :
/* GBK display width: 2 for a high-bit first byte, else the ASCII rule. */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	/* kanji if the high bit is set; should be ASCII otherwise */
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
822 :
823 : /*
824 : * UHC
825 : */
/* UHC character byte length: 2 if the first byte has its high bit set, else 1. */
static int
pg_uhc_mblen(const unsigned char *s)
{
	/* two bytes if the high bit is set; should be ASCII otherwise */
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
837 :
/* UHC display width: 2 for a high-bit first byte, else the ASCII rule. */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	/* two bytes if the high bit is set; should be ASCII otherwise */
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
849 :
850 : /*
851 : * GB18030
852 : * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
853 : */
854 :
/*
 * Byte length of a GB18030 character.
 *
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input.  However, if you only pass the first byte of a multi-byte
 * string, and \0 as the second byte, this still works in a predictable way:
 * a 4-byte character will be reported as two 2-byte characters.  That's
 * enough for all current uses, as a client-only encoding.  It works that
 * way, because in any valid 4-byte GB18030-encoded character, the third and
 * fourth byte look like a 2-byte encoded character, when looked at
 * separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	/* a second byte in 0x30..0x39 marks a four-byte character */
	if (s[1] >= 0x30 && s[1] <= 0x39)
		return 4;

	return 2;
}
878 :
/* GB18030 display width: 2 for a high-bit first byte, else the ASCII rule. */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
890 :
891 : /*
892 : *-------------------------------------------------------------------
893 : * multibyte sequence validators
894 : *
895 : * The verifychar functions accept "s", a pointer to the first byte of a
896 : * string, and "len", the remaining length of the string. If there is a
897 : * validly encoded character beginning at *s, return its length in bytes;
898 : * else return -1.
899 : *
900 : * The verifystr functions also accept "s", a pointer to a string and "len",
901 : * the length of the string. They verify the whole string, and return the
902 : * number of input bytes (<= len) that are valid. In other words, if the
903 : * whole string is valid, verifystr returns "len", otherwise it returns the
904 : * byte offset of the first invalid character. The verifystr functions must
905 : * test for and reject zeroes in the input.
906 : *
907 : * The verifychar functions can assume that len > 0 and that *s != '\0', but
908 : * they must test for and reject zeroes in any additional bytes of a
909 : * multibyte character. Note that this definition allows the function for a
910 : * single-byte encoding to be just "return 1".
911 : *-------------------------------------------------------------------
912 : */
/*
 * Verify one character of a single-byte encoding.  Returns 1 without
 * looking at "s" or "len".
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	return 1;					/* neither argument is examined */
}
918 :
/*
 * Verify a string in a single-byte encoding: only a zero byte is invalid.
 * Returns "len" if the first "len" bytes contain no zero byte, otherwise
 * the offset of the first zero byte.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *first_nul = memchr(s, 0, len);

	return (first_nul != NULL) ? (int) (first_nul - s) : len;
}
929 :
/* True if byte value c lies in 0xa1..0xfe.  Evaluates its argument twice. */
#define IS_EUC_RANGE_VALID(c)	((c) >= 0xa1 && (c) <= 0xfe)
931 :
932 : static int
933 336 : pg_eucjp_verifychar(const unsigned char *s, int len)
934 : {
935 : int l;
936 : unsigned char c1,
937 : c2;
938 :
939 336 : c1 = *s++;
940 :
941 336 : switch (c1)
942 : {
943 0 : case SS2: /* JIS X 0201 */
944 0 : l = 2;
945 0 : if (l > len)
946 0 : return -1;
947 0 : c2 = *s++;
948 0 : if (c2 < 0xa1 || c2 > 0xdf)
949 0 : return -1;
950 0 : break;
951 :
952 0 : case SS3: /* JIS X 0212 */
953 0 : l = 3;
954 0 : if (l > len)
955 0 : return -1;
956 0 : c2 = *s++;
957 0 : if (!IS_EUC_RANGE_VALID(c2))
958 0 : return -1;
959 0 : c2 = *s++;
960 0 : if (!IS_EUC_RANGE_VALID(c2))
961 0 : return -1;
962 0 : break;
963 :
964 336 : default:
965 336 : if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
966 : {
967 336 : l = 2;
968 336 : if (l > len)
969 56 : return -1;
970 280 : if (!IS_EUC_RANGE_VALID(c1))
971 16 : return -1;
972 264 : c2 = *s++;
973 264 : if (!IS_EUC_RANGE_VALID(c2))
974 120 : return -1;
975 : }
976 : else
977 : /* must be ASCII */
978 : {
979 0 : l = 1;
980 : }
981 144 : break;
982 : }
983 :
984 144 : return l;
985 : }
986 :
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything checks out, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_eucjp_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path; zero bytes are rejected */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1015 :
/*
 * Verify the EUC_KR character starting at "s", with "len" bytes available.
 * Returns 1 for a byte without the high bit, 2 for a pair of bytes that are
 * both in the EUC range, and -1 otherwise.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	const unsigned char c1 = s[0];

	if (!IS_HIGHBIT_SET(c1))
		return 1;				/* must be ASCII */

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(c1) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;
	return 2;
}
1044 :
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything checks out, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euckr_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path; zero bytes are rejected */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1073 :
1074 : /* EUC-CN byte sequences are exactly same as EUC-KR */
1075 : #define pg_euccn_verifychar pg_euckr_verifychar
1076 : #define pg_euccn_verifystr pg_euckr_verifystr
1077 :
1078 : static int
1079 12 : pg_euctw_verifychar(const unsigned char *s, int len)
1080 : {
1081 : int l;
1082 : unsigned char c1,
1083 : c2;
1084 :
1085 12 : c1 = *s++;
1086 :
1087 12 : switch (c1)
1088 : {
1089 0 : case SS2: /* CNS 11643 Plane 1-7 */
1090 0 : l = 4;
1091 0 : if (l > len)
1092 0 : return -1;
1093 0 : c2 = *s++;
1094 0 : if (c2 < 0xa1 || c2 > 0xa7)
1095 0 : return -1;
1096 0 : c2 = *s++;
1097 0 : if (!IS_EUC_RANGE_VALID(c2))
1098 0 : return -1;
1099 0 : c2 = *s++;
1100 0 : if (!IS_EUC_RANGE_VALID(c2))
1101 0 : return -1;
1102 0 : break;
1103 :
1104 0 : case SS3: /* unused */
1105 0 : return -1;
1106 :
1107 12 : default:
1108 12 : if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1109 : {
1110 12 : l = 2;
1111 12 : if (l > len)
1112 4 : return -1;
1113 : /* no further range check on c1? */
1114 8 : c2 = *s++;
1115 8 : if (!IS_EUC_RANGE_VALID(c2))
1116 8 : return -1;
1117 : }
1118 : else
1119 : /* must be ASCII */
1120 : {
1121 0 : l = 1;
1122 : }
1123 0 : break;
1124 : }
1125 0 : return l;
1126 : }
1127 :
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything checks out, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euctw_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path; zero bytes are rejected */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1156 :
/*
 * Verify the JOHAB character starting at "s", with "len" bytes available.
 * The expected length comes from pg_johab_mblen(); every byte after the
 * first must be in the EUC range.  Returns the length, or -1 if the input
 * is too short or a trailing byte is out of range.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1180 :
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything checks out, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_johab_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path; zero bytes are rejected */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1209 :
/*
 * Verify one character of a single-byte encoding.  Returns 1 without
 * looking at "s" or "len".
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	return 1;					/* neither argument is examined */
}
1215 :
/*
 * Verify a string in a single-byte encoding: only a zero byte is invalid.
 * Returns "len" if the first "len" bytes contain no zero byte, otherwise
 * the offset of the first zero byte.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *first_nul = memchr(s, 0, len);

	return (first_nul != NULL) ? (int) (first_nul - s) : len;
}
1226 :
/*
 * Verify the SJIS character starting at "s", with "len" bytes available.
 * The expected length comes from pg_sjis_mblen().  A one-byte character is
 * accepted as is; a two-byte character must pass ISSJISHEAD on its first
 * byte and ISSJISTAIL on its second.  Returns the length, or -1.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char head;
	unsigned char tail;

	if (len < mbl)
		return -1;

	if (mbl == 1)				/* pg_sjis_mblen already verified it */
		return mbl;

	head = s[0];
	tail = s[1];
	if (ISSJISHEAD(head) && ISSJISTAIL(tail))
		return mbl;

	return -1;
}
1249 :
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything checks out, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_sjis_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path; zero bytes are rejected */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1278 :
1279 : static int
1280 168 : pg_big5_verifychar(const unsigned char *s, int len)
1281 : {
1282 : int l,
1283 : mbl;
1284 :
1285 168 : l = mbl = pg_big5_mblen(s);
1286 :
1287 168 : if (len < l)
1288 4 : return -1;
1289 :
1290 164 : if (l == 2 &&
1291 164 : s[0] == NONUTF8_INVALID_BYTE0 &&
1292 8 : s[1] == NONUTF8_INVALID_BYTE1)
1293 8 : return -1;
1294 :
1295 264 : while (--l > 0)
1296 : {
1297 156 : if (*++s == '\0')
1298 48 : return -1;
1299 : }
1300 :
1301 108 : return mbl;
1302 : }
1303 :
/*
 * Verify a Big5 string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything checks out, otherwise the
 * offset of the first zero byte or invalid character.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_big5_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path; zero bytes are rejected */

		p += chlen;
		remaining -= chlen;
	}

	return (int) (p - s);
}
1332 :
1333 : static int
1334 140 : pg_gbk_verifychar(const unsigned char *s, int len)
1335 : {
1336 : int l,
1337 : mbl;
1338 :
1339 140 : l = mbl = pg_gbk_mblen(s);
1340 :
1341 140 : if (len < l)
1342 28 : return -1;
1343 :
1344 112 : if (l == 2 &&
1345 112 : s[0] == NONUTF8_INVALID_BYTE0 &&
1346 16 : s[1] == NONUTF8_INVALID_BYTE1)
1347 16 : return -1;
1348 :
1349 192 : while (--l > 0)
1350 : {
1351 96 : if (*++s == '\0')
1352 0 : return -1;
1353 : }
1354 :
1355 96 : return mbl;
1356 : }
1357 :
/*
 * Count how many leading bytes of s (at most len) verify as GBK.
 *
 * Stops at the first NUL byte or the first character rejected by
 * pg_gbk_verifychar(), and returns the number of bytes accepted so far.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step = 1;	/* ASCII-subset bytes advance by one */

		if (IS_HIGHBIT_SET(*cur))
		{
			step = pg_gbk_verifychar(cur, remaining);
			if (step == -1)
				break;
		}
		else if (*cur == '\0')
			break;

		cur += step;
		remaining -= step;
	}

	return cur - s;
}
1386 :
1387 : static int
1388 12 : pg_uhc_verifychar(const unsigned char *s, int len)
1389 : {
1390 : int l,
1391 : mbl;
1392 :
1393 12 : l = mbl = pg_uhc_mblen(s);
1394 :
1395 12 : if (len < l)
1396 4 : return -1;
1397 :
1398 8 : if (l == 2 &&
1399 8 : s[0] == NONUTF8_INVALID_BYTE0 &&
1400 8 : s[1] == NONUTF8_INVALID_BYTE1)
1401 8 : return -1;
1402 :
1403 0 : while (--l > 0)
1404 : {
1405 0 : if (*++s == '\0')
1406 0 : return -1;
1407 : }
1408 :
1409 0 : return mbl;
1410 : }
1411 :
/*
 * Count how many leading bytes of s (at most len) verify as UHC.
 *
 * Stops at the first NUL byte or the first character rejected by
 * pg_uhc_verifychar(), and returns the number of bytes accepted so far.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *pos = s;

	for (int left = len; left > 0;)
	{
		int			adv;

		if (IS_HIGHBIT_SET(*pos))
		{
			adv = pg_uhc_verifychar(pos, left);
			if (adv == -1)
				break;
		}
		else
		{
			/* ASCII-subset byte: only NUL is refused */
			if (*pos == '\0')
				break;
			adv = 1;
		}

		pos += adv;
		left -= adv;
	}

	return pos - s;
}
1440 :
/*
 * Verify the single GB18030 character starting at s, with len bytes
 * available.
 *
 * Returns 1 for a byte without the high bit, 4 for a well-formed four-byte
 * sequence, 2 for a well-formed two-byte sequence, and -1 otherwise.  The
 * four-byte form is selected when at least four bytes are available and the
 * second byte is in 0x30..0x39; its bytes must then lie in 0x81..0xfe,
 * 0x30..0x39, 0x81..0xfe, 0x30..0x39 respectively.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	/* ASCII */
	if ((*s & 0x80) == 0)
		return 1;

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (s[0] < 0x81 || s[0] > 0xfe)
			return -1;
		if (s[2] < 0x81 || s[2] > 0xfe)
			return -1;
		if (s[3] < 0x30 || s[3] > 0x39)
			return -1;
		return 4;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* Should be 2-byte, validate the second byte's range */
		if (s[1] >= 0x40 && s[1] <= 0x7e)
			return 2;
		if (s[1] >= 0x80 && s[1] <= 0xfe)
			return 2;
		return -1;
	}

	/* bad lead byte, or too few bytes for any multibyte form */
	return -1;
}
1471 :
/*
 * Count how many leading bytes of s (at most len) verify as GB18030.
 *
 * Stops at the first NUL byte or the first character rejected by
 * pg_gb18030_verifychar(), and returns the number of bytes accepted so far.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step = 1;	/* ASCII-subset bytes advance by one */

		if (IS_HIGHBIT_SET(*cur))
		{
			step = pg_gb18030_verifychar(cur, remaining);
			if (step == -1)
				break;
		}
		else if (*cur == '\0')
			break;

		cur += step;
		remaining -= step;
	}

	return cur - s;
}
1500 :
/*
 * Verify the single UTF-8 character starting at s, with len bytes available.
 *
 * A byte without the high bit yields 1, except NUL which yields -1.
 * Otherwise the expected length (2, 3 or 4; 1 for any other lead pattern) is
 * derived from the lead byte; the result is that length, or -1 if it exceeds
 * len or pg_utf8_islegal() rejects the sequence.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			nbytes;

	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	if ((lead & 0xe0) == 0xc0)
		nbytes = 2;
	else if ((lead & 0xf0) == 0xe0)
		nbytes = 3;
	else if ((lead & 0xf8) == 0xf0)
		nbytes = 4;
	else
		nbytes = 1;			/* left for pg_utf8_islegal() to judge */

	/* length check must come first: islegal reads nbytes bytes */
	if (nbytes > len || !pg_utf8_islegal(s, nbytes))
		return -1;

	return nbytes;
}
1529 :
1530 : /*
1531 : * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1532 : * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1533 : * input byte and current state are used to compute an index into an array of
1534 : * state transitions. Since the address of the next transition is dependent
1535 : * on this computation, there is latency in executing the load instruction,
1536 : * and the CPU is not kept busy.
1537 : *
1538 : * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1539 : *
1540 : * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1541 : *
1542 : * In a shift-based DFA, the input byte is an index into an array of integers
1543 : * whose bit pattern encodes the state transitions. To compute the next
1544 : * state, we simply right-shift the integer by the current state and apply a
1545 : * mask. In this scheme, the address of the transition only depends on the
1546 : * input byte, so there is better pipelining.
1547 : *
1548 : * The naming convention for states and transitions was adopted from a UTF-8
1549 : * to UTF-16/32 transcoder, whose table is reproduced below:
1550 : *
1551 : * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1552 : *
1553 : * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1554 : * ==========================================================================
1555 : * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1556 : * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1557 : * |
1558 : * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1559 : * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1560 : * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1561 : * |
1562 : * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1563 : * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1564 : * |
1565 : * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1566 : * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1567 : *
1568 : * In the most straightforward implementation, a shift-based DFA for UTF-8
1569 : * requires 64-bit integers to encode the transitions, but with an SMT solver
1570 : * it's possible to find state numbers such that the transitions fit within
1571 : * 32-bit integers, as Dougall Johnson demonstrated:
1572 : *
1573 : * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1574 : *
1575 : * This packed representation is the reason for the seemingly odd choice of
1576 : * state values below.
1577 : */
1578 :
/*
 * DFA state numbers.  Each state value doubles as a bit offset: a transition
 * word stores the next state for current state S in the bits starting at
 * position S, and utf8_advance() extracts it with a right shift by S and a
 * mask of 31.
 */

/* Error */
#define	ERR  0
/* Begin */
#define	BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define	CS1 16
#define	CS2  1
#define	CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define	P3A  6					/* Lead was E0, check for 3-byte overlong */
#define	P3B 20					/* Lead was ED, check for surrogate */
#define	P4A 25					/* Lead was F0, check for 4-byte overlong */
#define	P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define	END BGN

/* the encoded state transitions for the lookup table */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/*
 * continuation byte
 *
 * These expansions are fully parenthesized so that they remain a single
 * operand when combined with other operators (e.g. "CR1 >> state").
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1615 :
/*
 * Transition table for the shift-based UTF-8 DFA, indexed by input byte.
 * Each entry is one of the packed transition words defined above;
 * utf8_advance() shifts the entry right by the current state and masks with
 * 31 to obtain the next state.  Bytes mapped to ILL (0x00, 0xC0, 0xC1 and
 * 0xF5..0xFF) lead to the ERR state from every state.
 */
1616 : static const uint32 Utf8Transition[256] =
1617 : {
1618 : /* ASCII 00..7F; note that the NUL byte (00) is mapped to ILL */
1619 :
1620 : ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1621 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1622 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1623 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1624 :
1625 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1626 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1627 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1628 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1629 :
1630 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1631 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1632 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1633 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1634 :
1635 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1636 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1637 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1638 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1639 :
1640 : /* continuation bytes */
1641 :
1642 : /* 80..8F */
1643 : CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1644 : CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1645 :
1646 : /* 90..9F */
1647 : CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1648 : CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1649 :
1650 : /* A0..BF */
1651 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1652 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1653 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1654 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1655 :
1656 : /* leading bytes */
1657 :
1658 : /* C0..DF; C0 and C1 are ILL (pg_utf8_islegal() likewise rejects leads below C2) */
1659 : ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1660 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1661 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1662 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1663 :
1664 : /* E0..EF; E0 and ED get the restricted-second-byte states */
1665 : L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1666 : L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1667 :
1668 : /* F0..FF; F0 and F4 get the restricted-second-byte states, F5..FF are ILL */
1669 : L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1670 : ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1671 : };
1672 :
1673 : static void
1674 1147 : utf8_advance(const unsigned char *s, uint32 *state, int len)
1675 : {
1676 : /* Note: We deliberately don't check the state's value here. */
1677 37851 : while (len > 0)
1678 : {
1679 : /*
1680 : * It's important that the mask value is 31: In most instruction sets,
1681 : * a shift by a 32-bit operand is understood to be a shift by its mod
1682 : * 32, so the compiler should elide the mask operation.
1683 : */
1684 36704 : *state = Utf8Transition[*s++] >> (*state & 31);
1685 36704 : len--;
1686 : }
1687 :
1688 1147 : *state &= 31;
1689 1147 : }
1690 :
1691 : static int
1692 717701 : pg_utf8_verifystr(const unsigned char *s, int len)
1693 : {
1694 717701 : const unsigned char *start = s;
1695 717701 : const int orig_len = len;
1696 717701 : uint32 state = BGN;
1697 :
1698 : /*
1699 : * With a stride of two vector widths, gcc will unroll the loop. Even if
1700 : * the compiler can unroll a longer loop, it's not worth it because we
1701 : * must fall back to the byte-wise algorithm if we find any non-ASCII.
1702 : */
1703 : #define STRIDE_LENGTH (2 * sizeof(Vector8))
1704 :
1705 717701 : if (len >= STRIDE_LENGTH)
1706 : {
1707 2662674 : while (len >= STRIDE_LENGTH)
1708 : {
1709 : /*
1710 : * If the chunk is all ASCII, we can skip the full UTF-8 check,
1711 : * but we must first check for a non-END state, which means the
1712 : * previous chunk ended in the middle of a multibyte sequence.
1713 : */
1714 2299137 : if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1715 1147 : utf8_advance(s, &state, STRIDE_LENGTH);
1716 :
1717 2299137 : s += STRIDE_LENGTH;
1718 2299137 : len -= STRIDE_LENGTH;
1719 : }
1720 :
1721 : /* The error state persists, so we only need to check for it here. */
1722 363537 : if (state == ERR)
1723 : {
1724 : /*
1725 : * Start over from the beginning with the slow path so we can
1726 : * count the valid bytes.
1727 : */
1728 336 : len = orig_len;
1729 336 : s = start;
1730 : }
1731 363201 : else if (state != END)
1732 : {
1733 : /*
1734 : * The fast path exited in the middle of a multibyte sequence.
1735 : * Walk backwards to find the leading byte so that the slow path
1736 : * can resume checking from there. We must always backtrack at
1737 : * least one byte, since the current byte could be e.g. an ASCII
1738 : * byte after a 2-byte lead, which is invalid.
1739 : */
1740 : do
1741 : {
1742 : Assert(s > start);
1743 73 : s--;
1744 73 : len++;
1745 : Assert(IS_HIGHBIT_SET(*s));
1746 73 : } while (pg_utf_mblen(s) <= 1);
1747 : }
1748 : }
1749 :
1750 : /* check remaining bytes */
1751 10703420 : while (len > 0)
1752 : {
1753 : int l;
1754 :
1755 : /* fast path for ASCII-subset characters */
1756 9987625 : if (!IS_HIGHBIT_SET(*s))
1757 : {
1758 9978210 : if (*s == '\0')
1759 132 : break;
1760 9978078 : l = 1;
1761 : }
1762 : else
1763 : {
1764 9415 : l = pg_utf8_verifychar(s, len);
1765 9415 : if (l == -1)
1766 1774 : break;
1767 : }
1768 9985719 : s += l;
1769 9985719 : len -= l;
1770 : }
1771 :
1772 717701 : return s - start;
1773 : }
1774 :
1775 : /*
1776 : * Check for validity of a single UTF-8 encoded character
1777 : *
1778 : * This directly implements the rules in RFC3629. The bizarre-looking
1779 : * restrictions on the second byte are meant to ensure that there isn't
1780 : * more than one encoding of a given Unicode character point; that is,
1781 : * you may not use a longer-than-necessary byte sequence with high order
1782 : * zero bits to represent a character that would fit in fewer bytes.
1783 : * To do otherwise is to create security hazards (eg, create an apparent
1784 : * non-ASCII character that decodes to plain ASCII).
1785 : *
1786 : * length is assumed to have been obtained by pg_utf_mblen(), and the
1787 : * caller must have checked that that many bytes are present in the buffer.
1788 : */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	/* permitted range for the second byte; narrowed for some lead bytes */
	unsigned char second_min = 0x80;
	unsigned char second_max = 0xBF;

	/* only lengths 1..4 are accepted (5 and 6 are rejected for now) */
	if (length < 1 || length > 4)
		return false;

	/* fourth and third bytes, when present, must be plain continuations */
	if (length == 4 && (source[3] < 0x80 || source[3] > 0xBF))
		return false;
	if (length >= 3 && (source[2] < 0x80 || source[2] > 0xBF))
		return false;

	if (length >= 2)
	{
		/* the lead byte decides how far the second byte may range */
		if (source[0] == 0xE0)
			second_min = 0xA0;
		else if (source[0] == 0xED)
			second_max = 0x9F;
		else if (source[0] == 0xF0)
			second_min = 0x90;
		else if (source[0] == 0xF4)
			second_max = 0x8F;

		if (source[1] < second_min || source[1] > second_max)
			return false;
	}

	/* lead byte itself: 0x80..0xC1 and anything above 0xF4 are illegal */
	if (source[0] >= 0x80 && source[0] < 0xC2)
		return false;

	return source[0] <= 0xF4;
}
1845 :
1846 :
1847 : /*
1848 : * Fills the provided buffer with two bytes such that:
1849 : * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
1850 : */
1851 : void
1852 212 : pg_encoding_set_invalid(int encoding, char *dst)
1853 : {
1854 : Assert(pg_encoding_max_length(encoding) > 1);
1855 :
1856 212 : dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
1857 212 : dst[1] = NONUTF8_INVALID_BYTE1;
1858 212 : }
1859 :
1860 : /*
1861 : *-------------------------------------------------------------------
1862 : * encoding info table
1863 : *-------------------------------------------------------------------
1864 : */
/*
 * One entry per encoding, indexed by encoding ID.  Each initializer lists,
 * in order: two conversion function pointers (left as 0 for PG_SJIS and the
 * entries after it), then the mblen, dsplen, mbverifychar and mbverifystr
 * functions, and finally maxmblen, the maximum character length in bytes.
 *
 * The accessor functions below fall back to the PG_SQL_ASCII entry when
 * PG_VALID_ENCODING() rejects the encoding they were given.
 */
1865 : const pg_wchar_tbl pg_wchar_table[] = {
1866 : [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
1867 : [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
1868 : [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3},
1869 : [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
1870 : [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
1871 : [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
1872 : [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
1873 : [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1874 : [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1875 : [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1876 : [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1877 : [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1878 : [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1879 : [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1880 : [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1881 : [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1882 : [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1883 : [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1884 : [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1885 : [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1886 : [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1887 : [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1888 : [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1889 : [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1890 : [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1891 : [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1892 : [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1893 : [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1894 : [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1895 : [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1896 : [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1897 : [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1898 : [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1899 : [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
1900 : [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
1901 : [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
1902 : [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
1903 : [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
1904 : [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
1905 : [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
1906 : [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
1907 : };
1908 :
1909 : /*
1910 : * Returns the byte length of a multibyte character.
1911 : *
1912 : * Choose "mblen" functions based on the input string characteristics.
1913 : * pg_encoding_mblen() can be used when ANY of these conditions are met:
1914 : *
1915 : * - The input string is zero-terminated
1916 : *
1917 : * - The input string is known to be valid in the encoding (e.g., string
1918 : * converted from database encoding)
1919 : *
1920 : * - The encoding is not GB18030 (e.g., when only database encodings are
1921 : * passed to 'encoding' parameter)
1922 : *
1923 : * encoding==GB18030 requires examining up to two bytes to determine character
1924 : * length. Therefore, callers satisfying none of those conditions must use
1925 : * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
1926 : * guaranteed to be within allocation bounds.
1927 : *
1928 : * When dealing with text that is not certainly valid in the specified
1929 : * encoding, the result may exceed the actual remaining string length.
1930 : * Callers that are not prepared to deal with that should use Min(remaining,
1931 : * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
1932 : * pg_encoding_mblen_bounded() are interchangeable.
1933 : */
1934 : int
1935 30165036 : pg_encoding_mblen(int encoding, const char *mbstr)
1936 : {
1937 30165036 : return (PG_VALID_ENCODING(encoding) ?
1938 60330072 : pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1939 0 : pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1940 : }
1941 :
1942 : /*
1943 : * Returns the byte length of a multibyte character (possibly not
1944 : * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
1945 : */
1946 : int
1947 3136 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
1948 : size_t remaining)
1949 : {
1950 : /*
1951 : * Define zero remaining as too few, even for single-byte encodings.
1952 : * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
1953 : * zero; others read one.
1954 : */
1955 3136 : if (remaining < 1 ||
1956 202 : (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
1957 42 : return INT_MAX;
1958 3094 : return pg_encoding_mblen(encoding, mbstr);
1959 : }
1960 :
1961 : /*
1962 : * Returns the byte length of a multibyte character; but not more than the
1963 : * distance to the terminating zero byte. For input that might lack a
1964 : * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
1965 : */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	/* nominal character length according to the encoding */
	const int	nominal = pg_encoding_mblen(encoding, mbstr);

	/* clamp it to the distance to the terminating zero byte */
	return strnlen(mbstr, nominal);
}
1971 :
1972 : /*
1973 : * Returns the display length of a multibyte character.
1974 : */
1975 : int
1976 30049471 : pg_encoding_dsplen(int encoding, const char *mbstr)
1977 : {
1978 30049471 : return (PG_VALID_ENCODING(encoding) ?
1979 60098942 : pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1980 0 : pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1981 : }
1982 :
1983 : /*
1984 : * Verify the first multibyte character of the given string.
1985 : * Return its byte length if good, -1 if bad. (See comments above for
1986 : * full details of the mbverifychar API.)
1987 : */
1988 : int
1989 4228 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
1990 : {
1991 4228 : return (PG_VALID_ENCODING(encoding) ?
1992 8456 : pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
1993 0 : pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
1994 : }
1995 :
1996 : /*
1997 : * Verify that a string is valid for the given encoding.
1998 : * Returns the number of input bytes (<= len) that form a valid string.
1999 : * (See comments above for full details of the mbverifystr API.)
2000 : */
2001 : int
2002 230721 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2003 : {
2004 230721 : return (PG_VALID_ENCODING(encoding) ?
2005 461442 : pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2006 0 : pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2007 : }
2008 :
2009 : /*
2010 : * fetch maximum length of a given encoding
2011 : */
2012 : int
2013 684682 : pg_encoding_max_length(int encoding)
2014 : {
2015 : Assert(PG_VALID_ENCODING(encoding));
2016 :
2017 : /*
2018 : * Check for the encoding despite the assert, due to some mingw versions
2019 : * otherwise issuing bogus warnings.
2020 : */
2021 684682 : return PG_VALID_ENCODING(encoding) ?
2022 1369364 : pg_wchar_table[encoding].maxmblen :
2023 : pg_wchar_table[PG_SQL_ASCII].maxmblen;
2024 : }
|