Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wchar.c
4 : * Functions for working with multibyte characters in various encodings.
5 : *
6 : * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/common/wchar.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #include <limits.h>
16 :
17 : #include "mb/pg_wchar.h"
18 : #include "utils/ascii.h"
19 :
20 :
21 : /*
22 : * In today's multibyte encodings other than UTF8, this two-byte sequence
23 : * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
24 : *
25 : * For historical reasons, several verifychar implementations opt to reject
26 : * this pair specifically. Byte pair range constraints, in encoding
27 : * originator documentation, always excluded this pair. No core conversion
28 : * could translate it. However, longstanding verifychar implementations
29 : * accepted any non-NUL byte. big5_to_euc_tw even translates pairs not
30 : * valid per encoding originator documentation. To avoid tightening core
31 : * or non-core conversions in a security patch, we sought this one pair.
32 : *
33 : * PQescapeString() historically used spaces for BYTE1; many other values
34 : * could suffice for BYTE1.
35 : */
36 : #define NONUTF8_INVALID_BYTE0 (0x8d) /* first byte of the rejected pair */
37 : #define NONUTF8_INVALID_BYTE1 (' ') /* second byte of the rejected pair */
38 :
39 :
40 : /*
41 : * Operations on multi-byte encodings are driven by a table of helper
42 : * functions.
43 : *
44 : * To add support for an encoding, define mblen(), dsplen(), verifychar() and
45 : * verifystr() for the encoding. For server-encodings, also define mb2wchar()
46 : * and wchar2mb() conversion functions.
47 : *
48 : * These functions generally assume that their input is validly formed.
49 : * The "verifier" functions, further down in the file, have to be more
50 : * paranoid.
51 : *
52 : * We expect that mblen() does not need to examine more than the first byte
53 : * of the character to discover the correct length. GB18030 is an exception
54 : * to that rule, though, as it also looks at second byte. But even that
55 : * behaves in a predictable way, if you only pass the first byte: it will
56 : * treat 4-byte encoded characters as two 2-byte encoded characters, which is
57 : * good enough for all current uses.
58 : *
59 : * Note: for the display output of psql to work properly, the return values
60 : * of the dsplen functions must conform to the Unicode standard. In particular
61 : * the NUL character is zero width and control characters are generally
62 : * width -1. It is recommended that non-ASCII encodings refer their ASCII
63 : * subset to the ASCII routines to ensure consistency.
64 : */
65 :
66 : /* No error-reporting facility. Ignore incomplete trailing byte sequence. */
/* Must be used directly inside the conversion loop: expands to a bare "break". */
#define MB2CHAR_NEED_AT_LEAST(len, need) if ((need) > (len)) break
68 :
69 : /*
70 : * SQL/ASCII
71 : */
72 : static int
73 433 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
74 : {
75 433 : int cnt = 0;
76 :
77 33279 : while (len > 0 && *from)
78 : {
79 32846 : *to++ = *from++;
80 32846 : len--;
81 32846 : cnt++;
82 : }
83 433 : *to = 0;
84 433 : return cnt;
85 : }
86 :
/* Every character of a single-byte encoding occupies exactly one byte. */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;					/* length does not depend on the byte value */
	return 1;
}
92 :
/*
 * Display width of a single-byte character: 0 for NUL, -1 for control
 * characters (anything below 0x20, and 0x7f), 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch >= 0x20 && ch != 0x7f) ? 1 : -1;
}
103 :
104 : /*
105 : * EUC
106 : */
107 : static int
108 32 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
109 : {
110 32 : int cnt = 0;
111 :
112 48 : while (len > 0 && *from)
113 : {
114 32 : if (*from == SS2) /* JIS X 0201 (so called "1 byte KANA") */
115 : {
116 8 : MB2CHAR_NEED_AT_LEAST(len, 2);
117 4 : from++;
118 4 : *to = (SS2 << 8) | *from++;
119 4 : len -= 2;
120 : }
121 24 : else if (*from == SS3) /* JIS X 0212 KANJI */
122 : {
123 12 : MB2CHAR_NEED_AT_LEAST(len, 3);
124 4 : from++;
125 4 : *to = (SS3 << 16) | (*from++ << 8);
126 4 : *to |= *from++;
127 4 : len -= 3;
128 : }
129 12 : else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
130 : {
131 8 : MB2CHAR_NEED_AT_LEAST(len, 2);
132 4 : *to = *from++ << 8;
133 4 : *to |= *from++;
134 4 : len -= 2;
135 : }
136 : else /* must be ASCII */
137 : {
138 4 : *to = *from++;
139 4 : len--;
140 : }
141 16 : to++;
142 16 : cnt++;
143 : }
144 32 : *to = 0;
145 32 : return cnt;
146 : }
147 :
148 : static inline int
149 156 : pg_euc_mblen(const unsigned char *s)
150 : {
151 : int len;
152 :
153 156 : if (*s == SS2)
154 0 : len = 2;
155 156 : else if (*s == SS3)
156 0 : len = 3;
157 156 : else if (IS_HIGHBIT_SET(*s))
158 108 : len = 2;
159 : else
160 48 : len = 1;
161 156 : return len;
162 : }
163 :
164 : static inline int
165 0 : pg_euc_dsplen(const unsigned char *s)
166 : {
167 : int len;
168 :
169 0 : if (*s == SS2)
170 0 : len = 2;
171 0 : else if (*s == SS3)
172 0 : len = 2;
173 0 : else if (IS_HIGHBIT_SET(*s))
174 0 : len = 2;
175 : else
176 0 : len = pg_ascii_dsplen(s);
177 0 : return len;
178 : }
179 :
180 : /*
181 : * EUC_JP
182 : */
183 : static int
184 32 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
185 : {
186 32 : return pg_euc2wchar_with_len(from, to, len);
187 : }
188 :
/* EUC_JP character byte length: same rules as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
194 :
195 : static int
196 0 : pg_eucjp_dsplen(const unsigned char *s)
197 : {
198 : int len;
199 :
200 0 : if (*s == SS2)
201 0 : len = 1;
202 0 : else if (*s == SS3)
203 0 : len = 2;
204 0 : else if (IS_HIGHBIT_SET(*s))
205 0 : len = 2;
206 : else
207 0 : len = pg_ascii_dsplen(s);
208 0 : return len;
209 : }
210 :
211 : /*
212 : * EUC_KR
213 : */
214 : static int
215 0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
216 : {
217 0 : return pg_euc2wchar_with_len(from, to, len);
218 : }
219 :
/* EUC_KR character byte length: same rules as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
225 :
/* EUC_KR display width: same rules as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
231 :
232 : /*
233 : * EUC_CN
234 : *
235 : */
236 : static int
237 36 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
238 : {
239 36 : int cnt = 0;
240 :
241 52 : while (len > 0 && *from)
242 : {
243 36 : if (*from == SS2) /* code set 2 (unused?) */
244 : {
245 12 : MB2CHAR_NEED_AT_LEAST(len, 3);
246 4 : from++;
247 4 : *to = (SS2 << 16) | (*from++ << 8);
248 4 : *to |= *from++;
249 4 : len -= 3;
250 : }
251 24 : else if (*from == SS3) /* code set 3 (unused ?) */
252 : {
253 12 : MB2CHAR_NEED_AT_LEAST(len, 3);
254 4 : from++;
255 4 : *to = (SS3 << 16) | (*from++ << 8);
256 4 : *to |= *from++;
257 4 : len -= 3;
258 : }
259 12 : else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
260 : {
261 8 : MB2CHAR_NEED_AT_LEAST(len, 2);
262 4 : *to = *from++ << 8;
263 4 : *to |= *from++;
264 4 : len -= 2;
265 : }
266 : else
267 : {
268 4 : *to = *from++;
269 4 : len--;
270 : }
271 16 : to++;
272 16 : cnt++;
273 : }
274 36 : *to = 0;
275 36 : return cnt;
276 : }
277 :
278 : /*
279 : * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
280 : * EUC_CN), but mb2wchar_with_len does. Tell a coherent story for code that
281 : * relies on agreement between mb2wchar_with_len and mblen. Invalid text
282 : * datums (e.g. from shared catalogs) reach this.
283 : */
284 : static int
285 4 : pg_euccn_mblen(const unsigned char *s)
286 : {
287 : int len;
288 :
289 4 : if (*s == SS2)
290 0 : len = 3;
291 4 : else if (*s == SS3)
292 0 : len = 3;
293 4 : else if (IS_HIGHBIT_SET(*s))
294 4 : len = 2;
295 : else
296 0 : len = 1;
297 4 : return len;
298 : }
299 :
/*
 * Display width of an EUC_CN character: two columns when the first byte has
 * its high bit set, ASCII rules otherwise.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
311 :
312 : /*
313 : * EUC_TW
314 : *
315 : */
316 : static int
317 40 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
318 : {
319 40 : int cnt = 0;
320 :
321 56 : while (len > 0 && *from)
322 : {
323 40 : if (*from == SS2) /* code set 2 */
324 : {
325 16 : MB2CHAR_NEED_AT_LEAST(len, 4);
326 4 : from++;
327 4 : *to = (((uint32) SS2) << 24) | (*from++ << 16);
328 4 : *to |= *from++ << 8;
329 4 : *to |= *from++;
330 4 : len -= 4;
331 : }
332 24 : else if (*from == SS3) /* code set 3 (unused?) */
333 : {
334 12 : MB2CHAR_NEED_AT_LEAST(len, 3);
335 4 : from++;
336 4 : *to = (SS3 << 16) | (*from++ << 8);
337 4 : *to |= *from++;
338 4 : len -= 3;
339 : }
340 12 : else if (IS_HIGHBIT_SET(*from)) /* code set 2 */
341 : {
342 8 : MB2CHAR_NEED_AT_LEAST(len, 2);
343 4 : *to = *from++ << 8;
344 4 : *to |= *from++;
345 4 : len -= 2;
346 : }
347 : else
348 : {
349 4 : *to = *from++;
350 4 : len--;
351 : }
352 16 : to++;
353 16 : cnt++;
354 : }
355 40 : *to = 0;
356 40 : return cnt;
357 : }
358 :
359 : static int
360 4 : pg_euctw_mblen(const unsigned char *s)
361 : {
362 : int len;
363 :
364 4 : if (*s == SS2)
365 0 : len = 4;
366 4 : else if (*s == SS3)
367 0 : len = 3;
368 4 : else if (IS_HIGHBIT_SET(*s))
369 4 : len = 2;
370 : else
371 0 : len = 1;
372 4 : return len;
373 : }
374 :
375 : static int
376 0 : pg_euctw_dsplen(const unsigned char *s)
377 : {
378 : int len;
379 :
380 0 : if (*s == SS2)
381 0 : len = 2;
382 0 : else if (*s == SS3)
383 0 : len = 2;
384 0 : else if (IS_HIGHBIT_SET(*s))
385 0 : len = 2;
386 : else
387 0 : len = pg_ascii_dsplen(s);
388 0 : return len;
389 : }
390 :
391 : /*
392 : * Convert pg_wchar to EUC_* encoding.
393 : * caller must allocate enough space for "to", including a trailing zero!
394 : * len: length of from.
395 : * "from" not necessarily null terminated.
396 : */
397 : static int
398 48 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
399 : {
400 48 : int cnt = 0;
401 :
402 96 : while (len > 0 && *from)
403 : {
404 : unsigned char c;
405 :
406 48 : if ((c = (*from >> 24)))
407 : {
408 4 : *to++ = c;
409 4 : *to++ = (*from >> 16) & 0xff;
410 4 : *to++ = (*from >> 8) & 0xff;
411 4 : *to++ = *from & 0xff;
412 4 : cnt += 4;
413 : }
414 44 : else if ((c = (*from >> 16)))
415 : {
416 16 : *to++ = c;
417 16 : *to++ = (*from >> 8) & 0xff;
418 16 : *to++ = *from & 0xff;
419 16 : cnt += 3;
420 : }
421 28 : else if ((c = (*from >> 8)))
422 : {
423 16 : *to++ = c;
424 16 : *to++ = *from & 0xff;
425 16 : cnt += 2;
426 : }
427 : else
428 : {
429 12 : *to++ = *from;
430 12 : cnt++;
431 : }
432 48 : from++;
433 48 : len--;
434 : }
435 48 : *to = 0;
436 48 : return cnt;
437 : }
438 :
439 :
440 : /*
441 : * JOHAB
442 : */
/* JOHAB character byte length: delegated to the generic EUC rules. */
static int
pg_johab_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
448 :
/* JOHAB display width: delegated to the generic EUC rules. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
454 :
455 : /*
456 : * convert UTF8 string to pg_wchar (UCS-4)
457 : * caller must allocate enough space for "to", including a trailing zero!
458 : * len: length of from.
459 : * "from" not necessarily null terminated.
460 : */
461 : static int
462 6716543 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
463 : {
464 6716543 : int cnt = 0;
465 : uint32 c1,
466 : c2,
467 : c3,
468 : c4;
469 :
470 106648656 : while (len > 0 && *from)
471 : {
472 99932141 : if ((*from & 0x80) == 0)
473 : {
474 99931462 : *to = *from++;
475 99931462 : len--;
476 : }
477 679 : else if ((*from & 0xe0) == 0xc0)
478 : {
479 345 : MB2CHAR_NEED_AT_LEAST(len, 2);
480 337 : c1 = *from++ & 0x1f;
481 337 : c2 = *from++ & 0x3f;
482 337 : *to = (c1 << 6) | c2;
483 337 : len -= 2;
484 : }
485 334 : else if ((*from & 0xf0) == 0xe0)
486 : {
487 174 : MB2CHAR_NEED_AT_LEAST(len, 3);
488 166 : c1 = *from++ & 0x0f;
489 166 : c2 = *from++ & 0x3f;
490 166 : c3 = *from++ & 0x3f;
491 166 : *to = (c1 << 12) | (c2 << 6) | c3;
492 166 : len -= 3;
493 : }
494 160 : else if ((*from & 0xf8) == 0xf0)
495 : {
496 16 : MB2CHAR_NEED_AT_LEAST(len, 4);
497 4 : c1 = *from++ & 0x07;
498 4 : c2 = *from++ & 0x3f;
499 4 : c3 = *from++ & 0x3f;
500 4 : c4 = *from++ & 0x3f;
501 4 : *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
502 4 : len -= 4;
503 : }
504 : else
505 : {
506 : /* treat a bogus char as length 1; not ours to raise error */
507 144 : *to = *from++;
508 144 : len--;
509 : }
510 99932113 : to++;
511 99932113 : cnt++;
512 : }
513 6716543 : *to = 0;
514 6716543 : return cnt;
515 : }
516 :
517 :
518 : /*
519 : * Trivial conversion from pg_wchar to UTF-8.
520 : * caller should allocate enough space for "to"
521 : * len: length of from.
522 : * "from" not necessarily null terminated.
523 : */
524 : static int
525 579537 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
526 : {
527 579537 : int cnt = 0;
528 :
529 8542731 : while (len > 0 && *from)
530 : {
531 : int char_len;
532 :
533 7963194 : unicode_to_utf8(*from, to);
534 7963194 : char_len = pg_utf_mblen(to);
535 7963194 : cnt += char_len;
536 7963194 : to += char_len;
537 7963194 : from++;
538 7963194 : len--;
539 : }
540 579537 : *to = 0;
541 579537 : return cnt;
542 : }
543 :
544 : /*
545 : * Return the byte length of a UTF8 character pointed to by s
546 : *
547 : * Note: in the current implementation we do not support UTF8 sequences
548 : * of more than 4 bytes; hence do NOT return a value larger than 4.
549 : * We return "1" for any leading byte that is either flat-out illegal or
550 : * indicates a length larger than we support.
551 : *
552 : * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
553 : * other places would need to be fixed to change this.
554 : */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;				/* 0xxxxxxx */
	if ((lead & 0xe0) == 0xc0)
		return 2;				/* 110xxxxx */
	if ((lead & 0xf0) == 0xe0)
		return 3;				/* 1110xxxx */
	if ((lead & 0xf8) == 0xf0)
		return 4;				/* 11110xxx */
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	/* illegal or unsupported leading byte: report a length of one */
	return 1;
}
578 :
579 : /*
580 : * This is an implementation of wcwidth() and wcswidth() as defined in
581 : * "The Single UNIX Specification, Version 2, The Open Group, 1997"
582 : * <http://www.unix.org/online.html>
583 : *
584 : * Markus Kuhn -- 2001-09-08 -- public domain
585 : *
586 : * customised for PostgreSQL
587 : *
588 : * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
589 : */
590 :
591 : struct mbinterval
592 : {
593 : unsigned int first; /* lowest value covered by the interval (inclusive) */
594 : unsigned int last; /* highest value covered by the interval (inclusive) */
595 : };
596 :
597 : /* auxiliary function for binary search in interval table */
598 : static int
599 59764790 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
600 : {
601 59764790 : int min = 0;
602 : int mid;
603 :
604 59764790 : if (ucs < table[0].first || ucs > table[max].last)
605 59759144 : return 0;
606 49149 : while (max >= min)
607 : {
608 43983 : mid = (min + max) / 2;
609 43983 : if (ucs > table[mid].last)
610 9747 : min = mid + 1;
611 34236 : else if (ucs < table[mid].first)
612 33756 : max = mid - 1;
613 : else
614 480 : return 1;
615 : }
616 :
617 5166 : return 0;
618 : }
619 :
620 :
621 : /* The following functions define the column width of an ISO 10646
622 : * character as follows:
623 : *
624 : * - The null character (U+0000) has a column width of 0.
625 : *
626 : * - Other C0/C1 control characters and DEL will lead to a return
627 : * value of -1.
628 : *
629 : * - Non-spacing and enclosing combining characters (general
630 : * category code Mn, Me or Cf in the Unicode database) have a
631 : * column width of 0.
632 : *
633 : * - Spacing characters in the East Asian Wide (W) or East Asian
634 : * FullWidth (F) category as defined in Unicode Technical
635 : * Report #11 have a column width of 2.
636 : *
637 : * - All remaining characters (including all printable
638 : * ISO 8859-1 and WGL4 characters, Unicode control characters,
639 : * etc.) have a column width of 1.
640 : *
641 : * This implementation assumes that wchar_t characters are encoded
642 : * in ISO 10646.
643 : */
644 :
645 : static int
646 29911966 : ucs_wcwidth(pg_wchar ucs)
647 : {
648 : #include "common/unicode_nonspacing_table.h"
649 : #include "common/unicode_east_asian_fw_table.h"
650 :
651 : /* test for 8-bit control characters */
652 29911966 : if (ucs == 0)
653 0 : return 0;
654 :
655 29911966 : if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
656 29409 : return -1;
657 :
658 : /*
659 : * binary search in table of non-spacing characters
660 : *
661 : * XXX: In the official Unicode sources, it is possible for a character to
662 : * be described as both non-spacing and wide at the same time. As of
663 : * Unicode 13.0, treating the non-spacing property as the determining
664 : * factor for display width leads to the correct behavior, so do that
665 : * search first.
666 : */
667 29882557 : if (mbbisearch(ucs, nonspacing,
668 : sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
669 324 : return 0;
670 :
671 : /* binary search in table of wide characters */
672 29882233 : if (mbbisearch(ucs, east_asian_fw,
673 : sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
674 156 : return 2;
675 :
676 29882077 : return 1;
677 : }
678 :
679 : static int
680 29911966 : pg_utf_dsplen(const unsigned char *s)
681 : {
682 29911966 : return ucs_wcwidth(utf8_to_unicode(s));
683 : }
684 :
685 : /*
686 : * ISO8859-1
687 : */
688 : static int
689 468 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
690 : {
691 468 : int cnt = 0;
692 :
693 13377 : while (len > 0 && *from)
694 : {
695 12909 : *to++ = *from++;
696 12909 : len--;
697 12909 : cnt++;
698 : }
699 468 : *to = 0;
700 468 : return cnt;
701 : }
702 :
703 : /*
704 : * Trivial conversion from pg_wchar to single byte encoding. Just ignores
705 : * high bits.
706 : * caller should allocate enough space for "to"
707 : * len: length of from.
708 : * "from" not necessarily null terminated.
709 : */
710 : static int
711 79 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
712 : {
713 79 : int cnt = 0;
714 :
715 678 : while (len > 0 && *from)
716 : {
717 599 : *to++ = *from++;
718 599 : len--;
719 599 : cnt++;
720 : }
721 79 : *to = 0;
722 79 : return cnt;
723 : }
724 :
/* Single-byte encoding: every character is exactly one byte long. */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;					/* length does not depend on the byte value */
	return 1;
}
730 :
/* Display width for a single-byte encoding: same rules as ASCII. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	const int	width = pg_ascii_dsplen(s);

	return width;
}
736 :
737 : /*
738 : * SJIS
739 : */
/*
 * Byte length of an SJIS character: bytes 0xa1..0xdf are single-byte
 * (1 byte kana?), any other high-bit byte starts a two-byte character
 * (kanji?), and everything else is one byte (should be ASCII).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xa1 && lead <= 0xdf)
		return 1;

	return IS_HIGHBIT_SET(lead) ? 2 : 1;
}
753 :
/*
 * Display width of an SJIS character: one column for bytes 0xa1..0xdf
 * (1 byte kana?), two for any other high-bit lead byte (kanji?), ASCII
 * rules otherwise.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xa1 && lead <= 0xdf)
		return 1;
	if (IS_HIGHBIT_SET(lead))
		return 2;

	return pg_ascii_dsplen(s);
}
767 :
768 : /*
769 : * Big5
770 : */
/* Big5: a high-bit first byte starts a two-byte character; otherwise one byte. */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
782 :
/* Big5 display width: two columns for high-bit lead bytes, ASCII rules otherwise. */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
794 :
795 : /*
796 : * GBK
797 : */
/* GBK: a high-bit first byte starts a two-byte character; otherwise one byte. */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
809 :
/* GBK display width: two columns for high-bit lead bytes, ASCII rules otherwise. */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
821 :
822 : /*
823 : * UHC
824 : */
/* UHC: a high-bit first byte starts a two-byte character; otherwise one byte. */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
836 :
/* UHC display width: two columns for high-bit lead bytes, ASCII rules otherwise. */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
848 :
849 : /*
850 : * GB18030
851 : * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
852 : */
853 :
854 : /*
855 : * Unlike all other mblen() functions, this also looks at the second byte of
856 : * the input. However, if you only pass the first byte of a multi-byte
857 : * string, and \0 as the second byte, this still works in a predictable way:
858 : * a 4-byte character will be reported as two 2-byte characters. That's
859 : * enough for all current uses, as a client-only encoding. It works that
860 : * way, because in any valid 4-byte GB18030-encoded character, the third and
861 : * fourth byte look like a 2-byte encoded character, when looked at
862 : * separately.
863 : */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	/* ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	/* a second byte in 0x30..0x39 marks a four-byte character */
	if (s[1] >= 0x30 && s[1] <= 0x39)
		return 4;

	return 2;
}
877 :
/* GB18030 display width: two columns for high-bit lead bytes, ASCII rules otherwise. */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
889 :
890 : /*
891 : *-------------------------------------------------------------------
892 : * multibyte sequence validators
893 : *
894 : * The verifychar functions accept "s", a pointer to the first byte of a
895 : * string, and "len", the remaining length of the string. If there is a
896 : * validly encoded character beginning at *s, return its length in bytes;
897 : * else return -1.
898 : *
899 : * The verifystr functions also accept "s", a pointer to a string and "len",
900 : * the length of the string. They verify the whole string, and return the
901 : * number of input bytes (<= len) that are valid. In other words, if the
902 : * whole string is valid, verifystr returns "len", otherwise it returns the
903 : * byte offset of the first invalid character. The verifystr functions must
904 : * test for and reject zeroes in the input.
905 : *
906 : * The verifychar functions can assume that len > 0 and that *s != '\0', but
907 : * they must test for and reject zeroes in any additional bytes of a
908 : * multibyte character. Note that this definition allows the function for a
909 : * single-byte encoding to be just "return 1".
910 : *-------------------------------------------------------------------
911 : */
/* Single-byte encoding: any one byte is a complete, valid character. */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
917 :
/*
 * Verify a single-byte string: only zero bytes are invalid.  Returns "len"
 * when no zero byte is present, otherwise the offset of the first one.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *first_nul = memchr(s, 0, len);

	return (first_nul != NULL) ? (int) (first_nul - s) : len;
}
928 :
/* True when byte value c lies in 0xa1..0xfe inclusive. */
#define IS_EUC_RANGE_VALID(c) (0xa1 <= (c) && (c) <= 0xfe)
930 :
931 : static int
932 336 : pg_eucjp_verifychar(const unsigned char *s, int len)
933 : {
934 : int l;
935 : unsigned char c1,
936 : c2;
937 :
938 336 : c1 = *s++;
939 :
940 336 : switch (c1)
941 : {
942 0 : case SS2: /* JIS X 0201 */
943 0 : l = 2;
944 0 : if (l > len)
945 0 : return -1;
946 0 : c2 = *s++;
947 0 : if (c2 < 0xa1 || c2 > 0xdf)
948 0 : return -1;
949 0 : break;
950 :
951 0 : case SS3: /* JIS X 0212 */
952 0 : l = 3;
953 0 : if (l > len)
954 0 : return -1;
955 0 : c2 = *s++;
956 0 : if (!IS_EUC_RANGE_VALID(c2))
957 0 : return -1;
958 0 : c2 = *s++;
959 0 : if (!IS_EUC_RANGE_VALID(c2))
960 0 : return -1;
961 0 : break;
962 :
963 336 : default:
964 336 : if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
965 : {
966 336 : l = 2;
967 336 : if (l > len)
968 56 : return -1;
969 280 : if (!IS_EUC_RANGE_VALID(c1))
970 16 : return -1;
971 264 : c2 = *s++;
972 264 : if (!IS_EUC_RANGE_VALID(c2))
973 120 : return -1;
974 : }
975 : else
976 : /* must be ASCII */
977 : {
978 0 : l = 1;
979 : }
980 144 : break;
981 : }
982 :
983 144 : return l;
984 : }
985 :
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or the first
 * character rejected by pg_eucjp_verifychar().
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			clen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			clen = pg_eucjp_verifychar(p, remaining);
			if (clen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path only has to reject NUL */

		p += clen;
		remaining -= clen;
	}

	return (int) (p - s);
}
1014 :
/*
 * Verify one EUC_KR character starting at s, with "len" bytes available.
 * Returns 1 for a byte without the high bit, 2 for a valid two-byte
 * character, or -1 if the character is truncated or either byte is outside
 * the EUC range.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	/* must be ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(s[0]) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1043 :
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or the first
 * character rejected by pg_euckr_verifychar().
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			clen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			clen = pg_euckr_verifychar(p, remaining);
			if (clen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path only has to reject NUL */

		p += clen;
		remaining -= clen;
	}

	return (int) (p - s);
}
1072 :
1073 : /* EUC-CN byte sequences are exactly the same as EUC-KR, so reuse its verifiers */
1074 : #define pg_euccn_verifychar pg_euckr_verifychar
1075 : #define pg_euccn_verifystr pg_euckr_verifystr
1076 :
1077 : static int
1078 12 : pg_euctw_verifychar(const unsigned char *s, int len)
1079 : {
1080 : int l;
1081 : unsigned char c1,
1082 : c2;
1083 :
1084 12 : c1 = *s++;
1085 :
1086 12 : switch (c1)
1087 : {
1088 0 : case SS2: /* CNS 11643 Plane 1-7 */
1089 0 : l = 4;
1090 0 : if (l > len)
1091 0 : return -1;
1092 0 : c2 = *s++;
1093 0 : if (c2 < 0xa1 || c2 > 0xa7)
1094 0 : return -1;
1095 0 : c2 = *s++;
1096 0 : if (!IS_EUC_RANGE_VALID(c2))
1097 0 : return -1;
1098 0 : c2 = *s++;
1099 0 : if (!IS_EUC_RANGE_VALID(c2))
1100 0 : return -1;
1101 0 : break;
1102 :
1103 0 : case SS3: /* unused */
1104 0 : return -1;
1105 :
1106 12 : default:
1107 12 : if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1108 : {
1109 12 : l = 2;
1110 12 : if (l > len)
1111 4 : return -1;
1112 : /* no further range check on c1? */
1113 8 : c2 = *s++;
1114 8 : if (!IS_EUC_RANGE_VALID(c2))
1115 8 : return -1;
1116 : }
1117 : else
1118 : /* must be ASCII */
1119 : {
1120 0 : l = 1;
1121 : }
1122 0 : break;
1123 : }
1124 0 : return l;
1125 : }
1126 :
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or the first
 * character rejected by pg_euctw_verifychar().
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			clen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			clen = pg_euctw_verifychar(p, remaining);
			if (clen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path only has to reject NUL */

		p += clen;
		remaining -= clen;
	}

	return (int) (p - s);
}
1155 :
/*
 * Verify one JOHAB character starting at s, with "len" bytes available.
 * Returns its byte length as reported by pg_johab_mblen(), or -1 if it is
 * truncated or any trailing byte is outside the EUC range.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* a lead byte without the high bit needs no further checks */
	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1179 :
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or the first
 * character rejected by pg_johab_verifychar().
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			clen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			clen = pg_johab_verifychar(p, remaining);
			if (clen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path only has to reject NUL */

		p += clen;
		remaining -= clen;
	}

	return (int) (p - s);
}
1208 :
/* Single-byte encoding: any one byte is a complete, valid character. */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1214 :
/*
 * Verify a single-byte string: only zero bytes are invalid.  Returns "len"
 * when no zero byte is present, otherwise the offset of the first one.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *first_nul = memchr(s, 0, len);

	return (first_nul != NULL) ? (int) (first_nul - s) : len;
}
1225 :
/*
 * Verify one SJIS character starting at s, with "len" bytes available.
 * Returns its byte length, or -1 if it is truncated or a two-byte character
 * fails the ISSJISHEAD/ISSJISTAIL checks.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char head;
	unsigned char tail;

	if (mbl > len)
		return -1;

	/* pg_sjis_mblen already verified single-byte characters */
	if (mbl == 1)
		return mbl;

	head = s[0];
	tail = s[1];
	if (ISSJISHEAD(head) && ISSJISTAIL(tail))
		return mbl;

	return -1;
}
1248 :
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or the first
 * character rejected by pg_sjis_verifychar().
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			clen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			clen = pg_sjis_verifychar(p, remaining);
			if (clen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path only has to reject NUL */

		p += clen;
		remaining -= clen;
	}

	return (int) (p - s);
}
1277 :
1278 : static int
1279 168 : pg_big5_verifychar(const unsigned char *s, int len)
1280 : {
1281 : int l,
1282 : mbl;
1283 :
1284 168 : l = mbl = pg_big5_mblen(s);
1285 :
1286 168 : if (len < l)
1287 4 : return -1;
1288 :
1289 164 : if (l == 2 &&
1290 164 : s[0] == NONUTF8_INVALID_BYTE0 &&
1291 8 : s[1] == NONUTF8_INVALID_BYTE1)
1292 8 : return -1;
1293 :
1294 264 : while (--l > 0)
1295 : {
1296 156 : if (*++s == '\0')
1297 48 : return -1;
1298 : }
1299 :
1300 108 : return mbl;
1301 : }
1302 :
/*
 * Verify a Big5 string of "len" bytes.  Returns the number of leading bytes
 * that are valid; scanning stops at the first zero byte or the first
 * character rejected by pg_big5_verifychar().
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			clen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			clen = pg_big5_verifychar(p, remaining);
			if (clen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path only has to reject NUL */

		p += clen;
		remaining -= clen;
	}

	return (int) (p - s);
}
1331 :
1332 : static int
1333 140 : pg_gbk_verifychar(const unsigned char *s, int len)
1334 : {
1335 : int l,
1336 : mbl;
1337 :
1338 140 : l = mbl = pg_gbk_mblen(s);
1339 :
1340 140 : if (len < l)
1341 28 : return -1;
1342 :
1343 112 : if (l == 2 &&
1344 112 : s[0] == NONUTF8_INVALID_BYTE0 &&
1345 16 : s[1] == NONUTF8_INVALID_BYTE1)
1346 16 : return -1;
1347 :
1348 192 : while (--l > 0)
1349 : {
1350 96 : if (*++s == '\0')
1351 0 : return -1;
1352 : }
1353 :
1354 96 : return mbl;
1355 : }
1356 :
/*
 * Measure the valid GBK prefix of s, looking at no more than len bytes.
 *
 * Scanning stops at the first NUL byte or at the first character rejected
 * by pg_gbk_verifychar(); the offset reached is returned.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* width of an ASCII-subset byte */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* high-bit byte: needs the full per-character check */
			chlen = pg_gbk_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* a NUL byte ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1385 :
1386 : static int
1387 12 : pg_uhc_verifychar(const unsigned char *s, int len)
1388 : {
1389 : int l,
1390 : mbl;
1391 :
1392 12 : l = mbl = pg_uhc_mblen(s);
1393 :
1394 12 : if (len < l)
1395 4 : return -1;
1396 :
1397 8 : if (l == 2 &&
1398 8 : s[0] == NONUTF8_INVALID_BYTE0 &&
1399 8 : s[1] == NONUTF8_INVALID_BYTE1)
1400 8 : return -1;
1401 :
1402 0 : while (--l > 0)
1403 : {
1404 0 : if (*++s == '\0')
1405 0 : return -1;
1406 : }
1407 :
1408 0 : return mbl;
1409 : }
1410 :
/*
 * Measure the valid UHC prefix of s, looking at no more than len bytes.
 *
 * Scanning stops at the first NUL byte or at the first character rejected
 * by pg_uhc_verifychar(); the offset reached is returned.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* width of an ASCII-subset byte */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* high-bit byte: needs the full per-character check */
			chlen = pg_uhc_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* a NUL byte ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1439 :
/*
 * Verify a single GB18030 character at s, with len bytes available.
 *
 * Returns 1 for an ASCII byte, 2 or 4 for a well-formed multibyte
 * character, and -1 for anything else (including input too short to hold
 * the character).
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	const unsigned char b0 = s[0];

	if (!IS_HIGHBIT_SET(b0))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* A digit in second position means this must be a 4-byte form. */
		if (b0 >= 0x81 && b0 <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	if (len >= 2 && b0 >= 0x81 && b0 <= 0xfe)
	{
		/* Otherwise it must be a 2-byte form; check the trail byte range. */
		const unsigned char b1 = s[1];

		if ((b1 >= 0x40 && b1 <= 0x7e) ||
			(b1 >= 0x80 && b1 <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1470 :
/*
 * Measure the valid GB18030 prefix of s, looking at no more than len bytes.
 *
 * Scanning stops at the first NUL byte or at the first character rejected
 * by pg_gb18030_verifychar(); the offset reached is returned.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* width of an ASCII-subset byte */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* high-bit byte: needs the full per-character check */
			chlen = pg_gb18030_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* a NUL byte ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1499 :
/*
 * Verify a single UTF-8 character at s, with len bytes available.
 *
 * Returns the character's byte length (1..4) when it is acceptable, or -1
 * when it is a NUL byte, is truncated by len, or is rejected by
 * pg_utf8_islegal().
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			nbytes;

	/* ASCII is fine as long as it is not the NUL byte */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/* derive the expected sequence length from the lead byte's prefix */
	if ((lead & 0xe0) == 0xc0)
		nbytes = 2;
	else if ((lead & 0xf0) == 0xe0)
		nbytes = 3;
	else if ((lead & 0xf8) == 0xf0)
		nbytes = 4;
	else
		nbytes = 1;				/* left for pg_utf8_islegal() to judge */

	/* sequence must fit in the input and pass the detailed byte checks */
	if (nbytes > len || !pg_utf8_islegal(s, nbytes))
		return -1;

	return nbytes;
}
1528 :
1529 : /*
1530 : * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1531 : * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1532 : * input byte and current state are used to compute an index into an array of
1533 : * state transitions. Since the address of the next transition is dependent
1534 : * on this computation, there is latency in executing the load instruction,
1535 : * and the CPU is not kept busy.
1536 : *
1537 : * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1538 : *
1539 : * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1540 : *
 * In a shift-based DFA, the input byte is an index into an array of integers
1542 : * whose bit pattern encodes the state transitions. To compute the next
1543 : * state, we simply right-shift the integer by the current state and apply a
1544 : * mask. In this scheme, the address of the transition only depends on the
1545 : * input byte, so there is better pipelining.
1546 : *
1547 : * The naming convention for states and transitions was adopted from a UTF-8
1548 : * to UTF-16/32 transcoder, whose table is reproduced below:
1549 : *
1550 : * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1551 : *
1552 : * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1553 : * ==========================================================================
1554 : * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1555 : * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1556 : * |
1557 : * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1558 : * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1559 : * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1560 : * |
1561 : * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1562 : * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1563 : * |
1564 : * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1565 : * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1566 : *
1567 : * In the most straightforward implementation, a shift-based DFA for UTF-8
1568 : * requires 64-bit integers to encode the transitions, but with an SMT solver
1569 : * it's possible to find state numbers such that the transitions fit within
1570 : * 32-bit integers, as Dougall Johnson demonstrated:
1571 : *
1572 : * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1573 : *
1574 : * This packed representation is the reason for the seemingly odd choice of
1575 : * state values below.
1576 : */
1577 :
/*
 * DFA state numbers.
 *
 * A state's number is also the right-shift applied to a transition entry to
 * extract the next state (see utf8_advance()), so these values cannot be
 * changed independently of one another.
 */

/* Error */
#define ERR  0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.
 *
 * Each macro packs "next state" values, every one shifted left by the
 * number of the current state it applies to.  Current states that are not
 * mentioned get zero bits, i.e. ERR.  All replacement lists are fully
 * parenthesized so the macros stay correct inside larger expressions.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/* continuation byte */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1614 :
/*
 * Transition table for the shift-based UTF-8 DFA, indexed by input byte.
 *
 * Each entry is one of the packed transition words defined above; the next
 * state is obtained by right-shifting the entry by the current state number
 * and masking with 31.  Bytes that can never appear in accepted input (the
 * zero byte, C0, C1 and F5..FF) map to ILL, which sends every state to ERR.
 */
static const uint32 Utf8Transition[256] =
{
	/* ASCII */

	/* 00..1F; the zero byte is treated as invalid */
	ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

	/* 20..3F */
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

	/* 40..5F */
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

	/* 60..7F */
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

	/* continuation bytes */

	/* 80..8F */
	CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
	CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,

	/* 90..9F */
	CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
	CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,

	/* A0..BF */
	CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
	CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
	CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
	CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,

	/* leading bytes */

	/* C0..DF; C0 and C1 are never valid lead bytes */
	ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,

	/* E0..EF; E0 and ED get the restricted-range partial states */
	L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
	L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,

	/* F0..FF; F0 and F4 get the restricted-range partial states */
	L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
	ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
};
1671 :
1672 : static void
1673 1147 : utf8_advance(const unsigned char *s, uint32 *state, int len)
1674 : {
1675 : /* Note: We deliberately don't check the state's value here. */
1676 37851 : while (len > 0)
1677 : {
1678 : /*
1679 : * It's important that the mask value is 31: In most instruction sets,
1680 : * a shift by a 32-bit operand is understood to be a shift by its mod
1681 : * 32, so the compiler should elide the mask operation.
1682 : */
1683 36704 : *state = Utf8Transition[*s++] >> (*state & 31);
1684 36704 : len--;
1685 : }
1686 :
1687 1147 : *state &= 31;
1688 1147 : }
1689 :
/*
 * Return the number of leading bytes of s (at most len) that are valid UTF-8.
 *
 * Input of at least STRIDE_LENGTH bytes first goes through a fast path that
 * consumes one stride at a time: a stride that passes is_valid_ascii() is
 * skipped when no multibyte sequence is pending, and any other stride is fed
 * to the DFA via utf8_advance().  The remaining bytes -- the tail shorter
 * than a stride, or the whole input again if the fast path ended in the
 * error state -- are checked one character at a time, stopping at the first
 * zero byte or the first character rejected by pg_utf8_verifychar().
 */
static int
pg_utf8_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;
	const int	orig_len = len;
	uint32		state = BGN;

/*
 * With a stride of two vector widths, gcc will unroll the loop.  Even if
 * the compiler can unroll a longer loop, it's not worth it because we
 * must fall back to the byte-wise algorithm if we find any non-ASCII.
 */
#define STRIDE_LENGTH (2 * sizeof(Vector8))

	if (len >= STRIDE_LENGTH)
	{
		while (len >= STRIDE_LENGTH)
		{
			/*
			 * If the chunk is all ASCII, we can skip the full UTF-8 check,
			 * but we must first check for a non-END state, which means the
			 * previous chunk ended in the middle of a multibyte sequence.
			 */
			if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
				utf8_advance(s, &state, STRIDE_LENGTH);

			s += STRIDE_LENGTH;
			len -= STRIDE_LENGTH;
		}

		/* The error state persists, so we only need to check for it here. */
		if (state == ERR)
		{
			/*
			 * Start over from the beginning with the slow path so we can
			 * count the valid bytes.
			 */
			len = orig_len;
			s = start;
		}
		else if (state != END)
		{
			/*
			 * The fast path exited in the middle of a multibyte sequence.
			 * Walk backwards to find the leading byte so that the slow path
			 * can resume checking from there.  We must always backtrack at
			 * least one byte, since the current byte could be e.g. an ASCII
			 * byte after a 2-byte lead, which is invalid.
			 */
			do
			{
				Assert(s > start);
				s--;
				len++;
				Assert(IS_HIGHBIT_SET(*s));
			} while (pg_utf_mblen(s) <= 1);
		}
	}

	/* check remaining bytes */
	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			/* a zero byte ends the valid prefix */
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_utf8_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	/* s now points just past the last byte that was accepted */
	return s - start;
}
1773 :
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This applies the RFC3629 rules directly.  The odd-looking limits on the
 * second byte exist so that every Unicode code point has exactly one
 * encoding: a longer-than-necessary sequence padded with high-order zero
 * bits must not be accepted for a character that fits in fewer bytes.
 * Allowing that would open security holes (e.g. an apparently non-ASCII
 * character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Lengths outside 1..4 are rejected (5- and 6-byte forms are refused for
 * now).
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes must be ordinary continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	lead = source[0];

	if (length >= 2)
	{
		/* allowed range of the second byte depends on the lead byte */
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		switch (lead)
		{
			case 0xE0:
				lo = 0xA0;
				break;
			case 0xED:
				hi = 0x9F;
				break;
			case 0xF0:
				lo = 0x90;
				break;
			case 0xF4:
				hi = 0x8F;
				break;
			default:
				break;
		}

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* finally, the lead byte itself must be in an allowed range */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
1844 :
1845 :
1846 : /*
1847 : * Fills the provided buffer with two bytes such that:
1848 : * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
1849 : */
1850 : void
1851 212 : pg_encoding_set_invalid(int encoding, char *dst)
1852 : {
1853 : Assert(pg_encoding_max_length(encoding) > 1);
1854 :
1855 212 : dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
1856 212 : dst[1] = NONUTF8_INVALID_BYTE1;
1857 212 : }
1858 :
/*
 *-------------------------------------------------------------------
 * encoding info table
 *
 * One row per encoding, indexed by the encoding's ID.  Each row lists two
 * conversion functions (0 where none is provided; going by their names,
 * multibyte-to-wchar and wchar-to-multibyte), then the mblen, dsplen,
 * verifychar and verifystr helpers, and last the maximum length of a
 * character in bytes.  The lookup functions below reach these helpers
 * through the mblen, dsplen, mbverifychar, mbverifystr and maxmblen
 * members.
 *-------------------------------------------------------------------
 */
const pg_wchar_tbl pg_wchar_table[] = {
	[PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
	[PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
	[PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3},
	[PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
	[PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
	[PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
	[PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
	/* single-byte encodings all share the latin1 helpers */
	[PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	/* the remaining encodings come without wchar conversion functions */
	[PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
	[PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
	[PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
	[PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
	[PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
	[PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
	[PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
};
1907 :
1908 : /*
1909 : * Returns the byte length of a multibyte character.
1910 : *
1911 : * Choose "mblen" functions based on the input string characteristics.
1912 : * pg_encoding_mblen() can be used when ANY of these conditions are met:
1913 : *
1914 : * - The input string is zero-terminated
1915 : *
1916 : * - The input string is known to be valid in the encoding (e.g., string
1917 : * converted from database encoding)
1918 : *
1919 : * - The encoding is not GB18030 (e.g., when only database encodings are
1920 : * passed to 'encoding' parameter)
1921 : *
1922 : * encoding==GB18030 requires examining up to two bytes to determine character
1923 : * length. Therefore, callers satisfying none of those conditions must use
1924 : * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
1925 : * guaranteed to be within allocation bounds.
1926 : *
1927 : * When dealing with text that is not certainly valid in the specified
1928 : * encoding, the result may exceed the actual remaining string length.
1929 : * Callers that are not prepared to deal with that should use Min(remaining,
1930 : * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
1931 : * pg_encoding_mblen_bounded() are interchangeable.
1932 : */
1933 : int
1934 30038425 : pg_encoding_mblen(int encoding, const char *mbstr)
1935 : {
1936 30038425 : return (PG_VALID_ENCODING(encoding) ?
1937 60076850 : pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1938 0 : pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1939 : }
1940 :
1941 : /*
1942 : * Returns the byte length of a multibyte character (possibly not
1943 : * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
1944 : */
1945 : int
1946 3136 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
1947 : size_t remaining)
1948 : {
1949 : /*
1950 : * Define zero remaining as too few, even for single-byte encodings.
1951 : * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
1952 : * zero; others read one.
1953 : */
1954 3136 : if (remaining < 1 ||
1955 202 : (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
1956 42 : return INT_MAX;
1957 3094 : return pg_encoding_mblen(encoding, mbstr);
1958 : }
1959 :
/*
 * Returns the byte length of a multibyte character, capped at the distance
 * to the terminating zero byte.  For input that might lack a terminating
 * zero, use Min(remaining, pg_encoding_mblen_or_incomplete()) instead.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	const int	nominal = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() stops early if a zero byte precedes the nominal length */
	return strnlen(mbstr, nominal);
}
1970 :
1971 : /*
1972 : * Returns the display length of a multibyte character.
1973 : */
1974 : int
1975 29922862 : pg_encoding_dsplen(int encoding, const char *mbstr)
1976 : {
1977 29922862 : return (PG_VALID_ENCODING(encoding) ?
1978 59845724 : pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1979 0 : pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1980 : }
1981 :
1982 : /*
1983 : * Verify the first multibyte character of the given string.
1984 : * Return its byte length if good, -1 if bad. (See comments above for
1985 : * full details of the mbverifychar API.)
1986 : */
1987 : int
1988 4228 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
1989 : {
1990 4228 : return (PG_VALID_ENCODING(encoding) ?
1991 8456 : pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
1992 0 : pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
1993 : }
1994 :
1995 : /*
1996 : * Verify that a string is valid for the given encoding.
1997 : * Returns the number of input bytes (<= len) that form a valid string.
1998 : * (See comments above for full details of the mbverifystr API.)
1999 : */
2000 : int
2001 233718 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2002 : {
2003 233718 : return (PG_VALID_ENCODING(encoding) ?
2004 467436 : pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2005 0 : pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2006 : }
2007 :
2008 : /*
2009 : * fetch maximum length of a given encoding
2010 : */
2011 : int
2012 683458 : pg_encoding_max_length(int encoding)
2013 : {
2014 : Assert(PG_VALID_ENCODING(encoding));
2015 :
2016 : /*
2017 : * Check for the encoding despite the assert, due to some mingw versions
2018 : * otherwise issuing bogus warnings.
2019 : */
2020 683458 : return PG_VALID_ENCODING(encoding) ?
2021 1366916 : pg_wchar_table[encoding].maxmblen :
2022 : pg_wchar_table[PG_SQL_ASCII].maxmblen;
2023 : }
|