Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wchar.c
4 : * Functions for working with multibyte characters in various encodings.
5 : *
6 : * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/common/wchar.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #include "mb/pg_wchar.h"
16 : #include "utils/ascii.h"
17 :
18 :
19 : /*
20 : * Operations on multi-byte encodings are driven by a table of helper
21 : * functions.
22 : *
23 : * To add support for an encoding, define mblen(), dsplen(), verifychar() and
24 : * verifystr() for the encoding. For server encodings, also define mb2wchar()
25 : * and wchar2mb() conversion functions.
26 : *
27 : * These functions generally assume that their input is validly formed.
28 : * The "verifier" functions, further down in the file, have to be more
29 : * paranoid.
30 : *
31 : * We expect that mblen() does not need to examine more than the first byte
32 : * of the character to discover the correct length. GB18030 is an exception
33 : * to that rule, though, as it also looks at second byte. But even that
34 : * behaves in a predictable way, if you only pass the first byte: it will
35 : * treat 4-byte encoded characters as two 2-byte encoded characters, which is
36 : * good enough for all current uses.
37 : *
38 : * Note: for the display output of psql to work properly, the return values
39 : * of the dsplen functions must conform to the Unicode standard. In particular
40 : * the NUL character is zero width and control characters are generally
41 : * width -1. It is recommended that non-ASCII encodings refer their ASCII
42 : * subset to the ASCII routines to ensure consistency.
43 : */
44 :
45 : /*
46 : * SQL/ASCII
47 : */
48 : static int
49 762 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
50 : {
51 762 : int cnt = 0;
52 :
53 63428 : while (len > 0 && *from)
54 : {
55 62666 : *to++ = *from++;
56 62666 : len--;
57 62666 : cnt++;
58 : }
59 762 : *to = 0;
60 762 : return cnt;
61 : }
62 :
/*
 * Byte length of an SQL_ASCII character: always one, regardless of the
 * byte value, so "s" is never examined.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;					/* length does not depend on the input */
	return 1;
}
68 :
/*
 * Display width of the single byte at "s":
 *	0	for NUL,
 *	-1	for bytes below 0x20 and for 0x7f,
 *	1	for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
79 :
80 : /*
81 : * EUC
82 : */
83 : static int
84 0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
85 : {
86 0 : int cnt = 0;
87 :
88 0 : while (len > 0 && *from)
89 : {
90 0 : if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
91 : * KANA") */
92 : {
93 0 : from++;
94 0 : *to = (SS2 << 8) | *from++;
95 0 : len -= 2;
96 : }
97 0 : else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
98 : {
99 0 : from++;
100 0 : *to = (SS3 << 16) | (*from++ << 8);
101 0 : *to |= *from++;
102 0 : len -= 3;
103 : }
104 0 : else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
105 : {
106 0 : *to = *from++ << 8;
107 0 : *to |= *from++;
108 0 : len -= 2;
109 : }
110 : else /* must be ASCII */
111 : {
112 0 : *to = *from++;
113 0 : len--;
114 : }
115 0 : to++;
116 0 : cnt++;
117 : }
118 0 : *to = 0;
119 0 : return cnt;
120 : }
121 :
122 : static inline int
123 192 : pg_euc_mblen(const unsigned char *s)
124 : {
125 : int len;
126 :
127 192 : if (*s == SS2)
128 0 : len = 2;
129 192 : else if (*s == SS3)
130 0 : len = 3;
131 192 : else if (IS_HIGHBIT_SET(*s))
132 120 : len = 2;
133 : else
134 72 : len = 1;
135 192 : return len;
136 : }
137 :
138 : static inline int
139 0 : pg_euc_dsplen(const unsigned char *s)
140 : {
141 : int len;
142 :
143 0 : if (*s == SS2)
144 0 : len = 2;
145 0 : else if (*s == SS3)
146 0 : len = 2;
147 0 : else if (IS_HIGHBIT_SET(*s))
148 0 : len = 2;
149 : else
150 0 : len = pg_ascii_dsplen(s);
151 0 : return len;
152 : }
153 :
154 : /*
155 : * EUC_JP
156 : */
157 : static int
158 0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
159 : {
160 0 : return pg_euc2wchar_with_len(from, to, len);
161 : }
162 :
/* EUC_JP character length: same rules as the generic EUC routine. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
168 :
169 : static int
170 0 : pg_eucjp_dsplen(const unsigned char *s)
171 : {
172 : int len;
173 :
174 0 : if (*s == SS2)
175 0 : len = 1;
176 0 : else if (*s == SS3)
177 0 : len = 2;
178 0 : else if (IS_HIGHBIT_SET(*s))
179 0 : len = 2;
180 : else
181 0 : len = pg_ascii_dsplen(s);
182 0 : return len;
183 : }
184 :
185 : /*
186 : * EUC_KR
187 : */
188 : static int
189 0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
190 : {
191 0 : return pg_euc2wchar_with_len(from, to, len);
192 : }
193 :
/* EUC_KR character length: same rules as the generic EUC routine. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
199 :
/* EUC_KR display width: same rules as the generic EUC routine. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
205 :
206 : /*
207 : * EUC_CN
208 : *
209 : */
210 : static int
211 0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
212 : {
213 0 : int cnt = 0;
214 :
215 0 : while (len > 0 && *from)
216 : {
217 0 : if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
218 : {
219 0 : from++;
220 0 : *to = (SS2 << 16) | (*from++ << 8);
221 0 : *to |= *from++;
222 0 : len -= 3;
223 : }
224 0 : else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
225 : {
226 0 : from++;
227 0 : *to = (SS3 << 16) | (*from++ << 8);
228 0 : *to |= *from++;
229 0 : len -= 3;
230 : }
231 0 : else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
232 : {
233 0 : *to = *from++ << 8;
234 0 : *to |= *from++;
235 0 : len -= 2;
236 : }
237 : else
238 : {
239 0 : *to = *from++;
240 0 : len--;
241 : }
242 0 : to++;
243 0 : cnt++;
244 : }
245 0 : *to = 0;
246 0 : return cnt;
247 : }
248 :
/*
 * Byte length of an EUC_CN character: two when the first byte has its high
 * bit set, otherwise one.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
260 :
/*
 * Display width of an EUC_CN character: two columns when the first byte has
 * its high bit set, otherwise the pg_ascii_dsplen() result.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
272 :
273 : /*
274 : * EUC_TW
275 : *
276 : */
277 : static int
278 0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
279 : {
280 0 : int cnt = 0;
281 :
282 0 : while (len > 0 && *from)
283 : {
284 0 : if (*from == SS2 && len >= 4) /* code set 2 */
285 : {
286 0 : from++;
287 0 : *to = (((uint32) SS2) << 24) | (*from++ << 16);
288 0 : *to |= *from++ << 8;
289 0 : *to |= *from++;
290 0 : len -= 4;
291 : }
292 0 : else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
293 : {
294 0 : from++;
295 0 : *to = (SS3 << 16) | (*from++ << 8);
296 0 : *to |= *from++;
297 0 : len -= 3;
298 : }
299 0 : else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
300 : {
301 0 : *to = *from++ << 8;
302 0 : *to |= *from++;
303 0 : len -= 2;
304 : }
305 : else
306 : {
307 0 : *to = *from++;
308 0 : len--;
309 : }
310 0 : to++;
311 0 : cnt++;
312 : }
313 0 : *to = 0;
314 0 : return cnt;
315 : }
316 :
317 : static int
318 0 : pg_euctw_mblen(const unsigned char *s)
319 : {
320 : int len;
321 :
322 0 : if (*s == SS2)
323 0 : len = 4;
324 0 : else if (*s == SS3)
325 0 : len = 3;
326 0 : else if (IS_HIGHBIT_SET(*s))
327 0 : len = 2;
328 : else
329 0 : len = 1;
330 0 : return len;
331 : }
332 :
333 : static int
334 0 : pg_euctw_dsplen(const unsigned char *s)
335 : {
336 : int len;
337 :
338 0 : if (*s == SS2)
339 0 : len = 2;
340 0 : else if (*s == SS3)
341 0 : len = 2;
342 0 : else if (IS_HIGHBIT_SET(*s))
343 0 : len = 2;
344 : else
345 0 : len = pg_ascii_dsplen(s);
346 0 : return len;
347 : }
348 :
349 : /*
350 : * Convert pg_wchar to EUC_* encoding.
351 : * caller must allocate enough space for "to", including a trailing zero!
352 : * len: length of from.
353 : * "from" not necessarily null terminated.
354 : */
355 : static int
356 0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
357 : {
358 0 : int cnt = 0;
359 :
360 0 : while (len > 0 && *from)
361 : {
362 : unsigned char c;
363 :
364 0 : if ((c = (*from >> 24)))
365 : {
366 0 : *to++ = c;
367 0 : *to++ = (*from >> 16) & 0xff;
368 0 : *to++ = (*from >> 8) & 0xff;
369 0 : *to++ = *from & 0xff;
370 0 : cnt += 4;
371 : }
372 0 : else if ((c = (*from >> 16)))
373 : {
374 0 : *to++ = c;
375 0 : *to++ = (*from >> 8) & 0xff;
376 0 : *to++ = *from & 0xff;
377 0 : cnt += 3;
378 : }
379 0 : else if ((c = (*from >> 8)))
380 : {
381 0 : *to++ = c;
382 0 : *to++ = *from & 0xff;
383 0 : cnt += 2;
384 : }
385 : else
386 : {
387 0 : *to++ = *from;
388 0 : cnt++;
389 : }
390 0 : from++;
391 0 : len--;
392 : }
393 0 : *to = 0;
394 0 : return cnt;
395 : }
396 :
397 :
398 : /*
399 : * JOHAB
400 : */
/* JOHAB character length: delegates to the generic EUC routine. */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
406 :
/* JOHAB display width: delegates to the generic EUC routine. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
412 :
413 : /*
414 : * convert UTF8 string to pg_wchar (UCS-4)
415 : * caller must allocate enough space for "to", including a trailing zero!
416 : * len: length of from.
417 : * "from" not necessarily null terminated.
418 : */
419 : static int
420 6541598 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
421 : {
422 6541598 : int cnt = 0;
423 : uint32 c1,
424 : c2,
425 : c3,
426 : c4;
427 :
428 140908568 : while (len > 0 && *from)
429 : {
430 134366970 : if ((*from & 0x80) == 0)
431 : {
432 134366514 : *to = *from++;
433 134366514 : len--;
434 : }
435 456 : else if ((*from & 0xe0) == 0xc0)
436 : {
437 364 : if (len < 2)
438 0 : break; /* drop trailing incomplete char */
439 364 : c1 = *from++ & 0x1f;
440 364 : c2 = *from++ & 0x3f;
441 364 : *to = (c1 << 6) | c2;
442 364 : len -= 2;
443 : }
444 92 : else if ((*from & 0xf0) == 0xe0)
445 : {
446 92 : if (len < 3)
447 0 : break; /* drop trailing incomplete char */
448 92 : c1 = *from++ & 0x0f;
449 92 : c2 = *from++ & 0x3f;
450 92 : c3 = *from++ & 0x3f;
451 92 : *to = (c1 << 12) | (c2 << 6) | c3;
452 92 : len -= 3;
453 : }
454 0 : else if ((*from & 0xf8) == 0xf0)
455 : {
456 0 : if (len < 4)
457 0 : break; /* drop trailing incomplete char */
458 0 : c1 = *from++ & 0x07;
459 0 : c2 = *from++ & 0x3f;
460 0 : c3 = *from++ & 0x3f;
461 0 : c4 = *from++ & 0x3f;
462 0 : *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
463 0 : len -= 4;
464 : }
465 : else
466 : {
467 : /* treat a bogus char as length 1; not ours to raise error */
468 0 : *to = *from++;
469 0 : len--;
470 : }
471 134366970 : to++;
472 134366970 : cnt++;
473 : }
474 6541598 : *to = 0;
475 6541598 : return cnt;
476 : }
477 :
478 :
479 : /*
480 : * Trivial conversion from pg_wchar to UTF-8.
481 : * caller should allocate enough space for "to"
482 : * len: length of from.
483 : * "from" not necessarily null terminated.
484 : */
485 : static int
486 1113420 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
487 : {
488 1113420 : int cnt = 0;
489 :
490 16772520 : while (len > 0 && *from)
491 : {
492 : int char_len;
493 :
494 15659100 : unicode_to_utf8(*from, to);
495 15659100 : char_len = pg_utf_mblen(to);
496 15659100 : cnt += char_len;
497 15659100 : to += char_len;
498 15659100 : from++;
499 15659100 : len--;
500 : }
501 1113420 : *to = 0;
502 1113420 : return cnt;
503 : }
504 :
/*
 * Return the byte length of the UTF8 character pointed to by s, looking
 * only at its first byte.
 *
 * Note: in the current implementation we do not support UTF8 sequences
 * of more than 4 bytes; hence do NOT return a value larger than 4.
 * Any leading byte that is either flat-out illegal or indicates a length
 * larger than we support is reported as length 1.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	return 1;
}
539 :
540 : /*
541 : * This is an implementation of wcwidth() and wcswidth() as defined in
542 : * "The Single UNIX Specification, Version 2, The Open Group, 1997"
543 : * <http://www.unix.org/online.html>
544 : *
545 : * Markus Kuhn -- 2001-09-08 -- public domain
546 : *
547 : * customised for PostgreSQL
548 : *
549 : * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
550 : */
551 :
/*
 * One entry of an interval table searched by mbbisearch(): a value matches
 * the entry when it lies between "first" and "last", both ends included.
 */
struct mbinterval
{
	unsigned int first;			/* lowest value covered by this entry */
	unsigned int last;			/* highest value covered by this entry */
};
557 :
558 : /* auxiliary function for binary search in interval table */
559 : static int
560 104792792 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
561 : {
562 104792792 : int min = 0;
563 : int mid;
564 :
565 104792792 : if (ucs < table[0].first || ucs > table[max].last)
566 104784144 : return 0;
567 76608 : while (max >= min)
568 : {
569 68416 : mid = (min + max) / 2;
570 68416 : if (ucs > table[mid].last)
571 16664 : min = mid + 1;
572 51752 : else if (ucs < table[mid].first)
573 51296 : max = mid - 1;
574 : else
575 456 : return 1;
576 : }
577 :
578 8192 : return 0;
579 : }
580 :
581 :
582 : /* The following functions define the column width of an ISO 10646
583 : * character as follows:
584 : *
585 : * - The null character (U+0000) has a column width of 0.
586 : *
587 : * - Other C0/C1 control characters and DEL will lead to a return
588 : * value of -1.
589 : *
590 : * - Non-spacing and enclosing combining characters (general
591 : * category code Mn, Me or Cf in the Unicode database) have a
592 : * column width of 0.
593 : *
594 : * - Spacing characters in the East Asian Wide (W) or East Asian
595 : * FullWidth (F) category as defined in Unicode Technical
596 : * Report #11 have a column width of 2.
597 : *
598 : * - All remaining characters (including all printable
599 : * ISO 8859-1 and WGL4 characters, Unicode control characters,
600 : * etc.) have a column width of 1.
601 : *
602 : * This implementation assumes that wchar_t characters are encoded
603 : * in ISO 10646.
604 : */
605 :
606 : static int
607 52458070 : ucs_wcwidth(pg_wchar ucs)
608 : {
609 : #include "common/unicode_nonspacing_table.h"
610 : #include "common/unicode_east_asian_fw_table.h"
611 :
612 : /* test for 8-bit control characters */
613 52458070 : if (ucs == 0)
614 0 : return 0;
615 :
616 52458070 : if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
617 61494 : return -1;
618 :
619 : /*
620 : * binary search in table of non-spacing characters
621 : *
622 : * XXX: In the official Unicode sources, it is possible for a character to
623 : * be described as both non-spacing and wide at the same time. As of
624 : * Unicode 13.0, treating the non-spacing property as the determining
625 : * factor for display width leads to the correct behavior, so do that
626 : * search first.
627 : */
628 52396576 : if (mbbisearch(ucs, nonspacing,
629 : sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
630 360 : return 0;
631 :
632 : /* binary search in table of wide characters */
633 52396216 : if (mbbisearch(ucs, east_asian_fw,
634 : sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
635 96 : return 2;
636 :
637 52396120 : return 1;
638 : }
639 :
640 : static int
641 52458070 : pg_utf_dsplen(const unsigned char *s)
642 : {
643 52458070 : return ucs_wcwidth(utf8_to_unicode(s));
644 : }
645 :
646 : /*
647 : * convert mule internal code to pg_wchar
648 : * caller should allocate enough space for "to"
649 : * len: length of from.
650 : * "from" not necessarily null terminated.
651 : */
652 : static int
653 0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
654 : {
655 0 : int cnt = 0;
656 :
657 0 : while (len > 0 && *from)
658 : {
659 0 : if (IS_LC1(*from) && len >= 2)
660 : {
661 0 : *to = *from++ << 16;
662 0 : *to |= *from++;
663 0 : len -= 2;
664 : }
665 0 : else if (IS_LCPRV1(*from) && len >= 3)
666 : {
667 0 : from++;
668 0 : *to = *from++ << 16;
669 0 : *to |= *from++;
670 0 : len -= 3;
671 : }
672 0 : else if (IS_LC2(*from) && len >= 3)
673 : {
674 0 : *to = *from++ << 16;
675 0 : *to |= *from++ << 8;
676 0 : *to |= *from++;
677 0 : len -= 3;
678 : }
679 0 : else if (IS_LCPRV2(*from) && len >= 4)
680 : {
681 0 : from++;
682 0 : *to = *from++ << 16;
683 0 : *to |= *from++ << 8;
684 0 : *to |= *from++;
685 0 : len -= 4;
686 : }
687 : else
688 : { /* assume ASCII */
689 0 : *to = (unsigned char) *from++;
690 0 : len--;
691 : }
692 0 : to++;
693 0 : cnt++;
694 : }
695 0 : *to = 0;
696 0 : return cnt;
697 : }
698 :
699 : /*
700 : * convert pg_wchar to mule internal code
701 : * caller should allocate enough space for "to"
702 : * len: length of from.
703 : * "from" not necessarily null terminated.
704 : */
705 : static int
706 0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
707 : {
708 0 : int cnt = 0;
709 :
710 0 : while (len > 0 && *from)
711 : {
712 : unsigned char lb;
713 :
714 0 : lb = (*from >> 16) & 0xff;
715 0 : if (IS_LC1(lb))
716 : {
717 0 : *to++ = lb;
718 0 : *to++ = *from & 0xff;
719 0 : cnt += 2;
720 : }
721 0 : else if (IS_LC2(lb))
722 : {
723 0 : *to++ = lb;
724 0 : *to++ = (*from >> 8) & 0xff;
725 0 : *to++ = *from & 0xff;
726 0 : cnt += 3;
727 : }
728 0 : else if (IS_LCPRV1_A_RANGE(lb))
729 : {
730 0 : *to++ = LCPRV1_A;
731 0 : *to++ = lb;
732 0 : *to++ = *from & 0xff;
733 0 : cnt += 3;
734 : }
735 0 : else if (IS_LCPRV1_B_RANGE(lb))
736 : {
737 0 : *to++ = LCPRV1_B;
738 0 : *to++ = lb;
739 0 : *to++ = *from & 0xff;
740 0 : cnt += 3;
741 : }
742 0 : else if (IS_LCPRV2_A_RANGE(lb))
743 : {
744 0 : *to++ = LCPRV2_A;
745 0 : *to++ = lb;
746 0 : *to++ = (*from >> 8) & 0xff;
747 0 : *to++ = *from & 0xff;
748 0 : cnt += 4;
749 : }
750 0 : else if (IS_LCPRV2_B_RANGE(lb))
751 : {
752 0 : *to++ = LCPRV2_B;
753 0 : *to++ = lb;
754 0 : *to++ = (*from >> 8) & 0xff;
755 0 : *to++ = *from & 0xff;
756 0 : cnt += 4;
757 : }
758 : else
759 : {
760 0 : *to++ = *from & 0xff;
761 0 : cnt += 1;
762 : }
763 0 : from++;
764 0 : len--;
765 : }
766 0 : *to = 0;
767 0 : return cnt;
768 : }
769 :
/*
 * Byte length of a mule internal code character, judged from its first
 * byte only.
 *
 * Exported for direct use by conv.c.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s) || IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	return 1;					/* assume ASCII */
}
788 :
/*
 * Display width of a mule internal code character, judged from its first
 * byte only.
 *
 * Bytes that are not a recognized multibyte lead byte are handed to
 * pg_ascii_dsplen(), so that NUL reports width 0 and control characters
 * report -1, the same as every other dsplen function in this file.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	/*
	 * Note: it's not really appropriate to assume that all multibyte charsets
	 * are double-wide on screen.  But this seems an okay approximation for
	 * the MULE charsets we currently support.
	 */
	if (IS_LC1(*s) || IS_LCPRV1(*s))
		return 1;
	if (IS_LC2(*s) || IS_LCPRV2(*s))
		return 2;

	/* assume ASCII; use the ASCII rules for NUL and control characters */
	return pg_ascii_dsplen(s);
}
813 :
814 : /*
815 : * ISO8859-1
816 : */
817 : static int
818 1070 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
819 : {
820 1070 : int cnt = 0;
821 :
822 30004 : while (len > 0 && *from)
823 : {
824 28934 : *to++ = *from++;
825 28934 : len--;
826 28934 : cnt++;
827 : }
828 1070 : *to = 0;
829 1070 : return cnt;
830 : }
831 :
832 : /*
833 : * Trivial conversion from pg_wchar to single byte encoding. Just ignores
834 : * high bits.
835 : * caller should allocate enough space for "to"
836 : * len: length of from.
837 : * "from" not necessarily null terminated.
838 : */
839 : static int
840 150 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
841 : {
842 150 : int cnt = 0;
843 :
844 1356 : while (len > 0 && *from)
845 : {
846 1206 : *to++ = *from++;
847 1206 : len--;
848 1206 : cnt++;
849 : }
850 150 : *to = 0;
851 150 : return cnt;
852 : }
853 :
/*
 * Byte length of a character in a single-byte encoding: always one, so "s"
 * is never examined.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;					/* length does not depend on the input */
	return 1;
}
859 :
/* Display width for single-byte encodings: same rules as pg_ascii_dsplen(). */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
865 :
866 : /*
867 : * SJIS
868 : */
/*
 * Byte length of an SJIS character, judged from its first byte only:
 * bytes 0xa1..0xdf (1 byte kana?) and bytes without the high bit are one
 * byte long; any other high-bit byte starts a two-byte character (kanji?).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const int	is_kana = (*s >= 0xa1 && *s <= 0xdf);

	if (IS_HIGHBIT_SET(*s) && !is_kana)
		return 2;

	return 1;
}
882 :
/*
 * Display width of an SJIS character, judged from its first byte only:
 * one column for bytes 0xa1..0xdf (1 byte kana?), two columns for any other
 * high-bit byte (kanji?), and the pg_ascii_dsplen() result otherwise.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
896 :
897 : /*
898 : * Big5
899 : */
/*
 * Byte length of a Big5 character: two when the first byte has its high bit
 * set, otherwise one (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
911 :
/*
 * Display width of a Big5 character: two columns when the first byte has
 * its high bit set, otherwise the pg_ascii_dsplen() result.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
923 :
924 : /*
925 : * GBK
926 : */
/*
 * Byte length of a GBK character: two when the first byte has its high bit
 * set, otherwise one (should be ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
938 :
/*
 * Display width of a GBK character: two columns when the first byte has its
 * high bit set, otherwise the pg_ascii_dsplen() result.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
950 :
951 : /*
952 : * UHC
953 : */
/*
 * Byte length of a UHC character: two when the first byte has its high bit
 * set, otherwise one (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
965 :
/*
 * Display width of a UHC character: two columns when the first byte has its
 * high bit set, otherwise the pg_ascii_dsplen() result.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
977 :
978 : /*
979 : * GB18030
980 : * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
981 : */
982 :
/*
 * Byte length of a GB18030 character.
 *
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input: a high-bit first byte followed by a byte in 0x30..0x39 starts a
 * 4-byte character, and any other high-bit first byte a 2-byte character.
 *
 * However, if you only pass the first byte of a multi-byte string, and \0
 * as the second byte, this still works in a predictable way: a 4-byte
 * character will be reported as two 2-byte characters.  That's enough for
 * all current uses, as a client-only encoding.  It works that way, because
 * in any valid 4-byte GB18030-encoded character, the third and fourth byte
 * look like a 2-byte encoded character, when looked at separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1006 :
/*
 * Display width of a GB18030 character: two columns when the first byte has
 * its high bit set, otherwise the pg_ascii_dsplen() result.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1018 :
1019 : /*
1020 : *-------------------------------------------------------------------
1021 : * multibyte sequence validators
1022 : *
1023 : * The verifychar functions accept "s", a pointer to the first byte of a
1024 : * string, and "len", the remaining length of the string. If there is a
1025 : * validly encoded character beginning at *s, return its length in bytes;
1026 : * else return -1.
1027 : *
1028 : * The verifystr functions also accept "s", a pointer to a string and "len",
1029 : * the length of the string. They verify the whole string, and return the
1030 : * number of input bytes (<= len) that are valid. In other words, if the
1031 : * whole string is valid, verifystr returns "len", otherwise it returns the
1032 : * byte offset of the first invalid character. The verifystr functions must
1033 : * test for and reject zeroes in the input.
1034 : *
1035 : * The verifychar functions can assume that len > 0 and that *s != '\0', but
1036 : * they must test for and reject zeroes in any additional bytes of a
1037 : * multibyte character. Note that this definition allows the function for a
1038 : * single-byte encoding to be just "return 1".
1039 : *-------------------------------------------------------------------
1040 : */
/*
 * Verify one SQL_ASCII character: every byte is accepted, so this always
 * reports a valid character of length 1 without looking at its arguments.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1046 :
/*
 * Verify an SQL_ASCII string of "len" bytes.  The only invalid byte is zero:
 * returns "len" if none is present, else the offset of the first zero byte.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *first_nul = memchr(s, 0, len);

	return (first_nul != NULL) ? (int) (first_nul - s) : len;
}
1057 :
/*
 * True when byte value c lies in 0xa1..0xfe, the range the EUC verifiers
 * below accept.  Note that c is evaluated twice.
 */
#define IS_EUC_RANGE_VALID(c)	((c) >= 0xa1 && (c) <= 0xfe)
1059 :
1060 : static int
1061 468 : pg_eucjp_verifychar(const unsigned char *s, int len)
1062 : {
1063 : int l;
1064 : unsigned char c1,
1065 : c2;
1066 :
1067 468 : c1 = *s++;
1068 :
1069 468 : switch (c1)
1070 : {
1071 0 : case SS2: /* JIS X 0201 */
1072 0 : l = 2;
1073 0 : if (l > len)
1074 0 : return -1;
1075 0 : c2 = *s++;
1076 0 : if (c2 < 0xa1 || c2 > 0xdf)
1077 0 : return -1;
1078 0 : break;
1079 :
1080 0 : case SS3: /* JIS X 0212 */
1081 0 : l = 3;
1082 0 : if (l > len)
1083 0 : return -1;
1084 0 : c2 = *s++;
1085 0 : if (!IS_EUC_RANGE_VALID(c2))
1086 0 : return -1;
1087 0 : c2 = *s++;
1088 0 : if (!IS_EUC_RANGE_VALID(c2))
1089 0 : return -1;
1090 0 : break;
1091 :
1092 468 : default:
1093 468 : if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1094 : {
1095 468 : l = 2;
1096 468 : if (l > len)
1097 72 : return -1;
1098 396 : if (!IS_EUC_RANGE_VALID(c1))
1099 0 : return -1;
1100 396 : c2 = *s++;
1101 396 : if (!IS_EUC_RANGE_VALID(c2))
1102 180 : return -1;
1103 : }
1104 : else
1105 : /* must be ASCII */
1106 : {
1107 0 : l = 1;
1108 : }
1109 216 : break;
1110 : }
1111 :
1112 216 : return l;
1113 : }
1114 :
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is valid, else the offset
 * of the first zero byte or invalid character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_eucjp_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return (int) (p - s);
}
1143 :
/*
 * Verify one EUC_KR character starting at "s", with "len" bytes available.
 * Returns the character's byte length if it is validly encoded, else -1.
 *
 * A byte without the high bit is a 1-byte character (must be ASCII).  A
 * high-bit byte starts a 2-byte character, which is valid only if both of
 * its bytes are in the EUC range.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = s[0];

	if (!IS_HIGHBIT_SET(lead))
		return 1;

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(lead) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1172 :
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is valid, else the offset
 * of the first zero byte or invalid character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euckr_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return (int) (p - s);
}
1201 :
/*
 * EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-CN
 * verifiers are simply aliases for the EUC-KR ones.
 */
#define pg_euccn_verifychar	pg_euckr_verifychar
#define pg_euccn_verifystr	pg_euckr_verifystr
1205 :
1206 : static int
1207 0 : pg_euctw_verifychar(const unsigned char *s, int len)
1208 : {
1209 : int l;
1210 : unsigned char c1,
1211 : c2;
1212 :
1213 0 : c1 = *s++;
1214 :
1215 0 : switch (c1)
1216 : {
1217 0 : case SS2: /* CNS 11643 Plane 1-7 */
1218 0 : l = 4;
1219 0 : if (l > len)
1220 0 : return -1;
1221 0 : c2 = *s++;
1222 0 : if (c2 < 0xa1 || c2 > 0xa7)
1223 0 : return -1;
1224 0 : c2 = *s++;
1225 0 : if (!IS_EUC_RANGE_VALID(c2))
1226 0 : return -1;
1227 0 : c2 = *s++;
1228 0 : if (!IS_EUC_RANGE_VALID(c2))
1229 0 : return -1;
1230 0 : break;
1231 :
1232 0 : case SS3: /* unused */
1233 0 : return -1;
1234 :
1235 0 : default:
1236 0 : if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1237 : {
1238 0 : l = 2;
1239 0 : if (l > len)
1240 0 : return -1;
1241 : /* no further range check on c1? */
1242 0 : c2 = *s++;
1243 0 : if (!IS_EUC_RANGE_VALID(c2))
1244 0 : return -1;
1245 : }
1246 : else
1247 : /* must be ASCII */
1248 : {
1249 0 : l = 1;
1250 : }
1251 0 : break;
1252 : }
1253 0 : return l;
1254 : }
1255 :
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is valid, else the offset
 * of the first zero byte or invalid character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euctw_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return (int) (p - s);
}
1284 :
/*
 * Verify one JOHAB character starting at "s", with "len" bytes available.
 * Returns the character's byte length if it is validly encoded, else -1.
 *
 * The length comes from pg_johab_mblen().  A first byte without the high
 * bit is accepted as-is; otherwise every byte after the first must be in
 * the EUC range.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1308 :
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if the whole string is valid, else the offset
 * of the first zero byte or invalid character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_johab_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			chlen = 1;			/* fast path for ASCII-subset characters */

		p += chlen;
		len -= chlen;
	}

	return (int) (p - s);
}
1337 :
/*
 * Verify one mule internal code character starting at "s", with "len" bytes
 * available.  Returns the character's byte length if it is validly encoded,
 * else -1.
 *
 * The length comes from pg_mule_mblen(); every byte after the first must
 * have its high bit set.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1358 :
/*
 * Verify a MULE_INTERNAL string of "len" bytes.
 *
 * Returns the number of leading bytes that were accepted.  The scan stops
 * early at a NUL byte or at the first character that
 * pg_mule_verifychar() rejects.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: let the per-character verifier decide */
			step = pg_mule_verifychar(p, remaining);
			if (step == -1)
				break;
		}
		else if (*p != '\0')
		{
			/* ASCII-subset byte, accepted without calling the verifier */
			step = 1;
		}
		else
			break;				/* NUL byte ends the valid prefix */

		p += step;
		remaining -= step;
	}

	return p - s;
}
1387 :
/*
 * Verify a single character of a single-byte encoding.
 *
 * Always reports a length of 1; neither argument is examined.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;

	return 1;
}
1393 :
/*
 * Verify a string in a single-byte encoding.
 *
 * Returns the offset of the first NUL byte within the first "len" bytes,
 * or "len" itself when there is none.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	if (nul != NULL)
		return nul - s;

	return len;
}
1404 :
/*
 * Verify a single SJIS character at "s", with "len" bytes available.
 *
 * Returns the character's byte length as reported by pg_sjis_mblen(), or
 * -1 if fewer than that many bytes are available or a multibyte character
 * fails the ISSJISHEAD()/ISSJISTAIL() tests on its first two bytes.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char lead;
	unsigned char trail;

	if (len < mbl)
		return -1;

	/* pg_sjis_mblen already verified single-byte characters */
	if (mbl == 1)
		return mbl;

	lead = s[0];
	trail = s[1];

	if (ISSJISHEAD(lead) && ISSJISTAIL(trail))
		return mbl;

	return -1;
}
1427 :
/*
 * Verify an SJIS string of "len" bytes.
 *
 * Returns the number of leading bytes that were accepted.  The scan stops
 * early at a NUL byte or at the first character that
 * pg_sjis_verifychar() rejects.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: let the per-character verifier decide */
			step = pg_sjis_verifychar(p, remaining);
			if (step == -1)
				break;
		}
		else if (*p != '\0')
		{
			/* ASCII-subset byte, accepted without calling the verifier */
			step = 1;
		}
		else
			break;				/* NUL byte ends the valid prefix */

		p += step;
		remaining -= step;
	}

	return p - s;
}
1456 :
/*
 * Verify a single BIG5 character at "s", with "len" bytes available.
 *
 * Returns the character's byte length as reported by pg_big5_mblen(), or
 * -1 if fewer than that many bytes are available or any byte after the
 * first is NUL.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* the only check on trailing bytes is that none of them is NUL */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1476 :
/*
 * Verify a BIG5 string of "len" bytes.
 *
 * Returns the number of leading bytes that were accepted.  The scan stops
 * early at a NUL byte or at the first character that
 * pg_big5_verifychar() rejects.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: let the per-character verifier decide */
			step = pg_big5_verifychar(p, remaining);
			if (step == -1)
				break;
		}
		else if (*p != '\0')
		{
			/* ASCII-subset byte, accepted without calling the verifier */
			step = 1;
		}
		else
			break;				/* NUL byte ends the valid prefix */

		p += step;
		remaining -= step;
	}

	return p - s;
}
1505 :
/*
 * Verify a single GBK character at "s", with "len" bytes available.
 *
 * Returns the character's byte length as reported by pg_gbk_mblen(), or
 * -1 if fewer than that many bytes are available or any byte after the
 * first is NUL.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* the only check on trailing bytes is that none of them is NUL */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1525 :
/*
 * Verify a GBK string of "len" bytes.
 *
 * Returns the number of leading bytes that were accepted.  The scan stops
 * early at a NUL byte or at the first character that
 * pg_gbk_verifychar() rejects.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: let the per-character verifier decide */
			step = pg_gbk_verifychar(p, remaining);
			if (step == -1)
				break;
		}
		else if (*p != '\0')
		{
			/* ASCII-subset byte, accepted without calling the verifier */
			step = 1;
		}
		else
			break;				/* NUL byte ends the valid prefix */

		p += step;
		remaining -= step;
	}

	return p - s;
}
1554 :
/*
 * Verify a single UHC character at "s", with "len" bytes available.
 *
 * Returns the character's byte length as reported by pg_uhc_mblen(), or
 * -1 if fewer than that many bytes are available or any byte after the
 * first is NUL.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* the only check on trailing bytes is that none of them is NUL */
	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1574 :
/*
 * Verify a UHC string of "len" bytes.
 *
 * Returns the number of leading bytes that were accepted.  The scan stops
 * early at a NUL byte or at the first character that
 * pg_uhc_verifychar() rejects.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: let the per-character verifier decide */
			step = pg_uhc_verifychar(p, remaining);
			if (step == -1)
				break;
		}
		else if (*p != '\0')
		{
			/* ASCII-subset byte, accepted without calling the verifier */
			step = 1;
		}
		else
			break;				/* NUL byte ends the valid prefix */

		p += step;
		remaining -= step;
	}

	return p - s;
}
1603 :
/*
 * Verify a single GB18030 character at "s", with "len" bytes available.
 *
 * Returns 1 for an ASCII byte, 4 or 2 for a well-formed multibyte
 * character, and -1 otherwise.  The second byte decides which form is
 * expected: a value in 0x30..0x39 (with at least four bytes available)
 * selects the 4-byte form, anything else the 2-byte form.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	const unsigned char b0 = s[0];

	if (!IS_HIGHBIT_SET(b0))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate the remaining bytes */
		if (b0 < 0x81 || b0 > 0xfe)
			return -1;
		if (s[2] < 0x81 || s[2] > 0xfe)
			return -1;
		if (s[3] < 0x30 || s[3] > 0x39)
			return -1;
		return 4;
	}

	if (len >= 2 && b0 >= 0x81 && b0 <= 0xfe)
	{
		/* Should be 2-byte, validate the second byte */
		const unsigned char b1 = s[1];

		if ((b1 >= 0x40 && b1 <= 0x7e) || (b1 >= 0x80 && b1 <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1634 :
/*
 * Verify a GB18030 string of "len" bytes.
 *
 * Returns the number of leading bytes that were accepted.  The scan stops
 * early at a NUL byte or at the first character that
 * pg_gb18030_verifychar() rejects.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			step;

		if (IS_HIGHBIT_SET(*p))
		{
			/* high-bit byte: let the per-character verifier decide */
			step = pg_gb18030_verifychar(p, remaining);
			if (step == -1)
				break;
		}
		else if (*p != '\0')
		{
			/* ASCII-subset byte, accepted without calling the verifier */
			step = 1;
		}
		else
			break;				/* NUL byte ends the valid prefix */

		p += step;
		remaining -= step;
	}

	return p - s;
}
1663 :
/*
 * Verify a single UTF-8 character at "s", with "len" bytes available.
 *
 * Returns the character's byte length (1..4), or -1 if the first byte is
 * NUL, the sequence implied by the first byte does not fit in "len" bytes,
 * or pg_utf8_islegal() rejects it.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			nbytes;

	/* single-byte (ASCII) case: everything but NUL is fine */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/* derive the nominal sequence length from the lead byte's bit pattern */
	if ((lead & 0xe0) == 0xc0)
		nbytes = 2;
	else if ((lead & 0xf0) == 0xe0)
		nbytes = 3;
	else if ((lead & 0xf8) == 0xf0)
		nbytes = 4;
	else
		nbytes = 1;

	/* the length check must come first so islegal never reads past "len" */
	if (nbytes > len || !pg_utf8_islegal(s, nbytes))
		return -1;

	return nbytes;
}
1692 :
1693 : /*
1694 : * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1695 : * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1696 : * input byte and current state are used to compute an index into an array of
1697 : * state transitions. Since the address of the next transition is dependent
1698 : * on this computation, there is latency in executing the load instruction,
1699 : * and the CPU is not kept busy.
1700 : *
1701 : * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1702 : *
1703 : * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1704 : *
 * In a shift-based DFA, the input byte is an index into an array of integers
1706 : * whose bit pattern encodes the state transitions. To compute the next
1707 : * state, we simply right-shift the integer by the current state and apply a
1708 : * mask. In this scheme, the address of the transition only depends on the
1709 : * input byte, so there is better pipelining.
1710 : *
1711 : * The naming convention for states and transitions was adopted from a UTF-8
1712 : * to UTF-16/32 transcoder, whose table is reproduced below:
1713 : *
1714 : * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1715 : *
1716 : * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1717 : * ==========================================================================
1718 : * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1719 : * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1720 : * |
1721 : * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1722 : * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1723 : * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1724 : * |
1725 : * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1726 : * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1727 : * |
1728 : * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1729 : * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1730 : *
1731 : * In the most straightforward implementation, a shift-based DFA for UTF-8
1732 : * requires 64-bit integers to encode the transitions, but with an SMT solver
1733 : * it's possible to find state numbers such that the transitions fit within
1734 : * 32-bit integers, as Dougall Johnson demonstrated:
1735 : *
1736 : * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1737 : *
1738 : * This packed representation is the reason for the seemingly odd choice of
1739 : * state values below.
1740 : */
1741 :
/*
 * DFA state numbers.  Each state value is also the bit offset at which the
 * "next state" for that state is stored inside a 32-bit transition entry:
 * the next state is obtained by shifting the entry right by the current
 * state and masking with 31 (see utf8_advance()).
 */

/* Error */
#define ERR  0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.  An entry is built by
 * OR-ing together (next_state << current_state) for every current state
 * from which the byte class is acceptable; all other current states read
 * back as ERR (0).
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/*
 * continuation byte
 *
 * The whole expansion is parenthesized so that the macros remain a single
 * operand when used inside a larger expression, e.g. (CR1 >> CS1).
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1778 :
/*
 * Transition table for the shift-based UTF-8 DFA, indexed by input byte.
 *
 * Each entry is one of the byte-class encodings defined above.
 * utf8_advance() computes the next state as
 * (Utf8Transition[byte] >> state) & 31.  Bytes that can never appear in
 * accepted input map to ILL: 0x00, 0xC0, 0xC1 and 0xF5..0xFF.
 */
static const uint32 Utf8Transition[256] =
{
	/* ASCII */

	/* 00..1F; note that the NUL byte (first entry) is treated as invalid */
	ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

	/* 20..3F */
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

	/* 40..5F */
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

	/* 60..7F */
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
	ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

	/* continuation bytes */

	/* 80..8F */
	CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
	CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,

	/* 90..9F */
	CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
	CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,

	/* A0..BF */
	CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
	CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
	CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
	CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,

	/* leading bytes */

	/* C0..DF; C0 and C1 are always invalid */
	ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,

	/* E0..EF; E0 and ED get their own classes (L3A, L3C) */
	L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
	L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,

	/* F0..FF; F0 and F4 get their own classes, F5 and up are invalid */
	L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
	ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
};
1835 :
1836 : static void
1837 1632 : utf8_advance(const unsigned char *s, uint32 *state, int len)
1838 : {
1839 : /* Note: We deliberately don't check the state's value here. */
1840 53856 : while (len > 0)
1841 : {
1842 : /*
1843 : * It's important that the mask value is 31: In most instruction sets,
1844 : * a shift by a 32-bit operand is understood to be a shift by its mod
1845 : * 32, so the compiler should elide the mask operation.
1846 : */
1847 52224 : *state = Utf8Transition[*s++] >> (*state & 31);
1848 52224 : len--;
1849 : }
1850 :
1851 1632 : *state &= 31;
1852 1632 : }
1853 :
/*
 * Verify a UTF-8 string of "len" bytes starting at "s".
 *
 * Returns the number of leading bytes that were accepted: the byte-wise
 * scan at the bottom stops at the first NUL byte, at the first character
 * pg_utf8_verifychar() rejects, or at the end of the input.
 *
 * Inputs of at least STRIDE_LENGTH bytes are first consumed in
 * STRIDE_LENGTH-sized chunks by a fast path built on is_valid_ascii() and
 * the DFA in utf8_advance().  The fast path only tracks the DFA state, so
 * when it ends in error the whole string is rescanned byte-wise to find out
 * how many bytes were valid; when it ends in the middle of a character, the
 * byte-wise scan resumes from that character's lead byte.
 */
static int
pg_utf8_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;
	const int	orig_len = len;
	uint32		state = BGN;

/*
 * With a stride of two vector widths, gcc will unroll the loop. Even if
 * the compiler can unroll a longer loop, it's not worth it because we
 * must fall back to the byte-wise algorithm if we find any non-ASCII.
 */
#define STRIDE_LENGTH (2 * sizeof(Vector8))

	if (len >= STRIDE_LENGTH)
	{
		while (len >= STRIDE_LENGTH)
		{
			/*
			 * If the chunk is all ASCII, we can skip the full UTF-8 check,
			 * but we must first check for a non-END state, which means the
			 * previous chunk ended in the middle of a multibyte sequence.
			 */
			if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
				utf8_advance(s, &state, STRIDE_LENGTH);

			s += STRIDE_LENGTH;
			len -= STRIDE_LENGTH;
		}

		/* The error state persists, so we only need to check for it here. */
		if (state == ERR)
		{
			/*
			 * Start over from the beginning with the slow path so we can
			 * count the valid bytes.
			 */
			len = orig_len;
			s = start;
		}
		else if (state != END)
		{
			/*
			 * The fast path exited in the middle of a multibyte sequence.
			 * Walk backwards to find the leading byte so that the slow path
			 * can resume checking from there. We must always backtrack at
			 * least one byte, since the current byte could be e.g. an ASCII
			 * byte after a 2-byte lead, which is invalid.
			 */
			do
			{
				Assert(s > start);
				s--;
				len++;
				Assert(IS_HIGHBIT_SET(*s));
			} while (pg_utf_mblen(s) <= 1);
		}
	}

	/* check remaining bytes */
	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			/* a NUL byte terminates the valid prefix */
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			/* -1 means the character at s is invalid or truncated */
			l = pg_utf8_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	/* number of bytes consumed before stopping */
	return s - start;
}
1937 :
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This directly implements the rules in RFC3629.  The restrictions on the
 * second byte ensure that there is only one encoding of any given Unicode
 * code point: a longer-than-necessary byte sequence with high-order zero
 * bits may not be used for a character that fits in fewer bytes.  Allowing
 * that would create security hazards (eg, an apparently non-ASCII character
 * that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Any length outside 1..4 is rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	unsigned char second_min = 0x80;
	unsigned char second_max = 0xBF;
	int			i;

	/* only 1..4 byte sequences are accepted; reject lengths 5 and 6 for now */
	if (length < 1 || length > 4)
		return false;

	lead = source[0];

	/* third and fourth bytes, when present, must be plain continuations */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	if (length >= 2)
	{
		/* some lead bytes narrow the range allowed for the second byte */
		if (lead == 0xE0)
			second_min = 0xA0;
		else if (lead == 0xED)
			second_max = 0x9F;
		else if (lead == 0xF0)
			second_min = 0x90;
		else if (lead == 0xF4)
			second_max = 0x8F;

		if (source[1] < second_min || source[1] > second_max)
			return false;
	}

	/* finally the lead byte: no bare continuation, no C0/C1, nothing > F4 */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
2008 :
2009 :
/*
 *-------------------------------------------------------------------
 * encoding info table
 *
 * Indexed by encoding ID.  Each entry lists, in order: the function
 * converting to pg_wchar, the function converting from pg_wchar, then the
 * mblen, dsplen, mbverifychar and mbverifystr functions, and finally
 * maxmblen (the value pg_encoding_max_length() returns).
 *
 * Entries whose two conversion slots are 0 have no conversion functions
 * defined in this table; per the note at the top of the file those are only
 * required for server encodings, so these are presumably client-only
 * encodings (NOTE(review): confirm against pg_wchar.h).
 *-------------------------------------------------------------------
 */
const pg_wchar_tbl pg_wchar_table[] = {
	[PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
	[PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
	[PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
	[PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
	[PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
	/* EUC_JIS_2004 reuses the EUC_JP routines */
	[PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
	[PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
	[PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
	/* all single-byte encodings below share the latin1 routines */
	[PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	/* the remaining encodings have no pg_wchar conversion functions */
	[PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
	[PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
	[PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
	[PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
	[PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
	[PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
	/* SHIFT_JIS_2004 reuses the SJIS routines */
	[PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
};
2059 :
2060 : /*
2061 : * Returns the byte length of a multibyte character.
2062 : *
2063 : * Caution: when dealing with text that is not certainly valid in the
2064 : * specified encoding, the result may exceed the actual remaining
2065 : * string length. Callers that are not prepared to deal with that
2066 : * should use pg_encoding_mblen_bounded() instead.
2067 : */
2068 : int
2069 52663272 : pg_encoding_mblen(int encoding, const char *mbstr)
2070 : {
2071 52663272 : return (PG_VALID_ENCODING(encoding) ?
2072 105326544 : pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2073 0 : pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2074 : }
2075 :
/*
 * Returns the byte length of the multibyte character at mbstr, but never
 * more than the distance to the terminating NUL of the string.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	/* length the encoding claims for this character */
	const int	nominal = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() stops at a NUL, which caps the result at the string's end */
	return strnlen(mbstr, nominal);
}
2085 :
2086 : /*
2087 : * Returns the display length of a multibyte character.
2088 : */
2089 : int
2090 52494570 : pg_encoding_dsplen(int encoding, const char *mbstr)
2091 : {
2092 52494570 : return (PG_VALID_ENCODING(encoding) ?
2093 104989140 : pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2094 0 : pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2095 : }
2096 :
2097 : /*
2098 : * Verify the first multibyte character of the given string.
2099 : * Return its byte length if good, -1 if bad. (See comments above for
2100 : * full details of the mbverifychar API.)
2101 : */
2102 : int
2103 2286 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2104 : {
2105 2286 : return (PG_VALID_ENCODING(encoding) ?
2106 4572 : pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2107 0 : pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2108 : }
2109 :
2110 : /*
2111 : * Verify that a string is valid for the given encoding.
2112 : * Returns the number of input bytes (<= len) that form a valid string.
2113 : * (See comments above for full details of the mbverifystr API.)
2114 : */
2115 : int
2116 455450 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2117 : {
2118 455450 : return (PG_VALID_ENCODING(encoding) ?
2119 910900 : pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2120 0 : pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2121 : }
2122 :
2123 : /*
2124 : * fetch maximum length of a given encoding
2125 : */
2126 : int
2127 848226 : pg_encoding_max_length(int encoding)
2128 : {
2129 : Assert(PG_VALID_ENCODING(encoding));
2130 :
2131 848226 : return pg_wchar_table[encoding].maxmblen;
2132 : }
|