Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wchar.c
4 : * Functions for working with multibyte characters in various encodings.
5 : *
6 : * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/common/wchar.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #include "mb/pg_wchar.h"
16 : #include "utils/ascii.h"
17 :
18 :
19 : /*
20 : * In today's multibyte encodings other than UTF8, this two-byte sequence
21 : * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
22 : *
23 : * For historical reasons, several verifychar implementations opt to reject
24 : * this pair specifically. Byte pair range constraints, in encoding
25 : * originator documentation, always excluded this pair. No core conversion
26 : * could translate it. However, longstanding verifychar implementations
27 : * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
28 : * pairs not valid per encoding originator documentation. To avoid tightening
29 : * core or non-core conversions in a security patch, we sought this one pair.
30 : *
31 : * PQescapeString() historically used spaces for BYTE1; many other values
32 : * could suffice for BYTE1.
33 : */
34 : #define NONUTF8_INVALID_BYTE0 (0x8d) /* first byte of the invalid two-byte sequence */
35 : #define NONUTF8_INVALID_BYTE1 (' ') /* second byte of the sequence: an ASCII space */
36 :
37 :
38 : /*
39 : * Operations on multi-byte encodings are driven by a table of helper
40 : * functions.
41 : *
42 : * To add support for an encoding, define mblen(), dsplen(), verifychar() and
43 : * verifystr() for the encoding. For server encodings, also define mb2wchar()
44 : * and wchar2mb() conversion functions.
45 : *
46 : * These functions generally assume that their input is validly formed.
47 : * The "verifier" functions, further down in the file, have to be more
48 : * paranoid.
49 : *
50 : * We expect that mblen() does not need to examine more than the first byte
51 : * of the character to discover the correct length. GB18030 is an exception
52 : * to that rule, though, as it also looks at second byte. But even that
53 : * behaves in a predictable way, if you only pass the first byte: it will
54 : * treat 4-byte encoded characters as two 2-byte encoded characters, which is
55 : * good enough for all current uses.
56 : *
57 : * Note: for the display output of psql to work properly, the return values
58 : * of the dsplen functions must conform to the Unicode standard. In particular
59 : * the NUL character is zero width and control characters are generally
60 : * width -1. It is recommended that non-ASCII encodings refer their ASCII
61 : * subset to the ASCII routines to ensure consistency.
62 : */
63 :
64 : /*
65 : * SQL/ASCII
66 : */
67 : static int
68 762 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
69 : {
70 762 : int cnt = 0;
71 :
72 63428 : while (len > 0 && *from)
73 : {
74 62666 : *to++ = *from++;
75 62666 : len--;
76 62666 : cnt++;
77 : }
78 762 : *to = 0;
79 762 : return cnt;
80 : }
81 :
/*
 * Byte length of the SQL_ASCII character at "s": always one byte, so the
 * input is never examined.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;					/* length does not depend on the byte value */

	return 1;
}
87 :
/*
 * Display width of the single-byte character at "s".
 *
 * Returns 0 for NUL, -1 for control characters (bytes below 0x20, and DEL),
 * and 1 for every other byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
98 :
99 : /*
100 : * EUC
101 : */
102 : static int
103 0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
104 : {
105 0 : int cnt = 0;
106 :
107 0 : while (len > 0 && *from)
108 : {
109 0 : if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
110 : * KANA") */
111 : {
112 0 : from++;
113 0 : *to = (SS2 << 8) | *from++;
114 0 : len -= 2;
115 : }
116 0 : else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
117 : {
118 0 : from++;
119 0 : *to = (SS3 << 16) | (*from++ << 8);
120 0 : *to |= *from++;
121 0 : len -= 3;
122 : }
123 0 : else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
124 : {
125 0 : *to = *from++ << 8;
126 0 : *to |= *from++;
127 0 : len -= 2;
128 : }
129 : else /* must be ASCII */
130 : {
131 0 : *to = *from++;
132 0 : len--;
133 : }
134 0 : to++;
135 0 : cnt++;
136 : }
137 0 : *to = 0;
138 0 : return cnt;
139 : }
140 :
141 : static inline int
142 234 : pg_euc_mblen(const unsigned char *s)
143 : {
144 : int len;
145 :
146 234 : if (*s == SS2)
147 0 : len = 2;
148 234 : else if (*s == SS3)
149 0 : len = 3;
150 234 : else if (IS_HIGHBIT_SET(*s))
151 162 : len = 2;
152 : else
153 72 : len = 1;
154 234 : return len;
155 : }
156 :
157 : static inline int
158 0 : pg_euc_dsplen(const unsigned char *s)
159 : {
160 : int len;
161 :
162 0 : if (*s == SS2)
163 0 : len = 2;
164 0 : else if (*s == SS3)
165 0 : len = 2;
166 0 : else if (IS_HIGHBIT_SET(*s))
167 0 : len = 2;
168 : else
169 0 : len = pg_ascii_dsplen(s);
170 0 : return len;
171 : }
172 :
173 : /*
174 : * EUC_JP
175 : */
176 : static int
177 0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
178 : {
179 0 : return pg_euc2wchar_with_len(from, to, len);
180 : }
181 :
/*
 * Byte length of an EUC_JP character; same rules as generic EUC.
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
187 :
188 : static int
189 0 : pg_eucjp_dsplen(const unsigned char *s)
190 : {
191 : int len;
192 :
193 0 : if (*s == SS2)
194 0 : len = 1;
195 0 : else if (*s == SS3)
196 0 : len = 2;
197 0 : else if (IS_HIGHBIT_SET(*s))
198 0 : len = 2;
199 : else
200 0 : len = pg_ascii_dsplen(s);
201 0 : return len;
202 : }
203 :
204 : /*
205 : * EUC_KR
206 : */
207 : static int
208 0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
209 : {
210 0 : return pg_euc2wchar_with_len(from, to, len);
211 : }
212 :
/*
 * Byte length of an EUC_KR character; same rules as generic EUC.
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
218 :
/*
 * Display width of an EUC_KR character; same rules as generic EUC.
 */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
224 :
225 : /*
226 : * EUC_CN
227 : *
228 : */
229 : static int
230 0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
231 : {
232 0 : int cnt = 0;
233 :
234 0 : while (len > 0 && *from)
235 : {
236 0 : if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
237 : {
238 0 : from++;
239 0 : *to = (SS2 << 16) | (*from++ << 8);
240 0 : *to |= *from++;
241 0 : len -= 3;
242 : }
243 0 : else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
244 : {
245 0 : from++;
246 0 : *to = (SS3 << 16) | (*from++ << 8);
247 0 : *to |= *from++;
248 0 : len -= 3;
249 : }
250 0 : else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
251 : {
252 0 : *to = *from++ << 8;
253 0 : *to |= *from++;
254 0 : len -= 2;
255 : }
256 : else
257 : {
258 0 : *to = *from++;
259 0 : len--;
260 : }
261 0 : to++;
262 0 : cnt++;
263 : }
264 0 : *to = 0;
265 0 : return cnt;
266 : }
267 :
/*
 * Byte length of the EUC_CN character starting at "s": 2 when the first
 * byte has its high bit set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
279 :
/*
 * Display width of the EUC_CN character starting at "s": 2 columns for a
 * high-bit lead byte, otherwise whatever the ASCII rules say.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
291 :
292 : /*
293 : * EUC_TW
294 : *
295 : */
296 : static int
297 0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
298 : {
299 0 : int cnt = 0;
300 :
301 0 : while (len > 0 && *from)
302 : {
303 0 : if (*from == SS2 && len >= 4) /* code set 2 */
304 : {
305 0 : from++;
306 0 : *to = (((uint32) SS2) << 24) | (*from++ << 16);
307 0 : *to |= *from++ << 8;
308 0 : *to |= *from++;
309 0 : len -= 4;
310 : }
311 0 : else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
312 : {
313 0 : from++;
314 0 : *to = (SS3 << 16) | (*from++ << 8);
315 0 : *to |= *from++;
316 0 : len -= 3;
317 : }
318 0 : else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
319 : {
320 0 : *to = *from++ << 8;
321 0 : *to |= *from++;
322 0 : len -= 2;
323 : }
324 : else
325 : {
326 0 : *to = *from++;
327 0 : len--;
328 : }
329 0 : to++;
330 0 : cnt++;
331 : }
332 0 : *to = 0;
333 0 : return cnt;
334 : }
335 :
336 : static int
337 6 : pg_euctw_mblen(const unsigned char *s)
338 : {
339 : int len;
340 :
341 6 : if (*s == SS2)
342 0 : len = 4;
343 6 : else if (*s == SS3)
344 0 : len = 3;
345 6 : else if (IS_HIGHBIT_SET(*s))
346 6 : len = 2;
347 : else
348 0 : len = 1;
349 6 : return len;
350 : }
351 :
352 : static int
353 0 : pg_euctw_dsplen(const unsigned char *s)
354 : {
355 : int len;
356 :
357 0 : if (*s == SS2)
358 0 : len = 2;
359 0 : else if (*s == SS3)
360 0 : len = 2;
361 0 : else if (IS_HIGHBIT_SET(*s))
362 0 : len = 2;
363 : else
364 0 : len = pg_ascii_dsplen(s);
365 0 : return len;
366 : }
367 :
368 : /*
369 : * Convert pg_wchar to EUC_* encoding.
370 : * caller must allocate enough space for "to", including a trailing zero!
371 : * len: length of from.
372 : * "from" not necessarily null terminated.
373 : */
374 : static int
375 0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
376 : {
377 0 : int cnt = 0;
378 :
379 0 : while (len > 0 && *from)
380 : {
381 : unsigned char c;
382 :
383 0 : if ((c = (*from >> 24)))
384 : {
385 0 : *to++ = c;
386 0 : *to++ = (*from >> 16) & 0xff;
387 0 : *to++ = (*from >> 8) & 0xff;
388 0 : *to++ = *from & 0xff;
389 0 : cnt += 4;
390 : }
391 0 : else if ((c = (*from >> 16)))
392 : {
393 0 : *to++ = c;
394 0 : *to++ = (*from >> 8) & 0xff;
395 0 : *to++ = *from & 0xff;
396 0 : cnt += 3;
397 : }
398 0 : else if ((c = (*from >> 8)))
399 : {
400 0 : *to++ = c;
401 0 : *to++ = *from & 0xff;
402 0 : cnt += 2;
403 : }
404 : else
405 : {
406 0 : *to++ = *from;
407 0 : cnt++;
408 : }
409 0 : from++;
410 0 : len--;
411 : }
412 0 : *to = 0;
413 0 : return cnt;
414 : }
415 :
416 :
417 : /*
418 : * JOHAB
419 : */
/*
 * Byte length of a JOHAB character; computed with the generic EUC rules.
 */
static int
pg_johab_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
425 :
/*
 * Display width of a JOHAB character; computed with the generic EUC rules.
 */
static int
pg_johab_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
431 :
432 : /*
433 : * convert UTF8 string to pg_wchar (UCS-4)
434 : * caller must allocate enough space for "to", including a trailing zero!
435 : * len: length of from.
436 : * "from" not necessarily null terminated.
437 : */
438 : static int
439 6942930 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
440 : {
441 6942930 : int cnt = 0;
442 : uint32 c1,
443 : c2,
444 : c3,
445 : c4;
446 :
447 146530434 : while (len > 0 && *from)
448 : {
449 139587504 : if ((*from & 0x80) == 0)
450 : {
451 139587048 : *to = *from++;
452 139587048 : len--;
453 : }
454 456 : else if ((*from & 0xe0) == 0xc0)
455 : {
456 364 : if (len < 2)
457 0 : break; /* drop trailing incomplete char */
458 364 : c1 = *from++ & 0x1f;
459 364 : c2 = *from++ & 0x3f;
460 364 : *to = (c1 << 6) | c2;
461 364 : len -= 2;
462 : }
463 92 : else if ((*from & 0xf0) == 0xe0)
464 : {
465 92 : if (len < 3)
466 0 : break; /* drop trailing incomplete char */
467 92 : c1 = *from++ & 0x0f;
468 92 : c2 = *from++ & 0x3f;
469 92 : c3 = *from++ & 0x3f;
470 92 : *to = (c1 << 12) | (c2 << 6) | c3;
471 92 : len -= 3;
472 : }
473 0 : else if ((*from & 0xf8) == 0xf0)
474 : {
475 0 : if (len < 4)
476 0 : break; /* drop trailing incomplete char */
477 0 : c1 = *from++ & 0x07;
478 0 : c2 = *from++ & 0x3f;
479 0 : c3 = *from++ & 0x3f;
480 0 : c4 = *from++ & 0x3f;
481 0 : *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
482 0 : len -= 4;
483 : }
484 : else
485 : {
486 : /* treat a bogus char as length 1; not ours to raise error */
487 0 : *to = *from++;
488 0 : len--;
489 : }
490 139587504 : to++;
491 139587504 : cnt++;
492 : }
493 6942930 : *to = 0;
494 6942930 : return cnt;
495 : }
496 :
497 :
498 : /*
499 : * Trivial conversion from pg_wchar to UTF-8.
500 : * caller should allocate enough space for "to"
501 : * len: length of from.
502 : * "from" not necessarily null terminated.
503 : */
504 : static int
505 1113932 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
506 : {
507 1113932 : int cnt = 0;
508 :
509 16777720 : while (len > 0 && *from)
510 : {
511 : int char_len;
512 :
513 15663788 : unicode_to_utf8(*from, to);
514 15663788 : char_len = pg_utf_mblen(to);
515 15663788 : cnt += char_len;
516 15663788 : to += char_len;
517 15663788 : from++;
518 15663788 : len--;
519 : }
520 1113932 : *to = 0;
521 1113932 : return cnt;
522 : }
523 :
/*
 * Return the byte length of the UTF8 character pointed to by s, judging
 * by its first byte only.
 *
 * Note: the current implementation does not support UTF8 sequences of more
 * than 4 bytes, so this never returns a value larger than 4.  Any leading
 * byte that is flat-out illegal, or that announces a longer sequence than
 * we support, yields 1.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	return 1;
}
558 :
559 : /*
560 : * This is an implementation of wcwidth() and wcswidth() as defined in
561 : * "The Single UNIX Specification, Version 2, The Open Group, 1997"
562 : * <http://www.unix.org/online.html>
563 : *
564 : * Markus Kuhn -- 2001-09-08 -- public domain
565 : *
566 : * customised for PostgreSQL
567 : *
568 : * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
569 : */
570 :
571 : struct mbinterval
572 : {
573 : unsigned int first; /* lowest code point covered by the interval */
574 : unsigned int last; /* highest code point covered by the interval (inclusive) */
575 : };
576 :
577 : /* auxiliary function for binary search in interval table */
578 : static int
579 105990556 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
580 : {
581 105990556 : int min = 0;
582 : int mid;
583 :
584 105990556 : if (ucs < table[0].first || ucs > table[max].last)
585 105980252 : return 0;
586 90936 : while (max >= min)
587 : {
588 81376 : mid = (min + max) / 2;
589 81376 : if (ucs > table[mid].last)
590 18032 : min = mid + 1;
591 63344 : else if (ucs < table[mid].first)
592 62600 : max = mid - 1;
593 : else
594 744 : return 1;
595 : }
596 :
597 9560 : return 0;
598 : }
599 :
600 :
601 : /* The following functions define the column width of an ISO 10646
602 : * character as follows:
603 : *
604 : * - The null character (U+0000) has a column width of 0.
605 : *
606 : * - Other C0/C1 control characters and DEL will lead to a return
607 : * value of -1.
608 : *
609 : * - Non-spacing and enclosing combining characters (general
610 : * category code Mn, Me or Cf in the Unicode database) have a
611 : * column width of 0.
612 : *
613 : * - Spacing characters in the East Asian Wide (W) or East Asian
614 : * FullWidth (F) category as defined in Unicode Technical
615 : * Report #11 have a column width of 2.
616 : *
617 : * - All remaining characters (including all printable
618 : * ISO 8859-1 and WGL4 characters, Unicode control characters,
619 : * etc.) have a column width of 1.
620 : *
621 : * This implementation assumes that wchar_t characters are encoded
622 : * in ISO 10646.
623 : */
624 :
625 : static int
626 53057108 : ucs_wcwidth(pg_wchar ucs)
627 : {
628 : #include "common/unicode_nonspacing_table.h"
629 : #include "common/unicode_east_asian_fw_table.h"
630 :
631 : /* test for 8-bit control characters */
632 53057108 : if (ucs == 0)
633 0 : return 0;
634 :
635 53057108 : if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
636 61506 : return -1;
637 :
638 : /*
639 : * binary search in table of non-spacing characters
640 : *
641 : * XXX: In the official Unicode sources, it is possible for a character to
642 : * be described as both non-spacing and wide at the same time. As of
643 : * Unicode 13.0, treating the non-spacing property as the determining
644 : * factor for display width leads to the correct behavior, so do that
645 : * search first.
646 : */
647 52995602 : if (mbbisearch(ucs, nonspacing,
648 : sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
649 648 : return 0;
650 :
651 : /* binary search in table of wide characters */
652 52994954 : if (mbbisearch(ucs, east_asian_fw,
653 : sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
654 96 : return 2;
655 :
656 52994858 : return 1;
657 : }
658 :
659 : static int
660 53057108 : pg_utf_dsplen(const unsigned char *s)
661 : {
662 53057108 : return ucs_wcwidth(utf8_to_unicode(s));
663 : }
664 :
665 : /*
666 : * convert mule internal code to pg_wchar
667 : * caller should allocate enough space for "to"
668 : * len: length of from.
669 : * "from" not necessarily null terminated.
670 : */
671 : static int
672 0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
673 : {
674 0 : int cnt = 0;
675 :
676 0 : while (len > 0 && *from)
677 : {
678 0 : if (IS_LC1(*from) && len >= 2)
679 : {
680 0 : *to = *from++ << 16;
681 0 : *to |= *from++;
682 0 : len -= 2;
683 : }
684 0 : else if (IS_LCPRV1(*from) && len >= 3)
685 : {
686 0 : from++;
687 0 : *to = *from++ << 16;
688 0 : *to |= *from++;
689 0 : len -= 3;
690 : }
691 0 : else if (IS_LC2(*from) && len >= 3)
692 : {
693 0 : *to = *from++ << 16;
694 0 : *to |= *from++ << 8;
695 0 : *to |= *from++;
696 0 : len -= 3;
697 : }
698 0 : else if (IS_LCPRV2(*from) && len >= 4)
699 : {
700 0 : from++;
701 0 : *to = *from++ << 16;
702 0 : *to |= *from++ << 8;
703 0 : *to |= *from++;
704 0 : len -= 4;
705 : }
706 : else
707 : { /* assume ASCII */
708 0 : *to = (unsigned char) *from++;
709 0 : len--;
710 : }
711 0 : to++;
712 0 : cnt++;
713 : }
714 0 : *to = 0;
715 0 : return cnt;
716 : }
717 :
718 : /*
719 : * convert pg_wchar to mule internal code
720 : * caller should allocate enough space for "to"
721 : * len: length of from.
722 : * "from" not necessarily null terminated.
723 : */
724 : static int
725 0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
726 : {
727 0 : int cnt = 0;
728 :
729 0 : while (len > 0 && *from)
730 : {
731 : unsigned char lb;
732 :
733 0 : lb = (*from >> 16) & 0xff;
734 0 : if (IS_LC1(lb))
735 : {
736 0 : *to++ = lb;
737 0 : *to++ = *from & 0xff;
738 0 : cnt += 2;
739 : }
740 0 : else if (IS_LC2(lb))
741 : {
742 0 : *to++ = lb;
743 0 : *to++ = (*from >> 8) & 0xff;
744 0 : *to++ = *from & 0xff;
745 0 : cnt += 3;
746 : }
747 0 : else if (IS_LCPRV1_A_RANGE(lb))
748 : {
749 0 : *to++ = LCPRV1_A;
750 0 : *to++ = lb;
751 0 : *to++ = *from & 0xff;
752 0 : cnt += 3;
753 : }
754 0 : else if (IS_LCPRV1_B_RANGE(lb))
755 : {
756 0 : *to++ = LCPRV1_B;
757 0 : *to++ = lb;
758 0 : *to++ = *from & 0xff;
759 0 : cnt += 3;
760 : }
761 0 : else if (IS_LCPRV2_A_RANGE(lb))
762 : {
763 0 : *to++ = LCPRV2_A;
764 0 : *to++ = lb;
765 0 : *to++ = (*from >> 8) & 0xff;
766 0 : *to++ = *from & 0xff;
767 0 : cnt += 4;
768 : }
769 0 : else if (IS_LCPRV2_B_RANGE(lb))
770 : {
771 0 : *to++ = LCPRV2_B;
772 0 : *to++ = lb;
773 0 : *to++ = (*from >> 8) & 0xff;
774 0 : *to++ = *from & 0xff;
775 0 : cnt += 4;
776 : }
777 : else
778 : {
779 0 : *to++ = *from & 0xff;
780 0 : cnt += 1;
781 : }
782 0 : from++;
783 0 : len--;
784 : }
785 0 : *to = 0;
786 0 : return cnt;
787 : }
788 :
/*
 * Byte length of the MULE_INTERNAL character starting at "s", decided from
 * the first byte alone.  Anything that is not a recognized lead byte is
 * assumed to be ASCII.
 *
 * Exported for direct use by conv.c.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (IS_LC1(lead))
		return 2;
	if (IS_LCPRV1(lead))
		return 3;
	if (IS_LC2(lead))
		return 3;
	if (IS_LCPRV2(lead))
		return 4;

	return 1;					/* assume ASCII */
}
807 :
/*
 * Display width of the MULE_INTERNAL character starting at "s".
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for the
 * MULE charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	const unsigned char lead = *s;

	/* single-byte charsets, official or private */
	if (IS_LC1(lead) || IS_LCPRV1(lead))
		return 1;

	/* two-byte charsets, official or private */
	if (IS_LC2(lead) || IS_LCPRV2(lead))
		return 2;

	return 1;					/* assume ASCII */
}
832 :
833 : /*
834 : * ISO8859-1
835 : */
836 : static int
837 1070 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
838 : {
839 1070 : int cnt = 0;
840 :
841 30004 : while (len > 0 && *from)
842 : {
843 28934 : *to++ = *from++;
844 28934 : len--;
845 28934 : cnt++;
846 : }
847 1070 : *to = 0;
848 1070 : return cnt;
849 : }
850 :
851 : /*
852 : * Trivial conversion from pg_wchar to single byte encoding. Just ignores
853 : * high bits.
854 : * caller should allocate enough space for "to"
855 : * len: length of from.
856 : * "from" not necessarily null terminated.
857 : */
858 : static int
859 150 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
860 : {
861 150 : int cnt = 0;
862 :
863 1356 : while (len > 0 && *from)
864 : {
865 1206 : *to++ = *from++;
866 1206 : len--;
867 1206 : cnt++;
868 : }
869 150 : *to = 0;
870 150 : return cnt;
871 : }
872 :
/*
 * Byte length of a character in a single-byte encoding: always one byte,
 * so the input is never examined.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;					/* length does not depend on the byte value */

	return 1;
}
878 :
/*
 * Display width of a character in a single-byte encoding; uses the same
 * rules as the ASCII routine.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	const int	width = pg_ascii_dsplen(s);

	return width;
}
884 :
885 : /*
886 : * SJIS
887 : */
/*
 * Byte length of the SJIS character starting at "s": bytes 0xa1..0xdf are
 * taken as one-byte kana, other high-bit bytes start a two-byte character
 * (kanji?), and everything else should be ASCII.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xa1 && lead <= 0xdf)
		return 1;				/* 1 byte kana? */

	return IS_HIGHBIT_SET(lead) ? 2 : 1;
}
901 :
/*
 * Display width of the SJIS character starting at "s": 1 column for bytes
 * 0xa1..0xdf (one-byte kana?), 2 for other high-bit lead bytes (kanji?),
 * otherwise the ASCII rules.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xa1 && lead <= 0xdf)
		return 1;				/* 1 byte kana? */
	if (IS_HIGHBIT_SET(lead))
		return 2;				/* kanji? */

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
915 :
916 : /*
917 : * Big5
918 : */
/*
 * Byte length of the Big5 character starting at "s": 2 when the first byte
 * has its high bit set, otherwise 1 (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
930 :
/*
 * Display width of the Big5 character starting at "s": 2 columns for a
 * high-bit lead byte, otherwise the ASCII rules.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
942 :
943 : /*
944 : * GBK
945 : */
/*
 * Byte length of the GBK character starting at "s": 2 when the first byte
 * has its high bit set, otherwise 1 (should be ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
957 :
/*
 * Display width of the GBK character starting at "s": 2 columns for a
 * high-bit lead byte, otherwise the ASCII rules.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
969 :
970 : /*
971 : * UHC
972 : */
/*
 * Byte length of the UHC character starting at "s": 2 when the first byte
 * has its high bit set, otherwise 1 (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
984 :
/*
 * Display width of the UHC character starting at "s": 2 columns for a
 * high-bit lead byte, otherwise the ASCII rules.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
996 :
997 : /*
998 : * GB18030
999 : * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1000 : */
1001 :
/*
 * Byte length of the GB18030 character starting at "s".
 *
 * Unlike all other mblen() functions, this one also inspects the second
 * byte: a high-bit first byte followed by a byte in 0x30..0x39 starts a
 * 4-byte character, any other high-bit first byte starts a 2-byte one.
 *
 * If the caller passes only the first byte of a multi-byte string, with \0
 * as the second byte, the result is still predictable: a 4-byte character
 * is reported as two 2-byte characters.  That's enough for all current
 * uses, as a client-only encoding.  It works that way because in any valid
 * 4-byte GB18030-encoded character, the third and fourth bytes look like a
 * 2-byte encoded character when looked at separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	/* the second byte is examined only for a non-ASCII first byte */
	return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1025 :
/*
 * Display width of the GB18030 character starting at "s": 2 columns for a
 * high-bit lead byte, otherwise the ASCII rules.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1037 :
1038 : /*
1039 : *-------------------------------------------------------------------
1040 : * multibyte sequence validators
1041 : *
1042 : * The verifychar functions accept "s", a pointer to the first byte of a
1043 : * string, and "len", the remaining length of the string. If there is a
1044 : * validly encoded character beginning at *s, return its length in bytes;
1045 : * else return -1.
1046 : *
1047 : * The verifystr functions also accept "s", a pointer to a string and "len",
1048 : * the length of the string. They verify the whole string, and return the
1049 : * number of input bytes (<= len) that are valid. In other words, if the
1050 : * whole string is valid, verifystr returns "len", otherwise it returns the
1051 : * byte offset of the first invalid character. The verifystr functions must
1052 : * test for and reject zeroes in the input.
1053 : *
1054 : * The verifychar functions can assume that len > 0 and that *s != '\0', but
1055 : * they must test for and reject zeroes in any additional bytes of a
1056 : * multibyte character. Note that this definition allows the function for a
1057 : * single-byte encoding to be just "return 1".
1058 : *-------------------------------------------------------------------
1059 : */
/*
 * Verify one SQL_ASCII character: every byte is accepted as a one-byte
 * character, so neither argument is examined.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;

	return 1;
}
1065 :
/*
 * Verify a SQL_ASCII string of "len" bytes.  The only invalid byte is NUL,
 * so the result is the offset of the first zero byte, or "len" if there is
 * none.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
1076 :
1077 : #define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe) /* true for bytes 0xa1..0xfe; evaluates its argument twice */
1078 :
1079 : static int
1080 504 : pg_eucjp_verifychar(const unsigned char *s, int len)
1081 : {
1082 : int l;
1083 : unsigned char c1,
1084 : c2;
1085 :
1086 504 : c1 = *s++;
1087 :
1088 504 : switch (c1)
1089 : {
1090 0 : case SS2: /* JIS X 0201 */
1091 0 : l = 2;
1092 0 : if (l > len)
1093 0 : return -1;
1094 0 : c2 = *s++;
1095 0 : if (c2 < 0xa1 || c2 > 0xdf)
1096 0 : return -1;
1097 0 : break;
1098 :
1099 0 : case SS3: /* JIS X 0212 */
1100 0 : l = 3;
1101 0 : if (l > len)
1102 0 : return -1;
1103 0 : c2 = *s++;
1104 0 : if (!IS_EUC_RANGE_VALID(c2))
1105 0 : return -1;
1106 0 : c2 = *s++;
1107 0 : if (!IS_EUC_RANGE_VALID(c2))
1108 0 : return -1;
1109 0 : break;
1110 :
1111 504 : default:
1112 504 : if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1113 : {
1114 504 : l = 2;
1115 504 : if (l > len)
1116 84 : return -1;
1117 420 : if (!IS_EUC_RANGE_VALID(c1))
1118 24 : return -1;
1119 396 : c2 = *s++;
1120 396 : if (!IS_EUC_RANGE_VALID(c2))
1121 180 : return -1;
1122 : }
1123 : else
1124 : /* must be ASCII */
1125 : {
1126 0 : l = 1;
1127 : }
1128 216 : break;
1129 : }
1130 :
1131 216 : return l;
1132 : }
1133 :
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid, which equals "len" when the whole string is valid.
 * A NUL byte is treated as invalid.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_eucjp_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
		{
			/* ASCII-subset fast path: only NUL is rejected */
			break;
		}

		p += chlen;
		len -= chlen;
	}

	return p - s;
}
1162 :
/*
 * Verify one EUC_KR character starting at "s", with "len" bytes remaining.
 *
 * A byte without the high bit is a one-byte character.  Otherwise the
 * character is two bytes long and both bytes must be in 0xa1..0xfe.
 * Returns the length in bytes, or -1 if the sequence is truncated or out
 * of range.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	/* must be ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(s[0]) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1191 :
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid, which equals "len" when the whole string is valid.
 * A NUL byte is treated as invalid.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euckr_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
		{
			/* ASCII-subset fast path: only NUL is rejected */
			break;
		}

		p += chlen;
		len -= chlen;
	}

	return p - s;
}
1220 :
1221 : /* EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-KR verifiers are reused */
1222 : #define pg_euccn_verifychar pg_euckr_verifychar
1223 : #define pg_euccn_verifystr pg_euckr_verifystr
1224 :
1225 : static int
1226 18 : pg_euctw_verifychar(const unsigned char *s, int len)
1227 : {
1228 : int l;
1229 : unsigned char c1,
1230 : c2;
1231 :
1232 18 : c1 = *s++;
1233 :
1234 18 : switch (c1)
1235 : {
1236 0 : case SS2: /* CNS 11643 Plane 1-7 */
1237 0 : l = 4;
1238 0 : if (l > len)
1239 0 : return -1;
1240 0 : c2 = *s++;
1241 0 : if (c2 < 0xa1 || c2 > 0xa7)
1242 0 : return -1;
1243 0 : c2 = *s++;
1244 0 : if (!IS_EUC_RANGE_VALID(c2))
1245 0 : return -1;
1246 0 : c2 = *s++;
1247 0 : if (!IS_EUC_RANGE_VALID(c2))
1248 0 : return -1;
1249 0 : break;
1250 :
1251 0 : case SS3: /* unused */
1252 0 : return -1;
1253 :
1254 18 : default:
1255 18 : if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1256 : {
1257 18 : l = 2;
1258 18 : if (l > len)
1259 6 : return -1;
1260 : /* no further range check on c1? */
1261 12 : c2 = *s++;
1262 12 : if (!IS_EUC_RANGE_VALID(c2))
1263 12 : return -1;
1264 : }
1265 : else
1266 : /* must be ASCII */
1267 : {
1268 0 : l = 1;
1269 : }
1270 0 : break;
1271 : }
1272 0 : return l;
1273 : }
1274 :
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid, which equals "len" when the whole string is valid.
 * A NUL byte is treated as invalid.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_euctw_verifychar(p, len);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
		{
			/* ASCII-subset fast path: only NUL is rejected */
			break;
		}

		p += chlen;
		len -= chlen;
	}

	return p - s;
}
1303 :
/*
 * Verify one JOHAB character starting at "s", with "len" bytes remaining.
 *
 * The length comes from pg_johab_mblen().  A first byte without the high
 * bit is accepted as-is; otherwise every trailing byte must be in
 * 0xa1..0xfe.  Returns the length in bytes, or -1 if the sequence is
 * truncated or out of range.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	/* check every byte after the first */
	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1327 :
/*
 * Count how many leading bytes of s (at most len) form valid JOHAB text.
 *
 * Scanning stops at the first NUL byte or the first character rejected by
 * pg_johab_verifychar().
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII bytes advance one at a time */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_johab_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1356 :
/*
 * Verify the first MULE_INTERNAL character of s, of which len bytes are
 * available.
 *
 * The length comes from pg_mule_mblen().  Returns that length if the
 * character fits in len bytes and every byte after the first has its high
 * bit set; otherwise returns -1.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}
	return mbl;
}
1377 :
/*
 * Count how many leading bytes of s (at most len) form valid MULE_INTERNAL
 * text.
 *
 * Scanning stops at the first NUL byte or the first character rejected by
 * pg_mule_verifychar().
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII bytes advance one at a time */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_mule_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1406 :
/*
 * Verifier for single-byte encodings: every character is exactly one byte
 * long.  Neither s nor len is inspected.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	const int	char_len = 1;

	return char_len;
}
1412 :
/*
 * Count how many leading bytes of s (at most len) are valid in a single-byte
 * encoding.  Only a NUL byte ends the valid prefix: the result is the offset
 * of the first NUL within the first len bytes, or len if there is none.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
1423 :
/*
 * Verify the first Shift-JIS character of s, of which len bytes are
 * available.
 *
 * The length comes from pg_sjis_mblen().  Returns that length if the
 * character fits in len bytes and, for multibyte characters, the first two
 * bytes satisfy ISSJISHEAD() and ISSJISTAIL() respectively; otherwise
 * returns -1.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);

	if (mbl > len)
		return -1;

	/* a one-byte result was already vetted by pg_sjis_mblen */
	if (mbl == 1)
		return mbl;

	if (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1]))
		return mbl;

	return -1;
}
1446 :
/*
 * Count how many leading bytes of s (at most len) form valid Shift-JIS text.
 *
 * Scanning stops at the first NUL byte or the first character rejected by
 * pg_sjis_verifychar().
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII bytes advance one at a time */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_sjis_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1475 :
1476 : static int
1477 360 : pg_big5_verifychar(const unsigned char *s, int len)
1478 : {
1479 : int l,
1480 : mbl;
1481 :
1482 360 : l = mbl = pg_big5_mblen(s);
1483 :
1484 360 : if (len < l)
1485 6 : return -1;
1486 :
1487 354 : if (l == 2 &&
1488 354 : s[0] == NONUTF8_INVALID_BYTE0 &&
1489 12 : s[1] == NONUTF8_INVALID_BYTE1)
1490 12 : return -1;
1491 :
1492 576 : while (--l > 0)
1493 : {
1494 342 : if (*++s == '\0')
1495 108 : return -1;
1496 : }
1497 :
1498 234 : return mbl;
1499 : }
1500 :
/*
 * Count how many leading bytes of s (at most len) form valid Big5 text.
 *
 * Scanning stops at the first NUL byte or the first character rejected by
 * pg_big5_verifychar().
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII bytes advance one at a time */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_big5_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1529 :
1530 : static int
1531 274 : pg_gbk_verifychar(const unsigned char *s, int len)
1532 : {
1533 : int l,
1534 : mbl;
1535 :
1536 274 : l = mbl = pg_gbk_mblen(s);
1537 :
1538 274 : if (len < l)
1539 54 : return -1;
1540 :
1541 220 : if (l == 2 &&
1542 220 : s[0] == NONUTF8_INVALID_BYTE0 &&
1543 28 : s[1] == NONUTF8_INVALID_BYTE1)
1544 28 : return -1;
1545 :
1546 384 : while (--l > 0)
1547 : {
1548 192 : if (*++s == '\0')
1549 0 : return -1;
1550 : }
1551 :
1552 192 : return mbl;
1553 : }
1554 :
/*
 * Count how many leading bytes of s (at most len) form valid GBK text.
 *
 * Scanning stops at the first NUL byte or the first character rejected by
 * pg_gbk_verifychar().
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII bytes advance one at a time */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_gbk_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1583 :
1584 : static int
1585 18 : pg_uhc_verifychar(const unsigned char *s, int len)
1586 : {
1587 : int l,
1588 : mbl;
1589 :
1590 18 : l = mbl = pg_uhc_mblen(s);
1591 :
1592 18 : if (len < l)
1593 6 : return -1;
1594 :
1595 12 : if (l == 2 &&
1596 12 : s[0] == NONUTF8_INVALID_BYTE0 &&
1597 12 : s[1] == NONUTF8_INVALID_BYTE1)
1598 12 : return -1;
1599 :
1600 0 : while (--l > 0)
1601 : {
1602 0 : if (*++s == '\0')
1603 0 : return -1;
1604 : }
1605 :
1606 0 : return mbl;
1607 : }
1608 :
/*
 * Count how many leading bytes of s (at most len) form valid UHC text.
 *
 * Scanning stops at the first NUL byte or the first character rejected by
 * pg_uhc_verifychar().
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII bytes advance one at a time */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_uhc_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1637 :
/*
 * Verify the first GB18030 character of s, of which len bytes are available.
 *
 * Accepted forms:
 *	1 byte:  high bit clear (ASCII)
 *	4 bytes: 0x81..0xfe, 0x30..0x39, 0x81..0xfe, 0x30..0x39
 *	2 bytes: 0x81..0xfe, then 0x40..0x7e or 0x80..0xfe
 *
 * Returns the character's byte length, or -1 if it is malformed or does not
 * fit within len bytes.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	const unsigned char b0 = s[0];

	if (!IS_HIGHBIT_SET(b0))
		return 1;				/* ASCII */

	/* a digit in the second position announces the 4-byte form */
	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		if (b0 >= 0x81 && b0 <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	/* otherwise it can only be the 2-byte form */
	if (len >= 2 && b0 >= 0x81 && b0 <= 0xfe)
	{
		const unsigned char b1 = s[1];

		if ((b1 >= 0x40 && b1 <= 0x7e) ||
			(b1 >= 0x80 && b1 <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1668 :
/*
 * Count how many leading bytes of s (at most len) form valid GB18030 text.
 *
 * Scanning stops at the first NUL byte or the first character rejected by
 * pg_gb18030_verifychar().
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII bytes advance one at a time */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_gb18030_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1697 :
/*
 * Verify the first UTF-8 character of s, of which len bytes are available.
 *
 * Returns the character's byte length (1..4) if it is legal, or -1 if it is
 * a NUL byte, does not fit within len bytes, or is rejected by
 * pg_utf8_islegal().
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			l;

	/* single-byte (ASCII) case: everything but NUL is fine */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/* derive the sequence length from the lead byte's bit pattern */
	if ((lead & 0xe0) == 0xc0)
		l = 2;
	else if ((lead & 0xf0) == 0xe0)
		l = 3;
	else if ((lead & 0xf8) == 0xf0)
		l = 4;
	else
		l = 1;					/* not a lead byte; pg_utf8_islegal rejects it */

	if (l > len)
		return -1;

	if (!pg_utf8_islegal(s, l))
		return -1;

	return l;
}
1726 :
1727 : /*
1728 : * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1729 : * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1730 : * input byte and current state are used to compute an index into an array of
1731 : * state transitions. Since the address of the next transition is dependent
1732 : * on this computation, there is latency in executing the load instruction,
1733 : * and the CPU is not kept busy.
1734 : *
1735 : * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1736 : *
1737 : * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1738 : *
1739 : * In a shift-based DFA, the input byte is an index into an array of integers
1740 : * whose bit pattern encodes the state transitions. To compute the next
1741 : * state, we simply right-shift the integer by the current state and apply a
1742 : * mask. In this scheme, the address of the transition only depends on the
1743 : * input byte, so there is better pipelining.
1744 : *
1745 : * The naming convention for states and transitions was adopted from a UTF-8
1746 : * to UTF-16/32 transcoder, whose table is reproduced below:
1747 : *
1748 : * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1749 : *
1750 : * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1751 : * ==========================================================================
1752 : * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1753 : * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1754 : * |
1755 : * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1756 : * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1757 : * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1758 : * |
1759 : * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1760 : * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1761 : * |
1762 : * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1763 : * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1764 : *
1765 : * In the most straightforward implementation, a shift-based DFA for UTF-8
1766 : * requires 64-bit integers to encode the transitions, but with an SMT solver
1767 : * it's possible to find state numbers such that the transitions fit within
1768 : * 32-bit integers, as Dougall Johnson demonstrated:
1769 : *
1770 : * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1771 : *
1772 : * This packed representation is the reason for the seemingly odd choice of
1773 : * state values below.
1774 : */
1775 :
1776 : /* Error */
1777 : #define ERR 0
1778 : /* Begin */
1779 : #define BGN 11
1780 : /* Continuation states, expect 1/2/3 continuation bytes */
1781 : #define CS1 16
1782 : #define CS2 1
1783 : #define CS3 5
1784 : /* Partial states, where the first continuation byte has a restricted range */
1785 : #define P3A 6 /* Lead was E0, check for 3-byte overlong */
1786 : #define P3B 20 /* Lead was ED, check for surrogate */
1787 : #define P4A 25 /* Lead was F0, check for 4-byte overlong */
1788 : #define P4B 30 /* Lead was F4, check for too-large */
1789 : /* Begin and End are the same state */
1790 : #define END BGN
1791 :
1792 : /* the encoded state transitions for the lookup table */
1793 :
1794 : /* ASCII */
1795 : #define ASC (END << BGN)
1796 : /* 2-byte lead */
1797 : #define L2A (CS1 << BGN)
1798 : /* 3-byte lead */
1799 : #define L3A (P3A << BGN)
1800 : #define L3B (CS2 << BGN)
1801 : #define L3C (P3B << BGN)
1802 : /* 4-byte lead */
1803 : #define L4A (P4A << BGN)
1804 : #define L4B (CS3 << BGN)
1805 : #define L4C (P4B << BGN)
1806 : /* continuation byte */
1807 : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
1808 : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
1809 : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
1810 : /* invalid byte */
1811 : #define ILL ERR
1812 :
1813 : static const uint32 Utf8Transition[256] =
1814 : {
1815 : /* ASCII */
1816 :
1817 : ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1818 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1819 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1820 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1821 :
1822 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1823 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1824 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1825 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1826 :
1827 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1828 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1829 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1830 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1831 :
1832 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1833 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1834 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1835 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1836 :
1837 : /* continuation bytes */
1838 :
1839 : /* 80..8F */
1840 : CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1841 : CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1842 :
1843 : /* 90..9F */
1844 : CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1845 : CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1846 :
1847 : /* A0..BF */
1848 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1849 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1850 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1851 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1852 :
1853 : /* leading bytes */
1854 :
1855 : /* C0..DF */
1856 : ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1857 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1858 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1859 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1860 :
1861 : /* E0..EF */
1862 : L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1863 : L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1864 :
1865 : /* F0..FF */
1866 : L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1867 : ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1868 : };
1869 :
1870 : static void
1871 1698 : utf8_advance(const unsigned char *s, uint32 *state, int len)
1872 : {
1873 : /* Note: We deliberately don't check the state's value here. */
1874 56034 : while (len > 0)
1875 : {
1876 : /*
1877 : * It's important that the mask value is 31: In most instruction sets,
1878 : * a shift by a 32-bit operand is understood to be a shift by its mod
1879 : * 32, so the compiler should elide the mask operation.
1880 : */
1881 54336 : *state = Utf8Transition[*s++] >> (*state & 31);
1882 54336 : len--;
1883 : }
1884 :
1885 1698 : *state &= 31;
1886 1698 : }
1887 :
1888 : static int
1889 1125416 : pg_utf8_verifystr(const unsigned char *s, int len)
1890 : {
1891 1125416 : const unsigned char *start = s;
1892 1125416 : const int orig_len = len;
1893 1125416 : uint32 state = BGN;
1894 :
1895 : /*
1896 : * With a stride of two vector widths, gcc will unroll the loop. Even if
1897 : * the compiler can unroll a longer loop, it's not worth it because we
1898 : * must fall back to the byte-wise algorithm if we find any non-ASCII.
1899 : */
1900 : #define STRIDE_LENGTH (2 * sizeof(Vector8))
1901 :
1902 1125416 : if (len >= STRIDE_LENGTH)
1903 : {
1904 4039348 : while (len >= STRIDE_LENGTH)
1905 : {
1906 : /*
1907 : * If the chunk is all ASCII, we can skip the full UTF-8 check,
1908 : * but we must first check for a non-END state, which means the
1909 : * previous chunk ended in the middle of a multibyte sequence.
1910 : */
1911 3493996 : if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1912 1698 : utf8_advance(s, &state, STRIDE_LENGTH);
1913 :
1914 3493996 : s += STRIDE_LENGTH;
1915 3493996 : len -= STRIDE_LENGTH;
1916 : }
1917 :
1918 : /* The error state persists, so we only need to check for it here. */
1919 545352 : if (state == ERR)
1920 : {
1921 : /*
1922 : * Start over from the beginning with the slow path so we can
1923 : * count the valid bytes.
1924 : */
1925 504 : len = orig_len;
1926 504 : s = start;
1927 : }
1928 544848 : else if (state != END)
1929 : {
1930 : /*
1931 : * The fast path exited in the middle of a multibyte sequence.
1932 : * Walk backwards to find the leading byte so that the slow path
1933 : * can resume checking from there. We must always backtrack at
1934 : * least one byte, since the current byte could be e.g. an ASCII
1935 : * byte after a 2-byte lead, which is invalid.
1936 : */
1937 : do
1938 : {
1939 : Assert(s > start);
1940 102 : s--;
1941 102 : len++;
1942 : Assert(IS_HIGHBIT_SET(*s));
1943 102 : } while (pg_utf_mblen(s) <= 1);
1944 : }
1945 : }
1946 :
1947 : /* check remaining bytes */
1948 16728562 : while (len > 0)
1949 : {
1950 : int l;
1951 :
1952 : /* fast path for ASCII-subset characters */
1953 15606220 : if (!IS_HIGHBIT_SET(*s))
1954 : {
1955 15588840 : if (*s == '\0')
1956 204 : break;
1957 15588636 : l = 1;
1958 : }
1959 : else
1960 : {
1961 17380 : l = pg_utf8_verifychar(s, len);
1962 17380 : if (l == -1)
1963 2870 : break;
1964 : }
1965 15603146 : s += l;
1966 15603146 : len -= l;
1967 : }
1968 :
1969 1125416 : return s - start;
1970 : }
1971 :
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This follows the rules in RFC3629.  The tighter limits on the second byte
 * after certain lead bytes exist so that each Unicode code point has only
 * one encoding: a longer-than-necessary sequence with high-order zero bits
 * must not be accepted for a character that fits in fewer bytes.  Allowing
 * that would create security hazards (eg, an apparent non-ASCII character
 * that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Any length outside 1..4 is rejected without looking at the buffer.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	unsigned char lo;
	unsigned char hi;

	/* reject lengths 5 and 6 for now (and anything else out of range) */
	if (length < 1 || length > 4)
		return false;

	/* fourth and third bytes are plain continuation bytes */
	if (length == 4 && (source[3] < 0x80 || source[3] > 0xBF))
		return false;
	if (length >= 3 && (source[2] < 0x80 || source[2] > 0xBF))
		return false;

	lead = source[0];

	if (length >= 2)
	{
		/* the second byte's permitted range depends on the lead byte */
		lo = 0x80;
		hi = 0xBF;
		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* finally, the lead byte itself must be ASCII or within 0xC2..0xF4 */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
2042 :
2043 :
2044 : /*
2045 : * Fills the provided buffer with two bytes such that:
2046 : * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
2047 : */
2048 : void
2049 364 : pg_encoding_set_invalid(int encoding, char *dst)
2050 : {
2051 : Assert(pg_encoding_max_length(encoding) > 1);
2052 :
2053 364 : dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
2054 364 : dst[1] = NONUTF8_INVALID_BYTE1;
2055 364 : }
2056 :
/*
 *-------------------------------------------------------------------
 * encoding info table
 *
 * One entry per encoding, indexed by the encoding's ID.  Reading each
 * initializer left to right, the columns are:
 *
 *	1. multibyte-to-wchar conversion function (0 if none is provided)
 *	2. wchar-to-multibyte conversion function (0 if none is provided)
 *	3. mblen:		 byte length of a character
 *	4. dsplen:		 display length of a character
 *	5. mbverifychar: verifier for a single character
 *	6. mbverifystr:	 verifier for a string
 *	7. maxmblen:	 maximum byte length of a character
 *
 * NOTE(review): the struct member names of columns 1 and 2 are not visible
 * from here; the entries carrying 0 for them are presumably the
 * encodings that are not supported as server encodings -- confirm against
 * pg_wchar.h.
 *-------------------------------------------------------------------
 */
const pg_wchar_tbl pg_wchar_table[] = {
	[PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
	[PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
	[PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
	[PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
	[PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
	[PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
	[PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
	[PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
	/* single-byte encodings all share the latin1 helpers */
	[PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	[PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
	/* the remaining entries provide no wchar conversion functions */
	[PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
	[PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
	[PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
	[PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
	[PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
	[PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
	[PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
};
2106 :
2107 : /*
2108 : * Returns the byte length of a multibyte character.
2109 : *
2110 : * Caution: when dealing with text that is not certainly valid in the
2111 : * specified encoding, the result may exceed the actual remaining
2112 : * string length. Callers that are not prepared to deal with that
2113 : * should use pg_encoding_mblen_bounded() instead.
2114 : */
2115 : int
2116 53271762 : pg_encoding_mblen(int encoding, const char *mbstr)
2117 : {
2118 53271762 : return (PG_VALID_ENCODING(encoding) ?
2119 106543524 : pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2120 0 : pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2121 : }
2122 :
/*
 * Returns the byte length of a multibyte character; but not more than
 * the distance to end of string.
 *
 * The nominal length from pg_encoding_mblen() is used as the limit for
 * strnlen(), so a NUL terminator inside the character shortens the result.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	int			nominal_len = pg_encoding_mblen(encoding, mbstr);

	return strnlen(mbstr, nominal_len);
}
2132 :
2133 : /*
2134 : * Returns the display length of a multibyte character.
2135 : */
2136 : int
2137 53094040 : pg_encoding_dsplen(int encoding, const char *mbstr)
2138 : {
2139 53094040 : return (PG_VALID_ENCODING(encoding) ?
2140 106188080 : pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2141 0 : pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2142 : }
2143 :
2144 : /*
2145 : * Verify the first multibyte character of the given string.
2146 : * Return its byte length if good, -1 if bad. (See comments above for
2147 : * full details of the mbverifychar API.)
2148 : */
2149 : int
2150 9670 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2151 : {
2152 9670 : return (PG_VALID_ENCODING(encoding) ?
2153 19340 : pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2154 0 : pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2155 : }
2156 :
2157 : /*
2158 : * Verify that a string is valid for the given encoding.
2159 : * Returns the number of input bytes (<= len) that form a valid string.
2160 : * (See comments above for full details of the mbverifystr API.)
2161 : */
2162 : int
2163 457992 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2164 : {
2165 457992 : return (PG_VALID_ENCODING(encoding) ?
2166 915984 : pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2167 0 : pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2168 : }
2169 :
2170 : /*
2171 : * fetch maximum length of a given encoding
2172 : */
2173 : int
2174 856698 : pg_encoding_max_length(int encoding)
2175 : {
2176 : Assert(PG_VALID_ENCODING(encoding));
2177 :
2178 : /*
2179 : * Check for the encoding despite the assert, due to some mingw versions
2180 : * otherwise issuing bogus warnings.
2181 : */
2182 856698 : return PG_VALID_ENCODING(encoding) ?
2183 1713396 : pg_wchar_table[encoding].maxmblen :
2184 : pg_wchar_table[PG_SQL_ASCII].maxmblen;
2185 : }
|