Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wchar.c
4 : * Functions for working with multibyte characters in various encodings.
5 : *
6 : * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/common/wchar.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #include <limits.h>
16 :
17 : #include "mb/pg_wchar.h"
18 : #include "utils/ascii.h"
19 :
20 :
21 : /*
22 : * In today's multibyte encodings other than UTF8, this two-byte sequence
23 : * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
24 : *
25 : * For historical reasons, several verifychar implementations opt to reject
26 : * this pair specifically. Byte pair range constraints, in encoding
27 : * originator documentation, always excluded this pair. No core conversion
28 : * could translate it. However, longstanding verifychar implementations
29 : * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
30 : * pairs not valid per encoding originator documentation. To avoid tightening
31 : * core or non-core conversions in a security patch, we sought this one pair.
32 : *
33 : * PQescapeString() historically used spaces for BYTE1; many other values
34 : * could suffice for BYTE1.
35 : */
/* Lead byte, then trail byte, of the designated invalid two-byte sequence. */
#define NONUTF8_INVALID_BYTE0	(0x8D)
#define NONUTF8_INVALID_BYTE1	(' ')
38 :
39 :
40 : /*
41 : * Operations on multi-byte encodings are driven by a table of helper
42 : * functions.
43 : *
44 : * To add support for an encoding, define mblen(), dsplen(), verifychar()
45 : * and verifystr() for it. For server encodings, also define the mb2wchar()
46 : * and wchar2mb() conversion functions.
47 : *
48 : * These functions generally assume that their input is validly formed.
49 : * The "verifier" functions, further down in the file, have to be more
50 : * paranoid.
51 : *
52 : * We expect that mblen() does not need to examine more than the first byte
53 : * of the character to discover the correct length. GB18030 is an exception
54 : * to that rule, though, as it also looks at second byte. But even that
55 : * behaves in a predictable way, if you only pass the first byte: it will
56 : * treat 4-byte encoded characters as two 2-byte encoded characters, which is
57 : * good enough for all current uses.
58 : *
59 : * Note: for the display output of psql to work properly, the return values
60 : * of the dsplen functions must conform to the Unicode standard. In particular
61 : * the NUL character is zero width and control characters are generally
62 : * width -1. It is recommended that non-ASCII encodings refer their ASCII
63 : * subset to the ASCII routines to ensure consistency.
64 : */
65 :
66 : /*
67 : * SQL/ASCII
68 : */
69 : static int
70 762 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
71 : {
72 762 : int cnt = 0;
73 :
74 63428 : while (len > 0 && *from)
75 : {
76 62666 : *to++ = *from++;
77 62666 : len--;
78 62666 : cnt++;
79 : }
80 762 : *to = 0;
81 762 : return cnt;
82 : }
83 :
/*
 * Every character is a single byte; the input is not examined.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	return 1;
}
89 :
/*
 * Display width of the byte at *s: 0 for NUL, -1 for bytes below 0x20 and
 * for 0x7f, and 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch >= 0x20 && ch != 0x7f) ? 1 : -1;
}
100 :
101 : /*
102 : * EUC
103 : */
104 : static int
105 0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
106 : {
107 0 : int cnt = 0;
108 :
109 0 : while (len > 0 && *from)
110 : {
111 0 : if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
112 : * KANA") */
113 : {
114 0 : from++;
115 0 : *to = (SS2 << 8) | *from++;
116 0 : len -= 2;
117 : }
118 0 : else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
119 : {
120 0 : from++;
121 0 : *to = (SS3 << 16) | (*from++ << 8);
122 0 : *to |= *from++;
123 0 : len -= 3;
124 : }
125 0 : else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
126 : {
127 0 : *to = *from++ << 8;
128 0 : *to |= *from++;
129 0 : len -= 2;
130 : }
131 : else /* must be ASCII */
132 : {
133 0 : *to = *from++;
134 0 : len--;
135 : }
136 0 : to++;
137 0 : cnt++;
138 : }
139 0 : *to = 0;
140 0 : return cnt;
141 : }
142 :
143 : static inline int
144 234 : pg_euc_mblen(const unsigned char *s)
145 : {
146 : int len;
147 :
148 234 : if (*s == SS2)
149 0 : len = 2;
150 234 : else if (*s == SS3)
151 0 : len = 3;
152 234 : else if (IS_HIGHBIT_SET(*s))
153 162 : len = 2;
154 : else
155 72 : len = 1;
156 234 : return len;
157 : }
158 :
159 : static inline int
160 0 : pg_euc_dsplen(const unsigned char *s)
161 : {
162 : int len;
163 :
164 0 : if (*s == SS2)
165 0 : len = 2;
166 0 : else if (*s == SS3)
167 0 : len = 2;
168 0 : else if (IS_HIGHBIT_SET(*s))
169 0 : len = 2;
170 : else
171 0 : len = pg_ascii_dsplen(s);
172 0 : return len;
173 : }
174 :
175 : /*
176 : * EUC_JP
177 : */
178 : static int
179 0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
180 : {
181 0 : return pg_euc2wchar_with_len(from, to, len);
182 : }
183 :
/* EUC_JP character lengths follow the generic EUC rule. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
189 :
190 : static int
191 0 : pg_eucjp_dsplen(const unsigned char *s)
192 : {
193 : int len;
194 :
195 0 : if (*s == SS2)
196 0 : len = 1;
197 0 : else if (*s == SS3)
198 0 : len = 2;
199 0 : else if (IS_HIGHBIT_SET(*s))
200 0 : len = 2;
201 : else
202 0 : len = pg_ascii_dsplen(s);
203 0 : return len;
204 : }
205 :
206 : /*
207 : * EUC_KR
208 : */
209 : static int
210 0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
211 : {
212 0 : return pg_euc2wchar_with_len(from, to, len);
213 : }
214 :
/* EUC_KR character lengths follow the generic EUC rule. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
220 :
/* EUC_KR display widths follow the generic EUC rule. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
226 :
227 : /*
228 : * EUC_CN
229 : *
230 : */
231 : static int
232 0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
233 : {
234 0 : int cnt = 0;
235 :
236 0 : while (len > 0 && *from)
237 : {
238 0 : if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
239 : {
240 0 : from++;
241 0 : *to = (SS2 << 16) | (*from++ << 8);
242 0 : *to |= *from++;
243 0 : len -= 3;
244 : }
245 0 : else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
246 : {
247 0 : from++;
248 0 : *to = (SS3 << 16) | (*from++ << 8);
249 0 : *to |= *from++;
250 0 : len -= 3;
251 : }
252 0 : else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
253 : {
254 0 : *to = *from++ << 8;
255 0 : *to |= *from++;
256 0 : len -= 2;
257 : }
258 : else
259 : {
260 0 : *to = *from++;
261 0 : len--;
262 : }
263 0 : to++;
264 0 : cnt++;
265 : }
266 0 : *to = 0;
267 0 : return cnt;
268 : }
269 :
/* EUC_CN: any high-bit lead byte starts a 2-byte character, else 1 byte. */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
281 :
/* EUC_CN: multibyte characters are 2 columns wide; ASCII uses the ASCII rule. */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
293 :
294 : /*
295 : * EUC_TW
296 : *
297 : */
298 : static int
299 0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
300 : {
301 0 : int cnt = 0;
302 :
303 0 : while (len > 0 && *from)
304 : {
305 0 : if (*from == SS2 && len >= 4) /* code set 2 */
306 : {
307 0 : from++;
308 0 : *to = (((uint32) SS2) << 24) | (*from++ << 16);
309 0 : *to |= *from++ << 8;
310 0 : *to |= *from++;
311 0 : len -= 4;
312 : }
313 0 : else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
314 : {
315 0 : from++;
316 0 : *to = (SS3 << 16) | (*from++ << 8);
317 0 : *to |= *from++;
318 0 : len -= 3;
319 : }
320 0 : else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
321 : {
322 0 : *to = *from++ << 8;
323 0 : *to |= *from++;
324 0 : len -= 2;
325 : }
326 : else
327 : {
328 0 : *to = *from++;
329 0 : len--;
330 : }
331 0 : to++;
332 0 : cnt++;
333 : }
334 0 : *to = 0;
335 0 : return cnt;
336 : }
337 :
338 : static int
339 6 : pg_euctw_mblen(const unsigned char *s)
340 : {
341 : int len;
342 :
343 6 : if (*s == SS2)
344 0 : len = 4;
345 6 : else if (*s == SS3)
346 0 : len = 3;
347 6 : else if (IS_HIGHBIT_SET(*s))
348 6 : len = 2;
349 : else
350 0 : len = 1;
351 6 : return len;
352 : }
353 :
354 : static int
355 0 : pg_euctw_dsplen(const unsigned char *s)
356 : {
357 : int len;
358 :
359 0 : if (*s == SS2)
360 0 : len = 2;
361 0 : else if (*s == SS3)
362 0 : len = 2;
363 0 : else if (IS_HIGHBIT_SET(*s))
364 0 : len = 2;
365 : else
366 0 : len = pg_ascii_dsplen(s);
367 0 : return len;
368 : }
369 :
370 : /*
371 : * Convert pg_wchar to EUC_* encoding.
372 : * caller must allocate enough space for "to", including a trailing zero!
373 : * len: length of from.
374 : * "from" not necessarily null terminated.
375 : */
376 : static int
377 0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
378 : {
379 0 : int cnt = 0;
380 :
381 0 : while (len > 0 && *from)
382 : {
383 : unsigned char c;
384 :
385 0 : if ((c = (*from >> 24)))
386 : {
387 0 : *to++ = c;
388 0 : *to++ = (*from >> 16) & 0xff;
389 0 : *to++ = (*from >> 8) & 0xff;
390 0 : *to++ = *from & 0xff;
391 0 : cnt += 4;
392 : }
393 0 : else if ((c = (*from >> 16)))
394 : {
395 0 : *to++ = c;
396 0 : *to++ = (*from >> 8) & 0xff;
397 0 : *to++ = *from & 0xff;
398 0 : cnt += 3;
399 : }
400 0 : else if ((c = (*from >> 8)))
401 : {
402 0 : *to++ = c;
403 0 : *to++ = *from & 0xff;
404 0 : cnt += 2;
405 : }
406 : else
407 : {
408 0 : *to++ = *from;
409 0 : cnt++;
410 : }
411 0 : from++;
412 0 : len--;
413 : }
414 0 : *to = 0;
415 0 : return cnt;
416 : }
417 :
418 :
419 : /*
420 : * JOHAB
421 : */
/* JOHAB character lengths are computed with the generic EUC rule. */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
427 :
/* JOHAB display widths are computed with the generic EUC rule. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
433 :
434 : /*
435 : * convert UTF8 string to pg_wchar (UCS-4)
436 : * caller must allocate enough space for "to", including a trailing zero!
437 : * len: length of from.
438 : * "from" not necessarily null terminated.
439 : */
440 : static int
441 7059854 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
442 : {
443 7059854 : int cnt = 0;
444 : uint32 c1,
445 : c2,
446 : c3,
447 : c4;
448 :
449 148768158 : while (len > 0 && *from)
450 : {
451 141708304 : if ((*from & 0x80) == 0)
452 : {
453 141707848 : *to = *from++;
454 141707848 : len--;
455 : }
456 456 : else if ((*from & 0xe0) == 0xc0)
457 : {
458 364 : if (len < 2)
459 0 : break; /* drop trailing incomplete char */
460 364 : c1 = *from++ & 0x1f;
461 364 : c2 = *from++ & 0x3f;
462 364 : *to = (c1 << 6) | c2;
463 364 : len -= 2;
464 : }
465 92 : else if ((*from & 0xf0) == 0xe0)
466 : {
467 92 : if (len < 3)
468 0 : break; /* drop trailing incomplete char */
469 92 : c1 = *from++ & 0x0f;
470 92 : c2 = *from++ & 0x3f;
471 92 : c3 = *from++ & 0x3f;
472 92 : *to = (c1 << 12) | (c2 << 6) | c3;
473 92 : len -= 3;
474 : }
475 0 : else if ((*from & 0xf8) == 0xf0)
476 : {
477 0 : if (len < 4)
478 0 : break; /* drop trailing incomplete char */
479 0 : c1 = *from++ & 0x07;
480 0 : c2 = *from++ & 0x3f;
481 0 : c3 = *from++ & 0x3f;
482 0 : c4 = *from++ & 0x3f;
483 0 : *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
484 0 : len -= 4;
485 : }
486 : else
487 : {
488 : /* treat a bogus char as length 1; not ours to raise error */
489 0 : *to = *from++;
490 0 : len--;
491 : }
492 141708304 : to++;
493 141708304 : cnt++;
494 : }
495 7059854 : *to = 0;
496 7059854 : return cnt;
497 : }
498 :
499 :
500 : /*
501 : * Trivial conversion from pg_wchar to UTF-8.
502 : * caller should allocate enough space for "to"
503 : * len: length of from.
504 : * "from" not necessarily null terminated.
505 : */
506 : static int
507 1115096 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
508 : {
509 1115096 : int cnt = 0;
510 :
511 16791890 : while (len > 0 && *from)
512 : {
513 : int char_len;
514 :
515 15676794 : unicode_to_utf8(*from, to);
516 15676794 : char_len = pg_utf_mblen(to);
517 15676794 : cnt += char_len;
518 15676794 : to += char_len;
519 15676794 : from++;
520 15676794 : len--;
521 : }
522 1115096 : *to = 0;
523 1115096 : return cnt;
524 : }
525 :
526 : /*
527 : * Return the byte length of a UTF8 character pointed to by s
528 : *
529 : * Note: in the current implementation we do not support UTF8 sequences
530 : * of more than 4 bytes; hence do NOT return a value larger than 4.
531 : * We return "1" for any leading byte that is either flat-out illegal or
532 : * indicates a length larger than we support.
533 : *
534 : * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
535 : * other places would need to be fixed to change this.
536 : */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;				/* 0xxxxxxx */
	if ((lead & 0xe0) == 0xc0)
		return 2;				/* 110xxxxx */
	if ((lead & 0xf0) == 0xe0)
		return 3;				/* 1110xxxx */
	if ((lead & 0xf8) == 0xf0)
		return 4;				/* 11110xxx */
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	/* illegal or unsupported lead byte: treat as a single byte */
	return 1;
}
560 :
561 : /*
562 : * This is an implementation of wcwidth() and wcswidth() as defined in
563 : * "The Single UNIX Specification, Version 2, The Open Group, 1997"
564 : * <http://www.unix.org/online.html>
565 : *
566 : * Markus Kuhn -- 2001-09-08 -- public domain
567 : *
568 : * customised for PostgreSQL
569 : *
570 : * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
571 : */
572 :
/* One closed code point range [first, last] in a lookup table. */
struct mbinterval
{
	unsigned int first;			/* lowest code point in the range */
	unsigned int last;			/* highest code point in the range */
};
578 :
579 : /* auxiliary function for binary search in interval table */
580 : static int
581 107916304 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
582 : {
583 107916304 : int min = 0;
584 : int mid;
585 :
586 107916304 : if (ucs < table[0].first || ucs > table[max].last)
587 107905232 : return 0;
588 97016 : while (max >= min)
589 : {
590 86880 : mid = (min + max) / 2;
591 86880 : if (ucs > table[mid].last)
592 17560 : min = mid + 1;
593 69320 : else if (ucs < table[mid].first)
594 68384 : max = mid - 1;
595 : else
596 936 : return 1;
597 : }
598 :
599 10136 : return 0;
600 : }
601 :
602 :
603 : /* The following functions define the column width of an ISO 10646
604 : * character as follows:
605 : *
606 : * - The null character (U+0000) has a column width of 0.
607 : *
608 : * - Other C0/C1 control characters and DEL will lead to a return
609 : * value of -1.
610 : *
611 : * - Non-spacing and enclosing combining characters (general
612 : * category code Mn, Me or Cf in the Unicode database) have a
613 : * column width of 0.
614 : *
615 : * - Spacing characters in the East Asian Wide (W) or East Asian
616 : * FullWidth (F) category as defined in Unicode Technical
617 : * Report #11 have a column width of 2.
618 : *
619 : * - All remaining characters (including all printable
620 : * ISO 8859-1 and WGL4 characters, Unicode control characters,
621 : * etc.) have a column width of 1.
622 : *
623 : * This implementation assumes that wchar_t characters are encoded
624 : * in ISO 10646.
625 : */
626 :
627 : static int
628 54021020 : ucs_wcwidth(pg_wchar ucs)
629 : {
630 : #include "common/unicode_nonspacing_table.h"
631 : #include "common/unicode_east_asian_fw_table.h"
632 :
633 : /* test for 8-bit control characters */
634 54021020 : if (ucs == 0)
635 0 : return 0;
636 :
637 54021020 : if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
638 62544 : return -1;
639 :
640 : /*
641 : * binary search in table of non-spacing characters
642 : *
643 : * XXX: In the official Unicode sources, it is possible for a character to
644 : * be described as both non-spacing and wide at the same time. As of
645 : * Unicode 13.0, treating the non-spacing property as the determining
646 : * factor for display width leads to the correct behavior, so do that
647 : * search first.
648 : */
649 53958476 : if (mbbisearch(ucs, nonspacing,
650 : sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
651 648 : return 0;
652 :
653 : /* binary search in table of wide characters */
654 53957828 : if (mbbisearch(ucs, east_asian_fw,
655 : sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
656 288 : return 2;
657 :
658 53957540 : return 1;
659 : }
660 :
661 : static int
662 54021020 : pg_utf_dsplen(const unsigned char *s)
663 : {
664 54021020 : return ucs_wcwidth(utf8_to_unicode(s));
665 : }
666 :
667 : /*
668 : * convert mule internal code to pg_wchar
669 : * caller should allocate enough space for "to"
670 : * len: length of from.
671 : * "from" not necessarily null terminated.
672 : */
673 : static int
674 0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
675 : {
676 0 : int cnt = 0;
677 :
678 0 : while (len > 0 && *from)
679 : {
680 0 : if (IS_LC1(*from) && len >= 2)
681 : {
682 0 : *to = *from++ << 16;
683 0 : *to |= *from++;
684 0 : len -= 2;
685 : }
686 0 : else if (IS_LCPRV1(*from) && len >= 3)
687 : {
688 0 : from++;
689 0 : *to = *from++ << 16;
690 0 : *to |= *from++;
691 0 : len -= 3;
692 : }
693 0 : else if (IS_LC2(*from) && len >= 3)
694 : {
695 0 : *to = *from++ << 16;
696 0 : *to |= *from++ << 8;
697 0 : *to |= *from++;
698 0 : len -= 3;
699 : }
700 0 : else if (IS_LCPRV2(*from) && len >= 4)
701 : {
702 0 : from++;
703 0 : *to = *from++ << 16;
704 0 : *to |= *from++ << 8;
705 0 : *to |= *from++;
706 0 : len -= 4;
707 : }
708 : else
709 : { /* assume ASCII */
710 0 : *to = (unsigned char) *from++;
711 0 : len--;
712 : }
713 0 : to++;
714 0 : cnt++;
715 : }
716 0 : *to = 0;
717 0 : return cnt;
718 : }
719 :
720 : /*
721 : * convert pg_wchar to mule internal code
722 : * caller should allocate enough space for "to"
723 : * len: length of from.
724 : * "from" not necessarily null terminated.
725 : */
726 : static int
727 0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
728 : {
729 0 : int cnt = 0;
730 :
731 0 : while (len > 0 && *from)
732 : {
733 : unsigned char lb;
734 :
735 0 : lb = (*from >> 16) & 0xff;
736 0 : if (IS_LC1(lb))
737 : {
738 0 : *to++ = lb;
739 0 : *to++ = *from & 0xff;
740 0 : cnt += 2;
741 : }
742 0 : else if (IS_LC2(lb))
743 : {
744 0 : *to++ = lb;
745 0 : *to++ = (*from >> 8) & 0xff;
746 0 : *to++ = *from & 0xff;
747 0 : cnt += 3;
748 : }
749 0 : else if (IS_LCPRV1_A_RANGE(lb))
750 : {
751 0 : *to++ = LCPRV1_A;
752 0 : *to++ = lb;
753 0 : *to++ = *from & 0xff;
754 0 : cnt += 3;
755 : }
756 0 : else if (IS_LCPRV1_B_RANGE(lb))
757 : {
758 0 : *to++ = LCPRV1_B;
759 0 : *to++ = lb;
760 0 : *to++ = *from & 0xff;
761 0 : cnt += 3;
762 : }
763 0 : else if (IS_LCPRV2_A_RANGE(lb))
764 : {
765 0 : *to++ = LCPRV2_A;
766 0 : *to++ = lb;
767 0 : *to++ = (*from >> 8) & 0xff;
768 0 : *to++ = *from & 0xff;
769 0 : cnt += 4;
770 : }
771 0 : else if (IS_LCPRV2_B_RANGE(lb))
772 : {
773 0 : *to++ = LCPRV2_B;
774 0 : *to++ = lb;
775 0 : *to++ = (*from >> 8) & 0xff;
776 0 : *to++ = *from & 0xff;
777 0 : cnt += 4;
778 : }
779 : else
780 : {
781 0 : *to++ = *from & 0xff;
782 0 : cnt += 1;
783 : }
784 0 : from++;
785 0 : len--;
786 : }
787 0 : *to = 0;
788 0 : return cnt;
789 : }
790 :
791 : /* exported for direct use by conv.c */
int
pg_mule_mblen(const unsigned char *s)
{
	/* the first byte alone selects the length */
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s))
		return 3;
	if (IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	return 1;					/* assume ASCII */
}
809 :
static int
pg_mule_dsplen(const unsigned char *s)
{
	/*
	 * Note: it's not really appropriate to assume that all multibyte charsets
	 * are double-wide on screen.  But this seems an okay approximation for
	 * the MULE charsets we currently support.
	 */
	if (IS_LC1(*s) || IS_LCPRV1(*s))
		return 1;
	if (IS_LC2(*s) || IS_LCPRV2(*s))
		return 2;

	return 1;					/* assume ASCII */
}
834 :
835 : /*
836 : * ISO8859-1
837 : */
838 : static int
839 1070 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
840 : {
841 1070 : int cnt = 0;
842 :
843 30004 : while (len > 0 && *from)
844 : {
845 28934 : *to++ = *from++;
846 28934 : len--;
847 28934 : cnt++;
848 : }
849 1070 : *to = 0;
850 1070 : return cnt;
851 : }
852 :
853 : /*
854 : * Trivial conversion from pg_wchar to single byte encoding. Just ignores
855 : * high bits.
856 : * caller should allocate enough space for "to"
857 : * len: length of from.
858 : * "from" not necessarily null terminated.
859 : */
860 : static int
861 150 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
862 : {
863 150 : int cnt = 0;
864 :
865 1356 : while (len > 0 && *from)
866 : {
867 1206 : *to++ = *from++;
868 1206 : len--;
869 1206 : cnt++;
870 : }
871 150 : *to = 0;
872 150 : return cnt;
873 : }
874 :
/*
 * Every character is a single byte; the input is not examined.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	return 1;
}
880 :
/* Single-byte encodings share the ASCII display-width rule. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
886 :
887 : /*
888 : * SJIS
889 : */
/*
 * Byte length of an SJIS character from its first byte: bytes in
 * 0xa1..0xdf are one-byte kana, other high-bit bytes start a two-byte
 * character, and the rest are single bytes.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xa1 && lead <= 0xdf)
		return 1;				/* 1 byte kana? */

	return IS_HIGHBIT_SET(lead) ? 2 /* kanji? */ : 1 /* should be ASCII */ ;
}
903 :
/*
 * Display width of an SJIS character: 1 for one-byte kana (0xa1..0xdf),
 * 2 for other high-bit lead bytes, else the ASCII rule.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;				/* 1 byte kana? */
	if (IS_HIGHBIT_SET(*s))
		return 2;				/* kanji? */

	return pg_ascii_dsplen(s);	/* should be ASCII */
}
917 :
918 : /*
919 : * Big5
920 : */
/* Big5: a high-bit lead byte starts a 2-byte character, else 1 byte. */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
932 :
/* Big5: multibyte characters are 2 columns wide; ASCII uses the ASCII rule. */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
944 :
945 : /*
946 : * GBK
947 : */
/* GBK: a high-bit lead byte starts a 2-byte character, else 1 byte. */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
959 :
/* GBK: multibyte characters are 2 columns wide; ASCII uses the ASCII rule. */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
971 :
972 : /*
973 : * UHC
974 : */
/* UHC: a high-bit lead byte starts a 2-byte character, else 1 byte. */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
986 :
/* UHC: multibyte characters are 2 columns wide; ASCII uses the ASCII rule. */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
998 :
999 : /*
1000 : * GB18030
1001 : * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1002 : */
1003 :
1004 : /*
1005 : * Unlike all other mblen() functions, this also looks at the second byte of
1006 : * the input. However, if you only pass the first byte of a multi-byte
1007 : * string, and \0 as the second byte, this still works in a predictable way:
1008 : * a 4-byte character will be reported as two 2-byte characters. That's
1009 : * enough for all current uses, as a client-only encoding. It works that
1010 : * way, because in any valid 4-byte GB18030-encoded character, the third and
1011 : * fourth byte look like a 2-byte encoded character, when looked at
1012 : * separately.
1013 : */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	/* a second byte in '0'..'9' (0x30..0x39) marks the 4-byte form */
	if (s[1] >= 0x30 && s[1] <= 0x39)
		return 4;

	return 2;
}
1027 :
/* GB18030: multibyte characters are 2 columns wide; ASCII uses the ASCII rule. */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1039 :
1040 : /*
1041 : *-------------------------------------------------------------------
1042 : * multibyte sequence validators
1043 : *
1044 : * The verifychar functions accept "s", a pointer to the first byte of a
1045 : * string, and "len", the remaining length of the string. If there is a
1046 : * validly encoded character beginning at *s, return its length in bytes;
1047 : * else return -1.
1048 : *
1049 : * The verifystr functions also accept "s", a pointer to a string and "len",
1050 : * the length of the string. They verify the whole string, and return the
1051 : * number of input bytes (<= len) that are valid. In other words, if the
1052 : * whole string is valid, verifystr returns "len", otherwise it returns the
1053 : * byte offset of the first invalid character. The verifystr functions must
1054 : * test for and reject zeroes in the input.
1055 : *
1056 : * The verifychar functions can assume that len > 0 and that *s != '\0', but
1057 : * they must test for and reject zeroes in any additional bytes of a
1058 : * multibyte character. Note that this definition allows the function for a
1059 : * single-byte encoding to be just "return 1".
1060 : *-------------------------------------------------------------------
1061 : */
/*
 * Any single byte is accepted as a character, so there is nothing to check;
 * neither argument is examined.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	return 1;
}
1067 :
/*
 * The only invalid byte is a zero: return "len" if the string contains
 * none, else the offset of the first zero byte.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nullpos = memchr(s, 0, len);

	return (nullpos == NULL) ? len : (int) (nullpos - s);
}
1078 :
/* True when byte c lies in 0xa1..0xfe.  Note that c is evaluated twice. */
#define IS_EUC_RANGE_VALID(c)	(0xa1 <= (c) && (c) <= 0xfe)
1080 :
1081 : static int
1082 504 : pg_eucjp_verifychar(const unsigned char *s, int len)
1083 : {
1084 : int l;
1085 : unsigned char c1,
1086 : c2;
1087 :
1088 504 : c1 = *s++;
1089 :
1090 504 : switch (c1)
1091 : {
1092 0 : case SS2: /* JIS X 0201 */
1093 0 : l = 2;
1094 0 : if (l > len)
1095 0 : return -1;
1096 0 : c2 = *s++;
1097 0 : if (c2 < 0xa1 || c2 > 0xdf)
1098 0 : return -1;
1099 0 : break;
1100 :
1101 0 : case SS3: /* JIS X 0212 */
1102 0 : l = 3;
1103 0 : if (l > len)
1104 0 : return -1;
1105 0 : c2 = *s++;
1106 0 : if (!IS_EUC_RANGE_VALID(c2))
1107 0 : return -1;
1108 0 : c2 = *s++;
1109 0 : if (!IS_EUC_RANGE_VALID(c2))
1110 0 : return -1;
1111 0 : break;
1112 :
1113 504 : default:
1114 504 : if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1115 : {
1116 504 : l = 2;
1117 504 : if (l > len)
1118 84 : return -1;
1119 420 : if (!IS_EUC_RANGE_VALID(c1))
1120 24 : return -1;
1121 396 : c2 = *s++;
1122 396 : if (!IS_EUC_RANGE_VALID(c2))
1123 180 : return -1;
1124 : }
1125 : else
1126 : /* must be ASCII */
1127 : {
1128 0 : l = 1;
1129 : }
1130 216 : break;
1131 : }
1132 :
1133 216 : return l;
1134 : }
1135 :
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid; a zero byte counts as invalid.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			l = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_eucjp_verifychar(p, len);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path, but zeroes are rejected */

		p += l;
		len -= l;
	}

	return p - s;
}
1164 :
/*
 * Verify one EUC_KR character at s, with "len" bytes available.  Returns
 * the character's byte length, or -1 if it is truncated or malformed.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	/* must be ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	/* two-byte character: both bytes must be in the EUC range */
	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(s[0]) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1193 :
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid; a zero byte counts as invalid.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			l = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_euckr_verifychar(p, len);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path, but zeroes are rejected */

		p += l;
		len -= l;
	}

	return p - s;
}
1222 :
1223 : /* EUC-CN byte sequences are exactly same as EUC-KR */
/* EUC_CN verification is aliased to the EUC_KR verifiers. */
#define pg_euccn_verifychar		pg_euckr_verifychar
#define pg_euccn_verifystr		pg_euckr_verifystr
1226 :
1227 : static int
1228 18 : pg_euctw_verifychar(const unsigned char *s, int len)
1229 : {
1230 : int l;
1231 : unsigned char c1,
1232 : c2;
1233 :
1234 18 : c1 = *s++;
1235 :
1236 18 : switch (c1)
1237 : {
1238 0 : case SS2: /* CNS 11643 Plane 1-7 */
1239 0 : l = 4;
1240 0 : if (l > len)
1241 0 : return -1;
1242 0 : c2 = *s++;
1243 0 : if (c2 < 0xa1 || c2 > 0xa7)
1244 0 : return -1;
1245 0 : c2 = *s++;
1246 0 : if (!IS_EUC_RANGE_VALID(c2))
1247 0 : return -1;
1248 0 : c2 = *s++;
1249 0 : if (!IS_EUC_RANGE_VALID(c2))
1250 0 : return -1;
1251 0 : break;
1252 :
1253 0 : case SS3: /* unused */
1254 0 : return -1;
1255 :
1256 18 : default:
1257 18 : if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1258 : {
1259 18 : l = 2;
1260 18 : if (l > len)
1261 6 : return -1;
1262 : /* no further range check on c1? */
1263 12 : c2 = *s++;
1264 12 : if (!IS_EUC_RANGE_VALID(c2))
1265 12 : return -1;
1266 : }
1267 : else
1268 : /* must be ASCII */
1269 : {
1270 0 : l = 1;
1271 : }
1272 0 : break;
1273 : }
1274 0 : return l;
1275 : }
1276 :
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid; a zero byte counts as invalid.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;

	while (len > 0)
	{
		int			l = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_euctw_verifychar(p, len);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* ASCII fast path, but zeroes are rejected */

		p += l;
		len -= l;
	}

	return p - s;
}
1305 :
/*
 * Verify one JOHAB character at s, with "len" bytes available.  Returns
 * the character's byte length, or -1 if it is truncated or malformed.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* a lead byte without the high bit needs no further checks */
	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	/* every trailing byte must be in the EUC range */
	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1329 :
/*
 * Verify a JOHAB string of len bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte, at the first character rejected by
 * pg_johab_verifychar(), or once len bytes have been consumed.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes take the fast path */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_johab_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1358 :
/*
 * Verify the single MULE_INTERNAL character starting at s, given len
 * available bytes.
 *
 * Returns the length reported by pg_mule_mblen() if the character is
 * acceptable, or -1 if it is truncated or any byte after the first lacks the
 * high bit.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* all trailing bytes must have the high bit set */
	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1379 :
/*
 * Verify a MULE_INTERNAL string of len bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte, at the first character rejected by
 * pg_mule_verifychar(), or once len bytes have been consumed.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes take the fast path */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_mule_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1408 :
/*
 * Verify one character of a single-byte encoding.
 *
 * Neither argument is inspected: the function unconditionally reports a
 * one-byte character.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	(void) s;
	(void) len;

	return 1;
}
1414 :
/*
 * Verify a string in a single-byte encoding.
 *
 * Returns the offset of the first NUL byte within the first len bytes, or
 * len itself when there is none.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? nul - s : len;
}
1425 :
/*
 * Verify the single Shift-JIS character starting at s, given len available
 * bytes.
 *
 * Returns the length reported by pg_sjis_mblen() if the character is
 * acceptable, or -1 if it is truncated or its head/tail bytes fail the
 * ISSJISHEAD/ISSJISTAIL checks.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);

	if (mbl > len)
		return -1;

	/* pg_sjis_mblen already verified single-byte characters */
	if (mbl == 1)
		return mbl;

	if (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1]))
		return mbl;

	return -1;
}
1448 :
/*
 * Verify a Shift-JIS string of len bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte, at the first character rejected by
 * pg_sjis_verifychar(), or once len bytes have been consumed.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes take the fast path */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_sjis_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1477 :
1478 : static int
1479 360 : pg_big5_verifychar(const unsigned char *s, int len)
1480 : {
1481 : int l,
1482 : mbl;
1483 :
1484 360 : l = mbl = pg_big5_mblen(s);
1485 :
1486 360 : if (len < l)
1487 6 : return -1;
1488 :
1489 354 : if (l == 2 &&
1490 354 : s[0] == NONUTF8_INVALID_BYTE0 &&
1491 12 : s[1] == NONUTF8_INVALID_BYTE1)
1492 12 : return -1;
1493 :
1494 576 : while (--l > 0)
1495 : {
1496 342 : if (*++s == '\0')
1497 108 : return -1;
1498 : }
1499 :
1500 234 : return mbl;
1501 : }
1502 :
/*
 * Verify a Big5 string of len bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte, at the first character rejected by
 * pg_big5_verifychar(), or once len bytes have been consumed.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes take the fast path */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_big5_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1531 :
1532 : static int
1533 274 : pg_gbk_verifychar(const unsigned char *s, int len)
1534 : {
1535 : int l,
1536 : mbl;
1537 :
1538 274 : l = mbl = pg_gbk_mblen(s);
1539 :
1540 274 : if (len < l)
1541 54 : return -1;
1542 :
1543 220 : if (l == 2 &&
1544 220 : s[0] == NONUTF8_INVALID_BYTE0 &&
1545 28 : s[1] == NONUTF8_INVALID_BYTE1)
1546 28 : return -1;
1547 :
1548 384 : while (--l > 0)
1549 : {
1550 192 : if (*++s == '\0')
1551 0 : return -1;
1552 : }
1553 :
1554 192 : return mbl;
1555 : }
1556 :
/*
 * Verify a GBK string of len bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte, at the first character rejected by
 * pg_gbk_verifychar(), or once len bytes have been consumed.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes take the fast path */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_gbk_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1585 :
1586 : static int
1587 18 : pg_uhc_verifychar(const unsigned char *s, int len)
1588 : {
1589 : int l,
1590 : mbl;
1591 :
1592 18 : l = mbl = pg_uhc_mblen(s);
1593 :
1594 18 : if (len < l)
1595 6 : return -1;
1596 :
1597 12 : if (l == 2 &&
1598 12 : s[0] == NONUTF8_INVALID_BYTE0 &&
1599 12 : s[1] == NONUTF8_INVALID_BYTE1)
1600 12 : return -1;
1601 :
1602 0 : while (--l > 0)
1603 : {
1604 0 : if (*++s == '\0')
1605 0 : return -1;
1606 : }
1607 :
1608 0 : return mbl;
1609 : }
1610 :
/*
 * Verify a UHC string of len bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte, at the first character rejected by
 * pg_uhc_verifychar(), or once len bytes have been consumed.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes take the fast path */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_uhc_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1639 :
/*
 * Verify the single GB18030 character starting at s, given len available
 * bytes.
 *
 * Returns 1 for an ASCII byte, 4 or 2 for a well-formed four- or two-byte
 * character, and -1 otherwise.  The second byte is only examined once len
 * shows that it is present.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = s[0];

	if (!IS_HIGHBIT_SET(lead))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (lead < 0x81 || lead > 0xfe)
			return -1;
		if (s[2] < 0x81 || s[2] > 0xfe)
			return -1;
		if (s[3] < 0x30 || s[3] > 0x39)
			return -1;
		return 4;
	}

	if (len >= 2 && lead >= 0x81 && lead <= 0xfe)
	{
		/* Should be 2-byte, validate */
		const unsigned char trail = s[1];

		if (trail >= 0x40 && trail <= 0x7e)
			return 2;
		if (trail >= 0x80 && trail <= 0xfe)
			return 2;
		return -1;
	}

	return -1;
}
1670 :
/*
 * Verify a GB18030 string of len bytes.
 *
 * Returns the number of leading bytes that verified successfully.  Scanning
 * stops at the first NUL byte, at the first character rejected by
 * pg_gb18030_verifychar(), or once len bytes have been consumed.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes take the fast path */

		if (IS_HIGHBIT_SET(*p))
		{
			chlen = pg_gb18030_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1699 :
/*
 * Verify the single UTF-8 character starting at s, given len available
 * bytes.
 *
 * Returns the character's byte length (1 to 4) if it is acceptable.  Returns
 * -1 for a NUL byte, for a sequence that does not fit in len bytes, and for
 * anything pg_utf8_islegal() rejects.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			mbl;

	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/* derive the expected length from the lead byte's high-order bits */
	if ((lead & 0xe0) == 0xc0)
		mbl = 2;
	else if ((lead & 0xf0) == 0xe0)
		mbl = 3;
	else if ((lead & 0xf8) == 0xf0)
		mbl = 4;
	else
		mbl = 1;				/* left for pg_utf8_islegal() to judge */

	if (mbl > len)
		return -1;

	return pg_utf8_islegal(s, mbl) ? mbl : -1;
}
1728 :
1729 : /*
1730 : * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1731 : * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1732 : * input byte and current state are used to compute an index into an array of
1733 : * state transitions. Since the address of the next transition is dependent
1734 : * on this computation, there is latency in executing the load instruction,
1735 : * and the CPU is not kept busy.
1736 : *
1737 : * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1738 : *
1739 : * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1740 : *
1741 : * In a shift-based DFA, the input byte is an index into array of integers
1742 : * whose bit pattern encodes the state transitions. To compute the next
1743 : * state, we simply right-shift the integer by the current state and apply a
1744 : * mask. In this scheme, the address of the transition only depends on the
1745 : * input byte, so there is better pipelining.
1746 : *
1747 : * The naming convention for states and transitions was adopted from a UTF-8
1748 : * to UTF-16/32 transcoder, whose table is reproduced below:
1749 : *
1750 : * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1751 : *
1752 : * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1753 : * ==========================================================================
1754 : * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1755 : * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1756 : * |
1757 : * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1758 : * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1759 : * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1760 : * |
1761 : * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1762 : * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1763 : * |
1764 : * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1765 : * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1766 : *
1767 : * In the most straightforward implementation, a shift-based DFA for UTF-8
1768 : * requires 64-bit integers to encode the transitions, but with an SMT solver
1769 : * it's possible to find state numbers such that the transitions fit within
1770 : * 32-bit integers, as Dougall Johnson demonstrated:
1771 : *
1772 : * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1773 : *
1774 : * This packed representation is the reason for the seemingly odd choice of
1775 : * state values below.
1776 : */
1777 :
/*
 * State numbers for the shift-based UTF-8 DFA.  A state's value is also the
 * bit offset at which the next state is stored inside each transition word,
 * so next = (transition >> state) & 31.
 */

/* Error */
#define ERR 0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.
 *
 * Every expansion is fully parenthesized so that each macro behaves as a
 * single operand wherever it is used, not only as a bare array initializer.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/* continuation byte */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1814 :
/*
 * Transition table for the shift-based UTF-8 DFA, indexed by input byte.
 * Each entry is one of the encoded transition words defined above;
 * utf8_advance() shifts the entry right by the current state and masks with
 * 31 to obtain the next state.  Byte 0x00, bytes C0/C1 and bytes F5..FF map
 * to ILL.
 */
1815 : static const uint32 Utf8Transition[256] =
1816 : {
1817 : /* ASCII */
1818 :
1819 : ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1820 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1821 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1822 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1823 :
1824 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1825 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1826 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1827 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1828 :
1829 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1830 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1831 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1832 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1833 :
1834 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1835 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1836 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1837 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1838 :
1839 : /* continuation bytes */
1840 :
1841 : /* 80..8F */
1842 : CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1843 : CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1844 :
1845 : /* 90..9F */
1846 : CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1847 : CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1848 :
1849 : /* A0..BF */
1850 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1851 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1852 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1853 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1854 :
1855 : /* leading bytes */
1856 :
1857 : /* C0..DF */
1858 : ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1859 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1860 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1861 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1862 :
1863 : /* E0..EF */
1864 : L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1865 : L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1866 :
1867 : /* F0..FF */
1868 : L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1869 : ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1870 : };
1871 :
1872 : static void
1873 1704 : utf8_advance(const unsigned char *s, uint32 *state, int len)
1874 : {
1875 : /* Note: We deliberately don't check the state's value here. */
1876 56232 : while (len > 0)
1877 : {
1878 : /*
1879 : * It's important that the mask value is 31: In most instruction sets,
1880 : * a shift by a 32-bit operand is understood to be a shift by its mod
1881 : * 32, so the compiler should elide the mask operation.
1882 : */
1883 54528 : *state = Utf8Transition[*s++] >> (*state & 31);
1884 54528 : len--;
1885 : }
1886 :
1887 1704 : *state &= 31;
1888 1704 : }
1889 :
1890 : static int
1891 1178874 : pg_utf8_verifystr(const unsigned char *s, int len)
1892 : {
1893 1178874 : const unsigned char *start = s;
1894 1178874 : const int orig_len = len;
1895 1178874 : uint32 state = BGN;
1896 :
1897 : /*
1898 : * With a stride of two vector widths, gcc will unroll the loop. Even if
1899 : * the compiler can unroll a longer loop, it's not worth it because we
1900 : * must fall back to the byte-wise algorithm if we find any non-ASCII.
1901 : */
1902 : #define STRIDE_LENGTH (2 * sizeof(Vector8))
1903 :
1904 1178874 : if (len >= STRIDE_LENGTH)
1905 : {
1906 4888300 : while (len >= STRIDE_LENGTH)
1907 : {
1908 : /*
1909 : * If the chunk is all ASCII, we can skip the full UTF-8 check,
1910 : * but we must first check for a non-END state, which means the
1911 : * previous chunk ended in the middle of a multibyte sequence.
1912 : */
1913 4306660 : if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1914 1704 : utf8_advance(s, &state, STRIDE_LENGTH);
1915 :
1916 4306660 : s += STRIDE_LENGTH;
1917 4306660 : len -= STRIDE_LENGTH;
1918 : }
1919 :
1920 : /* The error state persists, so we only need to check for it here. */
1921 581640 : if (state == ERR)
1922 : {
1923 : /*
1924 : * Start over from the beginning with the slow path so we can
1925 : * count the valid bytes.
1926 : */
1927 504 : len = orig_len;
1928 504 : s = start;
1929 : }
1930 581136 : else if (state != END)
1931 : {
1932 : /*
1933 : * The fast path exited in the middle of a multibyte sequence.
1934 : * Walk backwards to find the leading byte so that the slow path
1935 : * can resume checking from there. We must always backtrack at
1936 : * least one byte, since the current byte could be e.g. an ASCII
1937 : * byte after a 2-byte lead, which is invalid.
1938 : */
1939 : do
1940 : {
1941 : Assert(s > start);
1942 114 : s--;
1943 114 : len++;
1944 : Assert(IS_HIGHBIT_SET(*s));
1945 114 : } while (pg_utf_mblen(s) <= 1);
1946 : }
1947 : }
1948 :
1949 : /* check remaining bytes */
1950 17423446 : while (len > 0)
1951 : {
1952 : int l;
1953 :
1954 : /* fast path for ASCII-subset characters */
1955 16247646 : if (!IS_HIGHBIT_SET(*s))
1956 : {
1957 16230104 : if (*s == '\0')
1958 204 : break;
1959 16229900 : l = 1;
1960 : }
1961 : else
1962 : {
1963 17542 : l = pg_utf8_verifychar(s, len);
1964 17542 : if (l == -1)
1965 2870 : break;
1966 : }
1967 16244572 : s += l;
1968 16244572 : len -= l;
1969 : }
1970 :
1971 1178874 : return s - start;
1972 : }
1973 :
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This directly implements the rules in RFC3629.  The restrictions on the
 * second byte, odd as they look, ensure that each Unicode code point has
 * exactly one encoding: a longer-than-necessary byte sequence with high
 * order zero bits must not be used for a character that fits in fewer
 * bytes.  Allowing that would create security hazards (eg, an apparent
 * non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Any length other than 1..4 is rejected (lengths 5 and 6 are rejected for
 * now).
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;

	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes, when present, are plain continuation bytes */
	if (length == 4 && (source[3] < 0x80 || source[3] > 0xBF))
		return false;
	if (length >= 3 && (source[2] < 0x80 || source[2] > 0xBF))
		return false;

	lead = source[0];

	if (length >= 2)
	{
		/* the permitted range of the second byte depends on the lead byte */
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* finally the lead byte itself: not 80..C1 and not above F4 */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
2044 :
2045 :
2046 : /*
2047 : * Fills the provided buffer with two bytes such that:
2048 : * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
2049 : */
2050 : void
2051 412 : pg_encoding_set_invalid(int encoding, char *dst)
2052 : {
2053 : Assert(pg_encoding_max_length(encoding) > 1);
2054 :
2055 412 : dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
2056 412 : dst[1] = NONUTF8_INVALID_BYTE1;
2057 412 : }
2058 :
2059 : /*
2060 : *-------------------------------------------------------------------
2061 : * encoding info table
2062 : *-------------------------------------------------------------------
2063 : */
/*
 * One entry per encoding, placed by encoding ID through designated
 * initializers.  Code in this file reads the mblen, dsplen, mbverifychar,
 * mbverifystr and maxmblen members of an entry; the final initializer in
 * each row is the value returned by pg_encoding_max_length().
 *
 * NOTE(review): the first two initializers are presumably the mb2wchar and
 * wchar2mb conversion functions (0 for the last seven entries, PG_SJIS
 * through PG_SHIFT_JIS_2004) -- confirm the member order against the
 * pg_wchar_tbl declaration in mb/pg_wchar.h.
 */
2064 : const pg_wchar_tbl pg_wchar_table[] = {
2065 : [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
2066 : [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
2067 : [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
2068 : [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
2069 : [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
2070 : [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
2071 : [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
2072 : [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
2073 : [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2074 : [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2075 : [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2076 : [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2077 : [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2078 : [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2079 : [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2080 : [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2081 : [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2082 : [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2083 : [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2084 : [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2085 : [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2086 : [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2087 : [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2088 : [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2089 : [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2090 : [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2091 : [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2092 : [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2093 : [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2094 : [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2095 : [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2096 : [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2097 : [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2098 : [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2099 : [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2100 : [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
2101 : [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
2102 : [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
2103 : [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
2104 : [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
2105 : [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
2106 : [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
2107 : };
2108 :
2109 : /*
2110 : * Returns the byte length of a multibyte character.
2111 : *
2112 : * Choose "mblen" functions based on the input string characteristics.
2113 : * pg_encoding_mblen() can be used when ANY of these conditions are met:
2114 : *
2115 : * - The input string is zero-terminated
2116 : *
2117 : * - The input string is known to be valid in the encoding (e.g., string
2118 : * converted from database encoding)
2119 : *
2120 : * - The encoding is not GB18030 (e.g., when only database encodings are
2121 : * passed to 'encoding' parameter)
2122 : *
2123 : * encoding==GB18030 requires examining up to two bytes to determine character
2124 : * length. Therefore, callers satisfying none of those conditions must use
2125 : * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
2126 : * guaranteed to be within allocation bounds.
2127 : *
2128 : * When dealing with text that is not certainly valid in the specified
2129 : * encoding, the result may exceed the actual remaining string length.
2130 : * Callers that are not prepared to deal with that should use Min(remaining,
2131 : * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
2132 : * pg_encoding_mblen_bounded() are interchangeable.
2133 : */
2134 : int
2135 54237130 : pg_encoding_mblen(int encoding, const char *mbstr)
2136 : {
2137 54237130 : return (PG_VALID_ENCODING(encoding) ?
2138 108474260 : pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2139 0 : pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2140 : }
2141 :
2142 : /*
2143 : * Returns the byte length of a multibyte character (possibly not
2144 : * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
2145 : */
2146 : int
2147 6094 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
2148 : size_t remaining)
2149 : {
2150 : /*
2151 : * Define zero remaining as too few, even for single-byte encodings.
2152 : * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
2153 : * zero; others read one.
2154 : */
2155 6094 : if (remaining < 1 ||
2156 338 : (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
2157 72 : return INT_MAX;
2158 6022 : return pg_encoding_mblen(encoding, mbstr);
2159 : }
2160 :
/*
 * Returns the byte length of a multibyte character; but not more than the
 * distance to the terminating zero byte.  For input that might lack a
 * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	const int	nominal = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() caps the nominal length at the terminating zero byte */
	return strnlen(mbstr, nominal);
}
2171 :
2172 : /*
2173 : * Returns the display length of a multibyte character.
2174 : */
2175 : int
2176 54057952 : pg_encoding_dsplen(int encoding, const char *mbstr)
2177 : {
2178 54057952 : return (PG_VALID_ENCODING(encoding) ?
2179 108115904 : pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2180 0 : pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2181 : }
2182 :
2183 : /*
2184 : * Verify the first multibyte character of the given string.
2185 : * Return its byte length if good, -1 if bad. (See comments above for
2186 : * full details of the mbverifychar API.)
2187 : */
2188 : int
2189 9706 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2190 : {
2191 9706 : return (PG_VALID_ENCODING(encoding) ?
2192 19412 : pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2193 0 : pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2194 : }
2195 :
2196 : /*
2197 : * Verify that a string is valid for the given encoding.
2198 : * Returns the number of input bytes (<= len) that form a valid string.
2199 : * (See comments above for full details of the mbverifystr API.)
2200 : */
2201 : int
2202 467244 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2203 : {
2204 467244 : return (PG_VALID_ENCODING(encoding) ?
2205 934488 : pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2206 0 : pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2207 : }
2208 :
2209 : /*
2210 : * fetch maximum length of a given encoding
2211 : */
2212 : int
2213 884706 : pg_encoding_max_length(int encoding)
2214 : {
2215 : Assert(PG_VALID_ENCODING(encoding));
2216 :
2217 : /*
2218 : * Check for the encoding despite the assert, due to some mingw versions
2219 : * otherwise issuing bogus warnings.
2220 : */
2221 884706 : return PG_VALID_ENCODING(encoding) ?
2222 1769412 : pg_wchar_table[encoding].maxmblen :
2223 : pg_wchar_table[PG_SQL_ASCII].maxmblen;
2224 : }
|