Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wchar.c
4 : * Functions for working with multibyte characters in various encodings.
5 : *
6 : * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/common/wchar.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #include <limits.h>
16 :
17 : #include "mb/pg_wchar.h"
18 : #include "utils/ascii.h"
19 :
20 :
21 : /*
22 : * In today's multibyte encodings other than UTF8, this two-byte sequence
23 : * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
24 : *
25 : * For historical reasons, several verifychar implementations opt to reject
26 : * this pair specifically. Byte pair range constraints, in encoding
27 : * originator documentation, always excluded this pair. No core conversion
28 : * could translate it. However, longstanding verifychar implementations
29 : * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
30 : * pairs not valid per encoding originator documentation. To avoid tightening
31 : * core or non-core conversions in a security patch, we sought this one pair.
32 : *
33 : * PQescapeString() historically used spaces for BYTE1; many other values
34 : * could suffice for BYTE1.
35 : */
36 : #define NONUTF8_INVALID_BYTE0 (0x8d)
37 : #define NONUTF8_INVALID_BYTE1 (' ')
38 :
39 :
40 : /*
41 : * Operations on multi-byte encodings are driven by a table of helper
42 : * functions.
43 : *
44 : * To add support for an encoding, define mblen(), dsplen(), verifychar()
45 : * and verifystr() for it. For server encodings, also define the mb2wchar()
46 : * and wchar2mb() conversion functions.
47 : *
48 : * These functions generally assume that their input is validly formed.
49 : * The "verifier" functions, further down in the file, have to be more
50 : * paranoid.
51 : *
52 : * We expect that mblen() does not need to examine more than the first byte
53 : * of the character to discover the correct length. GB18030 is an exception
54 : * to that rule, though, as it also looks at second byte. But even that
55 : * behaves in a predictable way, if you only pass the first byte: it will
56 : * treat 4-byte encoded characters as two 2-byte encoded characters, which is
57 : * good enough for all current uses.
58 : *
59 : * Note: for the display output of psql to work properly, the return values
60 : * of the dsplen functions must conform to the Unicode standard. In particular
61 : * the NUL character is zero width and control characters are generally
62 : * width -1. It is recommended that non-ASCII encodings refer their ASCII
63 : * subset to the ASCII routines to ensure consistency.
64 : */
65 :
66 : /* No error-reporting facility. Ignore incomplete trailing byte sequence. */
67 : #define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break
68 :
69 : /*
70 : * SQL/ASCII
71 : */
72 : static int
73 818 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
74 : {
75 818 : int cnt = 0;
76 :
77 64030 : while (len > 0 && *from)
78 : {
79 63212 : *to++ = *from++;
80 63212 : len--;
81 63212 : cnt++;
82 : }
83 818 : *to = 0;
84 818 : return cnt;
85 : }
86 :
/*
 * Byte length of one SQL/ASCII character: always 1.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;					/* the input is never inspected */

	return 1;
}
92 :
/*
 * Display width of one single-byte character: 0 for NUL, -1 for bytes below
 * 0x20 and for 0x7f (DEL), and 1 for every other byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char c = *s;

	if (c == '\0')
		return 0;

	return (c < 0x20 || c == 0x7f) ? -1 : 1;
}
103 :
104 : /*
105 : * EUC
106 : */
107 : static int
108 48 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
109 : {
110 48 : int cnt = 0;
111 :
112 72 : while (len > 0 && *from)
113 : {
114 48 : if (*from == SS2) /* JIS X 0201 (so called "1 byte KANA") */
115 : {
116 12 : MB2CHAR_NEED_AT_LEAST(len, 2);
117 6 : from++;
118 6 : *to = (SS2 << 8) | *from++;
119 6 : len -= 2;
120 : }
121 36 : else if (*from == SS3) /* JIS X 0212 KANJI */
122 : {
123 18 : MB2CHAR_NEED_AT_LEAST(len, 3);
124 6 : from++;
125 6 : *to = (SS3 << 16) | (*from++ << 8);
126 6 : *to |= *from++;
127 6 : len -= 3;
128 : }
129 18 : else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
130 : {
131 12 : MB2CHAR_NEED_AT_LEAST(len, 2);
132 6 : *to = *from++ << 8;
133 6 : *to |= *from++;
134 6 : len -= 2;
135 : }
136 : else /* must be ASCII */
137 : {
138 6 : *to = *from++;
139 6 : len--;
140 : }
141 24 : to++;
142 24 : cnt++;
143 : }
144 48 : *to = 0;
145 48 : return cnt;
146 : }
147 :
148 : static inline int
149 234 : pg_euc_mblen(const unsigned char *s)
150 : {
151 : int len;
152 :
153 234 : if (*s == SS2)
154 0 : len = 2;
155 234 : else if (*s == SS3)
156 0 : len = 3;
157 234 : else if (IS_HIGHBIT_SET(*s))
158 162 : len = 2;
159 : else
160 72 : len = 1;
161 234 : return len;
162 : }
163 :
164 : static inline int
165 0 : pg_euc_dsplen(const unsigned char *s)
166 : {
167 : int len;
168 :
169 0 : if (*s == SS2)
170 0 : len = 2;
171 0 : else if (*s == SS3)
172 0 : len = 2;
173 0 : else if (IS_HIGHBIT_SET(*s))
174 0 : len = 2;
175 : else
176 0 : len = pg_ascii_dsplen(s);
177 0 : return len;
178 : }
179 :
180 : /*
181 : * EUC_JP
182 : */
183 : static int
184 48 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
185 : {
186 48 : return pg_euc2wchar_with_len(from, to, len);
187 : }
188 :
/*
 * Byte length of an EUC_JP character: same rule as generic EUC.
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
194 :
195 : static int
196 0 : pg_eucjp_dsplen(const unsigned char *s)
197 : {
198 : int len;
199 :
200 0 : if (*s == SS2)
201 0 : len = 1;
202 0 : else if (*s == SS3)
203 0 : len = 2;
204 0 : else if (IS_HIGHBIT_SET(*s))
205 0 : len = 2;
206 : else
207 0 : len = pg_ascii_dsplen(s);
208 0 : return len;
209 : }
210 :
211 : /*
212 : * EUC_KR
213 : */
214 : static int
215 0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
216 : {
217 0 : return pg_euc2wchar_with_len(from, to, len);
218 : }
219 :
/*
 * Byte length of an EUC_KR character: same rule as generic EUC.
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
225 :
/*
 * Display width of an EUC_KR character: same rule as generic EUC.
 */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
231 :
232 : /*
233 : * EUC_CN
234 : *
235 : */
236 : static int
237 54 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
238 : {
239 54 : int cnt = 0;
240 :
241 78 : while (len > 0 && *from)
242 : {
243 54 : if (*from == SS2) /* code set 2 (unused?) */
244 : {
245 18 : MB2CHAR_NEED_AT_LEAST(len, 3);
246 6 : from++;
247 6 : *to = (SS2 << 16) | (*from++ << 8);
248 6 : *to |= *from++;
249 6 : len -= 3;
250 : }
251 36 : else if (*from == SS3) /* code set 3 (unused ?) */
252 : {
253 18 : MB2CHAR_NEED_AT_LEAST(len, 3);
254 6 : from++;
255 6 : *to = (SS3 << 16) | (*from++ << 8);
256 6 : *to |= *from++;
257 6 : len -= 3;
258 : }
259 18 : else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
260 : {
261 12 : MB2CHAR_NEED_AT_LEAST(len, 2);
262 6 : *to = *from++ << 8;
263 6 : *to |= *from++;
264 6 : len -= 2;
265 : }
266 : else
267 : {
268 6 : *to = *from++;
269 6 : len--;
270 : }
271 24 : to++;
272 24 : cnt++;
273 : }
274 54 : *to = 0;
275 54 : return cnt;
276 : }
277 :
278 : /*
279 : * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
280 : * EUC_CN), but mb2wchar_with_len does. Tell a coherent story for code that
281 : * relies on agreement between mb2wchar_with_len and mblen. Invalid text
282 : * datums (e.g. from shared catalogs) reach this.
283 : */
284 : static int
285 6 : pg_euccn_mblen(const unsigned char *s)
286 : {
287 : int len;
288 :
289 6 : if (*s == SS2)
290 0 : len = 3;
291 6 : else if (*s == SS3)
292 0 : len = 3;
293 6 : else if (IS_HIGHBIT_SET(*s))
294 6 : len = 2;
295 : else
296 0 : len = 1;
297 6 : return len;
298 : }
299 :
/*
 * Display width of an EUC_CN character: 2 for any high-bit lead byte,
 * otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
311 :
312 : /*
313 : * EUC_TW
314 : *
315 : */
316 : static int
317 60 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
318 : {
319 60 : int cnt = 0;
320 :
321 84 : while (len > 0 && *from)
322 : {
323 60 : if (*from == SS2) /* code set 2 */
324 : {
325 24 : MB2CHAR_NEED_AT_LEAST(len, 4);
326 6 : from++;
327 6 : *to = (((uint32) SS2) << 24) | (*from++ << 16);
328 6 : *to |= *from++ << 8;
329 6 : *to |= *from++;
330 6 : len -= 4;
331 : }
332 36 : else if (*from == SS3) /* code set 3 (unused?) */
333 : {
334 18 : MB2CHAR_NEED_AT_LEAST(len, 3);
335 6 : from++;
336 6 : *to = (SS3 << 16) | (*from++ << 8);
337 6 : *to |= *from++;
338 6 : len -= 3;
339 : }
340 18 : else if (IS_HIGHBIT_SET(*from)) /* code set 2 */
341 : {
342 12 : MB2CHAR_NEED_AT_LEAST(len, 2);
343 6 : *to = *from++ << 8;
344 6 : *to |= *from++;
345 6 : len -= 2;
346 : }
347 : else
348 : {
349 6 : *to = *from++;
350 6 : len--;
351 : }
352 24 : to++;
353 24 : cnt++;
354 : }
355 60 : *to = 0;
356 60 : return cnt;
357 : }
358 :
359 : static int
360 6 : pg_euctw_mblen(const unsigned char *s)
361 : {
362 : int len;
363 :
364 6 : if (*s == SS2)
365 0 : len = 4;
366 6 : else if (*s == SS3)
367 0 : len = 3;
368 6 : else if (IS_HIGHBIT_SET(*s))
369 6 : len = 2;
370 : else
371 0 : len = 1;
372 6 : return len;
373 : }
374 :
375 : static int
376 0 : pg_euctw_dsplen(const unsigned char *s)
377 : {
378 : int len;
379 :
380 0 : if (*s == SS2)
381 0 : len = 2;
382 0 : else if (*s == SS3)
383 0 : len = 2;
384 0 : else if (IS_HIGHBIT_SET(*s))
385 0 : len = 2;
386 : else
387 0 : len = pg_ascii_dsplen(s);
388 0 : return len;
389 : }
390 :
391 : /*
392 : * Convert pg_wchar to EUC_* encoding.
393 : * caller must allocate enough space for "to", including a trailing zero!
394 : * len: length of from.
395 : * "from" not necessarily null terminated.
396 : */
397 : static int
398 72 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
399 : {
400 72 : int cnt = 0;
401 :
402 144 : while (len > 0 && *from)
403 : {
404 : unsigned char c;
405 :
406 72 : if ((c = (*from >> 24)))
407 : {
408 6 : *to++ = c;
409 6 : *to++ = (*from >> 16) & 0xff;
410 6 : *to++ = (*from >> 8) & 0xff;
411 6 : *to++ = *from & 0xff;
412 6 : cnt += 4;
413 : }
414 66 : else if ((c = (*from >> 16)))
415 : {
416 24 : *to++ = c;
417 24 : *to++ = (*from >> 8) & 0xff;
418 24 : *to++ = *from & 0xff;
419 24 : cnt += 3;
420 : }
421 42 : else if ((c = (*from >> 8)))
422 : {
423 24 : *to++ = c;
424 24 : *to++ = *from & 0xff;
425 24 : cnt += 2;
426 : }
427 : else
428 : {
429 18 : *to++ = *from;
430 18 : cnt++;
431 : }
432 72 : from++;
433 72 : len--;
434 : }
435 72 : *to = 0;
436 72 : return cnt;
437 : }
438 :
439 :
440 : /*
441 : * JOHAB
442 : */
/*
 * Byte length of a JOHAB character: uses the generic EUC rule.
 */
static int
pg_johab_mblen(const unsigned char *s)
{
	const int	nbytes = pg_euc_mblen(s);

	return nbytes;
}
448 :
/*
 * Display width of a JOHAB character: uses the generic EUC rule.
 */
static int
pg_johab_dsplen(const unsigned char *s)
{
	const int	width = pg_euc_dsplen(s);

	return width;
}
454 :
455 : /*
456 : * convert UTF8 string to pg_wchar (UCS-4)
457 : * caller must allocate enough space for "to", including a trailing zero!
458 : * len: length of from.
459 : * "from" not necessarily null terminated.
460 : */
461 : static int
462 10207072 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
463 : {
464 10207072 : int cnt = 0;
465 : uint32 c1,
466 : c2,
467 : c3,
468 : c4;
469 :
470 160207832 : while (len > 0 && *from)
471 : {
472 150000802 : if ((*from & 0x80) == 0)
473 : {
474 149999692 : *to = *from++;
475 149999692 : len--;
476 : }
477 1110 : else if ((*from & 0xe0) == 0xc0)
478 : {
479 536 : MB2CHAR_NEED_AT_LEAST(len, 2);
480 524 : c1 = *from++ & 0x1f;
481 524 : c2 = *from++ & 0x3f;
482 524 : *to = (c1 << 6) | c2;
483 524 : len -= 2;
484 : }
485 574 : else if ((*from & 0xf0) == 0xe0)
486 : {
487 334 : MB2CHAR_NEED_AT_LEAST(len, 3);
488 322 : c1 = *from++ & 0x0f;
489 322 : c2 = *from++ & 0x3f;
490 322 : c3 = *from++ & 0x3f;
491 322 : *to = (c1 << 12) | (c2 << 6) | c3;
492 322 : len -= 3;
493 : }
494 240 : else if ((*from & 0xf8) == 0xf0)
495 : {
496 24 : MB2CHAR_NEED_AT_LEAST(len, 4);
497 6 : c1 = *from++ & 0x07;
498 6 : c2 = *from++ & 0x3f;
499 6 : c3 = *from++ & 0x3f;
500 6 : c4 = *from++ & 0x3f;
501 6 : *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
502 6 : len -= 4;
503 : }
504 : else
505 : {
506 : /* treat a bogus char as length 1; not ours to raise error */
507 216 : *to = *from++;
508 216 : len--;
509 : }
510 150000760 : to++;
511 150000760 : cnt++;
512 : }
513 10207072 : *to = 0;
514 10207072 : return cnt;
515 : }
516 :
517 :
518 : /*
519 : * Trivial conversion from pg_wchar to UTF-8.
520 : * caller should allocate enough space for "to"
521 : * len: length of from.
522 : * "from" not necessarily null terminated.
523 : */
524 : static int
525 1116158 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
526 : {
527 1116158 : int cnt = 0;
528 :
529 16800580 : while (len > 0 && *from)
530 : {
531 : int char_len;
532 :
533 15684422 : unicode_to_utf8(*from, to);
534 15684422 : char_len = pg_utf_mblen(to);
535 15684422 : cnt += char_len;
536 15684422 : to += char_len;
537 15684422 : from++;
538 15684422 : len--;
539 : }
540 1116158 : *to = 0;
541 1116158 : return cnt;
542 : }
543 :
544 : /*
545 : * Return the byte length of a UTF8 character pointed to by s
546 : *
547 : * Note: in the current implementation we do not support UTF8 sequences
548 : * of more than 4 bytes; hence do NOT return a value larger than 4.
549 : * We return "1" for any leading byte that is either flat-out illegal or
550 : * indicates a length larger than we support.
551 : *
552 : * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
553 : * other places would need to be fixed to change this.
554 : */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif

	/* any other lead byte is treated as a one-byte character */
	return 1;
}
578 :
579 : /*
580 : * This is an implementation of wcwidth() and wcswidth() as defined in
581 : * "The Single UNIX Specification, Version 2, The Open Group, 1997"
582 : * <http://www.unix.org/online.html>
583 : *
584 : * Markus Kuhn -- 2001-09-08 -- public domain
585 : *
586 : * customised for PostgreSQL
587 : *
588 : * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
589 : */
590 :
/* One entry of a code point range table searched by mbbisearch(). */
struct mbinterval
{
	unsigned int first;			/* lowest code point in the range */
	unsigned int last;			/* highest code point in the range */
};
596 :
597 : /* auxiliary function for binary search in interval table */
598 : static int
599 89002390 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
600 : {
601 89002390 : int min = 0;
602 : int mid;
603 :
604 89002390 : if (ucs < table[0].first || ucs > table[max].last)
605 88993924 : return 0;
606 74220 : while (max >= min)
607 : {
608 66456 : mid = (min + max) / 2;
609 66456 : if (ucs > table[mid].last)
610 13332 : min = mid + 1;
611 53124 : else if (ucs < table[mid].first)
612 52422 : max = mid - 1;
613 : else
614 702 : return 1;
615 : }
616 :
617 7764 : return 0;
618 : }
619 :
620 :
621 : /* The following functions define the column width of an ISO 10646
622 : * character as follows:
623 : *
624 : * - The null character (U+0000) has a column width of 0.
625 : *
626 : * - Other C0/C1 control characters and DEL will lead to a return
627 : * value of -1.
628 : *
629 : * - Non-spacing and enclosing combining characters (general
630 : * category code Mn, Me or Cf in the Unicode database) have a
631 : * column width of 0.
632 : *
633 : * - Spacing characters in the East Asian Wide (W) or East Asian
634 : * FullWidth (F) category as defined in Unicode Technical
635 : * Report #11 have a column width of 2.
636 : *
637 : * - All remaining characters (including all printable
638 : * ISO 8859-1 and WGL4 characters, Unicode control characters,
639 : * etc.) have a column width of 1.
640 : *
641 : * This implementation assumes that wchar_t characters are encoded
642 : * in ISO 10646.
643 : */
644 :
645 : static int
646 44546456 : ucs_wcwidth(pg_wchar ucs)
647 : {
648 : #include "common/unicode_nonspacing_table.h"
649 : #include "common/unicode_east_asian_fw_table.h"
650 :
651 : /* test for 8-bit control characters */
652 44546456 : if (ucs == 0)
653 0 : return 0;
654 :
655 44546456 : if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
656 45018 : return -1;
657 :
658 : /*
659 : * binary search in table of non-spacing characters
660 : *
661 : * XXX: In the official Unicode sources, it is possible for a character to
662 : * be described as both non-spacing and wide at the same time. As of
663 : * Unicode 13.0, treating the non-spacing property as the determining
664 : * factor for display width leads to the correct behavior, so do that
665 : * search first.
666 : */
667 44501438 : if (mbbisearch(ucs, nonspacing,
668 : sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
669 486 : return 0;
670 :
671 : /* binary search in table of wide characters */
672 44500952 : if (mbbisearch(ucs, east_asian_fw,
673 : sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
674 216 : return 2;
675 :
676 44500736 : return 1;
677 : }
678 :
679 : static int
680 44546456 : pg_utf_dsplen(const unsigned char *s)
681 : {
682 44546456 : return ucs_wcwidth(utf8_to_unicode(s));
683 : }
684 :
685 : /*
686 : * convert mule internal code to pg_wchar
687 : * caller should allocate enough space for "to"
688 : * len: length of from.
689 : * "from" not necessarily null terminated.
690 : */
691 : static int
692 36 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
693 : {
694 36 : int cnt = 0;
695 :
696 54 : while (len > 0 && *from)
697 : {
698 36 : if (IS_LC1(*from))
699 : {
700 12 : MB2CHAR_NEED_AT_LEAST(len, 2);
701 6 : *to = *from++ << 16;
702 6 : *to |= *from++;
703 6 : len -= 2;
704 : }
705 24 : else if (IS_LCPRV1(*from))
706 : {
707 0 : MB2CHAR_NEED_AT_LEAST(len, 3);
708 0 : from++;
709 0 : *to = *from++ << 16;
710 0 : *to |= *from++;
711 0 : len -= 3;
712 : }
713 24 : else if (IS_LC2(*from))
714 : {
715 18 : MB2CHAR_NEED_AT_LEAST(len, 3);
716 6 : *to = *from++ << 16;
717 6 : *to |= *from++ << 8;
718 6 : *to |= *from++;
719 6 : len -= 3;
720 : }
721 6 : else if (IS_LCPRV2(*from))
722 : {
723 0 : MB2CHAR_NEED_AT_LEAST(len, 4);
724 0 : from++;
725 0 : *to = *from++ << 16;
726 0 : *to |= *from++ << 8;
727 0 : *to |= *from++;
728 0 : len -= 4;
729 : }
730 : else
731 : { /* assume ASCII */
732 6 : *to = (unsigned char) *from++;
733 6 : len--;
734 : }
735 18 : to++;
736 18 : cnt++;
737 : }
738 36 : *to = 0;
739 36 : return cnt;
740 : }
741 :
742 : /*
743 : * convert pg_wchar to mule internal code
744 : * caller should allocate enough space for "to"
745 : * len: length of from.
746 : * "from" not necessarily null terminated.
747 : */
748 : static int
749 18 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
750 : {
751 18 : int cnt = 0;
752 :
753 36 : while (len > 0 && *from)
754 : {
755 : unsigned char lb;
756 :
757 18 : lb = (*from >> 16) & 0xff;
758 18 : if (IS_LC1(lb))
759 : {
760 6 : *to++ = lb;
761 6 : *to++ = *from & 0xff;
762 6 : cnt += 2;
763 : }
764 12 : else if (IS_LC2(lb))
765 : {
766 6 : *to++ = lb;
767 6 : *to++ = (*from >> 8) & 0xff;
768 6 : *to++ = *from & 0xff;
769 6 : cnt += 3;
770 : }
771 6 : else if (IS_LCPRV1_A_RANGE(lb))
772 : {
773 0 : *to++ = LCPRV1_A;
774 0 : *to++ = lb;
775 0 : *to++ = *from & 0xff;
776 0 : cnt += 3;
777 : }
778 6 : else if (IS_LCPRV1_B_RANGE(lb))
779 : {
780 0 : *to++ = LCPRV1_B;
781 0 : *to++ = lb;
782 0 : *to++ = *from & 0xff;
783 0 : cnt += 3;
784 : }
785 6 : else if (IS_LCPRV2_A_RANGE(lb))
786 : {
787 0 : *to++ = LCPRV2_A;
788 0 : *to++ = lb;
789 0 : *to++ = (*from >> 8) & 0xff;
790 0 : *to++ = *from & 0xff;
791 0 : cnt += 4;
792 : }
793 6 : else if (IS_LCPRV2_B_RANGE(lb))
794 : {
795 0 : *to++ = LCPRV2_B;
796 0 : *to++ = lb;
797 0 : *to++ = (*from >> 8) & 0xff;
798 0 : *to++ = *from & 0xff;
799 0 : cnt += 4;
800 : }
801 : else
802 : {
803 6 : *to++ = *from & 0xff;
804 6 : cnt += 1;
805 : }
806 18 : from++;
807 18 : len--;
808 : }
809 18 : *to = 0;
810 18 : return cnt;
811 : }
812 :
813 : /* exported for direct use by conv.c */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s) || IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	return 1;					/* assume ASCII */
}
831 :
/*
 * Display width of a MULE internal code character, decided from the lead
 * byte alone.
 *
 * NOTE(review): unlike the other dsplen routines in this file, the fallback
 * case returns a constant 1 instead of calling pg_ascii_dsplen(), so NUL and
 * control bytes are reported as width 1 here.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	/*
	 * Note: it's not really appropriate to assume that all multibyte charsets
	 * are double-wide on screen.  But this seems an okay approximation for
	 * the MULE charsets we currently support.
	 */
	if (IS_LC1(*s) || IS_LCPRV1(*s))
		return 1;
	if (IS_LC2(*s) || IS_LCPRV2(*s))
		return 2;

	return 1;					/* assume ASCII */
}
856 :
857 : /*
858 : * ISO8859-1
859 : */
860 : static int
861 1082 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
862 : {
863 1082 : int cnt = 0;
864 :
865 30028 : while (len > 0 && *from)
866 : {
867 28946 : *to++ = *from++;
868 28946 : len--;
869 28946 : cnt++;
870 : }
871 1082 : *to = 0;
872 1082 : return cnt;
873 : }
874 :
875 : /*
876 : * Trivial conversion from pg_wchar to single byte encoding. Just ignores
877 : * high bits.
878 : * caller should allocate enough space for "to"
879 : * len: length of from.
880 : * "from" not necessarily null terminated.
881 : */
882 : static int
883 162 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
884 : {
885 162 : int cnt = 0;
886 :
887 1380 : while (len > 0 && *from)
888 : {
889 1218 : *to++ = *from++;
890 1218 : len--;
891 1218 : cnt++;
892 : }
893 162 : *to = 0;
894 162 : return cnt;
895 : }
896 :
/*
 * Byte length of one character in a single-byte encoding: always 1.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;					/* the input is never inspected */

	return 1;
}
902 :
/*
 * Display width of one single-byte character: same rule as SQL/ASCII.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	const int	width = pg_ascii_dsplen(s);

	return width;
}
908 :
909 : /*
910 : * SJIS
911 : */
/*
 * Byte length of the SJIS character starting at *s: 1 for a lead byte in
 * 0xa1..0xdf, 2 for any other high-bit lead byte, otherwise 1.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;				/* 1 byte kana? */

	/* kanji if the high bit is set, else should be ASCII */
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
925 :
/*
 * Display width of the SJIS character starting at *s: 1 for a lead byte in
 * 0xa1..0xdf, 2 for any other high-bit lead byte, otherwise whatever
 * pg_ascii_dsplen() reports.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;				/* 1 byte kana? */

	/* kanji if the high bit is set, else should be ASCII */
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
939 :
940 : /*
941 : * Big5
942 : */
/*
 * Byte length of the Big5 character starting at *s: 2 if the lead byte has
 * its high bit set, otherwise 1.
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	/* kanji if the high bit is set, else should be ASCII */
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
954 :
/*
 * Display width of the Big5 character starting at *s: 2 if the lead byte
 * has its high bit set, otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	/* kanji if the high bit is set, else should be ASCII */
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
966 :
967 : /*
968 : * GBK
969 : */
/*
 * Byte length of the GBK character starting at *s: 2 if the lead byte has
 * its high bit set, otherwise 1.
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	/* kanji if the high bit is set, else should be ASCII */
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
981 :
/*
 * Display width of the GBK character starting at *s: 2 if the lead byte has
 * its high bit set, otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	/* kanji if the high bit is set, else should be ASCII */
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
993 :
994 : /*
995 : * UHC
996 : */
/*
 * Byte length of the UHC character starting at *s: 2 if the lead byte has
 * its high bit set, otherwise 1.
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	/* 2byte if the high bit is set, else should be ASCII */
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1008 :
/*
 * Display width of the UHC character starting at *s: 2 if the lead byte has
 * its high bit set, otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	/* 2byte if the high bit is set, else should be ASCII */
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
1020 :
1021 : /*
1022 : * GB18030
1023 : * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1024 : */
1025 :
1026 : /*
1027 : * Unlike all other mblen() functions, this also looks at the second byte of
1028 : * the input. However, if you only pass the first byte of a multi-byte
1029 : * string, and \0 as the second byte, this still works in a predictable way:
1030 : * a 4-byte character will be reported as two 2-byte characters. That's
1031 : * enough for all current uses, as a client-only encoding. It works that
1032 : * way, because in any valid 4-byte GB18030-encoded character, the third and
1033 : * fourth byte look like a 2-byte encoded character, when looked at
1034 : * separately.
1035 : */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return 1;				/* ASCII */

	/* a second byte in 0x30..0x39 marks the four-byte form */
	if (s[1] >= 0x30 && s[1] <= 0x39)
		return 4;

	return 2;
}
1049 :
/*
 * Display width of the GB18030 character starting at *s: 2 if the lead byte
 * has its high bit set, otherwise whatever pg_ascii_dsplen() reports.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);	/* else ASCII */
}
1061 :
1062 : /*
1063 : *-------------------------------------------------------------------
1064 : * multibyte sequence validators
1065 : *
1066 : * The verifychar functions accept "s", a pointer to the first byte of a
1067 : * string, and "len", the remaining length of the string. If there is a
1068 : * validly encoded character beginning at *s, return its length in bytes;
1069 : * else return -1.
1070 : *
1071 : * The verifystr functions also accept "s", a pointer to a string and "len",
1072 : * the length of the string. They verify the whole string, and return the
1073 : * number of input bytes (<= len) that are valid. In other words, if the
1074 : * whole string is valid, verifystr returns "len", otherwise it returns the
1075 : * byte offset of the first invalid character. The verifystr functions must
1076 : * test for and reject zeroes in the input.
1077 : *
1078 : * The verifychar functions can assume that len > 0 and that *s != '\0', but
1079 : * they must test for and reject zeroes in any additional bytes of a
1080 : * multibyte character. Note that this definition allows the function for a
1081 : * single-byte encoding to be just "return 1".
1082 : *-------------------------------------------------------------------
1083 : */
/*
 * Single-byte encoding: whatever byte is at *s counts as one valid
 * character, so the answer is always 1.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	(void) s;					/* neither argument is inspected */
	(void) len;

	return 1;
}
1089 :
/*
 * Return the number of leading bytes of s[0..len) that precede the first
 * zero byte, or "len" if the range contains no zero byte.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
1100 :
1101 : #define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe)
1102 :
1103 : static int
1104 504 : pg_eucjp_verifychar(const unsigned char *s, int len)
1105 : {
1106 : int l;
1107 : unsigned char c1,
1108 : c2;
1109 :
1110 504 : c1 = *s++;
1111 :
1112 504 : switch (c1)
1113 : {
1114 0 : case SS2: /* JIS X 0201 */
1115 0 : l = 2;
1116 0 : if (l > len)
1117 0 : return -1;
1118 0 : c2 = *s++;
1119 0 : if (c2 < 0xa1 || c2 > 0xdf)
1120 0 : return -1;
1121 0 : break;
1122 :
1123 0 : case SS3: /* JIS X 0212 */
1124 0 : l = 3;
1125 0 : if (l > len)
1126 0 : return -1;
1127 0 : c2 = *s++;
1128 0 : if (!IS_EUC_RANGE_VALID(c2))
1129 0 : return -1;
1130 0 : c2 = *s++;
1131 0 : if (!IS_EUC_RANGE_VALID(c2))
1132 0 : return -1;
1133 0 : break;
1134 :
1135 504 : default:
1136 504 : if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1137 : {
1138 504 : l = 2;
1139 504 : if (l > len)
1140 84 : return -1;
1141 420 : if (!IS_EUC_RANGE_VALID(c1))
1142 24 : return -1;
1143 396 : c2 = *s++;
1144 396 : if (!IS_EUC_RANGE_VALID(c2))
1145 180 : return -1;
1146 : }
1147 : else
1148 : /* must be ASCII */
1149 : {
1150 0 : l = 1;
1151 : }
1152 216 : break;
1153 : }
1154 :
1155 216 : return l;
1156 : }
1157 :
/*
 * Verify an EUC_JP string; returns the number of leading bytes that are
 * valid, which equals "len" when the whole string passes.  Scanning stops
 * at the first zero byte or the first character pg_eucjp_verifychar()
 * rejects.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			l = 1;		/* fast path for ASCII-subset characters */

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_eucjp_verifychar(p, remaining);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += l;
		remaining -= l;
	}

	return p - s;
}
1186 :
/*
 * Verify the EUC_KR character starting at *s, with "len" bytes available.
 * Returns 1 for a byte without the high bit, 2 for a two-byte character
 * whose bytes both satisfy IS_EUC_RANGE_VALID, and -1 otherwise.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(s[0]))	/* must be ASCII */
		return 1;

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(s[0]) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1215 :
/*
 * Verify an EUC_KR string; returns the number of leading bytes that are
 * valid, which equals "len" when the whole string passes.  Scanning stops
 * at the first zero byte or the first character pg_euckr_verifychar()
 * rejects.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			l = 1;		/* fast path for ASCII-subset characters */

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_euckr_verifychar(p, remaining);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;

		p += l;
		remaining -= l;
	}

	return p - s;
}
1244 :
1245 : /* EUC-CN byte sequences are exactly same as EUC-KR */
1246 : #define pg_euccn_verifychar pg_euckr_verifychar
1247 : #define pg_euccn_verifystr pg_euckr_verifystr
1248 :
1249 : static int
1250 18 : pg_euctw_verifychar(const unsigned char *s, int len)
1251 : {
1252 : int l;
1253 : unsigned char c1,
1254 : c2;
1255 :
1256 18 : c1 = *s++;
1257 :
1258 18 : switch (c1)
1259 : {
1260 0 : case SS2: /* CNS 11643 Plane 1-7 */
1261 0 : l = 4;
1262 0 : if (l > len)
1263 0 : return -1;
1264 0 : c2 = *s++;
1265 0 : if (c2 < 0xa1 || c2 > 0xa7)
1266 0 : return -1;
1267 0 : c2 = *s++;
1268 0 : if (!IS_EUC_RANGE_VALID(c2))
1269 0 : return -1;
1270 0 : c2 = *s++;
1271 0 : if (!IS_EUC_RANGE_VALID(c2))
1272 0 : return -1;
1273 0 : break;
1274 :
1275 0 : case SS3: /* unused */
1276 0 : return -1;
1277 :
1278 18 : default:
1279 18 : if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1280 : {
1281 18 : l = 2;
1282 18 : if (l > len)
1283 6 : return -1;
1284 : /* no further range check on c1? */
1285 12 : c2 = *s++;
1286 12 : if (!IS_EUC_RANGE_VALID(c2))
1287 12 : return -1;
1288 : }
1289 : else
1290 : /* must be ASCII */
1291 : {
1292 0 : l = 1;
1293 : }
1294 0 : break;
1295 : }
1296 0 : return l;
1297 : }
1298 :
/*
 * Verify a string in EUC-TW encoding.
 *
 * Returns the number of leading bytes of s (at most len) that were accepted.
 * Scanning stops at the first NUL byte, or at the first high-bit character
 * that pg_euctw_verifychar() rejects.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes are one byte long */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte character: defer to the per-character verifier */
			chlen = pg_euctw_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* an embedded NUL ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1327 :
/*
 * Verify the first character of s in JOHAB encoding.
 *
 * The length comes from pg_johab_mblen().  Returns that length if the
 * character fits within len bytes and, for a high-bit lead byte, every
 * trailing byte passes IS_EUC_RANGE_VALID(); returns -1 otherwise.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	/* Trailing bytes are only inspected when the lead byte has its high bit set */
	if (IS_HIGHBIT_SET(s[0]))
	{
		for (i = 1; i < mbl; i++)
		{
			if (!IS_EUC_RANGE_VALID(s[i]))
				return -1;
		}
	}

	return mbl;
}
1351 :
/*
 * Verify a string in JOHAB encoding.
 *
 * Returns the number of leading bytes of s (at most len) that were accepted.
 * Scanning stops at the first NUL byte, or at the first high-bit character
 * that pg_johab_verifychar() rejects.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes are one byte long */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte character: defer to the per-character verifier */
			chlen = pg_johab_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* an embedded NUL ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1380 :
/*
 * Verify the first character of s in MULE_INTERNAL encoding.
 *
 * The length comes from pg_mule_mblen().  Returns that length if the
 * character fits within len bytes and every byte after the first has its
 * high bit set; returns -1 otherwise.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1401 :
/*
 * Verify a string in MULE_INTERNAL encoding.
 *
 * Returns the number of leading bytes of s (at most len) that were accepted.
 * Scanning stops at the first NUL byte, or at the first high-bit character
 * that pg_mule_verifychar() rejects.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes are one byte long */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte character: defer to the per-character verifier */
			chlen = pg_mule_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* an embedded NUL ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1430 :
/*
 * Verify one character of a single-byte encoding.
 *
 * Neither argument is examined: the function unconditionally reports a
 * one-byte character.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	return 1;
}
1436 :
/*
 * Verify a string in a single-byte encoding.
 *
 * Every byte except NUL is accepted, so the result is the offset of the
 * first NUL byte within the first len bytes, or len if there is none.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
1447 :
/*
 * Verify the first character of s in Shift-JIS encoding.
 *
 * The length comes from pg_sjis_mblen().  Returns that length if the
 * character fits within len bytes and, for a multibyte character, the first
 * two bytes satisfy ISSJISHEAD() and ISSJISTAIL(); returns -1 otherwise.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);

	if (mbl > len)
		return -1;

	/* a one-byte result needs no further checking: pg_sjis_mblen verified it */
	if (mbl == 1)
		return mbl;

	if (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1]))
		return mbl;

	return -1;
}
1470 :
/*
 * Verify a string in Shift-JIS encoding.
 *
 * Returns the number of leading bytes of s (at most len) that were accepted.
 * Scanning stops at the first NUL byte, or at the first high-bit character
 * that pg_sjis_verifychar() rejects.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes are one byte long */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte character: defer to the per-character verifier */
			chlen = pg_sjis_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* an embedded NUL ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1499 :
1500 : static int
1501 360 : pg_big5_verifychar(const unsigned char *s, int len)
1502 : {
1503 : int l,
1504 : mbl;
1505 :
1506 360 : l = mbl = pg_big5_mblen(s);
1507 :
1508 360 : if (len < l)
1509 6 : return -1;
1510 :
1511 354 : if (l == 2 &&
1512 354 : s[0] == NONUTF8_INVALID_BYTE0 &&
1513 12 : s[1] == NONUTF8_INVALID_BYTE1)
1514 12 : return -1;
1515 :
1516 576 : while (--l > 0)
1517 : {
1518 342 : if (*++s == '\0')
1519 108 : return -1;
1520 : }
1521 :
1522 234 : return mbl;
1523 : }
1524 :
/*
 * Verify a string in BIG5 encoding.
 *
 * Returns the number of leading bytes of s (at most len) that were accepted.
 * Scanning stops at the first NUL byte, or at the first high-bit character
 * that pg_big5_verifychar() rejects.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes are one byte long */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte character: defer to the per-character verifier */
			chlen = pg_big5_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* an embedded NUL ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1553 :
1554 : static int
1555 274 : pg_gbk_verifychar(const unsigned char *s, int len)
1556 : {
1557 : int l,
1558 : mbl;
1559 :
1560 274 : l = mbl = pg_gbk_mblen(s);
1561 :
1562 274 : if (len < l)
1563 54 : return -1;
1564 :
1565 220 : if (l == 2 &&
1566 220 : s[0] == NONUTF8_INVALID_BYTE0 &&
1567 28 : s[1] == NONUTF8_INVALID_BYTE1)
1568 28 : return -1;
1569 :
1570 384 : while (--l > 0)
1571 : {
1572 192 : if (*++s == '\0')
1573 0 : return -1;
1574 : }
1575 :
1576 192 : return mbl;
1577 : }
1578 :
/*
 * Verify a string in GBK encoding.
 *
 * Returns the number of leading bytes of s (at most len) that were accepted.
 * Scanning stops at the first NUL byte, or at the first high-bit character
 * that pg_gbk_verifychar() rejects.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes are one byte long */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte character: defer to the per-character verifier */
			chlen = pg_gbk_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* an embedded NUL ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1607 :
1608 : static int
1609 18 : pg_uhc_verifychar(const unsigned char *s, int len)
1610 : {
1611 : int l,
1612 : mbl;
1613 :
1614 18 : l = mbl = pg_uhc_mblen(s);
1615 :
1616 18 : if (len < l)
1617 6 : return -1;
1618 :
1619 12 : if (l == 2 &&
1620 12 : s[0] == NONUTF8_INVALID_BYTE0 &&
1621 12 : s[1] == NONUTF8_INVALID_BYTE1)
1622 12 : return -1;
1623 :
1624 0 : while (--l > 0)
1625 : {
1626 0 : if (*++s == '\0')
1627 0 : return -1;
1628 : }
1629 :
1630 0 : return mbl;
1631 : }
1632 :
/*
 * Verify a string in UHC encoding.
 *
 * Returns the number of leading bytes of s (at most len) that were accepted.
 * Scanning stops at the first NUL byte, or at the first high-bit character
 * that pg_uhc_verifychar() rejects.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes are one byte long */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte character: defer to the per-character verifier */
			chlen = pg_uhc_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* an embedded NUL ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1661 :
/*
 * Verify the first character of s in GB18030 encoding.
 *
 * Returns 1 for an ASCII byte, 4 or 2 for a well-formed four- or two-byte
 * character, and -1 for anything else (including a character that does not
 * fit within len bytes).
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	const unsigned char b0 = s[0];

	if (!IS_HIGHBIT_SET(b0))
		return 1;				/* ASCII */

	/* A digit in the second byte (with room for four bytes) means 4-byte form */
	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		if (b0 < 0x81 || b0 > 0xfe)
			return -1;
		if (s[2] < 0x81 || s[2] > 0xfe)
			return -1;
		if (s[3] < 0x30 || s[3] > 0x39)
			return -1;
		return 4;
	}

	/* Otherwise it can only be the 2-byte form */
	if (len >= 2 && b0 >= 0x81 && b0 <= 0xfe)
	{
		const unsigned char b1 = s[1];

		if (b1 >= 0x40 && b1 <= 0x7e)
			return 2;
		if (b1 >= 0x80 && b1 <= 0xfe)
			return 2;
		return -1;
	}

	return -1;
}
1692 :
/*
 * Verify a string in GB18030 encoding.
 *
 * Returns the number of leading bytes of s (at most len) that were accepted.
 * Scanning stops at the first NUL byte, or at the first high-bit character
 * that pg_gb18030_verifychar() rejects.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;	/* ASCII-subset bytes are one byte long */

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte character: defer to the per-character verifier */
			chlen = pg_gb18030_verifychar(cur, remaining);
			if (chlen == -1)
				break;
		}
		else if (*cur == '\0')
			break;				/* an embedded NUL ends the valid prefix */

		cur += chlen;
		remaining -= chlen;
	}

	return cur - s;
}
1721 :
/*
 * Verify the first character of s in UTF-8 encoding.
 *
 * Returns the character's byte length if it is acceptable, or -1 if it is a
 * NUL byte, does not fit within len bytes, or fails pg_utf8_islegal().
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			l;

	/* single-byte (ASCII) case: everything but NUL is fine */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/* derive the expected length from the lead byte's high bits */
	if ((lead & 0xe0) == 0xc0)
		l = 2;
	else if ((lead & 0xf0) == 0xe0)
		l = 3;
	else if ((lead & 0xf8) == 0xf0)
		l = 4;
	else
		l = 1;

	/* the length check must come first so islegal never reads past len */
	if (l > len || !pg_utf8_islegal(s, l))
		return -1;

	return l;
}
1750 :
1751 : /*
1752 : * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1753 : * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1754 : * input byte and current state are used to compute an index into an array of
1755 : * state transitions. Since the address of the next transition is dependent
1756 : * on this computation, there is latency in executing the load instruction,
1757 : * and the CPU is not kept busy.
1758 : *
1759 : * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1760 : *
1761 : * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1762 : *
1763 : * In a shift-based DFA, the input byte is an index into an array of integers
1764 : * whose bit pattern encodes the state transitions. To compute the next
1765 : * state, we simply right-shift the integer by the current state and apply a
1766 : * mask. In this scheme, the address of the transition only depends on the
1767 : * input byte, so there is better pipelining.
1768 : *
1769 : * The naming convention for states and transitions was adopted from a UTF-8
1770 : * to UTF-16/32 transcoder, whose table is reproduced below:
1771 : *
1772 : * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1773 : *
1774 : * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1775 : * ==========================================================================
1776 : * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1777 : * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1778 : * |
1779 : * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1780 : * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1781 : * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1782 : * |
1783 : * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1784 : * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1785 : * |
1786 : * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1787 : * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1788 : *
1789 : * In the most straightforward implementation, a shift-based DFA for UTF-8
1790 : * requires 64-bit integers to encode the transitions, but with an SMT solver
1791 : * it's possible to find state numbers such that the transitions fit within
1792 : * 32-bit integers, as Dougall Johnson demonstrated:
1793 : *
1794 : * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1795 : *
1796 : * This packed representation is the reason for the seemingly odd choice of
1797 : * state values below.
1798 : */
1799 :
/*
 * DFA state numbers.
 *
 * A state's value is also the bit offset at which the 5-bit "next state"
 * field for that state is stored inside a transition word; the DFA step is
 * "transition >> (state & 31)" followed by a mask with 31.  Transition words
 * are therefore built below as (next_state << current_state).
 */

/* Error */
#define ERR  0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/* the encoded state transitions for the lookup table */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)

/*
 * continuation byte
 *
 * Each expansion is wrapped in parentheses so that it stays a single operand
 * wherever it is used (e.g. "CR1 >> state"); without them, an adjacent
 * operator of higher precedence than "|" would bind to only the last term.
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1836 :
/*
 * Transition table for the UTF-8 DFA, indexed by input byte value.  Each
 * entry is one of the encoded transition words defined by the macros above
 * (ILL, ASC, CR1..CR3, L2A, L3A..L3C, L4A..L4C).
 */
1837 : static const uint32 Utf8Transition[256] =
1838 : {
1839 : /* ASCII: 00..7F (entry 00 is ILL, so a NUL byte leads to the error state) */
1840 :
1841 : ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1842 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1843 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1844 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1845 :
1846 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1847 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1848 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1849 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1850 :
1851 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1852 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1853 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1854 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1855 :
1856 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1857 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1858 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1859 : ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1860 :
1861 : /* continuation bytes, split into three classes by value range */
1862 :
1863 : /* 80..8F */
1864 : CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1865 : CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1866 :
1867 : /* 90..9F */
1868 : CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1869 : CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1870 :
1871 : /* A0..BF */
1872 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1873 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1874 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1875 : CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1876 :
1877 : /* leading bytes */
1878 :
1879 : /* C0..DF: 2-byte leads; C0 and C1 are ILL */
1880 : ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1881 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1882 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1883 : L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1884 :
1885 : /* E0..EF: 3-byte leads; E0 is L3A, ED is L3C, all others are L3B */
1886 : L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1887 : L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1888 :
1889 : /* F0..FF: 4-byte leads; F0 is L4A, F1..F3 are L4B, F4 is L4C, F5..FF are ILL */
1890 : L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1891 : ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1892 : };
1893 :
1894 : static void
1895 1750 : utf8_advance(const unsigned char *s, uint32 *state, int len)
1896 : {
1897 : /* Note: We deliberately don't check the state's value here. */
1898 57750 : while (len > 0)
1899 : {
1900 : /*
1901 : * It's important that the mask value is 31: In most instruction sets,
1902 : * a shift by a 32-bit operand is understood to be a shift by its mod
1903 : * 32, so the compiler should elide the mask operation.
1904 : */
1905 56000 : *state = Utf8Transition[*s++] >> (*state & 31);
1906 56000 : len--;
1907 : }
1908 :
1909 1750 : *state &= 31;
1910 1750 : }
1911 :
1912 : static int
1913 1219058 : pg_utf8_verifystr(const unsigned char *s, int len)
1914 : {
1915 1219058 : const unsigned char *start = s;
1916 1219058 : const int orig_len = len;
1917 1219058 : uint32 state = BGN;
1918 :
1919 : /*
1920 : * With a stride of two vector widths, gcc will unroll the loop. Even if
1921 : * the compiler can unroll a longer loop, it's not worth it because we
1922 : * must fall back to the byte-wise algorithm if we find any non-ASCII.
1923 : */
1924 : #define STRIDE_LENGTH (2 * sizeof(Vector8))
1925 :
1926 1219058 : if (len >= STRIDE_LENGTH)
1927 : {
1928 4166986 : while (len >= STRIDE_LENGTH)
1929 : {
1930 : /*
1931 : * If the chunk is all ASCII, we can skip the full UTF-8 check,
1932 : * but we must first check for a non-END state, which means the
1933 : * previous chunk ended in the middle of a multibyte sequence.
1934 : */
1935 3582598 : if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1936 1750 : utf8_advance(s, &state, STRIDE_LENGTH);
1937 :
1938 3582598 : s += STRIDE_LENGTH;
1939 3582598 : len -= STRIDE_LENGTH;
1940 : }
1941 :
1942 : /* The error state persists, so we only need to check for it here. */
1943 584388 : if (state == ERR)
1944 : {
1945 : /*
1946 : * Start over from the beginning with the slow path so we can
1947 : * count the valid bytes.
1948 : */
1949 504 : len = orig_len;
1950 504 : s = start;
1951 : }
1952 583884 : else if (state != END)
1953 : {
1954 : /*
1955 : * The fast path exited in the middle of a multibyte sequence.
1956 : * Walk backwards to find the leading byte so that the slow path
1957 : * can resume checking from there. We must always backtrack at
1958 : * least one byte, since the current byte could be e.g. an ASCII
1959 : * byte after a 2-byte lead, which is invalid.
1960 : */
1961 : do
1962 : {
1963 : Assert(s > start);
1964 116 : s--;
1965 116 : len++;
1966 : Assert(IS_HIGHBIT_SET(*s));
1967 116 : } while (pg_utf_mblen(s) <= 1);
1968 : }
1969 : }
1970 :
1971 : /* check remaining bytes */
1972 17970278 : while (len > 0)
1973 : {
1974 : int l;
1975 :
1976 : /* fast path for ASCII-subset characters */
1977 16754302 : if (!IS_HIGHBIT_SET(*s))
1978 : {
1979 16736746 : if (*s == '\0')
1980 206 : break;
1981 16736540 : l = 1;
1982 : }
1983 : else
1984 : {
1985 17556 : l = pg_utf8_verifychar(s, len);
1986 17556 : if (l == -1)
1987 2876 : break;
1988 : }
1989 16751220 : s += l;
1990 16751220 : len -= l;
1991 : }
1992 :
1993 1219058 : return s - start;
1994 : }
1995 :
1996 : /*
1997 : * Check for validity of a single UTF-8 encoded character
1998 : *
1999 : * This directly implements the rules in RFC3629. The bizarre-looking
2000 : * restrictions on the second byte are meant to ensure that there isn't
2001 : * more than one encoding of a given Unicode character point; that is,
2002 : * you may not use a longer-than-necessary byte sequence with high order
2003 : * zero bits to represent a character that would fit in fewer bytes.
2004 : * To do otherwise is to create security hazards (eg, create an apparent
2005 : * non-ASCII character that decodes to plain ASCII).
2006 : *
2007 : * length is assumed to have been obtained by pg_utf_mblen(), and the
2008 : * caller must have checked that that many bytes are present in the buffer.
2009 : */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	/* only lengths 1..4 are accepted; 5 and 6 are rejected for now */
	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes, when present, must be plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	lead = source[0];

	if (length >= 2)
	{
		/* allowed range of the second byte; narrowed for four lead bytes */
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;
		unsigned char second = source[1];

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (second < lo || second > hi)
			return false;
	}

	/* finally, the lead byte itself: 80..C1 and F5..FF are never legal */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
2066 :
2067 :
2068 : /*
2069 : * Fills the provided buffer with two bytes such that:
2070 : * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
2071 : */
2072 : void
2073 412 : pg_encoding_set_invalid(int encoding, char *dst)
2074 : {
2075 : Assert(pg_encoding_max_length(encoding) > 1);
2076 :
2077 412 : dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
2078 412 : dst[1] = NONUTF8_INVALID_BYTE1;
2079 412 : }
2080 :
2081 : /*
2082 : *-------------------------------------------------------------------
2083 : * encoding info table
2084 : *-------------------------------------------------------------------
2085 : */
/*
 * One entry per encoding, placed by designated initializer at the encoding's
 * ID.  The accessor functions in this file read the members mblen, dsplen,
 * mbverifychar, mbverifystr and maxmblen from these entries.
 *
 * NOTE(review): each initializer presumably lists, in struct order, the
 * to-wchar conversion, the from-wchar conversion, mblen, dsplen, verifychar,
 * verifystr and the maximum character length in bytes -- confirm against the
 * pg_wchar_tbl declaration.  Entries starting with "0, 0" supply no wchar
 * conversion functions; these are presumably the client-only encodings.
 */
2086 : const pg_wchar_tbl pg_wchar_table[] = {
2087 : [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
2088 : [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
2089 : [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3},
2090 : [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
2091 : [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
2092 : [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
2093 : [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
2094 : [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
2095 : [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2096 : [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2097 : [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2098 : [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2099 : [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2100 : [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2101 : [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2102 : [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2103 : [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2104 : [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2105 : [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2106 : [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2107 : [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2108 : [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2109 : [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2110 : [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2111 : [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2112 : [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2113 : [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2114 : [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2115 : [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2116 : [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2117 : [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2118 : [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2119 : [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2120 : [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2121 : [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2122 : [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
2123 : [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
2124 : [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
2125 : [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
2126 : [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
2127 : [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
2128 : [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
2129 : };
2130 :
2131 : /*
2132 : * Returns the byte length of a multibyte character.
2133 : *
2134 : * Choose "mblen" functions based on the input string characteristics.
2135 : * pg_encoding_mblen() can be used when ANY of these conditions are met:
2136 : *
2137 : * - The input string is zero-terminated
2138 : *
2139 : * - The input string is known to be valid in the encoding (e.g., string
2140 : * converted from database encoding)
2141 : *
2142 : * - The encoding is not GB18030 (e.g., when only database encodings are
2143 : * passed to 'encoding' parameter)
2144 : *
2145 : * encoding==GB18030 requires examining up to two bytes to determine character
2146 : * length. Therefore, callers satisfying none of those conditions must use
2147 : * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
2148 : * guaranteed to be within allocation bounds.
2149 : *
2150 : * When dealing with text that is not certainly valid in the specified
2151 : * encoding, the result may exceed the actual remaining string length.
2152 : * Callers that are not prepared to deal with that should use Min(remaining,
2153 : * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
2154 : * pg_encoding_mblen_bounded() are interchangeable.
2155 : */
2156 : int
2157 44754272 : pg_encoding_mblen(int encoding, const char *mbstr)
2158 : {
2159 44754272 : return (PG_VALID_ENCODING(encoding) ?
2160 89508544 : pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2161 0 : pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2162 : }
2163 :
2164 : /*
2165 : * Returns the byte length of a multibyte character (possibly not
2166 : * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
2167 : */
2168 : int
2169 6410 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
2170 : size_t remaining)
2171 : {
2172 : /*
2173 : * Define zero remaining as too few, even for single-byte encodings.
2174 : * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
2175 : * zero; others read one.
2176 : */
2177 6410 : if (remaining < 1 ||
2178 338 : (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
2179 72 : return INT_MAX;
2180 6338 : return pg_encoding_mblen(encoding, mbstr);
2181 : }
2182 :
/*
 * Returns the byte length of a multibyte character, capped at the distance
 * to the terminating zero byte.  For input that might lack a terminating
 * zero, use Min(remaining, pg_encoding_mblen_or_incomplete()) instead.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	const int	nominal = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() stops early if a zero byte occurs inside the character */
	return strnlen(mbstr, nominal);
}
2193 :
2194 : /*
2195 : * Returns the display length of a multibyte character.
2196 : */
2197 : int
2198 44572678 : pg_encoding_dsplen(int encoding, const char *mbstr)
2199 : {
2200 44572678 : return (PG_VALID_ENCODING(encoding) ?
2201 89145356 : pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2202 0 : pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2203 : }
2204 :
2205 : /*
2206 : * Verify the first multibyte character of the given string.
2207 : * Return its byte length if good, -1 if bad. (See comments above for
2208 : * full details of the mbverifychar API.)
2209 : */
2210 : int
2211 9790 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2212 : {
2213 9790 : return (PG_VALID_ENCODING(encoding) ?
2214 19580 : pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2215 0 : pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2216 : }
2217 :
2218 : /*
2219 : * Verify that a string is valid for the given encoding.
2220 : * Returns the number of input bytes (<= len) that form a valid string.
2221 : * (See comments above for full details of the mbverifystr API.)
2222 : */
2223 : int
2224 462216 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2225 : {
2226 462216 : return (PG_VALID_ENCODING(encoding) ?
2227 924432 : pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2228 0 : pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2229 : }
2230 :
2231 : /*
2232 : * fetch maximum length of a given encoding
2233 : */
2234 : int
2235 1168690 : pg_encoding_max_length(int encoding)
2236 : {
2237 : Assert(PG_VALID_ENCODING(encoding));
2238 :
2239 : /*
2240 : * Check for the encoding despite the assert, due to some mingw versions
2241 : * otherwise issuing bogus warnings.
2242 : */
2243 1168690 : return PG_VALID_ENCODING(encoding) ?
2244 2337380 : pg_wchar_table[encoding].maxmblen :
2245 : pg_wchar_table[PG_SQL_ASCII].maxmblen;
2246 : }
|