LCOV - code coverage report
Current view: top level - src/common - wchar.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 441 864 51.0 %
Date: 2023-11-29 04:11:06 Functions: 51 82 62.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * wchar.c
       4             :  *    Functions for working with multibyte characters in various encodings.
       5             :  *
       6             :  * Portions Copyright (c) 1998-2023, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/common/wchar.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "c.h"
      14             : 
      15             : #include "mb/pg_wchar.h"
      16             : 
      17             : 
      18             : /*
      19             :  * Operations on multi-byte encodings are driven by a table of helper
      20             :  * functions.
      21             :  *
      22             :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
      23             :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      24             :  * and wchar2mb() conversion functions.
      25             :  *
      26             :  * These functions generally assume that their input is validly formed.
      27             :  * The "verifier" functions, further down in the file, have to be more
      28             :  * paranoid.
      29             :  *
      30             :  * We expect that mblen() does not need to examine more than the first byte
      31             :  * of the character to discover the correct length.  GB18030 is an exception
      32             :  * to that rule, though, as it also looks at second byte.  But even that
      33             :  * behaves in a predictable way, if you only pass the first byte: it will
      34             :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      35             :  * good enough for all current uses.
      36             :  *
      37             :  * Note: for the display output of psql to work properly, the return values
      38             :  * of the dsplen functions must conform to the Unicode standard. In particular
      39             :  * the NUL character is zero width and control characters are generally
      40             :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      41             :  * subset to the ASCII routines to ensure consistency.
      42             :  */
      43             : 
      44             : /*
      45             :  * SQL/ASCII
      46             :  */
      47             : static int
      48      754028 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      49             : {
      50      754028 :     int         cnt = 0;
      51             : 
      52    44977914 :     while (len > 0 && *from)
      53             :     {
      54    44223886 :         *to++ = *from++;
      55    44223886 :         len--;
      56    44223886 :         cnt++;
      57             :     }
      58      754028 :     *to = 0;
      59      754028 :     return cnt;
      60             : }
      61             : 
      /*
       * Byte length of the character at "s": always 1.  The byte itself is
       * never examined.
       */
      static int
      pg_ascii_mblen(const unsigned char *s)
      {
          (void) s;               /* length does not depend on the content */

          return 1;
      }
      67             : 
      /*
       * Display width of the single byte at "s".
       *
       * Returns 0 for NUL, -1 for bytes below 0x20 and for 0x7f, and 1 for
       * every other byte value.
       */
      static int
      pg_ascii_dsplen(const unsigned char *s)
      {
          const unsigned char ch = *s;

          if (ch == '\0')
              return 0;

          return (ch >= 0x20 && ch != 0x7f) ? 1 : -1;
      }
      78             : 
      79             : /*
      80             :  * EUC
      81             :  */
      82             : static int
      83           0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      84             : {
      85           0 :     int         cnt = 0;
      86             : 
      87           0 :     while (len > 0 && *from)
      88             :     {
      89           0 :         if (*from == SS2 && len >= 2)    /* JIS X 0201 (so called "1 byte
      90             :                                          * KANA") */
      91             :         {
      92           0 :             from++;
      93           0 :             *to = (SS2 << 8) | *from++;
      94           0 :             len -= 2;
      95             :         }
      96           0 :         else if (*from == SS3 && len >= 3)   /* JIS X 0212 KANJI */
      97             :         {
      98           0 :             from++;
      99           0 :             *to = (SS3 << 16) | (*from++ << 8);
     100           0 :             *to |= *from++;
     101           0 :             len -= 3;
     102             :         }
     103           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
     104             :         {
     105           0 :             *to = *from++ << 8;
     106           0 :             *to |= *from++;
     107           0 :             len -= 2;
     108             :         }
     109             :         else                    /* must be ASCII */
     110             :         {
     111           0 :             *to = *from++;
     112           0 :             len--;
     113             :         }
     114           0 :         to++;
     115           0 :         cnt++;
     116             :     }
     117           0 :     *to = 0;
     118           0 :     return cnt;
     119             : }
     120             : 
     121             : static inline int
     122         180 : pg_euc_mblen(const unsigned char *s)
     123             : {
     124             :     int         len;
     125             : 
     126         180 :     if (*s == SS2)
     127           0 :         len = 2;
     128         180 :     else if (*s == SS3)
     129           0 :         len = 3;
     130         180 :     else if (IS_HIGHBIT_SET(*s))
     131         108 :         len = 2;
     132             :     else
     133          72 :         len = 1;
     134         180 :     return len;
     135             : }
     136             : 
     137             : static inline int
     138           0 : pg_euc_dsplen(const unsigned char *s)
     139             : {
     140             :     int         len;
     141             : 
     142           0 :     if (*s == SS2)
     143           0 :         len = 2;
     144           0 :     else if (*s == SS3)
     145           0 :         len = 2;
     146           0 :     else if (IS_HIGHBIT_SET(*s))
     147           0 :         len = 2;
     148             :     else
     149           0 :         len = pg_ascii_dsplen(s);
     150           0 :     return len;
     151             : }
     152             : 
     153             : /*
     154             :  * EUC_JP
     155             :  */
     156             : static int
     157           0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     158             : {
     159           0 :     return pg_euc2wchar_with_len(from, to, len);
     160             : }
     161             : 
     /*
      * Byte length of an EUC_JP character: forwards to pg_euc_mblen().
      */
     static int
     pg_eucjp_mblen(const unsigned char *s)
     {
         const int   nbytes = pg_euc_mblen(s);

         return nbytes;
     }
     167             : 
     168             : static int
     169           0 : pg_eucjp_dsplen(const unsigned char *s)
     170             : {
     171             :     int         len;
     172             : 
     173           0 :     if (*s == SS2)
     174           0 :         len = 1;
     175           0 :     else if (*s == SS3)
     176           0 :         len = 2;
     177           0 :     else if (IS_HIGHBIT_SET(*s))
     178           0 :         len = 2;
     179             :     else
     180           0 :         len = pg_ascii_dsplen(s);
     181           0 :     return len;
     182             : }
     183             : 
     184             : /*
     185             :  * EUC_KR
     186             :  */
     187             : static int
     188           0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     189             : {
     190           0 :     return pg_euc2wchar_with_len(from, to, len);
     191             : }
     192             : 
     /*
      * Byte length of an EUC_KR character: forwards to pg_euc_mblen().
      */
     static int
     pg_euckr_mblen(const unsigned char *s)
     {
         const int   nbytes = pg_euc_mblen(s);

         return nbytes;
     }
     198             : 
     /*
      * Display width of an EUC_KR character: forwards to pg_euc_dsplen().
      */
     static int
     pg_euckr_dsplen(const unsigned char *s)
     {
         const int   width = pg_euc_dsplen(s);

         return width;
     }
     204             : 
     205             : /*
     206             :  * EUC_CN
     207             :  *
     208             :  */
     209             : static int
     210           0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     211             : {
     212           0 :     int         cnt = 0;
     213             : 
     214           0 :     while (len > 0 && *from)
     215             :     {
     216           0 :         if (*from == SS2 && len >= 3)    /* code set 2 (unused?) */
     217             :         {
     218           0 :             from++;
     219           0 :             *to = (SS2 << 16) | (*from++ << 8);
     220           0 :             *to |= *from++;
     221           0 :             len -= 3;
     222             :         }
     223           0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused ?) */
     224             :         {
     225           0 :             from++;
     226           0 :             *to = (SS3 << 16) | (*from++ << 8);
     227           0 :             *to |= *from++;
     228           0 :             len -= 3;
     229             :         }
     230           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
     231             :         {
     232           0 :             *to = *from++ << 8;
     233           0 :             *to |= *from++;
     234           0 :             len -= 2;
     235             :         }
     236             :         else
     237             :         {
     238           0 :             *to = *from++;
     239           0 :             len--;
     240             :         }
     241           0 :         to++;
     242           0 :         cnt++;
     243             :     }
     244           0 :     *to = 0;
     245           0 :     return cnt;
     246             : }
     247             : 
     /*
      * Byte length of an EUC_CN character, decided from its first byte only:
      * 2 when the high bit is set, otherwise 1.
      */
     static int
     pg_euccn_mblen(const unsigned char *s)
     {
         return IS_HIGHBIT_SET(*s) ? 2 : 1;
     }
     259             : 
     /*
      * Display width of an EUC_CN character, decided from its first byte only:
      * 2 when the high bit is set; everything else is handed to
      * pg_ascii_dsplen().
      */
     static int
     pg_euccn_dsplen(const unsigned char *s)
     {
         if (IS_HIGHBIT_SET(*s))
             return 2;

         return pg_ascii_dsplen(s);
     }
     271             : 
     272             : /*
     273             :  * EUC_TW
     274             :  *
     275             :  */
     276             : static int
     277           0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     278             : {
     279           0 :     int         cnt = 0;
     280             : 
     281           0 :     while (len > 0 && *from)
     282             :     {
     283           0 :         if (*from == SS2 && len >= 4)    /* code set 2 */
     284             :         {
     285           0 :             from++;
     286           0 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);
     287           0 :             *to |= *from++ << 8;
     288           0 :             *to |= *from++;
     289           0 :             len -= 4;
     290             :         }
     291           0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused?) */
     292             :         {
     293           0 :             from++;
     294           0 :             *to = (SS3 << 16) | (*from++ << 8);
     295           0 :             *to |= *from++;
     296           0 :             len -= 3;
     297             :         }
     298           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
     299             :         {
     300           0 :             *to = *from++ << 8;
     301           0 :             *to |= *from++;
     302           0 :             len -= 2;
     303             :         }
     304             :         else
     305             :         {
     306           0 :             *to = *from++;
     307           0 :             len--;
     308             :         }
     309           0 :         to++;
     310           0 :         cnt++;
     311             :     }
     312           0 :     *to = 0;
     313           0 :     return cnt;
     314             : }
     315             : 
     316             : static int
     317           0 : pg_euctw_mblen(const unsigned char *s)
     318             : {
     319             :     int         len;
     320             : 
     321           0 :     if (*s == SS2)
     322           0 :         len = 4;
     323           0 :     else if (*s == SS3)
     324           0 :         len = 3;
     325           0 :     else if (IS_HIGHBIT_SET(*s))
     326           0 :         len = 2;
     327             :     else
     328           0 :         len = 1;
     329           0 :     return len;
     330             : }
     331             : 
     332             : static int
     333           0 : pg_euctw_dsplen(const unsigned char *s)
     334             : {
     335             :     int         len;
     336             : 
     337           0 :     if (*s == SS2)
     338           0 :         len = 2;
     339           0 :     else if (*s == SS3)
     340           0 :         len = 2;
     341           0 :     else if (IS_HIGHBIT_SET(*s))
     342           0 :         len = 2;
     343             :     else
     344           0 :         len = pg_ascii_dsplen(s);
     345           0 :     return len;
     346             : }
     347             : 
     348             : /*
     349             :  * Convert pg_wchar to EUC_* encoding.
     350             :  * caller must allocate enough space for "to", including a trailing zero!
     351             :  * len: length of from.
     352             :  * "from" not necessarily null terminated.
     353             :  */
     354             : static int
     355           0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     356             : {
     357           0 :     int         cnt = 0;
     358             : 
     359           0 :     while (len > 0 && *from)
     360             :     {
     361             :         unsigned char c;
     362             : 
     363           0 :         if ((c = (*from >> 24)))
     364             :         {
     365           0 :             *to++ = c;
     366           0 :             *to++ = (*from >> 16) & 0xff;
     367           0 :             *to++ = (*from >> 8) & 0xff;
     368           0 :             *to++ = *from & 0xff;
     369           0 :             cnt += 4;
     370             :         }
     371           0 :         else if ((c = (*from >> 16)))
     372             :         {
     373           0 :             *to++ = c;
     374           0 :             *to++ = (*from >> 8) & 0xff;
     375           0 :             *to++ = *from & 0xff;
     376           0 :             cnt += 3;
     377             :         }
     378           0 :         else if ((c = (*from >> 8)))
     379             :         {
     380           0 :             *to++ = c;
     381           0 :             *to++ = *from & 0xff;
     382           0 :             cnt += 2;
     383             :         }
     384             :         else
     385             :         {
     386           0 :             *to++ = *from;
     387           0 :             cnt++;
     388             :         }
     389           0 :         from++;
     390           0 :         len--;
     391             :     }
     392           0 :     *to = 0;
     393           0 :     return cnt;
     394             : }
     395             : 
     396             : 
     397             : /*
     398             :  * JOHAB
     399             :  */
     /*
      * Byte length of a JOHAB character: forwards to pg_euc_mblen().
      */
     static int
     pg_johab_mblen(const unsigned char *s)
     {
         const int   nbytes = pg_euc_mblen(s);

         return nbytes;
     }
     405             : 
     /*
      * Display width of a JOHAB character: forwards to pg_euc_dsplen().
      */
     static int
     pg_johab_dsplen(const unsigned char *s)
     {
         const int   width = pg_euc_dsplen(s);

         return width;
     }
     411             : 
     412             : /*
     413             :  * convert UTF8 string to pg_wchar (UCS-4)
     414             :  * caller must allocate enough space for "to", including a trailing zero!
     415             :  * len: length of from.
     416             :  * "from" not necessarily null terminated.
     417             :  */
     418             : static int
     419      189116 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     420             : {
     421      189116 :     int         cnt = 0;
     422             :     uint32      c1,
     423             :                 c2,
     424             :                 c3,
     425             :                 c4;
     426             : 
     427     3727212 :     while (len > 0 && *from)
     428             :     {
     429     3538096 :         if ((*from & 0x80) == 0)
     430             :         {
     431     3538004 :             *to = *from++;
     432     3538004 :             len--;
     433             :         }
     434          92 :         else if ((*from & 0xe0) == 0xc0)
     435             :         {
     436          92 :             if (len < 2)
     437           0 :                 break;          /* drop trailing incomplete char */
     438          92 :             c1 = *from++ & 0x1f;
     439          92 :             c2 = *from++ & 0x3f;
     440          92 :             *to = (c1 << 6) | c2;
     441          92 :             len -= 2;
     442             :         }
     443           0 :         else if ((*from & 0xf0) == 0xe0)
     444             :         {
     445           0 :             if (len < 3)
     446           0 :                 break;          /* drop trailing incomplete char */
     447           0 :             c1 = *from++ & 0x0f;
     448           0 :             c2 = *from++ & 0x3f;
     449           0 :             c3 = *from++ & 0x3f;
     450           0 :             *to = (c1 << 12) | (c2 << 6) | c3;
     451           0 :             len -= 3;
     452             :         }
     453           0 :         else if ((*from & 0xf8) == 0xf0)
     454             :         {
     455           0 :             if (len < 4)
     456           0 :                 break;          /* drop trailing incomplete char */
     457           0 :             c1 = *from++ & 0x07;
     458           0 :             c2 = *from++ & 0x3f;
     459           0 :             c3 = *from++ & 0x3f;
     460           0 :             c4 = *from++ & 0x3f;
     461           0 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     462           0 :             len -= 4;
     463             :         }
     464             :         else
     465             :         {
     466             :             /* treat a bogus char as length 1; not ours to raise error */
     467           0 :             *to = *from++;
     468           0 :             len--;
     469             :         }
     470     3538096 :         to++;
     471     3538096 :         cnt++;
     472             :     }
     473      189116 :     *to = 0;
     474      189116 :     return cnt;
     475             : }
     476             : 
     477             : 
     478             : /*
     479             :  * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
     480             :  * space allocated.
     481             :  */
     482             : unsigned char *
     483      170360 : unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
     484             : {
     485      170360 :     if (c <= 0x7F)
     486             :     {
     487      170204 :         utf8string[0] = c;
     488             :     }
     489         156 :     else if (c <= 0x7FF)
     490             :     {
     491          92 :         utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
     492          92 :         utf8string[1] = 0x80 | (c & 0x3F);
     493             :     }
     494          64 :     else if (c <= 0xFFFF)
     495             :     {
     496          42 :         utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
     497          42 :         utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
     498          42 :         utf8string[2] = 0x80 | (c & 0x3F);
     499             :     }
     500             :     else
     501             :     {
     502          22 :         utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
     503          22 :         utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
     504          22 :         utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
     505          22 :         utf8string[3] = 0x80 | (c & 0x3F);
     506             :     }
     507             : 
     508      170360 :     return utf8string;
     509             : }
     510             : 
     511             : /*
     512             :  * Trivial conversion from pg_wchar to UTF-8.
     513             :  * caller should allocate enough space for "to"
     514             :  * len: length of from.
     515             :  * "from" not necessarily null terminated.
     516             :  */
     517             : static int
     518       36426 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     519             : {
     520       36426 :     int         cnt = 0;
     521             : 
     522      206558 :     while (len > 0 && *from)
     523             :     {
     524             :         int         char_len;
     525             : 
     526      170132 :         unicode_to_utf8(*from, to);
     527      170132 :         char_len = pg_utf_mblen(to);
     528      170132 :         cnt += char_len;
     529      170132 :         to += char_len;
     530      170132 :         from++;
     531      170132 :         len--;
     532             :     }
     533       36426 :     *to = 0;
     534       36426 :     return cnt;
     535             : }
     536             : 
     /*
      * Return the byte length of the UTF8 character pointed to by s, judging
      * by its first byte only.
      *
      * Note: in the current implementation we do not support UTF8 sequences
      * of more than 4 bytes; hence do NOT return a value larger than 4.
      * We return "1" for any leading byte that is either flat-out illegal or
      * indicates a length larger than we support.
      *
      * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
      * other places would need to be fixed to change this.
      */
     int
     pg_utf_mblen(const unsigned char *s)
     {
         const unsigned char lead = *s;

         if ((lead & 0x80) == 0)
             return 1;
         if ((lead & 0xe0) == 0xc0)
             return 2;
         if ((lead & 0xf0) == 0xe0)
             return 3;
         if ((lead & 0xf8) == 0xf0)
             return 4;
     #ifdef NOT_USED
         if ((lead & 0xfc) == 0xf8)
             return 5;
         if ((lead & 0xfe) == 0xfc)
             return 6;
     #endif

         /* illegal or unsupported leading byte */
         return 1;
     }
     571             : 
     572             : /*
     573             :  * This is an implementation of wcwidth() and wcswidth() as defined in
     574             :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     575             :  * <http://www.unix.org/online.html>
     576             :  *
     577             :  * Markus Kuhn -- 2001-09-08 -- public domain
     578             :  *
     579             :  * customised for PostgreSQL
     580             :  *
     581             :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     582             :  */
     583             : 
     /*
      * One entry of a code point interval table.  mbbisearch() treats both
      * ends as inclusive.
      */
     struct mbinterval
     {
         unsigned    first;          /* lowest code point in the interval */
         unsigned    last;           /* highest code point in the interval */
     };
     589             : 
     590             : /* auxiliary function for binary search in interval table */
     591             : static int
     592    28936764 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
     593             : {
     594    28936764 :     int         min = 0;
     595             :     int         mid;
     596             : 
     597    28936764 :     if (ucs < table[0].first || ucs > table[max].last)
     598    28935788 :         return 0;
     599        8560 :     while (max >= min)
     600             :     {
     601        7664 :         mid = (min + max) / 2;
     602        7664 :         if (ucs > table[mid].last)
     603        1424 :             min = mid + 1;
     604        6240 :         else if (ucs < table[mid].first)
     605        6160 :             max = mid - 1;
     606             :         else
     607          80 :             return 1;
     608             :     }
     609             : 
     610         896 :     return 0;
     611             : }
     612             : 
     613             : 
     614             : /* The following functions define the column width of an ISO 10646
     615             :  * character as follows:
     616             :  *
     617             :  *    - The null character (U+0000) has a column width of 0.
     618             :  *
     619             :  *    - Other C0/C1 control characters and DEL will lead to a return
     620             :  *      value of -1.
     621             :  *
     622             :  *    - Non-spacing and enclosing combining characters (general
     623             :  *      category code Mn, Me or Cf in the Unicode database) have a
     624             :  *      column width of 0.
     625             :  *
     626             :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     627             :  *      FullWidth (F) category as defined in Unicode Technical
     628             :  *      Report #11 have a column width of 2.
     629             :  *
     630             :  *    - All remaining characters (including all printable
     631             :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     632             :  *      etc.) have a column width of 1.
     633             :  *
     634             :  * This implementation assumes that wchar_t characters are encoded
     635             :  * in ISO 10646.
     636             :  */
     637             : 
     638             : static int
     639    14487330 : ucs_wcwidth(pg_wchar ucs)
     640             : {
     641             : #include "common/unicode_nonspacing_table.h"
     642             : #include "common/unicode_east_asian_fw_table.h"
     643             : 
     644             :     /* test for 8-bit control characters */
     645    14487330 :     if (ucs == 0)
     646           0 :         return 0;
     647             : 
     648    14487330 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
     649       18924 :         return -1;
     650             : 
     651             :     /*
     652             :      * binary search in table of non-spacing characters
     653             :      *
     654             :      * XXX: In the official Unicode sources, it is possible for a character to
     655             :      * be described as both non-spacing and wide at the same time. As of
     656             :      * Unicode 13.0, treating the non-spacing property as the determining
     657             :      * factor for display width leads to the correct behavior, so do that
     658             :      * search first.
     659             :      */
     660    14468406 :     if (mbbisearch(ucs, nonspacing,
     661             :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
     662          48 :         return 0;
     663             : 
     664             :     /* binary search in table of wide characters */
     665    14468358 :     if (mbbisearch(ucs, east_asian_fw,
     666             :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
     667          32 :         return 2;
     668             : 
     669    14468326 :     return 1;
     670             : }
     671             : 
     672             : /*
     673             :  * Convert a UTF-8 character to a Unicode code point.
     674             :  * This is a one-character version of pg_utf2wchar_with_len.
     675             :  *
     676             :  * No error checks here, c must point to a long-enough string.
     677             :  */
     678             : pg_wchar
     679    14488296 : utf8_to_unicode(const unsigned char *c)
     680             : {
     681    14488296 :     if ((*c & 0x80) == 0)
     682    14487200 :         return (pg_wchar) c[0];
     683        1096 :     else if ((*c & 0xe0) == 0xc0)
     684         998 :         return (pg_wchar) (((c[0] & 0x1f) << 6) |
     685         998 :                            (c[1] & 0x3f));
     686          98 :     else if ((*c & 0xf0) == 0xe0)
     687          64 :         return (pg_wchar) (((c[0] & 0x0f) << 12) |
     688          64 :                            ((c[1] & 0x3f) << 6) |
     689          64 :                            (c[2] & 0x3f));
     690          34 :     else if ((*c & 0xf8) == 0xf0)
     691          34 :         return (pg_wchar) (((c[0] & 0x07) << 18) |
     692          34 :                            ((c[1] & 0x3f) << 12) |
     693          34 :                            ((c[2] & 0x3f) << 6) |
     694          34 :                            (c[3] & 0x3f));
     695             :     else
     696             :         /* that is an invalid code on purpose */
     697           0 :         return 0xffffffff;
     698             : }
     699             : 
     700             : static int
     701    14487330 : pg_utf_dsplen(const unsigned char *s)
     702             : {
     703    14487330 :     return ucs_wcwidth(utf8_to_unicode(s));
     704             : }
     705             : 
     706             : /*
     707             :  * convert mule internal code to pg_wchar
     708             :  * caller should allocate enough space for "to"
     709             :  * len: length of from.
     710             :  * "from" not necessarily null terminated.
     711             :  */
     712             : static int
     713           0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     714             : {
     715           0 :     int         cnt = 0;
     716             : 
     717           0 :     while (len > 0 && *from)
     718             :     {
     719           0 :         if (IS_LC1(*from) && len >= 2)
     720             :         {
     721           0 :             *to = *from++ << 16;
     722           0 :             *to |= *from++;
     723           0 :             len -= 2;
     724             :         }
     725           0 :         else if (IS_LCPRV1(*from) && len >= 3)
     726             :         {
     727           0 :             from++;
     728           0 :             *to = *from++ << 16;
     729           0 :             *to |= *from++;
     730           0 :             len -= 3;
     731             :         }
     732           0 :         else if (IS_LC2(*from) && len >= 3)
     733             :         {
     734           0 :             *to = *from++ << 16;
     735           0 :             *to |= *from++ << 8;
     736           0 :             *to |= *from++;
     737           0 :             len -= 3;
     738             :         }
     739           0 :         else if (IS_LCPRV2(*from) && len >= 4)
     740             :         {
     741           0 :             from++;
     742           0 :             *to = *from++ << 16;
     743           0 :             *to |= *from++ << 8;
     744           0 :             *to |= *from++;
     745           0 :             len -= 4;
     746             :         }
     747             :         else
     748             :         {                       /* assume ASCII */
     749           0 :             *to = (unsigned char) *from++;
     750           0 :             len--;
     751             :         }
     752           0 :         to++;
     753           0 :         cnt++;
     754             :     }
     755           0 :     *to = 0;
     756           0 :     return cnt;
     757             : }
     758             : 
     759             : /*
     760             :  * convert pg_wchar to mule internal code
     761             :  * caller should allocate enough space for "to"
     762             :  * len: length of from.
     763             :  * "from" not necessarily null terminated.
     764             :  */
     765             : static int
     766           0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     767             : {
     768           0 :     int         cnt = 0;
     769             : 
     770           0 :     while (len > 0 && *from)
     771             :     {
     772             :         unsigned char lb;
     773             : 
     774           0 :         lb = (*from >> 16) & 0xff;
     775           0 :         if (IS_LC1(lb))
     776             :         {
     777           0 :             *to++ = lb;
     778           0 :             *to++ = *from & 0xff;
     779           0 :             cnt += 2;
     780             :         }
     781           0 :         else if (IS_LC2(lb))
     782             :         {
     783           0 :             *to++ = lb;
     784           0 :             *to++ = (*from >> 8) & 0xff;
     785           0 :             *to++ = *from & 0xff;
     786           0 :             cnt += 3;
     787             :         }
     788           0 :         else if (IS_LCPRV1_A_RANGE(lb))
     789             :         {
     790           0 :             *to++ = LCPRV1_A;
     791           0 :             *to++ = lb;
     792           0 :             *to++ = *from & 0xff;
     793           0 :             cnt += 3;
     794             :         }
     795           0 :         else if (IS_LCPRV1_B_RANGE(lb))
     796             :         {
     797           0 :             *to++ = LCPRV1_B;
     798           0 :             *to++ = lb;
     799           0 :             *to++ = *from & 0xff;
     800           0 :             cnt += 3;
     801             :         }
     802           0 :         else if (IS_LCPRV2_A_RANGE(lb))
     803             :         {
     804           0 :             *to++ = LCPRV2_A;
     805           0 :             *to++ = lb;
     806           0 :             *to++ = (*from >> 8) & 0xff;
     807           0 :             *to++ = *from & 0xff;
     808           0 :             cnt += 4;
     809             :         }
     810           0 :         else if (IS_LCPRV2_B_RANGE(lb))
     811             :         {
     812           0 :             *to++ = LCPRV2_B;
     813           0 :             *to++ = lb;
     814           0 :             *to++ = (*from >> 8) & 0xff;
     815           0 :             *to++ = *from & 0xff;
     816           0 :             cnt += 4;
     817             :         }
     818             :         else
     819             :         {
     820           0 :             *to++ = *from & 0xff;
     821           0 :             cnt += 1;
     822             :         }
     823           0 :         from++;
     824           0 :         len--;
     825             :     }
     826           0 :     *to = 0;
     827           0 :     return cnt;
     828             : }
     829             : 
     830             : /* exported for direct use by conv.c */
     831             : int
     832        2952 : pg_mule_mblen(const unsigned char *s)
     833             : {
     834             :     int         len;
     835             : 
     836        2952 :     if (IS_LC1(*s))
     837        1188 :         len = 2;
     838        1764 :     else if (IS_LCPRV1(*s))
     839           0 :         len = 3;
     840        1764 :     else if (IS_LC2(*s))
     841        1710 :         len = 3;
     842          54 :     else if (IS_LCPRV2(*s))
     843           0 :         len = 4;
     844             :     else
     845          54 :         len = 1;                /* assume ASCII */
     846        2952 :     return len;
     847             : }
     848             : 
     849             : static int
     850           0 : pg_mule_dsplen(const unsigned char *s)
     851             : {
     852             :     int         len;
     853             : 
     854             :     /*
     855             :      * Note: it's not really appropriate to assume that all multibyte charsets
     856             :      * are double-wide on screen.  But this seems an okay approximation for
     857             :      * the MULE charsets we currently support.
     858             :      */
     859             : 
     860           0 :     if (IS_LC1(*s))
     861           0 :         len = 1;
     862           0 :     else if (IS_LCPRV1(*s))
     863           0 :         len = 1;
     864           0 :     else if (IS_LC2(*s))
     865           0 :         len = 2;
     866           0 :     else if (IS_LCPRV2(*s))
     867           0 :         len = 2;
     868             :     else
     869           0 :         len = 1;                /* assume ASCII */
     870             : 
     871           0 :     return len;
     872             : }
     873             : 
     874             : /*
     875             :  * ISO8859-1
     876             :  */
     877             : static int
     878         958 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     879             : {
     880         958 :     int         cnt = 0;
     881             : 
     882       27764 :     while (len > 0 && *from)
     883             :     {
     884       26806 :         *to++ = *from++;
     885       26806 :         len--;
     886       26806 :         cnt++;
     887             :     }
     888         958 :     *to = 0;
     889         958 :     return cnt;
     890             : }
     891             : 
     892             : /*
     893             :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     894             :  * high bits.
     895             :  * caller should allocate enough space for "to"
     896             :  * len: length of from.
     897             :  * "from" not necessarily null terminated.
     898             :  */
     899             : static int
     900       10032 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     901             : {
     902       10032 :     int         cnt = 0;
     903             : 
     904      113212 :     while (len > 0 && *from)
     905             :     {
     906      103180 :         *to++ = *from++;
     907      103180 :         len--;
     908      103180 :         cnt++;
     909             :     }
     910       10032 :     *to = 0;
     911       10032 :     return cnt;
     912             : }
     913             : 
     914             : static int
     915        4316 : pg_latin1_mblen(const unsigned char *s)
     916             : {
     917        4316 :     return 1;
     918             : }
     919             : 
     920             : static int
     921         800 : pg_latin1_dsplen(const unsigned char *s)
     922             : {
     923         800 :     return pg_ascii_dsplen(s);
     924             : }
     925             : 
     926             : /*
     927             :  * SJIS
     928             :  */
     929             : static int
     930         972 : pg_sjis_mblen(const unsigned char *s)
     931             : {
     932             :     int         len;
     933             : 
     934         972 :     if (*s >= 0xa1 && *s <= 0xdf)
     935           0 :         len = 1;                /* 1 byte kana? */
     936         972 :     else if (IS_HIGHBIT_SET(*s))
     937         864 :         len = 2;                /* kanji? */
     938             :     else
     939         108 :         len = 1;                /* should be ASCII */
     940         972 :     return len;
     941             : }
     942             : 
     943             : static int
     944           0 : pg_sjis_dsplen(const unsigned char *s)
     945             : {
     946             :     int         len;
     947             : 
     948           0 :     if (*s >= 0xa1 && *s <= 0xdf)
     949           0 :         len = 1;                /* 1 byte kana? */
     950           0 :     else if (IS_HIGHBIT_SET(*s))
     951           0 :         len = 2;                /* kanji? */
     952             :     else
     953           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     954           0 :     return len;
     955             : }
     956             : 
     957             : /*
     958             :  * Big5
     959             :  */
     960             : static int
     961         468 : pg_big5_mblen(const unsigned char *s)
     962             : {
     963             :     int         len;
     964             : 
     965         468 :     if (IS_HIGHBIT_SET(*s))
     966         414 :         len = 2;                /* kanji? */
     967             :     else
     968          54 :         len = 1;                /* should be ASCII */
     969         468 :     return len;
     970             : }
     971             : 
     972             : static int
     973           0 : pg_big5_dsplen(const unsigned char *s)
     974             : {
     975             :     int         len;
     976             : 
     977           0 :     if (IS_HIGHBIT_SET(*s))
     978           0 :         len = 2;                /* kanji? */
     979             :     else
     980           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     981           0 :     return len;
     982             : }
     983             : 
     984             : /*
     985             :  * GBK
     986             :  */
     987             : static int
     988           0 : pg_gbk_mblen(const unsigned char *s)
     989             : {
     990             :     int         len;
     991             : 
     992           0 :     if (IS_HIGHBIT_SET(*s))
     993           0 :         len = 2;                /* kanji? */
     994             :     else
     995           0 :         len = 1;                /* should be ASCII */
     996           0 :     return len;
     997             : }
     998             : 
     999             : static int
    1000           0 : pg_gbk_dsplen(const unsigned char *s)
    1001             : {
    1002             :     int         len;
    1003             : 
    1004           0 :     if (IS_HIGHBIT_SET(*s))
    1005           0 :         len = 2;                /* kanji? */
    1006             :     else
    1007           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1008           0 :     return len;
    1009             : }
    1010             : 
    1011             : /*
    1012             :  * UHC
    1013             :  */
    1014             : static int
    1015           0 : pg_uhc_mblen(const unsigned char *s)
    1016             : {
    1017             :     int         len;
    1018             : 
    1019           0 :     if (IS_HIGHBIT_SET(*s))
    1020           0 :         len = 2;                /* 2byte? */
    1021             :     else
    1022           0 :         len = 1;                /* should be ASCII */
    1023           0 :     return len;
    1024             : }
    1025             : 
    1026             : static int
    1027           0 : pg_uhc_dsplen(const unsigned char *s)
    1028             : {
    1029             :     int         len;
    1030             : 
    1031           0 :     if (IS_HIGHBIT_SET(*s))
    1032           0 :         len = 2;                /* 2byte? */
    1033             :     else
    1034           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1035           0 :     return len;
    1036             : }
    1037             : 
    1038             : /*
    1039             :  * GB18030
    1040             :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
    1041             :  */
    1042             : 
    1043             : /*
    1044             :  * Unlike all other mblen() functions, this also looks at the second byte of
    1045             :  * the input.  However, if you only pass the first byte of a multi-byte
    1046             :  * string, and \0 as the second byte, this still works in a predictable way:
    1047             :  * a 4-byte character will be reported as two 2-byte characters.  That's
    1048             :  * enough for all current uses, as a client-only encoding.  It works that
    1049             :  * way, because in any valid 4-byte GB18030-encoded character, the third and
    1050             :  * fourth byte look like a 2-byte encoded character, when looked at
    1051             :  * separately.
    1052             :  */
    1053             : static int
    1054         162 : pg_gb18030_mblen(const unsigned char *s)
    1055             : {
    1056             :     int         len;
    1057             : 
    1058         162 :     if (!IS_HIGHBIT_SET(*s))
    1059          36 :         len = 1;                /* ASCII */
    1060         126 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1061         126 :         len = 4;
    1062             :     else
    1063           0 :         len = 2;
    1064         162 :     return len;
    1065             : }
    1066             : 
    1067             : static int
    1068           0 : pg_gb18030_dsplen(const unsigned char *s)
    1069             : {
    1070             :     int         len;
    1071             : 
    1072           0 :     if (IS_HIGHBIT_SET(*s))
    1073           0 :         len = 2;
    1074             :     else
    1075           0 :         len = pg_ascii_dsplen(s);   /* ASCII */
    1076           0 :     return len;
    1077             : }
    1078             : 
    1079             : /*
    1080             :  *-------------------------------------------------------------------
    1081             :  * multibyte sequence validators
    1082             :  *
    1083             :  * The verifychar functions accept "s", a pointer to the first byte of a
    1084             :  * string, and "len", the remaining length of the string.  If there is a
    1085             :  * validly encoded character beginning at *s, return its length in bytes;
    1086             :  * else return -1.
    1087             :  *
    1088             :  * The verifystr functions also accept "s", a pointer to a string and "len",
    1089             :  * the length of the string.  They verify the whole string, and return the
    1090             :  * number of input bytes (<= len) that are valid.  In other words, if the
    1091             :  * whole string is valid, verifystr returns "len", otherwise it returns the
    1092             :  * byte offset of the first invalid character.  The verifystr functions must
    1093             :  * test for and reject zeroes in the input.
    1094             :  *
    1095             :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
    1096             :  * they must test for and reject zeroes in any additional bytes of a
    1097             :  * multibyte character.  Note that this definition allows the function for a
    1098             :  * single-byte encoding to be just "return 1".
    1099             :  *-------------------------------------------------------------------
    1100             :  */
    1101             : static int
    1102        1364 : pg_ascii_verifychar(const unsigned char *s, int len)
    1103             : {
    1104        1364 :     return 1;
    1105             : }
    1106             : 
    1107             : static int
    1108      917234 : pg_ascii_verifystr(const unsigned char *s, int len)
    1109             : {
    1110      917234 :     const unsigned char *nullpos = memchr(s, 0, len);
    1111             : 
    1112      917234 :     if (nullpos == NULL)
    1113      917234 :         return len;
    1114             :     else
    1115           0 :         return nullpos - s;
    1116             : }
    1117             : 
    1118             : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
    1119             : 
    1120             : static int
    1121         432 : pg_eucjp_verifychar(const unsigned char *s, int len)
    1122             : {
    1123             :     int         l;
    1124             :     unsigned char c1,
    1125             :                 c2;
    1126             : 
    1127         432 :     c1 = *s++;
    1128             : 
    1129         432 :     switch (c1)
    1130             :     {
    1131           0 :         case SS2:               /* JIS X 0201 */
    1132           0 :             l = 2;
    1133           0 :             if (l > len)
    1134           0 :                 return -1;
    1135           0 :             c2 = *s++;
    1136           0 :             if (c2 < 0xa1 || c2 > 0xdf)
    1137           0 :                 return -1;
    1138           0 :             break;
    1139             : 
    1140           0 :         case SS3:               /* JIS X 0212 */
    1141           0 :             l = 3;
    1142           0 :             if (l > len)
    1143           0 :                 return -1;
    1144           0 :             c2 = *s++;
    1145           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1146           0 :                 return -1;
    1147           0 :             c2 = *s++;
    1148           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1149           0 :                 return -1;
    1150           0 :             break;
    1151             : 
    1152         432 :         default:
    1153         432 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1154             :             {
    1155         432 :                 l = 2;
    1156         432 :                 if (l > len)
    1157          72 :                     return -1;
    1158         360 :                 if (!IS_EUC_RANGE_VALID(c1))
    1159           0 :                     return -1;
    1160         360 :                 c2 = *s++;
    1161         360 :                 if (!IS_EUC_RANGE_VALID(c2))
    1162         144 :                     return -1;
    1163             :             }
    1164             :             else
    1165             :                 /* must be ASCII */
    1166             :             {
    1167           0 :                 l = 1;
    1168             :             }
    1169         216 :             break;
    1170             :     }
    1171             : 
    1172         216 :     return l;
    1173             : }
    1174             : 
    1175             : static int
    1176         264 : pg_eucjp_verifystr(const unsigned char *s, int len)
    1177             : {
    1178         264 :     const unsigned char *start = s;
    1179             : 
    1180         894 :     while (len > 0)
    1181             :     {
    1182             :         int         l;
    1183             : 
    1184             :         /* fast path for ASCII-subset characters */
    1185         810 :         if (!IS_HIGHBIT_SET(*s))
    1186             :         {
    1187         594 :             if (*s == '\0')
    1188          72 :                 break;
    1189         522 :             l = 1;
    1190             :         }
    1191             :         else
    1192             :         {
    1193         216 :             l = pg_eucjp_verifychar(s, len);
    1194         216 :             if (l == -1)
    1195         108 :                 break;
    1196             :         }
    1197         630 :         s += l;
    1198         630 :         len -= l;
    1199             :     }
    1200             : 
    1201         264 :     return s - start;
    1202             : }
    1203             : 
    1204             : static int
    1205           0 : pg_euckr_verifychar(const unsigned char *s, int len)
    1206             : {
    1207             :     int         l;
    1208             :     unsigned char c1,
    1209             :                 c2;
    1210             : 
    1211           0 :     c1 = *s++;
    1212             : 
    1213           0 :     if (IS_HIGHBIT_SET(c1))
    1214             :     {
    1215           0 :         l = 2;
    1216           0 :         if (l > len)
    1217           0 :             return -1;
    1218           0 :         if (!IS_EUC_RANGE_VALID(c1))
    1219           0 :             return -1;
    1220           0 :         c2 = *s++;
    1221           0 :         if (!IS_EUC_RANGE_VALID(c2))
    1222           0 :             return -1;
    1223             :     }
    1224             :     else
    1225             :         /* must be ASCII */
    1226             :     {
    1227           0 :         l = 1;
    1228             :     }
    1229             : 
    1230           0 :     return l;
    1231             : }
    1232             : 
    1233             : static int
    1234          24 : pg_euckr_verifystr(const unsigned char *s, int len)
    1235             : {
    1236          24 :     const unsigned char *start = s;
    1237             : 
    1238          96 :     while (len > 0)
    1239             :     {
    1240             :         int         l;
    1241             : 
    1242             :         /* fast path for ASCII-subset characters */
    1243          72 :         if (!IS_HIGHBIT_SET(*s))
    1244             :         {
    1245          72 :             if (*s == '\0')
    1246           0 :                 break;
    1247          72 :             l = 1;
    1248             :         }
    1249             :         else
    1250             :         {
    1251           0 :             l = pg_euckr_verifychar(s, len);
    1252           0 :             if (l == -1)
    1253           0 :                 break;
    1254             :         }
    1255          72 :         s += l;
    1256          72 :         len -= l;
    1257             :     }
    1258             : 
    1259          24 :     return s - start;
    1260             : }
    1261             : 
    1262             : /* EUC-CN byte sequences are exactly same as EUC-KR */
    1263             : #define pg_euccn_verifychar pg_euckr_verifychar
    1264             : #define pg_euccn_verifystr  pg_euckr_verifystr
    1265             : 
    1266             : static int
    1267           0 : pg_euctw_verifychar(const unsigned char *s, int len)
    1268             : {
    1269             :     int         l;
    1270             :     unsigned char c1,
    1271             :                 c2;
    1272             : 
    1273           0 :     c1 = *s++;
    1274             : 
    1275           0 :     switch (c1)
    1276             :     {
    1277           0 :         case SS2:               /* CNS 11643 Plane 1-7 */
    1278           0 :             l = 4;
    1279           0 :             if (l > len)
    1280           0 :                 return -1;
    1281           0 :             c2 = *s++;
    1282           0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1283           0 :                 return -1;
    1284           0 :             c2 = *s++;
    1285           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1286           0 :                 return -1;
    1287           0 :             c2 = *s++;
    1288           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1289           0 :                 return -1;
    1290           0 :             break;
    1291             : 
    1292           0 :         case SS3:               /* unused */
    1293           0 :             return -1;
    1294             : 
    1295           0 :         default:
    1296           0 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1297             :             {
    1298           0 :                 l = 2;
    1299           0 :                 if (l > len)
    1300           0 :                     return -1;
    1301             :                 /* no further range check on c1? */
    1302           0 :                 c2 = *s++;
    1303           0 :                 if (!IS_EUC_RANGE_VALID(c2))
    1304           0 :                     return -1;
    1305             :             }
    1306             :             else
    1307             :                 /* must be ASCII */
    1308             :             {
    1309           0 :                 l = 1;
    1310             :             }
    1311           0 :             break;
    1312             :     }
    1313           0 :     return l;
    1314             : }
    1315             : 
    1316             : static int
    1317          18 : pg_euctw_verifystr(const unsigned char *s, int len)
    1318             : {
    1319          18 :     const unsigned char *start = s;
    1320             : 
    1321          72 :     while (len > 0)
    1322             :     {
    1323             :         int         l;
    1324             : 
    1325             :         /* fast path for ASCII-subset characters */
    1326          54 :         if (!IS_HIGHBIT_SET(*s))
    1327             :         {
    1328          54 :             if (*s == '\0')
    1329           0 :                 break;
    1330          54 :             l = 1;
    1331             :         }
    1332             :         else
    1333             :         {
    1334           0 :             l = pg_euctw_verifychar(s, len);
    1335           0 :             if (l == -1)
    1336           0 :                 break;
    1337             :         }
    1338          54 :         s += l;
    1339          54 :         len -= l;
    1340             :     }
    1341             : 
    1342          18 :     return s - start;
    1343             : }
    1344             : 
    1345             : static int
    1346           0 : pg_johab_verifychar(const unsigned char *s, int len)
    1347             : {
    1348             :     int         l,
    1349             :                 mbl;
    1350             :     unsigned char c;
    1351             : 
    1352           0 :     l = mbl = pg_johab_mblen(s);
    1353             : 
    1354           0 :     if (len < l)
    1355           0 :         return -1;
    1356             : 
    1357           0 :     if (!IS_HIGHBIT_SET(*s))
    1358           0 :         return mbl;
    1359             : 
    1360           0 :     while (--l > 0)
    1361             :     {
    1362           0 :         c = *++s;
    1363           0 :         if (!IS_EUC_RANGE_VALID(c))
    1364           0 :             return -1;
    1365             :     }
    1366           0 :     return mbl;
    1367             : }
    1368             : 
    1369             : static int
    1370           6 : pg_johab_verifystr(const unsigned char *s, int len)
    1371             : {
    1372           6 :     const unsigned char *start = s;
    1373             : 
    1374          24 :     while (len > 0)
    1375             :     {
    1376             :         int         l;
    1377             : 
    1378             :         /* fast path for ASCII-subset characters */
    1379          18 :         if (!IS_HIGHBIT_SET(*s))
    1380             :         {
    1381          18 :             if (*s == '\0')
    1382           0 :                 break;
    1383          18 :             l = 1;
    1384             :         }
    1385             :         else
    1386             :         {
    1387           0 :             l = pg_johab_verifychar(s, len);
    1388           0 :             if (l == -1)
    1389           0 :                 break;
    1390             :         }
    1391          18 :         s += l;
    1392          18 :         len -= l;
    1393             :     }
    1394             : 
    1395           6 :     return s - start;
    1396             : }
    1397             : 
    1398             : static int
    1399        1296 : pg_mule_verifychar(const unsigned char *s, int len)
    1400             : {
    1401             :     int         l,
    1402             :                 mbl;
    1403             :     unsigned char c;
    1404             : 
    1405        1296 :     l = mbl = pg_mule_mblen(s);
    1406             : 
    1407        1296 :     if (len < l)
    1408         324 :         return -1;
    1409             : 
    1410        1998 :     while (--l > 0)
    1411             :     {
    1412        1314 :         c = *++s;
    1413        1314 :         if (!IS_HIGHBIT_SET(c))
    1414         288 :             return -1;
    1415             :     }
    1416         684 :     return mbl;
    1417             : }
    1418             : 
                       /*
                        * Verify a MULE_INTERNAL string of len bytes starting at s.
                        *
                        * Returns the number of leading bytes that verified OK.  Scanning stops
                        * early at a zero byte, or at the first character for which
                        * pg_mule_verifychar() returns -1.
                        */
    1419             : static int
    1420         378 : pg_mule_verifystr(const unsigned char *s, int len)
    1421             : {
    1422         378 :     const unsigned char *start = s;
    1423             : 
    1424        1062 :     while (len > 0)
    1425             :     {
    1426             :         int         l;
    1427             : 
    1428             :         /* fast path for ASCII-subset characters */
    1429         900 :         if (!IS_HIGHBIT_SET(*s))
    1430             :         {
                                   /* a zero byte ends the verified prefix */
    1431         522 :             if (*s == '\0')
    1432          36 :                 break;
    1433         486 :             l = 1;
    1434             :         }
    1435             :         else
    1436             :         {
    1437         378 :             l = pg_mule_verifychar(s, len);
    1438         378 :             if (l == -1)
    1439         180 :                 break;
    1440             :         }
    1441         684 :         s += l;
    1442         684 :         len -= l;
    1443             :     }
    1444             : 
                           /* s now points just past the last verified character */
    1445         378 :     return s - start;
    1446             : }
    1447             : 
                       /*
                        * Character verifier used for the single-byte table entries.
                        *
                        * Always reports a 1-byte character; neither s nor len is examined.
                        */
    1448             : static int
    1449         244 : pg_latin1_verifychar(const unsigned char *s, int len)
    1450             : {
    1451         244 :     return 1;
    1452             : }
    1453             : 
                       /*
                        * String verifier used for the single-byte table entries.
                        *
                        * Returns the offset of the first zero byte within the first len bytes of
                        * s, or len if there is no zero byte.
                        */
    1454             : static int
    1455       10876 : pg_latin1_verifystr(const unsigned char *s, int len)
    1456             : {
    1457       10876 :     const unsigned char *nullpos = memchr(s, 0, len);
    1458             : 
                           /* no zero byte found: the whole input is accepted */
    1459       10876 :     if (nullpos == NULL)
    1460       10768 :         return len;
    1461             :     else
    1462         108 :         return nullpos - s;
    1463             : }
    1464             : 
                       /*
                        * Verify one SJIS character at s, with len bytes available.
                        *
                        * The expected byte length is taken from pg_sjis_mblen().  Returns -1 if
                        * len is shorter than that.  A length of 1 is accepted as is; otherwise the
                        * first byte must satisfy ISSJISHEAD() and the second ISSJISTAIL(), else
                        * -1 is returned.
                        *
                        * NOTE(review): only s[0] and s[1] are inspected on the multibyte path;
                        * presumably pg_sjis_mblen() never reports more than 2 -- confirm.
                        */
    1465             : static int
    1466         702 : pg_sjis_verifychar(const unsigned char *s, int len)
    1467             : {
    1468             :     int         l,
    1469             :                 mbl;
    1470             :     unsigned char c1,
    1471             :                 c2;
    1472             : 
    1473         702 :     l = mbl = pg_sjis_mblen(s);
    1474             : 
                           /* the character would run past the end of the input */
    1475         702 :     if (len < l)
    1476         108 :         return -1;
    1477             : 
    1478         594 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1479           0 :         return mbl;
    1480             : 
                           /* check the lead/trail byte pair */
    1481         594 :     c1 = *s++;
    1482         594 :     c2 = *s;
    1483         594 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1484         216 :         return -1;
    1485         378 :     return mbl;
    1486             : }
    1487             : 
                       /*
                        * Verify an SJIS string of len bytes starting at s.
                        *
                        * Returns the number of leading bytes that verified OK.  Scanning stops
                        * early at a zero byte, or at the first character for which
                        * pg_sjis_verifychar() returns -1.
                        */
    1488             : static int
    1489         282 : pg_sjis_verifystr(const unsigned char *s, int len)
    1490             : {
    1491         282 :     const unsigned char *start = s;
    1492             : 
    1493        1254 :     while (len > 0)
    1494             :     {
    1495             :         int         l;
    1496             : 
    1497             :         /* fast path for ASCII-subset characters */
    1498        1152 :         if (!IS_HIGHBIT_SET(*s))
    1499             :         {
                                   /* a zero byte ends the verified prefix */
    1500         918 :             if (*s == '\0')
    1501          72 :                 break;
    1502         846 :             l = 1;
    1503             :         }
    1504             :         else
    1505             :         {
    1506         234 :             l = pg_sjis_verifychar(s, len);
    1507         234 :             if (l == -1)
    1508         108 :                 break;
    1509             :         }
    1510         972 :         s += l;
    1511         972 :         len -= l;
    1512             :     }
    1513             : 
                           /* s now points just past the last verified character */
    1514         282 :     return s - start;
    1515             : }
    1516             : 
                       /*
                        * Verify one Big5 character at s, with len bytes available.
                        *
                        * The expected byte length is taken from pg_big5_mblen().  Returns -1 if
                        * len is shorter than that, or if any byte after the first is zero;
                        * otherwise returns the length.  No other range check is applied to the
                        * trailing bytes here.
                        */
    1517             : static int
    1518         342 : pg_big5_verifychar(const unsigned char *s, int len)
    1519             : {
    1520             :     int         l,
    1521             :                 mbl;
    1522             : 
    1523         342 :     l = mbl = pg_big5_mblen(s);
    1524             : 
                           /* the character would run past the end of the input */
    1525         342 :     if (len < l)
    1526           0 :         return -1;
    1527             : 
                           /* a zero byte inside a multibyte character is rejected */
    1528         576 :     while (--l > 0)
    1529             :     {
    1530         342 :         if (*++s == '\0')
    1531         108 :             return -1;
    1532             :     }
    1533             : 
    1534         234 :     return mbl;
    1535             : }
    1536             : 
                       /*
                        * Verify a Big5 string of len bytes starting at s.
                        *
                        * Returns the number of leading bytes that verified OK.  Scanning stops
                        * early at a zero byte, or at the first character for which
                        * pg_big5_verifychar() returns -1.
                        */
    1537             : static int
    1538         144 : pg_big5_verifystr(const unsigned char *s, int len)
    1539             : {
    1540         144 :     const unsigned char *start = s;
    1541             : 
    1542         648 :     while (len > 0)
    1543             :     {
    1544             :         int         l;
    1545             : 
    1546             :         /* fast path for ASCII-subset characters */
    1547         576 :         if (!IS_HIGHBIT_SET(*s))
    1548             :         {
                                   /* a zero byte ends the verified prefix */
    1549         468 :             if (*s == '\0')
    1550          36 :                 break;
    1551         432 :             l = 1;
    1552             :         }
    1553             :         else
    1554             :         {
    1555         108 :             l = pg_big5_verifychar(s, len);
    1556         108 :             if (l == -1)
    1557          36 :                 break;
    1558             :         }
    1559         504 :         s += l;
    1560         504 :         len -= l;
    1561             :     }
    1562             : 
                           /* s now points just past the last verified character */
    1563         144 :     return s - start;
    1564             : }
    1565             : 
                       /*
                        * Verify one GBK character at s, with len bytes available.
                        *
                        * The expected byte length is taken from pg_gbk_mblen().  Returns -1 if
                        * len is shorter than that, or if any byte after the first is zero;
                        * otherwise returns the length.  No other range check is applied to the
                        * trailing bytes here.
                        */
    1566             : static int
    1567           0 : pg_gbk_verifychar(const unsigned char *s, int len)
    1568             : {
    1569             :     int         l,
    1570             :                 mbl;
    1571             : 
    1572           0 :     l = mbl = pg_gbk_mblen(s);
    1573             : 
                           /* the character would run past the end of the input */
    1574           0 :     if (len < l)
    1575           0 :         return -1;
    1576             : 
                           /* a zero byte inside a multibyte character is rejected */
    1577           0 :     while (--l > 0)
    1578             :     {
    1579           0 :         if (*++s == '\0')
    1580           0 :             return -1;
    1581             :     }
    1582             : 
    1583           0 :     return mbl;
    1584             : }
    1585             : 
                       /*
                        * Verify a GBK string of len bytes starting at s.
                        *
                        * Returns the number of leading bytes that verified OK.  Scanning stops
                        * early at a zero byte, or at the first character for which
                        * pg_gbk_verifychar() returns -1.
                        */
    1586             : static int
    1587           6 : pg_gbk_verifystr(const unsigned char *s, int len)
    1588             : {
    1589           6 :     const unsigned char *start = s;
    1590             : 
    1591          24 :     while (len > 0)
    1592             :     {
    1593             :         int         l;
    1594             : 
    1595             :         /* fast path for ASCII-subset characters */
    1596          18 :         if (!IS_HIGHBIT_SET(*s))
    1597             :         {
                                   /* a zero byte ends the verified prefix */
    1598          18 :             if (*s == '\0')
    1599           0 :                 break;
    1600          18 :             l = 1;
    1601             :         }
    1602             :         else
    1603             :         {
    1604           0 :             l = pg_gbk_verifychar(s, len);
    1605           0 :             if (l == -1)
    1606           0 :                 break;
    1607             :         }
    1608          18 :         s += l;
    1609          18 :         len -= l;
    1610             :     }
    1611             : 
                           /* s now points just past the last verified character */
    1612           6 :     return s - start;
    1613             : }
    1614             : 
                       /*
                        * Verify one UHC character at s, with len bytes available.
                        *
                        * The expected byte length is taken from pg_uhc_mblen().  Returns -1 if
                        * len is shorter than that, or if any byte after the first is zero;
                        * otherwise returns the length.  No other range check is applied to the
                        * trailing bytes here.
                        */
    1615             : static int
    1616           0 : pg_uhc_verifychar(const unsigned char *s, int len)
    1617             : {
    1618             :     int         l,
    1619             :                 mbl;
    1620             : 
    1621           0 :     l = mbl = pg_uhc_mblen(s);
    1622             : 
                           /* the character would run past the end of the input */
    1623           0 :     if (len < l)
    1624           0 :         return -1;
    1625             : 
                           /* a zero byte inside a multibyte character is rejected */
    1626           0 :     while (--l > 0)
    1627             :     {
    1628           0 :         if (*++s == '\0')
    1629           0 :             return -1;
    1630             :     }
    1631             : 
    1632           0 :     return mbl;
    1633             : }
    1634             : 
                       /*
                        * Verify a UHC string of len bytes starting at s.
                        *
                        * Returns the number of leading bytes that verified OK.  Scanning stops
                        * early at a zero byte, or at the first character for which
                        * pg_uhc_verifychar() returns -1.
                        */
    1635             : static int
    1636           6 : pg_uhc_verifystr(const unsigned char *s, int len)
    1637             : {
    1638           6 :     const unsigned char *start = s;
    1639             : 
    1640          24 :     while (len > 0)
    1641             :     {
    1642             :         int         l;
    1643             : 
    1644             :         /* fast path for ASCII-subset characters */
    1645          18 :         if (!IS_HIGHBIT_SET(*s))
    1646             :         {
                                   /* a zero byte ends the verified prefix */
    1647          18 :             if (*s == '\0')
    1648           0 :                 break;
    1649          18 :             l = 1;
    1650             :         }
    1651             :         else
    1652             :         {
    1653           0 :             l = pg_uhc_verifychar(s, len);
    1654           0 :             if (l == -1)
    1655           0 :                 break;
    1656             :         }
    1657          18 :         s += l;
    1658          18 :         len -= l;
    1659             :     }
    1660             : 
                           /* s now points just past the last verified character */
    1661           6 :     return s - start;
    1662             : }
    1663             : 
                       /*
                        * Verify one GB18030 character at s, with len bytes available.
                        *
                        * Unlike the other verifiers here, the length is derived from the bytes
                        * themselves rather than from an mblen() call:
                        *
                        *  - a first byte that fails IS_HIGHBIT_SET() is a 1-byte character;
                        *  - with at least 4 bytes available and a second byte in 0x30..0x39, the
                        *    character must be 4 bytes: 0x81..0xfe, 0x30..0x39, 0x81..0xfe,
                        *    0x30..0x39;
                        *  - otherwise, with at least 2 bytes available and a first byte in
                        *    0x81..0xfe, it must be 2 bytes with the second in 0x40..0x7e or
                        *    0x80..0xfe.
                        *
                        * Returns the length (1, 2 or 4), or -1 if none of the forms matches.
                        * Each len test precedes the byte tests it guards, so no byte at or beyond
                        * s + len is read.
                        */
    1664             : static int
    1665         414 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1666             : {
    1667             :     int         l;
    1668             : 
    1669         414 :     if (!IS_HIGHBIT_SET(*s))
    1670           0 :         l = 1;                  /* ASCII */
    1671         414 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1672             :     {
    1673             :         /* Should be 4-byte, validate remaining bytes */
    1674         306 :         if (*s >= 0x81 && *s <= 0xfe &&
    1675         306 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1676         306 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1677         162 :             l = 4;
    1678             :         else
    1679         144 :             l = -1;
    1680             :     }
    1681         108 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1682             :     {
    1683             :         /* Should be 2-byte, validate */
                               /*
                                * A second byte in 0x30..0x39 (a 4-byte form with fewer than 4
                                * bytes left) fails both ranges below and is rejected here.
                                */
    1684         108 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1685         108 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1686          36 :             l = 2;
    1687             :         else
    1688          72 :             l = -1;
    1689             :     }
    1690             :     else
    1691           0 :         l = -1;
    1692         414 :     return l;
    1693             : }
    1694             : 
                       /*
                        * Verify a GB18030 string of len bytes starting at s.
                        *
                        * Returns the number of leading bytes that verified OK.  Scanning stops
                        * early at a zero byte, or at the first character for which
                        * pg_gb18030_verifychar() returns -1.
                        */
    1695             : static int
    1696         222 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1697             : {
    1698         222 :     const unsigned char *start = s;
    1699             : 
    1700         978 :     while (len > 0)
    1701             :     {
    1702             :         int         l;
    1703             : 
    1704             :         /* fast path for ASCII-subset characters */
    1705         900 :         if (!IS_HIGHBIT_SET(*s))
    1706             :         {
                                   /* a zero byte ends the verified prefix */
    1707         702 :             if (*s == '\0')
    1708          36 :                 break;
    1709         666 :             l = 1;
    1710             :         }
    1711             :         else
    1712             :         {
    1713         198 :             l = pg_gb18030_verifychar(s, len);
    1714         198 :             if (l == -1)
    1715         108 :                 break;
    1716             :         }
    1717         756 :         s += l;
    1718         756 :         len -= l;
    1719             :     }
    1720             : 
                           /* s now points just past the last verified character */
    1721         222 :     return s - start;
    1722             : }
    1723             : 
                       /*
                        * Verify one UTF-8 character at s, with len bytes available.
                        *
                        * The candidate length is derived from the high bits of the first byte.
                        * Returns that length if len covers it and pg_utf8_islegal() accepts the
                        * sequence; returns -1 otherwise.  A zero byte is rejected with -1.
                        */
    1724             : static int
    1725        2726 : pg_utf8_verifychar(const unsigned char *s, int len)
    1726             : {
    1727             :     int         l;
    1728             : 
                           /* 0xxxxxxx: single byte, handled entirely here */
    1729        2726 :     if ((*s & 0x80) == 0)
    1730             :     {
    1731           0 :         if (*s == '\0')
    1732           0 :             return -1;
    1733           0 :         return 1;
    1734             :     }
                           /* 110xxxxx / 1110xxxx / 11110xxx: 2-, 3- and 4-byte lead bytes */
    1735        2726 :     else if ((*s & 0xe0) == 0xc0)
    1736         662 :         l = 2;
    1737        2064 :     else if ((*s & 0xf0) == 0xe0)
    1738        1176 :         l = 3;
    1739         888 :     else if ((*s & 0xf8) == 0xf0)
    1740         624 :         l = 4;
    1741             :     else
                               /*
                                * Any other high-bit byte (10xxxxxx or 11111xxx) is passed on with
                                * length 1; the first-byte checks in pg_utf8_islegal() reject it.
                                */
    1742         264 :         l = 1;
    1743             : 
                           /* the character would run past the end of the input */
    1744        2726 :     if (l > len)
    1745         180 :         return -1;
    1746             : 
    1747        2546 :     if (!pg_utf8_islegal(s, l))
    1748        1812 :         return -1;
    1749             : 
    1750         734 :     return l;
    1751             : }
    1752             : 
    1753             : /*
    1754             :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1755             :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1756             :  * input byte and current state are used to compute an index into an array of
    1757             :  * state transitions. Since the address of the next transition is dependent
    1758             :  * on this computation, there is latency in executing the load instruction,
    1759             :  * and the CPU is not kept busy.
    1760             :  *
    1761             :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1762             :  *
    1763             :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1764             :  *
    1765             :  * In a shift-based DFA, the input byte is an index into an array of integers
    1766             :  * whose bit pattern encodes the state transitions. To compute the next
    1767             :  * state, we simply right-shift the integer by the current state and apply a
    1768             :  * mask. In this scheme, the address of the transition only depends on the
    1769             :  * input byte, so there is better pipelining.
    1770             :  *
    1771             :  * The naming convention for states and transitions was adopted from a UTF-8
    1772             :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1773             :  *
    1774             :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1775             :  *
    1776             :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1777             :  * ==========================================================================
    1778             :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1779             :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1780             :  *                                                                  |
    1781             :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1782             :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1783             :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1784             :  *                                                                  |
    1785             :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1786             :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1787             :  *                                                                  |
    1788             :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1789             :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1790             :  *
    1791             :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1792             :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1793             :  * it's possible to find state numbers such that the transitions fit within
    1794             :  * 32-bit integers, as Dougall Johnson demonstrated:
    1795             :  *
    1796             :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1797             :  *
    1798             :  * This packed representation is the reason for the seemingly odd choice of
    1799             :  * state values below.
    1800             :  */
    1801             : 
                       /*
                        * State numbers double as bit offsets: utf8_advance() shifts a transition
                        * word right by the current state and keeps the low 5 bits as the next
                        * state.
                        */
    1802             : /* Error */
    1803             : #define ERR  0
    1804             : /* Begin */
    1805             : #define BGN 11
    1806             : /* Continuation states, expect 1/2/3 continuation bytes */
    1807             : #define CS1 16
    1808             : #define CS2  1
    1809             : #define CS3  5
    1810             : /* Partial states, where the first continuation byte has a restricted range */
    1811             : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
    1812             : #define P3B 20                  /* Lead was ED, check for surrogate */
    1813             : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
    1814             : #define P4B 30                  /* Lead was F4, check for too-large */
    1815             : /* Begin and End are the same state */
    1816             : #define END BGN
    1817             : 
    1818             : /* the encoded state transitions for the lookup table */
    1819             : 
                       /*
                        * Each class below is the OR of (next_state << current_state) terms.  The
                        * ASCII and lead-byte classes define a transition out of BGN only.
                        */
    1820             : /* ASCII */
    1821             : #define ASC (END << BGN)
    1822             : /* 2-byte lead */
    1823             : #define L2A (CS1 << BGN)
    1824             : /* 3-byte lead */
    1825             : #define L3A (P3A << BGN)
    1826             : #define L3B (CS2 << BGN)
    1827             : #define L3C (P3B << BGN)
    1828             : /* 4-byte lead */
    1829             : #define L4A (P4A << BGN)
    1830             : #define L4B (CS3 << BGN)
    1831             : #define L4C (P4B << BGN)
    1832             : /* continuation byte */
                       /*
                        * CR1, CR2 and CR3 share the CS1/CS2/CS3 terms and differ only in which
                        * partial states (P3A, P3B, P4A, P4B) they give a transition for.
                        *
                        * NOTE(review): these three expansions are not wrapped in outer
                        * parentheses.  The only use visible here is as a standalone initializer
                        * element of Utf8Transition[], where that is harmless, but they would
                        * misparse if combined with another operator.
                        */
    1833             : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1834             : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1835             : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1836             : /* invalid byte */
    1837             : #define ILL ERR
    1838             : 
                       /*
                        * Transition table for the UTF-8 DFA, indexed by input byte value.  Each
                        * entry is the packed transition word of that byte's character class;
                        * utf8_advance() extracts the next state from it.
                        */
    1839             : static const uint32 Utf8Transition[256] =
    1840             : {
    1841             :     /* ASCII */
    1842             : 
                           /* 00..1F; byte 00 is ILL, so a zero byte is never accepted */
    1843             :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1844             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1845             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1846             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1847             : 
                           /* 20..3F */
    1848             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1849             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1850             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1851             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1852             : 
                           /* 40..5F */
    1853             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1854             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1855             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1856             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1857             : 
                           /* 60..7F */
    1858             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1859             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1860             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1861             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1862             : 
    1863             :     /* continuation bytes */
    1864             : 
    1865             :     /* 80..8F */
    1866             :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1867             :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1868             : 
    1869             :     /* 90..9F */
    1870             :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1871             :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1872             : 
    1873             :     /* A0..BF */
    1874             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1875             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1876             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1877             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1878             : 
    1879             :     /* leading bytes */
    1880             : 
    1881             :     /* C0..DF */
                           /* C0 and C1 are ILL; C2..DF are 2-byte leads */
    1882             :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1883             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1884             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1885             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1886             : 
    1887             :     /* E0..EF */
                           /* E0 is L3A and ED is L3C; the other 3-byte leads are L3B */
    1888             :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1889             :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1890             : 
    1891             :     /* F0..FF */
                           /* F0 is L4A, F1..F3 are L4B, F4 is L4C; F5..FF are ILL */
    1892             :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1893             :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1894             : };
    1895             : 
                       /*
                        * Run len bytes starting at s through the UTF-8 DFA.
                        *
                        * *state is the DFA state on entry and is updated in place.  Inside the
                        * loop it may carry bits above the low 5; only the final mask reduces it
                        * to a plain state number, which is what the caller compares against ERR
                        * and END.
                        */
    1896             : static void
    1897        1278 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1898             : {
    1899             :     /* Note: We deliberately don't check the state's value here. */
    1900       42174 :     while (len > 0)
    1901             :     {
    1902             :         /*
    1903             :          * It's important that the mask value is 31: In most instruction sets,
    1904             :          * a shift by a 32-bit operand is understood to be a shift by its mod
    1905             :          * 32, so the compiler should elide the mask operation.
    1906             :          */
    1907       40896 :         *state = Utf8Transition[*s++] >> (*state & 31);
    1908       40896 :         len--;
    1909             :     }
    1910             : 
                           /* drop the leftover high bits, keeping just the state number */
    1911        1278 :     *state &= 31;
    1912        1278 : }
    1913             : 
                       /*
                        * Verify a UTF-8 string of len bytes starting at s.
                        *
                        * Returns the number of leading bytes that verified OK.
                        *
                        * Inputs of at least STRIDE_LENGTH bytes are first scanned in
                        * STRIDE_LENGTH-sized chunks: a chunk is skipped if the DFA is in the END
                        * state and is_valid_ascii() accepts it, and is otherwise fed through
                        * utf8_advance().  That pass only tracks the DFA state, not a byte count,
                        * so on error the whole input is redone by the per-character loop at the
                        * bottom, which also handles the tail shorter than STRIDE_LENGTH.
                        */
    1914             : static int
    1915      678962 : pg_utf8_verifystr(const unsigned char *s, int len)
    1916             : {
    1917      678962 :     const unsigned char *start = s;
    1918      678962 :     const int   orig_len = len;
    1919      678962 :     uint32      state = BGN;
    1920             : 
    1921             : /*
    1922             :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1923             :  * the compiler can unroll a longer loop, it's not worth it because we
    1924             :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1925             :  */
    1926             : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1927             : 
                           /*
                            * NOTE(review): STRIDE_LENGTH has type size_t, so len is converted to
                            * unsigned for these comparisons; a negative len would compare as a huge
                            * value.  Presumably callers never pass len < 0 -- confirm.
                            */
    1928      678962 :     if (len >= STRIDE_LENGTH)
    1929             :     {
    1930     1079842 :         while (len >= STRIDE_LENGTH)
    1931             :         {
    1932             :             /*
    1933             :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1934             :              * but we must first check for a non-END state, which means the
    1935             :              * previous chunk ended in the middle of a multibyte sequence.
    1936             :              */
    1937      944330 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1938        1278 :                 utf8_advance(s, &state, STRIDE_LENGTH);
    1939             : 
    1940      944330 :             s += STRIDE_LENGTH;
    1941      944330 :             len -= STRIDE_LENGTH;
    1942             :         }
    1943             : 
    1944             :         /* The error state persists, so we only need to check for it here. */
    1945      135512 :         if (state == ERR)
    1946             :         {
    1947             :             /*
    1948             :              * Start over from the beginning with the slow path so we can
    1949             :              * count the valid bytes.
    1950             :              */
    1951         504 :             len = orig_len;
    1952         504 :             s = start;
    1953             :         }
    1954      135008 :         else if (state != END)
    1955             :         {
    1956             :             /*
    1957             :              * The fast path exited in the middle of a multibyte sequence.
    1958             :              * Walk backwards to find the leading byte so that the slow path
    1959             :              * can resume checking from there. We must always backtrack at
    1960             :              * least one byte, since the current byte could be e.g. an ASCII
    1961             :              * byte after a 2-byte lead, which is invalid.
    1962             :              */
    1963             :             do
    1964             :             {
    1965             :                 Assert(s > start);
    1966          78 :                 s--;
    1967          78 :                 len++;
    1968             :                 Assert(IS_HIGHBIT_SET(*s));
    1969          78 :             } while (pg_utf_mblen(s) <= 1);
    1970             :         }
    1971             :     }
    1972             : 
    1973             :     /* check remaining bytes */
    1974     7105106 :     while (len > 0)
    1975             :     {
    1976             :         int         l;
    1977             : 
    1978             :         /* fast path for ASCII-subset characters */
    1979     6428312 :         if (!IS_HIGHBIT_SET(*s))
    1980             :         {
                                   /* a zero byte ends the verified prefix */
    1981     6425586 :             if (*s == '\0')
    1982         176 :                 break;
    1983     6425410 :             l = 1;
    1984             :         }
    1985             :         else
    1986             :         {
    1987        2726 :             l = pg_utf8_verifychar(s, len);
    1988        2726 :             if (l == -1)
    1989        1992 :                 break;
    1990             :         }
    1991     6426144 :         s += l;
    1992     6426144 :         len -= l;
    1993             :     }
    1994             : 
                           /* s now points just past the last verified character */
    1995      678962 :     return s - start;
    1996             : }
    1997             : 
    1998             : /*
    1999             :  * Check for validity of a single UTF-8 encoded character
    2000             :  *
    2001             :  * This directly implements the rules in RFC3629.  The bizarre-looking
    2002             :  * restrictions on the second byte are meant to ensure that there isn't
    2003             :  * more than one encoding of a given Unicode character point; that is,
    2004             :  * you may not use a longer-than-necessary byte sequence with high order
    2005             :  * zero bits to represent a character that would fit in fewer bytes.
    2006             :  * To do otherwise is to create security hazards (eg, create an apparent
    2007             :  * non-ASCII character that decodes to plain ASCII).
    2008             :  *
    2009             :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    2010             :  * caller must have checked that that many bytes are present in the buffer.
                        *
                        * Returns true if the length bytes at source pass every check below, false
                        * otherwise.  The switch is entered at the case for length and falls
                        * through, so bytes are examined from the last one back to the first.
    2011             :  */
    2012             : bool
    2013        8154 : pg_utf8_islegal(const unsigned char *source, int length)
    2014             : {
    2015             :     unsigned char a;
    2016             : 
    2017        8154 :     switch (length)
    2018             :     {
    2019           0 :         default:
    2020             :             /* reject lengths 5 and 6 for now */
    2021           0 :             return false;
    2022         588 :         case 4:
                                   /* fourth byte must be in 80..BF */
    2023         588 :             a = source[3];
    2024         588 :             if (a < 0x80 || a > 0xBF)
    2025          96 :                 return false;
    2026             :             /* FALL THRU */
    2027             :         case 3:
                                   /* third byte must be in 80..BF */
    2028        3254 :             a = source[2];
    2029        3254 :             if (a < 0x80 || a > 0xBF)
    2030         600 :                 return false;
    2031             :             /* FALL THRU */
    2032             :         case 2:
                                   /* the allowed range of the second byte depends on the first */
    2033        3836 :             a = source[1];
    2034        3836 :             switch (*source)
    2035             :             {
    2036         312 :                 case 0xE0:
    2037         312 :                     if (a < 0xA0 || a > 0xBF)
    2038         264 :                         return false;
    2039          48 :                     break;
    2040         312 :                 case 0xED:
    2041         312 :                     if (a < 0x80 || a > 0x9F)
    2042         264 :                         return false;
    2043          48 :                     break;
    2044         312 :                 case 0xF0:
    2045         312 :                     if (a < 0x90 || a > 0xBF)
    2046         264 :                         return false;
    2047          48 :                     break;
    2048         180 :                 case 0xF4:
    2049         180 :                     if (a < 0x80 || a > 0x8F)
    2050         132 :                         return false;
    2051          48 :                     break;
    2052        2720 :                 default:
    2053        2720 :                     if (a < 0x80 || a > 0xBF)
    2054          96 :                         return false;
    2055        2624 :                     break;
    2056             :             }
    2057             :             /* FALL THRU */
    2058        6438 :         case 1:
                                   /* first byte: reject 80..C1 and anything above F4 */
    2059        6438 :             a = *source;
    2060        6438 :             if (a >= 0x80 && a < 0xC2)
    2061         396 :                 return false;
    2062        6042 :             if (a > 0xF4)
    2063         132 :                 return false;
    2064        5910 :             break;
    2065             :     }
    2066        5910 :     return true;
    2067             : }
    2068             : 
    2069             : 
    2070             : /*
    2071             :  *-------------------------------------------------------------------
    2072             :  * encoding info table
    2073             :  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
    2074             :  *-------------------------------------------------------------------
    2075             :  */
    2076             : const pg_wchar_tbl pg_wchar_table[] = {
    2077             :     {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},   /* PG_SQL_ASCII */
    2078             :     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},  /* PG_EUC_JP */
    2079             :     {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},  /* PG_EUC_CN */
    2080             :     {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},  /* PG_EUC_KR */
    2081             :     {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},  /* PG_EUC_TW */
    2082             :     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},  /* PG_EUC_JIS_2004 */
    2083             :     {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},  /* PG_UTF8 */
    2084             :     {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},  /* PG_MULE_INTERNAL */
    2085             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN1 */
    2086             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN2 */
    2087             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN3 */
    2088             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN4 */
    2089             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN5 */
    2090             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN6 */
    2091             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN7 */
    2092             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN8 */
    2093             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN9 */
    2094             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN10 */
    2095             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1256 */
    2096             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1258 */
    2097             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN866 */
    2098             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN874 */
    2099             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_KOI8R */
    2100             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1251 */
    2101             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1252 */
    2102             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-5 */
    2103             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-6 */
    2104             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-7 */
    2105             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-8 */
    2106             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1250 */
    2107             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1253 */
    2108             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1254 */
    2109             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1255 */
    2110             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1257 */
    2111             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_KOI8U */
    2112             :     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},    /* PG_SJIS */
    2113             :     {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},    /* PG_BIG5 */
    2114             :     {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},    /* PG_GBK */
    2115             :     {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},    /* PG_UHC */
    2116             :     {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},    /* PG_GB18030 */
    2117             :     {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},    /* PG_JOHAB */
    2118             :     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2} /* PG_SHIFT_JIS_2004 */
    2119             : };
    2120             : 
    2121             : /*
    2122             :  * Returns the byte length of a multibyte character.
    2123             :  *
    2124             :  * Caution: when dealing with text that is not certainly valid in the
    2125             :  * specified encoding, the result may exceed the actual remaining
    2126             :  * string length.  Callers that are not prepared to deal with that
    2127             :  * should use pg_encoding_mblen_bounded() instead.
    2128             :  */
    2129             : int
    2130    48430874 : pg_encoding_mblen(int encoding, const char *mbstr)
    2131             : {
    2132    48430874 :     return (PG_VALID_ENCODING(encoding) ?
    2133    96861748 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
    2134           0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
    2135             : }
    2136             : 
    2137             : /*
    2138             :  * Returns the byte length of a multibyte character; but not more than
    2139             :  * the distance to end of string.
    2140             :  */
    2141             : int
    2142         148 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    2143             : {
    2144         148 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
    2145             : }
    2146             : 
    2147             : /*
    2148             :  * Returns the display length of a multibyte character.
    2149             :  */
    2150             : int
    2151    48268960 : pg_encoding_dsplen(int encoding, const char *mbstr)
    2152             : {
    2153    48268960 :     return (PG_VALID_ENCODING(encoding) ?
    2154    96537920 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
    2155           0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
    2156             : }
    2157             : 
    2158             : /*
    2159             :  * Verify the first multibyte character of the given string.
    2160             :  * Return its byte length if good, -1 if bad.  (See comments above for
    2161             :  * full details of the mbverifychar API.)
    2162             :  */
    2163             : int
    2164        2250 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    2165             : {
    2166        2250 :     return (PG_VALID_ENCODING(encoding) ?
    2167        4500 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
    2168           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
    2169             : }
    2170             : 
    2171             : /*
    2172             :  * Verify that a string is valid for the given encoding.
    2173             :  * Returns the number of input bytes (<= len) that form a valid string.
    2174             :  * (See comments above for full details of the mbverifystr API.)
    2175             :  */
    2176             : int
    2177      433298 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2178             : {
    2179      433298 :     return (PG_VALID_ENCODING(encoding) ?
    2180      866596 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
    2181           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
    2182             : }
    2183             : 
    2184             : /*
    2185             :  * fetch maximum length of a given encoding
    2186             :  */
    2187             : int
    2188      778586 : pg_encoding_max_length(int encoding)
    2189             : {
    2190             :     Assert(PG_VALID_ENCODING(encoding));
    2191             : 
    2192      778586 :     return pg_wchar_table[encoding].maxmblen;
    2193             : }

Generated by: LCOV version 1.14