LCOV - code coverage report
Current view: top level - src/common - wchar.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 78.7 % 868 683
Test Date: 2026-03-12 06:14:44 Functions: 82.9 % 82 68
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * wchar.c
       4              :  *    Functions for working with multibyte characters in various encodings.
       5              :  *
       6              :  * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    src/common/wchar.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : #include "c.h"
      14              : 
      15              : #include <limits.h>
      16              : 
      17              : #include "mb/pg_wchar.h"
      18              : #include "utils/ascii.h"
      19              : 
      20              : 
      21              : /*
      22              :  * In today's multibyte encodings other than UTF8, this two-byte sequence
      23              :  * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
      24              :  *
      25              :  * For historical reasons, several verifychar implementations opt to reject
      26              :  * this pair specifically.  Byte pair range constraints, in encoding
      27              :  * originator documentation, always excluded this pair.  No core conversion
      28              :  * could translate it.  However, longstanding verifychar implementations
      29              :  * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
      30              :  * pairs not valid per encoding originator documentation.  To avoid tightening
      31              :  * core or non-core conversions in a security patch, we sought this one pair.
      32              :  *
      33              :  * PQescapeString() historically used spaces for BYTE1; many other values
      34              :  * could suffice for BYTE1.
      35              :  */
      36              : #define NONUTF8_INVALID_BYTE0 (0x8d)
      37              : #define NONUTF8_INVALID_BYTE1 (' ')
      38              : 
      39              : 
      40              : /*
      41              :  * Operations on multi-byte encodings are driven by a table of helper
      42              :  * functions.
      43              :  *
      44              :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
      45              :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      46              :  * and wchar2mb() conversion functions.
      47              :  *
      48              :  * These functions generally assume that their input is validly formed.
      49              :  * The "verifier" functions, further down in the file, have to be more
      50              :  * paranoid.
      51              :  *
      52              :  * We expect that mblen() does not need to examine more than the first byte
      53              :  * of the character to discover the correct length.  GB18030 is an exception
      54              :  * to that rule, though, as it also looks at the second byte.  But even that
      55              :  * behaves in a predictable way, if you only pass the first byte: it will
      56              :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      57              :  * good enough for all current uses.
      58              :  *
      59              :  * Note: for the display output of psql to work properly, the return values
      60              :  * of the dsplen functions must conform to the Unicode standard. In particular
      61              :  * the NUL character is zero width and control characters are generally
      62              :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      63              :  * subset to the ASCII routines to ensure consistency.
      64              :  */
      65              : 
      66              : /* No error-reporting facility.  Ignore incomplete trailing byte sequence. */
      67              : #define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break
      68              : 
      69              : /*
      70              :  * SQL/ASCII
      71              :  */
      72              : static int
      73          409 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      74              : {
      75          409 :     int         cnt = 0;
      76              : 
      77        31967 :     while (len > 0 && *from)
      78              :     {
      79        31558 :         *to++ = *from++;
      80        31558 :         len--;
      81        31558 :         cnt++;
      82              :     }
      83          409 :     *to = 0;
      84          409 :     return cnt;
      85              : }
      86              : 
      87              : static int
      88        18978 : pg_ascii_mblen(const unsigned char *s)
      89              : {
      90        18978 :     return 1;
      91              : }
      92              : 
      93              : static int
      94        17473 : pg_ascii_dsplen(const unsigned char *s)
      95              : {
      96        17473 :     if (*s == '\0')
      97            0 :         return 0;
      98        17473 :     if (*s < 0x20 || *s == 0x7f)
      99            2 :         return -1;
     100              : 
     101        17471 :     return 1;
     102              : }
     103              : 
     104              : /*
     105              :  * EUC
     106              :  */
     107              : static int
     108           24 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     109              : {
     110           24 :     int         cnt = 0;
     111              : 
     112           36 :     while (len > 0 && *from)
     113              :     {
     114           24 :         if (*from == SS2)       /* JIS X 0201 (so called "1 byte KANA") */
     115              :         {
     116            6 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     117            3 :             from++;
     118            3 :             *to = (SS2 << 8) | *from++;
     119            3 :             len -= 2;
     120              :         }
     121           18 :         else if (*from == SS3)  /* JIS X 0212 KANJI */
     122              :         {
     123            9 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     124            3 :             from++;
     125            3 :             *to = (SS3 << 16) | (*from++ << 8);
     126            3 :             *to |= *from++;
     127            3 :             len -= 3;
     128              :         }
     129            9 :         else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
     130              :         {
     131            6 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     132            3 :             *to = *from++ << 8;
     133            3 :             *to |= *from++;
     134            3 :             len -= 2;
     135              :         }
     136              :         else                    /* must be ASCII */
     137              :         {
     138            3 :             *to = *from++;
     139            3 :             len--;
     140              :         }
     141           12 :         to++;
     142           12 :         cnt++;
     143              :     }
     144           24 :     *to = 0;
     145           24 :     return cnt;
     146              : }
     147              : 
     148              : static inline int
     149          117 : pg_euc_mblen(const unsigned char *s)
     150              : {
     151              :     int         len;
     152              : 
     153          117 :     if (*s == SS2)
     154            0 :         len = 2;
     155          117 :     else if (*s == SS3)
     156            0 :         len = 3;
     157          117 :     else if (IS_HIGHBIT_SET(*s))
     158           81 :         len = 2;
     159              :     else
     160           36 :         len = 1;
     161          117 :     return len;
     162              : }
     163              : 
     164              : static inline int
     165            0 : pg_euc_dsplen(const unsigned char *s)
     166              : {
     167              :     int         len;
     168              : 
     169            0 :     if (*s == SS2)
     170            0 :         len = 2;
     171            0 :     else if (*s == SS3)
     172            0 :         len = 2;
     173            0 :     else if (IS_HIGHBIT_SET(*s))
     174            0 :         len = 2;
     175              :     else
     176            0 :         len = pg_ascii_dsplen(s);
     177            0 :     return len;
     178              : }
     179              : 
     180              : /*
     181              :  * EUC_JP
     182              :  */
     183              : static int
     184           24 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     185              : {
     186           24 :     return pg_euc2wchar_with_len(from, to, len);
     187              : }
     188              : 
     189              : static int
     190          102 : pg_eucjp_mblen(const unsigned char *s)
     191              : {
     192          102 :     return pg_euc_mblen(s);
     193              : }
     194              : 
     195              : static int
     196            0 : pg_eucjp_dsplen(const unsigned char *s)
     197              : {
     198              :     int         len;
     199              : 
     200            0 :     if (*s == SS2)
     201            0 :         len = 1;
     202            0 :     else if (*s == SS3)
     203            0 :         len = 2;
     204            0 :     else if (IS_HIGHBIT_SET(*s))
     205            0 :         len = 2;
     206              :     else
     207            0 :         len = pg_ascii_dsplen(s);
     208            0 :     return len;
     209              : }
     210              : 
     211              : /*
     212              :  * EUC_KR
     213              :  */
     214              : static int
     215            0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     216              : {
     217            0 :     return pg_euc2wchar_with_len(from, to, len);
     218              : }
     219              : 
     220              : static int
     221            3 : pg_euckr_mblen(const unsigned char *s)
     222              : {
     223            3 :     return pg_euc_mblen(s);
     224              : }
     225              : 
     226              : static int
     227            0 : pg_euckr_dsplen(const unsigned char *s)
     228              : {
     229            0 :     return pg_euc_dsplen(s);
     230              : }
     231              : 
     232              : /*
     233              :  * EUC_CN
     234              :  *
     235              :  */
     236              : static int
     237           27 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     238              : {
     239           27 :     int         cnt = 0;
     240              : 
     241           39 :     while (len > 0 && *from)
     242              :     {
     243           27 :         if (*from == SS2)       /* code set 2 (unused?) */
     244              :         {
     245            9 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     246            3 :             from++;
     247            3 :             *to = (SS2 << 16) | (*from++ << 8);
     248            3 :             *to |= *from++;
     249            3 :             len -= 3;
     250              :         }
     251           18 :         else if (*from == SS3)  /* code set 3 (unused ?) */
     252              :         {
     253            9 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     254            3 :             from++;
     255            3 :             *to = (SS3 << 16) | (*from++ << 8);
     256            3 :             *to |= *from++;
     257            3 :             len -= 3;
     258              :         }
     259            9 :         else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
     260              :         {
     261            6 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     262            3 :             *to = *from++ << 8;
     263            3 :             *to |= *from++;
     264            3 :             len -= 2;
     265              :         }
     266              :         else
     267              :         {
     268            3 :             *to = *from++;
     269            3 :             len--;
     270              :         }
     271           12 :         to++;
     272           12 :         cnt++;
     273              :     }
     274           27 :     *to = 0;
     275           27 :     return cnt;
     276              : }
     277              : 
     278              : /*
     279              :  * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
     280              :  * EUC_CN), but mb2wchar_with_len does.  Tell a coherent story for code that
     281              :  * relies on agreement between mb2wchar_with_len and mblen.  Invalid text
     282              :  * datums (e.g. from shared catalogs) reach this.
     283              :  */
     284              : static int
     285            3 : pg_euccn_mblen(const unsigned char *s)
     286              : {
     287              :     int         len;
     288              : 
     289            3 :     if (*s == SS2)
     290            0 :         len = 3;
     291            3 :     else if (*s == SS3)
     292            0 :         len = 3;
     293            3 :     else if (IS_HIGHBIT_SET(*s))
     294            3 :         len = 2;
     295              :     else
     296            0 :         len = 1;
     297            3 :     return len;
     298              : }
     299              : 
     300              : static int
     301            0 : pg_euccn_dsplen(const unsigned char *s)
     302              : {
     303              :     int         len;
     304              : 
     305            0 :     if (IS_HIGHBIT_SET(*s))
     306            0 :         len = 2;
     307              :     else
     308            0 :         len = pg_ascii_dsplen(s);
     309            0 :     return len;
     310              : }
     311              : 
     312              : /*
     313              :  * EUC_TW
     314              :  *
     315              :  */
     316              : static int
     317           30 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     318              : {
     319           30 :     int         cnt = 0;
     320              : 
     321           42 :     while (len > 0 && *from)
     322              :     {
     323           30 :         if (*from == SS2)       /* code set 2 */
     324              :         {
     325           12 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     326            3 :             from++;
     327            3 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);
     328            3 :             *to |= *from++ << 8;
     329            3 :             *to |= *from++;
     330            3 :             len -= 4;
     331              :         }
     332           18 :         else if (*from == SS3)  /* code set 3 (unused?) */
     333              :         {
     334            9 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     335            3 :             from++;
     336            3 :             *to = (SS3 << 16) | (*from++ << 8);
     337            3 :             *to |= *from++;
     338            3 :             len -= 3;
     339              :         }
     340            9 :         else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
     341              :         {
     342            6 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     343            3 :             *to = *from++ << 8;
     344            3 :             *to |= *from++;
     345            3 :             len -= 2;
     346              :         }
     347              :         else
     348              :         {
     349            3 :             *to = *from++;
     350            3 :             len--;
     351              :         }
     352           12 :         to++;
     353           12 :         cnt++;
     354              :     }
     355           30 :     *to = 0;
     356           30 :     return cnt;
     357              : }
     358              : 
     359              : static int
     360            3 : pg_euctw_mblen(const unsigned char *s)
     361              : {
     362              :     int         len;
     363              : 
     364            3 :     if (*s == SS2)
     365            0 :         len = 4;
     366            3 :     else if (*s == SS3)
     367            0 :         len = 3;
     368            3 :     else if (IS_HIGHBIT_SET(*s))
     369            3 :         len = 2;
     370              :     else
     371            0 :         len = 1;
     372            3 :     return len;
     373              : }
     374              : 
     375              : static int
     376            0 : pg_euctw_dsplen(const unsigned char *s)
     377              : {
     378              :     int         len;
     379              : 
     380            0 :     if (*s == SS2)
     381            0 :         len = 2;
     382            0 :     else if (*s == SS3)
     383            0 :         len = 2;
     384            0 :     else if (IS_HIGHBIT_SET(*s))
     385            0 :         len = 2;
     386              :     else
     387            0 :         len = pg_ascii_dsplen(s);
     388            0 :     return len;
     389              : }
     390              : 
     391              : /*
     392              :  * Convert pg_wchar to EUC_* encoding.
     393              :  * caller must allocate enough space for "to", including a trailing zero!
     394              :  * len: length of from.
     395              :  * "from" not necessarily null terminated.
     396              :  */
     397              : static int
     398           36 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     399              : {
     400           36 :     int         cnt = 0;
     401              : 
     402           72 :     while (len > 0 && *from)
     403              :     {
     404              :         unsigned char c;
     405              : 
     406           36 :         if ((c = (*from >> 24)))
     407              :         {
     408            3 :             *to++ = c;
     409            3 :             *to++ = (*from >> 16) & 0xff;
     410            3 :             *to++ = (*from >> 8) & 0xff;
     411            3 :             *to++ = *from & 0xff;
     412            3 :             cnt += 4;
     413              :         }
     414           33 :         else if ((c = (*from >> 16)))
     415              :         {
     416           12 :             *to++ = c;
     417           12 :             *to++ = (*from >> 8) & 0xff;
     418           12 :             *to++ = *from & 0xff;
     419           12 :             cnt += 3;
     420              :         }
     421           21 :         else if ((c = (*from >> 8)))
     422              :         {
     423           12 :             *to++ = c;
     424           12 :             *to++ = *from & 0xff;
     425           12 :             cnt += 2;
     426              :         }
     427              :         else
     428              :         {
     429            9 :             *to++ = *from;
     430            9 :             cnt++;
     431              :         }
     432           36 :         from++;
     433           36 :         len--;
     434              :     }
     435           36 :     *to = 0;
     436           36 :     return cnt;
     437              : }
     438              : 
     439              : 
     440              : /*
     441              :  * JOHAB
     442              :  */
     443              : static int
     444           12 : pg_johab_mblen(const unsigned char *s)
     445              : {
     446           12 :     return pg_euc_mblen(s);
     447              : }
     448              : 
     449              : static int
     450            0 : pg_johab_dsplen(const unsigned char *s)
     451              : {
     452            0 :     return pg_euc_dsplen(s);
     453              : }
     454              : 
     455              : /*
     456              :  * convert UTF8 string to pg_wchar (UCS-4)
     457              :  * caller must allocate enough space for "to", including a trailing zero!
     458              :  * len: length of from.
     459              :  * "from" not necessarily null terminated.
     460              :  */
     461              : static int
     462      5123723 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     463              : {
     464      5123723 :     int         cnt = 0;
     465              :     uint32      c1,
     466              :                 c2,
     467              :                 c3,
     468              :                 c4;
     469              : 
     470     80467599 :     while (len > 0 && *from)
     471              :     {
     472     75343897 :         if ((*from & 0x80) == 0)
     473              :         {
     474     75343342 :             *to = *from++;
     475     75343342 :             len--;
     476              :         }
     477          555 :         else if ((*from & 0xe0) == 0xc0)
     478              :         {
     479          268 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     480          262 :             c1 = *from++ & 0x1f;
     481          262 :             c2 = *from++ & 0x3f;
     482          262 :             *to = (c1 << 6) | c2;
     483          262 :             len -= 2;
     484              :         }
     485          287 :         else if ((*from & 0xf0) == 0xe0)
     486              :         {
     487          167 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     488          161 :             c1 = *from++ & 0x0f;
     489          161 :             c2 = *from++ & 0x3f;
     490          161 :             c3 = *from++ & 0x3f;
     491          161 :             *to = (c1 << 12) | (c2 << 6) | c3;
     492          161 :             len -= 3;
     493              :         }
     494          120 :         else if ((*from & 0xf8) == 0xf0)
     495              :         {
     496           12 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     497            3 :             c1 = *from++ & 0x07;
     498            3 :             c2 = *from++ & 0x3f;
     499            3 :             c3 = *from++ & 0x3f;
     500            3 :             c4 = *from++ & 0x3f;
     501            3 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     502            3 :             len -= 4;
     503              :         }
     504              :         else
     505              :         {
     506              :             /* treat a bogus char as length 1; not ours to raise error */
     507          108 :             *to = *from++;
     508          108 :             len--;
     509              :         }
     510     75343876 :         to++;
     511     75343876 :         cnt++;
     512              :     }
     513      5123723 :     *to = 0;
     514      5123723 :     return cnt;
     515              : }
     516              : 
     517              : 
     518              : /*
     519              :  * Trivial conversion from pg_wchar to UTF-8.
     520              :  * caller must allocate enough space for "to", including a trailing zero!
     521              :  * len: length of from.
     522              :  * "from" not necessarily null terminated.
     523              :  */
     524              : static int
     525       558182 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     526              : {
     527       558182 :     int         cnt = 0;
     528              : 
     529      8401345 :     while (len > 0 && *from)
     530              :     {
     531              :         int         char_len;
     532              : 
     533      7843163 :         unicode_to_utf8(*from, to);
     534      7843163 :         char_len = pg_utf_mblen(to);
     535      7843163 :         cnt += char_len;
     536      7843163 :         to += char_len;
     537      7843163 :         from++;
     538      7843163 :         len--;
     539              :     }
     540       558182 :     *to = 0;
     541       558182 :     return cnt;
     542              : }
     543              : 
     544              : /*
     545              :  * Return the byte length of a UTF8 character pointed to by s
     546              :  *
     547              :  * Note: in the current implementation we do not support UTF8 sequences
     548              :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     549              :  * We return "1" for any leading byte that is either flat-out illegal or
     550              :  * indicates a length larger than we support.
     551              :  *
     552              :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     553              :  * other places would need to be fixed to change this.
     554              :  */
     555              : int
     556     76714336 : pg_utf_mblen(const unsigned char *s)
     557              : {
     558              :     int         len;
     559              : 
     560     76714336 :     if ((*s & 0x80) == 0)
     561     76634164 :         len = 1;
     562        80172 :     else if ((*s & 0xe0) == 0xc0)
     563         6976 :         len = 2;
     564        73196 :     else if ((*s & 0xf0) == 0xe0)
     565        52920 :         len = 3;
     566        20276 :     else if ((*s & 0xf8) == 0xf0)
     567        20189 :         len = 4;
     568              : #ifdef NOT_USED
     569              :     else if ((*s & 0xfc) == 0xf8)
     570              :         len = 5;
     571              :     else if ((*s & 0xfe) == 0xfc)
     572              :         len = 6;
     573              : #endif
     574              :     else
     575           87 :         len = 1;
     576     76714336 :     return len;
     577              : }
     578              : 
     579              : /*
     580              :  * This is an implementation of wcwidth() and wcswidth() as defined in
     581              :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     582              :  * <http://www.unix.org/online.html>
     583              :  *
     584              :  * Markus Kuhn -- 2001-09-08 -- public domain
     585              :  *
     586              :  * customised for PostgreSQL
     587              :  *
     588              :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     589              :  */
     590              : 
     591              : struct mbinterval
     592              : {
     593              :     unsigned int first;
     594              :     unsigned int last;
     595              : };
     596              : 
     597              : /* auxiliary function for binary search in interval table */
     598              : static int
     599     44831137 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
     600              : {
     601     44831137 :     int         min = 0;
     602              :     int         mid;
     603              : 
     604     44831137 :     if (ucs < table[0].first || ucs > table[max].last)
     605     44826850 :         return 0;
     606        37569 :     while (max >= min)
     607              :     {
     608        33642 :         mid = (min + max) / 2;
     609        33642 :         if (ucs > table[mid].last)
     610         6891 :             min = mid + 1;
     611        26751 :         else if (ucs < table[mid].first)
     612        26391 :             max = mid - 1;
     613              :         else
     614          360 :             return 1;
     615              :     }
     616              : 
     617         3927 :     return 0;
     618              : }
     619              : 
     620              : 
     621              : /* The following functions define the column width of an ISO 10646
     622              :  * character as follows:
     623              :  *
     624              :  *    - The null character (U+0000) has a column width of 0.
     625              :  *
     626              :  *    - Other C0/C1 control characters and DEL will lead to a return
     627              :  *      value of -1.
     628              :  *
     629              :  *    - Non-spacing and enclosing combining characters (general
     630              :  *      category code Mn, Me or Cf in the Unicode database) have a
     631              :  *      column width of 0.
     632              :  *
     633              :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     634              :  *      FullWidth (F) category as defined in Unicode Technical
     635              :  *      Report #11 have a column width of 2.
     636              :  *
     637              :  *    - All remaining characters (including all printable
     638              :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     639              :  *      etc.) have a column width of 1.
     640              :  *
     641              :  * This implementation assumes that wchar_t characters are encoded
     642              :  * in ISO 10646.
     643              :  */
     644              : 
     645              : static int
     646     22438249 : ucs_wcwidth(pg_wchar ucs)
     647              : {
     648              : #include "common/unicode_nonspacing_table.h"
     649              : #include "common/unicode_east_asian_fw_table.h"
     650              : 
     651              :     /* NUL is zero width; 8-bit control characters are tested just below */
     652     22438249 :     if (ucs == 0)
     653            0 :         return 0;
     654              : 
     655     22438249 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
     656        22559 :         return -1;
     657              : 
     658              :     /*
     659              :      * binary search in table of non-spacing characters
     660              :      *
     661              :      * XXX: In the official Unicode sources, it is possible for a character to
     662              :      * be described as both non-spacing and wide at the same time. As of
     663              :      * Unicode 13.0, treating the non-spacing property as the determining
     664              :      * factor for display width leads to the correct behavior, so do that
     665              :      * search first.
     666              :      */
     667     22415690 :     if (mbbisearch(ucs, nonspacing,
     668              :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
     669          243 :         return 0;
     670              : 
     671              :     /* binary search in table of wide characters */
     672     22415447 :     if (mbbisearch(ucs, east_asian_fw,
     673              :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
     674          117 :         return 2;
     675              : 
     676     22415330 :     return 1;
     677              : }
     678              : 
     679              : static int
     680     22438249 : pg_utf_dsplen(const unsigned char *s)
     681              : {
     682     22438249 :     return ucs_wcwidth(utf8_to_unicode(s));
     683              : }
     684              : 
     685              : /*
     686              :  * convert mule internal code to pg_wchar
     687              :  * caller should allocate enough space for "to"
     688              :  * len: length of from.
     689              :  * "from" not necessarily null terminated.
     690              :  */
     691              : static int
     692           18 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     693              : {
     694           18 :     int         cnt = 0;
     695              : 
     696           27 :     while (len > 0 && *from)
     697              :     {
     698           18 :         if (IS_LC1(*from))
     699              :         {
     700            6 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     701            3 :             *to = *from++ << 16;
     702            3 :             *to |= *from++;
     703            3 :             len -= 2;
     704              :         }
     705           12 :         else if (IS_LCPRV1(*from))
     706              :         {
     707            0 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     708            0 :             from++;
     709            0 :             *to = *from++ << 16;
     710            0 :             *to |= *from++;
     711            0 :             len -= 3;
     712              :         }
     713           12 :         else if (IS_LC2(*from))
     714              :         {
     715            9 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     716            3 :             *to = *from++ << 16;
     717            3 :             *to |= *from++ << 8;
     718            3 :             *to |= *from++;
     719            3 :             len -= 3;
     720              :         }
     721            3 :         else if (IS_LCPRV2(*from))
     722              :         {
     723            0 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     724            0 :             from++;
     725            0 :             *to = *from++ << 16;
     726            0 :             *to |= *from++ << 8;
     727            0 :             *to |= *from++;
     728            0 :             len -= 4;
     729              :         }
     730              :         else
     731              :         {                       /* assume ASCII */
     732            3 :             *to = (unsigned char) *from++;
     733            3 :             len--;
     734              :         }
     735            9 :         to++;
     736            9 :         cnt++;
     737              :     }
     738           18 :     *to = 0;
     739           18 :     return cnt;
     740              : }
     741              : 
     742              : /*
     743              :  * convert pg_wchar to mule internal code
     744              :  * caller should allocate enough space for "to"
     745              :  * len: length of from.
     746              :  * "from" not necessarily null terminated.
     747              :  */
     748              : static int
     749            9 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     750              : {
     751            9 :     int         cnt = 0;
     752              : 
     753           18 :     while (len > 0 && *from)
     754              :     {
     755              :         unsigned char lb;
     756              : 
     757            9 :         lb = (*from >> 16) & 0xff;
     758            9 :         if (IS_LC1(lb))
     759              :         {
     760            3 :             *to++ = lb;
     761            3 :             *to++ = *from & 0xff;
     762            3 :             cnt += 2;
     763              :         }
     764            6 :         else if (IS_LC2(lb))
     765              :         {
     766            3 :             *to++ = lb;
     767            3 :             *to++ = (*from >> 8) & 0xff;
     768            3 :             *to++ = *from & 0xff;
     769            3 :             cnt += 3;
     770              :         }
     771            3 :         else if (IS_LCPRV1_A_RANGE(lb))
     772              :         {
     773            0 :             *to++ = LCPRV1_A;
     774            0 :             *to++ = lb;
     775            0 :             *to++ = *from & 0xff;
     776            0 :             cnt += 3;
     777              :         }
     778            3 :         else if (IS_LCPRV1_B_RANGE(lb))
     779              :         {
     780            0 :             *to++ = LCPRV1_B;
     781            0 :             *to++ = lb;
     782            0 :             *to++ = *from & 0xff;
     783            0 :             cnt += 3;
     784              :         }
     785            3 :         else if (IS_LCPRV2_A_RANGE(lb))
     786              :         {
     787            0 :             *to++ = LCPRV2_A;
     788            0 :             *to++ = lb;
     789            0 :             *to++ = (*from >> 8) & 0xff;
     790            0 :             *to++ = *from & 0xff;
     791            0 :             cnt += 4;
     792              :         }
     793            3 :         else if (IS_LCPRV2_B_RANGE(lb))
     794              :         {
     795            0 :             *to++ = LCPRV2_B;
     796            0 :             *to++ = lb;
     797            0 :             *to++ = (*from >> 8) & 0xff;
     798            0 :             *to++ = *from & 0xff;
     799            0 :             cnt += 4;
     800              :         }
     801              :         else
     802              :         {
     803            3 :             *to++ = *from & 0xff;
     804            3 :             cnt += 1;
     805              :         }
     806            9 :         from++;
     807            9 :         len--;
     808              :     }
     809            9 :     *to = 0;
     810            9 :     return cnt;
     811              : }
     812              : 
     813              : /* exported for direct use by conv.c */
     814              : int
     815         1512 : pg_mule_mblen(const unsigned char *s)
     816              : {
     817              :     int         len;
     818              : 
     819         1512 :     if (IS_LC1(*s))
     820          610 :         len = 2;
     821          902 :     else if (IS_LCPRV1(*s))
     822            0 :         len = 3;
     823          902 :     else if (IS_LC2(*s))
     824          855 :         len = 3;
     825           47 :     else if (IS_LCPRV2(*s))
     826           20 :         len = 4;
     827              :     else
     828           27 :         len = 1;                /* assume ASCII */
     829         1512 :     return len;
     830              : }
     831              : 
     832              : static int
     833            0 : pg_mule_dsplen(const unsigned char *s)
     834              : {
     835              :     int         len;
     836              : 
     837              :     /*
     838              :      * Note: it's not really appropriate to assume that all multibyte charsets
     839              :      * are double-wide on screen.  But this seems an okay approximation for
     840              :      * the MULE charsets we currently support.
     841              :      */
     842              : 
     843            0 :     if (IS_LC1(*s))
     844            0 :         len = 1;
     845            0 :     else if (IS_LCPRV1(*s))
     846            0 :         len = 1;
     847            0 :     else if (IS_LC2(*s))
     848            0 :         len = 2;
     849            0 :     else if (IS_LCPRV2(*s))
     850            0 :         len = 2;
     851              :     else
     852            0 :         len = 1;                /* assume ASCII */
     853              : 
     854            0 :     return len;
     855              : }
     856              : 
     857              : /*
     858              :  * ISO8859-1
     859              :  */
     860              : static int
     861          466 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     862              : {
     863          466 :     int         cnt = 0;
     864              : 
     865        13373 :     while (len > 0 && *from)
     866              :     {
     867        12907 :         *to++ = *from++;
     868        12907 :         len--;
     869        12907 :         cnt++;
     870              :     }
     871          466 :     *to = 0;
     872          466 :     return cnt;
     873              : }
     874              : 
     875              : /*
     876              :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     877              :  * high bits.
     878              :  * caller should allocate enough space for "to"
     879              :  * len: length of from.
     880              :  * "from" not necessarily null terminated.
     881              :  */
     882              : static int
     883           77 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     884              : {
     885           77 :     int         cnt = 0;
     886              : 
     887          674 :     while (len > 0 && *from)
     888              :     {
     889          597 :         *to++ = *from++;
     890          597 :         len--;
     891          597 :         cnt++;
     892              :     }
     893           77 :     *to = 0;
     894           77 :     return cnt;
     895              : }
     896              : 
     897              : static int
     898         3614 : pg_latin1_mblen(const unsigned char *s)
     899              : {
     900         3614 :     return 1;
     901              : }
     902              : 
     903              : static int
     904          400 : pg_latin1_dsplen(const unsigned char *s)
     905              : {
     906          400 :     return pg_ascii_dsplen(s);
     907              : }
     908              : 
     909              : /*
     910              :  * SJIS
     911              :  */
     912              : static int
     913          845 : pg_sjis_mblen(const unsigned char *s)
     914              : {
     915              :     int         len;
     916              : 
     917          845 :     if (*s >= 0xa1 && *s <= 0xdf)
     918            0 :         len = 1;                /* 1 byte kana? */
     919          845 :     else if (IS_HIGHBIT_SET(*s))
     920          657 :         len = 2;                /* kanji? */
     921              :     else
     922          188 :         len = 1;                /* should be ASCII */
     923          845 :     return len;
     924              : }
     925              : 
     926              : static int
     927            0 : pg_sjis_dsplen(const unsigned char *s)
     928              : {
     929              :     int         len;
     930              : 
     931            0 :     if (*s >= 0xa1 && *s <= 0xdf)
     932            0 :         len = 1;                /* 1 byte kana? */
     933            0 :     else if (IS_HIGHBIT_SET(*s))
     934            0 :         len = 2;                /* kanji? */
     935              :     else
     936            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     937            0 :     return len;
     938              : }
     939              : 
     940              : /*
     941              :  * Big5
     942              :  */
     943              : static int
     944          246 : pg_big5_mblen(const unsigned char *s)
     945              : {
     946              :     int         len;
     947              : 
     948          246 :     if (IS_HIGHBIT_SET(*s))
     949          219 :         len = 2;                /* kanji? */
     950              :     else
     951           27 :         len = 1;                /* should be ASCII */
     952          246 :     return len;
     953              : }
     954              : 
     955              : static int
     956            0 : pg_big5_dsplen(const unsigned char *s)
     957              : {
     958              :     int         len;
     959              : 
     960            0 :     if (IS_HIGHBIT_SET(*s))
     961            0 :         len = 2;                /* kanji? */
     962              :     else
     963            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     964            0 :     return len;
     965              : }
     966              : 
     967              : /*
     968              :  * GBK
     969              :  */
     970              : static int
     971          278 : pg_gbk_mblen(const unsigned char *s)
     972              : {
     973              :     int         len;
     974              : 
     975          278 :     if (IS_HIGHBIT_SET(*s))
     976          208 :         len = 2;                /* kanji? */
     977              :     else
     978           70 :         len = 1;                /* should be ASCII */
     979          278 :     return len;
     980              : }
     981              : 
     982              : static int
     983            0 : pg_gbk_dsplen(const unsigned char *s)
     984              : {
     985              :     int         len;
     986              : 
     987            0 :     if (IS_HIGHBIT_SET(*s))
     988            0 :         len = 2;                /* kanji? */
     989              :     else
     990            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     991            0 :     return len;
     992              : }
     993              : 
     994              : /*
     995              :  * UHC
     996              :  */
     997              : static int
     998           12 : pg_uhc_mblen(const unsigned char *s)
     999              : {
    1000              :     int         len;
    1001              : 
    1002           12 :     if (IS_HIGHBIT_SET(*s))
    1003           12 :         len = 2;                /* 2byte? */
    1004              :     else
    1005            0 :         len = 1;                /* should be ASCII */
    1006           12 :     return len;
    1007              : }
    1008              : 
    1009              : static int
    1010            0 : pg_uhc_dsplen(const unsigned char *s)
    1011              : {
    1012              :     int         len;
    1013              : 
    1014            0 :     if (IS_HIGHBIT_SET(*s))
    1015            0 :         len = 2;                /* 2byte? */
    1016              :     else
    1017            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1018            0 :     return len;
    1019              : }
    1020              : 
    1021              : /*
    1022              :  * GB18030
    1023              :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
    1024              :  */
    1025              : 
    1026              : /*
    1027              :  * Unlike all other mblen() functions, this also looks at the second byte of
    1028              :  * the input.  However, if you only pass the first byte of a multi-byte
    1029              :  * string, and \0 as the second byte, this still works in a predictable way:
    1030              :  * a 4-byte character will be reported as two 2-byte characters.  That's
    1031              :  * enough for all current uses, as a client-only encoding.  It works that
    1032              :  * way, because in any valid 4-byte GB18030-encoded character, the third and
    1033              :  * fourth byte look like a 2-byte encoded character, when looked at
    1034              :  * separately.
    1035              :  */
    1036              : static int
    1037          591 : pg_gb18030_mblen(const unsigned char *s)
    1038              : {
    1039              :     int         len;
    1040              : 
    1041          591 :     if (!IS_HIGHBIT_SET(*s))
    1042          342 :         len = 1;                /* ASCII */
    1043          249 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1044           93 :         len = 4;
    1045              :     else
    1046          156 :         len = 2;
    1047          591 :     return len;
    1048              : }
    1049              : 
    1050              : static int
    1051            0 : pg_gb18030_dsplen(const unsigned char *s)
    1052              : {
    1053              :     int         len;
    1054              : 
    1055            0 :     if (IS_HIGHBIT_SET(*s))
    1056            0 :         len = 2;
    1057              :     else
    1058            0 :         len = pg_ascii_dsplen(s);   /* ASCII */
    1059            0 :     return len;
    1060              : }
    1061              : 
    1062              : /*
    1063              :  *-------------------------------------------------------------------
    1064              :  * multibyte sequence validators
    1065              :  *
    1066              :  * The verifychar functions accept "s", a pointer to the first byte of a
    1067              :  * string, and "len", the remaining length of the string.  If there is a
    1068              :  * validly encoded character beginning at *s, return its length in bytes;
    1069              :  * else return -1.
    1070              :  *
    1071              :  * The verifystr functions also accept "s", a pointer to a string and "len",
    1072              :  * the length of the string.  They verify the whole string, and return the
    1073              :  * number of input bytes (<= len) that are valid.  In other words, if the
    1074              :  * whole string is valid, verifystr returns "len", otherwise it returns the
    1075              :  * byte offset of the first invalid character.  The verifystr functions must
    1076              :  * test for and reject zeroes in the input.
    1077              :  *
    1078              :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
    1079              :  * they must test for and reject zeroes in any additional bytes of a
    1080              :  * multibyte character.  Note that this definition allows the function for a
    1081              :  * single-byte encoding to be just "return 1".
    1082              :  *-------------------------------------------------------------------
    1083              :  */
    1084              : static int
    1085          161 : pg_ascii_verifychar(const unsigned char *s, int len)
    1086              : {
    1087          161 :     return 1;
    1088              : }
    1089              : 
    1090              : static int
    1091       211279 : pg_ascii_verifystr(const unsigned char *s, int len)
    1092              : {
    1093       211279 :     const unsigned char *nullpos = memchr(s, 0, len);
    1094              : 
    1095       211279 :     if (nullpos == NULL)
    1096       211279 :         return len;
    1097              :     else
    1098            0 :         return nullpos - s;
    1099              : }
    1100              : 
    1101              : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
    1102              : 
    1103              : static int
    1104          252 : pg_eucjp_verifychar(const unsigned char *s, int len)
    1105              : {
    1106              :     int         l;
    1107              :     unsigned char c1,
    1108              :                 c2;
    1109              : 
    1110          252 :     c1 = *s++;
    1111              : 
    1112          252 :     switch (c1)
    1113              :     {
    1114            0 :         case SS2:               /* JIS X 0201 */
    1115            0 :             l = 2;
    1116            0 :             if (l > len)
    1117            0 :                 return -1;
    1118            0 :             c2 = *s++;
    1119            0 :             if (c2 < 0xa1 || c2 > 0xdf)
    1120            0 :                 return -1;
    1121            0 :             break;
    1122              : 
    1123            0 :         case SS3:               /* JIS X 0212 */
    1124            0 :             l = 3;
    1125            0 :             if (l > len)
    1126            0 :                 return -1;
    1127            0 :             c2 = *s++;
    1128            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1129            0 :                 return -1;
    1130            0 :             c2 = *s++;
    1131            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1132            0 :                 return -1;
    1133            0 :             break;
    1134              : 
    1135          252 :         default:
    1136          252 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1137              :             {
    1138          252 :                 l = 2;
    1139          252 :                 if (l > len)
    1140           42 :                     return -1;
    1141          210 :                 if (!IS_EUC_RANGE_VALID(c1))
    1142           12 :                     return -1;
    1143          198 :                 c2 = *s++;
    1144          198 :                 if (!IS_EUC_RANGE_VALID(c2))
    1145           90 :                     return -1;
    1146              :             }
    1147              :             else
    1148              :                 /* must be ASCII */
    1149              :             {
    1150            0 :                 l = 1;
    1151              :             }
    1152          108 :             break;
    1153              :     }
    1154              : 
    1155          108 :     return l;
    1156              : }
    1157              : 
    1158              : static int
    1159          150 : pg_eucjp_verifystr(const unsigned char *s, int len)
    1160              : {
    1161          150 :     const unsigned char *start = s;
    1162              : 
    1163          465 :     while (len > 0)
    1164              :     {
    1165              :         int         l;
    1166              : 
    1167              :         /* fast path for ASCII-subset characters */
    1168          423 :         if (!IS_HIGHBIT_SET(*s))
    1169              :         {
    1170          297 :             if (*s == '\0')
    1171           36 :                 break;
    1172          261 :             l = 1;
    1173              :         }
    1174              :         else
    1175              :         {
    1176          126 :             l = pg_eucjp_verifychar(s, len);
    1177          126 :             if (l == -1)
    1178           72 :                 break;
    1179              :         }
    1180          315 :         s += l;
    1181          315 :         len -= l;
    1182              :     }
    1183              : 
    1184          150 :     return s - start;
    1185              : }
    1186              : 
    1187              : static int
    1188           72 : pg_euckr_verifychar(const unsigned char *s, int len)
    1189              : {
    1190              :     int         l;
    1191              :     unsigned char c1,
    1192              :                 c2;
    1193              : 
    1194           72 :     c1 = *s++;
    1195              : 
    1196           72 :     if (IS_HIGHBIT_SET(c1))
    1197              :     {
    1198           72 :         l = 2;
    1199           72 :         if (l > len)
    1200            6 :             return -1;
    1201           66 :         if (!IS_EUC_RANGE_VALID(c1))
    1202           12 :             return -1;
    1203           54 :         c2 = *s++;
    1204           54 :         if (!IS_EUC_RANGE_VALID(c2))
    1205            0 :             return -1;
    1206              :     }
    1207              :     else
    1208              :         /* must be ASCII */
    1209              :     {
    1210            0 :         l = 1;
    1211              :     }
    1212              : 
    1213           54 :     return l;
    1214              : }
    1215              : 
    1216              : static int
    1217           36 : pg_euckr_verifystr(const unsigned char *s, int len)
    1218              : {
    1219           36 :     const unsigned char *start = s;
    1220              : 
    1221          117 :     while (len > 0)
    1222              :     {
    1223              :         int         l;
    1224              : 
    1225              :         /* fast path for ASCII-subset characters */
    1226           99 :         if (!IS_HIGHBIT_SET(*s))
    1227              :         {
    1228           54 :             if (*s == '\0')
    1229            0 :                 break;
    1230           54 :             l = 1;
    1231              :         }
    1232              :         else
    1233              :         {
    1234           45 :             l = pg_euckr_verifychar(s, len);
    1235           45 :             if (l == -1)
    1236           18 :                 break;
    1237              :         }
    1238           81 :         s += l;
    1239           81 :         len -= l;
    1240              :     }
    1241              : 
    1242           36 :     return s - start;
    1243              : }
    1244              : 
    1245              : /* EUC-CN byte sequences are exactly same as EUC-KR */
    1246              : #define pg_euccn_verifychar pg_euckr_verifychar
    1247              : #define pg_euccn_verifystr  pg_euckr_verifystr
    1248              : 
    1249              : static int
    1250            9 : pg_euctw_verifychar(const unsigned char *s, int len)
    1251              : {
    1252              :     int         l;
    1253              :     unsigned char c1,
    1254              :                 c2;
    1255              : 
    1256            9 :     c1 = *s++;
    1257              : 
    1258            9 :     switch (c1)
    1259              :     {
    1260            0 :         case SS2:               /* CNS 11643 Plane 1-7 */
    1261            0 :             l = 4;
    1262            0 :             if (l > len)
    1263            0 :                 return -1;
    1264            0 :             c2 = *s++;
    1265            0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1266            0 :                 return -1;
    1267            0 :             c2 = *s++;
    1268            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1269            0 :                 return -1;
    1270            0 :             c2 = *s++;
    1271            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1272            0 :                 return -1;
    1273            0 :             break;
    1274              : 
    1275            0 :         case SS3:               /* unused */
    1276            0 :             return -1;
    1277              : 
    1278            9 :         default:
    1279            9 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1280              :             {
    1281            9 :                 l = 2;
    1282            9 :                 if (l > len)
    1283            3 :                     return -1;
    1284              :                 /* no further range check on c1? */
    1285            6 :                 c2 = *s++;
    1286            6 :                 if (!IS_EUC_RANGE_VALID(c2))
    1287            6 :                     return -1;
    1288              :             }
    1289              :             else
    1290              :                 /* must be ASCII */
    1291              :             {
    1292            0 :                 l = 1;
    1293              :             }
    1294            0 :             break;
    1295              :     }
    1296            0 :     return l;
    1297              : }
    1298              : 
    1299              : static int
    1300           18 : pg_euctw_verifystr(const unsigned char *s, int len)
    1301              : {
    1302           18 :     const unsigned char *start = s;
    1303              : 
    1304           45 :     while (len > 0)
    1305              :     {
    1306              :         int         l;
    1307              : 
    1308              :         /* fast path for ASCII-subset characters */
    1309           36 :         if (!IS_HIGHBIT_SET(*s))
    1310              :         {
    1311           27 :             if (*s == '\0')
    1312            0 :                 break;
    1313           27 :             l = 1;
    1314              :         }
    1315              :         else
    1316              :         {
    1317            9 :             l = pg_euctw_verifychar(s, len);
    1318            9 :             if (l == -1)
    1319            9 :                 break;
    1320              :         }
    1321           27 :         s += l;
    1322           27 :         len -= l;
    1323              :     }
    1324              : 
    1325           18 :     return s - start;
    1326              : }
    1327              : 
    1328              : static int
    1329            9 : pg_johab_verifychar(const unsigned char *s, int len)
    1330              : {
    1331              :     int         l,
    1332              :                 mbl;
    1333              :     unsigned char c;
    1334              : 
    1335            9 :     l = mbl = pg_johab_mblen(s);
    1336              : 
    1337            9 :     if (len < l)
    1338            3 :         return -1;
    1339              : 
    1340            6 :     if (!IS_HIGHBIT_SET(*s))
    1341            0 :         return mbl;
    1342              : 
    1343            6 :     while (--l > 0)
    1344              :     {
    1345            6 :         c = *++s;
    1346            6 :         if (!IS_EUC_RANGE_VALID(c))
    1347            6 :             return -1;
    1348              :     }
    1349            0 :     return mbl;
    1350              : }
    1351              : 
    1352              : static int
    1353           12 : pg_johab_verifystr(const unsigned char *s, int len)
    1354              : {
    1355           12 :     const unsigned char *start = s;
    1356              : 
    1357           21 :     while (len > 0)
    1358              :     {
    1359              :         int         l;
    1360              : 
    1361              :         /* fast path for ASCII-subset characters */
    1362           18 :         if (!IS_HIGHBIT_SET(*s))
    1363              :         {
    1364            9 :             if (*s == '\0')
    1365            0 :                 break;
    1366            9 :             l = 1;
    1367              :         }
    1368              :         else
    1369              :         {
    1370            9 :             l = pg_johab_verifychar(s, len);
    1371            9 :             if (l == -1)
    1372            9 :                 break;
    1373              :         }
    1374            9 :         s += l;
    1375            9 :         len -= l;
    1376              :     }
    1377              : 
    1378           12 :     return s - start;
    1379              : }
    1380              : 
    1381              : static int
    1382          675 : pg_mule_verifychar(const unsigned char *s, int len)
    1383              : {
    1384              :     int         l,
    1385              :                 mbl;
    1386              :     unsigned char c;
    1387              : 
    1388          675 :     l = mbl = pg_mule_mblen(s);
    1389              : 
    1390          675 :     if (len < l)
    1391          172 :         return -1;
    1392              : 
    1393         1016 :     while (--l > 0)
    1394              :     {
    1395          674 :         c = *++s;
    1396          674 :         if (!IS_HIGHBIT_SET(c))
    1397          161 :             return -1;
    1398              :     }
    1399          342 :     return mbl;
    1400              : }
    1401              : 
    1402              : static int
    1403          219 : pg_mule_verifystr(const unsigned char *s, int len)
    1404              : {
    1405          219 :     const unsigned char *start = s;
    1406              : 
    1407          645 :     while (len > 0)
    1408              :     {
    1409              :         int         l;
    1410              : 
    1411              :         /* fast path for ASCII-subset characters */
    1412          561 :         if (!IS_HIGHBIT_SET(*s))
    1413              :         {
    1414          345 :             if (*s == '\0')
    1415           18 :                 break;
    1416          327 :             l = 1;
    1417              :         }
    1418              :         else
    1419              :         {
    1420          216 :             l = pg_mule_verifychar(s, len);
    1421          216 :             if (l == -1)
    1422          117 :                 break;
    1423              :         }
    1424          426 :         s += l;
    1425          426 :         len -= l;
    1426              :     }
    1427              : 
    1428          219 :     return s - start;
    1429              : }
    1430              : 
    1431              : static int
    1432         3190 : pg_latin1_verifychar(const unsigned char *s, int len)
    1433              : {
    1434         3190 :     return 1;
    1435              : }
    1436              : 
    1437              : static int
    1438         4988 : pg_latin1_verifystr(const unsigned char *s, int len)
    1439              : {
    1440         4988 :     const unsigned char *nullpos = memchr(s, 0, len);
    1441              : 
    1442         4988 :     if (nullpos == NULL)
    1443         4934 :         return len;
    1444              :     else
    1445           54 :         return nullpos - s;
    1446              : }
    1447              : 
    1448              : static int
    1449          501 : pg_sjis_verifychar(const unsigned char *s, int len)
    1450              : {
    1451              :     int         l,
    1452              :                 mbl;
    1453              :     unsigned char c1,
    1454              :                 c2;
    1455              : 
    1456          501 :     l = mbl = pg_sjis_mblen(s);
    1457              : 
    1458          501 :     if (len < l)
    1459           66 :         return -1;
    1460              : 
    1461          435 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1462            0 :         return mbl;
    1463              : 
    1464          435 :     c1 = *s++;
    1465          435 :     c2 = *s;
    1466          435 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1467          174 :         return -1;
    1468          261 :     return mbl;
    1469              : }
    1470              : 
    1471              : static int
    1472          273 : pg_sjis_verifystr(const unsigned char *s, int len)
    1473              : {
    1474          273 :     const unsigned char *start = s;
    1475              : 
    1476         1034 :     while (len > 0)
    1477              :     {
    1478              :         int         l;
    1479              : 
    1480              :         /* fast path for ASCII-subset characters */
    1481          921 :         if (!IS_HIGHBIT_SET(*s))
    1482              :         {
    1483          674 :             if (*s == '\0')
    1484           36 :                 break;
    1485          638 :             l = 1;
    1486              :         }
    1487              :         else
    1488              :         {
    1489          247 :             l = pg_sjis_verifychar(s, len);
    1490          247 :             if (l == -1)
    1491          124 :                 break;
    1492              :         }
    1493          761 :         s += l;
    1494          761 :         len -= l;
    1495              :     }
    1496              : 
    1497          273 :     return s - start;
    1498              : }
    1499              : 
    1500              : static int
    1501          180 : pg_big5_verifychar(const unsigned char *s, int len)
    1502              : {
    1503              :     int         l,
    1504              :                 mbl;
    1505              : 
    1506          180 :     l = mbl = pg_big5_mblen(s);
    1507              : 
    1508          180 :     if (len < l)
    1509            3 :         return -1;
    1510              : 
    1511          177 :     if (l == 2 &&
    1512          177 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1513            6 :         s[1] == NONUTF8_INVALID_BYTE1)
    1514            6 :         return -1;
    1515              : 
    1516          288 :     while (--l > 0)
    1517              :     {
    1518          171 :         if (*++s == '\0')
    1519           54 :             return -1;
    1520              :     }
    1521              : 
    1522          117 :     return mbl;
    1523              : }
    1524              : 
    1525              : static int
    1526           81 : pg_big5_verifystr(const unsigned char *s, int len)
    1527              : {
    1528           81 :     const unsigned char *start = s;
    1529              : 
    1530          333 :     while (len > 0)
    1531              :     {
    1532              :         int         l;
    1533              : 
    1534              :         /* fast path for ASCII-subset characters */
    1535          297 :         if (!IS_HIGHBIT_SET(*s))
    1536              :         {
    1537          234 :             if (*s == '\0')
    1538           18 :                 break;
    1539          216 :             l = 1;
    1540              :         }
    1541              :         else
    1542              :         {
    1543           63 :             l = pg_big5_verifychar(s, len);
    1544           63 :             if (l == -1)
    1545           27 :                 break;
    1546              :         }
    1547          252 :         s += l;
    1548          252 :         len -= l;
    1549              :     }
    1550              : 
    1551           81 :     return s - start;
    1552              : }
    1553              : 
    1554              : static int
    1555          137 : pg_gbk_verifychar(const unsigned char *s, int len)
    1556              : {
    1557              :     int         l,
    1558              :                 mbl;
    1559              : 
    1560          137 :     l = mbl = pg_gbk_mblen(s);
    1561              : 
    1562          137 :     if (len < l)
    1563           27 :         return -1;
    1564              : 
    1565          110 :     if (l == 2 &&
    1566          110 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1567           14 :         s[1] == NONUTF8_INVALID_BYTE1)
    1568           14 :         return -1;
    1569              : 
    1570          192 :     while (--l > 0)
    1571              :     {
    1572           96 :         if (*++s == '\0')
    1573            0 :             return -1;
    1574              :     }
    1575              : 
    1576           96 :     return mbl;
    1577              : }
    1578              : 
    1579              : static int
    1580          128 : pg_gbk_verifystr(const unsigned char *s, int len)
    1581              : {
    1582          128 :     const unsigned char *start = s;
    1583              : 
    1584          329 :     while (len > 0)
    1585              :     {
    1586              :         int         l;
    1587              : 
    1588              :         /* fast path for ASCII-subset characters */
    1589          242 :         if (!IS_HIGHBIT_SET(*s))
    1590              :         {
    1591          121 :             if (*s == '\0')
    1592            0 :                 break;
    1593          121 :             l = 1;
    1594              :         }
    1595              :         else
    1596              :         {
    1597          121 :             l = pg_gbk_verifychar(s, len);
    1598          121 :             if (l == -1)
    1599           41 :                 break;
    1600              :         }
    1601          201 :         s += l;
    1602          201 :         len -= l;
    1603              :     }
    1604              : 
    1605          128 :     return s - start;
    1606              : }
    1607              : 
    1608              : static int
    1609            9 : pg_uhc_verifychar(const unsigned char *s, int len)
    1610              : {
    1611              :     int         l,
    1612              :                 mbl;
    1613              : 
    1614            9 :     l = mbl = pg_uhc_mblen(s);
    1615              : 
    1616            9 :     if (len < l)
    1617            3 :         return -1;
    1618              : 
    1619            6 :     if (l == 2 &&
    1620            6 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1621            6 :         s[1] == NONUTF8_INVALID_BYTE1)
    1622            6 :         return -1;
    1623              : 
    1624            0 :     while (--l > 0)
    1625              :     {
    1626            0 :         if (*++s == '\0')
    1627            0 :             return -1;
    1628              :     }
    1629              : 
    1630            0 :     return mbl;
    1631              : }
    1632              : 
    1633              : static int
    1634           12 : pg_uhc_verifystr(const unsigned char *s, int len)
    1635              : {
    1636           12 :     const unsigned char *start = s;
    1637              : 
    1638           21 :     while (len > 0)
    1639              :     {
    1640              :         int         l;
    1641              : 
    1642              :         /* fast path for ASCII-subset characters */
    1643           18 :         if (!IS_HIGHBIT_SET(*s))
    1644              :         {
    1645            9 :             if (*s == '\0')
    1646            0 :                 break;
    1647            9 :             l = 1;
    1648              :         }
    1649              :         else
    1650              :         {
    1651            9 :             l = pg_uhc_verifychar(s, len);
    1652            9 :             if (l == -1)
    1653            9 :                 break;
    1654              :         }
    1655            9 :         s += l;
    1656            9 :         len -= l;
    1657              :     }
    1658              : 
    1659           12 :     return s - start;
    1660              : }
    1661              : 
    1662              : static int
    1663          606 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1664              : {
    1665              :     int         l;
    1666              : 
    1667          606 :     if (!IS_HIGHBIT_SET(*s))
    1668            0 :         l = 1;                  /* ASCII */
    1669          606 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1670              :     {
    1671              :         /* Should be 4-byte, validate remaining bytes */
    1672          159 :         if (*s >= 0x81 && *s <= 0xfe &&
    1673          153 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1674          153 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1675           81 :             l = 4;
    1676              :         else
    1677           78 :             l = -1;
    1678              :     }
    1679          447 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1680              :     {
    1681              :         /* Should be 2-byte, validate */
    1682          330 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1683          210 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1684          162 :             l = 2;
    1685              :         else
    1686          168 :             l = -1;
    1687              :     }
    1688              :     else
    1689          117 :         l = -1;
    1690          606 :     return l;
    1691              : }
    1692              : 
    1693              : static int
    1694          451 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1695              : {
    1696          451 :     const unsigned char *start = s;
    1697              : 
    1698         1483 :     while (len > 0)
    1699              :     {
    1700              :         int         l;
    1701              : 
    1702              :         /* fast path for ASCII-subset characters */
    1703         1335 :         if (!IS_HIGHBIT_SET(*s))
    1704              :         {
    1705          902 :             if (*s == '\0')
    1706           24 :                 break;
    1707          878 :             l = 1;
    1708              :         }
    1709              :         else
    1710              :         {
    1711          433 :             l = pg_gb18030_verifychar(s, len);
    1712          433 :             if (l == -1)
    1713          279 :                 break;
    1714              :         }
    1715         1032 :         s += l;
    1716         1032 :         len -= l;
    1717              :     }
    1718              : 
    1719          451 :     return s - start;
    1720              : }
    1721              : 
    1722              : static int
    1723         8814 : pg_utf8_verifychar(const unsigned char *s, int len)
    1724              : {
    1725              :     int         l;
    1726              : 
    1727         8814 :     if ((*s & 0x80) == 0)
    1728              :     {
    1729            0 :         if (*s == '\0')
    1730            0 :             return -1;
    1731            0 :         return 1;
    1732              :     }
    1733         8814 :     else if ((*s & 0xe0) == 0xc0)
    1734         3088 :         l = 2;
    1735         5726 :     else if ((*s & 0xf0) == 0xe0)
    1736         3166 :         l = 3;
    1737         2560 :     else if ((*s & 0xf8) == 0xf0)
    1738         2428 :         l = 4;
    1739              :     else
    1740          132 :         l = 1;
    1741              : 
    1742         8814 :     if (l > len)
    1743          289 :         return -1;
    1744              : 
    1745         8525 :     if (!pg_utf8_islegal(s, l))
    1746         1181 :         return -1;
    1747              : 
    1748         7344 :     return l;
    1749              : }
    1750              : 
    1751              : /*
    1752              :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1753              :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1754              :  * input byte and current state are used to compute an index into an array of
    1755              :  * state transitions. Since the address of the next transition is dependent
    1756              :  * on this computation, there is latency in executing the load instruction,
    1757              :  * and the CPU is not kept busy.
    1758              :  *
    1759              :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1760              :  *
    1761              :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1762              :  *
    1763              :  * In a shift-based DFA, the input byte is an index into array of integers
    1764              :  * whose bit pattern encodes the state transitions. To compute the next
    1765              :  * state, we simply right-shift the integer by the current state and apply a
    1766              :  * mask. In this scheme, the address of the transition only depends on the
    1767              :  * input byte, so there is better pipelining.
    1768              :  *
    1769              :  * The naming convention for states and transitions was adopted from a UTF-8
    1770              :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1771              :  *
    1772              :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1773              :  *
    1774              :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1775              :  * ==========================================================================
    1776              :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1777              :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1778              :  *                                                                  |
    1779              :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1780              :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1781              :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1782              :  *                                                                  |
    1783              :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1784              :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1785              :  *                                                                  |
    1786              :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1787              :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1788              :  *
    1789              :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1790              :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1791              :  * it's possible to find state numbers such that the transitions fit within
    1792              :  * 32-bit integers, as Dougall Johnson demonstrated:
    1793              :  *
    1794              :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1795              :  *
    1796              :  * This packed representation is the reason for the seemingly odd choice of
    1797              :  * state values below.
    1798              :  */
    1799              : 
    1800              : /* Error */
    1801              : #define ERR  0
    1802              : /* Begin */
    1803              : #define BGN 11
    1804              : /* Continuation states, expect 1/2/3 continuation bytes */
    1805              : #define CS1 16
    1806              : #define CS2  1
    1807              : #define CS3  5
    1808              : /* Partial states, where the first continuation byte has a restricted range */
    1809              : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
    1810              : #define P3B 20                  /* Lead was ED, check for surrogate */
    1811              : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
    1812              : #define P4B 30                  /* Lead was F4, check for too-large */
    1813              : /* Begin and End are the same state */
    1814              : #define END BGN
    1815              : 
    1816              : /* the encoded state transitions for the lookup table */
    1817              : 
    1818              : /* ASCII */
    1819              : #define ASC (END << BGN)
    1820              : /* 2-byte lead */
    1821              : #define L2A (CS1 << BGN)
    1822              : /* 3-byte lead */
    1823              : #define L3A (P3A << BGN)
    1824              : #define L3B (CS2 << BGN)
    1825              : #define L3C (P3B << BGN)
    1826              : /* 4-byte lead */
    1827              : #define L4A (P4A << BGN)
    1828              : #define L4B (CS3 << BGN)
    1829              : #define L4C (P4B << BGN)
    1830              : /* continuation byte */
    1831              : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1832              : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1833              : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1834              : /* invalid byte */
    1835              : #define ILL ERR
    1836              : 
    1837              : static const uint32 Utf8Transition[256] =
    1838              : {
    1839              :     /* ASCII */
    1840              : 
    1841              :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1842              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1843              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1844              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1845              : 
    1846              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1847              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1848              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1849              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1850              : 
    1851              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1852              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1853              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1854              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1855              : 
    1856              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1857              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1858              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1859              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1860              : 
    1861              :     /* continuation bytes */
    1862              : 
    1863              :     /* 80..8F */
    1864              :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1865              :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1866              : 
    1867              :     /* 90..9F */
    1868              :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1869              :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1870              : 
    1871              :     /* A0..BF */
    1872              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1873              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1874              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1875              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1876              : 
    1877              :     /* leading bytes */
    1878              : 
    1879              :     /* C0..DF */
    1880              :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1881              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1882              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1883              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1884              : 
    1885              :     /* E0..EF */
    1886              :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1887              :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1888              : 
    1889              :     /* F0..FF */
    1890              :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1891              :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1892              : };
    1893              : 
    1894              : static void
    1895          875 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1896              : {
    1897              :     /* Note: We deliberately don't check the state's value here. */
    1898        28875 :     while (len > 0)
    1899              :     {
    1900              :         /*
    1901              :          * It's important that the mask value is 31: In most instruction sets,
    1902              :          * a shift by a 32-bit operand is understood to be a shift by its mod
    1903              :          * 32, so the compiler should elide the mask operation.
    1904              :          */
    1905        28000 :         *state = Utf8Transition[*s++] >> (*state & 31);
    1906        28000 :         len--;
    1907              :     }
    1908              : 
    1909          875 :     *state &= 31;
    1910          875 : }
    1911              : 
    1912              : static int
    1913       616969 : pg_utf8_verifystr(const unsigned char *s, int len)
    1914              : {
    1915       616969 :     const unsigned char *start = s;
    1916       616969 :     const int   orig_len = len;
    1917       616969 :     uint32      state = BGN;
    1918              : 
    1919              : /*
    1920              :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1921              :  * the compiler can unroll a longer loop, it's not worth it because we
    1922              :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1923              :  */
    1924              : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1925              : 
    1926       616969 :     if (len >= STRIDE_LENGTH)
    1927              :     {
    1928      2143390 :         while (len >= STRIDE_LENGTH)
    1929              :         {
    1930              :             /*
    1931              :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1932              :              * but we must first check for a non-END state, which means the
    1933              :              * previous chunk ended in the middle of a multibyte sequence.
    1934              :              */
    1935      1846406 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1936          875 :                 utf8_advance(s, &state, STRIDE_LENGTH);
    1937              : 
    1938      1846406 :             s += STRIDE_LENGTH;
    1939      1846406 :             len -= STRIDE_LENGTH;
    1940              :         }
    1941              : 
    1942              :         /* The error state persists, so we only need to check for it here. */
    1943       296984 :         if (state == ERR)
    1944              :         {
    1945              :             /*
    1946              :              * Start over from the beginning with the slow path so we can
    1947              :              * count the valid bytes.
    1948              :              */
    1949          252 :             len = orig_len;
    1950          252 :             s = start;
    1951              :         }
    1952       296732 :         else if (state != END)
    1953              :         {
    1954              :             /*
    1955              :              * The fast path exited in the middle of a multibyte sequence.
    1956              :              * Walk backwards to find the leading byte so that the slow path
    1957              :              * can resume checking from there. We must always backtrack at
    1958              :              * least one byte, since the current byte could be e.g. an ASCII
    1959              :              * byte after a 2-byte lead, which is invalid.
    1960              :              */
    1961              :             do
    1962              :             {
    1963              :                 Assert(s > start);
    1964           58 :                 s--;
    1965           58 :                 len++;
    1966              :                 Assert(IS_HIGHBIT_SET(*s));
    1967           58 :             } while (pg_utf_mblen(s) <= 1);
    1968              :         }
    1969              :     }
    1970              : 
    1971              :     /* check remaining bytes */
    1972      9096992 :     while (len > 0)
    1973              :     {
    1974              :         int         l;
    1975              : 
    1976              :         /* fast path for ASCII-subset characters */
    1977      8481564 :         if (!IS_HIGHBIT_SET(*s))
    1978              :         {
    1979      8472786 :             if (*s == '\0')
    1980          103 :                 break;
    1981      8472683 :             l = 1;
    1982              :         }
    1983              :         else
    1984              :         {
    1985         8778 :             l = pg_utf8_verifychar(s, len);
    1986         8778 :             if (l == -1)
    1987         1438 :                 break;
    1988              :         }
    1989      8480023 :         s += l;
    1990      8480023 :         len -= l;
    1991              :     }
    1992              : 
    1993       616969 :     return s - start;
    1994              : }
    1995              : 
    1996              : /*
    1997              :  * Check for validity of a single UTF-8 encoded character
    1998              :  *
    1999              :  * This directly implements the rules in RFC3629.  The bizarre-looking
    2000              :  * restrictions on the second byte are meant to ensure that there isn't
    2001              :  * more than one encoding of a given Unicode character point; that is,
    2002              :  * you may not use a longer-than-necessary byte sequence with high order
    2003              :  * zero bits to represent a character that would fit in fewer bytes.
    2004              :  * To do otherwise is to create security hazards (eg, create an apparent
    2005              :  * non-ASCII character that decodes to plain ASCII).
    2006              :  *
    2007              :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    2008              :  * caller must have checked that that many bytes are present in the buffer.
    2009              :  */
    2010              : bool
    2011        11827 : pg_utf8_islegal(const unsigned char *source, int length)
    2012              : {
    2013              :     unsigned char a;            /* byte currently being checked */
    2014              : 
    2015        11827 :     switch (length)             /* cases fall through, so trailing bytes are checked first */
    2016              :     {
    2017            0 :         default:
    2018              :             /* reject lengths 5 and 6 for now, and any other length not listed below */
    2019            0 :             return false;
    2020         2298 :         case 4:
    2021         2298 :             a = source[3];      /* fourth byte: must be in 0x80..0xBF */
    2022         2298 :             if (a < 0x80 || a > 0xBF)
    2023          182 :                 return false;
    2024              :             pg_fallthrough;
    2025              :         case 3:
    2026         6025 :             a = source[2];      /* third byte: must be in 0x80..0xBF */
    2027         6025 :             if (a < 0x80 || a > 0xBF)
    2028          340 :                 return false;
    2029              :             pg_fallthrough;
    2030              :         case 2:
    2031         9002 :             a = source[1];      /* second byte: allowed range depends on the first byte */
    2032         9002 :             switch (*source)
    2033              :             {
    2034          156 :                 case 0xE0:      /* below 0xA0 would be an overlong encoding */
    2035          156 :                     if (a < 0xA0 || a > 0xBF)
    2036          132 :                         return false;
    2037           24 :                     break;
    2038          156 :                 case 0xED:      /* above 0x9F would encode U+D800..U+DFFF */
    2039          156 :                     if (a < 0x80 || a > 0x9F)
    2040          132 :                         return false;
    2041           24 :                     break;
    2042         2026 :                 case 0xF0:      /* below 0x90 would be an overlong encoding */
    2043         2026 :                     if (a < 0x90 || a > 0xBF)
    2044          132 :                         return false;
    2045         1894 :                     break;
    2046           90 :                 case 0xF4:      /* above 0x8F would exceed U+10FFFF */
    2047           90 :                     if (a < 0x80 || a > 0x8F)
    2048           66 :                         return false;
    2049           24 :                     break;
    2050         6574 :                 default:        /* any other first byte: second byte must be in 0x80..0xBF */
    2051         6574 :                     if (a < 0x80 || a > 0xBF)
    2052          149 :                         return false;
    2053         6425 :                     break;
    2054              :             }
    2055              :             pg_fallthrough;
    2056              :         case 1:
    2057        10694 :             a = *source;        /* finally, check the first byte itself */
    2058        10694 :             if (a >= 0x80 && a < 0xC2)  /* 0x80..0xC1 is never accepted as a first byte */
    2059          198 :                 return false;
    2060        10496 :             if (a > 0xF4)       /* first bytes above 0xF4 are rejected as well */
    2061           66 :                 return false;
    2062        10430 :             break;
    2063              :     }
    2064        10430 :     return true;                /* every check for this length passed */
    2065              : }
    2066              : 
    2067              : 
    2068              : /*
    2069              :  * Fills the provided buffer (dst[0] and dst[1] are written) with two bytes such that:
    2070              :  *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
    2071              :  */
    2072              : void
    2073          206 : pg_encoding_set_invalid(int encoding, char *dst)
    2074              : {
    2075              :     Assert(pg_encoding_max_length(encoding) > 1);   /* callers must pass a multibyte encoding */
    2076              : 
    2077          206 :     dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);  /* UTF8 gets its own first byte */
    2078          206 :     dst[1] = NONUTF8_INVALID_BYTE1;     /* second byte is the same for every encoding */
    2079          206 : }
    2080              : 
    2081              : /*
    2082              :  * Encoding info table, indexed by encoding ID.  Each entry supplies, in order:
    2083              :  * the ...2wchar_with_len and wchar2..._with_len conversion functions, the
    2084              :  * mblen, dsplen, verifychar and verifystr functions, and a maximum byte length.
    2085              :  */
    2086              : const pg_wchar_tbl pg_wchar_table[] = {
    2087              :     [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
    2088              :     [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2089              :     [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3},
    2090              :     [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
    2091              :     [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
    2092              :     [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2093              :     [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
    2094              :     [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
    2095              :     [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2096              :     [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2097              :     [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2098              :     [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2099              :     [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2100              :     [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2101              :     [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2102              :     [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2103              :     [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2104              :     [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2105              :     [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2106              :     [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2107              :     [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2108              :     [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2109              :     [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2110              :     [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2111              :     [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2112              :     [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2113              :     [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2114              :     [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2115              :     [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2116              :     [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2117              :     [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2118              :     [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2119              :     [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2120              :     [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2121              :     [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2122              :     [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},    /* this and the following entries supply 0 for both conversion functions */
    2123              :     [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
    2124              :     [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
    2125              :     [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
    2126              :     [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
    2127              :     [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
    2128              :     [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2129              : };
    2130              : 
    2131              : /*
    2132              :  * Returns the byte length of the multibyte character starting at mbstr.
    2133              :  *
    2134              :  * Choose among the "mblen" functions based on the input string's characteristics.
    2135              :  * pg_encoding_mblen() can be used when ANY of these conditions are met:
    2136              :  *
    2137              :  * - The input string is zero-terminated
    2138              :  *
    2139              :  * - The input string is known to be valid in the encoding (e.g., string
    2140              :  *   converted from database encoding)
    2141              :  *
    2142              :  * - The encoding is not GB18030 (e.g., when only database encodings are
    2143              :  *   passed to 'encoding' parameter)
    2144              :  *
    2145              :  * encoding==GB18030 requires examining up to two bytes to determine character
    2146              :  * length.  Therefore, callers satisfying none of those conditions must use
    2147              :  * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
    2148              :  * guaranteed to be within allocation bounds.
    2149              :  *
    2150              :  * When dealing with text that is not certainly valid in the specified
    2151              :  * encoding, the result may exceed the actual remaining string length.
    2152              :  * Callers that are not prepared to deal with that should use Min(remaining,
    2153              :  * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
    2154              :  * pg_encoding_mblen_bounded() are interchangeable.
    2155              :  */
    2156              : int
    2157     22542258 : pg_encoding_mblen(int encoding, const char *mbstr)
    2158              : {
    2159     22542258 :     return (PG_VALID_ENCODING(encoding) ?
    2160     45084516 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
    2161            0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));    /* invalid encoding: use the PG_SQL_ASCII entry */
    2162              : }
    2163              : 
    2164              : /*
    2165              :  * Returns the byte length of the multibyte character at mbstr (possibly not
    2166              :  * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
    2167              :  */
    2168              : int
    2169         3077 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
    2170              :                                 size_t remaining)
    2171              : {
    2172              :     /*
    2173              :      * Define zero remaining as too few, even for single-byte encodings.
    2174              :      * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
    2175              :      * zero; others read one.
    2176              :      */
    2177         3077 :     if (remaining < 1 ||
    2178          169 :         (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
    2179           36 :         return INT_MAX;         /* not enough bytes to decide */
    2180         3041 :     return pg_encoding_mblen(encoding, mbstr);  /* enough bytes are present for the mblen call */
    2181              : }
    2182              : 
    2183              : /*
    2184              :  * Returns the byte length of the multibyte character at mbstr; but not more
    2185              :  * than the distance to the terminating zero byte.  For input that might lack a
    2186              :  * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
    2187              :  */
    2188              : int
    2189            0 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    2190              : {
    2191            0 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));  /* strnlen() stops at a zero byte or at the mblen result, whichever comes first */
    2192              : }
    2193              : 
    2194              : /*
    2195              :  * Returns the display length of the multibyte character at mbstr, as reported by the encoding's dsplen function.
    2196              :  */
    2197              : int
    2198     22451360 : pg_encoding_dsplen(int encoding, const char *mbstr)
    2199              : {
    2200     22451360 :     return (PG_VALID_ENCODING(encoding) ?
    2201     44902720 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
    2202            0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));   /* invalid encoding: use the PG_SQL_ASCII entry */
    2203              : }
    2204              : 
    2205              : /*
    2206              :  * Verify the first multibyte character of the given string (len bytes at mbstr).
    2207              :  * Return its byte length if good, -1 if bad.  (See comments above for
    2208              :  * full details of the mbverifychar API.)
    2209              :  */
    2210              : int
    2211         4511 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    2212              : {
    2213         4511 :     return (PG_VALID_ENCODING(encoding) ?
    2214         9022 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
    2215            0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));    /* invalid encoding: use the PG_SQL_ASCII entry */
    2216              : }
    2217              : 
    2218              : /*
    2219              :  * Verify that a string (len bytes at mbstr) is valid for the given encoding.
    2220              :  * Returns the number of input bytes (<= len) that form a valid string.
    2221              :  * (See comments above for full details of the mbverifystr API.)
    2222              :  */
    2223              : int
    2224       232145 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2225              : {
    2226       232145 :     return (PG_VALID_ENCODING(encoding) ?
    2227       464290 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
    2228            0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len)); /* invalid encoding: use the PG_SQL_ASCII entry */
    2229              : }
    2230              : 
    2231              : /*
    2232              :  * Returns the maximum character length (maxmblen) recorded in pg_wchar_table for the given encoding.
    2233              :  */
    2234              : int
    2235       585469 : pg_encoding_max_length(int encoding)
    2236              : {
    2237              :     Assert(PG_VALID_ENCODING(encoding));
    2238              : 
    2239              :     /*
    2240              :      * Re-check the encoding's validity despite the assert, because some mingw
    2241              :      * versions otherwise issue bogus warnings.
    2242              :      */
    2243       585469 :     return PG_VALID_ENCODING(encoding) ?
    2244      1170938 :         pg_wchar_table[encoding].maxmblen :
    2245              :         pg_wchar_table[PG_SQL_ASCII].maxmblen;  /* invalid encoding: use the PG_SQL_ASCII entry */
    2246              : }
        

Generated by: LCOV version 2.0-1