LCOV - code coverage report
Current view: top level - src/common - wchar.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 78.7 % 868 683
Test Date: 2026-04-07 14:16:30 Functions: 82.9 % 82 68
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * wchar.c
       4              :  *    Functions for working with multibyte characters in various encodings.
       5              :  *
       6              :  * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    src/common/wchar.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : #include "c.h"
      14              : 
      15              : #include <limits.h>
      16              : 
      17              : #include "mb/pg_wchar.h"
      18              : #include "utils/ascii.h"
      19              : 
      20              : 
      21              : /*
      22              :  * In today's multibyte encodings other than UTF8, this two-byte sequence
      23              :  * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
      24              :  *
      25              :  * For historical reasons, several verifychar implementations opt to reject
      26              :  * this pair specifically.  Byte pair range constraints, in encoding
      27              :  * originator documentation, always excluded this pair.  No core conversion
      28              :  * could translate it.  However, longstanding verifychar implementations
      29              :  * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
      30              :  * pairs not valid per encoding originator documentation.  To avoid tightening
      31              :  * core or non-core conversions in a security patch, we sought this one pair.
      32              :  *
      33              :  * PQescapeString() historically used spaces for BYTE1; many other values
      34              :  * could suffice for BYTE1.
      35              :  */
       36              : #define NONUTF8_INVALID_BYTE0 (0x8d)    /* first byte of the two-byte invalid sequence */
       37              : #define NONUTF8_INVALID_BYTE1 (' ')     /* second byte of that sequence: an ASCII space */
      38              : 
      39              : 
      40              : /*
      41              :  * Operations on multi-byte encodings are driven by a table of helper
      42              :  * functions.
      43              :  *
       44              :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
      45              :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      46              :  * and wchar2mb() conversion functions.
      47              :  *
      48              :  * These functions generally assume that their input is validly formed.
      49              :  * The "verifier" functions, further down in the file, have to be more
      50              :  * paranoid.
      51              :  *
      52              :  * We expect that mblen() does not need to examine more than the first byte
      53              :  * of the character to discover the correct length.  GB18030 is an exception
      54              :  * to that rule, though, as it also looks at second byte.  But even that
      55              :  * behaves in a predictable way, if you only pass the first byte: it will
      56              :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      57              :  * good enough for all current uses.
      58              :  *
      59              :  * Note: for the display output of psql to work properly, the return values
      60              :  * of the dsplen functions must conform to the Unicode standard. In particular
      61              :  * the NUL character is zero width and control characters are generally
      62              :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      63              :  * subset to the ASCII routines to ensure consistency.
      64              :  */
      65              : 
       66              : /* No error-reporting facility.  Ignore an incomplete trailing byte sequence by leaving the enclosing loop. */
       67              : #define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break  /* bare if/break: must be used as a statement directly inside the conversion loop */
      68              : 
      69              : /*
      70              :  * SQL/ASCII
      71              :  */
       72              : static int                      /* returns the number of pg_wchars stored, not counting the terminator */
       73          430 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
       74              : {
       75          430 :     int         cnt = 0;        /* characters converted so far */
       76              : 
       77        32345 :     while (len > 0 && *from)    /* stop after "len" bytes or at the first NUL byte */
       78              :     {
       79        31915 :         *to++ = *from++;        /* each input byte is stored as one pg_wchar */
       80        31915 :         len--;
       81        31915 :         cnt++;
       82              :     }
       83          430 :     *to = 0;                    /* output is always zero-terminated; "to" needs room for it */
       84          430 :     return cnt;
       85              : }
      86              : 
       87              : static int                      /* byte length of the character at s: always 1 */
       88        18978 : pg_ascii_mblen(const unsigned char *s)
       89              : {
       90        18978 :     return 1;                   /* "s" is never dereferenced */
       91              : }
      92              : 
       93              : static int                      /* display width of the single byte at s: 0, -1 or 1 */
       94        17473 : pg_ascii_dsplen(const unsigned char *s)
       95              : {
       96        17473 :     if (*s == '\0')
       97            0 :         return 0;               /* NUL occupies no columns */
       98        17473 :     if (*s < 0x20 || *s == 0x7f)
       99            2 :         return -1;              /* control characters below 0x20, and DEL (0x7f) */
      100              : 
      101        17471 :     return 1;                   /* every other byte value counts as one column */
      102              : }
     103              : 
     104              : /*
     105              :  * EUC
     106              :  */
      107              : static int                      /* returns the number of pg_wchars stored, not counting the terminator */
      108           32 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      109              : {
      110           32 :     int         cnt = 0;        /* characters converted so far */
      111              : 
      112           48 :     while (len > 0 && *from)    /* stop at the end of input or at a NUL byte */
      113              :     {
      114           32 :         if (*from == SS2)       /* JIS X 0201 (so called "1 byte KANA") */
      115              :         {
      116            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);  /* truncated sequence: leave the loop */
      117            4 :             from++;             /* step over the SS2 byte itself */
      118            4 :             *to = (SS2 << 8) | *from++; /* SS2 in bits 8-15, payload byte in bits 0-7 */
      119            4 :             len -= 2;
      120              :         }
      121           24 :         else if (*from == SS3)  /* JIS X 0212 KANJI */
      122              :         {
      123           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);  /* truncated sequence: leave the loop */
      124            4 :             from++;             /* step over the SS3 byte itself */
      125            4 :             *to = (SS3 << 16) | (*from++ << 8); /* SS3 in bits 16-23, first payload byte in bits 8-15 */
      126            4 :             *to |= *from++;     /* second payload byte in bits 0-7 */
      127            4 :             len -= 3;
      128              :         }
      129           12 :         else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
      130              :         {
      131            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);  /* truncated sequence: leave the loop */
      132            4 :             *to = *from++ << 8; /* lead byte in bits 8-15 */
      133            4 :             *to |= *from++;     /* trail byte in bits 0-7 */
      134            4 :             len -= 2;
      135              :         }
      136              :         else                    /* must be ASCII */
      137              :         {
      138            4 :             *to = *from++;
      139            4 :             len--;
      140              :         }
      141           16 :         to++;                   /* one pg_wchar produced per input character */
      142           16 :         cnt++;
      143              :     }
      144           32 :     *to = 0;                    /* output is always zero-terminated */
      145           32 :     return cnt;
      146              : }
     147              : 
      148              : static inline int               /* byte length of the character, decided from the lead byte *s only */
      149          156 : pg_euc_mblen(const unsigned char *s)
      150              : {
      151              :     int         len;
      152              : 
      153          156 :     if (*s == SS2)
      154            0 :         len = 2;                /* SS2 plus one byte */
      155          156 :     else if (*s == SS3)
      156            0 :         len = 3;                /* SS3 plus two bytes */
      157          156 :     else if (IS_HIGHBIT_SET(*s))
      158          108 :         len = 2;                /* ordinary two-byte character */
      159              :     else
      160           48 :         len = 1;                /* none of the above: single byte */
      161          156 :     return len;
      162              : }
     163              : 
      164              : static inline int               /* display width of the character starting at s */
      165            0 : pg_euc_dsplen(const unsigned char *s)
      166              : {
      167              :     int         len;
      168              : 
      169            0 :     if (*s == SS2)
      170            0 :         len = 2;                /* SS2-prefixed characters count as two columns here */
      171            0 :     else if (*s == SS3)
      172            0 :         len = 2;                /* SS3-prefixed characters count as two columns */
      173            0 :     else if (IS_HIGHBIT_SET(*s))
      174            0 :         len = 2;                /* other multibyte characters count as two columns */
      175              :     else
      176            0 :         len = pg_ascii_dsplen(s);   /* remaining bytes use the ASCII rules (0, -1 or 1) */
      177            0 :     return len;
      178              : }
     179              : 
     180              : /*
     181              :  * EUC_JP
     182              :  */
      183              : static int                      /* EUC_JP to pg_wchar; same contract as pg_euc2wchar_with_len */
      184           32 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      185              : {
      186           32 :     return pg_euc2wchar_with_len(from, to, len);    /* the generic EUC conversion is used unchanged */
      187              : }
     188              : 
      189              : static int                      /* byte length of the EUC_JP character at s */
      190          136 : pg_eucjp_mblen(const unsigned char *s)
      191              : {
      192          136 :     return pg_euc_mblen(s);     /* the generic EUC length rules are used unchanged */
      193              : }
     194              : 
      195              : static int                      /* display width of the EUC_JP character starting at s */
      196            0 : pg_eucjp_dsplen(const unsigned char *s)
      197              : {
      198              :     int         len;
      199              : 
      200            0 :     if (*s == SS2)
      201            0 :         len = 1;                /* unlike pg_euc_dsplen, SS2 characters are one column wide */
      202            0 :     else if (*s == SS3)
      203            0 :         len = 2;                /* SS3-prefixed characters count as two columns */
      204            0 :     else if (IS_HIGHBIT_SET(*s))
      205            0 :         len = 2;                /* other multibyte characters count as two columns */
      206              :     else
      207            0 :         len = pg_ascii_dsplen(s);   /* remaining bytes use the ASCII rules (0, -1 or 1) */
      208            0 :     return len;
      209              : }
     210              : 
     211              : /*
     212              :  * EUC_KR
     213              :  */
      214              : static int                      /* EUC_KR to pg_wchar; same contract as pg_euc2wchar_with_len */
      215            0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      216              : {
      217            0 :     return pg_euc2wchar_with_len(from, to, len);    /* the generic EUC conversion is used unchanged */
      218              : }
     219              : 
      220              : static int                      /* byte length of the EUC_KR character at s */
      221            4 : pg_euckr_mblen(const unsigned char *s)
      222              : {
      223            4 :     return pg_euc_mblen(s);     /* the generic EUC length rules are used unchanged */
      224              : }
     225              : 
      226              : static int                      /* display width of the EUC_KR character at s */
      227            0 : pg_euckr_dsplen(const unsigned char *s)
      228              : {
      229            0 :     return pg_euc_dsplen(s);    /* the generic EUC width rules are used unchanged */
      230              : }
     231              : 
     232              : /*
     233              :  * EUC_CN
     234              :  *
     235              :  */
      236              : static int                      /* returns the number of pg_wchars stored, not counting the terminator */
      237           36 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      238              : {
      239           36 :     int         cnt = 0;        /* characters converted so far */
      240              : 
      241           52 :     while (len > 0 && *from)    /* stop at the end of input or at a NUL byte */
      242              :     {
      243           36 :         if (*from == SS2)       /* code set 2 (unused?) */
      244              :         {
      245           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);  /* truncated sequence: leave the loop */
      246            4 :             from++;             /* step over the SS2 byte itself */
      247            4 :             *to = (SS2 << 16) | (*from++ << 8); /* SS2 in bits 16-23, first payload byte in bits 8-15 */
      248            4 :             *to |= *from++;     /* second payload byte in bits 0-7 */
      249            4 :             len -= 3;
      250              :         }
      251           24 :         else if (*from == SS3)  /* code set 3 (unused ?) */
      252              :         {
      253           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);  /* truncated sequence: leave the loop */
      254            4 :             from++;             /* step over the SS3 byte itself */
      255            4 :             *to = (SS3 << 16) | (*from++ << 8); /* SS3 in bits 16-23, first payload byte in bits 8-15 */
      256            4 :             *to |= *from++;     /* second payload byte in bits 0-7 */
      257            4 :             len -= 3;
      258              :         }
      259           12 :         else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
      260              :         {
      261            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);  /* truncated sequence: leave the loop */
      262            4 :             *to = *from++ << 8; /* lead byte in bits 8-15 */
      263            4 :             *to |= *from++;     /* trail byte in bits 0-7 */
      264            4 :             len -= 2;
      265              :         }
      266              :         else                    /* single byte, copied as is */
      267              :         {
      268            4 :             *to = *from++;
      269            4 :             len--;
      270              :         }
      271           16 :         to++;                   /* one pg_wchar produced per input character */
      272           16 :         cnt++;
      273              :     }
      274           36 :     *to = 0;                    /* output is always zero-terminated */
      275           36 :     return cnt;
      276              : }
     277              : 
     278              : /*
     279              :  * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
     280              :  * EUC_CN), but mb2wchar_with_len does.  Tell a coherent story for code that
     281              :  * relies on agreement between mb2wchar_with_len and mblen.  Invalid text
     282              :  * datums (e.g. from shared catalogs) reach this.
     283              :  */
      284              : static int                      /* byte length of the character, decided from the lead byte *s only */
      285            4 : pg_euccn_mblen(const unsigned char *s)
      286              : {
      287              :     int         len;
      288              : 
      289            4 :     if (*s == SS2)
      290            0 :         len = 3;                /* matches the 3 bytes pg_euccn2wchar_with_len consumes for SS2 */
      291            4 :     else if (*s == SS3)
      292            0 :         len = 3;                /* matches the 3 bytes pg_euccn2wchar_with_len consumes for SS3 */
      293            4 :     else if (IS_HIGHBIT_SET(*s))
      294            4 :         len = 2;                /* ordinary two-byte character */
      295              :     else
      296            0 :         len = 1;                /* none of the above: single byte */
      297            4 :     return len;
      298              : }
     299              : 
      300              : static int                      /* display width of the EUC_CN character starting at s */
      301            0 : pg_euccn_dsplen(const unsigned char *s)
      302              : {
      303              :     int         len;
      304              : 
      305            0 :     if (IS_HIGHBIT_SET(*s))
      306            0 :         len = 2;                /* every high-bit lead byte counts as two columns; SS2/SS3 get no special case */
      307              :     else
      308            0 :         len = pg_ascii_dsplen(s);   /* remaining bytes use the ASCII rules (0, -1 or 1) */
      309            0 :     return len;
      310              : }
     311              : 
     312              : /*
     313              :  * EUC_TW
     314              :  *
     315              :  */
      316              : static int                      /* returns the number of pg_wchars stored, not counting the terminator */
      317           40 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      318              : {
      319           40 :     int         cnt = 0;        /* characters converted so far */
      320              : 
      321           56 :     while (len > 0 && *from)    /* stop at the end of input or at a NUL byte */
      322              :     {
      323           40 :         if (*from == SS2)       /* code set 2 */
      324              :         {
      325           16 :             MB2CHAR_NEED_AT_LEAST(len, 4);  /* truncated sequence: leave the loop */
      326            4 :             from++;             /* step over the SS2 byte itself */
      327            4 :             *to = (((uint32) SS2) << 24) | (*from++ << 16); /* cast keeps the shift into bit 31 unsigned */
      328            4 :             *to |= *from++ << 8;    /* third byte of the sequence in bits 8-15 */
      329            4 :             *to |= *from++;     /* fourth byte of the sequence in bits 0-7 */
      330            4 :             len -= 4;
      331              :         }
      332           24 :         else if (*from == SS3)  /* code set 3 (unused?) */
      333              :         {
      334           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);  /* truncated sequence: leave the loop */
      335            4 :             from++;             /* step over the SS3 byte itself */
      336            4 :             *to = (SS3 << 16) | (*from++ << 8); /* SS3 in bits 16-23, first payload byte in bits 8-15 */
      337            4 :             *to |= *from++;     /* second payload byte in bits 0-7 */
      338            4 :             len -= 3;
      339              :         }
      340           12 :         else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
      341              :         {
      342            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);  /* truncated sequence: leave the loop */
      343            4 :             *to = *from++ << 8; /* lead byte in bits 8-15 */
      344            4 :             *to |= *from++;     /* trail byte in bits 0-7 */
      345            4 :             len -= 2;
      346              :         }
      347              :         else                    /* single byte, copied as is */
      348              :         {
      349            4 :             *to = *from++;
      350            4 :             len--;
      351              :         }
      352           16 :         to++;                   /* one pg_wchar produced per input character */
      353           16 :         cnt++;
      354              :     }
      355           40 :     *to = 0;                    /* output is always zero-terminated */
      356           40 :     return cnt;
      357              : }
     358              : 
      359              : static int                      /* byte length of the character, decided from the lead byte *s only */
      360            4 : pg_euctw_mblen(const unsigned char *s)
      361              : {
      362              :     int         len;
      363              : 
      364            4 :     if (*s == SS2)
      365            0 :         len = 4;                /* SS2 plus three bytes, as consumed by pg_euctw2wchar_with_len */
      366            4 :     else if (*s == SS3)
      367            0 :         len = 3;                /* SS3 plus two bytes */
      368            4 :     else if (IS_HIGHBIT_SET(*s))
      369            4 :         len = 2;                /* ordinary two-byte character */
      370              :     else
      371            0 :         len = 1;                /* none of the above: single byte */
      372            4 :     return len;
      373              : }
     374              : 
      375              : static int                      /* display width of the EUC_TW character starting at s */
      376            0 : pg_euctw_dsplen(const unsigned char *s)
      377              : {
      378              :     int         len;
      379              : 
      380            0 :     if (*s == SS2)
      381            0 :         len = 2;                /* SS2-prefixed characters count as two columns */
      382            0 :     else if (*s == SS3)
      383            0 :         len = 2;                /* SS3-prefixed characters count as two columns */
      384            0 :     else if (IS_HIGHBIT_SET(*s))
      385            0 :         len = 2;                /* other multibyte characters count as two columns */
      386              :     else
      387            0 :         len = pg_ascii_dsplen(s);   /* remaining bytes use the ASCII rules (0, -1 or 1) */
      388            0 :     return len;
      389              : }
     390              : 
     391              : /*
     392              :  * Convert pg_wchar to EUC_* encoding.
     393              :  * caller must allocate enough space for "to", including a trailing zero!
     394              :  * len: length of from.
     395              :  * "from" not necessarily null terminated.
     396              :  */
      397              : static int                      /* returns the number of bytes written, not counting the terminator */
      398           48 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
      399              : {
      400           48 :     int         cnt = 0;        /* bytes emitted so far */
      401              : 
      402           96 :     while (len > 0 && *from)    /* "len" counts pg_wchars; a zero pg_wchar also ends the input */
      403              :     {
      404              :         unsigned char c;        /* most significant non-zero byte of the current pg_wchar */
      405              : 
      406           48 :         if ((c = (*from >> 24)))    /* bits 24-31 set: emit all four bytes, high to low */
      407              :         {
      408            4 :             *to++ = c;
      409            4 :             *to++ = (*from >> 16) & 0xff;
      410            4 :             *to++ = (*from >> 8) & 0xff;
      411            4 :             *to++ = *from & 0xff;
      412            4 :             cnt += 4;
      413              :         }
      414           44 :         else if ((c = (*from >> 16)))   /* bits 16-23 set: emit three bytes */
      415              :         {
      416           16 :             *to++ = c;
      417           16 :             *to++ = (*from >> 8) & 0xff;
      418           16 :             *to++ = *from & 0xff;
      419           16 :             cnt += 3;
      420              :         }
      421           28 :         else if ((c = (*from >> 8)))    /* bits 8-15 set: emit two bytes */
      422              :         {
      423           16 :             *to++ = c;
      424           16 :             *to++ = *from & 0xff;
      425           16 :             cnt += 2;
      426              :         }
      427              :         else                    /* value fits in one byte */
      428              :         {
      429           12 :             *to++ = *from;
      430           12 :             cnt++;
      431              :         }
      432           48 :         from++;                 /* advance one pg_wchar regardless of how many bytes it produced */
      433           48 :         len--;
      434              :     }
      435           48 :     *to = 0;                    /* output is always NUL-terminated */
      436           48 :     return cnt;
      437              : }
     438              : 
     439              : 
     440              : /*
     441              :  * JOHAB
     442              :  */
      443              : static int                      /* byte length of the JOHAB character at s */
      444           16 : pg_johab_mblen(const unsigned char *s)
      445              : {
      446           16 :     return pg_euc_mblen(s);     /* the generic EUC length rules are reused for JOHAB */
      447              : }
     448              : 
      449              : static int                      /* display width of the JOHAB character at s */
      450            0 : pg_johab_dsplen(const unsigned char *s)
      451              : {
      452            0 :     return pg_euc_dsplen(s);    /* the generic EUC width rules are reused for JOHAB */
      453              : }
     454              : 
     455              : /*
     456              :  * convert UTF8 string to pg_wchar (UCS-4)
     457              :  * caller must allocate enough space for "to", including a trailing zero!
     458              :  * len: length of from.
     459              :  * "from" not necessarily null terminated.
     460              :  */
      461              : static int                      /* returns the number of pg_wchars stored, not counting the terminator */
      462      6728985 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      463              : {
      464      6728985 :     int         cnt = 0;        /* characters converted so far */
      465              :     uint32      c1,             /* payload bits taken from each byte of the current sequence */
      466              :                 c2,
      467              :                 c3,
      468              :                 c4;
      469              : 
      470    106850047 :     while (len > 0 && *from)    /* stop at the end of input or at a NUL byte */
      471              :     {
      472    100121090 :         if ((*from & 0x80) == 0)    /* 0xxxxxxx: one-byte character */
      473              :         {
      474    100120411 :             *to = *from++;
      475    100120411 :             len--;
      476              :         }
      477          679 :         else if ((*from & 0xe0) == 0xc0)    /* 110xxxxx: two-byte sequence */
      478              :         {
      479          345 :             MB2CHAR_NEED_AT_LEAST(len, 2);  /* truncated sequence: leave the loop */
      480          337 :             c1 = *from++ & 0x1f;    /* 5 payload bits from the lead byte */
      481          337 :             c2 = *from++ & 0x3f;    /* 6 payload bits; the byte is masked, not validated */
      482          337 :             *to = (c1 << 6) | c2;
      483          337 :             len -= 2;
      484              :         }
      485          334 :         else if ((*from & 0xf0) == 0xe0)    /* 1110xxxx: three-byte sequence */
      486              :         {
      487          174 :             MB2CHAR_NEED_AT_LEAST(len, 3);  /* truncated sequence: leave the loop */
      488          166 :             c1 = *from++ & 0x0f;    /* 4 payload bits from the lead byte */
      489          166 :             c2 = *from++ & 0x3f;
      490          166 :             c3 = *from++ & 0x3f;
      491          166 :             *to = (c1 << 12) | (c2 << 6) | c3;
      492          166 :             len -= 3;
      493              :         }
      494          160 :         else if ((*from & 0xf8) == 0xf0)    /* 11110xxx: four-byte sequence */
      495              :         {
      496           16 :             MB2CHAR_NEED_AT_LEAST(len, 4);  /* truncated sequence: leave the loop */
      497            4 :             c1 = *from++ & 0x07;    /* 3 payload bits from the lead byte */
      498            4 :             c2 = *from++ & 0x3f;
      499            4 :             c3 = *from++ & 0x3f;
      500            4 :             c4 = *from++ & 0x3f;
      501            4 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
      502            4 :             len -= 4;
      503              :         }
      504              :         else
      505              :         {
      506              :             /* treat a bogus char as length 1; not ours to raise error */
      507          144 :             *to = *from++;      /* the raw byte value is stored unchanged */
      508          144 :             len--;
      509              :         }
      510    100121062 :         to++;                   /* one pg_wchar produced per input character */
      511    100121062 :         cnt++;
      512              :     }
      513      6728985 :     *to = 0;                    /* output is always zero-terminated */
      514      6728985 :     return cnt;
      515              : }
     516              : 
     517              : 
     518              : /*
     519              :  * Trivial conversion from pg_wchar to UTF-8.
     520              :  * caller should allocate enough space for "to"
     521              :  * len: length of from.
     522              :  * "from" not necessarily null terminated.
     523              :  */
      524              : static int                      /* returns the number of bytes written, not counting the terminator */
      525       579534 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
      526              : {
      527       579534 :     int         cnt = 0;        /* bytes emitted so far */
      528              : 
      529      8542706 :     while (len > 0 && *from)    /* "len" counts pg_wchars; a zero pg_wchar also ends the input */
      530              :     {
      531              :         int         char_len;   /* byte length of the sequence just written */
      532              : 
      533      7963172 :         unicode_to_utf8(*from, to); /* defined elsewhere; presumably writes the UTF-8 form of *from at "to" */
      534      7963172 :         char_len = pg_utf_mblen(to);    /* length is read back from the lead byte just written */
      535      7963172 :         cnt += char_len;
      536      7963172 :         to += char_len;
      537      7963172 :         from++;
      538      7963172 :         len--;
      539              :     }
      540       579534 :     *to = 0;                    /* output is always NUL-terminated; "to" needs room for it */
      541       579534 :     return cnt;
      542              : }
     543              : 
     544              : /*
     545              :  * Return the byte length of a UTF8 character pointed to by s
     546              :  *
     547              :  * Note: in the current implementation we do not support UTF8 sequences
     548              :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     549              :  * We return "1" for any leading byte that is either flat-out illegal or
     550              :  * indicates a length larger than we support.
     551              :  *
     552              :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     553              :  * other places would need to be fixed to change this.
     554              :  */
      555              : int                             /* 1..4, decided from the lead byte *s only */
      556     96456470 : pg_utf_mblen(const unsigned char *s)
      557              : {
      558              :     int         len;
      559              : 
      560     96456470 :     if ((*s & 0x80) == 0)       /* 0xxxxxxx */
      561     96351876 :         len = 1;
      562       104594 :     else if ((*s & 0xe0) == 0xc0)   /* 110xxxxx */
      563         8514 :         len = 2;
      564        96080 :     else if ((*s & 0xf0) == 0xe0)   /* 1110xxxx */
      565        69698 :         len = 3;
      566        26382 :     else if ((*s & 0xf8) == 0xf0)   /* 11110xxx */
      567        26267 :         len = 4;
      568              : #ifdef NOT_USED                 /* 5- and 6-byte forms are compiled out unless NOT_USED is defined */
      569              :     else if ((*s & 0xfc) == 0xf8)
      570              :         len = 5;
      571              :     else if ((*s & 0xfe) == 0xfc)
      572              :         len = 6;
      573              : #endif
      574              :     else
      575          115 :         len = 1;                /* any other lead byte is reported as length 1 */
      576     96456470 :     return len;
      577              : }
     578              : 
     579              : /*
     580              :  * This is an implementation of wcwidth() and wcswidth() as defined in
     581              :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     582              :  * <http://www.unix.org/online.html>
     583              :  *
     584              :  * Markus Kuhn -- 2001-09-08 -- public domain
     585              :  *
     586              :  * customised for PostgreSQL
     587              :  *
     588              :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     589              :  */
     590              : 
      591              : struct mbinterval               /* one closed range [first, last] of character codes */
      592              : {
      593              :     unsigned int first;         /* lowest code in the range (compared inclusively by mbbisearch) */
      594              :     unsigned int last;          /* highest code in the range (compared inclusively by mbbisearch) */
      595              : };
     596              : 
     597              : /* auxiliary function for binary search in interval table */
      598              : static int                      /* 1 if ucs lies inside some interval of table, else 0 */
      599     59917004 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
      600              : {
      601     59917004 :     int         min = 0;        /* search window is table[min..max]; "max" is the index of the last entry */
      602              :     int         mid;
      603              : 
      604     59917004 :     if (ucs < table[0].first || ucs > table[max].last)  /* quick reject; presumes the table is sorted ascending */
      605     59911358 :         return 0;
      606        49149 :     while (max >= min)
      607              :     {
      608        43983 :         mid = (min + max) / 2;
      609        43983 :         if (ucs > table[mid].last)
      610         9747 :             min = mid + 1;      /* ucs is above this interval: continue in the upper half */
      611        34236 :         else if (ucs < table[mid].first)
      612        33756 :             max = mid - 1;      /* ucs is below this interval: continue in the lower half */
      613              :         else
      614          480 :             return 1;           /* first <= ucs <= last */
      615              :     }
      616              : 
      617         5166 :     return 0;                   /* ucs falls in a gap between intervals */
      618              : }
     619              : 
     620              : 
     621              : /* The following functions define the column width of an ISO 10646
     622              :  * character as follows:
     623              :  *
     624              :  *    - The null character (U+0000) has a column width of 0.
     625              :  *
     626              :  *    - Other C0/C1 control characters and DEL will lead to a return
     627              :  *      value of -1.
     628              :  *
     629              :  *    - Non-spacing and enclosing combining characters (general
     630              :  *      category code Mn, Me or Cf in the Unicode database) have a
     631              :  *      column width of 0.
     632              :  *
     633              :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     634              :  *      FullWidth (F) category as defined in Unicode Technical
     635              :  *      Report #11 have a column width of 2.
     636              :  *
     637              :  *    - All remaining characters (including all printable
     638              :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     639              :  *      etc.) have a column width of 1.
     640              :  *
     641              :  * This implementation assumes that wchar_t characters are encoded
     642              :  * in ISO 10646.
     643              :  */
     644              : 
      645              : static int                      /* column width of ucs: -1, 0, 1 or 2 */
      646     29988913 : ucs_wcwidth(pg_wchar ucs)
      647              : {
      648              : #include "common/unicode_nonspacing_table.h"    /* presumably defines the "nonspacing" interval table searched below */
      649              : #include "common/unicode_east_asian_fw_table.h" /* presumably defines the "east_asian_fw" interval table searched below */
      650              : 
      651              :     /* NUL is zero width; control characters and out-of-range values are tested next */
      652     29988913 :     if (ucs == 0)
      653            0 :         return 0;
      654              : 
      655     29988913 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)  /* below 0x20, 0x7f-0x9f, or above U+10FFFF */
      656        30249 :         return -1;
      657              : 
      658              :     /*
      659              :      * binary search in table of non-spacing characters
      660              :      *
      661              :      * XXX: In the official Unicode sources, it is possible for a character to
      662              :      * be described as both non-spacing and wide at the same time. As of
      663              :      * Unicode 13.0, treating the non-spacing property as the determining
      664              :      * factor for display width leads to the correct behavior, so do that
      665              :      * search first.
      666              :      */
      667     29958664 :     if (mbbisearch(ucs, nonspacing,
      668              :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1)) /* element count - 1 = index of the last entry */
      669          324 :         return 0;
      670              : 
      671              :     /* binary search in table of wide characters */
      672     29958340 :     if (mbbisearch(ucs, east_asian_fw,
      673              :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))  /* element count - 1 = index of the last entry */
      674          156 :         return 2;
      675              : 
      676     29958184 :     return 1;                   /* everything else occupies a single column */
      677              : }
     678              : 
     679              : static int
     680     29988913 : pg_utf_dsplen(const unsigned char *s)
     681              : {
     682     29988913 :     return ucs_wcwidth(utf8_to_unicode(s));
     683              : }
     684              : 
     685              : /*
     686              :  * convert mule internal code to pg_wchar
     687              :  * caller should allocate enough space for "to"
     688              :  * len: length of from.
     689              :  * "from" not necessarily null terminated.
     690              :  */
     691              : static int
     692           24 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     693              : {
     694           24 :     int         cnt = 0;
     695              : 
     696           36 :     while (len > 0 && *from)
     697              :     {
     698           24 :         if (IS_LC1(*from))
     699              :         {
     700            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     701            4 :             *to = *from++ << 16;
     702            4 :             *to |= *from++;
     703            4 :             len -= 2;
     704              :         }
     705           16 :         else if (IS_LCPRV1(*from))
     706              :         {
     707            0 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     708            0 :             from++;
     709            0 :             *to = *from++ << 16;
     710            0 :             *to |= *from++;
     711            0 :             len -= 3;
     712              :         }
     713           16 :         else if (IS_LC2(*from))
     714              :         {
     715           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     716            4 :             *to = *from++ << 16;
     717            4 :             *to |= *from++ << 8;
     718            4 :             *to |= *from++;
     719            4 :             len -= 3;
     720              :         }
     721            4 :         else if (IS_LCPRV2(*from))
     722              :         {
     723            0 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     724            0 :             from++;
     725            0 :             *to = *from++ << 16;
     726            0 :             *to |= *from++ << 8;
     727            0 :             *to |= *from++;
     728            0 :             len -= 4;
     729              :         }
     730              :         else
     731              :         {                       /* assume ASCII */
     732            4 :             *to = (unsigned char) *from++;
     733            4 :             len--;
     734              :         }
     735           12 :         to++;
     736           12 :         cnt++;
     737              :     }
     738           24 :     *to = 0;
     739           24 :     return cnt;
     740              : }
     741              : 
     742              : /*
     743              :  * convert pg_wchar to mule internal code
     744              :  * caller should allocate enough space for "to"
     745              :  * len: length of from.
     746              :  * "from" not necessarily null terminated.
     747              :  */
     748              : static int
     749           12 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     750              : {
     751           12 :     int         cnt = 0;
     752              : 
     753           24 :     while (len > 0 && *from)
     754              :     {
     755              :         unsigned char lb;
     756              : 
     757           12 :         lb = (*from >> 16) & 0xff;
     758           12 :         if (IS_LC1(lb))
     759              :         {
     760            4 :             *to++ = lb;
     761            4 :             *to++ = *from & 0xff;
     762            4 :             cnt += 2;
     763              :         }
     764            8 :         else if (IS_LC2(lb))
     765              :         {
     766            4 :             *to++ = lb;
     767            4 :             *to++ = (*from >> 8) & 0xff;
     768            4 :             *to++ = *from & 0xff;
     769            4 :             cnt += 3;
     770              :         }
     771            4 :         else if (IS_LCPRV1_A_RANGE(lb))
     772              :         {
     773            0 :             *to++ = LCPRV1_A;
     774            0 :             *to++ = lb;
     775            0 :             *to++ = *from & 0xff;
     776            0 :             cnt += 3;
     777              :         }
     778            4 :         else if (IS_LCPRV1_B_RANGE(lb))
     779              :         {
     780            0 :             *to++ = LCPRV1_B;
     781            0 :             *to++ = lb;
     782            0 :             *to++ = *from & 0xff;
     783            0 :             cnt += 3;
     784              :         }
     785            4 :         else if (IS_LCPRV2_A_RANGE(lb))
     786              :         {
     787            0 :             *to++ = LCPRV2_A;
     788            0 :             *to++ = lb;
     789            0 :             *to++ = (*from >> 8) & 0xff;
     790            0 :             *to++ = *from & 0xff;
     791            0 :             cnt += 4;
     792              :         }
     793            4 :         else if (IS_LCPRV2_B_RANGE(lb))
     794              :         {
     795            0 :             *to++ = LCPRV2_B;
     796            0 :             *to++ = lb;
     797            0 :             *to++ = (*from >> 8) & 0xff;
     798            0 :             *to++ = *from & 0xff;
     799            0 :             cnt += 4;
     800              :         }
     801              :         else
     802              :         {
     803            4 :             *to++ = *from & 0xff;
     804            4 :             cnt += 1;
     805              :         }
     806           12 :         from++;
     807           12 :         len--;
     808              :     }
     809           12 :     *to = 0;
     810           12 :     return cnt;
     811              : }
     812              : 
     813              : /* exported for direct use by conv.c */
     814              : int
     815         2008 : pg_mule_mblen(const unsigned char *s)
     816              : {
     817              :     int         len;
     818              : 
     819         2008 :     if (IS_LC1(*s))
     820          812 :         len = 2;
     821         1196 :     else if (IS_LCPRV1(*s))
     822            0 :         len = 3;
     823         1196 :     else if (IS_LC2(*s))
     824         1140 :         len = 3;
     825           56 :     else if (IS_LCPRV2(*s))
     826           20 :         len = 4;
     827              :     else
     828           36 :         len = 1;                /* assume ASCII */
     829         2008 :     return len;
     830              : }
     831              : 
     832              : static int
     833            0 : pg_mule_dsplen(const unsigned char *s)
     834              : {
     835              :     int         len;
     836              : 
     837              :     /*
     838              :      * Note: it's not really appropriate to assume that all multibyte charsets
     839              :      * are double-wide on screen.  But this seems an okay approximation for
     840              :      * the MULE charsets we currently support.
     841              :      */
     842              : 
     843            0 :     if (IS_LC1(*s))
     844            0 :         len = 1;
     845            0 :     else if (IS_LCPRV1(*s))
     846            0 :         len = 1;
     847            0 :     else if (IS_LC2(*s))
     848            0 :         len = 2;
     849            0 :     else if (IS_LCPRV2(*s))
     850            0 :         len = 2;
     851              :     else
     852            0 :         len = 1;                /* assume ASCII */
     853              : 
     854            0 :     return len;
     855              : }
     856              : 
     857              : /*
     858              :  * ISO8859-1
     859              :  */
     860              : static int
     861          468 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     862              : {
     863          468 :     int         cnt = 0;
     864              : 
     865        13377 :     while (len > 0 && *from)
     866              :     {
     867        12909 :         *to++ = *from++;
     868        12909 :         len--;
     869        12909 :         cnt++;
     870              :     }
     871          468 :     *to = 0;
     872          468 :     return cnt;
     873              : }
     874              : 
     875              : /*
     876              :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     877              :  * high bits.
     878              :  * caller should allocate enough space for "to"
     879              :  * len: length of from.
     880              :  * "from" not necessarily null terminated.
     881              :  */
     882              : static int
     883           79 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     884              : {
     885           79 :     int         cnt = 0;
     886              : 
     887          678 :     while (len > 0 && *from)
     888              :     {
     889          599 :         *to++ = *from++;
     890          599 :         len--;
     891          599 :         cnt++;
     892              :     }
     893           79 :     *to = 0;
     894           79 :     return cnt;
     895              : }
     896              : 
     897              : static int
     898         3650 : pg_latin1_mblen(const unsigned char *s)
     899              : {
     900         3650 :     return 1;
     901              : }
     902              : 
     903              : static int
     904          400 : pg_latin1_dsplen(const unsigned char *s)
     905              : {
     906          400 :     return pg_ascii_dsplen(s);
     907              : }
     908              : 
     909              : /*
     910              :  * SJIS
     911              :  */
     912              : static int
     913         1015 : pg_sjis_mblen(const unsigned char *s)
     914              : {
     915              :     int         len;
     916              : 
     917         1015 :     if (*s >= 0xa1 && *s <= 0xdf)
     918            0 :         len = 1;                /* 1 byte kana? */
     919         1015 :     else if (IS_HIGHBIT_SET(*s))
     920          809 :         len = 2;                /* kanji? */
     921              :     else
     922          206 :         len = 1;                /* should be ASCII */
     923         1015 :     return len;
     924              : }
     925              : 
     926              : static int
     927            0 : pg_sjis_dsplen(const unsigned char *s)
     928              : {
     929              :     int         len;
     930              : 
     931            0 :     if (*s >= 0xa1 && *s <= 0xdf)
     932            0 :         len = 1;                /* 1 byte kana? */
     933            0 :     else if (IS_HIGHBIT_SET(*s))
     934            0 :         len = 2;                /* kanji? */
     935              :     else
     936            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     937            0 :     return len;
     938              : }
     939              : 
     940              : /*
     941              :  * Big5
     942              :  */
     943              : static int
     944          328 : pg_big5_mblen(const unsigned char *s)
     945              : {
     946              :     int         len;
     947              : 
     948          328 :     if (IS_HIGHBIT_SET(*s))
     949          292 :         len = 2;                /* kanji? */
     950              :     else
     951           36 :         len = 1;                /* should be ASCII */
     952          328 :     return len;
     953              : }
     954              : 
     955              : static int
     956            0 : pg_big5_dsplen(const unsigned char *s)
     957              : {
     958              :     int         len;
     959              : 
     960            0 :     if (IS_HIGHBIT_SET(*s))
     961            0 :         len = 2;                /* kanji? */
     962              :     else
     963            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     964            0 :     return len;
     965              : }
     966              : 
     967              : /*
     968              :  * GBK
     969              :  */
     970              : static int
     971          282 : pg_gbk_mblen(const unsigned char *s)
     972              : {
     973              :     int         len;
     974              : 
     975          282 :     if (IS_HIGHBIT_SET(*s))
     976          212 :         len = 2;                /* kanji? */
     977              :     else
     978           70 :         len = 1;                /* should be ASCII */
     979          282 :     return len;
     980              : }
     981              : 
     982              : static int
     983            0 : pg_gbk_dsplen(const unsigned char *s)
     984              : {
     985              :     int         len;
     986              : 
     987            0 :     if (IS_HIGHBIT_SET(*s))
     988            0 :         len = 2;                /* kanji? */
     989              :     else
     990            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     991            0 :     return len;
     992              : }
     993              : 
     994              : /*
     995              :  * UHC
     996              :  */
     997              : static int
     998           16 : pg_uhc_mblen(const unsigned char *s)
     999              : {
    1000              :     int         len;
    1001              : 
    1002           16 :     if (IS_HIGHBIT_SET(*s))
    1003           16 :         len = 2;                /* 2byte? */
    1004              :     else
    1005            0 :         len = 1;                /* should be ASCII */
    1006           16 :     return len;
    1007              : }
    1008              : 
    1009              : static int
    1010            0 : pg_uhc_dsplen(const unsigned char *s)
    1011              : {
    1012              :     int         len;
    1013              : 
    1014            0 :     if (IS_HIGHBIT_SET(*s))
    1015            0 :         len = 2;                /* 2byte? */
    1016              :     else
    1017            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1018            0 :     return len;
    1019              : }
    1020              : 
    1021              : /*
    1022              :  * GB18030
    1023              :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
    1024              :  */
    1025              : 
    1026              : /*
    1027              :  * Unlike all other mblen() functions, this also looks at the second byte of
    1028              :  * the input.  However, if you only pass the first byte of a multi-byte
    1029              :  * string, and \0 as the second byte, this still works in a predictable way:
    1030              :  * a 4-byte character will be reported as two 2-byte characters.  That's
    1031              :  * enough for all current uses, as a client-only encoding.  It works that
    1032              :  * way, because in any valid 4-byte GB18030-encoded character, the third and
    1033              :  * fourth byte look like a 2-byte encoded character, when looked at
    1034              :  * separately.
    1035              :  */
    1036              : static int
    1037          623 : pg_gb18030_mblen(const unsigned char *s)
    1038              : {
    1039              :     int         len;
    1040              : 
    1041          623 :     if (!IS_HIGHBIT_SET(*s))
    1042          348 :         len = 1;                /* ASCII */
    1043          275 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1044          114 :         len = 4;
    1045              :     else
    1046          161 :         len = 2;
    1047          623 :     return len;
    1048              : }
    1049              : 
    1050              : static int
    1051            0 : pg_gb18030_dsplen(const unsigned char *s)
    1052              : {
    1053              :     int         len;
    1054              : 
    1055            0 :     if (IS_HIGHBIT_SET(*s))
    1056            0 :         len = 2;
    1057              :     else
    1058            0 :         len = pg_ascii_dsplen(s);   /* ASCII */
    1059            0 :     return len;
    1060              : }
    1061              : 
    1062              : /*
    1063              :  *-------------------------------------------------------------------
    1064              :  * multibyte sequence validators
    1065              :  *
    1066              :  * The verifychar functions accept "s", a pointer to the first byte of a
    1067              :  * string, and "len", the remaining length of the string.  If there is a
    1068              :  * validly encoded character beginning at *s, return its length in bytes;
    1069              :  * else return -1.
    1070              :  *
    1071              :  * The verifystr functions also accept "s", a pointer to a string and "len",
    1072              :  * the length of the string.  They verify the whole string, and return the
    1073              :  * number of input bytes (<= len) that are valid.  In other words, if the
    1074              :  * whole string is valid, verifystr returns "len", otherwise it returns the
    1075              :  * byte offset of the first invalid character.  The verifystr functions must
    1076              :  * test for and reject zeroes in the input.
    1077              :  *
    1078              :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
    1079              :  * they must test for and reject zeroes in any additional bytes of a
    1080              :  * multibyte character.  Note that this definition allows the function for a
    1081              :  * single-byte encoding to be just "return 1".
    1082              :  *-------------------------------------------------------------------
    1083              :  */
    1084              : static int
    1085          161 : pg_ascii_verifychar(const unsigned char *s, int len)
    1086              : {
    1087          161 :     return 1;
    1088              : }
    1089              : 
    1090              : static int
    1091       211524 : pg_ascii_verifystr(const unsigned char *s, int len)
    1092              : {
    1093       211524 :     const unsigned char *nullpos = memchr(s, 0, len);
    1094              : 
    1095       211524 :     if (nullpos == NULL)
    1096       211524 :         return len;
    1097              :     else
    1098            0 :         return nullpos - s;
    1099              : }
    1100              : 
    1101              : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
    1102              : 
    1103              : static int
    1104          336 : pg_eucjp_verifychar(const unsigned char *s, int len)
    1105              : {
    1106              :     int         l;
    1107              :     unsigned char c1,
    1108              :                 c2;
    1109              : 
    1110          336 :     c1 = *s++;
    1111              : 
    1112          336 :     switch (c1)
    1113              :     {
    1114            0 :         case SS2:               /* JIS X 0201 */
    1115            0 :             l = 2;
    1116            0 :             if (l > len)
    1117            0 :                 return -1;
    1118            0 :             c2 = *s++;
    1119            0 :             if (c2 < 0xa1 || c2 > 0xdf)
    1120            0 :                 return -1;
    1121            0 :             break;
    1122              : 
    1123            0 :         case SS3:               /* JIS X 0212 */
    1124            0 :             l = 3;
    1125            0 :             if (l > len)
    1126            0 :                 return -1;
    1127            0 :             c2 = *s++;
    1128            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1129            0 :                 return -1;
    1130            0 :             c2 = *s++;
    1131            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1132            0 :                 return -1;
    1133            0 :             break;
    1134              : 
    1135          336 :         default:
    1136          336 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1137              :             {
    1138          336 :                 l = 2;
    1139          336 :                 if (l > len)
    1140           56 :                     return -1;
    1141          280 :                 if (!IS_EUC_RANGE_VALID(c1))
    1142           16 :                     return -1;
    1143          264 :                 c2 = *s++;
    1144          264 :                 if (!IS_EUC_RANGE_VALID(c2))
    1145          120 :                     return -1;
    1146              :             }
    1147              :             else
    1148              :                 /* must be ASCII */
    1149              :             {
    1150            0 :                 l = 1;
    1151              :             }
    1152          144 :             break;
    1153              :     }
    1154              : 
    1155          144 :     return l;
    1156              : }
    1157              : 
    1158              : static int
    1159          200 : pg_eucjp_verifystr(const unsigned char *s, int len)
    1160              : {
    1161          200 :     const unsigned char *start = s;
    1162              : 
    1163          620 :     while (len > 0)
    1164              :     {
    1165              :         int         l;
    1166              : 
    1167              :         /* fast path for ASCII-subset characters */
    1168          564 :         if (!IS_HIGHBIT_SET(*s))
    1169              :         {
    1170          396 :             if (*s == '\0')
    1171           48 :                 break;
    1172          348 :             l = 1;
    1173              :         }
    1174              :         else
    1175              :         {
    1176          168 :             l = pg_eucjp_verifychar(s, len);
    1177          168 :             if (l == -1)
    1178           96 :                 break;
    1179              :         }
    1180          420 :         s += l;
    1181          420 :         len -= l;
    1182              :     }
    1183              : 
    1184          200 :     return s - start;
    1185              : }
    1186              : 
    1187              : static int
    1188           96 : pg_euckr_verifychar(const unsigned char *s, int len)
    1189              : {
    1190              :     int         l;
    1191              :     unsigned char c1,
    1192              :                 c2;
    1193              : 
    1194           96 :     c1 = *s++;
    1195              : 
    1196           96 :     if (IS_HIGHBIT_SET(c1))
    1197              :     {
    1198           96 :         l = 2;
    1199           96 :         if (l > len)
    1200            8 :             return -1;
    1201           88 :         if (!IS_EUC_RANGE_VALID(c1))
    1202           16 :             return -1;
    1203           72 :         c2 = *s++;
    1204           72 :         if (!IS_EUC_RANGE_VALID(c2))
    1205            0 :             return -1;
    1206              :     }
    1207              :     else
    1208              :         /* must be ASCII */
    1209              :     {
    1210            0 :         l = 1;
    1211              :     }
    1212              : 
    1213           72 :     return l;
    1214              : }
    1215              : 
    1216              : static int
    1217           48 : pg_euckr_verifystr(const unsigned char *s, int len)
    1218              : {
    1219           48 :     const unsigned char *start = s;
    1220              : 
    1221          156 :     while (len > 0)
    1222              :     {
    1223              :         int         l;
    1224              : 
    1225              :         /* fast path for ASCII-subset characters */
    1226          132 :         if (!IS_HIGHBIT_SET(*s))
    1227              :         {
    1228           72 :             if (*s == '\0')
    1229            0 :                 break;
    1230           72 :             l = 1;
    1231              :         }
    1232              :         else
    1233              :         {
    1234           60 :             l = pg_euckr_verifychar(s, len);
    1235           60 :             if (l == -1)
    1236           24 :                 break;
    1237              :         }
    1238          108 :         s += l;
    1239          108 :         len -= l;
    1240              :     }
    1241              : 
    1242           48 :     return s - start;
    1243              : }
    1244              : 
    1245              : /* EUC-CN byte sequences are exactly same as EUC-KR */
    1246              : #define pg_euccn_verifychar pg_euckr_verifychar
    1247              : #define pg_euccn_verifystr  pg_euckr_verifystr
    1248              : 
    1249              : static int
    1250           12 : pg_euctw_verifychar(const unsigned char *s, int len)
    1251              : {
    1252              :     int         l;
    1253              :     unsigned char c1,
    1254              :                 c2;
    1255              : 
    1256           12 :     c1 = *s++;
    1257              : 
    1258           12 :     switch (c1)
    1259              :     {
    1260            0 :         case SS2:               /* CNS 11643 Plane 1-7 */
    1261            0 :             l = 4;
    1262            0 :             if (l > len)
    1263            0 :                 return -1;
    1264            0 :             c2 = *s++;
    1265            0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1266            0 :                 return -1;
    1267            0 :             c2 = *s++;
    1268            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1269            0 :                 return -1;
    1270            0 :             c2 = *s++;
    1271            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1272            0 :                 return -1;
    1273            0 :             break;
    1274              : 
    1275            0 :         case SS3:               /* unused */
    1276            0 :             return -1;
    1277              : 
    1278           12 :         default:
    1279           12 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1280              :             {
    1281           12 :                 l = 2;
    1282           12 :                 if (l > len)
    1283            4 :                     return -1;
    1284              :                 /* no further range check on c1? */
    1285            8 :                 c2 = *s++;
    1286            8 :                 if (!IS_EUC_RANGE_VALID(c2))
    1287            8 :                     return -1;
    1288              :             }
    1289              :             else
    1290              :                 /* must be ASCII */
    1291              :             {
    1292            0 :                 l = 1;
    1293              :             }
    1294            0 :             break;
    1295              :     }
    1296            0 :     return l;
    1297              : }
    1298              : 
    1299              : static int
    1300           24 : pg_euctw_verifystr(const unsigned char *s, int len)
    1301              : {
    1302           24 :     const unsigned char *start = s;
    1303              : 
    1304           60 :     while (len > 0)
    1305              :     {
    1306              :         int         l;
    1307              : 
    1308              :         /* fast path for ASCII-subset characters */
    1309           48 :         if (!IS_HIGHBIT_SET(*s))
    1310              :         {
    1311           36 :             if (*s == '\0')
    1312            0 :                 break;
    1313           36 :             l = 1;
    1314              :         }
    1315              :         else
    1316              :         {
    1317           12 :             l = pg_euctw_verifychar(s, len);
    1318           12 :             if (l == -1)
    1319           12 :                 break;
    1320              :         }
    1321           36 :         s += l;
    1322           36 :         len -= l;
    1323              :     }
    1324              : 
    1325           24 :     return s - start;
    1326              : }
    1327              : 
    1328              : static int
    1329           12 : pg_johab_verifychar(const unsigned char *s, int len)
    1330              : {
    1331              :     int         l,
    1332              :                 mbl;
    1333              :     unsigned char c;
    1334              : 
    1335           12 :     l = mbl = pg_johab_mblen(s);
    1336              : 
    1337           12 :     if (len < l)
    1338            4 :         return -1;
    1339              : 
    1340            8 :     if (!IS_HIGHBIT_SET(*s))
    1341            0 :         return mbl;
    1342              : 
    1343            8 :     while (--l > 0)
    1344              :     {
    1345            8 :         c = *++s;
    1346            8 :         if (!IS_EUC_RANGE_VALID(c))
    1347            8 :             return -1;
    1348              :     }
    1349            0 :     return mbl;
    1350              : }
    1351              : 
    1352              : static int
    1353           16 : pg_johab_verifystr(const unsigned char *s, int len)
    1354              : {
    1355           16 :     const unsigned char *start = s;
    1356              : 
    1357           28 :     while (len > 0)
    1358              :     {
    1359              :         int         l;
    1360              : 
    1361              :         /* fast path for ASCII-subset characters */
    1362           24 :         if (!IS_HIGHBIT_SET(*s))
    1363              :         {
    1364           12 :             if (*s == '\0')
    1365            0 :                 break;
    1366           12 :             l = 1;
    1367              :         }
    1368              :         else
    1369              :         {
    1370           12 :             l = pg_johab_verifychar(s, len);
    1371           12 :             if (l == -1)
    1372           12 :                 break;
    1373              :         }
    1374           12 :         s += l;
    1375           12 :         len -= l;
    1376              :     }
    1377              : 
    1378           16 :     return s - start;
    1379              : }
    1380              : /*
                      :  * pg_mule_verifychar
                      :  *
                      :  * Verify the single character starting at s, given len available bytes.
                      :  * The expected byte length is taken from pg_mule_mblen().  Returns that
                      :  * length on success, or -1 if fewer than that many bytes are available
                      :  * or if any byte after the first does not have its high bit set.
                      :  */
    1381              : static int
    1382          894 : pg_mule_verifychar(const unsigned char *s, int len)
    1383              : {
    1384              :     int         l,
    1385              :                 mbl;
    1386              :     unsigned char c;
    1387              :     /* l is counted down below; mbl keeps the length to return */
    1388          894 :     l = mbl = pg_mule_mblen(s);
    1389              :     /* the whole character must fit in the remaining input */
    1390          894 :     if (len < l)
    1391          227 :         return -1;
    1392              :     /* check the mbl - 1 bytes that follow the first byte */
    1393         1351 :     while (--l > 0)
    1394              :     {
    1395          895 :         c = *++s;
    1396          895 :         if (!IS_HIGHBIT_SET(c))
    1397          211 :             return -1;
    1398              :     }
    1399          456 :     return mbl;
    1400              : }
    1401              : /*
                      :  * pg_mule_verifystr
                      :  *
                      :  * Walk the buffer s (len bytes) one character at a time and return the
                      :  * number of leading bytes that verified successfully.  The walk stops
                      :  * early at the first NUL byte, or at the first high-bit lead byte for
                      :  * which pg_mule_verifychar() returns -1, so a result smaller than len
                      :  * is the offset at which verification stopped.
                      :  */
    1402              : static int
    1403          285 : pg_mule_verifystr(const unsigned char *s, int len)
    1404              : {
    1405          285 :     const unsigned char *start = s;
    1406              : 
    1407          825 :     while (len > 0)
    1408              :     {
    1409              :         int         l;      /* byte length of the character at s */
    1410              : 
    1411              :         /* fast path for ASCII-subset characters */
    1412          714 :         if (!IS_HIGHBIT_SET(*s))
    1413              :         {
    1414          432 :             if (*s == '\0')
    1415           24 :                 break;
    1416          408 :             l = 1;
    1417              :         }
    1418              :         else
    1419              :         {
    1420          282 :             l = pg_mule_verifychar(s, len);
    1421          282 :             if (l == -1)
    1422          150 :                 break;
    1423              :         }
    1424          540 :         s += l;
    1425          540 :         len -= l;
    1426              :     }
    1427              :     /* s now points at the first unverified byte, or at the end of input */
    1428          285 :     return s - start;
    1429              : }
    1430              : /*
                      :  * pg_latin1_verifychar
                      :  *
                      :  * Always reports a one-byte character: returns 1 without examining
                      :  * either s or len.
                      :  */
    1431              : static int
    1432         3223 : pg_latin1_verifychar(const unsigned char *s, int len)
    1433              : {
    1434         3223 :     return 1;
    1435              : }
    1436              : /*
                      :  * pg_latin1_verifystr
                      :  *
                      :  * Return the number of bytes that precede the first zero byte in the
                      :  * len-byte buffer s, or len when the buffer contains no zero byte.  No
                      :  * other byte value is rejected here.
                      :  */
    1437              : static int
    1438         5247 : pg_latin1_verifystr(const unsigned char *s, int len)
    1439              : {
    1440         5247 :     const unsigned char *nullpos = memchr(s, 0, len);
    1441              :     /* memchr() yields NULL when no zero byte occurs within len bytes */
    1442         5247 :     if (nullpos == NULL)
    1443         5175 :         return len;
    1444              :     else
    1445           72 :         return nullpos - s;
    1446              : }
    1447              : /*
                      :  * pg_sjis_verifychar
                      :  *
                      :  * Verify the single character starting at s, given len available bytes.
                      :  * The expected byte length is taken from pg_sjis_mblen().  Returns -1 if
                      :  * fewer than that many bytes are available.  A length of 1 is accepted
                      :  * as is; otherwise the first byte must satisfy ISSJISHEAD() and the
                      :  * second byte ISSJISTAIL().  Returns the length on success, else -1.
                      :  */
    1448              : static int
    1449          624 : pg_sjis_verifychar(const unsigned char *s, int len)
    1450              : {
    1451              :     int         l,
    1452              :                 mbl;
    1453              :     unsigned char c1,
    1454              :                 c2;
    1455              : 
    1456          624 :     l = mbl = pg_sjis_mblen(s);
    1457              :     /* the whole character must fit in the remaining input */
    1458          624 :     if (len < l)
    1459           86 :         return -1;
    1460              : 
    1461          538 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1462            0 :         return mbl;
    1463              :     /* only the first two bytes are inspected from here on */
    1464          538 :     c1 = *s++;
    1465          538 :     c2 = *s;
    1466          538 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1467          214 :         return -1;
    1468          324 :     return mbl;
    1469              : }
    1470              : /*
                      :  * pg_sjis_verifystr
                      :  *
                      :  * Walk the buffer s (len bytes) one character at a time and return the
                      :  * number of leading bytes that verified successfully.  The walk stops
                      :  * early at the first NUL byte, or at the first high-bit lead byte for
                      :  * which pg_sjis_verifychar() returns -1, so a result smaller than len
                      :  * is the offset at which verification stopped.
                      :  */
    1471              : static int
    1472          326 : pg_sjis_verifystr(const unsigned char *s, int len)
    1473              : {
    1474          326 :     const unsigned char *start = s;
    1475              : 
    1476         1249 :     while (len > 0)
    1477              :     {
    1478              :         int         l;      /* byte length of the character at s */
    1479              : 
    1480              :         /* fast path for ASCII-subset characters */
    1481         1119 :         if (!IS_HIGHBIT_SET(*s))
    1482              :         {
    1483          827 :             if (*s == '\0')
    1484           48 :                 break;
    1485          779 :             l = 1;
    1486              :         }
    1487              :         else
    1488              :         {
    1489          292 :             l = pg_sjis_verifychar(s, len);
    1490          292 :             if (l == -1)
    1491          148 :                 break;
    1492              :         }
    1493          923 :         s += l;
    1494          923 :         len -= l;
    1495              :     }
    1496              :     /* s now points at the first unverified byte, or at the end of input */
    1497          326 :     return s - start;
    1498              : }
    1499              : /*
                      :  * pg_big5_verifychar
                      :  *
                      :  * Verify the single character starting at s, given len available bytes.
                      :  * The expected byte length is taken from pg_big5_mblen().  Returns -1 if
                      :  * fewer than that many bytes are available, if a two-byte character is
                      :  * exactly the pair NONUTF8_INVALID_BYTE0, NONUTF8_INVALID_BYTE1, or if
                      :  * any byte after the first is NUL.  Otherwise returns the length.
                      :  */
    1500              : static int
    1501          240 : pg_big5_verifychar(const unsigned char *s, int len)
    1502              : {
    1503              :     int         l,
    1504              :                 mbl;
    1505              : 
    1506          240 :     l = mbl = pg_big5_mblen(s);
    1507              :     /* the whole character must fit in the remaining input */
    1508          240 :     if (len < l)
    1509            4 :         return -1;
    1510              :     /* explicitly reject the designated always-invalid byte pair */
    1511          236 :     if (l == 2 &&
    1512          236 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1513            8 :         s[1] == NONUTF8_INVALID_BYTE1)
    1514            8 :         return -1;
    1515              :     /* trailing bytes only need to be non-NUL */
    1516          384 :     while (--l > 0)
    1517              :     {
    1518          228 :         if (*++s == '\0')
    1519           72 :             return -1;
    1520              :     }
    1521              : 
    1522          156 :     return mbl;
    1523              : }
    1524              : /*
                      :  * pg_big5_verifystr
                      :  *
                      :  * Walk the buffer s (len bytes) one character at a time and return the
                      :  * number of leading bytes that verified successfully.  The walk stops
                      :  * early at the first NUL byte, or at the first high-bit lead byte for
                      :  * which pg_big5_verifychar() returns -1, so a result smaller than len
                      :  * is the offset at which verification stopped.
                      :  */
    1525              : static int
    1526          108 : pg_big5_verifystr(const unsigned char *s, int len)
    1527              : {
    1528          108 :     const unsigned char *start = s;
    1529              : 
    1530          444 :     while (len > 0)
    1531              :     {
    1532              :         int         l;      /* byte length of the character at s */
    1533              : 
    1534              :         /* fast path for ASCII-subset characters */
    1535          396 :         if (!IS_HIGHBIT_SET(*s))
    1536              :         {
    1537          312 :             if (*s == '\0')
    1538           24 :                 break;
    1539          288 :             l = 1;
    1540              :         }
    1541              :         else
    1542              :         {
    1543           84 :             l = pg_big5_verifychar(s, len);
    1544           84 :             if (l == -1)
    1545           36 :                 break;
    1546              :         }
    1547          336 :         s += l;
    1548          336 :         len -= l;
    1549              :     }
    1550              :     /* s now points at the first unverified byte, or at the end of input */
    1551          108 :     return s - start;
    1552              : }
    1553              : /*
                      :  * pg_gbk_verifychar
                      :  *
                      :  * Verify the single character starting at s, given len available bytes.
                      :  * The expected byte length is taken from pg_gbk_mblen().  Returns -1 if
                      :  * fewer than that many bytes are available, if a two-byte character is
                      :  * exactly the pair NONUTF8_INVALID_BYTE0, NONUTF8_INVALID_BYTE1, or if
                      :  * any byte after the first is NUL.  Otherwise returns the length.
                      :  */
    1554              : static int
    1555          140 : pg_gbk_verifychar(const unsigned char *s, int len)
    1556              : {
    1557              :     int         l,
    1558              :                 mbl;
    1559              : 
    1560          140 :     l = mbl = pg_gbk_mblen(s);
    1561              :     /* the whole character must fit in the remaining input */
    1562          140 :     if (len < l)
    1563           28 :         return -1;
    1564              :     /* explicitly reject the designated always-invalid byte pair */
    1565          112 :     if (l == 2 &&
    1566          112 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1567           16 :         s[1] == NONUTF8_INVALID_BYTE1)
    1568           16 :         return -1;
    1569              :     /* trailing bytes only need to be non-NUL */
    1570          192 :     while (--l > 0)
    1571              :     {
    1572           96 :         if (*++s == '\0')
    1573            0 :             return -1;
    1574              :     }
    1575              : 
    1576           96 :     return mbl;
    1577              : }
    1578              : /*
                      :  * pg_gbk_verifystr
                      :  *
                      :  * Walk the buffer s (len bytes) one character at a time and return the
                      :  * number of leading bytes that verified successfully.  The walk stops
                      :  * early at the first NUL byte, or at the first high-bit lead byte for
                      :  * which pg_gbk_verifychar() returns -1, so a result smaller than len
                      :  * is the offset at which verification stopped.
                      :  */
    1579              : static int
    1580          132 : pg_gbk_verifystr(const unsigned char *s, int len)
    1581              : {
    1582          132 :     const unsigned char *start = s;
    1583              : 
    1584          336 :     while (len > 0)
    1585              :     {
    1586              :         int         l;      /* byte length of the character at s */
    1587              : 
    1588              :         /* fast path for ASCII-subset characters */
    1589          248 :         if (!IS_HIGHBIT_SET(*s))
    1590              :         {
    1591          124 :             if (*s == '\0')
    1592            0 :                 break;
    1593          124 :             l = 1;
    1594              :         }
    1595              :         else
    1596              :         {
    1597          124 :             l = pg_gbk_verifychar(s, len);
    1598          124 :             if (l == -1)
    1599           44 :                 break;
    1600              :         }
    1601          204 :         s += l;
    1602          204 :         len -= l;
    1603              :     }
    1604              :     /* s now points at the first unverified byte, or at the end of input */
    1605          132 :     return s - start;
    1606              : }
    1607              : /*
                      :  * pg_uhc_verifychar
                      :  *
                      :  * Verify the single character starting at s, given len available bytes.
                      :  * The expected byte length is taken from pg_uhc_mblen().  Returns -1 if
                      :  * fewer than that many bytes are available, if a two-byte character is
                      :  * exactly the pair NONUTF8_INVALID_BYTE0, NONUTF8_INVALID_BYTE1, or if
                      :  * any byte after the first is NUL.  Otherwise returns the length.
                      :  */
    1608              : static int
    1609           12 : pg_uhc_verifychar(const unsigned char *s, int len)
    1610              : {
    1611              :     int         l,
    1612              :                 mbl;
    1613              : 
    1614           12 :     l = mbl = pg_uhc_mblen(s);
    1615              :     /* the whole character must fit in the remaining input */
    1616           12 :     if (len < l)
    1617            4 :         return -1;
    1618              :     /* explicitly reject the designated always-invalid byte pair */
    1619            8 :     if (l == 2 &&
    1620            8 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1621            8 :         s[1] == NONUTF8_INVALID_BYTE1)
    1622            8 :         return -1;
    1623              :     /* trailing bytes only need to be non-NUL */
    1624            0 :     while (--l > 0)
    1625              :     {
    1626            0 :         if (*++s == '\0')
    1627            0 :             return -1;
    1628              :     }
    1629              : 
    1630            0 :     return mbl;
    1631              : }
    1632              : /*
                      :  * pg_uhc_verifystr
                      :  *
                      :  * Walk the buffer s (len bytes) one character at a time and return the
                      :  * number of leading bytes that verified successfully.  The walk stops
                      :  * early at the first NUL byte, or at the first high-bit lead byte for
                      :  * which pg_uhc_verifychar() returns -1, so a result smaller than len
                      :  * is the offset at which verification stopped.
                      :  */
    1633              : static int
    1634           16 : pg_uhc_verifystr(const unsigned char *s, int len)
    1635              : {
    1636           16 :     const unsigned char *start = s;
    1637              : 
    1638           28 :     while (len > 0)
    1639              :     {
    1640              :         int         l;      /* byte length of the character at s */
    1641              : 
    1642              :         /* fast path for ASCII-subset characters */
    1643           24 :         if (!IS_HIGHBIT_SET(*s))
    1644              :         {
    1645           12 :             if (*s == '\0')
    1646            0 :                 break;
    1647           12 :             l = 1;
    1648              :         }
    1649              :         else
    1650              :         {
    1651           12 :             l = pg_uhc_verifychar(s, len);
    1652           12 :             if (l == -1)
    1653           12 :                 break;
    1654              :         }
    1655           12 :         s += l;
    1656           12 :         len -= l;
    1657              :     }
    1658              :     /* s now points at the first unverified byte, or at the end of input */
    1659           16 :     return s - start;
    1660              : }
    1661              : /*
                      :  * pg_gb18030_verifychar
                      :  *
                      :  * Verify the single character starting at s, given len available bytes,
                      :  * and return its byte length (1, 2 or 4) or -1 if it is not acceptable.
                      :  *
                      :  * The form is chosen from the first two bytes:
                      :  *   - high bit of the first byte clear: one byte;
                      :  *   - at least 4 bytes available and second byte in 0x30..0x39: four-byte
                      :  *     form, requiring bytes 1 and 3 in 0x81..0xfe and byte 4 in 0x30..0x39;
                      :  *   - at least 2 bytes available and first byte in 0x81..0xfe: two-byte
                      :  *     form, requiring the second byte in 0x40..0x7e or 0x80..0xfe.
                      :  * A four-byte form with fewer than 4 bytes available falls through to the
                      :  * two-byte test, where its 0x30..0x39 second byte is rejected.
                      :  */
    1662              : static int
    1663          698 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1664              : {
    1665              :     int         l;
    1666              : 
    1667          698 :     if (!IS_HIGHBIT_SET(*s))
    1668            0 :         l = 1;                  /* ASCII */
    1669          698 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1670              :     {
    1671              :         /* Should be 4-byte, validate remaining bytes */
    1672          210 :         if (*s >= 0x81 && *s <= 0xfe &&
    1673          204 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1674          204 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1675          108 :             l = 4;
    1676              :         else
    1677          102 :             l = -1;
    1678              :     }
    1679          488 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1680              :     {
    1681              :         /* Should be 2-byte, validate */
    1682          358 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1683          238 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1684          176 :             l = 2;
    1685              :         else
    1686          182 :             l = -1;
    1687              :     }
    1688              :     else
    1689          130 :         l = -1;                 /* too few bytes, or lead byte is 0x80 or 0xff */
    1690          698 :     return l;
    1691              : }
    1692              : /*
                      :  * pg_gb18030_verifystr
                      :  *
                      :  * Walk the buffer s (len bytes) one character at a time and return the
                      :  * number of leading bytes that verified successfully.  The walk stops
                      :  * early at the first NUL byte, or at the first high-bit lead byte for
                      :  * which pg_gb18030_verifychar() returns -1, so a result smaller than len
                      :  * is the offset at which verification stopped.
                      :  */
    1693              : static int
    1694          500 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1695              : {
    1696          500 :     const unsigned char *start = s;
    1697              : 
    1698         1679 :     while (len > 0)
    1699              :     {
    1700              :         int         l;      /* byte length of the character at s */
    1701              : 
    1702              :         /* fast path for ASCII-subset characters */
    1703         1515 :         if (!IS_HIGHBIT_SET(*s))
    1704              :         {
    1705         1037 :             if (*s == '\0')
    1706           30 :                 break;
    1707         1007 :             l = 1;
    1708              :         }
    1709              :         else
    1710              :         {
    1711          478 :             l = pg_gb18030_verifychar(s, len);
    1712          478 :             if (l == -1)
    1713          306 :                 break;
    1714              :         }
    1715         1179 :         s += l;
    1716         1179 :         len -= l;
    1717              :     }
    1718              :     /* s now points at the first unverified byte, or at the end of input */
    1719          500 :     return s - start;
    1720              : }
    1721              : /*
                      :  * pg_utf8_verifychar
                      :  *
                      :  * Verify the single character starting at s, given len available bytes.
                      :  * A byte with the high bit clear is a one-byte character, except that
                      :  * NUL is rejected.  Otherwise the sequence length (2, 3 or 4) is derived
                      :  * from the high bits of the lead byte; any other high-bit byte is given
                      :  * length 1 and left for pg_utf8_islegal() to judge.  Returns the length
                      :  * on success, or -1 if the character is NUL, does not fit in len bytes,
                      :  * or is rejected by pg_utf8_islegal().
                      :  */
    1722              : static int
    1723         9451 : pg_utf8_verifychar(const unsigned char *s, int len)
    1724              : {
    1725              :     int         l;
    1726              : 
    1727         9451 :     if ((*s & 0x80) == 0)
    1728              :     {
    1729            0 :         if (*s == '\0')
    1730            0 :             return -1;
    1731            0 :         return 1;
    1732              :     }
    1733         9451 :     else if ((*s & 0xe0) == 0xc0)
    1734         3331 :         l = 2;
    1735         6120 :     else if ((*s & 0xf0) == 0xe0)
    1736         3412 :         l = 3;
    1737         2708 :     else if ((*s & 0xf8) == 0xf0)
    1738         2532 :         l = 4;
    1739              :     else
    1740          176 :         l = 1;
    1741              :     /* the whole sequence must fit in the remaining input */
    1742         9451 :     if (l > len)
    1743          320 :         return -1;
    1744              :     /* byte-range checks are delegated to pg_utf8_islegal() */
    1745         9131 :     if (!pg_utf8_islegal(s, l))
    1746         1486 :         return -1;
    1747              : 
    1748         7645 :     return l;
    1749              : }
    1750              : 
    1751              : /*
    1752              :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1753              :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1754              :  * input byte and current state are used to compute an index into an array of
    1755              :  * state transitions. Since the address of the next transition is dependent
    1756              :  * on this computation, there is latency in executing the load instruction,
    1757              :  * and the CPU is not kept busy.
    1758              :  *
    1759              :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1760              :  *
    1761              :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1762              :  *
    1763              :  * In a shift-based DFA, the input byte is an index into an array of integers
    1764              :  * whose bit pattern encodes the state transitions. To compute the next
    1765              :  * state, we simply right-shift the integer by the current state and apply a
    1766              :  * mask. In this scheme, the address of the transition only depends on the
    1767              :  * input byte, so there is better pipelining.
    1768              :  *
    1769              :  * The naming convention for states and transitions was adopted from a UTF-8
    1770              :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1771              :  *
    1772              :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1773              :  *
    1774              :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1775              :  * ==========================================================================
    1776              :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1777              :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1778              :  *                                                                  |
    1779              :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1780              :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1781              :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1782              :  *                                                                  |
    1783              :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1784              :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1785              :  *                                                                  |
    1786              :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1787              :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1788              :  *
    1789              :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1790              :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1791              :  * it's possible to find state numbers such that the transitions fit within
    1792              :  * 32-bit integers, as Dougall Johnson demonstrated:
    1793              :  *
    1794              :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1795              :  *
    1796              :  * This packed representation is the reason for the seemingly odd choice of
    1797              :  * state values below.
    1798              :  */
    1799              : 
    1800              : /* Error; a state's value is also the bit offset used to look it up */
    1801              : #define ERR  0
    1802              : /* Begin */
    1803              : #define BGN 11
    1804              : /* Continuation states, expect 1/2/3 continuation bytes */
    1805              : #define CS1 16
    1806              : #define CS2  1
    1807              : #define CS3  5
    1808              : /* Partial states, where the first continuation byte has a restricted range */
    1809              : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
    1810              : #define P3B 20                  /* Lead was ED, check for surrogate */
    1811              : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
    1812              : #define P4B 30                  /* Lead was F4, check for too-large */
    1813              : /* Begin and End are the same state */
    1814              : #define END BGN
    1815              : 
    1816              : /* the encoded state transitions for the lookup table: (NEXT << CUR) stores state NEXT at bit offset CUR */
    1817              : 
    1818              : /* ASCII */
    1819              : #define ASC (END << BGN)
    1820              : /* 2-byte lead */
    1821              : #define L2A (CS1 << BGN)
    1822              : /* 3-byte lead */
    1823              : #define L3A (P3A << BGN)
    1824              : #define L3B (CS2 << BGN)
    1825              : #define L3C (P3B << BGN)
    1826              : /* 4-byte lead */
    1827              : #define L4A (P4A << BGN)
    1828              : #define L4B (CS3 << BGN)
    1829              : #define L4C (P4B << BGN)
    1830              : /* continuation byte; these expansions have no outer parentheses, so use them only as whole initializers */
    1831              : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1832              : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1833              : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1834              : /* invalid byte */
    1835              : #define ILL ERR
    1836              : /*
                      :  * Utf8Transition
                      :  *
                      :  * Lookup table indexed by input byte value (0x00..0xFF).  Each entry is
                      :  * one of the transition words defined by the macros above; utf8_advance()
                      :  * right-shifts the entry by the current state and keeps the low five
                      :  * bits to obtain the next state.
                      :  */
    1837              : static const uint32 Utf8Transition[256] =
    1838              : {
    1839              :     /* ASCII (00..7F); the entry for 0x00 is ILL rather than ASC */
    1840              : 
    1841              :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1842              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1843              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1844              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1845              : 
    1846              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1847              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1848              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1849              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1850              : 
    1851              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1852              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1853              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1854              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1855              : 
    1856              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1857              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1858              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1859              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1860              : 
    1861              :     /* continuation bytes */
    1862              : 
    1863              :     /* 80..8F */
    1864              :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1865              :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1866              : 
    1867              :     /* 90..9F */
    1868              :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1869              :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1870              : 
    1871              :     /* A0..BF */
    1872              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1873              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1874              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1875              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1876              : 
    1877              :     /* leading bytes */
    1878              : 
    1879              :     /* C0..DF; C0 and C1 are ILL */
    1880              :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1881              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1882              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1883              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1884              : 
    1885              :     /* E0..EF; E0 is L3A and ED is L3C, the rest are L3B */
    1886              :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1887              :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1888              : 
    1889              :     /* F0..FF; F0 is L4A, F1..F3 are L4B, F4 is L4C, F5..FF are ILL */
    1890              :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1891              :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1892              : };
    1893              : /*
                      :  * utf8_advance
                      :  *
                      :  * Feed len bytes starting at s through the Utf8Transition table,
                      :  * updating *state in place.  Each step replaces *state with the table
                      :  * entry for the current byte shifted right by the low five bits of the
                      :  * previous state; bits above the low five are left in place during the
                      :  * loop and masked off once at the end, so on return *state holds a plain
                      :  * state number.  Nothing is returned and s itself is not modified for
                      :  * the caller.
                      :  */
    1894              : static void
    1895         1147 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1896              : {
    1897              :     /* Note: We deliberately don't check the state's value here. */
    1898        37851 :     while (len > 0)
    1899              :     {
    1900              :         /*
    1901              :          * It's important that the mask value is 31: In most instruction sets,
    1902              :          * a shift by a 32-bit operand is understood to be a shift by its mod
    1903              :          * 32, so the compiler should elide the mask operation.
    1904              :          */
    1905        36704 :         *state = Utf8Transition[*s++] >> (*state & 31);
    1906        36704 :         len--;
    1907              :     }
    1908              :     /* discard the leftover high bits so callers can compare against state names */
    1909         1147 :     *state &= 31;
    1910         1147 : }
    1911              : /*
                      :  * pg_utf8_verifystr
                      :  *
                      :  * Return the number of leading bytes of the len-byte buffer s that
                      :  * verified successfully.
                      :  *
                      :  * Inputs of at least STRIDE_LENGTH bytes are first scanned in
                      :  * STRIDE_LENGTH-sized chunks: a chunk is skipped when the state is END
                      :  * and is_valid_ascii() accepts it, and is otherwise run through
                      :  * utf8_advance().  If that scan ends in ERR, checking restarts from the
                      :  * beginning; if it ends inside a multibyte sequence, s is moved back to
                      :  * that sequence's lead byte.  The byte-wise loop at the bottom then
                      :  * handles whatever remains, stopping at the first NUL byte or the first
                      :  * character rejected by pg_utf8_verifychar().
                      :  */
    1912              : static int
    1913       708768 : pg_utf8_verifystr(const unsigned char *s, int len)
    1914              : {
    1915       708768 :     const unsigned char *start = s;
    1916       708768 :     const int   orig_len = len;
    1917       708768 :     uint32      state = BGN;
    1918              : 
    1919              : /*
    1920              :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1921              :  * the compiler can unroll a longer loop, it's not worth it because we
    1922              :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1923              :  */
    1924              : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1925              : 
    1926       708768 :     if (len >= STRIDE_LENGTH)
    1927              :     {
    1928      2650897 :         while (len >= STRIDE_LENGTH)
    1929              :         {
    1930              :             /*
    1931              :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1932              :              * but we must first check for a non-END state, which means the
    1933              :              * previous chunk ended in the middle of a multibyte sequence.
    1934              :              */
    1935      2289726 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1936         1147 :                 utf8_advance(s, &state, STRIDE_LENGTH);
    1937              : 
    1938      2289726 :             s += STRIDE_LENGTH;
    1939      2289726 :             len -= STRIDE_LENGTH;
    1940              :         }
    1941              : 
    1942              :         /* The error state persists, so we only need to check for it here. */
    1943       361171 :         if (state == ERR)
    1944              :         {
    1945              :             /*
    1946              :              * Start over from the beginning with the slow path so we can
    1947              :              * count the valid bytes.
    1948              :              */
    1949          336 :             len = orig_len;
    1950          336 :             s = start;
    1951              :         }
    1952       360835 :         else if (state != END)
    1953              :         {
    1954              :             /*
    1955              :              * The fast path exited in the middle of a multibyte sequence.
    1956              :              * Walk backwards to find the leading byte so that the slow path
    1957              :              * can resume checking from there. We must always backtrack at
    1958              :              * least one byte, since the current byte could be e.g. an ASCII
    1959              :              * byte after a 2-byte lead, which is invalid.
    1960              :              */
    1961              :             do
    1962              :             {
    1963              :                 Assert(s > start);
    1964           73 :                 s--;
    1965           73 :                 len++;
    1966              :                 Assert(IS_HIGHBIT_SET(*s));
    1967           73 :             } while (pg_utf_mblen(s) <= 1);
    1968              :         }
    1969              :     }
    1970              : 
    1971              :     /* check remaining bytes */
    1972     10599434 :     while (len > 0)
    1973              :     {
    1974              :         int         l;      /* byte length of the character at s */
    1975              : 
    1976              :         /* fast path for ASCII-subset characters */
    1977      9892572 :         if (!IS_HIGHBIT_SET(*s))
    1978              :         {
    1979      9883157 :             if (*s == '\0')
    1980          132 :                 break;
    1981      9883025 :             l = 1;
    1982              :         }
    1983              :         else
    1984              :         {
    1985         9415 :             l = pg_utf8_verifychar(s, len);
    1986         9415 :             if (l == -1)
    1987         1774 :                 break;
    1988              :         }
    1989      9890666 :         s += l;
    1990      9890666 :         len -= l;
    1991              :     }
    1992              :     /* s now points at the first unverified byte, or at the end of input */
    1993       708768 :     return s - start;
    1994              : }
    1995              : 
    1996              : /*
    1997              :  * Check for validity of a single UTF-8 encoded character
    1998              :  *
    1999              :  * This directly implements the rules in RFC3629.  The bizarre-looking
    2000              :  * restrictions on the second byte are meant to ensure that there isn't
    2001              :  * more than one encoding of a given Unicode code point; that is,
    2002              :  * you may not use a longer-than-necessary byte sequence with high order
    2003              :  * zero bits to represent a character that would fit in fewer bytes.
    2004              :  * To do otherwise is to create security hazards (eg, create an apparent
    2005              :  * non-ASCII character that decodes to plain ASCII).
    2006              :  *
    2007              :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    2008              :  * caller must have checked that that many bytes are present in the buffer.
                      :  *
                      :  * Returns true if the sequence is legal, false otherwise.  The switch is
                      :  * entered at the case matching length and falls through toward case 1,
                      :  * so bytes are examined from the last one back to the first.
    2009              :  */
    2010              : bool
    2011        16172 : pg_utf8_islegal(const unsigned char *source, int length)
    2012              : {
    2013              :     unsigned char a;
    2014              : 
    2015        16172 :     switch (length)
    2016              :     {
    2017            0 :         default:
    2018              :             /* reject lengths 5 and 6 for now */
    2019            0 :             return false;
    2020         2396 :         case 4:                 /* fourth byte must be in 0x80..0xBF */
    2021         2396 :             a = source[3];
    2022         2396 :             if (a < 0x80 || a > 0xBF)
    2023          198 :                 return false;
    2024              :             pg_fallthrough;
    2025              :         case 3:                 /* third byte must be in 0x80..0xBF */
    2026         6619 :             a = source[2];
    2027         6619 :             if (a < 0x80 || a > 0xBF)
    2028          440 :                 return false;
    2029              :             pg_fallthrough;
    2030              :         case 2:                 /* second byte: allowed range depends on the lead byte */
    2031         9822 :             a = source[1];
    2032         9822 :             switch (*source)
    2033              :             {
    2034          208 :                 case 0xE0:
    2035          208 :                     if (a < 0xA0 || a > 0xBF)
    2036          176 :                         return false;
    2037           32 :                     break;
    2038          208 :                 case 0xED:
    2039          208 :                     if (a < 0x80 || a > 0x9F)
    2040          176 :                         return false;
    2041           32 :                     break;
    2042         2078 :                 case 0xF0:
    2043         2078 :                     if (a < 0x90 || a > 0xBF)
    2044          176 :                         return false;
    2045         1902 :                     break;
    2046          120 :                 case 0xF4:
    2047          120 :                     if (a < 0x80 || a > 0x8F)
    2048           88 :                         return false;
    2049           32 :                     break;
    2050         7208 :                 default:
    2051         7208 :                     if (a < 0x80 || a > 0xBF)
    2052          168 :                         return false;
    2053         7040 :                     break;
    2054              :             }
    2055              :             pg_fallthrough;
    2056              :         case 1:                 /* first byte: reject 0x80..0xC1 and anything above 0xF4 */
    2057        14750 :             a = *source;
    2058        14750 :             if (a >= 0x80 && a < 0xC2)
    2059          264 :                 return false;
    2060        14486 :             if (a > 0xF4)
    2061           88 :                 return false;
    2062        14398 :             break;
    2063              :     }
    2064        14398 :     return true;
    2065              : }
    2066              : 
    2067              : 
    2068              : /*
    2069              :  * Fills the provided buffer with two bytes such that:
    2070              :  *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
                      :  *
                      :  * encoding must have a maximum character length above one byte (this is
                      :  * asserted), and dst must have room for two bytes.  Exactly dst[0] and
                      :  * dst[1] are written; no terminating zero byte is added.
    2071              :  */
    2072              : void
    2073          220 : pg_encoding_set_invalid(int encoding, char *dst)
    2074              : {
    2075              :     Assert(pg_encoding_max_length(encoding) > 1);
    2076              : 
    2077          220 :     dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);  /* UTF8 gets its own first byte; all other encodings share one */
    2078          220 :     dst[1] = NONUTF8_INVALID_BYTE1;     /* the second byte is the same for every encoding */
    2079          220 : }
    2080              : 
    2081              : /*
    2082              :  *-------------------------------------------------------------------
    2083              :  * encoding info table
                      :  *
                      :  * One entry per encoding ID, placed by designated initializer.  Judging
                      :  * by the routine names and by the accessors used on this table, each
                      :  * entry lists in order: two pg_wchar conversion routines (to and from
                      :  * wide characters), then the mblen, dsplen, mbverifychar and mbverifystr
                      :  * routines, then maxmblen, the maximum byte length of one character.
                      :  * The two conversion slots are 0 for PG_SJIS through PG_SHIFT_JIS_2004.
    2084              :  *-------------------------------------------------------------------
    2085              :  */
    2086              : const pg_wchar_tbl pg_wchar_table[] = {
    2087              :     [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
    2088              :     [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2089              :     [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3},
    2090              :     [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
    2091              :     [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
    2092              :     [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2093              :     [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
    2094              :     [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
    2095              :     [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2096              :     [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2097              :     [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2098              :     [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2099              :     [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2100              :     [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2101              :     [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2102              :     [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2103              :     [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2104              :     [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2105              :     [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2106              :     [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2107              :     [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2108              :     [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2109              :     [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2110              :     [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2111              :     [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2112              :     [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2113              :     [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2114              :     [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2115              :     [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2116              :     [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2117              :     [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2118              :     [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2119              :     [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2120              :     [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2121              :     [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2122              :     [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2123              :     [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
    2124              :     [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
    2125              :     [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
    2126              :     [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
    2127              :     [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
    2128              :     [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2129              : };
    2130              : 
    2131              : /*
    2132              :  * Returns the byte length of a multibyte character.
    2133              :  *
    2134              :  * Choose "mblen" functions based on the input string characteristics.
    2135              :  * pg_encoding_mblen() can be used when ANY of these conditions are met:
    2136              :  *
    2137              :  * - The input string is zero-terminated
    2138              :  *
    2139              :  * - The input string is known to be valid in the encoding (e.g., string
    2140              :  *   converted from database encoding)
    2141              :  *
    2142              :  * - The encoding is not GB18030 (e.g., when only database encodings are
    2143              :  *   passed to 'encoding' parameter)
    2144              :  *
    2145              :  * encoding==GB18030 requires examining up to two bytes to determine character
    2146              :  * length.  Therefore, callers satisfying none of those conditions must use
    2147              :  * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
    2148              :  * guaranteed to be within allocation bounds.
    2149              :  *
    2150              :  * When dealing with text that is not certainly valid in the specified
    2151              :  * encoding, the result may exceed the actual remaining string length.
    2152              :  * Callers that are not prepared to deal with that should use Min(remaining,
    2153              :  * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
    2154              :  * pg_encoding_mblen_bounded() are interchangeable.
    2155              :  */
    2156              : int
    2157     30115334 : pg_encoding_mblen(int encoding, const char *mbstr)
    2158              : {
    2159     30115334 :     return (PG_VALID_ENCODING(encoding) ?   /* IDs that fail this check use the PG_SQL_ASCII routine */
    2160     60230668 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :     /* the table routines take unsigned bytes */
    2161            0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
    2162              : }
    2163              : 
    2164              : /*
    2165              :  * Returns the byte length of a multibyte character (possibly not
    2166              :  * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
                      :  *
                      :  * remaining is the number of bytes available starting at mbstr.  When
                      :  * enough bytes are present the result is simply pg_encoding_mblen(), so
                      :  * it can still be larger than remaining.
    2167              :  */
    2168              : int
    2169         3752 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
    2170              :                                 size_t remaining)
    2171              : {
    2172              :     /*
    2173              :      * Define zero remaining as too few, even for single-byte encodings.
    2174              :      * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
    2175              :      * zero; others read one.
    2176              :      */
    2177         3752 :     if (remaining < 1 ||
    2178          202 :         (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))    /* GB18030 high-bit lead byte with no second byte available */
    2179           42 :         return INT_MAX;     /* the length cannot be determined from the bytes available */
    2180         3710 :     return pg_encoding_mblen(encoding, mbstr);  /* every byte the mblen routine reads is present */
    2181              : }
    2182              : 
    2183              : /*
    2184              :  * Returns the byte length of a multibyte character; but not more than the
    2185              :  * distance to the terminating zero byte.  For input that might lack a
    2186              :  * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
    2187              :  */
    2188              : int
    2189            0 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    2190              : {
    2191            0 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));  /* strnlen() stops at the first zero byte, which caps the reported length */
    2192              : }
    2193              : 
    2194              : /*
    2195              :  * Returns the display length of a multibyte character.
    2196              :  */
    2197              : int
    2198     29999207 : pg_encoding_dsplen(int encoding, const char *mbstr)
    2199              : {
    2200     29999207 :     return (PG_VALID_ENCODING(encoding) ?   /* IDs that fail this check use the PG_SQL_ASCII routine */
    2201     59998414 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :    /* the table routines take unsigned bytes */
    2202            0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
    2203              : }
    2204              : 
    2205              : /*
    2206              :  * Verify the first multibyte character of the given string.
    2207              :  * Return its byte length if good, -1 if bad.  (See comments above for
    2208              :  * full details of the mbverifychar API.)
    2209              :  */
    2210              : int
    2211         4912 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    2212              : {
    2213         4912 :     return (PG_VALID_ENCODING(encoding) ?   /* IDs that fail this check use the PG_SQL_ASCII routine */
    2214         9824 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :     /* len is forwarded unchanged */
    2215            0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
    2216              : }
    2217              : 
    2218              : /*
    2219              :  * Verify that a string is valid for the given encoding.
    2220              :  * Returns the number of input bytes (<= len) that form a valid string.
    2221              :  * (See comments above for full details of the mbverifystr API.)
    2222              :  */
    2223              : int
    2224       230938 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2225              : {
    2226       230938 :     return (PG_VALID_ENCODING(encoding) ?   /* IDs that fail this check use the PG_SQL_ASCII routine */
    2227       461876 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :  /* len is forwarded unchanged */
    2228            0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
    2229              : }
    2230              : 
    2231              : /*
    2232              :  * fetch maximum length of a given encoding
    2233              :  */
    2234              : int
    2235       640621 : pg_encoding_max_length(int encoding)
    2236              : {
    2237              :     Assert(PG_VALID_ENCODING(encoding));    /* callers are expected to pass a valid encoding ID */
    2238              : 
    2239              :     /*
    2240              :      * Check for the encoding despite the assert, due to some mingw versions
    2241              :      * otherwise issuing bogus warnings.
    2242              :      */
    2243       640621 :     return PG_VALID_ENCODING(encoding) ?
    2244      1281242 :         pg_wchar_table[encoding].maxmblen :
    2245              :         pg_wchar_table[PG_SQL_ASCII].maxmblen;  /* fallback for IDs that fail the check: the PG_SQL_ASCII entry */
    2246              : }
        

Generated by: LCOV version 2.0-1