LCOV - code coverage report
Current view: top level - src/common - wchar.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19beta1 Lines: 80.8 % 750 606
Test Date: 2026-06-10 08:16:38 Functions: 82.9 % 76 63
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * wchar.c
       4              :  *    Functions for working with multibyte characters in various encodings.
       5              :  *
       6              :  * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    src/common/wchar.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : #include "c.h"
      14              : 
      15              : #include <limits.h>
      16              : 
      17              : #include "mb/pg_wchar.h"
      18              : #include "utils/ascii.h"
      19              : 
      20              : 
      21              : /*
      22              :  * In today's multibyte encodings other than UTF8, this two-byte sequence
      23              :  * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
      24              :  *
      25              :  * For historical reasons, several verifychar implementations opt to reject
      26              :  * this pair specifically.  Byte pair range constraints, in encoding
      27              :  * originator documentation, always excluded this pair.  No core conversion
      28              :  * could translate it.  However, longstanding verifychar implementations
      29              :  * accepted any non-NUL byte.  big5_to_euc_tw even translates pairs not
      30              :  * valid per encoding originator documentation.  To avoid tightening core
      31              :  * or non-core conversions in a security patch, we sought this one pair.
      32              :  *
      33              :  * PQescapeString() historically used spaces for BYTE1; many other values
      34              :  * could suffice for BYTE1.
      35              :  */
      36              : #define NONUTF8_INVALID_BYTE0 (0x8d)
      37              : #define NONUTF8_INVALID_BYTE1 (' ')
      38              : 
      39              : 
      40              : /*
      41              :  * Operations on multi-byte encodings are driven by a table of helper
      42              :  * functions.
      43              :  *
      44              :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
      45              :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      46              :  * and wchar2mb() conversion functions.
      47              :  *
      48              :  * These functions generally assume that their input is validly formed.
      49              :  * The "verifier" functions, further down in the file, have to be more
      50              :  * paranoid.
      51              :  *
      52              :  * We expect that mblen() does not need to examine more than the first byte
      53              :  * of the character to discover the correct length.  GB18030 is an exception
      54              :  * to that rule, though, as it also looks at second byte.  But even that
      55              :  * behaves in a predictable way, if you only pass the first byte: it will
      56              :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      57              :  * good enough for all current uses.
      58              :  *
      59              :  * Note: for the display output of psql to work properly, the return values
      60              :  * of the dsplen functions must conform to the Unicode standard. In particular
      61              :  * the NUL character is zero width and control characters are generally
      62              :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      63              :  * subset to the ASCII routines to ensure consistency.
      64              :  */
      65              : 
      66              : /* No error-reporting facility.  Ignore incomplete trailing byte sequence. */
      67              : #define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break
      68              : 
      69              : /*
      70              :  * SQL/ASCII
      71              :  */
      72              : static int
      73          433 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      74              : {
      75          433 :     int         cnt = 0;
      76              : 
      77        33279 :     while (len > 0 && *from)
      78              :     {
      79        32846 :         *to++ = *from++;
      80        32846 :         len--;
      81        32846 :         cnt++;
      82              :     }
      83          433 :     *to = 0;
      84          433 :     return cnt;
      85              : }
      86              : 
      87              : static int
      88        19580 : pg_ascii_mblen(const unsigned char *s)
      89              : {
      90        19580 :     return 1;
      91              : }
      92              : 
      93              : static int
      94        18075 : pg_ascii_dsplen(const unsigned char *s)
      95              : {
      96        18075 :     if (*s == '\0')
      97            0 :         return 0;
      98        18075 :     if (*s < 0x20 || *s == 0x7f)
      99            2 :         return -1;
     100              : 
     101        18073 :     return 1;
     102              : }
     103              : 
     104              : /*
     105              :  * EUC
     106              :  */
     107              : static int
     108           32 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     109              : {
     110           32 :     int         cnt = 0;
     111              : 
     112           48 :     while (len > 0 && *from)
     113              :     {
     114           32 :         if (*from == SS2)       /* JIS X 0201 (so called "1 byte KANA") */
     115              :         {
     116            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     117            4 :             from++;
     118            4 :             *to = (SS2 << 8) | *from++;
     119            4 :             len -= 2;
     120              :         }
     121           24 :         else if (*from == SS3)  /* JIS X 0212 KANJI */
     122              :         {
     123           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     124            4 :             from++;
     125            4 :             *to = (SS3 << 16) | (*from++ << 8);
     126            4 :             *to |= *from++;
     127            4 :             len -= 3;
     128              :         }
     129           12 :         else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
     130              :         {
     131            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     132            4 :             *to = *from++ << 8;
     133            4 :             *to |= *from++;
     134            4 :             len -= 2;
     135              :         }
     136              :         else                    /* must be ASCII */
     137              :         {
     138            4 :             *to = *from++;
     139            4 :             len--;
     140              :         }
     141           16 :         to++;
     142           16 :         cnt++;
     143              :     }
     144           32 :     *to = 0;
     145           32 :     return cnt;
     146              : }
     147              : 
     148              : static inline int
     149          156 : pg_euc_mblen(const unsigned char *s)
     150              : {
     151              :     int         len;
     152              : 
     153          156 :     if (*s == SS2)
     154            0 :         len = 2;
     155          156 :     else if (*s == SS3)
     156            0 :         len = 3;
     157          156 :     else if (IS_HIGHBIT_SET(*s))
     158          108 :         len = 2;
     159              :     else
     160           48 :         len = 1;
     161          156 :     return len;
     162              : }
     163              : 
     164              : static inline int
     165            0 : pg_euc_dsplen(const unsigned char *s)
     166              : {
     167              :     int         len;
     168              : 
     169            0 :     if (*s == SS2)
     170            0 :         len = 2;
     171            0 :     else if (*s == SS3)
     172            0 :         len = 2;
     173            0 :     else if (IS_HIGHBIT_SET(*s))
     174            0 :         len = 2;
     175              :     else
     176            0 :         len = pg_ascii_dsplen(s);
     177            0 :     return len;
     178              : }
     179              : 
     180              : /*
     181              :  * EUC_JP
     182              :  */
     183              : static int
     184           32 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     185              : {
     186           32 :     return pg_euc2wchar_with_len(from, to, len);
     187              : }
     188              : 
     189              : static int
     190          136 : pg_eucjp_mblen(const unsigned char *s)
     191              : {
     192          136 :     return pg_euc_mblen(s);
     193              : }
     194              : 
     195              : static int
     196            0 : pg_eucjp_dsplen(const unsigned char *s)
     197              : {
     198              :     int         len;
     199              : 
     200            0 :     if (*s == SS2)
     201            0 :         len = 1;
     202            0 :     else if (*s == SS3)
     203            0 :         len = 2;
     204            0 :     else if (IS_HIGHBIT_SET(*s))
     205            0 :         len = 2;
     206              :     else
     207            0 :         len = pg_ascii_dsplen(s);
     208            0 :     return len;
     209              : }
     210              : 
     211              : /*
     212              :  * EUC_KR
     213              :  */
     214              : static int
     215            0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     216              : {
     217            0 :     return pg_euc2wchar_with_len(from, to, len);
     218              : }
     219              : 
     220              : static int
     221            4 : pg_euckr_mblen(const unsigned char *s)
     222              : {
     223            4 :     return pg_euc_mblen(s);
     224              : }
     225              : 
     226              : static int
     227            0 : pg_euckr_dsplen(const unsigned char *s)
     228              : {
     229            0 :     return pg_euc_dsplen(s);
     230              : }
     231              : 
     232              : /*
     233              :  * EUC_CN
     234              :  *
     235              :  */
     236              : static int
     237           36 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     238              : {
     239           36 :     int         cnt = 0;
     240              : 
     241           52 :     while (len > 0 && *from)
     242              :     {
     243           36 :         if (*from == SS2)       /* code set 2 (unused?) */
     244              :         {
     245           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     246            4 :             from++;
     247            4 :             *to = (SS2 << 16) | (*from++ << 8);
     248            4 :             *to |= *from++;
     249            4 :             len -= 3;
     250              :         }
     251           24 :         else if (*from == SS3)  /* code set 3 (unused ?) */
     252              :         {
     253           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     254            4 :             from++;
     255            4 :             *to = (SS3 << 16) | (*from++ << 8);
     256            4 :             *to |= *from++;
     257            4 :             len -= 3;
     258              :         }
     259           12 :         else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
     260              :         {
     261            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     262            4 :             *to = *from++ << 8;
     263            4 :             *to |= *from++;
     264            4 :             len -= 2;
     265              :         }
     266              :         else
     267              :         {
     268            4 :             *to = *from++;
     269            4 :             len--;
     270              :         }
     271           16 :         to++;
     272           16 :         cnt++;
     273              :     }
     274           36 :     *to = 0;
     275           36 :     return cnt;
     276              : }
     277              : 
     278              : /*
     279              :  * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
     280              :  * EUC_CN), but mb2wchar_with_len does.  Tell a coherent story for code that
     281              :  * relies on agreement between mb2wchar_with_len and mblen.  Invalid text
     282              :  * datums (e.g. from shared catalogs) reach this.
     283              :  */
     284              : static int
     285            4 : pg_euccn_mblen(const unsigned char *s)
     286              : {
     287              :     int         len;
     288              : 
     289            4 :     if (*s == SS2)
     290            0 :         len = 3;
     291            4 :     else if (*s == SS3)
     292            0 :         len = 3;
     293            4 :     else if (IS_HIGHBIT_SET(*s))
     294            4 :         len = 2;
     295              :     else
     296            0 :         len = 1;
     297            4 :     return len;
     298              : }
     299              : 
     300              : static int
     301            0 : pg_euccn_dsplen(const unsigned char *s)
     302              : {
     303              :     int         len;
     304              : 
     305            0 :     if (IS_HIGHBIT_SET(*s))
     306            0 :         len = 2;
     307              :     else
     308            0 :         len = pg_ascii_dsplen(s);
     309            0 :     return len;
     310              : }
     311              : 
     312              : /*
     313              :  * EUC_TW
     314              :  *
     315              :  */
     316              : static int
     317           40 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     318              : {
     319           40 :     int         cnt = 0;
     320              : 
     321           56 :     while (len > 0 && *from)
     322              :     {
     323           40 :         if (*from == SS2)       /* code set 2 */
     324              :         {
     325           16 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     326            4 :             from++;
     327            4 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);
     328            4 :             *to |= *from++ << 8;
     329            4 :             *to |= *from++;
     330            4 :             len -= 4;
     331              :         }
     332           24 :         else if (*from == SS3)  /* code set 3 (unused?) */
     333              :         {
     334           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     335            4 :             from++;
     336            4 :             *to = (SS3 << 16) | (*from++ << 8);
     337            4 :             *to |= *from++;
     338            4 :             len -= 3;
     339              :         }
     340           12 :         else if (IS_HIGHBIT_SET(*from)) /* code set 2 */
     341              :         {
     342            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     343            4 :             *to = *from++ << 8;
     344            4 :             *to |= *from++;
     345            4 :             len -= 2;
     346              :         }
     347              :         else
     348              :         {
     349            4 :             *to = *from++;
     350            4 :             len--;
     351              :         }
     352           16 :         to++;
     353           16 :         cnt++;
     354              :     }
     355           40 :     *to = 0;
     356           40 :     return cnt;
     357              : }
     358              : 
     359              : static int
     360            4 : pg_euctw_mblen(const unsigned char *s)
     361              : {
     362              :     int         len;
     363              : 
     364            4 :     if (*s == SS2)
     365            0 :         len = 4;
     366            4 :     else if (*s == SS3)
     367            0 :         len = 3;
     368            4 :     else if (IS_HIGHBIT_SET(*s))
     369            4 :         len = 2;
     370              :     else
     371            0 :         len = 1;
     372            4 :     return len;
     373              : }
     374              : 
     375              : static int
     376            0 : pg_euctw_dsplen(const unsigned char *s)
     377              : {
     378              :     int         len;
     379              : 
     380            0 :     if (*s == SS2)
     381            0 :         len = 2;
     382            0 :     else if (*s == SS3)
     383            0 :         len = 2;
     384            0 :     else if (IS_HIGHBIT_SET(*s))
     385            0 :         len = 2;
     386              :     else
     387            0 :         len = pg_ascii_dsplen(s);
     388            0 :     return len;
     389              : }
     390              : 
     391              : /*
     392              :  * Convert pg_wchar to EUC_* encoding.
     393              :  * caller must allocate enough space for "to", including a trailing zero!
     394              :  * len: length of from.
     395              :  * "from" not necessarily null terminated.
     396              :  */
     397              : static int
     398           48 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     399              : {
     400           48 :     int         cnt = 0;
     401              : 
     402           96 :     while (len > 0 && *from)
     403              :     {
     404              :         unsigned char c;
     405              : 
     406           48 :         if ((c = (*from >> 24)))
     407              :         {
     408            4 :             *to++ = c;
     409            4 :             *to++ = (*from >> 16) & 0xff;
     410            4 :             *to++ = (*from >> 8) & 0xff;
     411            4 :             *to++ = *from & 0xff;
     412            4 :             cnt += 4;
     413              :         }
     414           44 :         else if ((c = (*from >> 16)))
     415              :         {
     416           16 :             *to++ = c;
     417           16 :             *to++ = (*from >> 8) & 0xff;
     418           16 :             *to++ = *from & 0xff;
     419           16 :             cnt += 3;
     420              :         }
     421           28 :         else if ((c = (*from >> 8)))
     422              :         {
     423           16 :             *to++ = c;
     424           16 :             *to++ = *from & 0xff;
     425           16 :             cnt += 2;
     426              :         }
     427              :         else
     428              :         {
     429           12 :             *to++ = *from;
     430           12 :             cnt++;
     431              :         }
     432           48 :         from++;
     433           48 :         len--;
     434              :     }
     435           48 :     *to = 0;
     436           48 :     return cnt;
     437              : }
     438              : 
     439              : 
     440              : /*
     441              :  * JOHAB
     442              :  */
     443              : static int
     444           16 : pg_johab_mblen(const unsigned char *s)
     445              : {
     446           16 :     return pg_euc_mblen(s);
     447              : }
     448              : 
     449              : static int
     450            0 : pg_johab_dsplen(const unsigned char *s)
     451              : {
     452            0 :     return pg_euc_dsplen(s);
     453              : }
     454              : 
     455              : /*
     456              :  * convert UTF8 string to pg_wchar (UCS-4)
     457              :  * caller must allocate enough space for "to", including a trailing zero!
     458              :  * len: length of from.
     459              :  * "from" not necessarily null terminated.
     460              :  */
     461              : static int
     462      6879946 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     463              : {
     464      6879946 :     int         cnt = 0;
     465              :     uint32      c1,
     466              :                 c2,
     467              :                 c3,
     468              :                 c4;
     469              : 
     470    106927058 :     while (len > 0 && *from)
     471              :     {
     472    100047140 :         if ((*from & 0x80) == 0)
     473              :         {
     474    100046461 :             *to = *from++;
     475    100046461 :             len--;
     476              :         }
     477          679 :         else if ((*from & 0xe0) == 0xc0)
     478              :         {
     479          345 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     480          337 :             c1 = *from++ & 0x1f;
     481          337 :             c2 = *from++ & 0x3f;
     482          337 :             *to = (c1 << 6) | c2;
     483          337 :             len -= 2;
     484              :         }
     485          334 :         else if ((*from & 0xf0) == 0xe0)
     486              :         {
     487          174 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     488          166 :             c1 = *from++ & 0x0f;
     489          166 :             c2 = *from++ & 0x3f;
     490          166 :             c3 = *from++ & 0x3f;
     491          166 :             *to = (c1 << 12) | (c2 << 6) | c3;
     492          166 :             len -= 3;
     493              :         }
     494          160 :         else if ((*from & 0xf8) == 0xf0)
     495              :         {
     496           16 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     497            4 :             c1 = *from++ & 0x07;
     498            4 :             c2 = *from++ & 0x3f;
     499            4 :             c3 = *from++ & 0x3f;
     500            4 :             c4 = *from++ & 0x3f;
     501            4 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     502            4 :             len -= 4;
     503              :         }
     504              :         else
     505              :         {
     506              :             /* treat a bogus char as length 1; not ours to raise error */
     507          144 :             *to = *from++;
     508          144 :             len--;
     509              :         }
     510    100047112 :         to++;
     511    100047112 :         cnt++;
     512              :     }
     513      6879946 :     *to = 0;
     514      6879946 :     return cnt;
     515              : }
     516              : 
     517              : 
     518              : /*
     519              :  * Trivial conversion from pg_wchar to UTF-8.
     520              :  * caller should allocate enough space for "to"
     521              :  * len: length of from.
     522              :  * "from" not necessarily null terminated.
     523              :  */
     524              : static int
     525       579537 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     526              : {
     527       579537 :     int         cnt = 0;
     528              : 
     529      8542733 :     while (len > 0 && *from)
     530              :     {
     531              :         int         char_len;
     532              : 
     533      7963196 :         unicode_to_utf8(*from, to);
     534      7963196 :         char_len = pg_utf_mblen(to);
     535      7963196 :         cnt += char_len;
     536      7963196 :         to += char_len;
     537      7963196 :         from++;
     538      7963196 :         len--;
     539              :     }
     540       579537 :     *to = 0;
     541       579537 :     return cnt;
     542              : }
     543              : 
     544              : /*
     545              :  * Return the byte length of a UTF8 character pointed to by s
     546              :  *
     547              :  * Note: in the current implementation we do not support UTF8 sequences
     548              :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     549              :  * We return "1" for any leading byte that is either flat-out illegal or
     550              :  * indicates a length larger than we support.
     551              :  *
     552              :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     553              :  * other places would need to be fixed to change this.
     554              :  */
     555              : int
     556     97258227 : pg_utf_mblen(const unsigned char *s)
     557              : {
     558              :     int         len;
     559              : 
     560     97258227 :     if ((*s & 0x80) == 0)
     561     97153028 :         len = 1;
     562       105199 :     else if ((*s & 0xe0) == 0xc0)
     563         8520 :         len = 2;
     564        96679 :     else if ((*s & 0xf0) == 0xe0)
     565        70297 :         len = 3;
     566        26382 :     else if ((*s & 0xf8) == 0xf0)
     567        26267 :         len = 4;
     568              : #ifdef NOT_USED
     569              :     else if ((*s & 0xfc) == 0xf8)
     570              :         len = 5;
     571              :     else if ((*s & 0xfe) == 0xfc)
     572              :         len = 6;
     573              : #endif
     574              :     else
     575          115 :         len = 1;
     576     97258227 :     return len;
     577              : }
     578              : 
     579              : /*
     580              :  * This is an implementation of wcwidth() and wcswidth() as defined in
     581              :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     582              :  * <http://www.unix.org/online.html>
     583              :  *
     584              :  * Markus Kuhn -- 2001-09-08 -- public domain
     585              :  *
     586              :  * customised for PostgreSQL
     587              :  *
     588              :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     589              :  */
     590              : 
     591              : struct mbinterval
     592              : {
     593              :     unsigned int first;
     594              :     unsigned int last;
     595              : };
     596              : 
     597              : /* auxiliary function for binary search in interval table */
     598              : static int
     599     60018000 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
     600              : {
     601     60018000 :     int         min = 0;
     602              :     int         mid;
     603              : 
     604     60018000 :     if (ucs < table[0].first || ucs > table[max].last)
     605     60012354 :         return 0;
     606        49149 :     while (max >= min)
     607              :     {
     608        43983 :         mid = (min + max) / 2;
     609        43983 :         if (ucs > table[mid].last)
     610         9747 :             min = mid + 1;
     611        34236 :         else if (ucs < table[mid].first)
     612        33756 :             max = mid - 1;
     613              :         else
     614          480 :             return 1;
     615              :     }
     616              : 
     617         5166 :     return 0;
     618              : }
     619              : 
     620              : 
     621              : /*
     622              :  * The following functions define the column width of an ISO 10646
     623              :  * character as follows:
     624              :  *
     625              :  *    - The null character (U+0000) has a column width of 0.
     626              :  *
     627              :  *    - Other C0/C1 control characters and DEL will lead to a return
     628              :  *      value of -1.
     629              :  *
     630              :  *    - Non-spacing and enclosing combining characters (general
     631              :  *      category code Mn, Me or Cf in the Unicode database) have a
     632              :  *      column width of 0.
     633              :  *
     634              :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     635              :  *      FullWidth (F) category as defined in Unicode Technical
     636              :  *      Report #11 have a column width of 2.
     637              :  *
     638              :  *    - All remaining characters (including all printable
     639              :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     640              :  *      etc.) have a column width of 1.
     641              :  *
     642              :  * This implementation assumes that wchar_t characters are encoded
     643              :  * in ISO 10646.
     644              :  */
     645              : 
     646              : static int
     647     30038587 : ucs_wcwidth(pg_wchar ucs)
     648              : {
     649              : #include "common/unicode_nonspacing_table.h"
     650              : #include "common/unicode_east_asian_fw_table.h"
     651              : 
     652              :     /* test for 8-bit control characters */
     653     30038587 :     if (ucs == 0)
     654            0 :         return 0;
     655              : 
     656     30038587 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
     657        29425 :         return -1;
     658              : 
     659              :     /*
     660              :      * binary search in table of non-spacing characters
     661              :      *
     662              :      * XXX: In the official Unicode sources, it is possible for a character to
     663              :      * be described as both non-spacing and wide at the same time. As of
     664              :      * Unicode 13.0, treating the non-spacing property as the determining
     665              :      * factor for display width leads to the correct behavior, so do that
     666              :      * search first.
     667              :      */
     668     30009162 :     if (mbbisearch(ucs, nonspacing,
     669              :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
     670          324 :         return 0;
     671              : 
     672              :     /* binary search in table of wide characters */
     673     30008838 :     if (mbbisearch(ucs, east_asian_fw,
     674              :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
     675          156 :         return 2;
     676              : 
     677     30008682 :     return 1;
     678              : }
     679              : 
     680              : static int
     681     30038587 : pg_utf_dsplen(const unsigned char *s)
     682              : {
     683     30038587 :     return ucs_wcwidth(utf8_to_unicode(s));
     684              : }
     685              : 
     686              : /*
     687              :  * ISO8859-1
     688              :  */
     689              : static int
     690          468 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     691              : {
     692          468 :     int         cnt = 0;
     693              : 
     694        13377 :     while (len > 0 && *from)
     695              :     {
     696        12909 :         *to++ = *from++;
     697        12909 :         len--;
     698        12909 :         cnt++;
     699              :     }
     700          468 :     *to = 0;
     701          468 :     return cnt;
     702              : }
     703              : 
     704              : /*
     705              :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     706              :  * high bits.
     707              :  * caller should allocate enough space for "to"
     708              :  * len: length of from.
     709              :  * "from" not necessarily null terminated.
     710              :  */
     711              : static int
     712           79 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     713              : {
     714           79 :     int         cnt = 0;
     715              : 
     716          678 :     while (len > 0 && *from)
     717              :     {
     718          599 :         *to++ = *from++;
     719          599 :         len--;
     720          599 :         cnt++;
     721              :     }
     722           79 :     *to = 0;
     723           79 :     return cnt;
     724              : }
     725              : 
     726              : static int
     727         3614 : pg_latin1_mblen(const unsigned char *s)
     728              : {
     729         3614 :     return 1;
     730              : }
     731              : 
     732              : static int
     733          400 : pg_latin1_dsplen(const unsigned char *s)
     734              : {
     735          400 :     return pg_ascii_dsplen(s);
     736              : }
     737              : 
     738              : /*
     739              :  * SJIS
     740              :  */
     741              : static int
     742         1015 : pg_sjis_mblen(const unsigned char *s)
     743              : {
     744              :     int         len;
     745              : 
     746         1015 :     if (*s >= 0xa1 && *s <= 0xdf)
     747            0 :         len = 1;                /* 1 byte kana? */
     748         1015 :     else if (IS_HIGHBIT_SET(*s))
     749          809 :         len = 2;                /* kanji? */
     750              :     else
     751          206 :         len = 1;                /* should be ASCII */
     752         1015 :     return len;
     753              : }
     754              : 
     755              : static int
     756            0 : pg_sjis_dsplen(const unsigned char *s)
     757              : {
     758              :     int         len;
     759              : 
     760            0 :     if (*s >= 0xa1 && *s <= 0xdf)
     761            0 :         len = 1;                /* 1 byte kana? */
     762            0 :     else if (IS_HIGHBIT_SET(*s))
     763            0 :         len = 2;                /* kanji? */
     764              :     else
     765            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     766            0 :     return len;
     767              : }
     768              : 
     769              : /*
     770              :  * Big5
     771              :  */
     772              : static int
     773          232 : pg_big5_mblen(const unsigned char *s)
     774              : {
     775              :     int         len;
     776              : 
     777          232 :     if (IS_HIGHBIT_SET(*s))
     778          208 :         len = 2;                /* kanji? */
     779              :     else
     780           24 :         len = 1;                /* should be ASCII */
     781          232 :     return len;
     782              : }
     783              : 
     784              : static int
     785            0 : pg_big5_dsplen(const unsigned char *s)
     786              : {
     787              :     int         len;
     788              : 
     789            0 :     if (IS_HIGHBIT_SET(*s))
     790            0 :         len = 2;                /* kanji? */
     791              :     else
     792            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     793            0 :     return len;
     794              : }
     795              : 
     796              : /*
     797              :  * GBK
     798              :  */
     799              : static int
     800          282 : pg_gbk_mblen(const unsigned char *s)
     801              : {
     802              :     int         len;
     803              : 
     804          282 :     if (IS_HIGHBIT_SET(*s))
     805          212 :         len = 2;                /* kanji? */
     806              :     else
     807           70 :         len = 1;                /* should be ASCII */
     808          282 :     return len;
     809              : }
     810              : 
     811              : static int
     812            0 : pg_gbk_dsplen(const unsigned char *s)
     813              : {
     814              :     int         len;
     815              : 
     816            0 :     if (IS_HIGHBIT_SET(*s))
     817            0 :         len = 2;                /* kanji? */
     818              :     else
     819            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     820            0 :     return len;
     821              : }
     822              : 
     823              : /*
     824              :  * UHC
     825              :  */
     826              : static int
     827           16 : pg_uhc_mblen(const unsigned char *s)
     828              : {
     829              :     int         len;
     830              : 
     831           16 :     if (IS_HIGHBIT_SET(*s))
     832           16 :         len = 2;                /* 2byte? */
     833              :     else
     834            0 :         len = 1;                /* should be ASCII */
     835           16 :     return len;
     836              : }
     837              : 
     838              : static int
     839            0 : pg_uhc_dsplen(const unsigned char *s)
     840              : {
     841              :     int         len;
     842              : 
     843            0 :     if (IS_HIGHBIT_SET(*s))
     844            0 :         len = 2;                /* 2byte? */
     845              :     else
     846            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     847            0 :     return len;
     848              : }
     849              : 
     850              : /*
     851              :  * GB18030
     852              :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
     853              :  */
     854              : 
     855              : /*
     856              :  * Unlike all other mblen() functions, this also looks at the second byte of
     857              :  * the input.  However, if you only pass the first byte of a multi-byte
     858              :  * string, and \0 as the second byte, this still works in a predictable way:
     859              :  * a 4-byte character will be reported as two 2-byte characters.  That's
     860              :  * enough for all current uses, as a client-only encoding.  It works that
     861              :  * way, because in any valid 4-byte GB18030-encoded character, the third and
     862              :  * fourth byte look like a 2-byte encoded character, when looked at
     863              :  * separately.
     864              :  */
     865              : static int
     866          623 : pg_gb18030_mblen(const unsigned char *s)
     867              : {
     868              :     int         len;
     869              : 
     870          623 :     if (!IS_HIGHBIT_SET(*s))
     871          348 :         len = 1;                /* ASCII */
     872          275 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
     873          114 :         len = 4;
     874              :     else
     875          161 :         len = 2;
     876          623 :     return len;
     877              : }
     878              : 
     879              : static int
     880            0 : pg_gb18030_dsplen(const unsigned char *s)
     881              : {
     882              :     int         len;
     883              : 
     884            0 :     if (IS_HIGHBIT_SET(*s))
     885            0 :         len = 2;
     886              :     else
     887            0 :         len = pg_ascii_dsplen(s);   /* ASCII */
     888            0 :     return len;
     889              : }
     890              : 
     891              : /*
     892              :  *-------------------------------------------------------------------
     893              :  * multibyte sequence validators
     894              :  *
     895              :  * The verifychar functions accept "s", a pointer to the first byte of a
     896              :  * string, and "len", the remaining length of the string.  If there is a
     897              :  * validly encoded character beginning at *s, return its length in bytes;
     898              :  * else return -1.
     899              :  *
     900              :  * The verifystr functions also accept "s", a pointer to a string and "len",
     901              :  * the length of the string.  They verify the whole string, and return the
     902              :  * number of input bytes (<= len) that are valid.  In other words, if the
     903              :  * whole string is valid, verifystr returns "len", otherwise it returns the
     904              :  * byte offset of the first invalid character.  The verifystr functions must
     905              :  * test for and reject zeroes in the input.
     906              :  *
     907              :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
     908              :  * they must test for and reject zeroes in any additional bytes of a
     909              :  * multibyte character.  Note that this definition allows the function for a
     910              :  * single-byte encoding to be just "return 1".
     911              :  *-------------------------------------------------------------------
     912              :  */
     913              : static int
     914          161 : pg_ascii_verifychar(const unsigned char *s, int len)
     915              : {
     916          161 :     return 1;
     917              : }
     918              : 
     919              : static int
     920       211652 : pg_ascii_verifystr(const unsigned char *s, int len)
     921              : {
     922       211652 :     const unsigned char *nullpos = memchr(s, 0, len);
     923              : 
     924       211652 :     if (nullpos == NULL)
     925       211652 :         return len;
     926              :     else
     927            0 :         return nullpos - s;
     928              : }
     929              : 
     930              : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
     931              : 
     932              : static int
     933          336 : pg_eucjp_verifychar(const unsigned char *s, int len)
     934              : {
     935              :     int         l;
     936              :     unsigned char c1,
     937              :                 c2;
     938              : 
     939          336 :     c1 = *s++;
     940              : 
     941          336 :     switch (c1)
     942              :     {
     943            0 :         case SS2:               /* JIS X 0201 */
     944            0 :             l = 2;
     945            0 :             if (l > len)
     946            0 :                 return -1;
     947            0 :             c2 = *s++;
     948            0 :             if (c2 < 0xa1 || c2 > 0xdf)
     949            0 :                 return -1;
     950            0 :             break;
     951              : 
     952            0 :         case SS3:               /* JIS X 0212 */
     953            0 :             l = 3;
     954            0 :             if (l > len)
     955            0 :                 return -1;
     956            0 :             c2 = *s++;
     957            0 :             if (!IS_EUC_RANGE_VALID(c2))
     958            0 :                 return -1;
     959            0 :             c2 = *s++;
     960            0 :             if (!IS_EUC_RANGE_VALID(c2))
     961            0 :                 return -1;
     962            0 :             break;
     963              : 
     964          336 :         default:
     965          336 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
     966              :             {
     967          336 :                 l = 2;
     968          336 :                 if (l > len)
     969           56 :                     return -1;
     970          280 :                 if (!IS_EUC_RANGE_VALID(c1))
     971           16 :                     return -1;
     972          264 :                 c2 = *s++;
     973          264 :                 if (!IS_EUC_RANGE_VALID(c2))
     974          120 :                     return -1;
     975              :             }
     976              :             else
     977              :                 /* must be ASCII */
     978              :             {
     979            0 :                 l = 1;
     980              :             }
     981          144 :             break;
     982              :     }
     983              : 
     984          144 :     return l;
     985              : }
     986              : 
     987              : static int
     988          196 : pg_eucjp_verifystr(const unsigned char *s, int len)
     989              : {
     990          196 :     const unsigned char *start = s;
     991              : 
     992          604 :     while (len > 0)
     993              :     {
     994              :         int         l;
     995              : 
     996              :         /* fast path for ASCII-subset characters */
     997          552 :         if (!IS_HIGHBIT_SET(*s))
     998              :         {
     999          384 :             if (*s == '\0')
    1000           48 :                 break;
    1001          336 :             l = 1;
    1002              :         }
    1003              :         else
    1004              :         {
    1005          168 :             l = pg_eucjp_verifychar(s, len);
    1006          168 :             if (l == -1)
    1007           96 :                 break;
    1008              :         }
    1009          408 :         s += l;
    1010          408 :         len -= l;
    1011              :     }
    1012              : 
    1013          196 :     return s - start;
    1014              : }
    1015              : 
    1016              : static int
    1017           96 : pg_euckr_verifychar(const unsigned char *s, int len)
    1018              : {
    1019              :     int         l;
    1020              :     unsigned char c1,
    1021              :                 c2;
    1022              : 
    1023           96 :     c1 = *s++;
    1024              : 
    1025           96 :     if (IS_HIGHBIT_SET(c1))
    1026              :     {
    1027           96 :         l = 2;
    1028           96 :         if (l > len)
    1029            8 :             return -1;
    1030           88 :         if (!IS_EUC_RANGE_VALID(c1))
    1031           16 :             return -1;
    1032           72 :         c2 = *s++;
    1033           72 :         if (!IS_EUC_RANGE_VALID(c2))
    1034            0 :             return -1;
    1035              :     }
    1036              :     else
    1037              :         /* must be ASCII */
    1038              :     {
    1039            0 :         l = 1;
    1040              :     }
    1041              : 
    1042           72 :     return l;
    1043              : }
    1044              : 
    1045              : static int
    1046           40 : pg_euckr_verifystr(const unsigned char *s, int len)
    1047              : {
    1048           40 :     const unsigned char *start = s;
    1049              : 
    1050          124 :     while (len > 0)
    1051              :     {
    1052              :         int         l;
    1053              : 
    1054              :         /* fast path for ASCII-subset characters */
    1055          108 :         if (!IS_HIGHBIT_SET(*s))
    1056              :         {
    1057           48 :             if (*s == '\0')
    1058            0 :                 break;
    1059           48 :             l = 1;
    1060              :         }
    1061              :         else
    1062              :         {
    1063           60 :             l = pg_euckr_verifychar(s, len);
    1064           60 :             if (l == -1)
    1065           24 :                 break;
    1066              :         }
    1067           84 :         s += l;
    1068           84 :         len -= l;
    1069              :     }
    1070              : 
    1071           40 :     return s - start;
    1072              : }
    1073              : 
    1074              : /* EUC-CN byte sequences are exactly same as EUC-KR */
    1075              : #define pg_euccn_verifychar pg_euckr_verifychar
    1076              : #define pg_euccn_verifystr  pg_euckr_verifystr
    1077              : 
    1078              : static int
    1079           12 : pg_euctw_verifychar(const unsigned char *s, int len)
    1080              : {
    1081              :     int         l;
    1082              :     unsigned char c1,
    1083              :                 c2;
    1084              : 
    1085           12 :     c1 = *s++;
    1086              : 
    1087           12 :     switch (c1)
    1088              :     {
    1089            0 :         case SS2:               /* CNS 11643 Plane 1-7 */
    1090            0 :             l = 4;
    1091            0 :             if (l > len)
    1092            0 :                 return -1;
    1093            0 :             c2 = *s++;
    1094            0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1095            0 :                 return -1;
    1096            0 :             c2 = *s++;
    1097            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1098            0 :                 return -1;
    1099            0 :             c2 = *s++;
    1100            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1101            0 :                 return -1;
    1102            0 :             break;
    1103              : 
    1104            0 :         case SS3:               /* unused */
    1105            0 :             return -1;
    1106              : 
    1107           12 :         default:
    1108           12 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1109              :             {
    1110           12 :                 l = 2;
    1111           12 :                 if (l > len)
    1112            4 :                     return -1;
    1113              :                 /* no further range check on c1? */
    1114            8 :                 c2 = *s++;
    1115            8 :                 if (!IS_EUC_RANGE_VALID(c2))
    1116            8 :                     return -1;
    1117              :             }
    1118              :             else
    1119              :                 /* must be ASCII */
    1120              :             {
    1121            0 :                 l = 1;
    1122              :             }
    1123            0 :             break;
    1124              :     }
    1125            0 :     return l;
    1126              : }
    1127              : 
    1128              : static int
    1129           20 : pg_euctw_verifystr(const unsigned char *s, int len)
    1130              : {
    1131           20 :     const unsigned char *start = s;
    1132              : 
    1133           44 :     while (len > 0)
    1134              :     {
    1135              :         int         l;
    1136              : 
    1137              :         /* fast path for ASCII-subset characters */
    1138           36 :         if (!IS_HIGHBIT_SET(*s))
    1139              :         {
    1140           24 :             if (*s == '\0')
    1141            0 :                 break;
    1142           24 :             l = 1;
    1143              :         }
    1144              :         else
    1145              :         {
    1146           12 :             l = pg_euctw_verifychar(s, len);
    1147           12 :             if (l == -1)
    1148           12 :                 break;
    1149              :         }
    1150           24 :         s += l;
    1151           24 :         len -= l;
    1152              :     }
    1153              : 
    1154           20 :     return s - start;
    1155              : }
    1156              : 
    1157              : static int
    1158           12 : pg_johab_verifychar(const unsigned char *s, int len)
    1159              : {
    1160              :     int         l,
    1161              :                 mbl;
    1162              :     unsigned char c;
    1163              : 
    1164           12 :     l = mbl = pg_johab_mblen(s);
    1165              : 
    1166           12 :     if (len < l)
    1167            4 :         return -1;
    1168              : 
    1169            8 :     if (!IS_HIGHBIT_SET(*s))
    1170            0 :         return mbl;
    1171              : 
    1172            8 :     while (--l > 0)
    1173              :     {
    1174            8 :         c = *++s;
    1175            8 :         if (!IS_EUC_RANGE_VALID(c))
    1176            8 :             return -1;
    1177              :     }
    1178            0 :     return mbl;
    1179              : }
    1180              : 
    1181              : static int
    1182           16 : pg_johab_verifystr(const unsigned char *s, int len)
    1183              : {
    1184           16 :     const unsigned char *start = s;
    1185              : 
    1186           28 :     while (len > 0)
    1187              :     {
    1188              :         int         l;
    1189              : 
    1190              :         /* fast path for ASCII-subset characters */
    1191           24 :         if (!IS_HIGHBIT_SET(*s))
    1192              :         {
    1193           12 :             if (*s == '\0')
    1194            0 :                 break;
    1195           12 :             l = 1;
    1196              :         }
    1197              :         else
    1198              :         {
    1199           12 :             l = pg_johab_verifychar(s, len);
    1200           12 :             if (l == -1)
    1201           12 :                 break;
    1202              :         }
    1203           12 :         s += l;
    1204           12 :         len -= l;
    1205              :     }
    1206              : 
    1207           16 :     return s - start;
    1208              : }
    1209              : 
    1210              : static int
    1211         3223 : pg_latin1_verifychar(const unsigned char *s, int len)
    1212              : {
    1213         3223 :     return 1;
    1214              : }
    1215              : 
    1216              : static int
    1217         5212 : pg_latin1_verifystr(const unsigned char *s, int len)
    1218              : {
    1219         5212 :     const unsigned char *nullpos = memchr(s, 0, len);
    1220              : 
    1221         5212 :     if (nullpos == NULL)
    1222         5140 :         return len;
    1223              :     else
    1224           72 :         return nullpos - s;
    1225              : }
    1226              : 
    1227              : static int
    1228          624 : pg_sjis_verifychar(const unsigned char *s, int len)
    1229              : {
    1230              :     int         l,
    1231              :                 mbl;
    1232              :     unsigned char c1,
    1233              :                 c2;
    1234              : 
    1235          624 :     l = mbl = pg_sjis_mblen(s);
    1236              : 
    1237          624 :     if (len < l)
    1238           86 :         return -1;
    1239              : 
    1240          538 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1241            0 :         return mbl;
    1242              : 
    1243          538 :     c1 = *s++;
    1244          538 :     c2 = *s;
    1245          538 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1246          214 :         return -1;
    1247          324 :     return mbl;
    1248              : }
    1249              : 
    1250              : static int
    1251          322 : pg_sjis_verifystr(const unsigned char *s, int len)
    1252              : {
    1253          322 :     const unsigned char *start = s;
    1254              : 
    1255         1233 :     while (len > 0)
    1256              :     {
    1257              :         int         l;
    1258              : 
    1259              :         /* fast path for ASCII-subset characters */
    1260         1107 :         if (!IS_HIGHBIT_SET(*s))
    1261              :         {
    1262          815 :             if (*s == '\0')
    1263           48 :                 break;
    1264          767 :             l = 1;
    1265              :         }
    1266              :         else
    1267              :         {
    1268          292 :             l = pg_sjis_verifychar(s, len);
    1269          292 :             if (l == -1)
    1270          148 :                 break;
    1271              :         }
    1272          911 :         s += l;
    1273          911 :         len -= l;
    1274              :     }
    1275              : 
    1276          322 :     return s - start;
    1277              : }
    1278              : 
    1279              : static int
    1280          168 : pg_big5_verifychar(const unsigned char *s, int len)
    1281              : {
    1282              :     int         l,
    1283              :                 mbl;
    1284              : 
    1285          168 :     l = mbl = pg_big5_mblen(s);
    1286              : 
    1287          168 :     if (len < l)
    1288            4 :         return -1;
    1289              : 
    1290          164 :     if (l == 2 &&
    1291          164 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1292            8 :         s[1] == NONUTF8_INVALID_BYTE1)
    1293            8 :         return -1;
    1294              : 
    1295          264 :     while (--l > 0)
    1296              :     {
    1297          156 :         if (*++s == '\0')
    1298           48 :             return -1;
    1299              :     }
    1300              : 
    1301          108 :     return mbl;
    1302              : }
    1303              : 
    1304              : static int
    1305          104 : pg_big5_verifystr(const unsigned char *s, int len)
    1306              : {
    1307          104 :     const unsigned char *start = s;
    1308              : 
    1309          428 :     while (len > 0)
    1310              :     {
    1311              :         int         l;
    1312              : 
    1313              :         /* fast path for ASCII-subset characters */
    1314          384 :         if (!IS_HIGHBIT_SET(*s))
    1315              :         {
    1316          300 :             if (*s == '\0')
    1317           24 :                 break;
    1318          276 :             l = 1;
    1319              :         }
    1320              :         else
    1321              :         {
    1322           84 :             l = pg_big5_verifychar(s, len);
    1323           84 :             if (l == -1)
    1324           36 :                 break;
    1325              :         }
    1326          324 :         s += l;
    1327          324 :         len -= l;
    1328              :     }
    1329              : 
    1330          104 :     return s - start;
    1331              : }
    1332              : 
    1333              : static int
    1334          140 : pg_gbk_verifychar(const unsigned char *s, int len)
    1335              : {
    1336              :     int         l,
    1337              :                 mbl;
    1338              : 
    1339          140 :     l = mbl = pg_gbk_mblen(s);
    1340              : 
    1341          140 :     if (len < l)
    1342           28 :         return -1;
    1343              : 
    1344          112 :     if (l == 2 &&
    1345          112 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1346           16 :         s[1] == NONUTF8_INVALID_BYTE1)
    1347           16 :         return -1;
    1348              : 
    1349          192 :     while (--l > 0)
    1350              :     {
    1351           96 :         if (*++s == '\0')
    1352            0 :             return -1;
    1353              :     }
    1354              : 
    1355           96 :     return mbl;
    1356              : }
    1357              : 
    1358              : static int
    1359          132 : pg_gbk_verifystr(const unsigned char *s, int len)
    1360              : {
    1361          132 :     const unsigned char *start = s;
    1362              : 
    1363          336 :     while (len > 0)
    1364              :     {
    1365              :         int         l;
    1366              : 
    1367              :         /* fast path for ASCII-subset characters */
    1368          248 :         if (!IS_HIGHBIT_SET(*s))
    1369              :         {
    1370          124 :             if (*s == '\0')
    1371            0 :                 break;
    1372          124 :             l = 1;
    1373              :         }
    1374              :         else
    1375              :         {
    1376          124 :             l = pg_gbk_verifychar(s, len);
    1377          124 :             if (l == -1)
    1378           44 :                 break;
    1379              :         }
    1380          204 :         s += l;
    1381          204 :         len -= l;
    1382              :     }
    1383              : 
    1384          132 :     return s - start;
    1385              : }
    1386              : 
    1387              : static int
    1388           12 : pg_uhc_verifychar(const unsigned char *s, int len)
    1389              : {
    1390              :     int         l,
    1391              :                 mbl;
    1392              : 
    1393           12 :     l = mbl = pg_uhc_mblen(s);
    1394              : 
    1395           12 :     if (len < l)
    1396            4 :         return -1;
    1397              : 
    1398            8 :     if (l == 2 &&
    1399            8 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1400            8 :         s[1] == NONUTF8_INVALID_BYTE1)
    1401            8 :         return -1;
    1402              : 
    1403            0 :     while (--l > 0)
    1404              :     {
    1405            0 :         if (*++s == '\0')
    1406            0 :             return -1;
    1407              :     }
    1408              : 
    1409            0 :     return mbl;
    1410              : }
    1411              : 
    1412              : static int
    1413           16 : pg_uhc_verifystr(const unsigned char *s, int len)
    1414              : {
    1415           16 :     const unsigned char *start = s;
    1416              : 
    1417           28 :     while (len > 0)
    1418              :     {
    1419              :         int         l;
    1420              : 
    1421              :         /* fast path for ASCII-subset characters */
    1422           24 :         if (!IS_HIGHBIT_SET(*s))
    1423              :         {
    1424           12 :             if (*s == '\0')
    1425            0 :                 break;
    1426           12 :             l = 1;
    1427              :         }
    1428              :         else
    1429              :         {
    1430           12 :             l = pg_uhc_verifychar(s, len);
    1431           12 :             if (l == -1)
    1432           12 :                 break;
    1433              :         }
    1434           12 :         s += l;
    1435           12 :         len -= l;
    1436              :     }
    1437              : 
    1438           16 :     return s - start;
    1439              : }
    1440              : 
    1441              : static int
    1442          698 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1443              : {
    1444              :     int         l;
    1445              : 
    1446          698 :     if (!IS_HIGHBIT_SET(*s))
    1447            0 :         l = 1;                  /* ASCII */
    1448          698 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1449              :     {
    1450              :         /* Should be 4-byte, validate remaining bytes */
    1451          210 :         if (*s >= 0x81 && *s <= 0xfe &&
    1452          204 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1453          204 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1454          108 :             l = 4;
    1455              :         else
    1456          102 :             l = -1;
    1457              :     }
    1458          488 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1459              :     {
    1460              :         /* Should be 2-byte, validate */
    1461          358 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1462          238 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1463          176 :             l = 2;
    1464              :         else
    1465          182 :             l = -1;
    1466              :     }
    1467              :     else
    1468          130 :         l = -1;
    1469          698 :     return l;
    1470              : }
    1471              : 
    1472              : static int
    1473          500 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1474              : {
    1475          500 :     const unsigned char *start = s;
    1476              : 
    1477         1679 :     while (len > 0)
    1478              :     {
    1479              :         int         l;
    1480              : 
    1481              :         /* fast path for ASCII-subset characters */
    1482         1515 :         if (!IS_HIGHBIT_SET(*s))
    1483              :         {
    1484         1037 :             if (*s == '\0')
    1485           30 :                 break;
    1486         1007 :             l = 1;
    1487              :         }
    1488              :         else
    1489              :         {
    1490          478 :             l = pg_gb18030_verifychar(s, len);
    1491          478 :             if (l == -1)
    1492          306 :                 break;
    1493              :         }
    1494         1179 :         s += l;
    1495         1179 :         len -= l;
    1496              :     }
    1497              : 
    1498          500 :     return s - start;
    1499              : }
    1500              : 
    1501              : static int
    1502         9451 : pg_utf8_verifychar(const unsigned char *s, int len)
    1503              : {
    1504              :     int         l;
    1505              : 
    1506         9451 :     if ((*s & 0x80) == 0)
    1507              :     {
    1508            0 :         if (*s == '\0')
    1509            0 :             return -1;
    1510            0 :         return 1;
    1511              :     }
    1512         9451 :     else if ((*s & 0xe0) == 0xc0)
    1513         3331 :         l = 2;
    1514         6120 :     else if ((*s & 0xf0) == 0xe0)
    1515         3412 :         l = 3;
    1516         2708 :     else if ((*s & 0xf8) == 0xf0)
    1517         2532 :         l = 4;
    1518              :     else
    1519          176 :         l = 1;
    1520              : 
    1521         9451 :     if (l > len)
    1522          320 :         return -1;
    1523              : 
    1524         9131 :     if (!pg_utf8_islegal(s, l))
    1525         1486 :         return -1;
    1526              : 
    1527         7645 :     return l;
    1528              : }
    1529              : 
    1530              : /*
    1531              :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1532              :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1533              :  * input byte and current state are used to compute an index into an array of
    1534              :  * state transitions. Since the address of the next transition is dependent
    1535              :  * on this computation, there is latency in executing the load instruction,
    1536              :  * and the CPU is not kept busy.
    1537              :  *
    1538              :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1539              :  *
    1540              :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1541              :  *
    1542              :  * In a shift-based DFA, the input byte is an index into array of integers
    1543              :  * whose bit pattern encodes the state transitions. To compute the next
    1544              :  * state, we simply right-shift the integer by the current state and apply a
    1545              :  * mask. In this scheme, the address of the transition only depends on the
    1546              :  * input byte, so there is better pipelining.
    1547              :  *
    1548              :  * The naming convention for states and transitions was adopted from a UTF-8
    1549              :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1550              :  *
    1551              :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1552              :  *
    1553              :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1554              :  * ==========================================================================
    1555              :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1556              :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1557              :  *                                                                  |
    1558              :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1559              :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1560              :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1561              :  *                                                                  |
    1562              :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1563              :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1564              :  *                                                                  |
    1565              :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1566              :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1567              :  *
    1568              :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1569              :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1570              :  * it's possible to find state numbers such that the transitions fit within
    1571              :  * 32-bit integers, as Dougall Johnson demonstrated:
    1572              :  *
    1573              :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1574              :  *
    1575              :  * This packed representation is the reason for the seemingly odd choice of
    1576              :  * state values below.
    1577              :  */
    1578              : 
    1579              : /* Error */
    1580              : #define ERR  0
    1581              : /* Begin */
    1582              : #define BGN 11
    1583              : /* Continuation states, expect 1/2/3 continuation bytes */
    1584              : #define CS1 16
    1585              : #define CS2  1
    1586              : #define CS3  5
    1587              : /* Partial states, where the first continuation byte has a restricted range */
    1588              : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
    1589              : #define P3B 20                  /* Lead was ED, check for surrogate */
    1590              : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
    1591              : #define P4B 30                  /* Lead was F4, check for too-large */
    1592              : /* Begin and End are the same state */
    1593              : #define END BGN
    1594              : 
    1595              : /* the encoded state transitions for the lookup table */
    1596              : 
    1597              : /* ASCII */
    1598              : #define ASC (END << BGN)
    1599              : /* 2-byte lead */
    1600              : #define L2A (CS1 << BGN)
    1601              : /* 3-byte lead */
    1602              : #define L3A (P3A << BGN)
    1603              : #define L3B (CS2 << BGN)
    1604              : #define L3C (P3B << BGN)
    1605              : /* 4-byte lead */
    1606              : #define L4A (P4A << BGN)
    1607              : #define L4B (CS3 << BGN)
    1608              : #define L4C (P4B << BGN)
    1609              : /* continuation byte */
    1610              : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1611              : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1612              : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1613              : /* invalid byte */
    1614              : #define ILL ERR
    1615              : 
    1616              : static const uint32 Utf8Transition[256] =
    1617              : {
    1618              :     /* ASCII */
    1619              : 
    1620              :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1621              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1622              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1623              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1624              : 
    1625              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1626              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1627              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1628              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1629              : 
    1630              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1631              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1632              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1633              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1634              : 
    1635              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1636              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1637              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1638              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1639              : 
    1640              :     /* continuation bytes */
    1641              : 
    1642              :     /* 80..8F */
    1643              :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1644              :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1645              : 
    1646              :     /* 90..9F */
    1647              :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1648              :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1649              : 
    1650              :     /* A0..BF */
    1651              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1652              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1653              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1654              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1655              : 
    1656              :     /* leading bytes */
    1657              : 
    1658              :     /* C0..DF */
    1659              :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1660              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1661              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1662              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1663              : 
    1664              :     /* E0..EF */
    1665              :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1666              :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1667              : 
    1668              :     /* F0..FF */
    1669              :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1670              :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1671              : };
    1672              : 
    1673              : static void
    1674         1147 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1675              : {
    1676              :     /* Note: We deliberately don't check the state's value here. */
    1677        37851 :     while (len > 0)
    1678              :     {
    1679              :         /*
    1680              :          * It's important that the mask value is 31: In most instruction sets,
    1681              :          * a shift by a 32-bit operand is understood to be a shift by its mod
    1682              :          * 32, so the compiler should elide the mask operation.
    1683              :          */
    1684        36704 :         *state = Utf8Transition[*s++] >> (*state & 31);
    1685        36704 :         len--;
    1686              :     }
    1687              : 
    1688         1147 :     *state &= 31;
    1689         1147 : }
    1690              : 
    1691              : static int
    1692       717701 : pg_utf8_verifystr(const unsigned char *s, int len)
    1693              : {
    1694       717701 :     const unsigned char *start = s;
    1695       717701 :     const int   orig_len = len;
    1696       717701 :     uint32      state = BGN;
    1697              : 
    1698              : /*
    1699              :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1700              :  * the compiler can unroll a longer loop, it's not worth it because we
    1701              :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1702              :  */
    1703              : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1704              : 
    1705       717701 :     if (len >= STRIDE_LENGTH)
    1706              :     {
    1707      2662674 :         while (len >= STRIDE_LENGTH)
    1708              :         {
    1709              :             /*
    1710              :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1711              :              * but we must first check for a non-END state, which means the
    1712              :              * previous chunk ended in the middle of a multibyte sequence.
    1713              :              */
    1714      2299137 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1715         1147 :                 utf8_advance(s, &state, STRIDE_LENGTH);
    1716              : 
    1717      2299137 :             s += STRIDE_LENGTH;
    1718      2299137 :             len -= STRIDE_LENGTH;
    1719              :         }
    1720              : 
    1721              :         /* The error state persists, so we only need to check for it here. */
    1722       363537 :         if (state == ERR)
    1723              :         {
    1724              :             /*
    1725              :              * Start over from the beginning with the slow path so we can
    1726              :              * count the valid bytes.
    1727              :              */
    1728          336 :             len = orig_len;
    1729          336 :             s = start;
    1730              :         }
    1731       363201 :         else if (state != END)
    1732              :         {
    1733              :             /*
    1734              :              * The fast path exited in the middle of a multibyte sequence.
    1735              :              * Walk backwards to find the leading byte so that the slow path
    1736              :              * can resume checking from there. We must always backtrack at
    1737              :              * least one byte, since the current byte could be e.g. an ASCII
    1738              :              * byte after a 2-byte lead, which is invalid.
    1739              :              */
    1740              :             do
    1741              :             {
    1742              :                 Assert(s > start);
    1743           73 :                 s--;
    1744           73 :                 len++;
    1745              :                 Assert(IS_HIGHBIT_SET(*s));
    1746           73 :             } while (pg_utf_mblen(s) <= 1);
    1747              :         }
    1748              :     }
    1749              : 
    1750              :     /* check remaining bytes */
    1751     10703420 :     while (len > 0)
    1752              :     {
    1753              :         int         l;
    1754              : 
    1755              :         /* fast path for ASCII-subset characters */
    1756      9987625 :         if (!IS_HIGHBIT_SET(*s))
    1757              :         {
    1758      9978210 :             if (*s == '\0')
    1759          132 :                 break;
    1760      9978078 :             l = 1;
    1761              :         }
    1762              :         else
    1763              :         {
    1764         9415 :             l = pg_utf8_verifychar(s, len);
    1765         9415 :             if (l == -1)
    1766         1774 :                 break;
    1767              :         }
    1768      9985719 :         s += l;
    1769      9985719 :         len -= l;
    1770              :     }
    1771              : 
    1772       717701 :     return s - start;
    1773              : }
    1774              : 
    1775              : /*
    1776              :  * Check for validity of a single UTF-8 encoded character
    1777              :  *
    1778              :  * This directly implements the rules in RFC3629.  The bizarre-looking
    1779              :  * restrictions on the second byte are meant to ensure that there isn't
    1780              :  * more than one encoding of a given Unicode character point; that is,
    1781              :  * you may not use a longer-than-necessary byte sequence with high order
    1782              :  * zero bits to represent a character that would fit in fewer bytes.
    1783              :  * To do otherwise is to create security hazards (eg, create an apparent
    1784              :  * non-ASCII character that decodes to plain ASCII).
    1785              :  *
    1786              :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    1787              :  * caller must have checked that that many bytes are present in the buffer.
    1788              :  */
    1789              : bool
    1790        16204 : pg_utf8_islegal(const unsigned char *source, int length)
    1791              : {
    1792              :     unsigned char a;
    1793              : 
    1794        16204 :     switch (length)
    1795              :     {
    1796            0 :         default:
    1797              :             /* reject lengths 5 and 6 for now */
    1798            0 :             return false;
    1799         2396 :         case 4:
    1800         2396 :             a = source[3];
    1801         2396 :             if (a < 0x80 || a > 0xBF)
    1802          198 :                 return false;
    1803              :             pg_fallthrough;
    1804              :         case 3:
    1805         6619 :             a = source[2];
    1806         6619 :             if (a < 0x80 || a > 0xBF)
    1807          440 :                 return false;
    1808              :             pg_fallthrough;
    1809              :         case 2:
    1810         9830 :             a = source[1];
    1811         9830 :             switch (*source)
    1812              :             {
    1813          208 :                 case 0xE0:
    1814          208 :                     if (a < 0xA0 || a > 0xBF)
    1815          176 :                         return false;
    1816           32 :                     break;
    1817          208 :                 case 0xED:
    1818          208 :                     if (a < 0x80 || a > 0x9F)
    1819          176 :                         return false;
    1820           32 :                     break;
    1821         2078 :                 case 0xF0:
    1822         2078 :                     if (a < 0x90 || a > 0xBF)
    1823          176 :                         return false;
    1824         1902 :                     break;
    1825          120 :                 case 0xF4:
    1826          120 :                     if (a < 0x80 || a > 0x8F)
    1827           88 :                         return false;
    1828           32 :                     break;
    1829         7216 :                 default:
    1830         7216 :                     if (a < 0x80 || a > 0xBF)
    1831          168 :                         return false;
    1832         7048 :                     break;
    1833              :             }
    1834              :             pg_fallthrough;
    1835              :         case 1:
    1836        14782 :             a = *source;
    1837        14782 :             if (a >= 0x80 && a < 0xC2)
    1838          264 :                 return false;
    1839        14518 :             if (a > 0xF4)
    1840           88 :                 return false;
    1841        14430 :             break;
    1842              :     }
    1843        14430 :     return true;
    1844              : }
    1845              : 
    1846              : 
    1847              : /*
    1848              :  * Fills the provided buffer with two bytes such that:
    1849              :  *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
    1850              :  */
    1851              : void
    1852          212 : pg_encoding_set_invalid(int encoding, char *dst)
    1853              : {
    1854              :     Assert(pg_encoding_max_length(encoding) > 1);
    1855              : 
    1856          212 :     dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
    1857          212 :     dst[1] = NONUTF8_INVALID_BYTE1;
    1858          212 : }
    1859              : 
    1860              : /*
    1861              :  *-------------------------------------------------------------------
    1862              :  * encoding info table
    1863              :  *-------------------------------------------------------------------
    1864              :  */
    1865              : const pg_wchar_tbl pg_wchar_table[] = {
    1866              :     [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
    1867              :     [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    1868              :     [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3},
    1869              :     [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
    1870              :     [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
    1871              :     [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    1872              :     [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
    1873              :     [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1874              :     [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1875              :     [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1876              :     [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1877              :     [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1878              :     [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1879              :     [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1880              :     [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1881              :     [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1882              :     [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1883              :     [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1884              :     [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1885              :     [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1886              :     [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1887              :     [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1888              :     [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1889              :     [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1890              :     [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1891              :     [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1892              :     [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1893              :     [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1894              :     [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1895              :     [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1896              :     [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1897              :     [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1898              :     [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1899              :     [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1900              :     [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    1901              :     [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
    1902              :     [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
    1903              :     [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
    1904              :     [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
    1905              :     [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
    1906              :     [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    1907              : };
    1908              : 
    1909              : /*
    1910              :  * Returns the byte length of a multibyte character.
    1911              :  *
    1912              :  * Choose "mblen" functions based on the input string characteristics.
    1913              :  * pg_encoding_mblen() can be used when ANY of these conditions are met:
    1914              :  *
    1915              :  * - The input string is zero-terminated
    1916              :  *
    1917              :  * - The input string is known to be valid in the encoding (e.g., string
    1918              :  *   converted from database encoding)
    1919              :  *
    1920              :  * - The encoding is not GB18030 (e.g., when only database encodings are
    1921              :  *   passed to 'encoding' parameter)
    1922              :  *
    1923              :  * encoding==GB18030 requires examining up to two bytes to determine character
    1924              :  * length.  Therefore, callers satisfying none of those conditions must use
    1925              :  * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
    1926              :  * guaranteed to be within allocation bounds.
    1927              :  *
    1928              :  * When dealing with text that is not certainly valid in the specified
    1929              :  * encoding, the result may exceed the actual remaining string length.
    1930              :  * Callers that are not prepared to deal with that should use Min(remaining,
    1931              :  * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
    1932              :  * pg_encoding_mblen_bounded() are interchangeable.
                      :  *
                      :  * The length is computed by the mblen routine registered for 'encoding' in
                      :  * pg_wchar_table.  An encoding that fails PG_VALID_ENCODING() is not
                      :  * reported as an error; the PG_SQL_ASCII routine is used instead.
    1933              :  */
    1934              : int
    1935     30165036 : pg_encoding_mblen(int encoding, const char *mbstr)
    1936              : {
    1937     30165036 :     return (PG_VALID_ENCODING(encoding) ?
    1938     60330072 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
    1939            0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
    1940              : }
    1941              : 
    1942              : /*
    1943              :  * Returns the byte length of a multibyte character (possibly not
    1944              :  * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
                      :  *
                      :  * 'remaining' is the number of bytes the caller can safely read at mbstr.
                      :  * INT_MAX is returned when 'remaining' is zero, or when the encoding is
                      :  * PG_GB18030, the first byte has its high bit set, and fewer than two bytes
                      :  * remain.  In every other case the result is whatever pg_encoding_mblen()
                      :  * reports, which can still be larger than 'remaining'; callers that need a
                      :  * bounded value must apply Min(remaining, result) themselves.
    1945              :  */
    1946              : int
    1947         3136 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
    1948              :                                 size_t remaining)
    1949              : {
    1950              :     /*
    1951              :      * Define zero remaining as too few, even for single-byte encodings.
    1952              :      * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
    1953              :      * zero; others read one.
    1954              :      */
    1955         3136 :     if (remaining < 1 ||
    1956          202 :         (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
    1957           42 :         return INT_MAX;
    1958         3094 :     return pg_encoding_mblen(encoding, mbstr);
    1959              : }
    1960              : 
    1961              : /*
    1962              :  * Returns the byte length of a multibyte character; but not more than the
    1963              :  * distance to the terminating zero byte.  For input that might lack a
    1964              :  * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
                      :  *
                      :  * This is strnlen() limited to the length pg_encoding_mblen() reports, so
                      :  * the result is 0 when mbstr already points at the terminating zero byte.
    1965              :  */
    1966              : int
    1967            0 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    1968              : {
    1969            0 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
    1970              : }
    1971              : 
    1972              : /*
    1973              :  * Returns the display length of a multibyte character.
                      :  *
                      :  * The value comes from the dsplen routine registered for 'encoding' in
                      :  * pg_wchar_table.  An encoding that fails PG_VALID_ENCODING() is not
                      :  * reported as an error; the PG_SQL_ASCII routine is used instead.
                      :  *
                      :  * NOTE(review): how many bytes at mbstr get examined depends on the
                      :  * per-encoding dsplen routine, which is defined elsewhere in this file.
    1974              :  */
    1975              : int
    1976     30049471 : pg_encoding_dsplen(int encoding, const char *mbstr)
    1977              : {
    1978     30049471 :     return (PG_VALID_ENCODING(encoding) ?
    1979     60098942 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
    1980            0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
    1981              : }
    1982              : 
    1983              : /*
    1984              :  * Verify the first multibyte character of the given string.
    1985              :  * Return its byte length if good, -1 if bad.  (See comments above for
    1986              :  * full details of the mbverifychar API.)
                      :  *
                      :  * 'len' is passed unchanged to the mbverifychar routine registered for
                      :  * 'encoding' in pg_wchar_table.  An encoding that fails
                      :  * PG_VALID_ENCODING() is checked with the PG_SQL_ASCII routine instead.
    1987              :  */
    1988              : int
    1989         4228 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    1990              : {
    1991         4228 :     return (PG_VALID_ENCODING(encoding) ?
    1992         8456 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
    1993            0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
    1994              : }
    1995              : 
    1996              : /*
    1997              :  * Verify that a string is valid for the given encoding.
    1998              :  * Returns the number of input bytes (<= len) that form a valid string.
    1999              :  * (See comments above for full details of the mbverifystr API.)
                      :  *
                      :  * 'len' is passed unchanged to the mbverifystr routine registered for
                      :  * 'encoding' in pg_wchar_table.  An encoding that fails
                      :  * PG_VALID_ENCODING() is checked with the PG_SQL_ASCII routine instead.
    2000              :  */
    2001              : int
    2002       230721 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2003              : {
    2004       230721 :     return (PG_VALID_ENCODING(encoding) ?
    2005       461442 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
    2006            0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
    2007              : }
    2008              : 
    2009              : /*
    2010              :  * Returns the maxmblen value recorded for the given encoding in
                      :  * pg_wchar_table (presumably the largest number of bytes one character can
                      :  * occupy in that encoding; confirm against the table's definition).
                      :  *
                      :  * The encoding is expected to be valid and that is asserted.  If execution
                      :  * continues past the Assert with an invalid encoding, the PG_SQL_ASCII
                      :  * entry's value is returned.
    2011              :  */
    2012              : int
    2013       684682 : pg_encoding_max_length(int encoding)
    2014              : {
    2015              :     Assert(PG_VALID_ENCODING(encoding));
    2016              : 
    2017              :     /*
    2018              :      * Check for the encoding despite the assert, due to some mingw versions
    2019              :      * otherwise issuing bogus warnings.
    2020              :      */
    2021       684682 :     return PG_VALID_ENCODING(encoding) ?
    2022      1369364 :         pg_wchar_table[encoding].maxmblen :
    2023              :         pg_wchar_table[PG_SQL_ASCII].maxmblen;
    2024              : }
        

Generated by: LCOV version 2.0-1