LCOV - code coverage report
Current view: top level - src/common - wchar.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 80.8 % 750 606
Test Date: 2026-05-01 10:16:27 Functions: 82.9 % 76 63
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * wchar.c
       4              :  *    Functions for working with multibyte characters in various encodings.
       5              :  *
       6              :  * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    src/common/wchar.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : #include "c.h"
      14              : 
      15              : #include <limits.h>
      16              : 
      17              : #include "mb/pg_wchar.h"
      18              : #include "utils/ascii.h"
      19              : 
      20              : 
      21              : /*
      22              :  * In today's multibyte encodings other than UTF8, this two-byte sequence
      23              :  * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
      24              :  *
      25              :  * For historical reasons, several verifychar implementations opt to reject
      26              :  * this pair specifically.  Byte pair range constraints, in encoding
      27              :  * originator documentation, always excluded this pair.  No core conversion
      28              :  * could translate it.  However, longstanding verifychar implementations
      29              :  * accepted any non-NUL byte.  big5_to_euc_tw even translates pairs not
      30              :  * valid per encoding originator documentation.  To avoid tightening core
      31              :  * or non-core conversions in a security patch, we sought this one pair.
      32              :  *
      33              :  * PQescapeString() historically used spaces for BYTE1; many other values
      34              :  * could suffice for BYTE1.
      35              :  */
      36              : #define NONUTF8_INVALID_BYTE0 (0x8d)
      37              : #define NONUTF8_INVALID_BYTE1 (' ')
      38              : 
      39              : 
      40              : /*
      41              :  * Operations on multi-byte encodings are driven by a table of helper
      42              :  * functions.
      43              :  *
      44              :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
      45              :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      46              :  * and wchar2mb() conversion functions.
      47              :  *
      48              :  * These functions generally assume that their input is validly formed.
      49              :  * The "verifier" functions, further down in the file, have to be more
      50              :  * paranoid.
      51              :  *
      52              :  * We expect that mblen() does not need to examine more than the first byte
      53              :  * of the character to discover the correct length.  GB18030 is an exception
      54              :  * to that rule, though, as it also looks at the second byte.  But even that
      55              :  * behaves in a predictable way, if you only pass the first byte: it will
      56              :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      57              :  * good enough for all current uses.
      58              :  *
      59              :  * Note: for the display output of psql to work properly, the return values
      60              :  * of the dsplen functions must conform to the Unicode standard. In particular
      61              :  * the NUL character is zero width and control characters are generally
      62              :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      63              :  * subset to the ASCII routines to ensure consistency.
      64              :  */
      65              : 
      66              : /* No error-reporting facility.  Ignore incomplete trailing byte sequence. */
      67              : #define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break
      68              : 
      69              : /*
      70              :  * SQL/ASCII
      71              :  */
      72              : static int
      73          433 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      74              : {
      75          433 :     int         cnt = 0;
      76              : 
      77        33279 :     while (len > 0 && *from)
      78              :     {
      79        32846 :         *to++ = *from++;
      80        32846 :         len--;
      81        32846 :         cnt++;
      82              :     }
      83          433 :     *to = 0;
      84          433 :     return cnt;
      85              : }
      86              : 
      87              : static int
      88        19580 : pg_ascii_mblen(const unsigned char *s)
      89              : {
      90        19580 :     return 1;
      91              : }
      92              : 
      93              : static int
      94        18075 : pg_ascii_dsplen(const unsigned char *s)
      95              : {
      96        18075 :     if (*s == '\0')
      97            0 :         return 0;
      98        18075 :     if (*s < 0x20 || *s == 0x7f)
      99            2 :         return -1;
     100              : 
     101        18073 :     return 1;
     102              : }
     103              : 
     104              : /*
     105              :  * EUC
     106              :  */
     107              : static int
     108           32 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     109              : {
     110           32 :     int         cnt = 0;
     111              : 
     112           48 :     while (len > 0 && *from)
     113              :     {
     114           32 :         if (*from == SS2)       /* JIS X 0201 (so called "1 byte KANA") */
     115              :         {
     116            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     117            4 :             from++;
     118            4 :             *to = (SS2 << 8) | *from++;
     119            4 :             len -= 2;
     120              :         }
     121           24 :         else if (*from == SS3)  /* JIS X 0212 KANJI */
     122              :         {
     123           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     124            4 :             from++;
     125            4 :             *to = (SS3 << 16) | (*from++ << 8);
     126            4 :             *to |= *from++;
     127            4 :             len -= 3;
     128              :         }
     129           12 :         else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
     130              :         {
     131            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     132            4 :             *to = *from++ << 8;
     133            4 :             *to |= *from++;
     134            4 :             len -= 2;
     135              :         }
     136              :         else                    /* must be ASCII */
     137              :         {
     138            4 :             *to = *from++;
     139            4 :             len--;
     140              :         }
     141           16 :         to++;
     142           16 :         cnt++;
     143              :     }
     144           32 :     *to = 0;
     145           32 :     return cnt;
     146              : }
     147              : 
     148              : static inline int
     149          156 : pg_euc_mblen(const unsigned char *s)
     150              : {
     151              :     int         len;
     152              : 
     153          156 :     if (*s == SS2)
     154            0 :         len = 2;
     155          156 :     else if (*s == SS3)
     156            0 :         len = 3;
     157          156 :     else if (IS_HIGHBIT_SET(*s))
     158          108 :         len = 2;
     159              :     else
     160           48 :         len = 1;
     161          156 :     return len;
     162              : }
     163              : 
     164              : static inline int
     165            0 : pg_euc_dsplen(const unsigned char *s)
     166              : {
     167              :     int         len;
     168              : 
     169            0 :     if (*s == SS2)
     170            0 :         len = 2;
     171            0 :     else if (*s == SS3)
     172            0 :         len = 2;
     173            0 :     else if (IS_HIGHBIT_SET(*s))
     174            0 :         len = 2;
     175              :     else
     176            0 :         len = pg_ascii_dsplen(s);
     177            0 :     return len;
     178              : }
     179              : 
     180              : /*
     181              :  * EUC_JP
     182              :  */
     183              : static int
     184           32 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     185              : {
     186           32 :     return pg_euc2wchar_with_len(from, to, len);
     187              : }
     188              : 
     189              : static int
     190          136 : pg_eucjp_mblen(const unsigned char *s)
     191              : {
     192          136 :     return pg_euc_mblen(s);
     193              : }
     194              : 
     195              : static int
     196            0 : pg_eucjp_dsplen(const unsigned char *s)
     197              : {
     198              :     int         len;
     199              : 
     200            0 :     if (*s == SS2)
     201            0 :         len = 1;
     202            0 :     else if (*s == SS3)
     203            0 :         len = 2;
     204            0 :     else if (IS_HIGHBIT_SET(*s))
     205            0 :         len = 2;
     206              :     else
     207            0 :         len = pg_ascii_dsplen(s);
     208            0 :     return len;
     209              : }
     210              : 
     211              : /*
     212              :  * EUC_KR
     213              :  */
     214              : static int
     215            0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     216              : {
     217            0 :     return pg_euc2wchar_with_len(from, to, len);
     218              : }
     219              : 
     220              : static int
     221            4 : pg_euckr_mblen(const unsigned char *s)
     222              : {
     223            4 :     return pg_euc_mblen(s);
     224              : }
     225              : 
     226              : static int
     227            0 : pg_euckr_dsplen(const unsigned char *s)
     228              : {
     229            0 :     return pg_euc_dsplen(s);
     230              : }
     231              : 
     232              : /*
     233              :  * EUC_CN
     234              :  *
     235              :  */
     236              : static int
     237           36 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     238              : {
     239           36 :     int         cnt = 0;
     240              : 
     241           52 :     while (len > 0 && *from)
     242              :     {
     243           36 :         if (*from == SS2)       /* code set 2 (unused?) */
     244              :         {
     245           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     246            4 :             from++;
     247            4 :             *to = (SS2 << 16) | (*from++ << 8);
     248            4 :             *to |= *from++;
     249            4 :             len -= 3;
     250              :         }
     251           24 :         else if (*from == SS3)  /* code set 3 (unused?) */
     252              :         {
     253           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     254            4 :             from++;
     255            4 :             *to = (SS3 << 16) | (*from++ << 8);
     256            4 :             *to |= *from++;
     257            4 :             len -= 3;
     258              :         }
     259           12 :         else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
     260              :         {
     261            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     262            4 :             *to = *from++ << 8;
     263            4 :             *to |= *from++;
     264            4 :             len -= 2;
     265              :         }
     266              :         else
     267              :         {
     268            4 :             *to = *from++;
     269            4 :             len--;
     270              :         }
     271           16 :         to++;
     272           16 :         cnt++;
     273              :     }
     274           36 :     *to = 0;
     275           36 :     return cnt;
     276              : }
     277              : 
     278              : /*
     279              :  * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
     280              :  * EUC_CN), but mb2wchar_with_len does.  Tell a coherent story for code that
     281              :  * relies on agreement between mb2wchar_with_len and mblen.  Invalid text
     282              :  * datums (e.g. from shared catalogs) reach this.
     283              :  */
     284              : static int
     285            4 : pg_euccn_mblen(const unsigned char *s)
     286              : {
     287              :     int         len;
     288              : 
     289            4 :     if (*s == SS2)
     290            0 :         len = 3;
     291            4 :     else if (*s == SS3)
     292            0 :         len = 3;
     293            4 :     else if (IS_HIGHBIT_SET(*s))
     294            4 :         len = 2;
     295              :     else
     296            0 :         len = 1;
     297            4 :     return len;
     298              : }
     299              : 
     300              : static int
     301            0 : pg_euccn_dsplen(const unsigned char *s)
     302              : {
     303              :     int         len;
     304              : 
     305            0 :     if (IS_HIGHBIT_SET(*s))
     306            0 :         len = 2;
     307              :     else
     308            0 :         len = pg_ascii_dsplen(s);
     309            0 :     return len;
     310              : }
     311              : 
     312              : /*
     313              :  * EUC_TW
     314              :  *
     315              :  */
     316              : static int
     317           40 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     318              : {
     319           40 :     int         cnt = 0;
     320              : 
     321           56 :     while (len > 0 && *from)
     322              :     {
     323           40 :         if (*from == SS2)       /* code set 2 */
     324              :         {
     325           16 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     326            4 :             from++;
     327            4 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);
     328            4 :             *to |= *from++ << 8;
     329            4 :             *to |= *from++;
     330            4 :             len -= 4;
     331              :         }
     332           24 :         else if (*from == SS3)  /* code set 3 (unused?) */
     333              :         {
     334           12 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     335            4 :             from++;
     336            4 :             *to = (SS3 << 16) | (*from++ << 8);
     337            4 :             *to |= *from++;
     338            4 :             len -= 3;
     339              :         }
     340           12 :         else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
     341              :         {
     342            8 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     343            4 :             *to = *from++ << 8;
     344            4 :             *to |= *from++;
     345            4 :             len -= 2;
     346              :         }
     347              :         else
     348              :         {
     349            4 :             *to = *from++;
     350            4 :             len--;
     351              :         }
     352           16 :         to++;
     353           16 :         cnt++;
     354              :     }
     355           40 :     *to = 0;
     356           40 :     return cnt;
     357              : }
     358              : 
     359              : static int
     360            4 : pg_euctw_mblen(const unsigned char *s)
     361              : {
     362              :     int         len;
     363              : 
     364            4 :     if (*s == SS2)
     365            0 :         len = 4;
     366            4 :     else if (*s == SS3)
     367            0 :         len = 3;
     368            4 :     else if (IS_HIGHBIT_SET(*s))
     369            4 :         len = 2;
     370              :     else
     371            0 :         len = 1;
     372            4 :     return len;
     373              : }
     374              : 
     375              : static int
     376            0 : pg_euctw_dsplen(const unsigned char *s)
     377              : {
     378              :     int         len;
     379              : 
     380            0 :     if (*s == SS2)
     381            0 :         len = 2;
     382            0 :     else if (*s == SS3)
     383            0 :         len = 2;
     384            0 :     else if (IS_HIGHBIT_SET(*s))
     385            0 :         len = 2;
     386              :     else
     387            0 :         len = pg_ascii_dsplen(s);
     388            0 :     return len;
     389              : }
     390              : 
     391              : /*
     392              :  * Convert pg_wchar to EUC_* encoding.
     393              :  * caller must allocate enough space for "to", including a trailing zero!
     394              :  * len: length of from.
     395              :  * "from" not necessarily null terminated.
     396              :  */
     397              : static int
     398           48 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     399              : {
     400           48 :     int         cnt = 0;
     401              : 
     402           96 :     while (len > 0 && *from)
     403              :     {
     404              :         unsigned char c;
     405              : 
     406           48 :         if ((c = (*from >> 24)))
     407              :         {
     408            4 :             *to++ = c;
     409            4 :             *to++ = (*from >> 16) & 0xff;
     410            4 :             *to++ = (*from >> 8) & 0xff;
     411            4 :             *to++ = *from & 0xff;
     412            4 :             cnt += 4;
     413              :         }
     414           44 :         else if ((c = (*from >> 16)))
     415              :         {
     416           16 :             *to++ = c;
     417           16 :             *to++ = (*from >> 8) & 0xff;
     418           16 :             *to++ = *from & 0xff;
     419           16 :             cnt += 3;
     420              :         }
     421           28 :         else if ((c = (*from >> 8)))
     422              :         {
     423           16 :             *to++ = c;
     424           16 :             *to++ = *from & 0xff;
     425           16 :             cnt += 2;
     426              :         }
     427              :         else
     428              :         {
     429           12 :             *to++ = *from;
     430           12 :             cnt++;
     431              :         }
     432           48 :         from++;
     433           48 :         len--;
     434              :     }
     435           48 :     *to = 0;
     436           48 :     return cnt;
     437              : }
     438              : 
     439              : 
     440              : /*
     441              :  * JOHAB
     442              :  */
     443              : static int
     444           16 : pg_johab_mblen(const unsigned char *s)
     445              : {
     446           16 :     return pg_euc_mblen(s);
     447              : }
     448              : 
     449              : static int
     450            0 : pg_johab_dsplen(const unsigned char *s)
     451              : {
     452            0 :     return pg_euc_dsplen(s);
     453              : }
     454              : 
     455              : /*
     456              :  * convert UTF8 string to pg_wchar (UCS-4)
     457              :  * caller must allocate enough space for "to", including a trailing zero!
     458              :  * len: length of from.
     459              :  * "from" not necessarily null terminated.
     460              :  */
     461              : static int
     462      6722638 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     463              : {
     464      6722638 :     int         cnt = 0;
     465              :     uint32      c1,
     466              :                 c2,
     467              :                 c3,
     468              :                 c4;
     469              : 
     470    106723589 :     while (len > 0 && *from)
     471              :     {
     472    100000979 :         if ((*from & 0x80) == 0)
     473              :         {
     474    100000300 :             *to = *from++;
     475    100000300 :             len--;
     476              :         }
     477          679 :         else if ((*from & 0xe0) == 0xc0)
     478              :         {
     479          345 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     480          337 :             c1 = *from++ & 0x1f;
     481          337 :             c2 = *from++ & 0x3f;
     482          337 :             *to = (c1 << 6) | c2;
     483          337 :             len -= 2;
     484              :         }
     485          334 :         else if ((*from & 0xf0) == 0xe0)
     486              :         {
     487          174 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     488          166 :             c1 = *from++ & 0x0f;
     489          166 :             c2 = *from++ & 0x3f;
     490          166 :             c3 = *from++ & 0x3f;
     491          166 :             *to = (c1 << 12) | (c2 << 6) | c3;
     492          166 :             len -= 3;
     493              :         }
     494          160 :         else if ((*from & 0xf8) == 0xf0)
     495              :         {
     496           16 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     497            4 :             c1 = *from++ & 0x07;
     498            4 :             c2 = *from++ & 0x3f;
     499            4 :             c3 = *from++ & 0x3f;
     500            4 :             c4 = *from++ & 0x3f;
     501            4 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     502            4 :             len -= 4;
     503              :         }
     504              :         else
     505              :         {
     506              :             /* treat a bogus char as length 1; not ours to raise error */
     507          144 :             *to = *from++;
     508          144 :             len--;
     509              :         }
     510    100000951 :         to++;
     511    100000951 :         cnt++;
     512              :     }
     513      6722638 :     *to = 0;
     514      6722638 :     return cnt;
     515              : }
     516              : 
     517              : 
     518              : /*
     519              :  * Trivial conversion from pg_wchar to UTF-8.
     520              :  * caller should allocate enough space for "to"
     521              :  * len: length of from.
     522              :  * "from" not necessarily null terminated.
     523              :  */
     524              : static int
     525       579538 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     526              : {
     527       579538 :     int         cnt = 0;
     528              : 
     529      8542747 :     while (len > 0 && *from)
     530              :     {
     531              :         int         char_len;
     532              : 
     533      7963209 :         unicode_to_utf8(*from, to);
     534      7963209 :         char_len = pg_utf_mblen(to);
     535      7963209 :         cnt += char_len;
     536      7963209 :         to += char_len;
     537      7963209 :         from++;
     538      7963209 :         len--;
     539              :     }
     540       579538 :     *to = 0;
     541       579538 :     return cnt;
     542              : }
     543              : 
     544              : /*
     545              :  * Return the byte length of a UTF8 character pointed to by s
     546              :  *
     547              :  * Note: in the current implementation we do not support UTF8 sequences
     548              :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     549              :  * We return "1" for any leading byte that is either flat-out illegal or
     550              :  * indicates a length larger than we support.
     551              :  *
     552              :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     553              :  * other places would need to be fixed to change this.
     554              :  */
     555              : int
     556     96388698 : pg_utf_mblen(const unsigned char *s)
     557              : {
     558              :     int         len;
     559              : 
     560     96388698 :     if ((*s & 0x80) == 0)
     561     96284117 :         len = 1;
     562       104581 :     else if ((*s & 0xe0) == 0xc0)
     563         8504 :         len = 2;
     564        96077 :     else if ((*s & 0xf0) == 0xe0)
     565        69695 :         len = 3;
     566        26382 :     else if ((*s & 0xf8) == 0xf0)
     567        26267 :         len = 4;
     568              : #ifdef NOT_USED
     569              :     else if ((*s & 0xfc) == 0xf8)
     570              :         len = 5;
     571              :     else if ((*s & 0xfe) == 0xfc)
     572              :         len = 6;
     573              : #endif
     574              :     else
     575          115 :         len = 1;
     576     96388698 :     return len;
     577              : }
     578              : 
     579              : /*
     580              :  * This is an implementation of wcwidth() and wcswidth() as defined in
     581              :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     582              :  * <http://www.unix.org/online.html>
     583              :  *
     584              :  * Markus Kuhn -- 2001-09-08 -- public domain
     585              :  *
     586              :  * customised for PostgreSQL
     587              :  *
     588              :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     589              :  */
     590              : 
     591              : struct mbinterval
     592              : {
     593              :     unsigned int first;
     594              :     unsigned int last;
     595              : };
     596              : 
     597              : /* auxiliary function for binary search in interval table */
     598              : static int
     599     59764790 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
     600              : {
     601     59764790 :     int         min = 0;
     602              :     int         mid;
     603              : 
     604     59764790 :     if (ucs < table[0].first || ucs > table[max].last)
     605     59759144 :         return 0;
     606        49149 :     while (max >= min)
     607              :     {
     608        43983 :         mid = (min + max) / 2;
     609        43983 :         if (ucs > table[mid].last)
     610         9747 :             min = mid + 1;
     611        34236 :         else if (ucs < table[mid].first)
     612        33756 :             max = mid - 1;
     613              :         else
     614          480 :             return 1;
     615              :     }
     616              : 
     617         5166 :     return 0;
     618              : }
     619              : 
     620              : 
     621              : /* The following functions define the column width of an ISO 10646
     622              :  * character as follows:
     623              :  *
     624              :  *    - The null character (U+0000) has a column width of 0.
     625              :  *
     626              :  *    - Other C0/C1 control characters and DEL will lead to a return
     627              :  *      value of -1, as will any code point above U+10FFFF.
     628              :  *
     629              :  *    - Non-spacing and enclosing combining characters (general
     630              :  *      category code Mn, Me or Cf in the Unicode database) have a
     631              :  *      column width of 0.
     632              :  *
     633              :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     634              :  *      FullWidth (F) category as defined in Unicode Technical
     635              :  *      Report #11 have a column width of 2.
     636              :  *
     637              :  *    - All remaining characters (including all printable
     638              :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     639              :  *      etc.) have a column width of 1.
     640              :  *
     641              :  * This implementation assumes that wchar_t characters are encoded
     642              :  * in ISO 10646.
     643              :  */
     644              : 
     645              : static int
     646     29911966 : ucs_wcwidth(pg_wchar ucs)
     647              : {
                            /*
                             * The two headers are included inside the function body; they
                             * presumably define the nonspacing[] and east_asian_fw[] arrays of
                             * struct mbinterval that are searched below.
                             */
     648              : #include "common/unicode_nonspacing_table.h"
     649              : #include "common/unicode_east_asian_fw_table.h"
     650              : 
     651              :     /* test for 8-bit control characters */
                            /* code point zero is reported as zero columns wide */
     652     29911966 :     if (ucs == 0)
     653            0 :         return 0;
     654              : 
                            /*
                             * Values below 0x20, values in 0x7f..0x9f, and values above 0x10ffff
                             * get -1 instead of a width.
                             */
     655     29911966 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
     656        29409 :         return -1;
     657              : 
     658              :     /*
     659              :      * binary search in table of non-spacing characters
     660              :      *
     661              :      * XXX: In the official Unicode sources, it is possible for a character to
     662              :      * be described as both non-spacing and wide at the same time. As of
     663              :      * Unicode 13.0, treating the non-spacing property as the determining
     664              :      * factor for display width leads to the correct behavior, so do that
     665              :      * search first.
     666              :      */
                            /* the third argument is the index of the last table entry, not a count */
     667     29882557 :     if (mbbisearch(ucs, nonspacing,
     668              :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
     669          324 :         return 0;
     670              : 
     671              :     /* binary search in table of wide characters */
     672     29882233 :     if (mbbisearch(ucs, east_asian_fw,
     673              :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
     674          156 :         return 2;
     675              : 
                            /* found in neither table: one column */
     676     29882077 :     return 1;
     677              : }
     678              : 
                        /*
                         * Display width of the UTF-8 character starting at s: decode it with
                         * utf8_to_unicode() and return whatever ucs_wcwidth() reports for that
                         * code point (-1, 0, 1 or 2).
                         */
     679              : static int
     680     29911966 : pg_utf_dsplen(const unsigned char *s)
     681              : {
     682     29911966 :     return ucs_wcwidth(utf8_to_unicode(s));
     683              : }
     684              : 
     685              : /*
     686              :  * ISO8859-1
                         *
                         * Widen up to len bytes of "from" into "to", one pg_wchar per byte,
                         * stopping early at a zero byte.  A terminating zero is always stored
                         * after the converted characters, so "to" needs room for one element
                         * more than the number converted.  Returns the number of characters
                         * converted, not counting the terminator.
     687              :  */
     688              : static int
     689          468 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     690              : {
     691          468 :     int         cnt = 0;
     692              : 
     693        13377 :     while (len > 0 && *from)
     694              :     {
     695        12909 :         *to++ = *from++;
     696        12909 :         len--;
     697        12909 :         cnt++;
     698              :     }
     699          468 :     *to = 0;
     700          468 :     return cnt;
     701              : }
     702              : 
     703              : /*
     704              :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     705              :  * high bits.
     706              :  * caller should allocate enough space for "to"
     707              :  * len: length of from.
     708              :  * "from" not necessarily null terminated.
                         *
                         * Conversion stops after len elements or at the first zero element,
                         * whichever comes first.  A terminating zero byte is always written, so
                         * "to" needs one byte more than the number of characters converted.
                         * Returns the number of characters converted.
     709              :  */
     710              : static int
     711           79 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     712              : {
     713           79 :     int         cnt = 0;
     714              : 
     715          678 :     while (len > 0 && *from)
     716              :     {
                                /* the assignment keeps only what fits in an unsigned char */
     717          599 :         *to++ = *from++;
     718          599 :         len--;
     719          599 :         cnt++;
     720              :     }
     721           79 :     *to = 0;
     722           79 :     return cnt;
     723              : }
     724              : 
                        /*
                         * Byte length of a character in a single-byte encoding: always 1.  The
                         * argument is not examined.
                         */
     725              : static int
     726         3614 : pg_latin1_mblen(const unsigned char *s)
     727              : {
     728         3614 :     return 1;
     729              : }
     730              : 
                        /*
                         * Display width of the single-byte character at s; simply defers to
                         * pg_ascii_dsplen().
                         */
     731              : static int
     732          400 : pg_latin1_dsplen(const unsigned char *s)
     733              : {
     734          400 :     return pg_ascii_dsplen(s);
     735              : }
     736              : 
     737              : /*
     738              :  * SJIS
                         *
                         * Byte length of the character whose first byte is *s: a byte in
                         * 0xa1..0xdf is a one-byte character, any other byte with the high bit
                         * set starts a two-byte character, and everything else is one byte.
                         * Only the first byte is examined; nothing is validated.
     739              :  */
     740              : static int
     741         1015 : pg_sjis_mblen(const unsigned char *s)
     742              : {
     743              :     int         len;
     744              : 
     745         1015 :     if (*s >= 0xa1 && *s <= 0xdf)
     746            0 :         len = 1;                /* 1 byte kana? */
     747         1015 :     else if (IS_HIGHBIT_SET(*s))
     748          809 :         len = 2;                /* kanji? */
     749              :     else
     750          206 :         len = 1;                /* should be ASCII */
     751         1015 :     return len;
     752              : }
     753              : 
                        /*
                         * Display width of the SJIS character whose first byte is *s: 1 for a
                         * byte in 0xa1..0xdf, 2 for any other byte with the high bit set, and
                         * otherwise whatever pg_ascii_dsplen() reports.
                         */
     754              : static int
     755            0 : pg_sjis_dsplen(const unsigned char *s)
     756              : {
     757              :     int         len;
     758              : 
     759            0 :     if (*s >= 0xa1 && *s <= 0xdf)
     760            0 :         len = 1;                /* 1 byte kana? */
     761            0 :     else if (IS_HIGHBIT_SET(*s))
     762            0 :         len = 2;                /* kanji? */
     763              :     else
     764            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     765            0 :     return len;
     766              : }
     767              : 
     768              : /*
     769              :  * Big5
                         *
                         * Byte length of the character whose first byte is *s: 2 when the high
                         * bit is set, otherwise 1.  Only the first byte is examined; nothing is
                         * validated.
     770              :  */
     771              : static int
     772          232 : pg_big5_mblen(const unsigned char *s)
     773              : {
     774              :     int         len;
     775              : 
     776          232 :     if (IS_HIGHBIT_SET(*s))
     777          208 :         len = 2;                /* kanji? */
     778              :     else
     779           24 :         len = 1;                /* should be ASCII */
     780          232 :     return len;
     781              : }
     782              : 
                        /*
                         * Display width of the Big5 character whose first byte is *s: 2 when the
                         * high bit is set, otherwise whatever pg_ascii_dsplen() reports.
                         */
     783              : static int
     784            0 : pg_big5_dsplen(const unsigned char *s)
     785              : {
     786              :     int         len;
     787              : 
     788            0 :     if (IS_HIGHBIT_SET(*s))
     789            0 :         len = 2;                /* kanji? */
     790              :     else
     791            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     792            0 :     return len;
     793              : }
     794              : 
     795              : /*
     796              :  * GBK
                         *
                         * Byte length of the character whose first byte is *s: 2 when the high
                         * bit is set, otherwise 1.  Only the first byte is examined; nothing is
                         * validated.
     797              :  */
     798              : static int
     799          282 : pg_gbk_mblen(const unsigned char *s)
     800              : {
     801              :     int         len;
     802              : 
     803          282 :     if (IS_HIGHBIT_SET(*s))
     804          212 :         len = 2;                /* kanji? */
     805              :     else
     806           70 :         len = 1;                /* should be ASCII */
     807          282 :     return len;
     808              : }
     809              : 
                        /*
                         * Display width of the GBK character whose first byte is *s: 2 when the
                         * high bit is set, otherwise whatever pg_ascii_dsplen() reports.
                         */
     810              : static int
     811            0 : pg_gbk_dsplen(const unsigned char *s)
     812              : {
     813              :     int         len;
     814              : 
     815            0 :     if (IS_HIGHBIT_SET(*s))
     816            0 :         len = 2;                /* kanji? */
     817              :     else
     818            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     819            0 :     return len;
     820              : }
     821              : 
     822              : /*
     823              :  * UHC
                         *
                         * Byte length of the character whose first byte is *s: 2 when the high
                         * bit is set, otherwise 1.  Only the first byte is examined; nothing is
                         * validated.
     824              :  */
     825              : static int
     826           16 : pg_uhc_mblen(const unsigned char *s)
     827              : {
     828              :     int         len;
     829              : 
     830           16 :     if (IS_HIGHBIT_SET(*s))
     831           16 :         len = 2;                /* 2byte? */
     832              :     else
     833            0 :         len = 1;                /* should be ASCII */
     834           16 :     return len;
     835              : }
     836              : 
                        /*
                         * Display width of the UHC character whose first byte is *s: 2 when the
                         * high bit is set, otherwise whatever pg_ascii_dsplen() reports.
                         */
     837              : static int
     838            0 : pg_uhc_dsplen(const unsigned char *s)
     839              : {
     840              :     int         len;
     841              : 
     842            0 :     if (IS_HIGHBIT_SET(*s))
     843            0 :         len = 2;                /* 2byte? */
     844              :     else
     845            0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     846            0 :     return len;
     847              : }
     848              : 
     849              : /*
     850              :  * GB18030
     851              :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
     852              :  */
     853              : 
     854              : /*
     855              :  * Unlike all other mblen() functions, this also looks at the second byte of
     856              :  * the input.  However, if you only pass the first byte of a multi-byte
     857              :  * string, and \0 as the second byte, this still works in a predictable way:
     858              :  * a 4-byte character will be reported as two 2-byte characters.  That's
     859              :  * enough for all current uses, as a client-only encoding.  It works that
     860              :  * way, because in any valid 4-byte GB18030-encoded character, the third and
     861              :  * fourth byte look like a 2-byte encoded character, when looked at
     862              :  * separately.
                         *
                         * Result: 1 when the first byte has the high bit clear; otherwise 4 when
                         * the second byte is in 0x30..0x39, else 2.  Nothing is validated.
     863              :  */
     864              : static int
     865          623 : pg_gb18030_mblen(const unsigned char *s)
     866              : {
     867              :     int         len;
     868              : 
     869          623 :     if (!IS_HIGHBIT_SET(*s))
     870          348 :         len = 1;                /* ASCII */
                            /* the second byte is read here without any length check */
     871          275 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
     872          114 :         len = 4;
     873              :     else
     874          161 :         len = 2;
     875          623 :     return len;
     876              : }
     877              : 
                        /*
                         * Display width of the GB18030 character whose first byte is *s: 2 when
                         * the high bit is set, otherwise whatever pg_ascii_dsplen() reports.
                         * Only the first byte is examined here, so the result does not depend on
                         * whether pg_gb18030_mblen() would report 2 or 4 bytes.
                         */
     878              : static int
     879            0 : pg_gb18030_dsplen(const unsigned char *s)
     880              : {
     881              :     int         len;
     882              : 
     883            0 :     if (IS_HIGHBIT_SET(*s))
     884            0 :         len = 2;
     885              :     else
     886            0 :         len = pg_ascii_dsplen(s);   /* ASCII */
     887            0 :     return len;
     888              : }
     889              : 
     890              : /*
     891              :  *-------------------------------------------------------------------
     892              :  * multibyte sequence validators
     893              :  *
     894              :  * The verifychar functions accept "s", a pointer to the first byte of a
     895              :  * string, and "len", the remaining length of the string.  If there is a
     896              :  * validly encoded character beginning at *s, return its length in bytes;
     897              :  * else return -1.
     898              :  *
     899              :  * The verifystr functions also accept "s", a pointer to a string and "len",
     900              :  * the length of the string.  They verify the whole string, and return the
     901              :  * number of input bytes (<= len) that are valid.  In other words, if the
     902              :  * whole string is valid, verifystr returns "len", otherwise it returns the
     903              :  * byte offset of the first invalid character.  The verifystr functions must
     904              :  * test for and reject zeroes in the input.
     905              :  *
     906              :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
     907              :  * they must test for and reject zeroes in any additional bytes of a
     908              :  * multibyte character.  Note that this definition allows the function for a
     909              :  * single-byte encoding to be just "return 1".
     910              :  *-------------------------------------------------------------------
     911              :  */
     912              : static int
     913          161 : pg_ascii_verifychar(const unsigned char *s, int len)
     914              : {
                            /*
                             * Single-byte encoding: always report a one-byte character.  Neither
                             * s nor len is examined; the validator contract described above lets
                             * verifychar assume len > 0 and *s != '\0'.
                             */
     915          161 :     return 1;
     916              : }
     917              : 
                        /*
                         * Verify a string of len bytes in a single-byte encoding.  The only thing
                         * checked is the absence of zero bytes: returns len if there are none,
                         * otherwise the offset of the first zero byte.
                         */
     918              : static int
     919       211584 : pg_ascii_verifystr(const unsigned char *s, int len)
     920              : {
     921       211584 :     const unsigned char *nullpos = memchr(s, 0, len);
     922              : 
     923       211584 :     if (nullpos == NULL)
     924       211584 :         return len;
     925              :     else
     926            0 :         return nullpos - s;
     927              : }
     928              : 
                        /*
                         * True when c lies in 0xa1..0xfe, the byte range the EUC-style verifiers
                         * below require.  The argument is evaluated twice, so it must not have
                         * side effects.
                         */
     929              : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
     930              : 
                        /*
                         * Verify one EUC_JP character starting at s, with len bytes available.
                         *
                         * Returns the character's length in bytes, or -1 if the input is too
                         * short or a byte is outside the range required for its position:
                         *   - SS2 lead byte: 2 bytes, second byte in 0xa1..0xdf
                         *   - SS3 lead byte: 3 bytes, second and third bytes in 0xa1..0xfe
                         *   - other lead byte with the high bit set: 2 bytes, both in 0xa1..0xfe
                         *   - high bit clear: 1 byte, accepted without further checks
                         * The range checks also reject zero trailing bytes.
                         */
     931              : static int
     932          336 : pg_eucjp_verifychar(const unsigned char *s, int len)
     933              : {
     934              :     int         l;
     935              :     unsigned char c1,
     936              :                 c2;
     937              : 
     938          336 :     c1 = *s++;
     939              : 
     940          336 :     switch (c1)
     941              :     {
     942            0 :         case SS2:               /* JIS X 0201 */
     943            0 :             l = 2;
     944            0 :             if (l > len)
     945            0 :                 return -1;
     946            0 :             c2 = *s++;
     947            0 :             if (c2 < 0xa1 || c2 > 0xdf)
     948            0 :                 return -1;
     949            0 :             break;
     950              : 
     951            0 :         case SS3:               /* JIS X 0212 */
     952            0 :             l = 3;
     953            0 :             if (l > len)
     954            0 :                 return -1;
                                        /* c2 is reused for the second and then the third byte */
     955            0 :             c2 = *s++;
     956            0 :             if (!IS_EUC_RANGE_VALID(c2))
     957            0 :                 return -1;
     958            0 :             c2 = *s++;
     959            0 :             if (!IS_EUC_RANGE_VALID(c2))
     960            0 :                 return -1;
     961            0 :             break;
     962              : 
     963          336 :         default:
     964          336 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
     965              :             {
     966          336 :                 l = 2;
     967          336 :                 if (l > len)
     968           56 :                     return -1;
     969          280 :                 if (!IS_EUC_RANGE_VALID(c1))
     970           16 :                     return -1;
     971          264 :                 c2 = *s++;
     972          264 :                 if (!IS_EUC_RANGE_VALID(c2))
     973          120 :                     return -1;
     974              :             }
     975              :             else
     976              :                 /* must be ASCII */
     977              :             {
     978            0 :                 l = 1;
     979              :             }
     980          144 :             break;
     981              :     }
     982              : 
     983          144 :     return l;
     984              : }
     985              : 
                        /*
                         * Verify an EUC_JP string of len bytes.
                         *
                         * Returns the number of leading bytes that are valid: len if the whole
                         * string is valid, otherwise the offset of the first zero byte or of the
                         * first character rejected by pg_eucjp_verifychar().
                         */
     986              : static int
     987          196 : pg_eucjp_verifystr(const unsigned char *s, int len)
     988              : {
     989          196 :     const unsigned char *start = s;
     990              : 
     991          604 :     while (len > 0)
     992              :     {
     993              :         int         l;
     994              : 
     995              :         /* fast path for ASCII-subset characters */
     996          552 :         if (!IS_HIGHBIT_SET(*s))
     997              :         {
                                    /* a zero byte ends the valid prefix */
     998          384 :             if (*s == '\0')
     999           48 :                 break;
    1000          336 :             l = 1;
    1001              :         }
    1002              :         else
    1003              :         {
    1004          168 :             l = pg_eucjp_verifychar(s, len);
    1005          168 :             if (l == -1)
    1006           96 :                 break;
    1007              :         }
    1008          408 :         s += l;
    1009          408 :         len -= l;
    1010              :     }
    1011              : 
    1012          196 :     return s - start;
    1013              : }
    1014              : 
                        /*
                         * Verify one EUC_KR character starting at s, with len bytes available.
                         *
                         * A lead byte with the high bit set starts a 2-byte character; both bytes
                         * must be in 0xa1..0xfe and len must be at least 2, else -1 is returned.
                         * A lead byte with the high bit clear is accepted as a 1-byte character.
                         * Returns the character's length in bytes on success.
                         */
    1015              : static int
    1016           96 : pg_euckr_verifychar(const unsigned char *s, int len)
    1017              : {
    1018              :     int         l;
    1019              :     unsigned char c1,
    1020              :                 c2;
    1021              : 
    1022           96 :     c1 = *s++;
    1023              : 
    1024           96 :     if (IS_HIGHBIT_SET(c1))
    1025              :     {
    1026           96 :         l = 2;
    1027           96 :         if (l > len)
    1028            8 :             return -1;
    1029           88 :         if (!IS_EUC_RANGE_VALID(c1))
    1030           16 :             return -1;
    1031           72 :         c2 = *s++;
    1032           72 :         if (!IS_EUC_RANGE_VALID(c2))
    1033            0 :             return -1;
    1034              :     }
    1035              :     else
    1036              :         /* must be ASCII */
    1037              :     {
    1038            0 :         l = 1;
    1039              :     }
    1040              : 
    1041           72 :     return l;
    1042              : }
    1043              : 
                        /*
                         * Verify an EUC_KR string of len bytes.
                         *
                         * Returns the number of leading bytes that are valid: len if the whole
                         * string is valid, otherwise the offset of the first zero byte or of the
                         * first character rejected by pg_euckr_verifychar().
                         */
    1044              : static int
    1045           40 : pg_euckr_verifystr(const unsigned char *s, int len)
    1046              : {
    1047           40 :     const unsigned char *start = s;
    1048              : 
    1049          124 :     while (len > 0)
    1050              :     {
    1051              :         int         l;
    1052              : 
    1053              :         /* fast path for ASCII-subset characters */
    1054          108 :         if (!IS_HIGHBIT_SET(*s))
    1055              :         {
                                    /* a zero byte ends the valid prefix */
    1056           48 :             if (*s == '\0')
    1057            0 :                 break;
    1058           48 :             l = 1;
    1059              :         }
    1060              :         else
    1061              :         {
    1062           60 :             l = pg_euckr_verifychar(s, len);
    1063           60 :             if (l == -1)
    1064           24 :                 break;
    1065              :         }
    1066           84 :         s += l;
    1067           84 :         len -= l;
    1068              :     }
    1069              : 
    1070           40 :     return s - start;
    1071              : }
    1072              : 
    1073              : /* EUC-CN byte sequences are exactly the same as EUC-KR, so reuse its verifiers */
    1074              : #define pg_euccn_verifychar pg_euckr_verifychar
    1075              : #define pg_euccn_verifystr  pg_euckr_verifystr
    1076              : 
                        /*
                         * Verify one EUC_TW character starting at s, with len bytes available.
                         *
                         * Returns the character's length in bytes, or -1 if the input is too
                         * short or a byte is outside the range required for its position:
                         *   - SS2 lead byte: 4 bytes; second byte in 0xa1..0xa7, third and fourth
                         *     bytes in 0xa1..0xfe
                         *   - SS3 lead byte: always rejected
                         *   - other lead byte with the high bit set: 2 bytes; only the second
                         *     byte is range-checked (0xa1..0xfe)
                         *   - high bit clear: 1 byte, accepted without further checks
                         */
    1077              : static int
    1078           12 : pg_euctw_verifychar(const unsigned char *s, int len)
    1079              : {
    1080              :     int         l;
    1081              :     unsigned char c1,
    1082              :                 c2;
    1083              : 
    1084           12 :     c1 = *s++;
    1085              : 
    1086           12 :     switch (c1)
    1087              :     {
    1088            0 :         case SS2:               /* CNS 11643 Plane 1-7 */
    1089            0 :             l = 4;
    1090            0 :             if (l > len)
    1091            0 :                 return -1;
                                        /* c2 is reused for each of the three trailing bytes in turn */
    1092            0 :             c2 = *s++;
    1093            0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1094            0 :                 return -1;
    1095            0 :             c2 = *s++;
    1096            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1097            0 :                 return -1;
    1098            0 :             c2 = *s++;
    1099            0 :             if (!IS_EUC_RANGE_VALID(c2))
    1100            0 :                 return -1;
    1101            0 :             break;
    1102              : 
    1103            0 :         case SS3:               /* unused */
    1104            0 :             return -1;
    1105              : 
    1106           12 :         default:
    1107           12 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1108              :             {
    1109           12 :                 l = 2;
    1110           12 :                 if (l > len)
    1111            4 :                     return -1;
    1112              :                 /* no further range check on c1? */
    1113            8 :                 c2 = *s++;
    1114            8 :                 if (!IS_EUC_RANGE_VALID(c2))
    1115            8 :                     return -1;
    1116              :             }
    1117              :             else
    1118              :                 /* must be ASCII */
    1119              :             {
    1120            0 :                 l = 1;
    1121              :             }
    1122            0 :             break;
    1123              :     }
    1124            0 :     return l;
    1125              : }
    1126              : 
                        /*
                         * Verify an EUC_TW string of len bytes.
                         *
                         * Returns the number of leading bytes that are valid: len if the whole
                         * string is valid, otherwise the offset of the first zero byte or of the
                         * first character rejected by pg_euctw_verifychar().
                         */
    1127              : static int
    1128           20 : pg_euctw_verifystr(const unsigned char *s, int len)
    1129              : {
    1130           20 :     const unsigned char *start = s;
    1131              : 
    1132           44 :     while (len > 0)
    1133              :     {
    1134              :         int         l;
    1135              : 
    1136              :         /* fast path for ASCII-subset characters */
    1137           36 :         if (!IS_HIGHBIT_SET(*s))
    1138              :         {
                                    /* a zero byte ends the valid prefix */
    1139           24 :             if (*s == '\0')
    1140            0 :                 break;
    1141           24 :             l = 1;
    1142              :         }
    1143              :         else
    1144              :         {
    1145           12 :             l = pg_euctw_verifychar(s, len);
    1146           12 :             if (l == -1)
    1147           12 :                 break;
    1148              :         }
    1149           24 :         s += l;
    1150           24 :         len -= l;
    1151              :     }
    1152              : 
    1153           20 :     return s - start;
    1154              : }
    1155              : 
                        /*
                         * Verify one JOHAB character starting at s, with len bytes available.
                         *
                         * The expected length comes from pg_johab_mblen() (defined outside this
                         * excerpt).  Returns -1 if fewer than that many bytes are available.  A
                         * lead byte with the high bit clear is accepted as is; otherwise every
                         * trailing byte must be in 0xa1..0xfe.  Returns the length reported by
                         * pg_johab_mblen() on success, -1 on failure.
                         */
    1156              : static int
    1157           12 : pg_johab_verifychar(const unsigned char *s, int len)
    1158              : {
    1159              :     int         l,
    1160              :                 mbl;
    1161              :     unsigned char c;
    1162              : 
    1163           12 :     l = mbl = pg_johab_mblen(s);
    1164              : 
    1165           12 :     if (len < l)
    1166            4 :         return -1;
    1167              : 
    1168            8 :     if (!IS_HIGHBIT_SET(*s))
    1169            0 :         return mbl;
    1170              : 
                            /* l counts down over the trailing bytes; mbl keeps the full length */
    1171            8 :     while (--l > 0)
    1172              :     {
    1173            8 :         c = *++s;
    1174            8 :         if (!IS_EUC_RANGE_VALID(c))
    1175            8 :             return -1;
    1176              :     }
    1177            0 :     return mbl;
    1178              : }
    1179              : 
                        /*
                         * Verify a JOHAB string of len bytes.
                         *
                         * Returns the number of leading bytes that are valid: len if the whole
                         * string is valid, otherwise the offset of the first zero byte or of the
                         * first character rejected by pg_johab_verifychar().
                         */
    1180              : static int
    1181           16 : pg_johab_verifystr(const unsigned char *s, int len)
    1182              : {
    1183           16 :     const unsigned char *start = s;
    1184              : 
    1185           28 :     while (len > 0)
    1186              :     {
    1187              :         int         l;
    1188              : 
    1189              :         /* fast path for ASCII-subset characters */
    1190           24 :         if (!IS_HIGHBIT_SET(*s))
    1191              :         {
                                    /* a zero byte ends the valid prefix */
    1192           12 :             if (*s == '\0')
    1193            0 :                 break;
    1194           12 :             l = 1;
    1195              :         }
    1196              :         else
    1197              :         {
    1198           12 :             l = pg_johab_verifychar(s, len);
    1199           12 :             if (l == -1)
    1200           12 :                 break;
    1201              :         }
    1202           12 :         s += l;
    1203           12 :         len -= l;
    1204              :     }
    1205              : 
    1206           16 :     return s - start;
    1207              : }
    1208              : 
                        /*
                         * Verify one character of a single-byte encoding: always reports a
                         * one-byte character.  Neither s nor len is examined.
                         */
    1209              : static int
    1210         3223 : pg_latin1_verifychar(const unsigned char *s, int len)
    1211              : {
    1212         3223 :     return 1;
    1213              : }
    1214              : 
                        /*
                         * Verify a string of len bytes in a single-byte encoding.  The only thing
                         * checked is the absence of zero bytes: returns len if there are none,
                         * otherwise the offset of the first zero byte.
                         */
    1215              : static int
    1216         5211 : pg_latin1_verifystr(const unsigned char *s, int len)
    1217              : {
    1218         5211 :     const unsigned char *nullpos = memchr(s, 0, len);
    1219              : 
    1220         5211 :     if (nullpos == NULL)
    1221         5139 :         return len;
    1222              :     else
    1223           72 :         return nullpos - s;
    1224              : }
    1225              : 
                        /*
                         * Verify one SJIS character starting at s, with len bytes available.
                         *
                         * The expected length comes from pg_sjis_mblen().  Returns -1 if fewer
                         * than that many bytes are available.  One-byte characters are accepted
                         * without further checks.  For two-byte characters the first byte must
                         * satisfy ISSJISHEAD() and the second ISSJISTAIL() (both macros are
                         * defined outside this excerpt); otherwise -1 is returned.  Returns the
                         * character's length in bytes on success.
                         */
    1226              : static int
    1227          624 : pg_sjis_verifychar(const unsigned char *s, int len)
    1228              : {
    1229              :     int         l,
    1230              :                 mbl;
    1231              :     unsigned char c1,
    1232              :                 c2;
    1233              : 
    1234          624 :     l = mbl = pg_sjis_mblen(s);
    1235              : 
    1236          624 :     if (len < l)
    1237           86 :         return -1;
    1238              : 
    1239          538 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1240            0 :         return mbl;
    1241              : 
    1242          538 :     c1 = *s++;
    1243          538 :     c2 = *s;
    1244          538 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1245          214 :         return -1;
    1246          324 :     return mbl;
    1247              : }
    1248              : 
                        /*
                         * Verify an SJIS string of len bytes.
                         *
                         * Returns the number of leading bytes that are valid: len if the whole
                         * string is valid, otherwise the offset of the first zero byte or of the
                         * first character rejected by pg_sjis_verifychar().
                         */
    1249              : static int
    1250          322 : pg_sjis_verifystr(const unsigned char *s, int len)
    1251              : {
    1252          322 :     const unsigned char *start = s;
    1253              : 
    1254         1233 :     while (len > 0)
    1255              :     {
    1256              :         int         l;
    1257              : 
    1258              :         /* fast path for ASCII-subset characters */
    1259         1107 :         if (!IS_HIGHBIT_SET(*s))
    1260              :         {
                                    /* a zero byte ends the valid prefix */
    1261          815 :             if (*s == '\0')
    1262           48 :                 break;
    1263          767 :             l = 1;
    1264              :         }
    1265              :         else
    1266              :         {
    1267          292 :             l = pg_sjis_verifychar(s, len);
    1268          292 :             if (l == -1)
    1269          148 :                 break;
    1270              :         }
    1271          911 :         s += l;
    1272          911 :         len -= l;
    1273              :     }
    1274              : 
    1275          322 :     return s - start;
    1276              : }
    1277              : 
                        /*
                         * Verify one Big5 character starting at s, with len bytes available.
                         *
                         * The expected length comes from pg_big5_mblen().  Returns -1 if fewer
                         * than that many bytes are available, if the character is exactly the
                         * byte pair NONUTF8_INVALID_BYTE0, NONUTF8_INVALID_BYTE1, or if any
                         * trailing byte is zero.  No other range check is made.  Returns the
                         * character's length in bytes on success.
                         */
    1278              : static int
    1279          168 : pg_big5_verifychar(const unsigned char *s, int len)
    1280              : {
    1281              :     int         l,
    1282              :                 mbl;
    1283              : 
    1284          168 :     l = mbl = pg_big5_mblen(s);
    1285              : 
    1286          168 :     if (len < l)
    1287            4 :         return -1;
    1288              : 
                            /* reject the one byte pair reserved as always-invalid */
    1289          164 :     if (l == 2 &&
    1290          164 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1291            8 :         s[1] == NONUTF8_INVALID_BYTE1)
    1292            8 :         return -1;
    1293              : 
                            /* l counts down over the trailing bytes; mbl keeps the full length */
    1294          264 :     while (--l > 0)
    1295              :     {
    1296          156 :         if (*++s == '\0')
    1297           48 :             return -1;
    1298              :     }
    1299              : 
    1300          108 :     return mbl;
    1301              : }
    1302              : 
                        /*
                         * Verify a Big5 string of len bytes.
                         *
                         * Returns the number of leading bytes that are valid: len if the whole
                         * string is valid, otherwise the offset of the first zero byte or of the
                         * first character rejected by pg_big5_verifychar().
                         */
    1303              : static int
    1304          104 : pg_big5_verifystr(const unsigned char *s, int len)
    1305              : {
    1306          104 :     const unsigned char *start = s;
    1307              : 
    1308          428 :     while (len > 0)
    1309              :     {
    1310              :         int         l;
    1311              : 
    1312              :         /* fast path for ASCII-subset characters */
    1313          384 :         if (!IS_HIGHBIT_SET(*s))
    1314              :         {
                                    /* a zero byte ends the valid prefix */
    1315          300 :             if (*s == '\0')
    1316           24 :                 break;
    1317          276 :             l = 1;
    1318              :         }
    1319              :         else
    1320              :         {
    1321           84 :             l = pg_big5_verifychar(s, len);
    1322           84 :             if (l == -1)
    1323           36 :                 break;
    1324              :         }
    1325          324 :         s += l;
    1326          324 :         len -= l;
    1327              :     }
    1328              : 
    1329          104 :     return s - start;
    1330              : }
    1331              : 
                        /*
                         * Verify one GBK character starting at s, with len bytes available.
                         *
                         * The expected length comes from pg_gbk_mblen().  Returns -1 if fewer
                         * than that many bytes are available, if the character is exactly the
                         * byte pair NONUTF8_INVALID_BYTE0, NONUTF8_INVALID_BYTE1, or if any
                         * trailing byte is zero.  No other range check is made.  Returns the
                         * character's length in bytes on success.
                         */
    1332              : static int
    1333          140 : pg_gbk_verifychar(const unsigned char *s, int len)
    1334              : {
    1335              :     int         l,
    1336              :                 mbl;
    1337              : 
    1338          140 :     l = mbl = pg_gbk_mblen(s);
    1339              : 
    1340          140 :     if (len < l)
    1341           28 :         return -1;
    1342              : 
                            /* reject the one byte pair reserved as always-invalid */
    1343          112 :     if (l == 2 &&
    1344          112 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1345           16 :         s[1] == NONUTF8_INVALID_BYTE1)
    1346           16 :         return -1;
    1347              : 
                            /* l counts down over the trailing bytes; mbl keeps the full length */
    1348          192 :     while (--l > 0)
    1349              :     {
    1350           96 :         if (*++s == '\0')
    1351            0 :             return -1;
    1352              :     }
    1353              : 
    1354           96 :     return mbl;
    1355              : }
    1356              : 
                        /*
                         * Verify a GBK string of len bytes.
                         *
                         * Returns the number of leading bytes that are valid: len if the whole
                         * string is valid, otherwise the offset of the first zero byte or of the
                         * first character rejected by pg_gbk_verifychar().
                         */
    1357              : static int
    1358          132 : pg_gbk_verifystr(const unsigned char *s, int len)
    1359              : {
    1360          132 :     const unsigned char *start = s;
    1361              : 
    1362          336 :     while (len > 0)
    1363              :     {
    1364              :         int         l;
    1365              : 
    1366              :         /* fast path for ASCII-subset characters */
    1367          248 :         if (!IS_HIGHBIT_SET(*s))
    1368              :         {
                                    /* a zero byte ends the valid prefix */
    1369          124 :             if (*s == '\0')
    1370            0 :                 break;
    1371          124 :             l = 1;
    1372              :         }
    1373              :         else
    1374              :         {
    1375          124 :             l = pg_gbk_verifychar(s, len);
    1376          124 :             if (l == -1)
    1377           44 :                 break;
    1378              :         }
    1379          204 :         s += l;
    1380          204 :         len -= l;
    1381              :     }
    1382              : 
    1383          132 :     return s - start;
    1384              : }
    1385              : 
                        /*
                         * Verify one UHC character starting at s, with len bytes available.
                         *
                         * The expected length comes from pg_uhc_mblen().  Returns -1 if fewer
                         * than that many bytes are available, if the character is exactly the
                         * byte pair NONUTF8_INVALID_BYTE0, NONUTF8_INVALID_BYTE1, or if any
                         * trailing byte is zero.  No other range check is made.  Returns the
                         * character's length in bytes on success.
                         */
    1386              : static int
    1387           12 : pg_uhc_verifychar(const unsigned char *s, int len)
    1388              : {
    1389              :     int         l,
    1390              :                 mbl;
    1391              : 
    1392           12 :     l = mbl = pg_uhc_mblen(s);
    1393              : 
    1394           12 :     if (len < l)
    1395            4 :         return -1;
    1396              : 
                            /* reject the one byte pair reserved as always-invalid */
    1397            8 :     if (l == 2 &&
    1398            8 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1399            8 :         s[1] == NONUTF8_INVALID_BYTE1)
    1400            8 :         return -1;
    1401              : 
                            /* l counts down over the trailing bytes; mbl keeps the full length */
    1402            0 :     while (--l > 0)
    1403              :     {
    1404            0 :         if (*++s == '\0')
    1405            0 :             return -1;
    1406              :     }
    1407              : 
    1408            0 :     return mbl;
    1409              : }
    1410              : /*
                      :  * Verify a UHC byte string of at most len bytes.
                      :  *
                      :  * Bytes without the high bit set are accepted one at a time, except that a
                      :  * NUL byte ends the scan.  Every other byte is passed to
                      :  * pg_uhc_verifychar() together with the remaining length; a result of -1
                      :  * ends the scan, otherwise the returned length is skipped.
                      :  *
                      :  * Returns the number of bytes consumed before the scan ended.
                      :  */
    1411              : static int
    1412           16 : pg_uhc_verifystr(const unsigned char *s, int len)
    1413              : {
    1414           16 :     const unsigned char *start = s;
    1415              : 
    1416           28 :     while (len > 0)
    1417              :     {
    1418              :         int         l;
    1419              : 
    1420              :         /* fast path for ASCII-subset characters */
    1421           24 :         if (!IS_HIGHBIT_SET(*s))
    1422              :         {
    1423           12 :             if (*s == '\0')
    1424            0 :                 break;
    1425           12 :             l = 1;
    1426              :         }
    1427              :         else
    1428              :         {
    1429           12 :             l = pg_uhc_verifychar(s, len);
    1430           12 :             if (l == -1)
    1431           12 :                 break;
    1432              :         }
    1433           12 :         s += l;
    1434           12 :         len -= l;
    1435              :     }
    1436              :     /* s now points just past the last accepted character */
    1437           16 :     return s - start;
    1438              : }
    1439              : /*
                      :  * Verify a single GB18030 character starting at s, with len bytes
                      :  * available.  Returns its length (1, 2 or 4) or -1 if it is not accepted.
                      :  *
                      :  * - A byte without the high bit set is length 1.  Note that this branch
                      :  *   does not reject NUL.
                      :  * - If at least 4 bytes are available and the second byte is 0x30..0x39,
                      :  *   the character must be a four-byte one: bytes 1 and 3 in 0x81..0xfe and
                      :  *   byte 4 in 0x30..0x39.
                      :  * - Otherwise, if at least 2 bytes are available and the first byte is in
                      :  *   0x81..0xfe, the character must be a two-byte one: second byte in
                      :  *   0x40..0x7e or 0x80..0xfe.
                      :  * - Anything else is rejected.
                      :  */
    1440              : static int
    1441          698 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1442              : {
    1443              :     int         l;
    1444              : 
    1445          698 :     if (!IS_HIGHBIT_SET(*s))
    1446            0 :         l = 1;                  /* ASCII */
    1447          698 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1448              :     {
    1449              :         /* Should be 4-byte, validate remaining bytes */
    1450          210 :         if (*s >= 0x81 && *s <= 0xfe &&
    1451          204 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1452          204 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1453          108 :             l = 4;
    1454              :         else
    1455          102 :             l = -1;
    1456              :     }
    1457          488 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1458              :     {
    1459              :         /* Should be 2-byte, validate */
    1460          358 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1461          238 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1462          176 :             l = 2;
    1463              :         else
    1464          182 :             l = -1;
    1465              :     }
    1466              :     else
    1467          130 :         l = -1;
    1468          698 :     return l;
    1469              : }
    1470              : /*
                      :  * Verify a GB18030 byte string of at most len bytes.
                      :  *
                      :  * Bytes without the high bit set are accepted one at a time, except that a
                      :  * NUL byte ends the scan.  Every other byte is passed to
                      :  * pg_gb18030_verifychar() together with the remaining length; a result of
                      :  * -1 ends the scan, otherwise the returned length is skipped.
                      :  *
                      :  * Returns the number of bytes consumed before the scan ended.
                      :  */
    1471              : static int
    1472          500 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1473              : {
    1474          500 :     const unsigned char *start = s;
    1475              : 
    1476         1679 :     while (len > 0)
    1477              :     {
    1478              :         int         l;
    1479              : 
    1480              :         /* fast path for ASCII-subset characters */
    1481         1515 :         if (!IS_HIGHBIT_SET(*s))
    1482              :         {
    1483         1037 :             if (*s == '\0')
    1484           30 :                 break;
    1485         1007 :             l = 1;
    1486              :         }
    1487              :         else
    1488              :         {
    1489          478 :             l = pg_gb18030_verifychar(s, len);
    1490          478 :             if (l == -1)
    1491          306 :                 break;
    1492              :         }
    1493         1179 :         s += l;
    1494         1179 :         len -= l;
    1495              :     }
    1496              :     /* s now points just past the last accepted character */
    1497          500 :     return s - start;
    1498              : }
    1499              : /*
                      :  * Verify a single UTF-8 character starting at s, with len bytes available.
                      :  *
                      :  * A byte without the high bit set is length 1, unless it is NUL, which is
                      :  * rejected.  Otherwise the length is read from the lead byte's bit
                      :  * pattern: 110xxxxx is 2, 1110xxxx is 3, 11110xxx is 4.  Any other
                      :  * high-bit byte is given length 1 and left for pg_utf8_islegal() to judge.
                      :  *
                      :  * Returns the character's length in bytes, or -1 if the character does not
                      :  * fit in len bytes or pg_utf8_islegal() rejects it.
                      :  */
    1500              : static int
    1501         9451 : pg_utf8_verifychar(const unsigned char *s, int len)
    1502              : {
    1503              :     int         l;
    1504              : 
    1505         9451 :     if ((*s & 0x80) == 0)
    1506              :     {
    1507            0 :         if (*s == '\0')
    1508            0 :             return -1;
    1509            0 :         return 1;
    1510              :     }
    1511         9451 :     else if ((*s & 0xe0) == 0xc0)
    1512         3331 :         l = 2;
    1513         6120 :     else if ((*s & 0xf0) == 0xe0)
    1514         3412 :         l = 3;
    1515         2708 :     else if ((*s & 0xf8) == 0xf0)
    1516         2532 :         l = 4;
    1517              :     else
    1518          176 :         l = 1;
    1519              :     /* the whole character must fit in the remaining input */
    1520         9451 :     if (l > len)
    1521          320 :         return -1;
    1522              : 
    1523         9131 :     if (!pg_utf8_islegal(s, l))
    1524         1486 :         return -1;
    1525              : 
    1526         7645 :     return l;
    1527              : }
    1528              : 
    1529              : /*
    1530              :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1531              :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1532              :  * input byte and current state are used to compute an index into an array of
    1533              :  * state transitions. Since the address of the next transition is dependent
    1534              :  * on this computation, there is latency in executing the load instruction,
    1535              :  * and the CPU is not kept busy.
    1536              :  *
    1537              :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1538              :  *
    1539              :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1540              :  *
    1541              :  * In a shift-based DFA, the input byte is an index into an array of integers
    1542              :  * whose bit pattern encodes the state transitions. To compute the next
    1543              :  * state, we simply right-shift the integer by the current state and apply a
    1544              :  * mask. In this scheme, the address of the transition only depends on the
    1545              :  * input byte, so there is better pipelining.
    1546              :  *
    1547              :  * The naming convention for states and transitions was adopted from a UTF-8
    1548              :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1549              :  *
    1550              :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1551              :  *
    1552              :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1553              :  * ==========================================================================
    1554              :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1555              :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1556              :  *                                                                  |
    1557              :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1558              :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1559              :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1560              :  *                                                                  |
    1561              :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1562              :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1563              :  *                                                                  |
    1564              :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1565              :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1566              :  *
    1567              :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1568              :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1569              :  * it's possible to find state numbers such that the transitions fit within
    1570              :  * 32-bit integers, as Dougall Johnson demonstrated:
    1571              :  *
    1572              :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1573              :  *
    1574              :  * This packed representation is the reason for the seemingly odd choice of
    1575              :  * state values below.
    1576              :  */
    1577              : 
    1578              : /* Error: zero, so an entry with no bits set for the current state leads here */
    1579              : #define ERR  0
    1580              : /* Begin */
    1581              : #define BGN 11
    1582              : /* Continuation states, expect 1/2/3 continuation bytes */
    1583              : #define CS1 16
    1584              : #define CS2  1
    1585              : #define CS3  5
    1586              : /* Partial states, where the first continuation byte has a restricted range */
    1587              : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
    1588              : #define P3B 20                  /* Lead was ED, check for surrogate */
    1589              : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
    1590              : #define P4B 30                  /* Lead was F4, check for too-large */
    1591              : /* Begin and End are the same state */
    1592              : #define END BGN
    1593              : 
    1594              : /* the encoded state transitions for the lookup table */
    1595              : /*
                      :  * A state number doubles as the right-shift count that utf8_advance()
                      :  * applies to a table entry, and the low five bits of the shifted value are
                      :  * the next state.  A term (T << S) therefore encodes "in state S, go to
                      :  * state T".  The ASCII and lead-byte macros carry a single "<< BGN" term,
                      :  * so those bytes lead anywhere other than ERR only from the BGN state.
                      :  */
    1596              : /* ASCII */
    1597              : #define ASC (END << BGN)
    1598              : /* 2-byte lead */
    1599              : #define L2A (CS1 << BGN)
    1600              : /* 3-byte lead */
    1601              : #define L3A (P3A << BGN)
    1602              : #define L3B (CS2 << BGN)
    1603              : #define L3C (P3B << BGN)
    1604              : /* 4-byte lead */
    1605              : #define L4A (P4A << BGN)
    1606              : #define L4B (CS3 << BGN)
    1607              : #define L4C (P4B << BGN)
    1608              : /*
                      :  * continuation byte
                      :  *
                      :  * NOTE(review): unlike the macros above, these expansions are not wrapped
                      :  * in parentheses.  That is harmless where they appear as whole
                      :  * initializers in Utf8Transition, but they should not be used inside a
                      :  * larger expression.
                      :  */
    1609              : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1610              : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1611              : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1612              : /* invalid byte */
    1613              : #define ILL ERR
    1614              : /*
                      :  * Transition table for the UTF-8 DFA, indexed by input byte value.
                      :  *
                      :  *   00        ILL (a NUL byte is an error)
                      :  *   01..7F    ASC
                      :  *   80..8F    CR1
                      :  *   90..9F    CR2
                      :  *   A0..BF    CR3
                      :  *   C0..C1    ILL
                      :  *   C2..DF    L2A
                      :  *   E0        L3A
                      :  *   E1..EC    L3B
                      :  *   ED        L3C
                      :  *   EE..EF    L3B
                      :  *   F0        L4A
                      :  *   F1..F3    L4B
                      :  *   F4        L4C
                      :  *   F5..FF    ILL
                      :  */
    1615              : static const uint32 Utf8Transition[256] =
    1616              : {
    1617              :     /* ASCII */
    1618              :     /* the first entry (byte 00) is ILL rather than ASC */
    1619              :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1620              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1621              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1622              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1623              : 
    1624              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1625              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1626              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1627              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1628              : 
    1629              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1630              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1631              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1632              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1633              : 
    1634              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1635              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1636              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1637              :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1638              : 
    1639              :     /* continuation bytes */
    1640              : 
    1641              :     /* 80..8F */
    1642              :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1643              :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1644              : 
    1645              :     /* 90..9F */
    1646              :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1647              :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1648              : 
    1649              :     /* A0..BF */
    1650              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1651              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1652              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1653              :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1654              : 
    1655              :     /* leading bytes */
    1656              : 
    1657              :     /* C0..DF */
    1658              :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1659              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1660              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1661              :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1662              : 
    1663              :     /* E0..EF */
    1664              :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1665              :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1666              : 
    1667              :     /* F0..FF */
    1668              :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1669              :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1670              : };
    1671              : /*
                      :  * Feed len bytes starting at s through the UTF-8 DFA.
                      :  *
                      :  * *state is the state on entry and receives the state after the last
                      :  * byte, reduced to its low five bits.  The state is not inspected inside
                      :  * the loop; it is up to the caller to test the result afterwards.
                      :  */
    1672              : static void
    1673         1147 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1674              : {
    1675              :     /* Note: We deliberately don't check the state's value here. */
    1676        37851 :     while (len > 0)
    1677              :     {
    1678              :         /*
    1679              :          * It's important that the mask value is 31: In most instruction sets,
    1680              :          * a shift by a 32-bit operand is understood to be a shift by its value
    1681              :          * modulo 32, so the compiler should elide the mask operation.
    1682              :          */
    1683        36704 :         *state = Utf8Transition[*s++] >> (*state & 31);
    1684        36704 :         len--;
    1685              :     }
    1686              :     /* drop the bits above the five-bit state field left over from the shift */
    1687         1147 :     *state &= 31;
    1688         1147 : }
    1689              : /*
                      :  * Verify a UTF-8 byte string of at most len bytes and return the number
                      :  * of bytes consumed before the scan ended.
                      :  *
                      :  * Inputs of at least STRIDE_LENGTH bytes are first processed in
                      :  * STRIDE_LENGTH-sized chunks: a chunk is skipped when the DFA is in END
                      :  * and is_valid_ascii() accepts it, and is otherwise run through
                      :  * utf8_advance().  Whatever is left, which is everything if the chunked
                      :  * pass ended in ERR, is then checked one character at a time with
                      :  * pg_utf8_verifychar(), stopping at a NUL byte or a rejected character.
                      :  */
    1690              : static int
    1691       709567 : pg_utf8_verifystr(const unsigned char *s, int len)
    1692              : {
    1693       709567 :     const unsigned char *start = s;
    1694       709567 :     const int   orig_len = len;
    1695       709567 :     uint32      state = BGN;
    1696              : 
    1697              : /*
    1698              :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1699              :  * the compiler can unroll a longer loop, it's not worth it because we
    1700              :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1701              :  */
    1702              : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1703              :     /*
                      :      * NOTE(review): STRIDE_LENGTH is built from sizeof, so the comparisons
                      :      * with len below are done in an unsigned type; this assumes len is
                      :      * never negative -- confirm against callers.
                      :      */
    1704       709567 :     if (len >= STRIDE_LENGTH)
    1705              :     {
    1706      2654373 :         while (len >= STRIDE_LENGTH)
    1707              :         {
    1708              :             /*
    1709              :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1710              :              * but we must first check for a non-END state, which means the
    1711              :              * previous chunk ended in the middle of a multibyte sequence.
    1712              :              */
    1713      2292264 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1714         1147 :                 utf8_advance(s, &state, STRIDE_LENGTH);
    1715              : 
    1716      2292264 :             s += STRIDE_LENGTH;
    1717      2292264 :             len -= STRIDE_LENGTH;
    1718              :         }
    1719              : 
    1720              :         /* The error state persists, so we only need to check for it here. */
    1721       362109 :         if (state == ERR)
    1722              :         {
    1723              :             /*
    1724              :              * Start over from the beginning with the slow path so we can
    1725              :              * count the valid bytes.
    1726              :              */
    1727          336 :             len = orig_len;
    1728          336 :             s = start;
    1729              :         }
    1730       361773 :         else if (state != END)
    1731              :         {
    1732              :             /*
    1733              :              * The fast path exited in the middle of a multibyte sequence.
    1734              :              * Walk backwards to find the leading byte so that the slow path
    1735              :              * can resume checking from there. We must always backtrack at
    1736              :              * least one byte, since the current byte could be e.g. an ASCII
    1737              :              * byte after a 2-byte lead, which is invalid.
    1738              :              */
    1739              :             do
    1740              :             {
    1741              :                 Assert(s > start);
    1742           73 :                 s--;
    1743           73 :                 len++;
    1744              :                 Assert(IS_HIGHBIT_SET(*s));
    1745           73 :             } while (pg_utf_mblen(s) <= 1);
    1746              :         }
    1747              :     }
    1748              : 
    1749              :     /* check remaining bytes */
    1750     10616388 :     while (len > 0)
    1751              :     {
    1752              :         int         l;
    1753              : 
    1754              :         /* fast path for ASCII-subset characters */
    1755      9908727 :         if (!IS_HIGHBIT_SET(*s))
    1756              :         {
    1757      9899312 :             if (*s == '\0')
    1758          132 :                 break;
    1759      9899180 :             l = 1;
    1760              :         }
    1761              :         else
    1762              :         {
    1763         9415 :             l = pg_utf8_verifychar(s, len);
    1764         9415 :             if (l == -1)
    1765         1774 :                 break;
    1766              :         }
    1767      9906821 :         s += l;
    1768      9906821 :         len -= l;
    1769              :     }
    1770              :     /* s now points just past the last accepted character */
    1771       709567 :     return s - start;
    1772              : }
    1773              : 
    1774              : /*
    1775              :  * Check for validity of a single UTF-8 encoded character
    1776              :  *
    1777              :  * This directly implements the rules in RFC3629.  The bizarre-looking
    1778              :  * restrictions on the second byte are meant to ensure that there isn't
    1779              :  * more than one encoding of a given Unicode character point; that is,
    1780              :  * you may not use a longer-than-necessary byte sequence with high order
    1781              :  * zero bits to represent a character that would fit in fewer bytes.
    1782              :  * To do otherwise is to create security hazards (eg, create an apparent
    1783              :  * non-ASCII character that decodes to plain ASCII).
    1784              :  *
    1785              :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    1786              :  * caller must have checked that that many bytes are present in the buffer.
                      :  *
                      :  * Returns true if the length bytes at source pass every check, false
                      :  * otherwise.  The switch is entered at the case matching length and falls
                      :  * through, so the bytes are checked from last to first: fourth and third
                      :  * bytes must be 0x80..0xBF, the second byte's range depends on the lead
                      :  * byte, and finally the lead byte itself must not be in 0x80..0xC1 or
                      :  * above 0xF4.  Any length outside 1..4 is rejected.
    1787              :  */
    1788              : bool
    1789        16166 : pg_utf8_islegal(const unsigned char *source, int length)
    1790              : {
    1791              :     unsigned char a;
    1792              : 
    1793        16166 :     switch (length)
    1794              :     {
    1795            0 :         default:
    1796              :             /* reject lengths 5 and 6 for now */
    1797            0 :             return false;
    1798         2396 :         case 4:
    1799         2396 :             a = source[3];
    1800         2396 :             if (a < 0x80 || a > 0xBF)
    1801          198 :                 return false;
    1802              :             pg_fallthrough;
    1803              :         case 3:
    1804         6619 :             a = source[2];
    1805         6619 :             if (a < 0x80 || a > 0xBF)
    1806          440 :                 return false;
    1807              :             pg_fallthrough;
    1808              :         case 2:
    1809         9822 :             a = source[1];
    1810         9822 :             switch (*source)
    1811              :             {
    1812          208 :                 case 0xE0:      /* second byte limited to A0..BF */
    1813          208 :                     if (a < 0xA0 || a > 0xBF)
    1814          176 :                         return false;
    1815           32 :                     break;
    1816          208 :                 case 0xED:      /* second byte limited to 80..9F */
    1817          208 :                     if (a < 0x80 || a > 0x9F)
    1818          176 :                         return false;
    1819           32 :                     break;
    1820         2078 :                 case 0xF0:      /* second byte limited to 90..BF */
    1821         2078 :                     if (a < 0x90 || a > 0xBF)
    1822          176 :                         return false;
    1823         1902 :                     break;
    1824          120 :                 case 0xF4:      /* second byte limited to 80..8F */
    1825          120 :                     if (a < 0x80 || a > 0x8F)
    1826           88 :                         return false;
    1827           32 :                     break;
    1828         7208 :                 default:        /* any continuation byte, 80..BF */
    1829         7208 :                     if (a < 0x80 || a > 0xBF)
    1830          168 :                         return false;
    1831         7040 :                     break;
    1832              :             }
    1833              :             pg_fallthrough;
    1834              :         case 1:
    1835        14744 :             a = *source;
    1836        14744 :             if (a >= 0x80 && a < 0xC2)
    1837          264 :                 return false;
    1838        14480 :             if (a > 0xF4)
    1839           88 :                 return false;
    1840        14392 :             break;
    1841              :     }
    1842        14392 :     return true;
    1843              : }
    1844              : 
    1845              : 
    1846              : /*
    1847              :  * Fills the provided buffer with two bytes such that:
    1848              :  *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
                      :  *
                      :  * The first byte is 0xc0 for PG_UTF8 and NONUTF8_INVALID_BYTE0 for every
                      :  * other encoding; the second byte is always NONUTF8_INVALID_BYTE1.
                      :  * Exactly dst[0] and dst[1] are written, and no terminator is added.  The
                      :  * encoding is asserted to have a maximum character length above 1.
    1849              :  */
    1850              : void
    1851          212 : pg_encoding_set_invalid(int encoding, char *dst)
    1852              : {
    1853              :     Assert(pg_encoding_max_length(encoding) > 1);
    1854              : 
    1855          212 :     dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
    1856          212 :     dst[1] = NONUTF8_INVALID_BYTE1;
    1857          212 : }
    1858              : 
    1859              : /*
    1860              :  *-------------------------------------------------------------------
    1861              :  * encoding info table
    1862              :  *-------------------------------------------------------------------
                      :  *
                      :  * Indexed by encoding ID.  Each initializer lists, in order: the
                      :  * to-wchar and from-wchar conversion functions, the mblen, dsplen,
                      :  * verifychar and verifystr functions, and an integer that is presumably
                      :  * the maximum character length in bytes (TODO confirm against the
                      :  * pg_wchar_tbl declaration).  The entries from PG_SJIS onward supply 0
                      :  * for both conversion functions.
    1863              :  */
    1864              : const pg_wchar_tbl pg_wchar_table[] = {
    1865              :     [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
    1866              :     [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    1867              :     [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3},
    1868              :     [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
    1869              :     [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
    1870              :     [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    1871              :     [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
    1872              :     [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1873              :     [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1874              :     [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1875              :     [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1876              :     [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1877              :     [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1878              :     [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1879              :     [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1880              :     [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1881              :     [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1882              :     [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1883              :     [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1884              :     [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1885              :     [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1886              :     [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1887              :     [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1888              :     [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1889              :     [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1890              :     [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1891              :     [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1892              :     [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1893              :     [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1894              :     [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1895              :     [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1896              :     [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1897              :     [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1898              :     [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    1899              :     [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    1900              :     [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
    1901              :     [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
    1902              :     [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
    1903              :     [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
    1904              :     [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
    1905              :     [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    1906              : };
    1907              : 
    1908              : /*
    1909              :  * Returns the byte length of a multibyte character.
    1910              :  *
    1911              :  * Choose "mblen" functions based on the input string characteristics.
    1912              :  * pg_encoding_mblen() can be used when ANY of these conditions are met:
    1913              :  *
    1914              :  * - The input string is zero-terminated
    1915              :  *
    1916              :  * - The input string is known to be valid in the encoding (e.g., string
    1917              :  *   converted from database encoding)
    1918              :  *
    1919              :  * - The encoding is not GB18030 (e.g., when only database encodings are
    1920              :  *   passed to the 'encoding' parameter)
    1921              :  *
    1922              :  * encoding==GB18030 requires examining up to two bytes to determine character
    1923              :  * length.  Therefore, callers satisfying none of those conditions must use
    1924              :  * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
    1925              :  * guaranteed to be within allocation bounds.
    1926              :  *
    1927              :  * When dealing with text that is not certainly valid in the specified
    1928              :  * encoding, the result may exceed the actual remaining string length.
    1929              :  * Callers that are not prepared to deal with that should use Min(remaining,
    1930              :  * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
    1931              :  * pg_encoding_mblen_bounded() are interchangeable.
                      :  *
                      :  * An encoding value that fails PG_VALID_ENCODING() is handled with the
                      :  * PG_SQL_ASCII entry's mblen function rather than being reported as an
                      :  * error.
    1932              :  */
    1933              : int
    1934     30038425 : pg_encoding_mblen(int encoding, const char *mbstr)
    1935              : {
    1936     30038425 :     return (PG_VALID_ENCODING(encoding) ?
    1937     60076850 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
    1938            0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
    1939              : }
    1940              : 
    1941              : /*
    1942              :  * Returns the byte length of a multibyte character (possibly not
    1943              :  * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
    1944              :  */
    1945              : int
    1946         3136 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
    1947              :                                 size_t remaining)
    1948              : {
    1949              :     /*
    1950              :      * Define zero remaining as too few, even for single-byte encodings.
    1951              :      * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
    1952              :      * zero; others read one.
    1953              :      */
    1954         3136 :     if (remaining < 1 ||
    1955          202 :         (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
    1956           42 :         return INT_MAX;
    1957         3094 :     return pg_encoding_mblen(encoding, mbstr);
    1958              : }
    1959              : 
    1960              : /*
    1961              :  * Returns the byte length of a multibyte character; but not more than the
    1962              :  * distance to the terminating zero byte.  For input that might lack a
    1963              :  * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
    1964              :  */
    1965              : int
    1966            0 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    1967              : {
    1968            0 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
    1969              : }
    1970              : 
    1971              : /*
    1972              :  * Returns the display length of a multibyte character.
    1973              :  */
    1974              : int
    1975     29922862 : pg_encoding_dsplen(int encoding, const char *mbstr)
    1976              : {
    1977     29922862 :     return (PG_VALID_ENCODING(encoding) ?
    1978     59845724 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
    1979            0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
    1980              : }
    1981              : 
    1982              : /*
    1983              :  * Verify the first multibyte character of the given string.
    1984              :  * Return its byte length if good, -1 if bad.  (See comments above for
    1985              :  * full details of the mbverifychar API.)
    1986              :  */
    1987              : int
    1988         4228 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    1989              : {
    1990         4228 :     return (PG_VALID_ENCODING(encoding) ?
    1991         8456 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
    1992            0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
    1993              : }
    1994              : 
    1995              : /*
    1996              :  * Verify that a string is valid for the given encoding.
    1997              :  * Returns the number of input bytes (<= len) that form a valid string.
    1998              :  * (See comments above for full details of the mbverifystr API.)
    1999              :  */
    2000              : int
    2001       230717 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2002              : {
    2003       230717 :     return (PG_VALID_ENCODING(encoding) ?
    2004       461434 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
    2005            0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
    2006              : }
    2007              : 
    2008              : /*
    2009              :  * fetch maximum length of a given encoding
    2010              :  */
    2011              : int
    2012       683944 : pg_encoding_max_length(int encoding)
    2013              : {
    2014              :     Assert(PG_VALID_ENCODING(encoding));
    2015              : 
    2016              :     /*
    2017              :      * Check for the encoding despite the assert, due to some mingw versions
    2018              :      * otherwise issuing bogus warnings.
    2019              :      */
    2020       683944 :     return PG_VALID_ENCODING(encoding) ?
    2021      1367888 :         pg_wchar_table[encoding].maxmblen :
    2022              :         pg_wchar_table[PG_SQL_ASCII].maxmblen;
    2023              : }
        

Generated by: LCOV version 2.0-1