LCOV - PostgreSQL 19devel - src/common/wchar.c

LCOV - code coverage report

Current view:	top level - src/common - wchar.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 19devel	Lines:	683	868	78.7 %
Date:	2026-02-10 10:17:44	Functions:	68	82	82.9 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * wchar.c
       4             :  *    Functions for working with multibyte characters in various encodings.
       5             :  *
       6             :  * Portions Copyright (c) 1998-2026, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/common/wchar.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "c.h"
      14             : 
      15             : #include <limits.h>
      16             : 
      17             : #include "mb/pg_wchar.h"
      18             : #include "utils/ascii.h"
      19             : 
      20             : 
      21             : /*
      22             :  * In today's multibyte encodings other than UTF8, this two-byte sequence
      23             :  * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
      24             :  *
      25             :  * For historical reasons, several verifychar implementations opt to reject
      26             :  * this pair specifically.  Byte pair range constraints, in encoding
      27             :  * originator documentation, always excluded this pair.  No core conversion
      28             :  * could translate it.  However, longstanding verifychar implementations
      29             :  * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
      30             :  * pairs not valid per encoding originator documentation.  To avoid tightening
      31             :  * core or non-core conversions in a security patch, we sought this one pair.
      32             :  *
      33             :  * PQescapeString() historically used spaces for BYTE1; many other values
      34             :  * could suffice for BYTE1.
      35             :  */
      36             : #define NONUTF8_INVALID_BYTE0 (0x8d)
      37             : #define NONUTF8_INVALID_BYTE1 (' ')
      38             : 
      39             : 
      40             : /*
      41             :  * Operations on multi-byte encodings are driven by a table of helper
      42             :  * functions.
      43             :  *
      44             :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
      45             :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      46             :  * and wchar2mb() conversion functions.
      47             :  *
      48             :  * These functions generally assume that their input is validly formed.
      49             :  * The "verifier" functions, further down in the file, have to be more
      50             :  * paranoid.
      51             :  *
      52             :  * We expect that mblen() does not need to examine more than the first byte
      53             :  * of the character to discover the correct length.  GB18030 is an exception
      54             :  * to that rule, though, as it also looks at second byte.  But even that
      55             :  * behaves in a predictable way, if you only pass the first byte: it will
      56             :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      57             :  * good enough for all current uses.
      58             :  *
      59             :  * Note: for the display output of psql to work properly, the return values
      60             :  * of the dsplen functions must conform to the Unicode standard. In particular
      61             :  * the NUL character is zero width and control characters are generally
      62             :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      63             :  * subset to the ASCII routines to ensure consistency.
      64             :  */
      65             : 
      66             : /* No error-reporting facility.  Ignore incomplete trailing byte sequence. */
      67             : #define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break
      68             : 
      69             : /*
      70             :  * SQL/ASCII
      71             :  */
      72             : static int
      73         818 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      74             : {
      75         818 :     int         cnt = 0;
      76             : 
      77       64030 :     while (len > 0 && *from)
      78             :     {
      79       63212 :         *to++ = *from++;
      80       63212 :         len--;
      81       63212 :         cnt++;
      82             :     }
      83         818 :     *to = 0;
      84         818 :     return cnt;
      85             : }
      86             : 
      /*
       * Byte length of the character at "s" in SQL_ASCII: always one byte.
       * The input is not examined.
       */
      static int
      pg_ascii_mblen(const unsigned char *s)
      {
          const int   ascii_char_len = 1;

          return ascii_char_len;
      }
      92             : 
      /*
       * Display width of the single byte at "s".
       *
       * Returns 0 for the NUL byte, -1 for bytes below 0x20 and for 0x7f, and 1
       * for every other byte value.
       */
      static int
      pg_ascii_dsplen(const unsigned char *s)
      {
          const unsigned char ch = *s;

          if (ch == '\0')
              return 0;

          return (ch >= 0x20 && ch != 0x7f) ? 1 : -1;
      }
     103             : 
     104             : /*
     105             :  * EUC
     106             :  */
     107             : static int
     108          48 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     109             : {
     110          48 :     int         cnt = 0;
     111             : 
     112          72 :     while (len > 0 && *from)
     113             :     {
     114          48 :         if (*from == SS2)       /* JIS X 0201 (so called "1 byte KANA") */
     115             :         {
     116          12 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     117           6 :             from++;
     118           6 :             *to = (SS2 << 8) | *from++;
     119           6 :             len -= 2;
     120             :         }
     121          36 :         else if (*from == SS3)  /* JIS X 0212 KANJI */
     122             :         {
     123          18 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     124           6 :             from++;
     125           6 :             *to = (SS3 << 16) | (*from++ << 8);
     126           6 :             *to |= *from++;
     127           6 :             len -= 3;
     128             :         }
     129          18 :         else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
     130             :         {
     131          12 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     132           6 :             *to = *from++ << 8;
     133           6 :             *to |= *from++;
     134           6 :             len -= 2;
     135             :         }
     136             :         else                    /* must be ASCII */
     137             :         {
     138           6 :             *to = *from++;
     139           6 :             len--;
     140             :         }
     141          24 :         to++;
     142          24 :         cnt++;
     143             :     }
     144          48 :     *to = 0;
     145          48 :     return cnt;
     146             : }
     147             : 
     148             : static inline int
     149         234 : pg_euc_mblen(const unsigned char *s)
     150             : {
     151             :     int         len;
     152             : 
     153         234 :     if (*s == SS2)
     154           0 :         len = 2;
     155         234 :     else if (*s == SS3)
     156           0 :         len = 3;
     157         234 :     else if (IS_HIGHBIT_SET(*s))
     158         162 :         len = 2;
     159             :     else
     160          72 :         len = 1;
     161         234 :     return len;
     162             : }
     163             : 
     164             : static inline int
     165           0 : pg_euc_dsplen(const unsigned char *s)
     166             : {
     167             :     int         len;
     168             : 
     169           0 :     if (*s == SS2)
     170           0 :         len = 2;
     171           0 :     else if (*s == SS3)
     172           0 :         len = 2;
     173           0 :     else if (IS_HIGHBIT_SET(*s))
     174           0 :         len = 2;
     175             :     else
     176           0 :         len = pg_ascii_dsplen(s);
     177           0 :     return len;
     178             : }
     179             : 
     180             : /*
     181             :  * EUC_JP
     182             :  */
     183             : static int
     184          48 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     185             : {
     186          48 :     return pg_euc2wchar_with_len(from, to, len);
     187             : }
     188             : 
     /*
      * Byte length of an EUC_JP character: same rule as the generic EUC helper.
      */
     static int
     pg_eucjp_mblen(const unsigned char *s)
     {
         int         nbytes = pg_euc_mblen(s);

         return nbytes;
     }
     194             : 
     195             : static int
     196           0 : pg_eucjp_dsplen(const unsigned char *s)
     197             : {
     198             :     int         len;
     199             : 
     200           0 :     if (*s == SS2)
     201           0 :         len = 1;
     202           0 :     else if (*s == SS3)
     203           0 :         len = 2;
     204           0 :     else if (IS_HIGHBIT_SET(*s))
     205           0 :         len = 2;
     206             :     else
     207           0 :         len = pg_ascii_dsplen(s);
     208           0 :     return len;
     209             : }
     210             : 
     211             : /*
     212             :  * EUC_KR
     213             :  */
     214             : static int
     215           0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     216             : {
     217           0 :     return pg_euc2wchar_with_len(from, to, len);
     218             : }
     219             : 
     /*
      * Byte length of an EUC_KR character: same rule as the generic EUC helper.
      */
     static int
     pg_euckr_mblen(const unsigned char *s)
     {
         int         nbytes = pg_euc_mblen(s);

         return nbytes;
     }
     225             : 
     /*
      * Display width of an EUC_KR character: same rule as the generic EUC
      * helper.
      */
     static int
     pg_euckr_dsplen(const unsigned char *s)
     {
         int         width = pg_euc_dsplen(s);

         return width;
     }
     231             : 
     232             : /*
     233             :  * EUC_CN
     234             :  *
     235             :  */
     236             : static int
     237          54 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     238             : {
     239          54 :     int         cnt = 0;
     240             : 
     241          78 :     while (len > 0 && *from)
     242             :     {
     243          54 :         if (*from == SS2)       /* code set 2 (unused?) */
     244             :         {
     245          18 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     246           6 :             from++;
     247           6 :             *to = (SS2 << 16) | (*from++ << 8);
     248           6 :             *to |= *from++;
     249           6 :             len -= 3;
     250             :         }
     251          36 :         else if (*from == SS3)  /* code set 3 (unused ?) */
     252             :         {
     253          18 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     254           6 :             from++;
     255           6 :             *to = (SS3 << 16) | (*from++ << 8);
     256           6 :             *to |= *from++;
     257           6 :             len -= 3;
     258             :         }
     259          18 :         else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
     260             :         {
     261          12 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     262           6 :             *to = *from++ << 8;
     263           6 :             *to |= *from++;
     264           6 :             len -= 2;
     265             :         }
     266             :         else
     267             :         {
     268           6 :             *to = *from++;
     269           6 :             len--;
     270             :         }
     271          24 :         to++;
     272          24 :         cnt++;
     273             :     }
     274          54 :     *to = 0;
     275          54 :     return cnt;
     276             : }
     277             : 
     278             : /*
     279             :  * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
     280             :  * EUC_CN), but mb2wchar_with_len does.  Tell a coherent story for code that
     281             :  * relies on agreement between mb2wchar_with_len and mblen.  Invalid text
     282             :  * datums (e.g. from shared catalogs) reach this.
     283             :  */
     284             : static int
     285           6 : pg_euccn_mblen(const unsigned char *s)
     286             : {
     287             :     int         len;
     288             : 
     289           6 :     if (*s == SS2)
     290           0 :         len = 3;
     291           6 :     else if (*s == SS3)
     292           0 :         len = 3;
     293           6 :     else if (IS_HIGHBIT_SET(*s))
     294           6 :         len = 2;
     295             :     else
     296           0 :         len = 1;
     297           6 :     return len;
     298             : }
     299             : 
     /*
      * Display width of an EUC_CN character, judged from its first byte only:
      * 2 for any byte with the high bit set, otherwise whatever
      * pg_ascii_dsplen() reports.
      */
     static int
     pg_euccn_dsplen(const unsigned char *s)
     {
         return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
     }
     311             : 
     312             : /*
     313             :  * EUC_TW
     314             :  *
     315             :  */
     316             : static int
     317          60 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     318             : {
     319          60 :     int         cnt = 0;
     320             : 
     321          84 :     while (len > 0 && *from)
     322             :     {
     323          60 :         if (*from == SS2)       /* code set 2 */
     324             :         {
     325          24 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     326           6 :             from++;
     327           6 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);
     328           6 :             *to |= *from++ << 8;
     329           6 :             *to |= *from++;
     330           6 :             len -= 4;
     331             :         }
     332          36 :         else if (*from == SS3)  /* code set 3 (unused?) */
     333             :         {
     334          18 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     335           6 :             from++;
     336           6 :             *to = (SS3 << 16) | (*from++ << 8);
     337           6 :             *to |= *from++;
     338           6 :             len -= 3;
     339             :         }
     340          18 :         else if (IS_HIGHBIT_SET(*from)) /* code set 2 */
     341             :         {
     342          12 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     343           6 :             *to = *from++ << 8;
     344           6 :             *to |= *from++;
     345           6 :             len -= 2;
     346             :         }
     347             :         else
     348             :         {
     349           6 :             *to = *from++;
     350           6 :             len--;
     351             :         }
     352          24 :         to++;
     353          24 :         cnt++;
     354             :     }
     355          60 :     *to = 0;
     356          60 :     return cnt;
     357             : }
     358             : 
     359             : static int
     360           6 : pg_euctw_mblen(const unsigned char *s)
     361             : {
     362             :     int         len;
     363             : 
     364           6 :     if (*s == SS2)
     365           0 :         len = 4;
     366           6 :     else if (*s == SS3)
     367           0 :         len = 3;
     368           6 :     else if (IS_HIGHBIT_SET(*s))
     369           6 :         len = 2;
     370             :     else
     371           0 :         len = 1;
     372           6 :     return len;
     373             : }
     374             : 
     375             : static int
     376           0 : pg_euctw_dsplen(const unsigned char *s)
     377             : {
     378             :     int         len;
     379             : 
     380           0 :     if (*s == SS2)
     381           0 :         len = 2;
     382           0 :     else if (*s == SS3)
     383           0 :         len = 2;
     384           0 :     else if (IS_HIGHBIT_SET(*s))
     385           0 :         len = 2;
     386             :     else
     387           0 :         len = pg_ascii_dsplen(s);
     388           0 :     return len;
     389             : }
     390             : 
     391             : /*
     392             :  * Convert pg_wchar to EUC_* encoding.
     393             :  * caller must allocate enough space for "to", including a trailing zero!
     394             :  * len: length of from.
     395             :  * "from" not necessarily null terminated.
     396             :  */
     397             : static int
     398          72 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     399             : {
     400          72 :     int         cnt = 0;
     401             : 
     402         144 :     while (len > 0 && *from)
     403             :     {
     404             :         unsigned char c;
     405             : 
     406          72 :         if ((c = (*from >> 24)))
     407             :         {
     408           6 :             *to++ = c;
     409           6 :             *to++ = (*from >> 16) & 0xff;
     410           6 :             *to++ = (*from >> 8) & 0xff;
     411           6 :             *to++ = *from & 0xff;
     412           6 :             cnt += 4;
     413             :         }
     414          66 :         else if ((c = (*from >> 16)))
     415             :         {
     416          24 :             *to++ = c;
     417          24 :             *to++ = (*from >> 8) & 0xff;
     418          24 :             *to++ = *from & 0xff;
     419          24 :             cnt += 3;
     420             :         }
     421          42 :         else if ((c = (*from >> 8)))
     422             :         {
     423          24 :             *to++ = c;
     424          24 :             *to++ = *from & 0xff;
     425          24 :             cnt += 2;
     426             :         }
     427             :         else
     428             :         {
     429          18 :             *to++ = *from;
     430          18 :             cnt++;
     431             :         }
     432          72 :         from++;
     433          72 :         len--;
     434             :     }
     435          72 :     *to = 0;
     436          72 :     return cnt;
     437             : }
     438             : 
     439             : 
     440             : /*
     441             :  * JOHAB
     442             :  */
     /*
      * Byte length of a JOHAB character: reuses the generic EUC rule.
      */
     static int
     pg_johab_mblen(const unsigned char *s)
     {
         int         nbytes = pg_euc_mblen(s);

         return nbytes;
     }
     448             : 
     /*
      * Display width of a JOHAB character: reuses the generic EUC rule.
      */
     static int
     pg_johab_dsplen(const unsigned char *s)
     {
         int         width = pg_euc_dsplen(s);

         return width;
     }
     454             : 
     455             : /*
     456             :  * convert UTF8 string to pg_wchar (UCS-4)
     457             :  * caller must allocate enough space for "to", including a trailing zero!
     458             :  * len: length of from.
     459             :  * "from" not necessarily null terminated.
     460             :  */
     461             : static int
     462    10207072 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     463             : {
     464    10207072 :     int         cnt = 0;
     465             :     uint32      c1,
     466             :                 c2,
     467             :                 c3,
     468             :                 c4;
     469             : 
     470   160207832 :     while (len > 0 && *from)
     471             :     {
     472   150000802 :         if ((*from & 0x80) == 0)
     473             :         {
     474   149999692 :             *to = *from++;
     475   149999692 :             len--;
     476             :         }
     477        1110 :         else if ((*from & 0xe0) == 0xc0)
     478             :         {
     479         536 :             MB2CHAR_NEED_AT_LEAST(len, 2);
     480         524 :             c1 = *from++ & 0x1f;
     481         524 :             c2 = *from++ & 0x3f;
     482         524 :             *to = (c1 << 6) | c2;
     483         524 :             len -= 2;
     484             :         }
     485         574 :         else if ((*from & 0xf0) == 0xe0)
     486             :         {
     487         334 :             MB2CHAR_NEED_AT_LEAST(len, 3);
     488         322 :             c1 = *from++ & 0x0f;
     489         322 :             c2 = *from++ & 0x3f;
     490         322 :             c3 = *from++ & 0x3f;
     491         322 :             *to = (c1 << 12) | (c2 << 6) | c3;
     492         322 :             len -= 3;
     493             :         }
     494         240 :         else if ((*from & 0xf8) == 0xf0)
     495             :         {
     496          24 :             MB2CHAR_NEED_AT_LEAST(len, 4);
     497           6 :             c1 = *from++ & 0x07;
     498           6 :             c2 = *from++ & 0x3f;
     499           6 :             c3 = *from++ & 0x3f;
     500           6 :             c4 = *from++ & 0x3f;
     501           6 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     502           6 :             len -= 4;
     503             :         }
     504             :         else
     505             :         {
     506             :             /* treat a bogus char as length 1; not ours to raise error */
     507         216 :             *to = *from++;
     508         216 :             len--;
     509             :         }
     510   150000760 :         to++;
     511   150000760 :         cnt++;
     512             :     }
     513    10207072 :     *to = 0;
     514    10207072 :     return cnt;
     515             : }
     516             : 
     517             : 
     518             : /*
     519             :  * Trivial conversion from pg_wchar to UTF-8.
     520             :  * caller should allocate enough space for "to"
     521             :  * len: length of from.
     522             :  * "from" not necessarily null terminated.
     523             :  */
     524             : static int
     525     1116158 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     526             : {
     527     1116158 :     int         cnt = 0;
     528             : 
     529    16800580 :     while (len > 0 && *from)
     530             :     {
     531             :         int         char_len;
     532             : 
     533    15684422 :         unicode_to_utf8(*from, to);
     534    15684422 :         char_len = pg_utf_mblen(to);
     535    15684422 :         cnt += char_len;
     536    15684422 :         to += char_len;
     537    15684422 :         from++;
     538    15684422 :         len--;
     539             :     }
     540     1116158 :     *to = 0;
     541     1116158 :     return cnt;
     542             : }
     543             : 
     544             : /*
     545             :  * Return the byte length of a UTF8 character pointed to by s
     546             :  *
     547             :  * Note: in the current implementation we do not support UTF8 sequences
     548             :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     549             :  * We return "1" for any leading byte that is either flat-out illegal or
     550             :  * indicates a length larger than we support.
     551             :  *
     552             :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     553             :  * other places would need to be fixed to change this.
     554             :  */
     /*
      * Byte length of the UTF-8 character starting at "s", judged from the lead
      * byte alone.
      *
      * Lead-byte patterns 0xxxxxxx, 110xxxxx, 1110xxxx and 11110xxx give 1, 2, 3
      * and 4 respectively.  Every other lead byte (including the 5- and 6-byte
      * forms unless NOT_USED is defined) is reported as length 1.
      */
     int
     pg_utf_mblen(const unsigned char *s)
     {
         const unsigned char lead = *s;

         if ((lead & 0x80) == 0)
             return 1;
         if ((lead & 0xe0) == 0xc0)
             return 2;
         if ((lead & 0xf0) == 0xe0)
             return 3;
         if ((lead & 0xf8) == 0xf0)
             return 4;
     #ifdef NOT_USED
         if ((lead & 0xfc) == 0xf8)
             return 5;
         if ((lead & 0xfe) == 0xfc)
             return 6;
     #endif

         /* illegal or unsupported lead byte */
         return 1;
     }
     578             : 
     579             : /*
     580             :  * This is an implementation of wcwidth() and wcswidth() as defined in
     581             :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     582             :  * <http://www.unix.org/online.html>
     583             :  *
     584             :  * Markus Kuhn -- 2001-09-08 -- public domain
     585             :  *
     586             :  * customised for PostgreSQL
     587             :  *
     588             :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     589             :  */
     590             : 
     /* One closed range [first, last] of code points in a lookup table. */
     struct mbinterval
     {
         unsigned    first;          /* lowest value in the range */
         unsigned    last;           /* highest value in the range */
     };
     596             : 
     597             : /* auxiliary function for binary search in interval table */
     598             : static int
     599    89002390 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
     600             : {
     601    89002390 :     int         min = 0;
     602             :     int         mid;
     603             : 
     604    89002390 :     if (ucs < table[0].first || ucs > table[max].last)
     605    88993924 :         return 0;
     606       74220 :     while (max >= min)
     607             :     {
     608       66456 :         mid = (min + max) / 2;
     609       66456 :         if (ucs > table[mid].last)
     610       13332 :             min = mid + 1;
     611       53124 :         else if (ucs < table[mid].first)
     612       52422 :             max = mid - 1;
     613             :         else
     614         702 :             return 1;
     615             :     }
     616             : 
     617        7764 :     return 0;
     618             : }
     619             : 
     620             : 
     621             : /* The following functions define the column width of an ISO 10646
     622             :  * character as follows:
     623             :  *
     624             :  *    - The null character (U+0000) has a column width of 0.
     625             :  *
     626             :  *    - Other C0/C1 control characters and DEL will lead to a return
     627             :  *      value of -1.
     628             :  *
     629             :  *    - Non-spacing and enclosing combining characters (general
     630             :  *      category code Mn, Me or Cf in the Unicode database) have a
     631             :  *      column width of 0.
     632             :  *
     633             :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     634             :  *      FullWidth (F) category as defined in Unicode Technical
     635             :  *      Report #11 have a column width of 2.
     636             :  *
     637             :  *    - All remaining characters (including all printable
     638             :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     639             :  *      etc.) have a column width of 1.
     640             :  *
     641             :  * This implementation assumes that wchar_t characters are encoded
     642             :  * in ISO 10646.
     643             :  */
     644             : 
     645             : static int
     646    44546456 : ucs_wcwidth(pg_wchar ucs)
     647             : {
     648             : #include "common/unicode_nonspacing_table.h"
     649             : #include "common/unicode_east_asian_fw_table.h"
     650             : 
     651             :     /* test for 8-bit control characters */
     652    44546456 :     if (ucs == 0)
     653           0 :         return 0;
     654             : 
     655    44546456 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
     656       45018 :         return -1;
     657             : 
     658             :     /*
     659             :      * binary search in table of non-spacing characters
     660             :      *
     661             :      * XXX: In the official Unicode sources, it is possible for a character to
     662             :      * be described as both non-spacing and wide at the same time. As of
     663             :      * Unicode 13.0, treating the non-spacing property as the determining
     664             :      * factor for display width leads to the correct behavior, so do that
     665             :      * search first.
     666             :      */
     667    44501438 :     if (mbbisearch(ucs, nonspacing,
     668             :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
     669         486 :         return 0;
     670             : 
     671             :     /* binary search in table of wide characters */
     672    44500952 :     if (mbbisearch(ucs, east_asian_fw,
     673             :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
     674         216 :         return 2;
     675             : 
     676    44500736 :     return 1;
     677             : }
     678             : 
     679             : static int
     680    44546456 : pg_utf_dsplen(const unsigned char *s)
     681             : {
     682    44546456 :     return ucs_wcwidth(utf8_to_unicode(s));
     683             : }
     684             : 
/*
 * convert mule internal code to pg_wchar
 * caller should allocate enough space for "to"
 * len: length of from.
 * "from" not necessarily null terminated.
 *
 * Conversion stops when "len" bytes have been consumed, when a zero byte is
 * found at the start of a character, or when a MB2CHAR_NEED_AT_LEAST() check
 * ends the loop.  The output is always terminated with a zero pg_wchar.
 * Returns the number of pg_wchars stored, not counting that terminator.
 *
 * NOTE(review): MB2CHAR_NEED_AT_LEAST is defined outside this view;
 * presumably it leaves the loop when fewer than the stated number of input
 * bytes remain -- confirm against its definition.
 */
static int
pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
{
    int         cnt = 0;

    while (len > 0 && *from)
    {
        if (IS_LC1(*from))
        {
            /* two input bytes: first goes to bits 16..23, second to bits 0..7 */
            MB2CHAR_NEED_AT_LEAST(len, 2);
            *to = *from++ << 16;
            *to |= *from++;
            len -= 2;
        }
        else if (IS_LCPRV1(*from))
        {
            /*
             * three input bytes: the first one is skipped, the remaining two
             * are packed exactly like the IS_LC1 case
             */
            MB2CHAR_NEED_AT_LEAST(len, 3);
            from++;
            *to = *from++ << 16;
            *to |= *from++;
            len -= 3;
        }
        else if (IS_LC2(*from))
        {
            /* three input bytes packed into bits 16..23, 8..15 and 0..7 */
            MB2CHAR_NEED_AT_LEAST(len, 3);
            *to = *from++ << 16;
            *to |= *from++ << 8;
            *to |= *from++;
            len -= 3;
        }
        else if (IS_LCPRV2(*from))
        {
            /*
             * four input bytes: the first one is skipped, the remaining
             * three are packed exactly like the IS_LC2 case
             */
            MB2CHAR_NEED_AT_LEAST(len, 4);
            from++;
            *to = *from++ << 16;
            *to |= *from++ << 8;
            *to |= *from++;
            len -= 4;
        }
        else
        {                       /* assume ASCII */
            *to = (unsigned char) *from++;
            len--;
        }
        /* one output element per input character, whatever its byte length */
        to++;
        cnt++;
    }
    *to = 0;
    return cnt;
}
     741             : 
     742             : /*
     743             :  * convert pg_wchar to mule internal code
     744             :  * caller should allocate enough space for "to"
     745             :  * len: length of from.
     746             :  * "from" not necessarily null terminated.
     747             :  */
     748             : static int
     749          18 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     750             : {
     751          18 :     int         cnt = 0;
     752             : 
     753          36 :     while (len > 0 && *from)
     754             :     {
     755             :         unsigned char lb;
     756             : 
     757          18 :         lb = (*from >> 16) & 0xff;
     758          18 :         if (IS_LC1(lb))
     759             :         {
     760           6 :             *to++ = lb;
     761           6 :             *to++ = *from & 0xff;
     762           6 :             cnt += 2;
     763             :         }
     764          12 :         else if (IS_LC2(lb))
     765             :         {
     766           6 :             *to++ = lb;
     767           6 :             *to++ = (*from >> 8) & 0xff;
     768           6 :             *to++ = *from & 0xff;
     769           6 :             cnt += 3;
     770             :         }
     771           6 :         else if (IS_LCPRV1_A_RANGE(lb))
     772             :         {
     773           0 :             *to++ = LCPRV1_A;
     774           0 :             *to++ = lb;
     775           0 :             *to++ = *from & 0xff;
     776           0 :             cnt += 3;
     777             :         }
     778           6 :         else if (IS_LCPRV1_B_RANGE(lb))
     779             :         {
     780           0 :             *to++ = LCPRV1_B;
     781           0 :             *to++ = lb;
     782           0 :             *to++ = *from & 0xff;
     783           0 :             cnt += 3;
     784             :         }
     785           6 :         else if (IS_LCPRV2_A_RANGE(lb))
     786             :         {
     787           0 :             *to++ = LCPRV2_A;
     788           0 :             *to++ = lb;
     789           0 :             *to++ = (*from >> 8) & 0xff;
     790           0 :             *to++ = *from & 0xff;
     791           0 :             cnt += 4;
     792             :         }
     793           6 :         else if (IS_LCPRV2_B_RANGE(lb))
     794             :         {
     795           0 :             *to++ = LCPRV2_B;
     796           0 :             *to++ = lb;
     797           0 :             *to++ = (*from >> 8) & 0xff;
     798           0 :             *to++ = *from & 0xff;
     799           0 :             cnt += 4;
     800             :         }
     801             :         else
     802             :         {
     803           6 :             *to++ = *from & 0xff;
     804           6 :             cnt += 1;
     805             :         }
     806          18 :         from++;
     807          18 :         len--;
     808             :     }
     809          18 :     *to = 0;
     810          18 :     return cnt;
     811             : }
     812             : 
     813             : /* exported for direct use by conv.c */
     814             : int
     815        3024 : pg_mule_mblen(const unsigned char *s)
     816             : {
     817             :     int         len;
     818             : 
     819        3024 :     if (IS_LC1(*s))
     820        1220 :         len = 2;
     821        1804 :     else if (IS_LCPRV1(*s))
     822           0 :         len = 3;
     823        1804 :     else if (IS_LC2(*s))
     824        1710 :         len = 3;
     825          94 :     else if (IS_LCPRV2(*s))
     826          40 :         len = 4;
     827             :     else
     828          54 :         len = 1;                /* assume ASCII */
     829        3024 :     return len;
     830             : }
     831             : 
     832             : static int
     833           0 : pg_mule_dsplen(const unsigned char *s)
     834             : {
     835             :     int         len;
     836             : 
     837             :     /*
     838             :      * Note: it's not really appropriate to assume that all multibyte charsets
     839             :      * are double-wide on screen.  But this seems an okay approximation for
     840             :      * the MULE charsets we currently support.
     841             :      */
     842             : 
     843           0 :     if (IS_LC1(*s))
     844           0 :         len = 1;
     845           0 :     else if (IS_LCPRV1(*s))
     846           0 :         len = 1;
     847           0 :     else if (IS_LC2(*s))
     848           0 :         len = 2;
     849           0 :     else if (IS_LCPRV2(*s))
     850           0 :         len = 2;
     851             :     else
     852           0 :         len = 1;                /* assume ASCII */
     853             : 
     854           0 :     return len;
     855             : }
     856             : 
     857             : /*
     858             :  * ISO8859-1
     859             :  */
     860             : static int
     861        1082 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     862             : {
     863        1082 :     int         cnt = 0;
     864             : 
     865       30028 :     while (len > 0 && *from)
     866             :     {
     867       28946 :         *to++ = *from++;
     868       28946 :         len--;
     869       28946 :         cnt++;
     870             :     }
     871        1082 :     *to = 0;
     872        1082 :     return cnt;
     873             : }
     874             : 
     875             : /*
     876             :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     877             :  * high bits.
     878             :  * caller should allocate enough space for "to"
     879             :  * len: length of from.
     880             :  * "from" not necessarily null terminated.
     881             :  */
     882             : static int
     883         162 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     884             : {
     885         162 :     int         cnt = 0;
     886             : 
     887        1380 :     while (len > 0 && *from)
     888             :     {
     889        1218 :         *to++ = *from++;
     890        1218 :         len--;
     891        1218 :         cnt++;
     892             :     }
     893         162 :     *to = 0;
     894         162 :     return cnt;
     895             : }
     896             : 
     897             : static int
     898        7996 : pg_latin1_mblen(const unsigned char *s)
     899             : {
     900        7996 :     return 1;
     901             : }
     902             : 
     903             : static int
     904         800 : pg_latin1_dsplen(const unsigned char *s)
     905             : {
     906         800 :     return pg_ascii_dsplen(s);
     907             : }
     908             : 
     909             : /*
     910             :  * SJIS
     911             :  */
     912             : static int
     913        1690 : pg_sjis_mblen(const unsigned char *s)
     914             : {
     915             :     int         len;
     916             : 
     917        1690 :     if (*s >= 0xa1 && *s <= 0xdf)
     918           0 :         len = 1;                /* 1 byte kana? */
     919        1690 :     else if (IS_HIGHBIT_SET(*s))
     920        1314 :         len = 2;                /* kanji? */
     921             :     else
     922         376 :         len = 1;                /* should be ASCII */
     923        1690 :     return len;
     924             : }
     925             : 
     926             : static int
     927           0 : pg_sjis_dsplen(const unsigned char *s)
     928             : {
     929             :     int         len;
     930             : 
     931           0 :     if (*s >= 0xa1 && *s <= 0xdf)
     932           0 :         len = 1;                /* 1 byte kana? */
     933           0 :     else if (IS_HIGHBIT_SET(*s))
     934           0 :         len = 2;                /* kanji? */
     935             :     else
     936           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     937           0 :     return len;
     938             : }
     939             : 
     940             : /*
     941             :  * Big5
     942             :  */
     943             : static int
     944         492 : pg_big5_mblen(const unsigned char *s)
     945             : {
     946             :     int         len;
     947             : 
     948         492 :     if (IS_HIGHBIT_SET(*s))
     949         438 :         len = 2;                /* kanji? */
     950             :     else
     951          54 :         len = 1;                /* should be ASCII */
     952         492 :     return len;
     953             : }
     954             : 
     955             : static int
     956           0 : pg_big5_dsplen(const unsigned char *s)
     957             : {
     958             :     int         len;
     959             : 
     960           0 :     if (IS_HIGHBIT_SET(*s))
     961           0 :         len = 2;                /* kanji? */
     962             :     else
     963           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     964           0 :     return len;
     965             : }
     966             : 
     967             : /*
     968             :  * GBK
     969             :  */
     970             : static int
     971         556 : pg_gbk_mblen(const unsigned char *s)
     972             : {
     973             :     int         len;
     974             : 
     975         556 :     if (IS_HIGHBIT_SET(*s))
     976         416 :         len = 2;                /* kanji? */
     977             :     else
     978         140 :         len = 1;                /* should be ASCII */
     979         556 :     return len;
     980             : }
     981             : 
     982             : static int
     983           0 : pg_gbk_dsplen(const unsigned char *s)
     984             : {
     985             :     int         len;
     986             : 
     987           0 :     if (IS_HIGHBIT_SET(*s))
     988           0 :         len = 2;                /* kanji? */
     989             :     else
     990           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     991           0 :     return len;
     992             : }
     993             : 
     994             : /*
     995             :  * UHC
     996             :  */
     997             : static int
     998          24 : pg_uhc_mblen(const unsigned char *s)
     999             : {
    1000             :     int         len;
    1001             : 
    1002          24 :     if (IS_HIGHBIT_SET(*s))
    1003          24 :         len = 2;                /* 2byte? */
    1004             :     else
    1005           0 :         len = 1;                /* should be ASCII */
    1006          24 :     return len;
    1007             : }
    1008             : 
    1009             : static int
    1010           0 : pg_uhc_dsplen(const unsigned char *s)
    1011             : {
    1012             :     int         len;
    1013             : 
    1014           0 :     if (IS_HIGHBIT_SET(*s))
    1015           0 :         len = 2;                /* 2byte? */
    1016             :     else
    1017           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1018           0 :     return len;
    1019             : }
    1020             : 
    1021             : /*
    1022             :  * GB18030
    1023             :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
    1024             :  */
    1025             : 
    1026             : /*
    1027             :  * Unlike all other mblen() functions, this also looks at the second byte of
    1028             :  * the input.  However, if you only pass the first byte of a multi-byte
    1029             :  * string, and \0 as the second byte, this still works in a predictable way:
    1030             :  * a 4-byte character will be reported as two 2-byte characters.  That's
    1031             :  * enough for all current uses, as a client-only encoding.  It works that
    1032             :  * way, because in any valid 4-byte GB18030-encoded character, the third and
    1033             :  * fourth byte look like a 2-byte encoded character, when looked at
    1034             :  * separately.
    1035             :  */
    1036             : static int
    1037        1182 : pg_gb18030_mblen(const unsigned char *s)
    1038             : {
    1039             :     int         len;
    1040             : 
    1041        1182 :     if (!IS_HIGHBIT_SET(*s))
    1042         684 :         len = 1;                /* ASCII */
    1043         498 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1044         186 :         len = 4;
    1045             :     else
    1046         312 :         len = 2;
    1047        1182 :     return len;
    1048             : }
    1049             : 
    1050             : static int
    1051           0 : pg_gb18030_dsplen(const unsigned char *s)
    1052             : {
    1053             :     int         len;
    1054             : 
    1055           0 :     if (IS_HIGHBIT_SET(*s))
    1056           0 :         len = 2;
    1057             :     else
    1058           0 :         len = pg_ascii_dsplen(s);   /* ASCII */
    1059           0 :     return len;
    1060             : }
    1061             : 
    1062             : /*
    1063             :  *-------------------------------------------------------------------
    1064             :  * multibyte sequence validators
    1065             :  *
    1066             :  * The verifychar functions accept "s", a pointer to the first byte of a
    1067             :  * string, and "len", the remaining length of the string.  If there is a
    1068             :  * validly encoded character beginning at *s, return its length in bytes;
    1069             :  * else return -1.
    1070             :  *
    1071             :  * The verifystr functions also accept "s", a pointer to a string and "len",
    1072             :  * the length of the string.  They verify the whole string, and return the
    1073             :  * number of input bytes (<= len) that are valid.  In other words, if the
    1074             :  * whole string is valid, verifystr returns "len", otherwise it returns the
    1075             :  * byte offset of the first invalid character.  The verifystr functions must
    1076             :  * test for and reject zeroes in the input.
    1077             :  *
    1078             :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
    1079             :  * they must test for and reject zeroes in any additional bytes of a
    1080             :  * multibyte character.  Note that this definition allows the function for a
    1081             :  * single-byte encoding to be just "return 1".
    1082             :  *-------------------------------------------------------------------
    1083             :  */
    1084             : static int
    1085         322 : pg_ascii_verifychar(const unsigned char *s, int len)
    1086             : {
    1087         322 :     return 1;
    1088             : }
    1089             : 
    1090             : static int
    1091      423860 : pg_ascii_verifystr(const unsigned char *s, int len)
    1092             : {
    1093      423860 :     const unsigned char *nullpos = memchr(s, 0, len);
    1094             : 
    1095      423860 :     if (nullpos == NULL)
    1096      423860 :         return len;
    1097             :     else
    1098           0 :         return nullpos - s;
    1099             : }
    1100             : 
/*
 * True if byte "c" lies in the range 0xa1..0xfe inclusive.  The argument is
 * evaluated twice, so it must not have side effects.
 */
#define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
    1102             : 
    1103             : static int
    1104         504 : pg_eucjp_verifychar(const unsigned char *s, int len)
    1105             : {
    1106             :     int         l;
    1107             :     unsigned char c1,
    1108             :                 c2;
    1109             : 
    1110         504 :     c1 = *s++;
    1111             : 
    1112         504 :     switch (c1)
    1113             :     {
    1114           0 :         case SS2:               /* JIS X 0201 */
    1115           0 :             l = 2;
    1116           0 :             if (l > len)
    1117           0 :                 return -1;
    1118           0 :             c2 = *s++;
    1119           0 :             if (c2 < 0xa1 || c2 > 0xdf)
    1120           0 :                 return -1;
    1121           0 :             break;
    1122             : 
    1123           0 :         case SS3:               /* JIS X 0212 */
    1124           0 :             l = 3;
    1125           0 :             if (l > len)
    1126           0 :                 return -1;
    1127           0 :             c2 = *s++;
    1128           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1129           0 :                 return -1;
    1130           0 :             c2 = *s++;
    1131           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1132           0 :                 return -1;
    1133           0 :             break;
    1134             : 
    1135         504 :         default:
    1136         504 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1137             :             {
    1138         504 :                 l = 2;
    1139         504 :                 if (l > len)
    1140          84 :                     return -1;
    1141         420 :                 if (!IS_EUC_RANGE_VALID(c1))
    1142          24 :                     return -1;
    1143         396 :                 c2 = *s++;
    1144         396 :                 if (!IS_EUC_RANGE_VALID(c2))
    1145         180 :                     return -1;
    1146             :             }
    1147             :             else
    1148             :                 /* must be ASCII */
    1149             :             {
    1150           0 :                 l = 1;
    1151             :             }
    1152         216 :             break;
    1153             :     }
    1154             : 
    1155         216 :     return l;
    1156             : }
    1157             : 
    1158             : static int
    1159         300 : pg_eucjp_verifystr(const unsigned char *s, int len)
    1160             : {
    1161         300 :     const unsigned char *start = s;
    1162             : 
    1163         930 :     while (len > 0)
    1164             :     {
    1165             :         int         l;
    1166             : 
    1167             :         /* fast path for ASCII-subset characters */
    1168         846 :         if (!IS_HIGHBIT_SET(*s))
    1169             :         {
    1170         594 :             if (*s == '\0')
    1171          72 :                 break;
    1172         522 :             l = 1;
    1173             :         }
    1174             :         else
    1175             :         {
    1176         252 :             l = pg_eucjp_verifychar(s, len);
    1177         252 :             if (l == -1)
    1178         144 :                 break;
    1179             :         }
    1180         630 :         s += l;
    1181         630 :         len -= l;
    1182             :     }
    1183             : 
    1184         300 :     return s - start;
    1185             : }
    1186             : 
    1187             : static int
    1188         144 : pg_euckr_verifychar(const unsigned char *s, int len)
    1189             : {
    1190             :     int         l;
    1191             :     unsigned char c1,
    1192             :                 c2;
    1193             : 
    1194         144 :     c1 = *s++;
    1195             : 
    1196         144 :     if (IS_HIGHBIT_SET(c1))
    1197             :     {
    1198         144 :         l = 2;
    1199         144 :         if (l > len)
    1200          12 :             return -1;
    1201         132 :         if (!IS_EUC_RANGE_VALID(c1))
    1202          24 :             return -1;
    1203         108 :         c2 = *s++;
    1204         108 :         if (!IS_EUC_RANGE_VALID(c2))
    1205           0 :             return -1;
    1206             :     }
    1207             :     else
    1208             :         /* must be ASCII */
    1209             :     {
    1210           0 :         l = 1;
    1211             :     }
    1212             : 
    1213         108 :     return l;
    1214             : }
    1215             : 
    1216             : static int
    1217          72 : pg_euckr_verifystr(const unsigned char *s, int len)
    1218             : {
    1219          72 :     const unsigned char *start = s;
    1220             : 
    1221         234 :     while (len > 0)
    1222             :     {
    1223             :         int         l;
    1224             : 
    1225             :         /* fast path for ASCII-subset characters */
    1226         198 :         if (!IS_HIGHBIT_SET(*s))
    1227             :         {
    1228         108 :             if (*s == '\0')
    1229           0 :                 break;
    1230         108 :             l = 1;
    1231             :         }
    1232             :         else
    1233             :         {
    1234          90 :             l = pg_euckr_verifychar(s, len);
    1235          90 :             if (l == -1)
    1236          36 :                 break;
    1237             :         }
    1238         162 :         s += l;
    1239         162 :         len -= l;
    1240             :     }
    1241             : 
    1242          72 :     return s - start;
    1243             : }
    1244             : 
/*
 * EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-CN
 * verifiers are simply aliases for the EUC-KR ones.
 */
#define pg_euccn_verifychar pg_euckr_verifychar
#define pg_euccn_verifystr  pg_euckr_verifystr
    1248             : 
    1249             : static int
    1250          18 : pg_euctw_verifychar(const unsigned char *s, int len)
    1251             : {
    1252             :     int         l;
    1253             :     unsigned char c1,
    1254             :                 c2;
    1255             : 
    1256          18 :     c1 = *s++;
    1257             : 
    1258          18 :     switch (c1)
    1259             :     {
    1260           0 :         case SS2:               /* CNS 11643 Plane 1-7 */
    1261           0 :             l = 4;
    1262           0 :             if (l > len)
    1263           0 :                 return -1;
    1264           0 :             c2 = *s++;
    1265           0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1266           0 :                 return -1;
    1267           0 :             c2 = *s++;
    1268           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1269           0 :                 return -1;
    1270           0 :             c2 = *s++;
    1271           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1272           0 :                 return -1;
    1273           0 :             break;
    1274             : 
    1275           0 :         case SS3:               /* unused */
    1276           0 :             return -1;
    1277             : 
    1278          18 :         default:
    1279          18 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1280             :             {
    1281          18 :                 l = 2;
    1282          18 :                 if (l > len)
    1283           6 :                     return -1;
    1284             :                 /* no further range check on c1? */
    1285          12 :                 c2 = *s++;
    1286          12 :                 if (!IS_EUC_RANGE_VALID(c2))
    1287          12 :                     return -1;
    1288             :             }
    1289             :             else
    1290             :                 /* must be ASCII */
    1291             :             {
    1292           0 :                 l = 1;
    1293             :             }
    1294           0 :             break;
    1295             :     }
    1296           0 :     return l;
    1297             : }
    1298             : 
    1299             : static int
    1300          36 : pg_euctw_verifystr(const unsigned char *s, int len)
    1301             : {
    1302          36 :     const unsigned char *start = s;
    1303             : 
    1304          90 :     while (len > 0)
    1305             :     {
    1306             :         int         l;
    1307             : 
    1308             :         /* fast path for ASCII-subset characters */
    1309          72 :         if (!IS_HIGHBIT_SET(*s))
    1310             :         {
    1311          54 :             if (*s == '\0')
    1312           0 :                 break;
    1313          54 :             l = 1;
    1314             :         }
    1315             :         else
    1316             :         {
    1317          18 :             l = pg_euctw_verifychar(s, len);
    1318          18 :             if (l == -1)
    1319          18 :                 break;
    1320             :         }
    1321          54 :         s += l;
    1322          54 :         len -= l;
    1323             :     }
    1324             : 
    1325          36 :     return s - start;
    1326             : }
    1327             : 
    1328             : static int
    1329          18 : pg_johab_verifychar(const unsigned char *s, int len)
    1330             : {
    1331             :     int         l,
    1332             :                 mbl;
    1333             :     unsigned char c;
    1334             : 
    1335          18 :     l = mbl = pg_johab_mblen(s);
    1336             : 
    1337          18 :     if (len < l)
    1338           6 :         return -1;
    1339             : 
    1340          12 :     if (!IS_HIGHBIT_SET(*s))
    1341           0 :         return mbl;
    1342             : 
    1343          12 :     while (--l > 0)
    1344             :     {
    1345          12 :         c = *++s;
    1346          12 :         if (!IS_EUC_RANGE_VALID(c))
    1347          12 :             return -1;
    1348             :     }
    1349           0 :     return mbl;
    1350             : }
    1351             : 
    1352             : static int
    1353          24 : pg_johab_verifystr(const unsigned char *s, int len)
    1354             : {
    1355          24 :     const unsigned char *start = s;
    1356             : 
    1357          42 :     while (len > 0)
    1358             :     {
    1359             :         int         l;
    1360             : 
    1361             :         /* fast path for ASCII-subset characters */
    1362          36 :         if (!IS_HIGHBIT_SET(*s))
    1363             :         {
    1364          18 :             if (*s == '\0')
    1365           0 :                 break;
    1366          18 :             l = 1;
    1367             :         }
    1368             :         else
    1369             :         {
    1370          18 :             l = pg_johab_verifychar(s, len);
    1371          18 :             if (l == -1)
    1372          18 :                 break;
    1373             :         }
    1374          18 :         s += l;
    1375          18 :         len -= l;
    1376             :     }
    1377             : 
    1378          24 :     return s - start;
    1379             : }
    1380             : 
    1381             : static int
    1382        1350 : pg_mule_verifychar(const unsigned char *s, int len)
    1383             : {
    1384             :     int         l,
    1385             :                 mbl;
    1386             :     unsigned char c;
    1387             : 
    1388        1350 :     l = mbl = pg_mule_mblen(s);
    1389             : 
    1390        1350 :     if (len < l)
    1391         344 :         return -1;
    1392             : 
    1393        2032 :     while (--l > 0)
    1394             :     {
    1395        1348 :         c = *++s;
    1396        1348 :         if (!IS_HIGHBIT_SET(c))
    1397         322 :             return -1;
    1398             :     }
    1399         684 :     return mbl;
    1400             : }
    1401             : 
    1402             : static int
    1403         438 : pg_mule_verifystr(const unsigned char *s, int len)
    1404             : {
    1405         438 :     const unsigned char *start = s;
    1406             : 
    1407        1290 :     while (len > 0)
    1408             :     {
    1409             :         int         l;
    1410             : 
    1411             :         /* fast path for ASCII-subset characters */
    1412        1122 :         if (!IS_HIGHBIT_SET(*s))
    1413             :         {
    1414         690 :             if (*s == '\0')
    1415          36 :                 break;
    1416         654 :             l = 1;
    1417             :         }
    1418             :         else
    1419             :         {
    1420         432 :             l = pg_mule_verifychar(s, len);
    1421         432 :             if (l == -1)
    1422         234 :                 break;
    1423             :         }
    1424         852 :         s += l;
    1425         852 :         len -= l;
    1426             :     }
    1427             : 
    1428         438 :     return s - start;
    1429             : }
    1430             : 
    1431             : static int
    1432        7156 : pg_latin1_verifychar(const unsigned char *s, int len)
    1433             : {
    1434        7156 :     return 1;
    1435             : }
    1436             : 
    1437             : static int
    1438       11376 : pg_latin1_verifystr(const unsigned char *s, int len)
    1439             : {
    1440       11376 :     const unsigned char *nullpos = memchr(s, 0, len);
    1441             : 
    1442       11376 :     if (nullpos == NULL)
    1443       11268 :         return len;
    1444             :     else
    1445         108 :         return nullpos - s;
    1446             : }
    1447             : 
    1448             : static int
    1449        1002 : pg_sjis_verifychar(const unsigned char *s, int len)
    1450             : {
    1451             :     int         l,
    1452             :                 mbl;
    1453             :     unsigned char c1,
    1454             :                 c2;
    1455             : 
    1456        1002 :     l = mbl = pg_sjis_mblen(s);
    1457             : 
    1458        1002 :     if (len < l)
    1459         132 :         return -1;
    1460             : 
    1461         870 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1462           0 :         return mbl;
    1463             : 
    1464         870 :     c1 = *s++;
    1465         870 :     c2 = *s;
    1466         870 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1467         348 :         return -1;
    1468         522 :     return mbl;
    1469             : }
    1470             : 
    1471             : static int
    1472         546 : pg_sjis_verifystr(const unsigned char *s, int len)
    1473             : {
    1474         546 :     const unsigned char *start = s;
    1475             : 
    1476        2068 :     while (len > 0)
    1477             :     {
    1478             :         int         l;
    1479             : 
    1480             :         /* fast path for ASCII-subset characters */
    1481        1842 :         if (!IS_HIGHBIT_SET(*s))
    1482             :         {
    1483        1348 :             if (*s == '\0')
    1484          72 :                 break;
    1485        1276 :             l = 1;
    1486             :         }
    1487             :         else
    1488             :         {
    1489         494 :             l = pg_sjis_verifychar(s, len);
    1490         494 :             if (l == -1)
    1491         248 :                 break;
    1492             :         }
    1493        1522 :         s += l;
    1494        1522 :         len -= l;
    1495             :     }
    1496             : 
    1497         546 :     return s - start;
    1498             : }
    1499             : 
    1500             : static int
    1501         360 : pg_big5_verifychar(const unsigned char *s, int len)
    1502             : {
    1503             :     int         l,
    1504             :                 mbl;
    1505             : 
    1506         360 :     l = mbl = pg_big5_mblen(s);
    1507             : 
    1508         360 :     if (len < l)
    1509           6 :         return -1;
    1510             : 
    1511         354 :     if (l == 2 &&
    1512         354 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1513          12 :         s[1] == NONUTF8_INVALID_BYTE1)
    1514          12 :         return -1;
    1515             : 
    1516         576 :     while (--l > 0)
    1517             :     {
    1518         342 :         if (*++s == '\0')
    1519         108 :             return -1;
    1520             :     }
    1521             : 
    1522         234 :     return mbl;
    1523             : }
    1524             : 
    1525             : static int
    1526         162 : pg_big5_verifystr(const unsigned char *s, int len)
    1527             : {
    1528         162 :     const unsigned char *start = s;
    1529             : 
    1530         666 :     while (len > 0)
    1531             :     {
    1532             :         int         l;
    1533             : 
    1534             :         /* fast path for ASCII-subset characters */
    1535         594 :         if (!IS_HIGHBIT_SET(*s))
    1536             :         {
    1537         468 :             if (*s == '\0')
    1538          36 :                 break;
    1539         432 :             l = 1;
    1540             :         }
    1541             :         else
    1542             :         {
    1543         126 :             l = pg_big5_verifychar(s, len);
    1544         126 :             if (l == -1)
    1545          54 :                 break;
    1546             :         }
    1547         504 :         s += l;
    1548         504 :         len -= l;
    1549             :     }
    1550             : 
    1551         162 :     return s - start;
    1552             : }
    1553             : 
    1554             : static int
    1555         274 : pg_gbk_verifychar(const unsigned char *s, int len)
    1556             : {
    1557             :     int         l,
    1558             :                 mbl;
    1559             : 
    1560         274 :     l = mbl = pg_gbk_mblen(s);
    1561             : 
    1562         274 :     if (len < l)
    1563          54 :         return -1;
    1564             : 
    1565         220 :     if (l == 2 &&
    1566         220 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1567          28 :         s[1] == NONUTF8_INVALID_BYTE1)
    1568          28 :         return -1;
    1569             : 
    1570         384 :     while (--l > 0)
    1571             :     {
    1572         192 :         if (*++s == '\0')
    1573           0 :             return -1;
    1574             :     }
    1575             : 
    1576         192 :     return mbl;
    1577             : }
    1578             : 
    1579             : static int
    1580         256 : pg_gbk_verifystr(const unsigned char *s, int len)
    1581             : {
    1582         256 :     const unsigned char *start = s;
    1583             : 
    1584         658 :     while (len > 0)
    1585             :     {
    1586             :         int         l;
    1587             : 
    1588             :         /* fast path for ASCII-subset characters */
    1589         484 :         if (!IS_HIGHBIT_SET(*s))
    1590             :         {
    1591         242 :             if (*s == '\0')
    1592           0 :                 break;
    1593         242 :             l = 1;
    1594             :         }
    1595             :         else
    1596             :         {
    1597         242 :             l = pg_gbk_verifychar(s, len);
    1598         242 :             if (l == -1)
    1599          82 :                 break;
    1600             :         }
    1601         402 :         s += l;
    1602         402 :         len -= l;
    1603             :     }
    1604             : 
    1605         256 :     return s - start;
    1606             : }
    1607             : 
    1608             : static int
    1609          18 : pg_uhc_verifychar(const unsigned char *s, int len)
    1610             : {
    1611             :     int         l,
    1612             :                 mbl;
    1613             : 
    1614          18 :     l = mbl = pg_uhc_mblen(s);
    1615             : 
    1616          18 :     if (len < l)
    1617           6 :         return -1;
    1618             : 
    1619          12 :     if (l == 2 &&
    1620          12 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1621          12 :         s[1] == NONUTF8_INVALID_BYTE1)
    1622          12 :         return -1;
    1623             : 
    1624           0 :     while (--l > 0)
    1625             :     {
    1626           0 :         if (*++s == '\0')
    1627           0 :             return -1;
    1628             :     }
    1629             : 
    1630           0 :     return mbl;
    1631             : }
    1632             : 
    1633             : static int
    1634          24 : pg_uhc_verifystr(const unsigned char *s, int len)
    1635             : {
    1636          24 :     const unsigned char *start = s;
    1637             : 
    1638          42 :     while (len > 0)
    1639             :     {
    1640             :         int         l;
    1641             : 
    1642             :         /* fast path for ASCII-subset characters */
    1643          36 :         if (!IS_HIGHBIT_SET(*s))
    1644             :         {
    1645          18 :             if (*s == '\0')
    1646           0 :                 break;
    1647          18 :             l = 1;
    1648             :         }
    1649             :         else
    1650             :         {
    1651          18 :             l = pg_uhc_verifychar(s, len);
    1652          18 :             if (l == -1)
    1653          18 :                 break;
    1654             :         }
    1655          18 :         s += l;
    1656          18 :         len -= l;
    1657             :     }
    1658             : 
    1659          24 :     return s - start;
    1660             : }
    1661             : 
    1662             : static int
    1663        1212 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1664             : {
    1665             :     int         l;
    1666             : 
    1667        1212 :     if (!IS_HIGHBIT_SET(*s))
    1668           0 :         l = 1;                  /* ASCII */
    1669        1212 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1670             :     {
    1671             :         /* Should be 4-byte, validate remaining bytes */
    1672         318 :         if (*s >= 0x81 && *s <= 0xfe &&
    1673         306 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1674         306 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1675         162 :             l = 4;
    1676             :         else
    1677         156 :             l = -1;
    1678             :     }
    1679         894 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1680             :     {
    1681             :         /* Should be 2-byte, validate */
    1682         660 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1683         420 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1684         324 :             l = 2;
    1685             :         else
    1686         336 :             l = -1;
    1687             :     }
    1688             :     else
    1689         234 :         l = -1;
    1690        1212 :     return l;
    1691             : }
    1692             : 
    1693             : static int
    1694         902 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1695             : {
    1696         902 :     const unsigned char *start = s;
    1697             : 
    1698        2966 :     while (len > 0)
    1699             :     {
    1700             :         int         l;
    1701             : 
    1702             :         /* fast path for ASCII-subset characters */
    1703        2670 :         if (!IS_HIGHBIT_SET(*s))
    1704             :         {
    1705        1804 :             if (*s == '\0')
    1706          48 :                 break;
    1707        1756 :             l = 1;
    1708             :         }
    1709             :         else
    1710             :         {
    1711         866 :             l = pg_gb18030_verifychar(s, len);
    1712         866 :             if (l == -1)
    1713         558 :                 break;
    1714             :         }
    1715        2064 :         s += l;
    1716        2064 :         len -= l;
    1717             :     }
    1718             : 
    1719         902 :     return s - start;
    1720             : }
    1721             : 
    1722             : static int
    1723       17628 : pg_utf8_verifychar(const unsigned char *s, int len)
    1724             : {
    1725             :     int         l;
    1726             : 
    1727       17628 :     if ((*s & 0x80) == 0)
    1728             :     {
    1729           0 :         if (*s == '\0')
    1730           0 :             return -1;
    1731           0 :         return 1;
    1732             :     }
    1733       17628 :     else if ((*s & 0xe0) == 0xc0)
    1734        6176 :         l = 2;
    1735       11452 :     else if ((*s & 0xf0) == 0xe0)
    1736        6332 :         l = 3;
    1737        5120 :     else if ((*s & 0xf8) == 0xf0)
    1738        4856 :         l = 4;
    1739             :     else
    1740         264 :         l = 1;
    1741             : 
    1742       17628 :     if (l > len)
    1743         578 :         return -1;
    1744             : 
    1745       17050 :     if (!pg_utf8_islegal(s, l))
    1746        2362 :         return -1;
    1747             : 
    1748       14688 :     return l;
    1749             : }
    1750             : 
    1751             : /*
    1752             :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1753             :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1754             :  * input byte and current state are used to compute an index into an array of
    1755             :  * state transitions. Since the address of the next transition is dependent
    1756             :  * on this computation, there is latency in executing the load instruction,
    1757             :  * and the CPU is not kept busy.
    1758             :  *
    1759             :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1760             :  *
    1761             :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1762             :  *
    1763             :  * In a shift-based DFA, the input byte is an index into an array of integers
    1764             :  * whose bit pattern encodes the state transitions. To compute the next
    1765             :  * state, we simply right-shift the integer by the current state and apply a
    1766             :  * mask. In this scheme, the address of the transition only depends on the
    1767             :  * input byte, so there is better pipelining.
    1768             :  *
    1769             :  * The naming convention for states and transitions was adopted from a UTF-8
    1770             :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1771             :  *
    1772             :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1773             :  *
    1774             :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1775             :  * ==========================================================================
    1776             :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1777             :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1778             :  *                                                                  |
    1779             :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1780             :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1781             :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1782             :  *                                                                  |
    1783             :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1784             :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1785             :  *                                                                  |
    1786             :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1787             :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1788             :  *
    1789             :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1790             :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1791             :  * it's possible to find state numbers such that the transitions fit within
    1792             :  * 32-bit integers, as Dougall Johnson demonstrated:
    1793             :  *
    1794             :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1795             :  *
    1796             :  * This packed representation is the reason for the seemingly odd choice of
    1797             :  * state values below.
    1798             :  */
    1799             : 
    1800             : /* Error */
    1801             : #define ERR  0
    1802             : /* Begin */
    1803             : #define BGN 11
    1804             : /* Continuation states, expect 1/2/3 continuation bytes */
    1805             : #define CS1 16
    1806             : #define CS2  1
    1807             : #define CS3  5
    1808             : /* Partial states, where the first continuation byte has a restricted range */
    1809             : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
    1810             : #define P3B 20                  /* Lead was ED, check for surrogate */
    1811             : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
    1812             : #define P4B 30                  /* Lead was F4, check for too-large */
    1813             : /* Begin and End are the same state */
    1814             : #define END BGN
    1815             : 
    1816             : /* the encoded state transitions for the lookup table */
    1817             : 
    1818             : /* ASCII */
    1819             : #define ASC (END << BGN)
    1820             : /* 2-byte lead */
    1821             : #define L2A (CS1 << BGN)
    1822             : /* 3-byte lead */
    1823             : #define L3A (P3A << BGN)
    1824             : #define L3B (CS2 << BGN)
    1825             : #define L3C (P3B << BGN)
    1826             : /* 4-byte lead */
    1827             : #define L4A (P4A << BGN)
    1828             : #define L4B (CS3 << BGN)
    1829             : #define L4C (P4B << BGN)
    1830             : /* continuation byte */
    1831             : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1832             : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1833             : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1834             : /* invalid byte */
    1835             : #define ILL ERR
    1836             : 
    1837             : static const uint32 Utf8Transition[256] =
    1838             : {
    1839             :     /* ASCII */
    1840             : 
    1841             :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1842             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1843             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1844             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1845             : 
    1846             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1847             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1848             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1849             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1850             : 
    1851             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1852             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1853             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1854             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1855             : 
    1856             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1857             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1858             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1859             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1860             : 
    1861             :     /* continuation bytes */
    1862             : 
    1863             :     /* 80..8F */
    1864             :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1865             :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1866             : 
    1867             :     /* 90..9F */
    1868             :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1869             :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1870             : 
    1871             :     /* A0..BF */
    1872             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1873             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1874             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1875             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1876             : 
    1877             :     /* leading bytes */
    1878             : 
    1879             :     /* C0..DF */
    1880             :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1881             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1882             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1883             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1884             : 
    1885             :     /* E0..EF */
    1886             :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1887             :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1888             : 
    1889             :     /* F0..FF */
    1890             :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1891             :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1892             : };
    1893             : 
    1894             : static void
    1895        1750 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1896             : {
    1897             :     /* Note: We deliberately don't check the state's value here. */
    1898       57750 :     while (len > 0)
    1899             :     {
    1900             :         /*
    1901             :          * It's important that the mask value is 31: In most instruction sets,
    1902             :          * a shift by a 32-bit operand is understood to be a shift by its mod
    1903             :          * 32, so the compiler should elide the mask operation.
    1904             :          */
    1905       56000 :         *state = Utf8Transition[*s++] >> (*state & 31);
    1906       56000 :         len--;
    1907             :     }
    1908             : 
    1909        1750 :     *state &= 31;
    1910        1750 : }
    1911             : 
    1912             : static int
    1913     1219058 : pg_utf8_verifystr(const unsigned char *s, int len)
    1914             : {
    1915     1219058 :     const unsigned char *start = s;
    1916     1219058 :     const int   orig_len = len;
    1917     1219058 :     uint32      state = BGN;
    1918             : 
    1919             : /*
    1920             :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1921             :  * the compiler can unroll a longer loop, it's not worth it because we
    1922             :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1923             :  */
    1924             : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1925             : 
    1926     1219058 :     if (len >= STRIDE_LENGTH)
    1927             :     {
    1928     4166986 :         while (len >= STRIDE_LENGTH)
    1929             :         {
    1930             :             /*
    1931             :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1932             :              * but we must first check for a non-END state, which means the
    1933             :              * previous chunk ended in the middle of a multibyte sequence.
    1934             :              */
    1935     3582598 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1936        1750 :                 utf8_advance(s, &state, STRIDE_LENGTH);
    1937             : 
    1938     3582598 :             s += STRIDE_LENGTH;
    1939     3582598 :             len -= STRIDE_LENGTH;
    1940             :         }
    1941             : 
    1942             :         /* The error state persists, so we only need to check for it here. */
    1943      584388 :         if (state == ERR)
    1944             :         {
    1945             :             /*
    1946             :              * Start over from the beginning with the slow path so we can
    1947             :              * count the valid bytes.
    1948             :              */
    1949         504 :             len = orig_len;
    1950         504 :             s = start;
    1951             :         }
    1952      583884 :         else if (state != END)
    1953             :         {
    1954             :             /*
    1955             :              * The fast path exited in the middle of a multibyte sequence.
    1956             :              * Walk backwards to find the leading byte so that the slow path
    1957             :              * can resume checking from there. We must always backtrack at
    1958             :              * least one byte, since the current byte could be e.g. an ASCII
    1959             :              * byte after a 2-byte lead, which is invalid.
    1960             :              */
    1961             :             do
    1962             :             {
    1963             :                 Assert(s > start);
    1964         116 :                 s--;
    1965         116 :                 len++;
    1966             :                 Assert(IS_HIGHBIT_SET(*s));
    1967         116 :             } while (pg_utf_mblen(s) <= 1);
    1968             :         }
    1969             :     }
    1970             : 
    1971             :     /* check remaining bytes */
    1972    17970278 :     while (len > 0)
    1973             :     {
    1974             :         int         l;
    1975             : 
    1976             :         /* fast path for ASCII-subset characters */
    1977    16754302 :         if (!IS_HIGHBIT_SET(*s))
    1978             :         {
    1979    16736746 :             if (*s == '\0')
    1980         206 :                 break;
    1981    16736540 :             l = 1;
    1982             :         }
    1983             :         else
    1984             :         {
    1985       17556 :             l = pg_utf8_verifychar(s, len);
    1986       17556 :             if (l == -1)
    1987        2876 :                 break;
    1988             :         }
    1989    16751220 :         s += l;
    1990    16751220 :         len -= l;
    1991             :     }
    1992             : 
    1993     1219058 :     return s - start;
    1994             : }
    1995             : 
    1996             : /*
    1997             :  * Check for validity of a single UTF-8 encoded character
    1998             :  *
    1999             :  * This directly implements the rules in RFC3629.  The bizarre-looking
    2000             :  * restrictions on the second byte are meant to ensure that there isn't
    2001             :  * more than one encoding of a given Unicode character point; that is,
    2002             :  * you may not use a longer-than-necessary byte sequence with high order
    2003             :  * zero bits to represent a character that would fit in fewer bytes.
    2004             :  * To do otherwise is to create security hazards (eg, create an apparent
    2005             :  * non-ASCII character that decodes to plain ASCII).
    2006             :  *
    2007             :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    2008             :  * caller must have checked that that many bytes are present in the buffer.
    2009             :  */
    2010             : bool
    2011       23654 : pg_utf8_islegal(const unsigned char *source, int length)
    2012             : {
    2013             :     unsigned char a;
    2014             : 
    2015       23654 :     switch (length)
    2016             :     {
    2017           0 :         default:
    2018             :             /* reject lengths 5 and 6 for now */
    2019           0 :             return false;
    2020        4596 :         case 4:
    2021        4596 :             a = source[3];
    2022        4596 :             if (a < 0x80 || a > 0xBF)
    2023         364 :                 return false;
    2024             :             /* FALL THRU */
    2025             :         case 3:
    2026       12050 :             a = source[2];
    2027       12050 :             if (a < 0x80 || a > 0xBF)
    2028         680 :                 return false;
    2029             :             /* FALL THRU */
    2030             :         case 2:
    2031       18004 :             a = source[1];
    2032       18004 :             switch (*source)
    2033             :             {
    2034         312 :                 case 0xE0:
    2035         312 :                     if (a < 0xA0 || a > 0xBF)
    2036         264 :                         return false;
    2037          48 :                     break;
    2038         312 :                 case 0xED:
    2039         312 :                     if (a < 0x80 || a > 0x9F)
    2040         264 :                         return false;
    2041          48 :                     break;
    2042        4052 :                 case 0xF0:
    2043        4052 :                     if (a < 0x90 || a > 0xBF)
    2044         264 :                         return false;
    2045        3788 :                     break;
    2046         180 :                 case 0xF4:
    2047         180 :                     if (a < 0x80 || a > 0x8F)
    2048         132 :                         return false;
    2049          48 :                     break;
    2050       13148 :                 default:
    2051       13148 :                     if (a < 0x80 || a > 0xBF)
    2052         298 :                         return false;
    2053       12850 :                     break;
    2054             :             }
    2055             :             /* FALL THRU */
    2056             :         case 1:
    2057       21388 :             a = *source;
    2058       21388 :             if (a >= 0x80 && a < 0xC2)
    2059         396 :                 return false;
    2060       20992 :             if (a > 0xF4)
    2061         132 :                 return false;
    2062       20860 :             break;
    2063             :     }
    2064       20860 :     return true;
    2065             : }
    2066             : 
    2067             : 
    2068             : /*
    2069             :  * Fills the provided buffer with two bytes such that:
    2070             :  *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
                     :  *
                     :  * 'encoding' must be a multibyte encoding (maximum character length > 1);
                     :  * that precondition is checked only by the Assert below.  'dst' must have
                     :  * room for two bytes: exactly dst[0] and dst[1] are written, and no
                     :  * terminating zero byte is added.
                     :  *
                     :  * UTF8 uses 0xc0 as the first byte; every other encoding uses
                     :  * NONUTF8_INVALID_BYTE0.  The second byte is NONUTF8_INVALID_BYTE1 in both
                     :  * cases.
    2071             :  */
    2072             : void
    2073         412 : pg_encoding_set_invalid(int encoding, char *dst)
    2074             : {
    2075             :     Assert(pg_encoding_max_length(encoding) > 1);
    2076             : 
    2077         412 :     dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
    2078         412 :     dst[1] = NONUTF8_INVALID_BYTE1;
    2079         412 : }
    2080             : 
    2081             : /*
    2082             :  *-------------------------------------------------------------------
    2083             :  * encoding info table
                     :  *
                     :  * One entry per encoding, placed at its encoding ID by a designated
                     :  * initializer.  The accessor functions later in this file read the mblen,
                     :  * dsplen, mbverifychar, mbverifystr and maxmblen members.  Judging by the
                     :  * function names, each initializer lists, in order: conversion to wchar,
                     :  * conversion from wchar, mblen, dsplen, mbverifychar, mbverifystr and
                     :  * maxmblen (TODO: confirm member order against the pg_wchar_tbl
                     :  * declaration).
                     :  *
                     :  * The last seven entries (PG_SJIS through PG_SHIFT_JIS_2004) supply 0 for
                     :  * both wchar conversion slots, i.e. no wchar conversion functions are
                     :  * registered for them.
    2084             :  *-------------------------------------------------------------------
    2085             :  */
    2086             : const pg_wchar_tbl pg_wchar_table[] = {
    2087             :     [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
    2088             :     [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2089             :     [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3},
    2090             :     [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
    2091             :     [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
    2092             :     [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2093             :     [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
    2094             :     [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
    2095             :     [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2096             :     [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2097             :     [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2098             :     [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2099             :     [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2100             :     [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2101             :     [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2102             :     [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2103             :     [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2104             :     [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2105             :     [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2106             :     [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2107             :     [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2108             :     [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2109             :     [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2110             :     [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2111             :     [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2112             :     [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2113             :     [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2114             :     [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2115             :     [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2116             :     [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2117             :     [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2118             :     [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2119             :     [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2120             :     [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2121             :     [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2122             :     [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2123             :     [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
    2124             :     [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
    2125             :     [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
    2126             :     [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
    2127             :     [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
    2128             :     [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2129             : };
    2130             : 
    2131             : /*
    2132             :  * Returns the byte length of a multibyte character.
    2133             :  *
    2134             :  * Choose "mblen" functions based on the input string characteristics.
    2135             :  * pg_encoding_mblen() can be used when ANY of these conditions are met:
    2136             :  *
    2137             :  * - The input string is zero-terminated
    2138             :  *
    2139             :  * - The input string is known to be valid in the encoding (e.g., string
    2140             :  *   converted from database encoding)
    2141             :  *
    2142             :  * - The encoding is not GB18030 (e.g., when only database encodings are
    2143             :  *   passed to 'encoding' parameter)
    2144             :  *
    2145             :  * encoding==GB18030 requires examining up to two bytes to determine character
    2146             :  * length.  Therefore, callers satisfying none of those conditions must use
    2147             :  * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
    2148             :  * guaranteed to be within allocation bounds.
    2149             :  *
    2150             :  * When dealing with text that is not certainly valid in the specified
    2151             :  * encoding, the result may exceed the actual remaining string length.
    2152             :  * Callers that are not prepared to deal with that should use Min(remaining,
    2153             :  * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
    2154             :  * pg_encoding_mblen_bounded() are interchangeable.
                     :  *
                     :  * The length is computed by the mblen function registered for 'encoding'
                     :  * in pg_wchar_table.  An encoding ID that fails PG_VALID_ENCODING() is
                     :  * handled as PG_SQL_ASCII rather than reported as an error.
    2155             :  */
    2156             : int
    2157    44754272 : pg_encoding_mblen(int encoding, const char *mbstr)
    2158             : {
    2159    44754272 :     return (PG_VALID_ENCODING(encoding) ?
    2160    89508544 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
    2161           0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
    2162             : }
    2163             : 
    2164             : /*
    2165             :  * Returns the byte length of a multibyte character (possibly not
    2166             :  * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
                     :  *
                     :  * 'remaining' is compared only against the number of bytes needed to
                     :  * determine the length (one, or two when the encoding is GB18030 and the
                     :  * first byte has its high bit set).  It is not compared against the
                     :  * returned length, so a result other than INT_MAX may still be larger
                     :  * than 'remaining'.
    2167             :  */
    2168             : int
    2169        6410 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
    2170             :                                 size_t remaining)
    2171             : {
    2172             :     /*
    2173             :      * Define zero remaining as too few, even for single-byte encodings.
    2174             :      * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
    2175             :      * zero; others read one.
    2176             :      */
    2177        6410 :     if (remaining < 1 ||
    2178         338 :         (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
    2179          72 :         return INT_MAX;
    2180        6338 :     return pg_encoding_mblen(encoding, mbstr);
    2181             : }
    2182             : 
    2183             : /*
    2184             :  * Returns the byte length of a multibyte character; but not more than the
    2185             :  * distance to the terminating zero byte.  For input that might lack a
    2186             :  * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
                     :  *
                     :  * The bound is applied with strnlen(), which stops at the first zero byte;
                     :  * the result is therefore 0 when mbstr already points at the terminator.
    2187             :  */
    2188             : int
    2189           0 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    2190             : {
    2191           0 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
    2192             : }
    2193             : 
    2194             : /*
    2195             :  * Returns the display length of a multibyte character.
                     :  *
                     :  * The value comes from the dsplen function registered for 'encoding' in
                     :  * pg_wchar_table.  An encoding ID that fails PG_VALID_ENCODING() is handled
                     :  * as PG_SQL_ASCII rather than reported as an error.
    2196             :  */
    2197             : int
    2198    44572678 : pg_encoding_dsplen(int encoding, const char *mbstr)
    2199             : {
    2200    44572678 :     return (PG_VALID_ENCODING(encoding) ?
    2201    89145356 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
    2202           0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
    2203             : }
    2204             : 
    2205             : /*
    2206             :  * Verify the first multibyte character of the given string.
    2207             :  * Return its byte length if good, -1 if bad.  (See comments above for
    2208             :  * full details of the mbverifychar API.)
                     :  *
                     :  * 'mbstr' and 'len' are passed through unchanged to the mbverifychar
                     :  * function registered for 'encoding' in pg_wchar_table.  An encoding ID
                     :  * that fails PG_VALID_ENCODING() is verified as PG_SQL_ASCII.
    2209             :  */
    2210             : int
    2211        9790 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    2212             : {
    2213        9790 :     return (PG_VALID_ENCODING(encoding) ?
    2214       19580 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
    2215           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
    2216             : }
    2217             : 
    2218             : /*
    2219             :  * Verify that a string is valid for the given encoding.
    2220             :  * Returns the number of input bytes (<= len) that form a valid string.
    2221             :  * (See comments above for full details of the mbverifystr API.)
                     :  *
                     :  * 'mbstr' and 'len' are passed through unchanged to the mbverifystr
                     :  * function registered for 'encoding' in pg_wchar_table.  An encoding ID
                     :  * that fails PG_VALID_ENCODING() is verified as PG_SQL_ASCII.
    2222             :  */
    2223             : int
    2224      462216 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2225             : {
    2226      462216 :     return (PG_VALID_ENCODING(encoding) ?
    2227      924432 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
    2228           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
    2229             : }
    2230             : 
    2231             : /*
    2232             :  * fetch maximum length of a given encoding
                     :  *
                     :  * Returns the maxmblen member of the encoding's pg_wchar_table entry.
                     :  * Callers are expected to pass a valid encoding ID (see the Assert); if
                     :  * execution nevertheless continues with an invalid ID, the PG_SQL_ASCII
                     :  * value is returned.
    2233             :  */
    2234             : int
    2235     1168690 : pg_encoding_max_length(int encoding)
    2236             : {
    2237             :     Assert(PG_VALID_ENCODING(encoding));
    2238             : 
    2239             :     /*
    2240             :      * Check for the encoding despite the assert, due to some mingw versions
    2241             :      * otherwise issuing bogus warnings.
    2242             :      */
    2243     1168690 :     return PG_VALID_ENCODING(encoding) ?
    2244     2337380 :         pg_wchar_table[encoding].maxmblen :
    2245             :         pg_wchar_table[PG_SQL_ASCII].maxmblen;
    2246             : }

Generated by: LCOV version 1.16