LCOV - code coverage report
Current view: top level - src/common - wchar.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18beta1 Lines: 520 855 60.8 %
Date: 2025-05-08 23:15:27 Functions: 61 82 74.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * wchar.c
       4             :  *    Functions for working with multibyte characters in various encodings.
       5             :  *
       6             :  * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/common/wchar.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "c.h"
      14             : 
      15             : #include <limits.h>
      16             : 
      17             : #include "mb/pg_wchar.h"
      18             : #include "utils/ascii.h"
      19             : 
      20             : 
      21             : /*
      22             :  * In today's multibyte encodings other than UTF8, this two-byte sequence
      23             :  * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
      24             :  *
      25             :  * For historical reasons, several verifychar implementations opt to reject
      26             :  * this pair specifically.  Byte pair range constraints, in encoding
      27             :  * originator documentation, always excluded this pair.  No core conversion
      28             :  * could translate it.  However, longstanding verifychar implementations
      29             :  * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
      30             :  * pairs not valid per encoding originator documentation.  To avoid tightening
      31             :  * core or non-core conversions in a security patch, we sought this one pair.
      32             :  *
      33             :  * PQescapeString() historically used spaces for BYTE1; many other values
      34             :  * could suffice for BYTE1.
      35             :  */
      36             : #define NONUTF8_INVALID_BYTE0 (0x8d)
      37             : #define NONUTF8_INVALID_BYTE1 (' ')
      38             : 
      39             : 
      40             : /*
      41             :  * Operations on multi-byte encodings are driven by a table of helper
      42             :  * functions.
      43             :  *
      44             :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
      45             :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      46             :  * and wchar2mb() conversion functions.
      47             :  *
      48             :  * These functions generally assume that their input is validly formed.
      49             :  * The "verifier" functions, further down in the file, have to be more
      50             :  * paranoid.
      51             :  *
      52             :  * We expect that mblen() does not need to examine more than the first byte
      53             :  * of the character to discover the correct length.  GB18030 is an exception
      54             :  * to that rule, though, as it also looks at the second byte.  But even that
      55             :  * behaves in a predictable way, if you only pass the first byte: it will
      56             :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      57             :  * good enough for all current uses.
      58             :  *
      59             :  * Note: for the display output of psql to work properly, the return values
      60             :  * of the dsplen functions must conform to the Unicode standard. In particular
      61             :  * the NUL character is zero width and control characters are generally
      62             :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      63             :  * subset to the ASCII routines to ensure consistency.
      64             :  */
      65             : 
      66             : /*
      67             :  * SQL/ASCII
      68             :  */
      69             : static int
      70         762 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      71             : {
      72         762 :     int         cnt = 0;
      73             : 
      74       63428 :     while (len > 0 && *from)
      75             :     {
      76       62666 :         *to++ = *from++;
      77       62666 :         len--;
      78       62666 :         cnt++;
      79             :     }
      80         762 :     *to = 0;
      81         762 :     return cnt;
      82             : }
      83             : 
      84             : static int
      85       48234 : pg_ascii_mblen(const unsigned char *s)
      86             : {
      87       48234 :     return 1;
      88             : }
      89             : 
      90             : static int
      91       45656 : pg_ascii_dsplen(const unsigned char *s)
      92             : {
      93       45656 :     if (*s == '\0')
      94           0 :         return 0;
      95       45656 :     if (*s < 0x20 || *s == 0x7f)
      96           6 :         return -1;
      97             : 
      98       45650 :     return 1;
      99             : }
     100             : 
     101             : /*
     102             :  * EUC
     103             :  */
     104             : static int
     105           0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     106             : {
     107           0 :     int         cnt = 0;
     108             : 
     109           0 :     while (len > 0 && *from)
     110             :     {
     111           0 :         if (*from == SS2 && len >= 2)    /* JIS X 0201 (so called "1 byte
     112             :                                          * KANA") */
     113             :         {
     114           0 :             from++;
     115           0 :             *to = (SS2 << 8) | *from++;
     116           0 :             len -= 2;
     117             :         }
     118           0 :         else if (*from == SS3 && len >= 3)   /* JIS X 0212 KANJI */
     119             :         {
     120           0 :             from++;
     121           0 :             *to = (SS3 << 16) | (*from++ << 8);
     122           0 :             *to |= *from++;
     123           0 :             len -= 3;
     124             :         }
     125           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
     126             :         {
     127           0 :             *to = *from++ << 8;
     128           0 :             *to |= *from++;
     129           0 :             len -= 2;
     130             :         }
     131             :         else                    /* must be ASCII */
     132             :         {
     133           0 :             *to = *from++;
     134           0 :             len--;
     135             :         }
     136           0 :         to++;
     137           0 :         cnt++;
     138             :     }
     139           0 :     *to = 0;
     140           0 :     return cnt;
     141             : }
     142             : 
     143             : static inline int
     144         234 : pg_euc_mblen(const unsigned char *s)
     145             : {
     146             :     int         len;
     147             : 
     148         234 :     if (*s == SS2)
     149           0 :         len = 2;
     150         234 :     else if (*s == SS3)
     151           0 :         len = 3;
     152         234 :     else if (IS_HIGHBIT_SET(*s))
     153         162 :         len = 2;
     154             :     else
     155          72 :         len = 1;
     156         234 :     return len;
     157             : }
     158             : 
     159             : static inline int
     160           0 : pg_euc_dsplen(const unsigned char *s)
     161             : {
     162             :     int         len;
     163             : 
     164           0 :     if (*s == SS2)
     165           0 :         len = 2;
     166           0 :     else if (*s == SS3)
     167           0 :         len = 2;
     168           0 :     else if (IS_HIGHBIT_SET(*s))
     169           0 :         len = 2;
     170             :     else
     171           0 :         len = pg_ascii_dsplen(s);
     172           0 :     return len;
     173             : }
     174             : 
     175             : /*
     176             :  * EUC_JP
     177             :  */
     178             : static int
     179           0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     180             : {
     181           0 :     return pg_euc2wchar_with_len(from, to, len);
     182             : }
     183             : 
     184             : static int
     185         204 : pg_eucjp_mblen(const unsigned char *s)
     186             : {
     187         204 :     return pg_euc_mblen(s);
     188             : }
     189             : 
     190             : static int
     191           0 : pg_eucjp_dsplen(const unsigned char *s)
     192             : {
     193             :     int         len;
     194             : 
     195           0 :     if (*s == SS2)
     196           0 :         len = 1;
     197           0 :     else if (*s == SS3)
     198           0 :         len = 2;
     199           0 :     else if (IS_HIGHBIT_SET(*s))
     200           0 :         len = 2;
     201             :     else
     202           0 :         len = pg_ascii_dsplen(s);
     203           0 :     return len;
     204             : }
     205             : 
     206             : /*
     207             :  * EUC_KR
     208             :  */
     209             : static int
     210           0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     211             : {
     212           0 :     return pg_euc2wchar_with_len(from, to, len);
     213             : }
     214             : 
     215             : static int
     216           6 : pg_euckr_mblen(const unsigned char *s)
     217             : {
     218           6 :     return pg_euc_mblen(s);
     219             : }
     220             : 
     221             : static int
     222           0 : pg_euckr_dsplen(const unsigned char *s)
     223             : {
     224           0 :     return pg_euc_dsplen(s);
     225             : }
     226             : 
     227             : /*
     228             :  * EUC_CN
     229             :  *
     230             :  */
     231             : static int
     232           0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     233             : {
     234           0 :     int         cnt = 0;
     235             : 
     236           0 :     while (len > 0 && *from)
     237             :     {
     238           0 :         if (*from == SS2 && len >= 3)    /* code set 2 (unused?) */
     239             :         {
     240           0 :             from++;
     241           0 :             *to = (SS2 << 16) | (*from++ << 8);
     242           0 :             *to |= *from++;
     243           0 :             len -= 3;
     244             :         }
     245           0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused ?) */
     246             :         {
     247           0 :             from++;
     248           0 :             *to = (SS3 << 16) | (*from++ << 8);
     249           0 :             *to |= *from++;
     250           0 :             len -= 3;
     251             :         }
     252           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
     253             :         {
     254           0 :             *to = *from++ << 8;
     255           0 :             *to |= *from++;
     256           0 :             len -= 2;
     257             :         }
     258             :         else
     259             :         {
     260           0 :             *to = *from++;
     261           0 :             len--;
     262             :         }
     263           0 :         to++;
     264           0 :         cnt++;
     265             :     }
     266           0 :     *to = 0;
     267           0 :     return cnt;
     268             : }
     269             : 
     270             : static int
     271           6 : pg_euccn_mblen(const unsigned char *s)
     272             : {
     273             :     int         len;
     274             : 
     275           6 :     if (IS_HIGHBIT_SET(*s))
     276           6 :         len = 2;
     277             :     else
     278           0 :         len = 1;
     279           6 :     return len;
     280             : }
     281             : 
     282             : static int
     283           0 : pg_euccn_dsplen(const unsigned char *s)
     284             : {
     285             :     int         len;
     286             : 
     287           0 :     if (IS_HIGHBIT_SET(*s))
     288           0 :         len = 2;
     289             :     else
     290           0 :         len = pg_ascii_dsplen(s);
     291           0 :     return len;
     292             : }
     293             : 
     294             : /*
     295             :  * EUC_TW
     296             :  *
     297             :  */
     298             : static int
     299           0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     300             : {
     301           0 :     int         cnt = 0;
     302             : 
     303           0 :     while (len > 0 && *from)
     304             :     {
     305           0 :         if (*from == SS2 && len >= 4)    /* code set 2 */
     306             :         {
     307           0 :             from++;
     308           0 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);
     309           0 :             *to |= *from++ << 8;
     310           0 :             *to |= *from++;
     311           0 :             len -= 4;
     312             :         }
     313           0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused?) */
     314             :         {
     315           0 :             from++;
     316           0 :             *to = (SS3 << 16) | (*from++ << 8);
     317           0 :             *to |= *from++;
     318           0 :             len -= 3;
     319             :         }
     320           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
     321             :         {
     322           0 :             *to = *from++ << 8;
     323           0 :             *to |= *from++;
     324           0 :             len -= 2;
     325             :         }
     326             :         else
     327             :         {
     328           0 :             *to = *from++;
     329           0 :             len--;
     330             :         }
     331           0 :         to++;
     332           0 :         cnt++;
     333             :     }
     334           0 :     *to = 0;
     335           0 :     return cnt;
     336             : }
     337             : 
     338             : static int
     339           6 : pg_euctw_mblen(const unsigned char *s)
     340             : {
     341             :     int         len;
     342             : 
     343           6 :     if (*s == SS2)
     344           0 :         len = 4;
     345           6 :     else if (*s == SS3)
     346           0 :         len = 3;
     347           6 :     else if (IS_HIGHBIT_SET(*s))
     348           6 :         len = 2;
     349             :     else
     350           0 :         len = 1;
     351           6 :     return len;
     352             : }
     353             : 
     354             : static int
     355           0 : pg_euctw_dsplen(const unsigned char *s)
     356             : {
     357             :     int         len;
     358             : 
     359           0 :     if (*s == SS2)
     360           0 :         len = 2;
     361           0 :     else if (*s == SS3)
     362           0 :         len = 2;
     363           0 :     else if (IS_HIGHBIT_SET(*s))
     364           0 :         len = 2;
     365             :     else
     366           0 :         len = pg_ascii_dsplen(s);
     367           0 :     return len;
     368             : }
     369             : 
     370             : /*
     371             :  * Convert pg_wchar to EUC_* encoding.
     372             :  * caller must allocate enough space for "to", including a trailing zero!
     373             :  * len: length of from.
     374             :  * "from" not necessarily null terminated.
     375             :  */
     376             : static int
     377           0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     378             : {
     379           0 :     int         cnt = 0;
     380             : 
     381           0 :     while (len > 0 && *from)
     382             :     {
     383             :         unsigned char c;
     384             : 
     385           0 :         if ((c = (*from >> 24)))
     386             :         {
     387           0 :             *to++ = c;
     388           0 :             *to++ = (*from >> 16) & 0xff;
     389           0 :             *to++ = (*from >> 8) & 0xff;
     390           0 :             *to++ = *from & 0xff;
     391           0 :             cnt += 4;
     392             :         }
     393           0 :         else if ((c = (*from >> 16)))
     394             :         {
     395           0 :             *to++ = c;
     396           0 :             *to++ = (*from >> 8) & 0xff;
     397           0 :             *to++ = *from & 0xff;
     398           0 :             cnt += 3;
     399             :         }
     400           0 :         else if ((c = (*from >> 8)))
     401             :         {
     402           0 :             *to++ = c;
     403           0 :             *to++ = *from & 0xff;
     404           0 :             cnt += 2;
     405             :         }
     406             :         else
     407             :         {
     408           0 :             *to++ = *from;
     409           0 :             cnt++;
     410             :         }
     411           0 :         from++;
     412           0 :         len--;
     413             :     }
     414           0 :     *to = 0;
     415           0 :     return cnt;
     416             : }
     417             : 
     418             : 
     419             : /*
     420             :  * JOHAB
     421             :  */
     422             : static int
     423          24 : pg_johab_mblen(const unsigned char *s)
     424             : {
     425          24 :     return pg_euc_mblen(s);
     426             : }
     427             : 
     428             : static int
     429           0 : pg_johab_dsplen(const unsigned char *s)
     430             : {
     431           0 :     return pg_euc_dsplen(s);
     432             : }
     433             : 
     434             : /*
     435             :  * convert UTF8 string to pg_wchar (UCS-4)
     436             :  * caller must allocate enough space for "to", including a trailing zero!
     437             :  * len: length of from.
     438             :  * "from" not necessarily null terminated.
     439             :  */
     440             : static int
     441     7059854 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     442             : {
     443     7059854 :     int         cnt = 0;
     444             :     uint32      c1,
     445             :                 c2,
     446             :                 c3,
     447             :                 c4;
     448             : 
     449   148768158 :     while (len > 0 && *from)
     450             :     {
     451   141708304 :         if ((*from & 0x80) == 0)
     452             :         {
     453   141707848 :             *to = *from++;
     454   141707848 :             len--;
     455             :         }
     456         456 :         else if ((*from & 0xe0) == 0xc0)
     457             :         {
     458         364 :             if (len < 2)
     459           0 :                 break;          /* drop trailing incomplete char */
     460         364 :             c1 = *from++ & 0x1f;
     461         364 :             c2 = *from++ & 0x3f;
     462         364 :             *to = (c1 << 6) | c2;
     463         364 :             len -= 2;
     464             :         }
     465          92 :         else if ((*from & 0xf0) == 0xe0)
     466             :         {
     467          92 :             if (len < 3)
     468           0 :                 break;          /* drop trailing incomplete char */
     469          92 :             c1 = *from++ & 0x0f;
     470          92 :             c2 = *from++ & 0x3f;
     471          92 :             c3 = *from++ & 0x3f;
     472          92 :             *to = (c1 << 12) | (c2 << 6) | c3;
     473          92 :             len -= 3;
     474             :         }
     475           0 :         else if ((*from & 0xf8) == 0xf0)
     476             :         {
     477           0 :             if (len < 4)
     478           0 :                 break;          /* drop trailing incomplete char */
     479           0 :             c1 = *from++ & 0x07;
     480           0 :             c2 = *from++ & 0x3f;
     481           0 :             c3 = *from++ & 0x3f;
     482           0 :             c4 = *from++ & 0x3f;
     483           0 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     484           0 :             len -= 4;
     485             :         }
     486             :         else
     487             :         {
     488             :             /* treat a bogus char as length 1; not ours to raise error */
     489           0 :             *to = *from++;
     490           0 :             len--;
     491             :         }
     492   141708304 :         to++;
     493   141708304 :         cnt++;
     494             :     }
     495     7059854 :     *to = 0;
     496     7059854 :     return cnt;
     497             : }
     498             : 
     499             : 
     500             : /*
     501             :  * Trivial conversion from pg_wchar to UTF-8.
     502             :  * caller should allocate enough space for "to"
     503             :  * len: length of from.
     504             :  * "from" not necessarily null terminated.
     505             :  */
     506             : static int
     507     1115096 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     508             : {
     509     1115096 :     int         cnt = 0;
     510             : 
     511    16791890 :     while (len > 0 && *from)
     512             :     {
     513             :         int         char_len;
     514             : 
     515    15676794 :         unicode_to_utf8(*from, to);
     516    15676794 :         char_len = pg_utf_mblen(to);
     517    15676794 :         cnt += char_len;
     518    15676794 :         to += char_len;
     519    15676794 :         from++;
     520    15676794 :         len--;
     521             :     }
     522     1115096 :     *to = 0;
     523     1115096 :     return cnt;
     524             : }
     525             : 
     526             : /*
     527             :  * Return the byte length of a UTF8 character pointed to by s
     528             :  *
     529             :  * Note: in the current implementation we do not support UTF8 sequences
     530             :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     531             :  * We return "1" for any leading byte that is either flat-out illegal or
     532             :  * indicates a length larger than we support.
     533             :  *
     534             :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     535             :  * other places would need to be fixed to change this.
     536             :  */
     537             : int
     538   322758862 : pg_utf_mblen(const unsigned char *s)
     539             : {
     540             :     int         len;
     541             : 
     542   322758862 :     if ((*s & 0x80) == 0)
     543   322729482 :         len = 1;
     544       29380 :     else if ((*s & 0xe0) == 0xc0)
     545       15178 :         len = 2;
     546       14202 :     else if ((*s & 0xf0) == 0xe0)
     547        9656 :         len = 3;
     548        4546 :     else if ((*s & 0xf8) == 0xf0)
     549        4372 :         len = 4;
     550             : #ifdef NOT_USED
     551             :     else if ((*s & 0xfc) == 0xf8)
     552             :         len = 5;
     553             :     else if ((*s & 0xfe) == 0xfc)
     554             :         len = 6;
     555             : #endif
     556             :     else
     557         174 :         len = 1;
     558   322758862 :     return len;
     559             : }
     560             : 
     561             : /*
     562             :  * This is an implementation of wcwidth() and wcswidth() as defined in
     563             :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     564             :  * <http://www.unix.org/online.html>
     565             :  *
     566             :  * Markus Kuhn -- 2001-09-08 -- public domain
     567             :  *
     568             :  * customised for PostgreSQL
     569             :  *
     570             :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     571             :  */
     572             : 
     573             : struct mbinterval
     574             : {
     575             :     unsigned int first;
     576             :     unsigned int last;
     577             : };
     578             : 
     579             : /* auxiliary function for binary search in interval table */
     580             : static int
     581   107916304 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
     582             : {
     583   107916304 :     int         min = 0;
     584             :     int         mid;
     585             : 
     586   107916304 :     if (ucs < table[0].first || ucs > table[max].last)
     587   107905232 :         return 0;
     588       97016 :     while (max >= min)
     589             :     {
     590       86880 :         mid = (min + max) / 2;
     591       86880 :         if (ucs > table[mid].last)
     592       17560 :             min = mid + 1;
     593       69320 :         else if (ucs < table[mid].first)
     594       68384 :             max = mid - 1;
     595             :         else
     596         936 :             return 1;
     597             :     }
     598             : 
     599       10136 :     return 0;
     600             : }
     601             : 
     602             : 
     603             : /* The following functions define the column width of an ISO 10646
     604             :  * character as follows:
     605             :  *
     606             :  *    - The null character (U+0000) has a column width of 0.
     607             :  *
     608             :  *    - Other C0/C1 control characters and DEL will lead to a return
     609             :  *      value of -1.
     610             :  *
     611             :  *    - Non-spacing and enclosing combining characters (general
     612             :  *      category code Mn, Me or Cf in the Unicode database) have a
     613             :  *      column width of 0.
     614             :  *
     615             :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     616             :  *      FullWidth (F) category as defined in Unicode Technical
     617             :  *      Report #11 have a column width of 2.
     618             :  *
     619             :  *    - All remaining characters (including all printable
     620             :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     621             :  *      etc.) have a column width of 1.
     622             :  *
     623             :  * This implementation assumes that wchar_t characters are encoded
     624             :  * in ISO 10646.
     625             :  */
     626             : 
     627             : static int
     628    54021020 : ucs_wcwidth(pg_wchar ucs)
     629             : {
     630             : #include "common/unicode_nonspacing_table.h"
     631             : #include "common/unicode_east_asian_fw_table.h"
     632             : 
     633             :     /* NUL is zero width; control and out-of-range characters get -1 */
     634    54021020 :     if (ucs == 0)
     635           0 :         return 0;
     636             : 
     637    54021020 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
     638       62544 :         return -1;
     639             : 
     640             :     /*
     641             :      * binary search in table of non-spacing characters
     642             :      *
     643             :      * XXX: In the official Unicode sources, it is possible for a character to
     644             :      * be described as both non-spacing and wide at the same time. As of
     645             :      * Unicode 13.0, treating the non-spacing property as the determining
     646             :      * factor for display width leads to the correct behavior, so do that
     647             :      * search first.
     648             :      */
     649    53958476 :     if (mbbisearch(ucs, nonspacing,
     650             :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
     651         648 :         return 0;
     652             : 
     653             :     /* binary search in table of wide characters */
     654    53957828 :     if (mbbisearch(ucs, east_asian_fw,
     655             :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
     656         288 :         return 2;
     657             : 
     658    53957540 :     return 1;
     659             : }
     660             : 
     661             : static int
     662    54021020 : pg_utf_dsplen(const unsigned char *s)
     663             : {
     664    54021020 :     return ucs_wcwidth(utf8_to_unicode(s));
     665             : }
     666             : 
     667             : /*
     668             :  * Convert a MULE internal code string to pg_wchar.
     669             :  * The caller must allocate enough space for "to".
     670             :  * len: length of "from" in bytes.
     671             :  * "from" is not necessarily null terminated.
                     :  *
                     :  * Conversion stops at the first zero byte or once len bytes have been
                     :  * consumed, whichever comes first.  A zero element is always stored after
                     :  * the last converted character, so "to" needs room for it as well.
                     :  * Returns the number of pg_wchar elements written, not counting that
                     :  * terminator.
                     :  *
                     :  * When the leading byte announces a multibyte form but fewer bytes than
                     :  * that form needs remain, the byte falls through to the final branch and
                     :  * is copied as a single-byte character.
     672             :  */
     673             : static int
     674           0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     675             : {
     676           0 :     int         cnt = 0;
     677             : 
     678           0 :     while (len > 0 && *from)
     679             :     {
     680           0 :         if (IS_LC1(*from) && len >= 2)
     681             :         {
     682           0 :             *to = *from++ << 16;    /* leading byte goes to bits 16..23 */
     683           0 :             *to |= *from++;         /* second byte goes to bits 0..7 */
     684           0 :             len -= 2;
     685             :         }
     686           0 :         else if (IS_LCPRV1(*from) && len >= 3)
     687             :         {
     688           0 :             from++;                 /* first byte is skipped, not stored */
     689           0 :             *to = *from++ << 16;
     690           0 :             *to |= *from++;
     691           0 :             len -= 3;
     692             :         }
     693           0 :         else if (IS_LC2(*from) && len >= 3)
     694             :         {
     695           0 :             *to = *from++ << 16;
     696           0 :             *to |= *from++ << 8;
     697           0 :             *to |= *from++;
     698           0 :             len -= 3;
     699             :         }
     700           0 :         else if (IS_LCPRV2(*from) && len >= 4)
     701             :         {
     702           0 :             from++;                 /* first byte is skipped, not stored */
     703           0 :             *to = *from++ << 16;
     704           0 :             *to |= *from++ << 8;
     705           0 :             *to |= *from++;
     706           0 :             len -= 4;
     707             :         }
     708             :         else
     709             :         {                       /* assume ASCII */
     710           0 :             *to = (unsigned char) *from++;
     711           0 :             len--;
     712             :         }
     713           0 :         to++;
     714           0 :         cnt++;
     715             :     }
     716           0 :     *to = 0;
     717           0 :     return cnt;
     718             : }
     719             : 
     720             : /*
     721             :  * Convert a pg_wchar string to MULE internal code.
     722             :  * The caller must allocate enough space for "to".
     723             :  * len: length of "from", counted in pg_wchar elements.
     724             :  * "from" is not necessarily null terminated.
                     :  *
                     :  * Conversion stops at the first zero element or after len elements.  The
                     :  * output form of each character is chosen from bits 16..23 of the wide
                     :  * character (lb below).  A zero byte is always stored after the output,
                     :  * so "to" needs room for it as well.
                     :  * Returns the number of bytes written, not counting that terminator.
     725             :  */
     726             : static int
     727           0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     728             : {
     729           0 :     int         cnt = 0;
     730             : 
     731           0 :     while (len > 0 && *from)
     732             :     {
     733             :         unsigned char lb;
     734             : 
     735           0 :         lb = (*from >> 16) & 0xff;
     736           0 :         if (IS_LC1(lb))
     737             :         {
     738           0 :             *to++ = lb;
     739           0 :             *to++ = *from & 0xff;
     740           0 :             cnt += 2;
     741             :         }
     742           0 :         else if (IS_LC2(lb))
     743             :         {
     744           0 :             *to++ = lb;
     745           0 :             *to++ = (*from >> 8) & 0xff;
     746           0 :             *to++ = *from & 0xff;
     747           0 :             cnt += 3;
     748             :         }
     749           0 :         else if (IS_LCPRV1_A_RANGE(lb))
     750             :         {
     751           0 :             *to++ = LCPRV1_A;       /* emit the LCPRV1_A byte ahead of lb */
     752           0 :             *to++ = lb;
     753           0 :             *to++ = *from & 0xff;
     754           0 :             cnt += 3;
     755             :         }
     756           0 :         else if (IS_LCPRV1_B_RANGE(lb))
     757             :         {
     758           0 :             *to++ = LCPRV1_B;       /* emit the LCPRV1_B byte ahead of lb */
     759           0 :             *to++ = lb;
     760           0 :             *to++ = *from & 0xff;
     761           0 :             cnt += 3;
     762             :         }
     763           0 :         else if (IS_LCPRV2_A_RANGE(lb))
     764             :         {
     765           0 :             *to++ = LCPRV2_A;       /* emit the LCPRV2_A byte ahead of lb */
     766           0 :             *to++ = lb;
     767           0 :             *to++ = (*from >> 8) & 0xff;
     768           0 :             *to++ = *from & 0xff;
     769           0 :             cnt += 4;
     770             :         }
     771           0 :         else if (IS_LCPRV2_B_RANGE(lb))
     772             :         {
     773           0 :             *to++ = LCPRV2_B;       /* emit the LCPRV2_B byte ahead of lb */
     774           0 :             *to++ = lb;
     775           0 :             *to++ = (*from >> 8) & 0xff;
     776           0 :             *to++ = *from & 0xff;
     777           0 :             cnt += 4;
     778             :         }
     779             :         else
     780             :         {
     781           0 :             *to++ = *from & 0xff;   /* anything else: emit the low byte only */
     782           0 :             cnt += 1;
     783             :         }
     784           0 :         from++;
     785           0 :         len--;
     786             :     }
     787           0 :     *to = 0;
     788           0 :     return cnt;
     789             : }
     790             : 
     791             : /* exported for direct use by conv.c
                     :  *
                     :  * Byte length of the MULE internal code character at s, decided from the
                     :  * first byte only: 2 when IS_LC1() matches, 3 for IS_LCPRV1() or IS_LC2(),
                     :  * 4 for IS_LCPRV2(), and 1 for anything else.  The tests are tried in
                     :  * that order and the first match wins.
                     :  */
     792             : int
     793        3024 : pg_mule_mblen(const unsigned char *s)
     794             : {
     795             :     int         len;
     796             : 
     797        3024 :     if (IS_LC1(*s))
     798        1220 :         len = 2;
     799        1804 :     else if (IS_LCPRV1(*s))
     800           0 :         len = 3;
     801        1804 :     else if (IS_LC2(*s))
     802        1710 :         len = 3;
     803          94 :     else if (IS_LCPRV2(*s))
     804          40 :         len = 4;
     805             :     else
     806          54 :         len = 1;                /* assume ASCII */
     807        3024 :     return len;
     808             : }
     809             : /*
                     :  * Display width of the MULE internal code character at s, decided from
                     :  * the first byte only: 1 for IS_LC1() and IS_LCPRV1(), 2 for IS_LC2() and
                     :  * IS_LCPRV2(), and 1 for anything else.
                     :  */
     810             : static int
     811           0 : pg_mule_dsplen(const unsigned char *s)
     812             : {
     813             :     int         len;
     814             : 
     815             :     /*
     816             :      * Note: it's not really appropriate to assume that all multibyte charsets
     817             :      * are double-wide on screen.  But this seems an okay approximation for
     818             :      * the MULE charsets we currently support.
     819             :      */
     820             : 
     821           0 :     if (IS_LC1(*s))
     822           0 :         len = 1;
     823           0 :     else if (IS_LCPRV1(*s))
     824           0 :         len = 1;
     825           0 :     else if (IS_LC2(*s))
     826           0 :         len = 2;
     827           0 :     else if (IS_LCPRV2(*s))
     828           0 :         len = 2;
     829             :     else
     830           0 :         len = 1;                /* assume ASCII */
     831             : 
     832           0 :     return len;
     833             : }
     834             : 
     835             : /*
     836             :  * ISO8859-1
                     :  *
                     :  * pg_latin12wchar_with_len: copy each input byte into one pg_wchar.
                     :  * Copying stops at the first zero byte or after len bytes.  A zero
                     :  * element is always stored after the output, so "to" needs room for it.
                     :  * Returns the number of characters copied, not counting that terminator.
     837             :  */
     838             : static int
     839        1070 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     840             : {
     841        1070 :     int         cnt = 0;
     842             : 
     843       30004 :     while (len > 0 && *from)
     844             :     {
     845       28934 :         *to++ = *from++;
     846       28934 :         len--;
     847       28934 :         cnt++;
     848             :     }
     849        1070 :     *to = 0;
     850        1070 :     return cnt;
     851             : }
     852             : 
     853             : /*
     854             :  * Trivial conversion from pg_wchar to a single-byte encoding.  Just ignores
     855             :  * the high bits of each wide character.
     856             :  * The caller must allocate enough space for "to".
     857             :  * len: length of "from", counted in pg_wchar elements.
     858             :  * "from" is not necessarily null terminated.
                     :  *
                     :  * Copying stops at the first zero element or after len elements.  A zero
                     :  * byte is always stored after the output.  Returns the number of bytes
                     :  * written, not counting that terminator.
     859             :  */
     860             : static int
     861         150 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     862             : {
     863         150 :     int         cnt = 0;
     864             : 
     865        1356 :     while (len > 0 && *from)
     866             :     {
     867        1206 :         *to++ = *from++;        /* assignment to unsigned char drops the high bits */
     868        1206 :         len--;
     869        1206 :         cnt++;
     870             :     }
     871         150 :     *to = 0;
     872         150 :     return cnt;
     873             : }
     874             : /*
                     :  * Byte length of a character in a single-byte encoding: always 1.  The
                     :  * input byte is not examined.
                     :  */
     875             : static int
     876        8428 : pg_latin1_mblen(const unsigned char *s)
     877             : {
     878        8428 :     return 1;
     879             : }
     880             : /*
                     :  * Display width of the byte at s; simply delegates to pg_ascii_dsplen().
                     :  */
     881             : static int
     882        1232 : pg_latin1_dsplen(const unsigned char *s)
     883             : {
     884        1232 :     return pg_ascii_dsplen(s);
     885             : }
     886             : 
     887             : /*
     888             :  * SJIS
                     :  *
                     :  * pg_sjis_mblen: byte length of the character at s, decided from the first
                     :  * byte only.  Bytes 0xa1..0xdf are one-byte characters, any other byte
                     :  * with the high bit set starts a two-byte character, and everything else
                     :  * is one byte.
     889             :  */
     890             : static int
     891        1690 : pg_sjis_mblen(const unsigned char *s)
     892             : {
     893             :     int         len;
     894             : 
     895        1690 :     if (*s >= 0xa1 && *s <= 0xdf)
     896           0 :         len = 1;                /* 1 byte kana? */
     897        1690 :     else if (IS_HIGHBIT_SET(*s))
     898        1314 :         len = 2;                /* kanji? */
     899             :     else
     900         376 :         len = 1;                /* should be ASCII */
     901        1690 :     return len;
     902             : }
     903             : /*
                     :  * Display width of the SJIS character at s, decided from the first byte
                     :  * only: 1 for bytes 0xa1..0xdf, 2 for any other byte with the high bit
                     :  * set, and whatever pg_ascii_dsplen() reports otherwise.
                     :  */
     904             : static int
     905           0 : pg_sjis_dsplen(const unsigned char *s)
     906             : {
     907             :     int         len;
     908             : 
     909           0 :     if (*s >= 0xa1 && *s <= 0xdf)
     910           0 :         len = 1;                /* 1 byte kana? */
     911           0 :     else if (IS_HIGHBIT_SET(*s))
     912           0 :         len = 2;                /* kanji? */
     913             :     else
     914           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     915           0 :     return len;
     916             : }
     917             : 
     918             : /*
     919             :  * Big5
                     :  *
                     :  * pg_big5_mblen: byte length of the character at s, decided from the first
                     :  * byte only: 2 when its high bit is set, otherwise 1.
     920             :  */
     921             : static int
     922         492 : pg_big5_mblen(const unsigned char *s)
     923             : {
     924             :     int         len;
     925             : 
     926         492 :     if (IS_HIGHBIT_SET(*s))
     927         438 :         len = 2;                /* kanji? */
     928             :     else
     929          54 :         len = 1;                /* should be ASCII */
     930         492 :     return len;
     931             : }
     932             : /*
                     :  * Display width of the Big5 character at s: 2 when the first byte has its
                     :  * high bit set, otherwise whatever pg_ascii_dsplen() reports.
                     :  */
     933             : static int
     934           0 : pg_big5_dsplen(const unsigned char *s)
     935             : {
     936             :     int         len;
     937             : 
     938           0 :     if (IS_HIGHBIT_SET(*s))
     939           0 :         len = 2;                /* kanji? */
     940             :     else
     941           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     942           0 :     return len;
     943             : }
     944             : 
     945             : /*
     946             :  * GBK
                     :  *
                     :  * pg_gbk_mblen: byte length of the character at s, decided from the first
                     :  * byte only: 2 when its high bit is set, otherwise 1.
     947             :  */
     948             : static int
     949         556 : pg_gbk_mblen(const unsigned char *s)
     950             : {
     951             :     int         len;
     952             : 
     953         556 :     if (IS_HIGHBIT_SET(*s))
     954         416 :         len = 2;                /* kanji? */
     955             :     else
     956         140 :         len = 1;                /* should be ASCII */
     957         556 :     return len;
     958             : }
     959             : /*
                     :  * Display width of the GBK character at s: 2 when the first byte has its
                     :  * high bit set, otherwise whatever pg_ascii_dsplen() reports.
                     :  */
     960             : static int
     961           0 : pg_gbk_dsplen(const unsigned char *s)
     962             : {
     963             :     int         len;
     964             : 
     965           0 :     if (IS_HIGHBIT_SET(*s))
     966           0 :         len = 2;                /* kanji? */
     967             :     else
     968           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     969           0 :     return len;
     970             : }
     971             : 
     972             : /*
     973             :  * UHC
                     :  *
                     :  * pg_uhc_mblen: byte length of the character at s, decided from the first
                     :  * byte only: 2 when its high bit is set, otherwise 1.
     974             :  */
     975             : static int
     976          24 : pg_uhc_mblen(const unsigned char *s)
     977             : {
     978             :     int         len;
     979             : 
     980          24 :     if (IS_HIGHBIT_SET(*s))
     981          24 :         len = 2;                /* 2byte? */
     982             :     else
     983           0 :         len = 1;                /* should be ASCII */
     984          24 :     return len;
     985             : }
     986             : /*
                     :  * Display width of the UHC character at s: 2 when the first byte has its
                     :  * high bit set, otherwise whatever pg_ascii_dsplen() reports.
                     :  */
     987             : static int
     988           0 : pg_uhc_dsplen(const unsigned char *s)
     989             : {
     990             :     int         len;
     991             : 
     992           0 :     if (IS_HIGHBIT_SET(*s))
     993           0 :         len = 2;                /* 2byte? */
     994             :     else
     995           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     996           0 :     return len;
     997             : }
     998             : 
     999             : /*
    1000             :  * GB18030
    1001             :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
    1002             :  */
    1003             : 
    1004             : /*
    1005             :  * Unlike all other mblen() functions, this also looks at the second byte of
    1006             :  * the input.  However, if you only pass the first byte of a multi-byte
    1007             :  * string, and \0 as the second byte, this still works in a predictable way:
    1008             :  * a 4-byte character will be reported as two 2-byte characters.  That's
    1009             :  * enough for all current uses, as a client-only encoding.  It works that
    1010             :  * way, because in any valid 4-byte GB18030-encoded character, the third and
    1011             :  * fourth byte look like a 2-byte encoded character, when looked at
    1012             :  * separately.
                     :  *
                     :  * Result: 1 when the first byte has its high bit clear; otherwise 4 when
                     :  * the second byte is in 0x30..0x39, and 2 for any other second byte.
                     :  * The second byte is read only when the first has its high bit set, so in
                     :  * that case s[1] must be readable.
    1013             :  */
    1014             : static int
    1015        1158 : pg_gb18030_mblen(const unsigned char *s)
    1016             : {
    1017             :     int         len;
    1018             : 
    1019        1158 :     if (!IS_HIGHBIT_SET(*s))
    1020         684 :         len = 1;                /* ASCII */
    1021         474 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1022         186 :         len = 4;
    1023             :     else
    1024         288 :         len = 2;
    1025        1158 :     return len;
    1026             : }
    1027             : /*
                     :  * Display width of the GB18030 character at s: 2 when the first byte has
                     :  * its high bit set (the second byte is not consulted, so 2-byte and 4-byte
                     :  * characters are treated alike), otherwise whatever pg_ascii_dsplen()
                     :  * reports.
                     :  */
    1028             : static int
    1029           0 : pg_gb18030_dsplen(const unsigned char *s)
    1030             : {
    1031             :     int         len;
    1032             : 
    1033           0 :     if (IS_HIGHBIT_SET(*s))
    1034           0 :         len = 2;
    1035             :     else
    1036           0 :         len = pg_ascii_dsplen(s);   /* ASCII */
    1037           0 :     return len;
    1038             : }
    1039             : 
    1040             : /*
    1041             :  *-------------------------------------------------------------------
    1042             :  * multibyte sequence validators
    1043             :  *
    1044             :  * The verifychar functions accept "s", a pointer to the first byte of a
    1045             :  * string, and "len", the remaining length of the string.  If there is a
    1046             :  * validly encoded character beginning at *s, return its length in bytes;
    1047             :  * else return -1.
    1048             :  *
    1049             :  * The verifystr functions also accept "s", a pointer to a string and "len",
    1050             :  * the length of the string.  They verify the whole string, and return the
    1051             :  * number of input bytes (<= len) that are valid.  In other words, if the
    1052             :  * whole string is valid, verifystr returns "len", otherwise it returns the
    1053             :  * byte offset of the first invalid character.  The verifystr functions must
    1054             :  * test for and reject zeroes in the input.
    1055             :  *
    1056             :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
    1057             :  * they must test for and reject zeroes in any additional bytes of a
    1058             :  * multibyte character.  Note that this definition allows the function for a
    1059             :  * single-byte encoding to be just "return 1".
    1060             :  *-------------------------------------------------------------------
    1061             :  */
    1062             : static int
    1063         322 : pg_ascii_verifychar(const unsigned char *s, int len)
    1064             : {
    1065         322 :     return 1;                   /* always one byte; s and len are not examined */
    1066             : }
    1067             : /*
                     :  * Verify len bytes at s.  The only thing rejected is a zero byte: returns
                     :  * len when none is present, otherwise the offset of the first zero byte.
                     :  */
    1068             : static int
    1069      423418 : pg_ascii_verifystr(const unsigned char *s, int len)
    1070             : {
    1071      423418 :     const unsigned char *nullpos = memchr(s, 0, len);
    1072             : 
    1073      423418 :     if (nullpos == NULL)
    1074      423418 :         return len;
    1075             :     else
    1076           0 :         return nullpos - s;
    1077             : }
    1078             : /*
                     :  * True when byte c lies in 0xa1..0xfe inclusive.  The argument is
                     :  * evaluated twice, so it must not have side effects.
                     :  */
    1079             : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
    1080             : /*
                     :  * Verify one character starting at s, with len bytes available.  Returns
                     :  * the character's byte length, or -1 if it is truncated or malformed.
                     :  *
                     :  *  - first byte SS2: 2 bytes, the second must be in 0xa1..0xdf
                     :  *  - first byte SS3: 3 bytes, the second and third must both satisfy
                     :  *    IS_EUC_RANGE_VALID()
                     :  *  - any other byte with the high bit set: 2 bytes, and both bytes must
                     :  *    satisfy IS_EUC_RANGE_VALID()
                     :  *  - anything else: 1 byte, accepted as is
                     :  *
                     :  * In every multibyte case the available length is checked before any
                     :  * trailing byte is read.
                     :  */
    1081             : static int
    1082         504 : pg_eucjp_verifychar(const unsigned char *s, int len)
    1083             : {
    1084             :     int         l;
    1085             :     unsigned char c1,
    1086             :                 c2;
    1087             : 
    1088         504 :     c1 = *s++;
    1089             : 
    1090         504 :     switch (c1)
    1091             :     {
    1092           0 :         case SS2:               /* JIS X 0201 */
    1093           0 :             l = 2;
    1094           0 :             if (l > len)
    1095           0 :                 return -1;
    1096           0 :             c2 = *s++;
    1097           0 :             if (c2 < 0xa1 || c2 > 0xdf)
    1098           0 :                 return -1;
    1099           0 :             break;
    1100             : 
    1101           0 :         case SS3:               /* JIS X 0212 */
    1102           0 :             l = 3;
    1103           0 :             if (l > len)
    1104           0 :                 return -1;
    1105           0 :             c2 = *s++;
    1106           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1107           0 :                 return -1;
    1108           0 :             c2 = *s++;          /* c2 is reused for the third byte */
    1109           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1110           0 :                 return -1;
    1111           0 :             break;
    1112             : 
    1113         504 :         default:
    1114         504 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1115             :             {
    1116         504 :                 l = 2;
    1117         504 :                 if (l > len)
    1118          84 :                     return -1;
    1119         420 :                 if (!IS_EUC_RANGE_VALID(c1))
    1120          24 :                     return -1;
    1121         396 :                 c2 = *s++;
    1122         396 :                 if (!IS_EUC_RANGE_VALID(c2))
    1123         180 :                     return -1;
    1124             :             }
    1125             :             else
    1126             :                 /* must be ASCII */
    1127             :             {
    1128           0 :                 l = 1;
    1129             :             }
    1130         216 :             break;
    1131             :     }
    1132             : 
    1133         216 :     return l;
    1134             : }
    1135             : /*
                     :  * Verify len bytes at s.  Returns the number of leading bytes that are
                     :  * valid: len when the whole input validates, otherwise the offset of the
                     :  * first zero byte or of the first character that pg_eucjp_verifychar()
                     :  * rejects.
                     :  */
    1136             : static int
    1137         300 : pg_eucjp_verifystr(const unsigned char *s, int len)
    1138             : {
    1139         300 :     const unsigned char *start = s;
    1140             : 
    1141         930 :     while (len > 0)
    1142             :     {
    1143             :         int         l;
    1144             : 
    1145             :         /* fast path for ASCII-subset characters */
    1146         846 :         if (!IS_HIGHBIT_SET(*s))
    1147             :         {
    1148         594 :             if (*s == '\0')
    1149          72 :                 break;          /* embedded zero byte: stop here */
    1150         522 :             l = 1;
    1151             :         }
    1152             :         else
    1153             :         {
    1154         252 :             l = pg_eucjp_verifychar(s, len);
    1155         252 :             if (l == -1)
    1156         144 :                 break;          /* invalid or truncated character: stop here */
    1157             :         }
    1158         630 :         s += l;
    1159         630 :         len -= l;
    1160             :     }
    1161             : 
    1162         300 :     return s - start;
    1163             : }
    1164             : /*
                     :  * Verify one character starting at s, with len bytes available.  Returns
                     :  * the character's byte length, or -1 if it is truncated or malformed.
                     :  *
                     :  * A first byte with the high bit set starts a 2-byte character, and both
                     :  * bytes must satisfy IS_EUC_RANGE_VALID().  Any other first byte is
                     :  * accepted as a 1-byte character.  The available length is checked before
                     :  * the second byte is read.
                     :  */
    1165             : static int
    1166          36 : pg_euckr_verifychar(const unsigned char *s, int len)
    1167             : {
    1168             :     int         l;
    1169             :     unsigned char c1,
    1170             :                 c2;
    1171             : 
    1172          36 :     c1 = *s++;
    1173             : 
    1174          36 :     if (IS_HIGHBIT_SET(c1))
    1175             :     {
    1176          36 :         l = 2;
    1177          36 :         if (l > len)
    1178          12 :             return -1;
    1179          24 :         if (!IS_EUC_RANGE_VALID(c1))
    1180          24 :             return -1;
    1181           0 :         c2 = *s++;
    1182           0 :         if (!IS_EUC_RANGE_VALID(c2))
    1183           0 :             return -1;
    1184             :     }
    1185             :     else
    1186             :         /* must be ASCII */
    1187             :     {
    1188           0 :         l = 1;
    1189             :     }
    1190             : 
    1191           0 :     return l;
    1192             : }
    1193             : /*
                     :  * Verify len bytes at s.  Returns the number of leading bytes that are
                     :  * valid: len when the whole input validates, otherwise the offset of the
                     :  * first zero byte or of the first character that pg_euckr_verifychar()
                     :  * rejects.
                     :  */
    1194             : static int
    1195          60 : pg_euckr_verifystr(const unsigned char *s, int len)
    1196             : {
    1197          60 :     const unsigned char *start = s;
    1198             : 
    1199         132 :     while (len > 0)
    1200             :     {
    1201             :         int         l;
    1202             : 
    1203             :         /* fast path for ASCII-subset characters */
    1204         108 :         if (!IS_HIGHBIT_SET(*s))
    1205             :         {
    1206          72 :             if (*s == '\0')
    1207           0 :                 break;          /* embedded zero byte: stop here */
    1208          72 :             l = 1;
    1209             :         }
    1210             :         else
    1211             :         {
    1212          36 :             l = pg_euckr_verifychar(s, len);
    1213          36 :             if (l == -1)
    1214          36 :                 break;          /* invalid or truncated character: stop here */
    1215             :         }
    1216          72 :         s += l;
    1217          72 :         len -= l;
    1218             :     }
    1219             : 
    1220          60 :     return s - start;
    1221             : }
    1222             : 
    1223             : /* EUC-CN byte sequences are exactly the same as EUC-KR, so reuse its validators */
    1224             : #define pg_euccn_verifychar pg_euckr_verifychar
    1225             : #define pg_euccn_verifystr  pg_euckr_verifystr
    1226             : /*
                     :  * Verify one character starting at s, with len bytes available.  Returns
                     :  * the character's byte length, or -1 if it is truncated or malformed.
                     :  *
                     :  *  - first byte SS2: 4 bytes; the second must be in 0xa1..0xa7 and the
                     :  *    third and fourth must satisfy IS_EUC_RANGE_VALID()
                     :  *  - first byte SS3: always rejected
                     :  *  - any other byte with the high bit set: 2 bytes; only the second byte
                     :  *    is range-checked with IS_EUC_RANGE_VALID(), the first is not
                     :  *  - anything else: 1 byte, accepted as is
                     :  */
    1227             : static int
    1228          18 : pg_euctw_verifychar(const unsigned char *s, int len)
    1229             : {
    1230             :     int         l;
    1231             :     unsigned char c1,
    1232             :                 c2;
    1233             : 
    1234          18 :     c1 = *s++;
    1235             : 
    1236          18 :     switch (c1)
    1237             :     {
    1238           0 :         case SS2:               /* CNS 11643 Plane 1-7 */
    1239           0 :             l = 4;
    1240           0 :             if (l > len)
    1241           0 :                 return -1;
    1242           0 :             c2 = *s++;
    1243           0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1244           0 :                 return -1;
    1245           0 :             c2 = *s++;          /* c2 is reused for the third byte */
    1246           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1247           0 :                 return -1;
    1248           0 :             c2 = *s++;          /* ... and again for the fourth byte */
    1249           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1250           0 :                 return -1;
    1251           0 :             break;
    1252             : 
    1253           0 :         case SS3:               /* unused */
    1254           0 :             return -1;
    1255             : 
    1256          18 :         default:
    1257          18 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1258             :             {
    1259          18 :                 l = 2;
    1260          18 :                 if (l > len)
    1261           6 :                     return -1;
    1262             :                 /* no further range check on c1? */
    1263          12 :                 c2 = *s++;
    1264          12 :                 if (!IS_EUC_RANGE_VALID(c2))
    1265          12 :                     return -1;
    1266             :             }
    1267             :             else
    1268             :                 /* must be ASCII */
    1269             :             {
    1270           0 :                 l = 1;
    1271             :             }
    1272           0 :             break;
    1273             :     }
    1274           0 :     return l;
    1275             : }
    1276             : /*
                     :  * Verify len bytes at s.  Returns the number of leading bytes that are
                     :  * valid: len when the whole input validates, otherwise the offset of the
                     :  * first zero byte or of the first character that pg_euctw_verifychar()
                     :  * rejects.
                     :  */
    1277             : static int
    1278          36 : pg_euctw_verifystr(const unsigned char *s, int len)
    1279             : {
    1280          36 :     const unsigned char *start = s;
    1281             : 
    1282          90 :     while (len > 0)
    1283             :     {
    1284             :         int         l;
    1285             : 
    1286             :         /* fast path for ASCII-subset characters */
    1287          72 :         if (!IS_HIGHBIT_SET(*s))
    1288             :         {
    1289          54 :             if (*s == '\0')
    1290           0 :                 break;          /* embedded zero byte: stop here */
    1291          54 :             l = 1;
    1292             :         }
    1293             :         else
    1294             :         {
    1295          18 :             l = pg_euctw_verifychar(s, len);
    1296          18 :             if (l == -1)
    1297          18 :                 break;          /* invalid or truncated character: stop here */
    1298             :         }
    1299          54 :         s += l;
    1300          54 :         len -= l;
    1301             :     }
    1302             : 
    1303          36 :     return s - start;
    1304             : }
    1305             : /*
                     :  * Verify one character starting at s, with len bytes available.  The
                     :  * expected length comes from pg_johab_mblen().  Returns that length, or
                     :  * -1 if fewer than that many bytes are available or a trailing byte fails
                     :  * IS_EUC_RANGE_VALID().  A first byte without the high bit set is
                     :  * accepted without looking at any further bytes.
                     :  */
    1306             : static int
    1307          18 : pg_johab_verifychar(const unsigned char *s, int len)
    1308             : {
    1309             :     int         l,
    1310             :                 mbl;
    1311             :     unsigned char c;
    1312             : 
    1313          18 :     l = mbl = pg_johab_mblen(s);
    1314             : 
    1315          18 :     if (len < l)
    1316           6 :         return -1;
    1317             : 
    1318          12 :     if (!IS_HIGHBIT_SET(*s))
    1319           0 :         return mbl;
    1320             : 
    1321          12 :     while (--l > 0)             /* visits each byte after the first */
    1322             :     {
    1323          12 :         c = *++s;
    1324          12 :         if (!IS_EUC_RANGE_VALID(c))
    1325          12 :             return -1;
    1326             :     }
    1327           0 :     return mbl;
    1328             : }
    1329             : /*
                     :  * Verify len bytes at s.  Returns the number of leading bytes that are
                     :  * valid: len when the whole input validates, otherwise the offset of the
                     :  * first zero byte or of the first character that pg_johab_verifychar()
                     :  * rejects.
                     :  */
    1330             : static int
    1331          24 : pg_johab_verifystr(const unsigned char *s, int len)
    1332             : {
    1333          24 :     const unsigned char *start = s;
    1334             : 
    1335          42 :     while (len > 0)
    1336             :     {
    1337             :         int         l;
    1338             : 
    1339             :         /* fast path for ASCII-subset characters */
    1340          36 :         if (!IS_HIGHBIT_SET(*s))
    1341             :         {
    1342          18 :             if (*s == '\0')
    1343           0 :                 break;          /* embedded zero byte: stop here */
    1344          18 :             l = 1;
    1345             :         }
    1346             :         else
    1347             :         {
    1348          18 :             l = pg_johab_verifychar(s, len);
    1349          18 :             if (l == -1)
    1350          18 :                 break;          /* invalid or truncated character: stop here */
    1351             :         }
    1352          18 :         s += l;
    1353          18 :         len -= l;
    1354             :     }
    1355             : 
    1356          24 :     return s - start;
    1357             : }
    1358             : /*
                     :  * Verify one character starting at s, with len bytes available.  The
                     :  * expected length comes from pg_mule_mblen().  Returns that length, or -1
                     :  * if fewer than that many bytes are available or any byte after the first
                     :  * lacks the high bit.
                     :  */
    1359             : static int
    1360        1350 : pg_mule_verifychar(const unsigned char *s, int len)
    1361             : {
    1362             :     int         l,
    1363             :                 mbl;
    1364             :     unsigned char c;
    1365             : 
    1366        1350 :     l = mbl = pg_mule_mblen(s);
    1367             : 
    1368        1350 :     if (len < l)
    1369         344 :         return -1;
    1370             : 
    1371        2032 :     while (--l > 0)             /* visits each byte after the first */
    1372             :     {
    1373        1348 :         c = *++s;
    1374        1348 :         if (!IS_HIGHBIT_SET(c))
    1375         322 :             return -1;
    1376             :     }
    1377         684 :     return mbl;
    1378             : }
    1379             : /*
                     :  * Verify len bytes at s.  Returns the number of leading bytes that are
                     :  * valid: len when the whole input validates, otherwise the offset of the
                     :  * first zero byte or of the first character that pg_mule_verifychar()
                     :  * rejects.
                     :  */
    1380             : static int
    1381         438 : pg_mule_verifystr(const unsigned char *s, int len)
    1382             : {
    1383         438 :     const unsigned char *start = s;
    1384             : 
    1385        1290 :     while (len > 0)
    1386             :     {
    1387             :         int         l;
    1388             : 
    1389             :         /* fast path for ASCII-subset characters */
    1390        1122 :         if (!IS_HIGHBIT_SET(*s))
    1391             :         {
    1392         690 :             if (*s == '\0')
    1393          36 :                 break;          /* embedded zero byte: stop here */
    1394         654 :             l = 1;
    1395             :         }
    1396             :         else
    1397             :         {
    1398         432 :             l = pg_mule_verifychar(s, len);
    1399         432 :             if (l == -1)
    1400         234 :                 break;          /* invalid or truncated character: stop here */
    1401             :         }
    1402         852 :         s += l;
    1403         852 :         len -= l;
    1404             :     }
    1405             : 
    1406         438 :     return s - start;
    1407             : }
    1408             : /*
                     :  * Verify one character of a single-byte encoding: always reports a valid
                     :  * 1-byte character.  Neither s nor len is examined.
                     :  */
    1409             : static int
    1410        7156 : pg_latin1_verifychar(const unsigned char *s, int len)
    1411             : {
    1412        7156 :     return 1;
    1413             : }
    1414             : /*
                     :  * Verify len bytes at s.  The only thing rejected is a zero byte: returns
                     :  * len when none is present, otherwise the offset of the first zero byte.
                     :  */
    1415             : static int
    1416       11230 : pg_latin1_verifystr(const unsigned char *s, int len)
    1417             : {
    1418       11230 :     const unsigned char *nullpos = memchr(s, 0, len);
    1419             : 
    1420       11230 :     if (nullpos == NULL)
    1421       11122 :         return len;
    1422             :     else
    1423         108 :         return nullpos - s;
    1424             : }
    1425             : 
    1426             : static int
    1427        1002 : pg_sjis_verifychar(const unsigned char *s, int len)
    1428             : {
    1429             :     int         l,
    1430             :                 mbl;
    1431             :     unsigned char c1,
    1432             :                 c2;
    1433             : 
    1434        1002 :     l = mbl = pg_sjis_mblen(s);
    1435             : 
    1436        1002 :     if (len < l)
    1437         132 :         return -1;
    1438             : 
    1439         870 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1440           0 :         return mbl;
    1441             : 
    1442         870 :     c1 = *s++;
    1443         870 :     c2 = *s;
    1444         870 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1445         348 :         return -1;
    1446         522 :     return mbl;
    1447             : }
    1448             : 
    1449             : static int
    1450         546 : pg_sjis_verifystr(const unsigned char *s, int len)
    1451             : {
    1452         546 :     const unsigned char *start = s;
    1453             : 
    1454        2068 :     while (len > 0)
    1455             :     {
    1456             :         int         l;
    1457             : 
    1458             :         /* fast path for ASCII-subset characters */
    1459        1842 :         if (!IS_HIGHBIT_SET(*s))
    1460             :         {
    1461        1348 :             if (*s == '\0')
    1462          72 :                 break;
    1463        1276 :             l = 1;
    1464             :         }
    1465             :         else
    1466             :         {
    1467         494 :             l = pg_sjis_verifychar(s, len);
    1468         494 :             if (l == -1)
    1469         248 :                 break;
    1470             :         }
    1471        1522 :         s += l;
    1472        1522 :         len -= l;
    1473             :     }
    1474             : 
    1475         546 :     return s - start;
    1476             : }
    1477             : 
    1478             : static int
    1479         360 : pg_big5_verifychar(const unsigned char *s, int len)
    1480             : {
    1481             :     int         l,
    1482             :                 mbl;
    1483             : 
    1484         360 :     l = mbl = pg_big5_mblen(s);
    1485             : 
    1486         360 :     if (len < l)
    1487           6 :         return -1;
    1488             : 
    1489         354 :     if (l == 2 &&
    1490         354 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1491          12 :         s[1] == NONUTF8_INVALID_BYTE1)
    1492          12 :         return -1;
    1493             : 
    1494         576 :     while (--l > 0)
    1495             :     {
    1496         342 :         if (*++s == '\0')
    1497         108 :             return -1;
    1498             :     }
    1499             : 
    1500         234 :     return mbl;
    1501             : }
    1502             : 
    1503             : static int
    1504         162 : pg_big5_verifystr(const unsigned char *s, int len)
    1505             : {
    1506         162 :     const unsigned char *start = s;
    1507             : 
    1508         666 :     while (len > 0)
    1509             :     {
    1510             :         int         l;
    1511             : 
    1512             :         /* fast path for ASCII-subset characters */
    1513         594 :         if (!IS_HIGHBIT_SET(*s))
    1514             :         {
    1515         468 :             if (*s == '\0')
    1516          36 :                 break;
    1517         432 :             l = 1;
    1518             :         }
    1519             :         else
    1520             :         {
    1521         126 :             l = pg_big5_verifychar(s, len);
    1522         126 :             if (l == -1)
    1523          54 :                 break;
    1524             :         }
    1525         504 :         s += l;
    1526         504 :         len -= l;
    1527             :     }
    1528             : 
    1529         162 :     return s - start;
    1530             : }
    1531             : 
    1532             : static int
    1533         274 : pg_gbk_verifychar(const unsigned char *s, int len)
    1534             : {
    1535             :     int         l,
    1536             :                 mbl;
    1537             : 
    1538         274 :     l = mbl = pg_gbk_mblen(s);
    1539             : 
    1540         274 :     if (len < l)
    1541          54 :         return -1;
    1542             : 
    1543         220 :     if (l == 2 &&
    1544         220 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1545          28 :         s[1] == NONUTF8_INVALID_BYTE1)
    1546          28 :         return -1;
    1547             : 
    1548         384 :     while (--l > 0)
    1549             :     {
    1550         192 :         if (*++s == '\0')
    1551           0 :             return -1;
    1552             :     }
    1553             : 
    1554         192 :     return mbl;
    1555             : }
    1556             : 
    1557             : static int
    1558         256 : pg_gbk_verifystr(const unsigned char *s, int len)
    1559             : {
    1560         256 :     const unsigned char *start = s;
    1561             : 
    1562         658 :     while (len > 0)
    1563             :     {
    1564             :         int         l;
    1565             : 
    1566             :         /* fast path for ASCII-subset characters */
    1567         484 :         if (!IS_HIGHBIT_SET(*s))
    1568             :         {
    1569         242 :             if (*s == '\0')
    1570           0 :                 break;
    1571         242 :             l = 1;
    1572             :         }
    1573             :         else
    1574             :         {
    1575         242 :             l = pg_gbk_verifychar(s, len);
    1576         242 :             if (l == -1)
    1577          82 :                 break;
    1578             :         }
    1579         402 :         s += l;
    1580         402 :         len -= l;
    1581             :     }
    1582             : 
    1583         256 :     return s - start;
    1584             : }
    1585             : 
    1586             : static int
    1587          18 : pg_uhc_verifychar(const unsigned char *s, int len)
    1588             : {
    1589             :     int         l,
    1590             :                 mbl;
    1591             : 
    1592          18 :     l = mbl = pg_uhc_mblen(s);
    1593             : 
    1594          18 :     if (len < l)
    1595           6 :         return -1;
    1596             : 
    1597          12 :     if (l == 2 &&
    1598          12 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1599          12 :         s[1] == NONUTF8_INVALID_BYTE1)
    1600          12 :         return -1;
    1601             : 
    1602           0 :     while (--l > 0)
    1603             :     {
    1604           0 :         if (*++s == '\0')
    1605           0 :             return -1;
    1606             :     }
    1607             : 
    1608           0 :     return mbl;
    1609             : }
    1610             : 
    1611             : static int
    1612          24 : pg_uhc_verifystr(const unsigned char *s, int len)
    1613             : {
    1614          24 :     const unsigned char *start = s;
    1615             : 
    1616          42 :     while (len > 0)
    1617             :     {
    1618             :         int         l;
    1619             : 
    1620             :         /* fast path for ASCII-subset characters */
    1621          36 :         if (!IS_HIGHBIT_SET(*s))
    1622             :         {
    1623          18 :             if (*s == '\0')
    1624           0 :                 break;
    1625          18 :             l = 1;
    1626             :         }
    1627             :         else
    1628             :         {
    1629          18 :             l = pg_uhc_verifychar(s, len);
    1630          18 :             if (l == -1)
    1631          18 :                 break;
    1632             :         }
    1633          18 :         s += l;
    1634          18 :         len -= l;
    1635             :     }
    1636             : 
    1637          24 :     return s - start;
    1638             : }
    1639             : 
    1640             : static int
    1641        1164 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1642             : {
    1643             :     int         l;
    1644             : 
    1645        1164 :     if (!IS_HIGHBIT_SET(*s))
    1646           0 :         l = 1;                  /* ASCII */
    1647        1164 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1648             :     {
    1649             :         /* Should be 4-byte, validate remaining bytes */
    1650         318 :         if (*s >= 0x81 && *s <= 0xfe &&
    1651         306 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1652         306 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1653         162 :             l = 4;
    1654             :         else
    1655         156 :             l = -1;
    1656             :     }
    1657         846 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1658             :     {
    1659             :         /* Should be 2-byte, validate */
    1660         612 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1661         372 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1662         276 :             l = 2;
    1663             :         else
    1664         336 :             l = -1;
    1665             :     }
    1666             :     else
    1667         234 :         l = -1;
    1668        1164 :     return l;
    1669             : }
    1670             : 
    1671             : static int
    1672         884 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1673             : {
    1674         884 :     const unsigned char *start = s;
    1675             : 
    1676        2930 :     while (len > 0)
    1677             :     {
    1678             :         int         l;
    1679             : 
    1680             :         /* fast path for ASCII-subset characters */
    1681        2652 :         if (!IS_HIGHBIT_SET(*s))
    1682             :         {
    1683        1804 :             if (*s == '\0')
    1684          48 :                 break;
    1685        1756 :             l = 1;
    1686             :         }
    1687             :         else
    1688             :         {
    1689         848 :             l = pg_gb18030_verifychar(s, len);
    1690         848 :             if (l == -1)
    1691         558 :                 break;
    1692             :         }
    1693        2046 :         s += l;
    1694        2046 :         len -= l;
    1695             :     }
    1696             : 
    1697         884 :     return s - start;
    1698             : }
    1699             : 
    1700             : static int
    1701       17614 : pg_utf8_verifychar(const unsigned char *s, int len)
    1702             : {
    1703             :     int         l;
    1704             : 
    1705       17614 :     if ((*s & 0x80) == 0)
    1706             :     {
    1707           0 :         if (*s == '\0')
    1708           0 :             return -1;
    1709           0 :         return 1;
    1710             :     }
    1711       17614 :     else if ((*s & 0xe0) == 0xc0)
    1712        6162 :         l = 2;
    1713       11452 :     else if ((*s & 0xf0) == 0xe0)
    1714        6332 :         l = 3;
    1715        5120 :     else if ((*s & 0xf8) == 0xf0)
    1716        4856 :         l = 4;
    1717             :     else
    1718         264 :         l = 1;
    1719             : 
    1720       17614 :     if (l > len)
    1721         578 :         return -1;
    1722             : 
    1723       17036 :     if (!pg_utf8_islegal(s, l))
    1724        2356 :         return -1;
    1725             : 
    1726       14680 :     return l;
    1727             : }
    1728             : 
    1729             : /*
    1730             :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1731             :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1732             :  * input byte and current state are used to compute an index into an array of
    1733             :  * state transitions. Since the address of the next transition is dependent
    1734             :  * on this computation, there is latency in executing the load instruction,
    1735             :  * and the CPU is not kept busy.
    1736             :  *
    1737             :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1738             :  *
    1739             :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1740             :  *
    1741             :  * In a shift-based DFA, the input byte is an index into array of integers
    1742             :  * whose bit pattern encodes the state transitions. To compute the next
    1743             :  * state, we simply right-shift the integer by the current state and apply a
    1744             :  * mask. In this scheme, the address of the transition only depends on the
    1745             :  * input byte, so there is better pipelining.
    1746             :  *
    1747             :  * The naming convention for states and transitions was adopted from a UTF-8
    1748             :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1749             :  *
    1750             :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1751             :  *
    1752             :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1753             :  * ==========================================================================
    1754             :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1755             :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1756             :  *                                                                  |
    1757             :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1758             :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1759             :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1760             :  *                                                                  |
    1761             :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1762             :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1763             :  *                                                                  |
    1764             :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1765             :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1766             :  *
    1767             :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1768             :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1769             :  * it's possible to find state numbers such that the transitions fit within
    1770             :  * 32-bit integers, as Dougall Johnson demonstrated:
    1771             :  *
    1772             :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1773             :  *
    1774             :  * This packed representation is the reason for the seemingly odd choice of
    1775             :  * state values below.
    1776             :  */
    1777             : 
    1778             : /* Error */
    1779             : #define ERR  0
    1780             : /* Begin */
    1781             : #define BGN 11
    1782             : /* Continuation states, expect 1/2/3 continuation bytes */
    1783             : #define CS1 16
    1784             : #define CS2  1
    1785             : #define CS3  5
    1786             : /* Partial states, where the first continuation byte has a restricted range */
    1787             : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
    1788             : #define P3B 20                  /* Lead was ED, check for surrogate */
    1789             : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
    1790             : #define P4B 30                  /* Lead was F4, check for too-large */
    1791             : /* Begin and End are the same state */
    1792             : #define END BGN
    1793             : 
    1794             : /* the encoded state transitions for the lookup table */
    1795             : 
    1796             : /* ASCII */
    1797             : #define ASC (END << BGN)
    1798             : /* 2-byte lead */
    1799             : #define L2A (CS1 << BGN)
    1800             : /* 3-byte lead */
    1801             : #define L3A (P3A << BGN)
    1802             : #define L3B (CS2 << BGN)
    1803             : #define L3C (P3B << BGN)
    1804             : /* 4-byte lead */
    1805             : #define L4A (P4A << BGN)
    1806             : #define L4B (CS3 << BGN)
    1807             : #define L4C (P4B << BGN)
    1808             : /* continuation byte */
    1809             : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1810             : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1811             : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1812             : /* invalid byte */
    1813             : #define ILL ERR
    1814             : 
    1815             : static const uint32 Utf8Transition[256] =
    1816             : {
    1817             :     /* ASCII */
    1818             : 
    1819             :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1820             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1821             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1822             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1823             : 
    1824             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1825             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1826             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1827             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1828             : 
    1829             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1830             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1831             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1832             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1833             : 
    1834             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1835             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1836             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1837             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1838             : 
    1839             :     /* continuation bytes */
    1840             : 
    1841             :     /* 80..8F */
    1842             :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1843             :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1844             : 
    1845             :     /* 90..9F */
    1846             :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1847             :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1848             : 
    1849             :     /* A0..BF */
    1850             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1851             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1852             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1853             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1854             : 
    1855             :     /* leading bytes */
    1856             : 
    1857             :     /* C0..DF */
    1858             :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1859             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1860             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1861             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1862             : 
    1863             :     /* E0..EF */
    1864             :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1865             :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1866             : 
    1867             :     /* F0..FF */
    1868             :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1869             :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1870             : };
    1871             : 
    1872             : static void
    1873        1704 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1874             : {
    1875             :     /* Note: We deliberately don't check the state's value here. */
    1876       56232 :     while (len > 0)
    1877             :     {
    1878             :         /*
    1879             :          * It's important that the mask value is 31: In most instruction sets,
    1880             :          * a shift by a 32-bit operand is understood to be a shift by its mod
    1881             :          * 32, so the compiler should elide the mask operation.
    1882             :          */
    1883       54528 :         *state = Utf8Transition[*s++] >> (*state & 31);
    1884       54528 :         len--;
    1885             :     }
    1886             : 
    1887        1704 :     *state &= 31;
    1888        1704 : }
    1889             : 
    1890             : static int
    1891     1178874 : pg_utf8_verifystr(const unsigned char *s, int len)
    1892             : {
    1893     1178874 :     const unsigned char *start = s;
    1894     1178874 :     const int   orig_len = len;
    1895     1178874 :     uint32      state = BGN;
    1896             : 
    1897             : /*
    1898             :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1899             :  * the compiler can unroll a longer loop, it's not worth it because we
    1900             :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1901             :  */
    1902             : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1903             : 
    1904     1178874 :     if (len >= STRIDE_LENGTH)
    1905             :     {
    1906     4888300 :         while (len >= STRIDE_LENGTH)
    1907             :         {
    1908             :             /*
    1909             :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1910             :              * but we must first check for a non-END state, which means the
    1911             :              * previous chunk ended in the middle of a multibyte sequence.
    1912             :              */
    1913     4306660 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1914        1704 :                 utf8_advance(s, &state, STRIDE_LENGTH);
    1915             : 
    1916     4306660 :             s += STRIDE_LENGTH;
    1917     4306660 :             len -= STRIDE_LENGTH;
    1918             :         }
    1919             : 
    1920             :         /* The error state persists, so we only need to check for it here. */
    1921      581640 :         if (state == ERR)
    1922             :         {
    1923             :             /*
    1924             :              * Start over from the beginning with the slow path so we can
    1925             :              * count the valid bytes.
    1926             :              */
    1927         504 :             len = orig_len;
    1928         504 :             s = start;
    1929             :         }
    1930      581136 :         else if (state != END)
    1931             :         {
    1932             :             /*
    1933             :              * The fast path exited in the middle of a multibyte sequence.
    1934             :              * Walk backwards to find the leading byte so that the slow path
    1935             :              * can resume checking from there. We must always backtrack at
    1936             :              * least one byte, since the current byte could be e.g. an ASCII
    1937             :              * byte after a 2-byte lead, which is invalid.
    1938             :              */
    1939             :             do
    1940             :             {
    1941             :                 Assert(s > start);
    1942         114 :                 s--;
    1943         114 :                 len++;
    1944             :                 Assert(IS_HIGHBIT_SET(*s));
    1945         114 :             } while (pg_utf_mblen(s) <= 1);
    1946             :         }
    1947             :     }
    1948             : 
    1949             :     /* check remaining bytes */
    1950    17423446 :     while (len > 0)
    1951             :     {
    1952             :         int         l;
    1953             : 
    1954             :         /* fast path for ASCII-subset characters */
    1955    16247646 :         if (!IS_HIGHBIT_SET(*s))
    1956             :         {
    1957    16230104 :             if (*s == '\0')
    1958         204 :                 break;
    1959    16229900 :             l = 1;
    1960             :         }
    1961             :         else
    1962             :         {
    1963       17542 :             l = pg_utf8_verifychar(s, len);
    1964       17542 :             if (l == -1)
    1965        2870 :                 break;
    1966             :         }
    1967    16244572 :         s += l;
    1968    16244572 :         len -= l;
    1969             :     }
    1970             : 
    1971     1178874 :     return s - start;
    1972             : }
    1973             : 
    1974             : /*
    1975             :  * Check for validity of a single UTF-8 encoded character
    1976             :  *
    1977             :  * This directly implements the rules in RFC3629.  The bizarre-looking
    1978             :  * restrictions on the second byte are meant to ensure that there isn't
    1979             :  * more than one encoding of a given Unicode character point; that is,
    1980             :  * you may not use a longer-than-necessary byte sequence with high order
    1981             :  * zero bits to represent a character that would fit in fewer bytes.
    1982             :  * To do otherwise is to create security hazards (eg, create an apparent
    1983             :  * non-ASCII character that decodes to plain ASCII).
    1984             :  *
    1985             :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    1986             :  * caller must have checked that that many bytes are present in the buffer.
    1987             :  */
    1988             : bool
    1989       23580 : pg_utf8_islegal(const unsigned char *source, int length)
    1990             : {
    1991             :     unsigned char a;
    1992             : 
    1993       23580 :     switch (length)
    1994             :     {
    1995           0 :         default:
    1996             :             /* reject lengths 5 and 6 for now */
    1997           0 :             return false;
    1998        4596 :         case 4:
    1999        4596 :             a = source[3];
    2000        4596 :             if (a < 0x80 || a > 0xBF)
    2001         364 :                 return false;
    2002             :             /* FALL THRU */
    2003             :         case 3:
    2004       12038 :             a = source[2];
    2005       12038 :             if (a < 0x80 || a > 0xBF)
    2006         680 :                 return false;
    2007             :             /* FALL THRU */
    2008             :         case 2:
    2009       17978 :             a = source[1];
    2010       17978 :             switch (*source)
    2011             :             {
    2012         312 :                 case 0xE0:
    2013         312 :                     if (a < 0xA0 || a > 0xBF)
    2014         264 :                         return false;
    2015          48 :                     break;
    2016         312 :                 case 0xED:
    2017         312 :                     if (a < 0x80 || a > 0x9F)
    2018         264 :                         return false;
    2019          48 :                     break;
    2020        4052 :                 case 0xF0:
    2021        4052 :                     if (a < 0x90 || a > 0xBF)
    2022         264 :                         return false;
    2023        3788 :                     break;
    2024         180 :                 case 0xF4:
    2025         180 :                     if (a < 0x80 || a > 0x8F)
    2026         132 :                         return false;
    2027          48 :                     break;
    2028       13122 :                 default:
    2029       13122 :                     if (a < 0x80 || a > 0xBF)
    2030         292 :                         return false;
    2031       12830 :                     break;
    2032             :             }
    2033             :             /* FALL THRU */
    2034       21320 :         case 1:
    2035       21320 :             a = *source;
    2036       21320 :             if (a >= 0x80 && a < 0xC2)
    2037         396 :                 return false;
    2038       20924 :             if (a > 0xF4)
    2039         132 :                 return false;
    2040       20792 :             break;
    2041             :     }
    2042       20792 :     return true;
    2043             : }
    2044             : 
    2045             : 
    2046             : /*
    2047             :  * Fills the provided buffer with two bytes such that:
    2048             :  *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
    2049             :  */
    2050             : void
    2051         412 : pg_encoding_set_invalid(int encoding, char *dst)
    2052             : {
    2053             :     Assert(pg_encoding_max_length(encoding) > 1);
    2054             : 
    2055         412 :     dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
    2056         412 :     dst[1] = NONUTF8_INVALID_BYTE1;
    2057         412 : }
    2058             : 
    2059             : /*
    2060             :  *-------------------------------------------------------------------
    2061             :  * encoding info table, indexed by encoding ID.  Each initializer gives two wchar conversion functions (0 from PG_SJIS onward), the mblen, dsplen, verifychar and verifystr functions, and a trailing length (presumably maxmblen).
    2062             :  *-------------------------------------------------------------------
    2063             :  */
    2064             : const pg_wchar_tbl pg_wchar_table[] = {
    2065             :     [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
    2066             :     [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2067             :     [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
    2068             :     [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
    2069             :     [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
    2070             :     [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2071             :     [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
    2072             :     [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
    2073             :     [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2074             :     [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2075             :     [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2076             :     [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2077             :     [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2078             :     [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2079             :     [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2080             :     [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2081             :     [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2082             :     [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2083             :     [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2084             :     [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2085             :     [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2086             :     [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2087             :     [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2088             :     [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2089             :     [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2090             :     [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2091             :     [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2092             :     [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2093             :     [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2094             :     [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2095             :     [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2096             :     [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2097             :     [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2098             :     [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2099             :     [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2100             :     [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2101             :     [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
    2102             :     [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
    2103             :     [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
    2104             :     [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
    2105             :     [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
    2106             :     [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2107             : };
    2108             : 
    2109             : /*
    2110             :  * Returns the byte length of the multibyte character starting at mbstr.
    2111             :  *
    2112             :  * Choose "mblen" functions based on the input string characteristics.
    2113             :  * pg_encoding_mblen() can be used when ANY of these conditions is met:
    2114             :  *
    2115             :  * - The input string is zero-terminated
    2116             :  *
    2117             :  * - The input string is known to be valid in the encoding (e.g., string
    2118             :  *   converted from database encoding)
    2119             :  *
    2120             :  * - The encoding is not GB18030 (e.g., when only database encodings are
    2121             :  *   passed to 'encoding' parameter)
    2122             :  *
    2123             :  * encoding==GB18030 requires examining up to two bytes to determine character
    2124             :  * length.  Therefore, callers satisfying none of those conditions must use
    2125             :  * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
    2126             :  * guaranteed to be within allocation bounds.
    2127             :  *
    2128             :  * When dealing with text that is not certainly valid in the specified
    2129             :  * encoding, the result may exceed the actual remaining string length.
    2130             :  * Callers that are not prepared to deal with that should use Min(remaining,
    2131             :  * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
    2132             :  * pg_encoding_mblen_bounded() are interchangeable.
    2133             :  */
    2134             : int
    2135    54237130 : pg_encoding_mblen(int encoding, const char *mbstr)
    2136             : {
    2137    54237130 :     return (PG_VALID_ENCODING(encoding) ?
    2138   108474260 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
    2139           0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr)); /* invalid encoding ID: use the PG_SQL_ASCII entry */
    2140             : }
    2141             : 
    2142             : /*
    2143             :  * Returns the byte length of a multibyte character (possibly not
    2144             :  * zero-terminated), or INT_MAX if 'remaining' is too few bytes to determine a length.
    2145             :  */
    2146             : int
    2147        6094 : pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
    2148             :                                 size_t remaining)
    2149             : {
    2150             :     /*
    2151             :      * Define zero remaining as too few, even for single-byte encodings.
    2152             :      * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
    2153             :      * zero; others read one.
    2154             :      */
    2155        6094 :     if (remaining < 1 ||
    2156         338 :         (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))   /* GB18030 with a high-bit first byte needs a second byte */
    2157          72 :         return INT_MAX;
    2158        6022 :     return pg_encoding_mblen(encoding, mbstr);  /* enough bytes are available for the mblen function to inspect */
    2159             : }
    2160             : 
    2161             : /*
    2162             :  * Returns the byte length of a multibyte character; but not more than the
    2163             :  * distance to the terminating zero byte.  For input that might lack a
    2164             :  * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
    2165             :  */
    2166             : int
    2167           0 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    2168             : {
    2169           0 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));  /* strnlen() stops early at a zero byte inside the reported length */
    2170             : }
    2171             : 
    2172             : /*
    2173             :  * Returns the display length of the multibyte character starting at mbstr, as reported by the encoding's dsplen function.
    2174             :  */
    2175             : int
    2176    54057952 : pg_encoding_dsplen(int encoding, const char *mbstr)
    2177             : {
    2178    54057952 :     return (PG_VALID_ENCODING(encoding) ?
    2179   108115904 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
    2180           0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));    /* invalid encoding ID: use the PG_SQL_ASCII entry */
    2181             : }
    2182             : 
    2183             : /*
    2184             :  * Verify the first multibyte character of the given string.
    2185             :  * Return its byte length if good, -1 if bad.  (See comments above for
    2186             :  * full details of the mbverifychar API.)
    2187             :  */
    2188             : int
    2189        9706 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    2190             : {
    2191        9706 :     return (PG_VALID_ENCODING(encoding) ?
    2192       19412 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
    2193           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len)); /* invalid encoding ID: use the PG_SQL_ASCII entry */
    2194             : }
    2195             : 
    2196             : /*
    2197             :  * Verify that a string is valid for the given encoding.
    2198             :  * Returns the number of input bytes (<= len) that form a valid string.
    2199             :  * (See comments above for full details of the mbverifystr API.)
    2200             :  */
    2201             : int
    2202      467244 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2203             : {
    2204      467244 :     return (PG_VALID_ENCODING(encoding) ?
    2205      934488 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
    2206           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));  /* invalid encoding ID: use the PG_SQL_ASCII entry */
    2207             : }
    2208             : 
    2209             : /*
    2210             :  * Fetch the maximum length of a given encoding (the maxmblen field of its table entry).
    2211             :  */
    2212             : int
    2213      884706 : pg_encoding_max_length(int encoding)
    2214             : {
    2215             :     Assert(PG_VALID_ENCODING(encoding));
    2216             : 
    2217             :     /*
    2218             :      * Check for the encoding despite the assert, due to some mingw versions
    2219             :      * otherwise issuing bogus warnings.
    2220             :      */
    2221      884706 :     return PG_VALID_ENCODING(encoding) ?
    2222     1769412 :         pg_wchar_table[encoding].maxmblen :
    2223             :         pg_wchar_table[PG_SQL_ASCII].maxmblen;
    2224             : }

Generated by: LCOV version 1.14