LCOV - PostgreSQL 18devel - src/common/wchar.c

LCOV - code coverage report

Current view:	top level - src/common - wchar.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 18devel	Lines:	515	850	60.6 %
Date:	2025-04-01 14:15:22	Functions:	60	81	74.1 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * wchar.c
       4             :  *    Functions for working with multibyte characters in various encodings.
       5             :  *
       6             :  * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/common/wchar.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "c.h"
      14             : 
      15             : #include "mb/pg_wchar.h"
      16             : #include "utils/ascii.h"
      17             : 
      18             : 
      19             : /*
      20             :  * In today's multibyte encodings other than UTF8, this two-byte sequence
      21             :  * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
      22             :  *
      23             :  * For historical reasons, several verifychar implementations opt to reject
      24             :  * this pair specifically.  Byte pair range constraints, in encoding
      25             :  * originator documentation, always excluded this pair.  No core conversion
      26             :  * could translate it.  However, longstanding verifychar implementations
      27             :  * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
      28             :  * pairs not valid per encoding originator documentation.  To avoid tightening
      29             :  * core or non-core conversions in a security patch, we sought this one pair.
      30             :  *
      31             :  * PQescapeString() historically used spaces for BYTE1; many other values
      32             :  * could suffice for BYTE1.
      33             :  */
      34             : #define NONUTF8_INVALID_BYTE0 (0x8d)
      35             : #define NONUTF8_INVALID_BYTE1 (' ')
      36             : 
      37             : 
      38             : /*
      39             :  * Operations on multi-byte encodings are driven by a table of helper
      40             :  * functions.
      41             :  *
      42             :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
      43             :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      44             :  * and wchar2mb() conversion functions.
      45             :  *
      46             :  * These functions generally assume that their input is validly formed.
      47             :  * The "verifier" functions, further down in the file, have to be more
      48             :  * paranoid.
      49             :  *
      50             :  * We expect that mblen() does not need to examine more than the first byte
      51             :  * of the character to discover the correct length.  GB18030 is an exception
      52             :  * to that rule, though, as it also looks at second byte.  But even that
      53             :  * behaves in a predictable way, if you only pass the first byte: it will
      54             :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      55             :  * good enough for all current uses.
      56             :  *
      57             :  * Note: for the display output of psql to work properly, the return values
      58             :  * of the dsplen functions must conform to the Unicode standard. In particular
      59             :  * the NUL character is zero width and control characters are generally
      60             :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      61             :  * subset to the ASCII routines to ensure consistency.
      62             :  */
      63             : 
      64             : /*
      65             :  * SQL/ASCII
      66             :  */
      67             : static int          /* returns the number of pg_wchars stored, not counting the terminator */
      68         762 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      69             : {
      70         762 :     int         cnt = 0;    /* characters converted so far */
      71             : 
      72       63428 :     while (len > 0 && *from)    /* stop after len bytes or at the first NUL byte */
      73             :     {
      74       62666 :         *to++ = *from++;        /* each input byte becomes one pg_wchar */
      75       62666 :         len--;
      76       62666 :         cnt++;
      77             :     }
      78         762 :     *to = 0;                    /* a terminating zero is always written after the output */
      79         762 :     return cnt;
      80             : }
      81             : 
      82             : static int          /* byte length of one SQL/ASCII character */
      83       48234 : pg_ascii_mblen(const unsigned char *s)
      84             : {
      85       48234 :     return 1;           /* always one byte; *s is not examined */
      86             : }
      87             : 
      88             : static int          /* display width of the single byte at *s: 0, -1 or 1 */
      89       45656 : pg_ascii_dsplen(const unsigned char *s)
      90             : {
      91       45656 :     if (*s == '\0')
      92           0 :         return 0;       /* NUL is zero width */
      93       45656 :     if (*s < 0x20 || *s == 0x7f)
      94           6 :         return -1;      /* bytes below 0x20 and DEL (0x7f) report width -1 */
      95             : 
      96       45650 :     return 1;           /* every other byte value, including those >= 0x80, is one column */
      97             : }
      98             : 
      99             : /*
     100             :  * EUC
     101             :  */
     102             : static int          /* generic EUC -> pg_wchar; returns number of pg_wchars stored (used by the EUC_JP and EUC_KR wrappers) */
     103           0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     104             : {
     105           0 :     int         cnt = 0;    /* characters converted so far */
     106             : 
     107           0 :     while (len > 0 && *from)    /* stop after len bytes or at the first NUL byte */
     108             :     {
     109           0 :         if (*from == SS2 && len >= 2)    /* JIS X 0201 (so called "1 byte
     110             :                                          * KANA") */
     111             :         {
     112           0 :             from++;             /* skip the SS2 byte; the constant is packed in below */
     113           0 :             *to = (SS2 << 8) | *from++;     /* SS2 in bits 8-15, data byte in bits 0-7 */
     114           0 :             len -= 2;
     115             :         }
     116           0 :         else if (*from == SS3 && len >= 3)   /* JIS X 0212 KANJI */
     117             :         {
     118           0 :             from++;             /* skip the SS3 byte; the constant is packed in below */
     119           0 :             *to = (SS3 << 16) | (*from++ << 8);     /* SS3 in bits 16-23, first data byte in bits 8-15 */
     120           0 :             *to |= *from++;     /* second data byte in bits 0-7 */
     121           0 :             len -= 3;
     122             :         }
     123           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
     124             :         {
     125           0 :             *to = *from++ << 8; /* first byte in bits 8-15 */
     126           0 :             *to |= *from++;     /* second byte in bits 0-7 */
     127           0 :             len -= 2;
     128             :         }
     129             :         else                    /* must be ASCII */
     130             :         {
     131           0 :             *to = *from++;      /* also reached by any lead byte when len == 1, since every test above needs len >= 2 */
     132           0 :             len--;
     133             :         }
     134           0 :         to++;
     135           0 :         cnt++;
     136             :     }
     137           0 :     *to = 0;                    /* a terminating zero is always written after the output */
     138           0 :     return cnt;
     139             : }
     140             : 
     141             : static inline int   /* byte length of a generic EUC character, decided from its first byte only */
     142         234 : pg_euc_mblen(const unsigned char *s)
     143             : {
     144             :     int         len;
     145             : 
     146         234 :     if (*s == SS2)
     147           0 :         len = 2;            /* SS2 plus one data byte */
     148         234 :     else if (*s == SS3)
     149           0 :         len = 3;            /* SS3 plus two data bytes */
     150         234 :     else if (IS_HIGHBIT_SET(*s))
     151         162 :         len = 2;            /* any other high-bit lead byte starts a two-byte character */
     152             :     else
     153          72 :         len = 1;            /* high bit clear: single byte */
     154         234 :     return len;
     155             : }
     156             : 
     157             : static inline int   /* display width of a generic EUC character, decided from its first byte only */
     158           0 : pg_euc_dsplen(const unsigned char *s)
     159             : {
     160             :     int         len;
     161             : 
     162           0 :     if (*s == SS2)
     163           0 :         len = 2;            /* all three multibyte cases report two columns */
     164           0 :     else if (*s == SS3)
     165           0 :         len = 2;
     166           0 :     else if (IS_HIGHBIT_SET(*s))
     167           0 :         len = 2;
     168             :     else
     169           0 :         len = pg_ascii_dsplen(s);   /* high bit clear: defer to the ASCII rules (0, -1 or 1) */
     170           0 :     return len;
     171             : }
     172             : 
     173             : /*
     174             :  * EUC_JP
     175             :  */
     176             : static int          /* EUC_JP -> pg_wchar; returns number of pg_wchars stored */
     177           0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     178             : {
     179           0 :     return pg_euc2wchar_with_len(from, to, len);    /* no EUC_JP-specific handling; uses the generic EUC converter */
     180             : }
     181             : 
     182             : static int          /* byte length of an EUC_JP character */
     183         204 : pg_eucjp_mblen(const unsigned char *s)
     184             : {
     185         204 :     return pg_euc_mblen(s);     /* same lead-byte rules as generic EUC */
     186             : }
     187             : 
     188             : static int          /* display width of an EUC_JP character, decided from its first byte only */
     189           0 : pg_eucjp_dsplen(const unsigned char *s)
     190             : {
     191             :     int         len;
     192             : 
     193           0 :     if (*s == SS2)
     194           0 :         len = 1;            /* the only difference from pg_euc_dsplen(), which returns 2 for SS2 */
     195           0 :     else if (*s == SS3)
     196           0 :         len = 2;
     197           0 :     else if (IS_HIGHBIT_SET(*s))
     198           0 :         len = 2;
     199             :     else
     200           0 :         len = pg_ascii_dsplen(s);   /* high bit clear: defer to the ASCII rules (0, -1 or 1) */
     201           0 :     return len;
     202             : }
     203             : 
     204             : /*
     205             :  * EUC_KR
     206             :  */
     207             : static int          /* EUC_KR -> pg_wchar; returns number of pg_wchars stored */
     208           0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     209             : {
     210           0 :     return pg_euc2wchar_with_len(from, to, len);    /* no EUC_KR-specific handling; uses the generic EUC converter */
     211             : }
     212             : 
     213             : static int          /* byte length of an EUC_KR character */
     214           6 : pg_euckr_mblen(const unsigned char *s)
     215             : {
     216           6 :     return pg_euc_mblen(s);     /* same lead-byte rules as generic EUC */
     217             : }
     218             : 
     219             : static int          /* display width of an EUC_KR character */
     220           0 : pg_euckr_dsplen(const unsigned char *s)
     221             : {
     222           0 :     return pg_euc_dsplen(s);    /* same width rules as generic EUC */
     223             : }
     224             : 
     225             : /*
     226             :  * EUC_CN
     227             :  *
     228             :  */
     229             : static int          /* EUC_CN -> pg_wchar; returns number of pg_wchars stored */
     230           0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     231             : {
     232           0 :     int         cnt = 0;    /* characters converted so far */
     233             : 
     234           0 :     while (len > 0 && *from)    /* stop after len bytes or at the first NUL byte */
     235             :     {
     236           0 :         if (*from == SS2 && len >= 3)    /* code set 2 (unused?) */
     237             :         {
     238           0 :             from++;             /* skip the SS2 byte; unlike generic EUC, two data bytes follow here */
     239           0 :             *to = (SS2 << 16) | (*from++ << 8);     /* SS2 in bits 16-23, first data byte in bits 8-15 */
     240           0 :             *to |= *from++;     /* second data byte in bits 0-7 */
     241           0 :             len -= 3;
     242             :         }
     243           0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused ?) */
     244             :         {
     245           0 :             from++;             /* skip the SS3 byte; the constant is packed in below */
     246           0 :             *to = (SS3 << 16) | (*from++ << 8);     /* SS3 in bits 16-23, first data byte in bits 8-15 */
     247           0 :             *to |= *from++;     /* second data byte in bits 0-7 */
     248           0 :             len -= 3;
     249             :         }
     250           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
     251             :         {
     252           0 :             *to = *from++ << 8; /* first byte in bits 8-15 */
     253           0 :             *to |= *from++;     /* second byte in bits 0-7 */
     254           0 :             len -= 2;
     255             :         }
     256             :         else
     257             :         {
     258           0 :             *to = *from++;      /* single byte; also reached by any lead byte when len == 1 */
     259           0 :             len--;
     260             :         }
     261           0 :         to++;
     262           0 :         cnt++;
     263             :     }
     264           0 :     *to = 0;                    /* a terminating zero is always written after the output */
     265           0 :     return cnt;
     266             : }
     267             : 
     268             : static int          /* byte length of an EUC_CN character, decided from its first byte only */
     269           6 : pg_euccn_mblen(const unsigned char *s)
     270             : {
     271             :     int         len;
     272             : 
     273           6 :     if (IS_HIGHBIT_SET(*s))
     274           6 :         len = 2;            /* NOTE(review): no SS2/SS3 case here, although pg_euccn2wchar_with_len() consumes 3 bytes for those lead bytes -- confirm intended */
     275             :     else
     276           0 :         len = 1;            /* high bit clear: single byte */
     277           6 :     return len;
     278             : }
     279             : 
     280             : static int          /* display width of an EUC_CN character, decided from its first byte only */
     281           0 : pg_euccn_dsplen(const unsigned char *s)
     282             : {
     283             :     int         len;
     284             : 
     285           0 :     if (IS_HIGHBIT_SET(*s))
     286           0 :         len = 2;            /* any high-bit lead byte reports two columns */
     287             :     else
     288           0 :         len = pg_ascii_dsplen(s);   /* high bit clear: defer to the ASCII rules (0, -1 or 1) */
     289           0 :     return len;
     290             : }
     291             : 
     292             : /*
     293             :  * EUC_TW
     294             :  *
     295             :  */
     296             : static int          /* EUC_TW -> pg_wchar; returns number of pg_wchars stored */
     297           0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     298             : {
     299           0 :     int         cnt = 0;    /* characters converted so far */
     300             : 
     301           0 :     while (len > 0 && *from)    /* stop after len bytes or at the first NUL byte */
     302             :     {
     303           0 :         if (*from == SS2 && len >= 4)    /* code set 2 */
     304             :         {
     305           0 :             from++;             /* skip the SS2 byte; three data bytes follow */
     306           0 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);     /* cast makes the 24-bit shift unsigned; SS2 lands in bits 24-31 */
     307           0 :             *to |= *from++ << 8;
     308           0 :             *to |= *from++;
     309           0 :             len -= 4;
     310             :         }
     311           0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused?) */
     312             :         {
     313           0 :             from++;             /* skip the SS3 byte; the constant is packed in below */
     314           0 :             *to = (SS3 << 16) | (*from++ << 8);     /* SS3 in bits 16-23, first data byte in bits 8-15 */
     315           0 :             *to |= *from++;     /* second data byte in bits 0-7 */
     316           0 :             len -= 3;
     317             :         }
     318           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
     319             :         {
     320           0 :             *to = *from++ << 8; /* first byte in bits 8-15 */
     321           0 :             *to |= *from++;     /* second byte in bits 0-7 */
     322           0 :             len -= 2;
     323             :         }
     324             :         else
     325             :         {
     326           0 :             *to = *from++;      /* single byte; also reached by any lead byte when len == 1 */
     327           0 :             len--;
     328             :         }
     329           0 :         to++;
     330           0 :         cnt++;
     331             :     }
     332           0 :     *to = 0;                    /* a terminating zero is always written after the output */
     333           0 :     return cnt;
     334             : }
     335             : 
     336             : static int          /* byte length of an EUC_TW character, decided from its first byte only */
     337           6 : pg_euctw_mblen(const unsigned char *s)
     338             : {
     339             :     int         len;
     340             : 
     341           6 :     if (*s == SS2)
     342           0 :         len = 4;            /* SS2 plus three data bytes, matching pg_euctw2wchar_with_len() */
     343           6 :     else if (*s == SS3)
     344           0 :         len = 3;            /* SS3 plus two data bytes */
     345           6 :     else if (IS_HIGHBIT_SET(*s))
     346           6 :         len = 2;            /* any other high-bit lead byte starts a two-byte character */
     347             :     else
     348           0 :         len = 1;            /* high bit clear: single byte */
     349           6 :     return len;
     350             : }
     351             : 
     352             : static int          /* display width of an EUC_TW character; same decisions as pg_euc_dsplen() */
     353           0 : pg_euctw_dsplen(const unsigned char *s)
     354             : {
     355             :     int         len;
     356             : 
     357           0 :     if (*s == SS2)
     358           0 :         len = 2;            /* all three multibyte cases report two columns */
     359           0 :     else if (*s == SS3)
     360           0 :         len = 2;
     361           0 :     else if (IS_HIGHBIT_SET(*s))
     362           0 :         len = 2;
     363             :     else
     364           0 :         len = pg_ascii_dsplen(s);   /* high bit clear: defer to the ASCII rules (0, -1 or 1) */
     365           0 :     return len;
     366             : }
     367             : 
     368             : /*
     369             :  * Convert pg_wchar to EUC_* encoding.
     370             :  * caller must allocate enough space for "to", including a trailing zero!
     371             :  * len: length of from.
     372             :  * "from" not necessarily null terminated.
     373             :  */
     374             : static int          /* returns the number of bytes written to "to", not counting the terminator */
     375           0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     376             : {
     377           0 :     int         cnt = 0;    /* output bytes so far (len, by contrast, counts input pg_wchars) */
     378             : 
     379           0 :     while (len > 0 && *from)    /* stop after len pg_wchars or at the first zero pg_wchar */
     380             :     {
     381             :         unsigned char c;    /* most significant non-zero byte of *from, when there is one above bits 0-7 */
     382             : 
     383           0 :         if ((c = (*from >> 24)))    /* bits 24-31 non-zero: emit four bytes, most significant first */
     384             :         {
     385           0 :             *to++ = c;
     386           0 :             *to++ = (*from >> 16) & 0xff;
     387           0 :             *to++ = (*from >> 8) & 0xff;
     388           0 :             *to++ = *from & 0xff;
     389           0 :             cnt += 4;
     390             :         }
     391           0 :         else if ((c = (*from >> 16)))   /* bits 16-23 non-zero: emit three bytes */
     392             :         {
     393           0 :             *to++ = c;
     394           0 :             *to++ = (*from >> 8) & 0xff;
     395           0 :             *to++ = *from & 0xff;
     396           0 :             cnt += 3;
     397             :         }
     398           0 :         else if ((c = (*from >> 8)))    /* bits 8-15 non-zero: emit two bytes */
     399             :         {
     400           0 :             *to++ = c;
     401           0 :             *to++ = *from & 0xff;
     402           0 :             cnt += 2;
     403             :         }
     404             :         else
     405             :         {
     406           0 :             *to++ = *from;      /* value fits in one byte */
     407           0 :             cnt++;
     408             :         }
     409           0 :         from++;
     410           0 :         len--;
     411             :     }
     412           0 :     *to = 0;                    /* a terminating zero byte is always written after the output */
     413           0 :     return cnt;
     414             : }
     415             : 
     416             : 
     417             : /*
     418             :  * JOHAB
     419             :  */
     420             : static int          /* byte length of a JOHAB character */
     421          24 : pg_johab_mblen(const unsigned char *s)
     422             : {
     423          24 :     return pg_euc_mblen(s);     /* reuses the generic EUC lead-byte rules */
     424             : }
     425             : 
     426             : static int          /* display width of a JOHAB character */
     427           0 : pg_johab_dsplen(const unsigned char *s)
     428             : {
     429           0 :     return pg_euc_dsplen(s);    /* reuses the generic EUC width rules */
     430             : }
     431             : 
     432             : /*
     433             :  * convert UTF8 string to pg_wchar (UCS-4)
     434             :  * caller must allocate enough space for "to", including a trailing zero!
     435             :  * len: length of from.
     436             :  * "from" not necessarily null terminated.
     437             :  */
     438             : static int          /* returns the number of pg_wchars stored, not counting the terminator */
     439     7711086 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     440             : {
     441     7711086 :     int         cnt = 0;    /* characters converted so far */
     442             :     uint32      c1,         /* payload bits of the lead byte ... */
     443             :                 c2,         /* ... and of each following byte, in order */
     444             :                 c3,
     445             :                 c4;
     446             : 
     447   158166622 :     while (len > 0 && *from)    /* stop after len bytes or at the first NUL byte */
     448             :     {
     449   150455536 :         if ((*from & 0x80) == 0)    /* 0xxxxxxx: one byte */
     450             :         {
     451   150455080 :             *to = *from++;
     452   150455080 :             len--;
     453             :         }
     454         456 :         else if ((*from & 0xe0) == 0xc0)    /* 110xxxxx: two-byte sequence */
     455             :         {
     456         364 :             if (len < 2)
     457           0 :                 break;          /* drop trailing incomplete char */
     458         364 :             c1 = *from++ & 0x1f;
     459         364 :             c2 = *from++ & 0x3f;    /* following bytes are masked, not validated */
     460         364 :             *to = (c1 << 6) | c2;
     461         364 :             len -= 2;
     462             :         }
     463          92 :         else if ((*from & 0xf0) == 0xe0)    /* 1110xxxx: three-byte sequence */
     464             :         {
     465          92 :             if (len < 3)
     466           0 :                 break;          /* drop trailing incomplete char */
     467          92 :             c1 = *from++ & 0x0f;
     468          92 :             c2 = *from++ & 0x3f;
     469          92 :             c3 = *from++ & 0x3f;
     470          92 :             *to = (c1 << 12) | (c2 << 6) | c3;
     471          92 :             len -= 3;
     472             :         }
     473           0 :         else if ((*from & 0xf8) == 0xf0)    /* 11110xxx: four-byte sequence */
     474             :         {
     475           0 :             if (len < 4)
     476           0 :                 break;          /* drop trailing incomplete char */
     477           0 :             c1 = *from++ & 0x07;
     478           0 :             c2 = *from++ & 0x3f;
     479           0 :             c3 = *from++ & 0x3f;
     480           0 :             c4 = *from++ & 0x3f;
     481           0 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     482           0 :             len -= 4;
     483             :         }
     484             :         else
     485             :         {
     486             :             /* treat a bogus char as length 1; not ours to raise error */
     487           0 :             *to = *from++;
     488           0 :             len--;
     489             :         }
     490   150455536 :         to++;
     491   150455536 :         cnt++;
     492             :     }
     493     7711086 :     *to = 0;                    /* written on the early "break" paths too */
     494     7711086 :     return cnt;
     495             : }
     496             : 
     497             : 
     498             : /*
     499             :  * Trivial conversion from pg_wchar to UTF-8.
     500             :  * caller should allocate enough space for "to"
     501             :  * len: length of from.
     502             :  * "from" not necessarily null terminated.
     503             :  */
     504             : static int          /* returns the number of bytes written to "to", not counting the terminator */
     505     1114944 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     506             : {
     507     1114944 :     int         cnt = 0;    /* output bytes so far (len, by contrast, counts input pg_wchars) */
     508             : 
     509    16790534 :     while (len > 0 && *from)    /* stop after len pg_wchars or at the first zero pg_wchar */
     510             :     {
     511             :         int         char_len;   /* byte length of the character just written */
     512             : 
     513    15675590 :         unicode_to_utf8(*from, to);     /* presumably writes the UTF-8 bytes for *from at "to" (defined elsewhere) */
     514    15675590 :         char_len = pg_utf_mblen(to);    /* length is taken from the lead byte now at "to" */
     515    15675590 :         cnt += char_len;
     516    15675590 :         to += char_len;
     517    15675590 :         from++;
     518    15675590 :         len--;
     519             :     }
     520     1114944 :     *to = 0;                    /* a terminating zero byte is always written after the output */
     521     1114944 :     return cnt;
     522             : }
     523             : 
     524             : /*
     525             :  * Return the byte length of a UTF8 character pointed to by s
     526             :  *
     527             :  * Note: in the current implementation we do not support UTF8 sequences
     528             :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     529             :  * We return "1" for any leading byte that is either flat-out illegal or
     530             :  * indicates a length larger than we support.
     531             :  *
     532             :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     533             :  * other places would need to be fixed to change this.
     534             :  */
     535             : int                 /* 1..4, decided from the first byte only */
     536   321383932 : pg_utf_mblen(const unsigned char *s)
     537             : {
     538             :     int         len;
     539             : 
     540   321383932 :     if ((*s & 0x80) == 0)       /* 0xxxxxxx */
     541   321355152 :         len = 1;
     542       28780 :     else if ((*s & 0xe0) == 0xc0)   /* 110xxxxx */
     543       14794 :         len = 2;
     544       13986 :     else if ((*s & 0xf0) == 0xe0)   /* 1110xxxx */
     545        9440 :         len = 3;
     546        4546 :     else if ((*s & 0xf8) == 0xf0)   /* 11110xxx */
     547        4372 :         len = 4;
     548             : #ifdef NOT_USED                 /* 5- and 6-byte forms, compiled only if NOT_USED is defined */
     549             :     else if ((*s & 0xfc) == 0xf8)
     550             :         len = 5;
     551             :     else if ((*s & 0xfe) == 0xfc)
     552             :         len = 6;
     553             : #endif
     554             :     else
     555         174 :         len = 1;                /* any other lead byte is treated as a one-byte character */
     556   321383932 :     return len;
     557             : }
     558             : 
     559             : /*
     560             :  * This is an implementation of wcwidth() and wcswidth() as defined in
     561             :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     562             :  * <http://www.unix.org/online.html>
     563             :  *
     564             :  * Markus Kuhn -- 2001-09-08 -- public domain
     565             :  *
     566             :  * customised for PostgreSQL
     567             :  *
     568             :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     569             :  */
     570             : 
     571             : struct mbinterval       /* one entry of an interval table searched by mbbisearch() */
     572             : {
     573             :     unsigned int first;     /* lowest value in the interval (inclusive, per mbbisearch) */
     574             :     unsigned int last;      /* highest value in the interval (inclusive, per mbbisearch) */
     575             : };
     576             : 
     577             : /* auxiliary function for binary search in interval table */
     578             : static int          /* 1 if ucs lies inside some interval of table, else 0 */
     579   107263112 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)    /* max is the index of the last entry, not the entry count */
     580             : {
     581   107263112 :     int         min = 0;
     582             :     int         mid;
     583             : 
     584   107263112 :     if (ucs < table[0].first || ucs > table[max].last)  /* quick reject: outside the span of the whole table */
     585   107252808 :         return 0;
     586       90936 :     while (max >= min)          /* binary search; assumes the intervals are sorted ascending */
     587             :     {
     588       81376 :         mid = (min + max) / 2;
     589       81376 :         if (ucs > table[mid].last)
     590       18032 :             min = mid + 1;      /* ucs is above this interval */
     591       63344 :         else if (ucs < table[mid].first)
     592       62600 :             max = mid - 1;      /* ucs is below this interval */
     593             :         else
     594         744 :             return 1;           /* first <= ucs <= last */
     595             :     }
     596             : 
     597        9560 :     return 0;                   /* ucs falls in a gap between intervals */
     598             : }
     599             : 
     600             : 
     601             : /* The following functions define the column width of an ISO 10646
     602             :  * character as follows:
     603             :  *
     604             :  *    - The null character (U+0000) has a column width of 0.
     605             :  *
     606             :  *    - Other C0/C1 control characters and DEL will lead to a return
     607             :  *      value of -1.
     608             :  *
     609             :  *    - Non-spacing and enclosing combining characters (general
     610             :  *      category code Mn, Me or Cf in the Unicode database) have a
     611             :  *      column width of 0.
     612             :  *
     613             :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     614             :  *      FullWidth (F) category as defined in Unicode Technical
     615             :  *      Report #11 have a column width of 2.
     616             :  *
     617             :  *    - All remaining characters (including all printable
     618             :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     619             :  *      etc.) have a column width of 1.
     620             :  *
     621             :  * This implementation assumes that wchar_t characters are encoded
     622             :  * in ISO 10646.
     623             :  */
     624             : 
     625             : static int          /* column width of ucs: 0, -1, 1 or 2 */
     626    53694424 : ucs_wcwidth(pg_wchar ucs)
     627             : {
     628             : #include "common/unicode_nonspacing_table.h"    /* presumably defines nonspacing[], searched below */
     629             : #include "common/unicode_east_asian_fw_table.h" /* presumably defines east_asian_fw[], searched below */
     630             : 
     631             :     /* NUL has zero width; the control-character test follows */
     632    53694424 :     if (ucs == 0)
     633           0 :         return 0;
     634             : 
     635    53694424 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)  /* below 0x20, 0x7f..0x9f, or above 0x10ffff */
     636       62544 :         return -1;
     637             : 
     638             :     /*
     639             :      * binary search in table of non-spacing characters
     640             :      *
     641             :      * XXX: In the official Unicode sources, it is possible for a character to
     642             :      * be described as both non-spacing and wide at the same time. As of
     643             :      * Unicode 13.0, treating the non-spacing property as the determining
     644             :      * factor for display width leads to the correct behavior, so do that
     645             :      * search first.
     646             :      */
     647    53631880 :     if (mbbisearch(ucs, nonspacing,
     648             :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))     /* index of the last table entry */
     649         648 :         return 0;
     650             : 
     651             :     /* binary search in table of wide characters */
     652    53631232 :     if (mbbisearch(ucs, east_asian_fw,
     653             :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))  /* index of the last table entry */
     654          96 :         return 2;
     655             : 
     656    53631136 :     return 1;                   /* in neither table: one column */
     657             : }
     658             : 
     659             : static int          /* display width of the UTF-8 character at s */
     660    53694424 : pg_utf_dsplen(const unsigned char *s)
     661             : {
     662    53694424 :     return ucs_wcwidth(utf8_to_unicode(s));     /* utf8_to_unicode() is defined elsewhere; its result is passed straight to ucs_wcwidth() */
     663             : }
     664             : 
     665             : /*
     666             :  * convert mule internal code to pg_wchar
     667             :  * caller should allocate enough space for "to"
     668             :  * len: length of from.
     669             :  * "from" not necessarily null terminated.
     670             :  */
     671             : static int          /* returns the number of pg_wchars stored, not counting the terminator */
     672           0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     673             : {
     674           0 :     int         cnt = 0;    /* characters converted so far */
     675             : 
     676           0 :     while (len > 0 && *from)    /* stop after len bytes or at the first NUL byte */
     677             :     {
     678           0 :         if (IS_LC1(*from) && len >= 2)      /* two bytes in: lead byte to bits 16-23, next byte to bits 0-7 */
     679             :         {
     680           0 :             *to = *from++ << 16;
     681           0 :             *to |= *from++;
     682           0 :             len -= 2;
     683             :         }
     684           0 :         else if (IS_LCPRV1(*from) && len >= 3)  /* three bytes in; the first is dropped */
     685             :         {
     686           0 :             from++;             /* byte matched by IS_LCPRV1 is skipped and not stored */
     687           0 :             *to = *from++ << 16;
     688           0 :             *to |= *from++;
     689           0 :             len -= 3;
     690             :         }
     691           0 :         else if (IS_LC2(*from) && len >= 3)     /* three bytes in, packed into bits 16-23, 8-15 and 0-7 */
     692             :         {
     693           0 :             *to = *from++ << 16;
     694           0 :             *to |= *from++ << 8;
     695           0 :             *to |= *from++;
     696           0 :             len -= 3;
     697             :         }
     698           0 :         else if (IS_LCPRV2(*from) && len >= 4)  /* four bytes in; the first is dropped */
     699             :         {
     700           0 :             from++;             /* byte matched by IS_LCPRV2 is skipped and not stored */
     701           0 :             *to = *from++ << 16;
     702           0 :             *to |= *from++ << 8;
     703           0 :             *to |= *from++;
     704           0 :             len -= 4;
     705             :         }
     706             :         else
     707             :         {                       /* assume ASCII */
     708           0 :             *to = (unsigned char) *from++;  /* also reached when a lead byte's sequence does not fit in len */
     709           0 :             len--;
     710             :         }
     711           0 :         to++;
     712           0 :         cnt++;
     713             :     }
     714           0 :     *to = 0;                    /* a terminating zero is always written after the output */
     715           0 :     return cnt;
     716             : }
     717             : 
     718             : /*
     719             :  * convert pg_wchar to mule internal code
     720             :  * caller should allocate enough space for "to"
     721             :  * len: length of from.
     722             :  * "from" not necessarily null terminated.
                        *
                        * Processing stops after len pg_wchars or at a zero pg_wchar.  Bits
                        * 16..23 of each pg_wchar (lb) select the output form: an LC1 or LC2
                        * value is written as lb followed by one or two low bytes; a value in
                        * one of the private ranges is written with the matching LCPRV1_A/B
                        * or LCPRV2_A/B prefix byte first, then lb and the low byte(s);
                        * anything else yields only the low byte.  A terminating zero byte
                        * is stored; the return value is the number of bytes written, not
                        * counting that terminator.
     723             :  */
     724             : static int
     725           0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     726             : {
     727           0 :     int         cnt = 0;
     728             : 
     729           0 :     while (len > 0 && *from)
     730             :     {
     731             :         unsigned char lb;
     732             : 
     733           0 :         lb = (*from >> 16) & 0xff;
     734           0 :         if (IS_LC1(lb))
     735             :         {
     736           0 :             *to++ = lb;
     737           0 :             *to++ = *from & 0xff;
     738           0 :             cnt += 2;
     739             :         }
     740           0 :         else if (IS_LC2(lb))
     741             :         {
     742           0 :             *to++ = lb;
     743           0 :             *to++ = (*from >> 8) & 0xff;
     744           0 :             *to++ = *from & 0xff;
     745           0 :             cnt += 3;
     746             :         }
     747           0 :         else if (IS_LCPRV1_A_RANGE(lb))
     748             :         {
     749           0 :             *to++ = LCPRV1_A;
     750           0 :             *to++ = lb;
     751           0 :             *to++ = *from & 0xff;
     752           0 :             cnt += 3;
     753             :         }
     754           0 :         else if (IS_LCPRV1_B_RANGE(lb))
     755             :         {
     756           0 :             *to++ = LCPRV1_B;
     757           0 :             *to++ = lb;
     758           0 :             *to++ = *from & 0xff;
     759           0 :             cnt += 3;
     760             :         }
     761           0 :         else if (IS_LCPRV2_A_RANGE(lb))
     762             :         {
     763           0 :             *to++ = LCPRV2_A;
     764           0 :             *to++ = lb;
     765           0 :             *to++ = (*from >> 8) & 0xff;
     766           0 :             *to++ = *from & 0xff;
     767           0 :             cnt += 4;
     768             :         }
     769           0 :         else if (IS_LCPRV2_B_RANGE(lb))
     770             :         {
     771           0 :             *to++ = LCPRV2_B;
     772           0 :             *to++ = lb;
     773           0 :             *to++ = (*from >> 8) & 0xff;
     774           0 :             *to++ = *from & 0xff;
     775           0 :             cnt += 4;
     776             :         }
     777             :         else
     778             :         {
     779           0 :             *to++ = *from & 0xff;
     780           0 :             cnt += 1;
     781             :         }
     782           0 :         from++;
     783           0 :         len--;
     784             :     }
     785           0 :     *to = 0;
     786           0 :     return cnt;
     787             : }
     788             : 
     789             : /* exported for direct use by conv.c */
     790             : int
     791        3024 : pg_mule_mblen(const unsigned char *s)
     792             : {
                           /*
                            * Byte length of the MULE character whose first byte is *s.  Only
                            * that first byte is examined: 2 for LC1, 3 for LCPRV1 and LC2,
                            * 4 for LCPRV2, and 1 for anything else.
                            */
     793             :     int         len;
     794             : 
     795        3024 :     if (IS_LC1(*s))
     796        1220 :         len = 2;
     797        1804 :     else if (IS_LCPRV1(*s))
     798           0 :         len = 3;
     799        1804 :     else if (IS_LC2(*s))
     800        1710 :         len = 3;
     801          94 :     else if (IS_LCPRV2(*s))
     802          40 :         len = 4;
     803             :     else
     804          54 :         len = 1;                /* assume ASCII */
     805        3024 :     return len;
     806             : }
     807             : 
     808             : static int
     809           0 : pg_mule_dsplen(const unsigned char *s)
     810             : {
     811             :     int         len;
     812             : 
     813             :     /*
                            * Returns 1 for LC1 and LCPRV1 lead bytes, 2 for LC2 and LCPRV2
                            * lead bytes, and 1 for anything else; only *s is examined.
                            *
     814             :      * Note: it's not really appropriate to assume that all multibyte charsets
     815             :      * are double-wide on screen.  But this seems an okay approximation for
     816             :      * the MULE charsets we currently support.
     817             :      */
     818             : 
     819           0 :     if (IS_LC1(*s))
     820           0 :         len = 1;
     821           0 :     else if (IS_LCPRV1(*s))
     822           0 :         len = 1;
     823           0 :     else if (IS_LC2(*s))
     824           0 :         len = 2;
     825           0 :     else if (IS_LCPRV2(*s))
     826           0 :         len = 2;
     827             :     else
     828           0 :         len = 1;                /* assume ASCII */
     829             : 
     830           0 :     return len;
     831             : }
     832             : 
     833             : /*
     834             :  * ISO8859-1
                        *
                        * pg_latin12wchar_with_len: widen each input byte to one pg_wchar,
                        * stopping after len bytes or at a zero byte.  A terminating zero is
                        * stored in "to"; the return value is the number of pg_wchars written
                        * before that terminator.
     835             :  */
     836             : static int
     837        1070 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     838             : {
     839        1070 :     int         cnt = 0;
     840             : 
     841       30004 :     while (len > 0 && *from)
     842             :     {
     843       28934 :         *to++ = *from++;
     844       28934 :         len--;
     845       28934 :         cnt++;
     846             :     }
     847        1070 :     *to = 0;
     848        1070 :     return cnt;
     849             : }
     850             : 
     851             : /*
     852             :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     853             :  * high bits.
     854             :  * caller should allocate enough space for "to"
     855             :  * len: length of from.
     856             :  * "from" not necessarily null terminated.
                        *
                        * Stops after len pg_wchars or at a zero pg_wchar, stores a
                        * terminating zero byte, and returns the number of bytes written
                        * before that terminator.
     857             :  */
     858             : static int
     859         150 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     860             : {
     861         150 :     int         cnt = 0;
     862             : 
     863        1356 :     while (len > 0 && *from)
     864             :     {
     865        1206 :         *to++ = *from++;
     866        1206 :         len--;
     867        1206 :         cnt++;
     868             :     }
     869         150 :     *to = 0;
     870         150 :     return cnt;
     871             : }
     872             : 
     873             : static int
     874        8428 : pg_latin1_mblen(const unsigned char *s)
     875             : {
                           /* Every character occupies exactly one byte; s is not examined. */
     876        8428 :     return 1;
     877             : }
     878             : 
     879             : static int
     880        1232 : pg_latin1_dsplen(const unsigned char *s)
     881             : {
                           /* Uses the ASCII rule: the result of pg_ascii_dsplen() is returned as is. */
     882        1232 :     return pg_ascii_dsplen(s);
     883             : }
     884             : 
     885             : /*
     886             :  * SJIS
                        *
                        * pg_sjis_mblen: byte length implied by the first byte alone.  A byte
                        * in 0xa1..0xdf is a one-byte character, any other byte with the high
                        * bit set starts a two-byte character, and everything else is one
                        * byte.
     887             :  */
     888             : static int
     889        1690 : pg_sjis_mblen(const unsigned char *s)
     890             : {
     891             :     int         len;
     892             : 
     893        1690 :     if (*s >= 0xa1 && *s <= 0xdf)
     894           0 :         len = 1;                /* 1 byte kana? */
     895        1690 :     else if (IS_HIGHBIT_SET(*s))
     896        1314 :         len = 2;                /* kanji? */
     897             :     else
     898         376 :         len = 1;                /* should be ASCII */
     899        1690 :     return len;
     900             : }
     901             : 
     902             : static int
     903           0 : pg_sjis_dsplen(const unsigned char *s)
     904             : {
                           /*
                            * Width decided from the first byte: 1 for a byte in 0xa1..0xdf,
                            * 2 for any other byte with the high bit set, and the result of
                            * pg_ascii_dsplen() otherwise.
                            */
     905             :     int         len;
     906             : 
     907           0 :     if (*s >= 0xa1 && *s <= 0xdf)
     908           0 :         len = 1;                /* 1 byte kana? */
     909           0 :     else if (IS_HIGHBIT_SET(*s))
     910           0 :         len = 2;                /* kanji? */
     911             :     else
     912           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     913           0 :     return len;
     914             : }
     915             : 
     916             : /*
     917             :  * Big5
                        *
                        * pg_big5_mblen: 2 when the first byte has its high bit set,
                        * otherwise 1.  Only the first byte is examined.
     918             :  */
     919             : static int
     920         492 : pg_big5_mblen(const unsigned char *s)
     921             : {
     922             :     int         len;
     923             : 
     924         492 :     if (IS_HIGHBIT_SET(*s))
     925         438 :         len = 2;                /* two-byte character */
     926             :     else
     927          54 :         len = 1;                /* should be ASCII */
     928         492 :     return len;
     929             : }
     930             : 
     931             : static int
     932           0 : pg_big5_dsplen(const unsigned char *s)
     933             : {
                           /*
                            * Width is 2 when the first byte has its high bit set; otherwise
                            * the result of pg_ascii_dsplen() is used.
                            */
     934             :     int         len;
     935             : 
     936           0 :     if (IS_HIGHBIT_SET(*s))
     937           0 :         len = 2;                /* treated as double-width */
     938             :     else
     939           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     940           0 :     return len;
     941             : }
     942             : 
     943             : /*
     944             :  * GBK
                        *
                        * pg_gbk_mblen: 2 when the first byte has its high bit set,
                        * otherwise 1.  Only the first byte is examined.
     945             :  */
     946             : static int
     947         556 : pg_gbk_mblen(const unsigned char *s)
     948             : {
     949             :     int         len;
     950             : 
     951         556 :     if (IS_HIGHBIT_SET(*s))
     952         416 :         len = 2;                /* two-byte character */
     953             :     else
     954         140 :         len = 1;                /* should be ASCII */
     955         556 :     return len;
     956             : }
     957             : 
     958             : static int
     959           0 : pg_gbk_dsplen(const unsigned char *s)
     960             : {
                           /*
                            * Width is 2 when the first byte has its high bit set; otherwise
                            * the result of pg_ascii_dsplen() is used.
                            */
     961             :     int         len;
     962             : 
     963           0 :     if (IS_HIGHBIT_SET(*s))
     964           0 :         len = 2;                /* treated as double-width */
     965             :     else
     966           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     967           0 :     return len;
     968             : }
     969             : 
     970             : /*
     971             :  * UHC
                        *
                        * pg_uhc_mblen: 2 when the first byte has its high bit set,
                        * otherwise 1.  Only the first byte is examined.
     972             :  */
     973             : static int
     974          24 : pg_uhc_mblen(const unsigned char *s)
     975             : {
     976             :     int         len;
     977             : 
     978          24 :     if (IS_HIGHBIT_SET(*s))
     979          24 :         len = 2;                /* 2byte? */
     980             :     else
     981           0 :         len = 1;                /* should be ASCII */
     982          24 :     return len;
     983             : }
     984             : 
     985             : static int
     986           0 : pg_uhc_dsplen(const unsigned char *s)
     987             : {
                           /*
                            * Width is 2 when the first byte has its high bit set; otherwise
                            * the result of pg_ascii_dsplen() is used.
                            */
     988             :     int         len;
     989             : 
     990           0 :     if (IS_HIGHBIT_SET(*s))
     991           0 :         len = 2;                /* 2byte? */
     992             :     else
     993           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     994           0 :     return len;
     995             : }
     996             : 
     997             : /*
     998             :  * GB18030
     999             :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
    1000             :  */
    1001             : 
    1002             : /*
    1003             :  * Unlike all other mblen() functions, this also looks at the second byte of
    1004             :  * the input.  However, if you only pass the first byte of a multi-byte
    1005             :  * string, and \0 as the second byte, this still works in a predictable way:
    1006             :  * a 4-byte character will be reported as two 2-byte characters.  That's
    1007             :  * enough for all current uses, as a client-only encoding.  It works that
    1008             :  * way, because in any valid 4-byte GB18030-encoded character, the third and
    1009             :  * fourth byte look like a 2-byte encoded character, when looked at
    1010             :  * separately.
                        *
                        * Result: 1 when the first byte has no high bit; otherwise 4 when the
                        * second byte lies in 0x30..0x39, and 2 in every other case.
    1011             :  */
    1012             : static int
    1013         926 : pg_gb18030_mblen(const unsigned char *s)
    1014             : {
    1015             :     int         len;
    1016             : 
    1017         926 :     if (!IS_HIGHBIT_SET(*s))
    1018         540 :         len = 1;                /* ASCII */
    1019         386 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1020         138 :         len = 4;
    1021             :     else
    1022         248 :         len = 2;
    1023         926 :     return len;
    1024             : }
    1025             : 
    1026             : static int
    1027           0 : pg_gb18030_dsplen(const unsigned char *s)
    1028             : {
                           /*
                            * Width is 2 for any first byte with the high bit set (the second
                            * byte is not consulted here, so 2-byte and 4-byte forms are not
                            * distinguished); otherwise the result of pg_ascii_dsplen() is used.
                            */
    1029             :     int         len;
    1030             : 
    1031           0 :     if (IS_HIGHBIT_SET(*s))
    1032           0 :         len = 2;
    1033             :     else
    1034           0 :         len = pg_ascii_dsplen(s);   /* ASCII */
    1035           0 :     return len;
    1036             : }
    1037             : 
    1038             : /*
    1039             :  *-------------------------------------------------------------------
    1040             :  * multibyte sequence validators
    1041             :  *
    1042             :  * The verifychar functions accept "s", a pointer to the first byte of a
    1043             :  * string, and "len", the remaining length of the string.  If there is a
    1044             :  * validly encoded character beginning at *s, return its length in bytes;
    1045             :  * else return -1.
    1046             :  *
    1047             :  * The verifystr functions also accept "s", a pointer to a string and "len",
    1048             :  * the length of the string.  They verify the whole string, and return the
    1049             :  * number of input bytes (<= len) that are valid.  In other words, if the
    1050             :  * whole string is valid, verifystr returns "len", otherwise it returns the
    1051             :  * byte offset of the first invalid character.  The verifystr functions must
    1052             :  * test for and reject zeroes in the input.
    1053             :  *
    1054             :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
    1055             :  * they must test for and reject zeroes in any additional bytes of a
    1056             :  * multibyte character.  Note that this definition allows the function for a
    1057             :  * single-byte encoding to be just "return 1".
    1058             :  *-------------------------------------------------------------------
    1059             :  */
    1060             : static int
    1061         322 : pg_ascii_verifychar(const unsigned char *s, int len)
    1062             : {
                           /*
                            * Always reports a one-byte character.  Neither s nor len is
                            * inspected; per the verifychar contract the caller has already
                            * ensured len > 0 and *s != '\0'.
                            */
    1063         322 :     return 1;
    1064             : }
    1065             : 
    1066             : static int
    1067      423392 : pg_ascii_verifystr(const unsigned char *s, int len)
    1068             : {
                           /*
                            * The only check made is for zero bytes: memchr() looks for the
                            * first one within the len bytes at s.  Returns len when none is
                            * found, otherwise the offset of that zero byte.
                            */
    1069      423392 :     const unsigned char *nullpos = memchr(s, 0, len);
    1070             : 
    1071      423392 :     if (nullpos == NULL)
    1072      423392 :         return len;
    1073             :     else
    1074           0 :         return nullpos - s;
    1075             : }
    1076             : 
    1077             : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
                       /* True when c lies in 0xa1..0xfe inclusive.  Note that c is evaluated twice. */
    1078             : 
    1079             : static int
    1080         504 : pg_eucjp_verifychar(const unsigned char *s, int len)
    1081             : {
                           /*
                            * Validate one EUC-JP character at s with len bytes available.
                            * SS2 introduces a 2-byte character whose second byte must be in
                            * 0xa1..0xdf.  SS3 introduces a 3-byte character whose two trailing
                            * bytes must satisfy IS_EUC_RANGE_VALID.  Any other first byte with
                            * the high bit set starts a 2-byte character in which both bytes
                            * must satisfy IS_EUC_RANGE_VALID.  Everything else is one byte.
                            *
                            * Returns the character length, or -1 when fewer than that many
                            * bytes are available or a byte is out of range.  The range tests
                            * also reject zero bytes in trailing positions.
                            */
    1082             :     int         l;
    1083             :     unsigned char c1,
    1084             :                 c2;
    1085             : 
    1086         504 :     c1 = *s++;
    1087             : 
    1088         504 :     switch (c1)
    1089             :     {
    1090           0 :         case SS2:               /* JIS X 0201 */
    1091           0 :             l = 2;
    1092           0 :             if (l > len)
    1093           0 :                 return -1;
    1094           0 :             c2 = *s++;
    1095           0 :             if (c2 < 0xa1 || c2 > 0xdf)
    1096           0 :                 return -1;
    1097           0 :             break;
    1098             : 
    1099           0 :         case SS3:               /* JIS X 0212 */
    1100           0 :             l = 3;
    1101           0 :             if (l > len)
    1102           0 :                 return -1;
    1103           0 :             c2 = *s++;
    1104           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1105           0 :                 return -1;
    1106           0 :             c2 = *s++;
    1107           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1108           0 :                 return -1;
    1109           0 :             break;
    1110             : 
    1111         504 :         default:
    1112         504 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1113             :             {
    1114         504 :                 l = 2;
    1115         504 :                 if (l > len)
    1116          84 :                     return -1;
    1117         420 :                 if (!IS_EUC_RANGE_VALID(c1))
    1118          24 :                     return -1;
    1119         396 :                 c2 = *s++;
    1120         396 :                 if (!IS_EUC_RANGE_VALID(c2))
    1121         180 :                     return -1;
    1122             :             }
    1123             :             else
    1124             :                 /* must be ASCII */
    1125             :             {
    1126           0 :                 l = 1;
    1127             :             }
    1128         216 :             break;
    1129             :     }
    1130             : 
    1131         216 :     return l;
    1132             : }
    1133             : 
    1134             : static int
    1135         300 : pg_eucjp_verifystr(const unsigned char *s, int len)
    1136             : {
                           /*
                            * Walk the input one character at a time.  A byte without the
                            * high bit is accepted directly unless it is zero; any other byte
                            * is checked by pg_eucjp_verifychar().  The loop ends at the first
                            * zero byte or invalid character, and the return value is the
                            * number of bytes accepted up to that point (len if all passed).
                            */
    1137         300 :     const unsigned char *start = s;
    1138             : 
    1139         930 :     while (len > 0)
    1140             :     {
    1141             :         int         l;
    1142             : 
    1143             :         /* fast path for ASCII-subset characters */
    1144         846 :         if (!IS_HIGHBIT_SET(*s))
    1145             :         {
    1146         594 :             if (*s == '\0')
    1147          72 :                 break;
    1148         522 :             l = 1;
    1149             :         }
    1150             :         else
    1151             :         {
    1152         252 :             l = pg_eucjp_verifychar(s, len);
    1153         252 :             if (l == -1)
    1154         144 :                 break;
    1155             :         }
    1156         630 :         s += l;
    1157         630 :         len -= l;
    1158             :     }
    1159             : 
    1160         300 :     return s - start;
    1161             : }
    1162             : 
    1163             : static int
    1164          36 : pg_euckr_verifychar(const unsigned char *s, int len)
    1165             : {
                           /*
                            * Validate one character at s with len bytes available.  A first
                            * byte with the high bit set starts a 2-byte character in which
                            * both bytes must satisfy IS_EUC_RANGE_VALID; any other first byte
                            * is a 1-byte character.  Returns the length, or -1 when only one
                            * byte is available for a 2-byte character or a byte is out of
                            * range.
                            */
    1166             :     int         l;
    1167             :     unsigned char c1,
    1168             :                 c2;
    1169             : 
    1170          36 :     c1 = *s++;
    1171             : 
    1172          36 :     if (IS_HIGHBIT_SET(c1))
    1173             :     {
    1174          36 :         l = 2;
    1175          36 :         if (l > len)
    1176          12 :             return -1;
    1177          24 :         if (!IS_EUC_RANGE_VALID(c1))
    1178          24 :             return -1;
    1179           0 :         c2 = *s++;
    1180           0 :         if (!IS_EUC_RANGE_VALID(c2))
    1181           0 :             return -1;
    1182             :     }
    1183             :     else
    1184             :         /* must be ASCII */
    1185             :     {
    1186           0 :         l = 1;
    1187             :     }
    1188             : 
    1189           0 :     return l;
    1190             : }
    1191             : 
    1192             : static int
    1193          60 : pg_euckr_verifystr(const unsigned char *s, int len)
    1194             : {
                           /*
                            * Walk the input one character at a time.  A byte without the
                            * high bit is accepted directly unless it is zero; any other byte
                            * is checked by pg_euckr_verifychar().  The loop ends at the first
                            * zero byte or invalid character, and the return value is the
                            * number of bytes accepted up to that point (len if all passed).
                            */
    1195          60 :     const unsigned char *start = s;
    1196             : 
    1197         132 :     while (len > 0)
    1198             :     {
    1199             :         int         l;
    1200             : 
    1201             :         /* fast path for ASCII-subset characters */
    1202         108 :         if (!IS_HIGHBIT_SET(*s))
    1203             :         {
    1204          72 :             if (*s == '\0')
    1205           0 :                 break;
    1206          72 :             l = 1;
    1207             :         }
    1208             :         else
    1209             :         {
    1210          36 :             l = pg_euckr_verifychar(s, len);
    1211          36 :             if (l == -1)
    1212          36 :                 break;
    1213             :         }
    1214          72 :         s += l;
    1215          72 :         len -= l;
    1216             :     }
    1217             : 
    1218          60 :     return s - start;
    1219             : }
    1220             : 
    1221             : /* EUC-CN byte sequences are exactly the same as EUC-KR, so reuse its verifiers */
    1222             : #define pg_euccn_verifychar pg_euckr_verifychar
    1223             : #define pg_euccn_verifystr  pg_euckr_verifystr
    1224             : 
    1225             : static int
    1226          18 : pg_euctw_verifychar(const unsigned char *s, int len)
    1227             : {
                           /*
                            * Validate one EUC-TW character at s with len bytes available.
                            * SS2 introduces a 4-byte character: the second byte must be in
                            * 0xa1..0xa7 and the third and fourth must satisfy
                            * IS_EUC_RANGE_VALID.  SS3 is always rejected.  Any other first
                            * byte with the high bit set starts a 2-byte character; only its
                            * second byte is range-checked.  Everything else is one byte.
                            *
                            * Returns the character length, or -1 when fewer than that many
                            * bytes are available or a checked byte is out of range.
                            */
    1228             :     int         l;
    1229             :     unsigned char c1,
    1230             :                 c2;
    1231             : 
    1232          18 :     c1 = *s++;
    1233             : 
    1234          18 :     switch (c1)
    1235             :     {
    1236           0 :         case SS2:               /* CNS 11643 Plane 1-7 */
    1237           0 :             l = 4;
    1238           0 :             if (l > len)
    1239           0 :                 return -1;
    1240           0 :             c2 = *s++;
    1241           0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1242           0 :                 return -1;
    1243           0 :             c2 = *s++;
    1244           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1245           0 :                 return -1;
    1246           0 :             c2 = *s++;
    1247           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1248           0 :                 return -1;
    1249           0 :             break;
    1250             : 
    1251           0 :         case SS3:               /* unused */
    1252           0 :             return -1;
    1253             : 
    1254          18 :         default:
    1255          18 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1256             :             {
    1257          18 :                 l = 2;
    1258          18 :                 if (l > len)
    1259           6 :                     return -1;
    1260             :                 /* no further range check on c1? */
    1261          12 :                 c2 = *s++;
    1262          12 :                 if (!IS_EUC_RANGE_VALID(c2))
    1263          12 :                     return -1;
    1264             :             }
    1265             :             else
    1266             :                 /* must be ASCII */
    1267             :             {
    1268           0 :                 l = 1;
    1269             :             }
    1270           0 :             break;
    1271             :     }
    1272           0 :     return l;
    1273             : }
    1274             : 
    1275             : static int
    1276          36 : pg_euctw_verifystr(const unsigned char *s, int len)
    1277             : {
                           /*
                            * Walk the input one character at a time.  A byte without the
                            * high bit is accepted directly unless it is zero; any other byte
                            * is checked by pg_euctw_verifychar().  The loop ends at the first
                            * zero byte or invalid character, and the return value is the
                            * number of bytes accepted up to that point (len if all passed).
                            */
    1278          36 :     const unsigned char *start = s;
    1279             : 
    1280          90 :     while (len > 0)
    1281             :     {
    1282             :         int         l;
    1283             : 
    1284             :         /* fast path for ASCII-subset characters */
    1285          72 :         if (!IS_HIGHBIT_SET(*s))
    1286             :         {
    1287          54 :             if (*s == '\0')
    1288           0 :                 break;
    1289          54 :             l = 1;
    1290             :         }
    1291             :         else
    1292             :         {
    1293          18 :             l = pg_euctw_verifychar(s, len);
    1294          18 :             if (l == -1)
    1295          18 :                 break;
    1296             :         }
    1297          54 :         s += l;
    1298          54 :         len -= l;
    1299             :     }
    1300             : 
    1301          36 :     return s - start;
    1302             : }
    1303             : 
    1304             : static int
    1305          18 : pg_johab_verifychar(const unsigned char *s, int len)
    1306             : {
                           /*
                            * Validate one character at s with len bytes available.  The
                            * expected length comes from pg_johab_mblen().  Returns -1 when
                            * fewer than that many bytes are available.  A first byte without
                            * the high bit is accepted as is; otherwise every trailing byte
                            * must satisfy IS_EUC_RANGE_VALID, else -1 is returned.  On success
                            * the length reported by pg_johab_mblen() is returned.
                            */
    1307             :     int         l,
    1308             :                 mbl;
    1309             :     unsigned char c;
    1310             : 
    1311          18 :     l = mbl = pg_johab_mblen(s);
    1312             : 
    1313          18 :     if (len < l)
    1314           6 :         return -1;
    1315             : 
    1316          12 :     if (!IS_HIGHBIT_SET(*s))
    1317           0 :         return mbl;
    1318             : 
    1319          12 :     while (--l > 0)
    1320             :     {
    1321          12 :         c = *++s;
    1322          12 :         if (!IS_EUC_RANGE_VALID(c))
    1323          12 :             return -1;
    1324             :     }
    1325           0 :     return mbl;
    1326             : }
    1327             : 
    1328             : static int
    1329          24 : pg_johab_verifystr(const unsigned char *s, int len)
    1330             : {
                           /*
                            * Walk the input one character at a time.  A byte without the
                            * high bit is accepted directly unless it is zero; any other byte
                            * is checked by pg_johab_verifychar().  The loop ends at the first
                            * zero byte or invalid character, and the return value is the
                            * number of bytes accepted up to that point (len if all passed).
                            */
    1331          24 :     const unsigned char *start = s;
    1332             : 
    1333          42 :     while (len > 0)
    1334             :     {
    1335             :         int         l;
    1336             : 
    1337             :         /* fast path for ASCII-subset characters */
    1338          36 :         if (!IS_HIGHBIT_SET(*s))
    1339             :         {
    1340          18 :             if (*s == '\0')
    1341           0 :                 break;
    1342          18 :             l = 1;
    1343             :         }
    1344             :         else
    1345             :         {
    1346          18 :             l = pg_johab_verifychar(s, len);
    1347          18 :             if (l == -1)
    1348          18 :                 break;
    1349             :         }
    1350          18 :         s += l;
    1351          18 :         len -= l;
    1352             :     }
    1353             : 
    1354          24 :     return s - start;
    1355             : }
    1356             : 
    1357             : static int
    1358        1350 : pg_mule_verifychar(const unsigned char *s, int len)
    1359             : {
                           /*
                            * Validate one MULE character at s with len bytes available.  The
                            * expected length comes from pg_mule_mblen().  Returns -1 when
                            * fewer than that many bytes are available or when any trailing
                            * byte lacks the high bit; otherwise returns the length.
                            */
    1360             :     int         l,
    1361             :                 mbl;
    1362             :     unsigned char c;
    1363             : 
    1364        1350 :     l = mbl = pg_mule_mblen(s);
    1365             : 
    1366        1350 :     if (len < l)
    1367         344 :         return -1;
    1368             : 
    1369        2032 :     while (--l > 0)
    1370             :     {
    1371        1348 :         c = *++s;
    1372        1348 :         if (!IS_HIGHBIT_SET(c))
    1373         322 :             return -1;
    1374             :     }
    1375         684 :     return mbl;
    1376             : }
    1377             : 
    1378             : static int
    1379         438 : pg_mule_verifystr(const unsigned char *s, int len)
    1380             : {
                           /*
                            * Walk the input one character at a time.  A byte without the
                            * high bit is accepted directly unless it is zero; any other byte
                            * is checked by pg_mule_verifychar().  The loop ends at the first
                            * zero byte or invalid character, and the return value is the
                            * number of bytes accepted up to that point (len if all passed).
                            */
    1381         438 :     const unsigned char *start = s;
    1382             : 
    1383        1290 :     while (len > 0)
    1384             :     {
    1385             :         int         l;
    1386             : 
    1387             :         /* fast path for ASCII-subset characters */
    1388        1122 :         if (!IS_HIGHBIT_SET(*s))
    1389             :         {
    1390         690 :             if (*s == '\0')
    1391          36 :                 break;
    1392         654 :             l = 1;
    1393             :         }
    1394             :         else
    1395             :         {
    1396         432 :             l = pg_mule_verifychar(s, len);
    1397         432 :             if (l == -1)
    1398         234 :                 break;
    1399             :         }
    1400         852 :         s += l;
    1401         852 :         len -= l;
    1402             :     }
    1403             : 
    1404         438 :     return s - start;
    1405             : }
    1406             : 
    1407             : static int
    1408        7156 : pg_latin1_verifychar(const unsigned char *s, int len)
    1409             : {
                           /*
                            * Always reports a one-byte character.  Neither s nor len is
                            * inspected; per the verifychar contract the caller has already
                            * ensured len > 0 and *s != '\0'.
                            */
    1410        7156 :     return 1;
    1411             : }
    1412             : 
    1413             : static int
    1414       11148 : pg_latin1_verifystr(const unsigned char *s, int len)
    1415             : {
                           /*
                            * The only check made is for zero bytes: memchr() looks for the
                            * first one within the len bytes at s.  Returns len when none is
                            * found, otherwise the offset of that zero byte.
                            */
    1416       11148 :     const unsigned char *nullpos = memchr(s, 0, len);
    1417             : 
    1418       11148 :     if (nullpos == NULL)
    1419       11040 :         return len;
    1420             :     else
    1421         108 :         return nullpos - s;
    1422             : }
    1423             : 
    1424             : static int
    1425        1002 : pg_sjis_verifychar(const unsigned char *s, int len)
    1426             : {
    1427             :     int         l,
    1428             :                 mbl;
    1429             :     unsigned char c1,
    1430             :                 c2;
    1431             : 
    1432        1002 :     l = mbl = pg_sjis_mblen(s);
    1433             : 
    1434        1002 :     if (len < l)
    1435         132 :         return -1;
    1436             : 
    1437         870 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1438           0 :         return mbl;
    1439             : 
    1440         870 :     c1 = *s++;
    1441         870 :     c2 = *s;
    1442         870 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1443         348 :         return -1;
    1444         522 :     return mbl;
    1445             : }
    1446             : 
    1447             : static int
    1448         546 : pg_sjis_verifystr(const unsigned char *s, int len)
    1449             : {
    1450         546 :     const unsigned char *start = s;
    1451             : 
    1452        2068 :     while (len > 0)
    1453             :     {
    1454             :         int         l;
    1455             : 
    1456             :         /* fast path for ASCII-subset characters */
    1457        1842 :         if (!IS_HIGHBIT_SET(*s))
    1458             :         {
    1459        1348 :             if (*s == '\0')
    1460          72 :                 break;
    1461        1276 :             l = 1;
    1462             :         }
    1463             :         else
    1464             :         {
    1465         494 :             l = pg_sjis_verifychar(s, len);
    1466         494 :             if (l == -1)
    1467         248 :                 break;
    1468             :         }
    1469        1522 :         s += l;
    1470        1522 :         len -= l;
    1471             :     }
    1472             : 
    1473         546 :     return s - start;
    1474             : }
    1475             : 
    1476             : static int
    1477         360 : pg_big5_verifychar(const unsigned char *s, int len)
    1478             : {
    1479             :     int         l,
    1480             :                 mbl;
    1481             : 
    1482         360 :     l = mbl = pg_big5_mblen(s);
    1483             : 
    1484         360 :     if (len < l)
    1485           6 :         return -1;
    1486             : 
    1487         354 :     if (l == 2 &&
    1488         354 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1489          12 :         s[1] == NONUTF8_INVALID_BYTE1)
    1490          12 :         return -1;
    1491             : 
    1492         576 :     while (--l > 0)
    1493             :     {
    1494         342 :         if (*++s == '\0')
    1495         108 :             return -1;
    1496             :     }
    1497             : 
    1498         234 :     return mbl;
    1499             : }
    1500             : 
    1501             : static int
    1502         162 : pg_big5_verifystr(const unsigned char *s, int len)
    1503             : {
    1504         162 :     const unsigned char *start = s;
    1505             : 
    1506         666 :     while (len > 0)
    1507             :     {
    1508             :         int         l;
    1509             : 
    1510             :         /* fast path for ASCII-subset characters */
    1511         594 :         if (!IS_HIGHBIT_SET(*s))
    1512             :         {
    1513         468 :             if (*s == '\0')
    1514          36 :                 break;
    1515         432 :             l = 1;
    1516             :         }
    1517             :         else
    1518             :         {
    1519         126 :             l = pg_big5_verifychar(s, len);
    1520         126 :             if (l == -1)
    1521          54 :                 break;
    1522             :         }
    1523         504 :         s += l;
    1524         504 :         len -= l;
    1525             :     }
    1526             : 
    1527         162 :     return s - start;
    1528             : }
    1529             : 
    1530             : static int
    1531         274 : pg_gbk_verifychar(const unsigned char *s, int len)
    1532             : {
    1533             :     int         l,
    1534             :                 mbl;
    1535             : 
    1536         274 :     l = mbl = pg_gbk_mblen(s);
    1537             : 
    1538         274 :     if (len < l)
    1539          54 :         return -1;
    1540             : 
    1541         220 :     if (l == 2 &&
    1542         220 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1543          28 :         s[1] == NONUTF8_INVALID_BYTE1)
    1544          28 :         return -1;
    1545             : 
    1546         384 :     while (--l > 0)
    1547             :     {
    1548         192 :         if (*++s == '\0')
    1549           0 :             return -1;
    1550             :     }
    1551             : 
    1552         192 :     return mbl;
    1553             : }
    1554             : 
    1555             : static int
    1556         256 : pg_gbk_verifystr(const unsigned char *s, int len)
    1557             : {
    1558         256 :     const unsigned char *start = s;
    1559             : 
    1560         658 :     while (len > 0)
    1561             :     {
    1562             :         int         l;
    1563             : 
    1564             :         /* fast path for ASCII-subset characters */
    1565         484 :         if (!IS_HIGHBIT_SET(*s))
    1566             :         {
    1567         242 :             if (*s == '\0')
    1568           0 :                 break;
    1569         242 :             l = 1;
    1570             :         }
    1571             :         else
    1572             :         {
    1573         242 :             l = pg_gbk_verifychar(s, len);
    1574         242 :             if (l == -1)
    1575          82 :                 break;
    1576             :         }
    1577         402 :         s += l;
    1578         402 :         len -= l;
    1579             :     }
    1580             : 
    1581         256 :     return s - start;
    1582             : }
    1583             : 
    1584             : static int
    1585          18 : pg_uhc_verifychar(const unsigned char *s, int len)
    1586             : {
    1587             :     int         l,
    1588             :                 mbl;
    1589             : 
    1590          18 :     l = mbl = pg_uhc_mblen(s);
    1591             : 
    1592          18 :     if (len < l)
    1593           6 :         return -1;
    1594             : 
    1595          12 :     if (l == 2 &&
    1596          12 :         s[0] == NONUTF8_INVALID_BYTE0 &&
    1597          12 :         s[1] == NONUTF8_INVALID_BYTE1)
    1598          12 :         return -1;
    1599             : 
    1600           0 :     while (--l > 0)
    1601             :     {
    1602           0 :         if (*++s == '\0')
    1603           0 :             return -1;
    1604             :     }
    1605             : 
    1606           0 :     return mbl;
    1607             : }
    1608             : 
    1609             : static int
    1610          24 : pg_uhc_verifystr(const unsigned char *s, int len)
    1611             : {
    1612          24 :     const unsigned char *start = s;
    1613             : 
    1614          42 :     while (len > 0)
    1615             :     {
    1616             :         int         l;
    1617             : 
    1618             :         /* fast path for ASCII-subset characters */
    1619          36 :         if (!IS_HIGHBIT_SET(*s))
    1620             :         {
    1621          18 :             if (*s == '\0')
    1622           0 :                 break;
    1623          18 :             l = 1;
    1624             :         }
    1625             :         else
    1626             :         {
    1627          18 :             l = pg_uhc_verifychar(s, len);
    1628          18 :             if (l == -1)
    1629          18 :                 break;
    1630             :         }
    1631          18 :         s += l;
    1632          18 :         len -= l;
    1633             :     }
    1634             : 
    1635          24 :     return s - start;
    1636             : }
    1637             : 
    1638             : static int
    1639         900 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1640             : {
    1641             :     int         l;
    1642             : 
    1643         900 :     if (!IS_HIGHBIT_SET(*s))
    1644           0 :         l = 1;                  /* ASCII */
    1645         900 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1646             :     {
    1647             :         /* Should be 4-byte, validate remaining bytes */
    1648         306 :         if (*s >= 0x81 && *s <= 0xfe &&
    1649         306 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1650         306 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1651         162 :             l = 4;
    1652             :         else
    1653         144 :             l = -1;
    1654             :     }
    1655         594 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1656             :     {
    1657             :         /* Should be 2-byte, validate */
    1658         564 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1659         324 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1660         276 :             l = 2;
    1661             :         else
    1662         288 :             l = -1;
    1663             :     }
    1664             :     else
    1665          30 :         l = -1;
    1666         900 :     return l;
    1667             : }
    1668             : 
    1669             : static int
    1670         648 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1671             : {
    1672         648 :     const unsigned char *start = s;
    1673             : 
    1674        2450 :     while (len > 0)
    1675             :     {
    1676             :         int         l;
    1677             : 
    1678             :         /* fast path for ASCII-subset characters */
    1679        2180 :         if (!IS_HIGHBIT_SET(*s))
    1680             :         {
    1681        1560 :             if (*s == '\0')
    1682          48 :                 break;
    1683        1512 :             l = 1;
    1684             :         }
    1685             :         else
    1686             :         {
    1687         620 :             l = pg_gb18030_verifychar(s, len);
    1688         620 :             if (l == -1)
    1689         330 :                 break;
    1690             :         }
    1691        1802 :         s += l;
    1692        1802 :         len -= l;
    1693             :     }
    1694             : 
    1695         648 :     return s - start;
    1696             : }
    1697             : 
    1698             : static int
    1699       17452 : pg_utf8_verifychar(const unsigned char *s, int len)
    1700             : {
    1701             :     int         l;
    1702             : 
    1703       17452 :     if ((*s & 0x80) == 0)
    1704             :     {
    1705           0 :         if (*s == '\0')
    1706           0 :             return -1;
    1707           0 :         return 1;
    1708             :     }
    1709       17452 :     else if ((*s & 0xe0) == 0xc0)
    1710        6060 :         l = 2;
    1711       11392 :     else if ((*s & 0xf0) == 0xe0)
    1712        6272 :         l = 3;
    1713        5120 :     else if ((*s & 0xf8) == 0xf0)
    1714        4856 :         l = 4;
    1715             :     else
    1716         264 :         l = 1;
    1717             : 
    1718       17452 :     if (l > len)
    1719         578 :         return -1;
    1720             : 
    1721       16874 :     if (!pg_utf8_islegal(s, l))
    1722        2356 :         return -1;
    1723             : 
    1724       14518 :     return l;
    1725             : }
    1726             : 
    1727             : /*
    1728             :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1729             :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1730             :  * input byte and current state are used to compute an index into an array of
    1731             :  * state transitions. Since the address of the next transition is dependent
    1732             :  * on this computation, there is latency in executing the load instruction,
    1733             :  * and the CPU is not kept busy.
    1734             :  *
    1735             :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1736             :  *
    1737             :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1738             :  *
    1739             :  * In a shift-based DFA, the input byte is an index into an array of integers
    1740             :  * whose bit pattern encodes the state transitions. To compute the next
    1741             :  * state, we simply right-shift the integer by the current state and apply a
    1742             :  * mask. In this scheme, the address of the transition only depends on the
    1743             :  * input byte, so there is better pipelining.
    1744             :  *
    1745             :  * The naming convention for states and transitions was adopted from a UTF-8
    1746             :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1747             :  *
    1748             :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1749             :  *
    1750             :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1751             :  * ==========================================================================
    1752             :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1753             :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1754             :  *                                                                  |
    1755             :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1756             :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1757             :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1758             :  *                                                                  |
    1759             :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1760             :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1761             :  *                                                                  |
    1762             :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1763             :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1764             :  *
    1765             :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1766             :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1767             :  * it's possible to find state numbers such that the transitions fit within
    1768             :  * 32-bit integers, as Dougall Johnson demonstrated:
    1769             :  *
    1770             :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1771             :  *
    1772             :  * This packed representation is the reason for the seemingly odd choice of
    1773             :  * state values below.
    1774             :  */
    1775             : 
    1776             : /* Error */
    1777             : #define ERR  0
    1778             : /* Begin */
    1779             : #define BGN 11
    1780             : /* Continuation states, expect 1/2/3 continuation bytes */
    1781             : #define CS1 16
    1782             : #define CS2  1
    1783             : #define CS3  5
    1784             : /* Partial states, where the first continuation byte has a restricted range */
    1785             : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
    1786             : #define P3B 20                  /* Lead was ED, check for surrogate */
    1787             : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
    1788             : #define P4B 30                  /* Lead was F4, check for too-large */
    1789             : /* Begin and End are the same state */
    1790             : #define END BGN
    1791             : 
    1792             : /* the encoded state transitions for the lookup table */
    1793             : 
    1794             : /* ASCII */
    1795             : #define ASC (END << BGN)
    1796             : /* 2-byte lead */
    1797             : #define L2A (CS1 << BGN)
    1798             : /* 3-byte lead */
    1799             : #define L3A (P3A << BGN)
    1800             : #define L3B (CS2 << BGN)
    1801             : #define L3C (P3B << BGN)
    1802             : /* 4-byte lead */
    1803             : #define L4A (P4A << BGN)
    1804             : #define L4B (CS3 << BGN)
    1805             : #define L4C (P4B << BGN)
    1806             : /* continuation byte */
    1807             : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1808             : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1809             : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1810             : /* invalid byte */
    1811             : #define ILL ERR
    1812             : 
    1813             : static const uint32 Utf8Transition[256] =
    1814             : {
    1815             :     /* ASCII */
    1816             : 
    1817             :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1818             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1819             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1820             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1821             : 
    1822             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1823             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1824             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1825             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1826             : 
    1827             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1828             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1829             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1830             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1831             : 
    1832             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1833             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1834             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1835             :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1836             : 
    1837             :     /* continuation bytes */
    1838             : 
    1839             :     /* 80..8F */
    1840             :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1841             :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1842             : 
    1843             :     /* 90..9F */
    1844             :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1845             :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1846             : 
    1847             :     /* A0..BF */
    1848             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1849             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1850             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1851             :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1852             : 
    1853             :     /* leading bytes */
    1854             : 
    1855             :     /* C0..DF */
    1856             :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1857             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1858             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1859             :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1860             : 
    1861             :     /* E0..EF */
    1862             :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1863             :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1864             : 
    1865             :     /* F0..FF */
    1866             :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1867             :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1868             : };
    1869             : 
    1870             : static void
    1871        1698 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1872             : {
    1873             :     /* Note: We deliberately don't check the state's value here. */
    1874       56034 :     while (len > 0)
    1875             :     {
    1876             :         /*
    1877             :          * It's important that the mask value is 31: In most instruction sets,
    1878             :          * the shift count for a 32-bit operand is taken modulo 32, so the
    1879             :          * compiler should elide the mask operation.
    1880             :          */
    1881       54336 :         *state = Utf8Transition[*s++] >> (*state & 31);
    1882       54336 :         len--;
    1883             :     }
    1884             : 
    1885        1698 :     *state &= 31;
    1886        1698 : }
    1887             : 
    1888             : static int
    1889     1132386 : pg_utf8_verifystr(const unsigned char *s, int len)
    1890             : {
    1891     1132386 :     const unsigned char *start = s;
    1892     1132386 :     const int   orig_len = len;
    1893     1132386 :     uint32      state = BGN;
    1894             : 
    1895             : /*
    1896             :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1897             :  * the compiler can unroll a longer loop, it's not worth it because we
    1898             :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1899             :  */
    1900             : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1901             : 
    1902     1132386 :     if (len >= STRIDE_LENGTH)
    1903             :     {
    1904     3805294 :         while (len >= STRIDE_LENGTH)
    1905             :         {
    1906             :             /*
    1907             :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1908             :              * but we must first check for a non-END state, which means the
    1909             :              * previous chunk ended in the middle of a multibyte sequence.
    1910             :              */
    1911     3259314 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1912        1698 :                 utf8_advance(s, &state, STRIDE_LENGTH);
    1913             : 
    1914     3259314 :             s += STRIDE_LENGTH;
    1915     3259314 :             len -= STRIDE_LENGTH;
    1916             :         }
    1917             : 
    1918             :         /* The error state persists, so we only need to check for it here. */
    1919      545980 :         if (state == ERR)
    1920             :         {
    1921             :             /*
    1922             :              * Start over from the beginning with the slow path so we can
    1923             :              * count the valid bytes.
    1924             :              */
    1925         504 :             len = orig_len;
    1926         504 :             s = start;
    1927             :         }
    1928      545476 :         else if (state != END)
    1929             :         {
    1930             :             /*
    1931             :              * The fast path exited in the middle of a multibyte sequence.
    1932             :              * Walk backwards to find the leading byte so that the slow path
    1933             :              * can resume checking from there. We must always backtrack at
    1934             :              * least one byte, since the current byte could be e.g. an ASCII
    1935             :              * byte after a 2-byte lead, which is invalid.
    1936             :              */
    1937             :             do
    1938             :             {
    1939             :                 Assert(s > start);
    1940         102 :                 s--;
    1941         102 :                 len++;
    1942             :                 Assert(IS_HIGHBIT_SET(*s));
    1943         102 :             } while (pg_utf_mblen(s) <= 1);
    1944             :         }
    1945             :     }
    1946             : 
    1947             :     /* check remaining bytes */
    1948    16845812 :     while (len > 0)
    1949             :     {
    1950             :         int         l;
    1951             : 
    1952             :         /* fast path for ASCII-subset characters */
    1953    15716500 :         if (!IS_HIGHBIT_SET(*s))
    1954             :         {
    1955    15699120 :             if (*s == '\0')
    1956         204 :                 break;
    1957    15698916 :             l = 1;
    1958             :         }
    1959             :         else
    1960             :         {
    1961       17380 :             l = pg_utf8_verifychar(s, len);
    1962       17380 :             if (l == -1)
    1963        2870 :                 break;
    1964             :         }
    1965    15713426 :         s += l;
    1966    15713426 :         len -= l;
    1967             :     }
    1968             : 
    1969     1132386 :     return s - start;
    1970             : }
    1971             : 
    1972             : /*
    1973             :  * Check for validity of a single UTF-8 encoded character
    1974             :  *
    1975             :  * This directly implements the rules in RFC3629.  The bizarre-looking
    1976             :  * restrictions on the second byte are meant to ensure that there isn't
    1977             :  * more than one encoding of a given Unicode code point; that is,
    1978             :  * you may not use a longer-than-necessary byte sequence with high order
    1979             :  * zero bits to represent a character that would fit in fewer bytes.
    1980             :  * To do otherwise is to create security hazards (eg, create an apparent
    1981             :  * non-ASCII character that decodes to plain ASCII).
    1982             :  *
    1983             :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    1984             :  * caller must have checked that that many bytes are present in the buffer.
    1985             :  */
    1986             : bool
    1987       23418 : pg_utf8_islegal(const unsigned char *source, int length)
    1988             : {
    1989             :     unsigned char a;
    1990             : 
    1991       23418 :     switch (length)
    1992             :     {
    1993           0 :         default:
    1994             :             /* reject lengths 5 and 6 for now */
    1995           0 :             return false;
    1996        4596 :         case 4:
    1997        4596 :             a = source[3];
    1998        4596 :             if (a < 0x80 || a > 0xBF)
    1999         364 :                 return false;
    2000             :             /* FALL THRU */
    2001             :         case 3:
    2002       11978 :             a = source[2];
    2003       11978 :             if (a < 0x80 || a > 0xBF)
    2004         680 :                 return false;
    2005             :             /* FALL THRU */
    2006             :         case 2:
    2007       17816 :             a = source[1];
    2008       17816 :             switch (*source)
    2009             :             {
    2010         312 :                 case 0xE0:
    2011         312 :                     if (a < 0xA0 || a > 0xBF)
    2012         264 :                         return false;
    2013          48 :                     break;
    2014         312 :                 case 0xED:
    2015         312 :                     if (a < 0x80 || a > 0x9F)
    2016         264 :                         return false;
    2017          48 :                     break;
    2018        4052 :                 case 0xF0:
    2019        4052 :                     if (a < 0x90 || a > 0xBF)
    2020         264 :                         return false;
    2021        3788 :                     break;
    2022         180 :                 case 0xF4:
    2023         180 :                     if (a < 0x80 || a > 0x8F)
    2024         132 :                         return false;
    2025          48 :                     break;
    2026       12960 :                 default:
    2027       12960 :                     if (a < 0x80 || a > 0xBF)
    2028         292 :                         return false;
    2029       12668 :                     break;
    2030             :             }
    2031             :             /* FALL THRU */
    2032       21158 :         case 1:
    2033       21158 :             a = *source;
    2034       21158 :             if (a >= 0x80 && a < 0xC2)
    2035         396 :                 return false;
    2036       20762 :             if (a > 0xF4)
    2037         132 :                 return false;
    2038       20630 :             break;
    2039             :     }
    2040       20630 :     return true;
    2041             : }
    2042             : 
    2043             : 
    2044             : /*
    2045             :  * Fills the provided buffer with two bytes such that:
    2046             :  *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
    2047             :  */
    2048             : void
    2049         364 : pg_encoding_set_invalid(int encoding, char *dst)
    2050             : {
    2051             :     Assert(pg_encoding_max_length(encoding) > 1);
    2052             : 
    2053         364 :     dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
    2054         364 :     dst[1] = NONUTF8_INVALID_BYTE1;
    2055         364 : }
    2056             : 
    2057             : /*
    2058             :  *-------------------------------------------------------------------
    2059             :  * encoding info table
                     :  *
                     :  * Indexed by encoding ID through designated initializers.  Going by the
                     :  * routine names, each entry lists in order: two wchar conversion routines
                     :  * (0 where the entry supplies none), then the mblen, dsplen, mbverifychar
                     :  * and mbverifystr routines, and finally maxmblen, the maximum length of a
                     :  * character in that encoding.
    2060             :  *-------------------------------------------------------------------
    2061             :  */
    2062             : const pg_wchar_tbl pg_wchar_table[] = {
    2063             :     [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
    2064             :     [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2065             :     [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
    2066             :     [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
    2067             :     [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
    2068             :     [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
    2069             :     [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
    2070             :     [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
    2071             :     [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2072             :     [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2073             :     [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2074             :     [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2075             :     [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2076             :     [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2077             :     [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2078             :     [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2079             :     [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2080             :     [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2081             :     [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2082             :     [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2083             :     [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2084             :     [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2085             :     [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2086             :     [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2087             :     [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2088             :     [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2089             :     [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2090             :     [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2091             :     [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2092             :     [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2093             :     [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2094             :     [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2095             :     [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2096             :     [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2097             :     [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
    2098             :     [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2099             :     [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
    2100             :     [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
    2101             :     [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
    2102             :     [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
    2103             :     [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
    2104             :     [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
    2105             : };
    2106             : 
    2107             : /*
    2108             :  * Returns the byte length of a multibyte character.
    2109             :  *
                     :  * The length is computed by the mblen routine registered for 'encoding'
                     :  * in pg_wchar_table.  An encoding ID that fails PG_VALID_ENCODING() is
                     :  * handled with the PG_SQL_ASCII routine instead of being rejected.
                     :  *
    2110             :  * Caution: when dealing with text that is not certainly valid in the
    2111             :  * specified encoding, the result may exceed the actual remaining
    2112             :  * string length.  Callers that are not prepared to deal with that
    2113             :  * should use pg_encoding_mblen_bounded() instead.
    2114             :  */
    2115             : int
    2116    53909648 : pg_encoding_mblen(int encoding, const char *mbstr)
    2117             : {
    2118    53909648 :     return (PG_VALID_ENCODING(encoding) ?
    2119   107819296 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
    2120           0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));  /* invalid ID: use the SQL_ASCII routine */
    2121             : }
    2122             : 
    2123             : /*
    2124             :  * Returns the byte length of a multibyte character; but not more than
    2125             :  * the distance to end of string.
                     :  *
                     :  * The end of string is the first NUL byte: strnlen() looks for it within
                     :  * the first pg_encoding_mblen() bytes of mbstr, so the result is the
                     :  * smaller of the two values.
    2126             :  */
    2127             : int
    2128           0 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    2129             : {
    2130           0 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
    2131             : }
    2132             : 
    2133             : /*
    2134             :  * Returns the display length of a multibyte character.
                     :  *
                     :  * Dispatches to the dsplen routine registered for 'encoding' in
                     :  * pg_wchar_table; an encoding ID that fails PG_VALID_ENCODING() is
                     :  * handled with the PG_SQL_ASCII routine.
    2135             :  */
    2136             : int
    2137    53731356 : pg_encoding_dsplen(int encoding, const char *mbstr)
    2138             : {
    2139    53731356 :     return (PG_VALID_ENCODING(encoding) ?
    2140   107462712 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
    2141           0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));  /* invalid ID: use the SQL_ASCII routine */
    2142             : }
    2143             : 
    2144             : /*
    2145             :  * Verify the first multibyte character of the given string.
    2146             :  * Return its byte length if good, -1 if bad.  (See comments above for
    2147             :  * full details of the mbverifychar API.)
                     :  *
                     :  * 'len' is passed through unchanged to the mbverifychar routine
                     :  * registered for 'encoding' in pg_wchar_table; an encoding ID that fails
                     :  * PG_VALID_ENCODING() is handled with the PG_SQL_ASCII routine.
    2148             :  */
    2149             : int
    2150        9670 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    2151             : {
    2152        9670 :     return (PG_VALID_ENCODING(encoding) ?
    2153       19340 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
    2154           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));  /* invalid ID: use the SQL_ASCII routine */
    2155             : }
    2156             : 
    2157             : /*
    2158             :  * Verify that a string is valid for the given encoding.
    2159             :  * Returns the number of input bytes (<= len) that form a valid string.
    2160             :  * (See comments above for full details of the mbverifystr API.)
                     :  *
                     :  * 'len' is passed through unchanged to the mbverifystr routine
                     :  * registered for 'encoding' in pg_wchar_table; an encoding ID that fails
                     :  * PG_VALID_ENCODING() is handled with the PG_SQL_ASCII routine.
    2161             :  */
    2162             : int
    2163      460010 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2164             : {
    2165      460010 :     return (PG_VALID_ENCODING(encoding) ?
    2166      920020 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
    2167           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));  /* invalid ID: use the SQL_ASCII routine */
    2168             : }
    2169             : 
    2170             : /*
    2171             :  * fetch maximum length of a given encoding
                     :  *
                     :  * Returns the maxmblen field of the encoding's pg_wchar_table entry.
                     :  * Callers are expected to pass a valid encoding ID (asserted); in builds
                     :  * where the assertion is compiled out, an invalid ID yields the
                     :  * PG_SQL_ASCII entry's value.
    2172             :  */
    2173             : int
    2174      878844 : pg_encoding_max_length(int encoding)
    2175             : {
    2176             :     Assert(PG_VALID_ENCODING(encoding));
    2177             : 
    2178             :     /*
    2179             :      * Check for the encoding despite the assert, due to some mingw versions
    2180             :      * otherwise issuing bogus warnings.
    2181             :      */
    2182      878844 :     return PG_VALID_ENCODING(encoding) ?
    2183     1757688 :         pg_wchar_table[encoding].maxmblen :
    2184             :         pg_wchar_table[PG_SQL_ASCII].maxmblen;
    2185             : }

Generated by: LCOV version 1.14