LCOV - code coverage report
Current view: top level - src/backend/utils/mb - conv.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 84.3 % 325 274
Test Date: 2026-03-03 06:14:53 Functions: 100.0 % 11 11
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  *    Utility functions for conversion procs.
       4              :  *
       5              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       6              :  * Portions Copyright (c) 1994, Regents of the University of California
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    src/backend/utils/mb/conv.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : #include "postgres.h"
      14              : #include "mb/pg_wchar.h"
      15              : 
      16              : 
      17              : /*
      18              :  * local2local: a generic single byte charset encoding
      19              :  * conversion between two ASCII-superset encodings.
      20              :  *
      21              :  * l points to the source string of length len
      22              :  * p is the output area (needs up to len + 1 bytes, including the final NUL)
      23              :  * src_encoding is the PG identifier for the source encoding
      24              :  * dest_encoding is the PG identifier for the target encoding
      25              :  * tab holds conversion entries for source bytes 128 (0x80) and up, indexed
      26              :  * by (byte - 0x80).  Each entry in the table holds the corresponding
      27              :  * code point for the target charset, or 0 if there is no equivalent code.
      28              :  *
      29              :  * Returns the number of input bytes consumed.  If noError is true, this can
      30              :  * be less than 'len'.
      31              :  */
      32              : int
      33          114 : local2local(const unsigned char *l,
      34              :             unsigned char *p,
      35              :             int len,
      36              :             int src_encoding,
      37              :             int dest_encoding,
      38              :             const unsigned char *tab,
      39              :             bool noError)
      40              : {
      41          114 :     const unsigned char *start = l;
      42              :     unsigned char c1,
      43              :                 c2;
      44              : 
      45          366 :     while (len > 0)
      46              :     {
      47          306 :         c1 = *l;
      48          306 :         if (c1 == 0)
      49              :         {
      50           54 :             if (noError)
      51           27 :                 break;
      52           27 :             report_invalid_encoding(src_encoding, (const char *) l, len);
      53              :         }
      54          252 :         if (!IS_HIGHBIT_SET(c1))
      55          153 :             *p++ = c1;
      56              :         else
      57              :         {
      58           99 :             c2 = tab[c1 - HIGHBIT];
      59           99 :             if (c2)
      60           99 :                 *p++ = c2;
      61              :             else
      62              :             {
      63            0 :                 if (noError)
      64            0 :                     break;
      65            0 :                 report_untranslatable_char(src_encoding, dest_encoding,
      66              :                                            (const char *) l, len);
      67              :             }
      68              :         }
      69          252 :         l++;
      70          252 :         len--;
      71              :     }
      72           87 :     *p = '\0';
      73              : 
      74           87 :     return l - start;
      75              : }
      76              : 
      77              : /*
      78              :  * LATINn ---> MIC when the charset's local codes map directly to MIC
      79              :  *
      80              :  * l points to the source string of length len
      81              :  * p is the output area (needs up to 2 * len + 1 bytes, including the final NUL)
      82              :  * lc is the mule character set id for the local encoding
      83              :  * encoding is the PG identifier for the local encoding
      84              :  *
      85              :  * Returns the number of input bytes consumed.  If noError is true, this can
      86              :  * be less than 'len'.
      87              :  */
      88              : int
      89           15 : latin2mic(const unsigned char *l, unsigned char *p, int len,
      90              :           int lc, int encoding, bool noError)
      91              : {
      92           15 :     const unsigned char *start = l;
      93              :     int         c1;
      94              : 
      95           60 :     while (len > 0)
      96              :     {
      97           45 :         c1 = *l;
      98           45 :         if (c1 == 0)
      99              :         {
     100            0 :             if (noError)
     101            0 :                 break;
     102            0 :             report_invalid_encoding(encoding, (const char *) l, len);
     103              :         }
     104           45 :         if (IS_HIGHBIT_SET(c1))
     105            0 :             *p++ = lc;
     106           45 :         *p++ = c1;
     107           45 :         l++;
     108           45 :         len--;
     109              :     }
     110           15 :     *p = '\0';
     111              : 
     112           15 :     return l - start;
     113              : }
     114              : 
     115              : /*
     116              :  * MIC ---> LATINn when the charset's local codes map directly to MIC
     117              :  *
     118              :  * mic points to the source string of length len
     119              :  * p is the output area (needs up to len + 1 bytes, including the final NUL)
     120              :  * lc is the mule character set id for the local encoding
     121              :  * encoding is the PG identifier for the local encoding
     122              :  *
     123              :  * Returns the number of input bytes consumed.  If noError is true, this can
     124              :  * be less than 'len'.
     125              :  */
     126              : int
     127          177 : mic2latin(const unsigned char *mic, unsigned char *p, int len,
     128              :           int lc, int encoding, bool noError)
     129              : {
     130          177 :     const unsigned char *start = mic;
     131              :     int         c1;
     132              : 
     133          420 :     while (len > 0)
     134              :     {
     135          387 :         c1 = *mic;
     136          387 :         if (c1 == 0)
     137              :         {
     138            0 :             if (noError)
     139            0 :                 break;
     140            0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
     141              :         }
     142          387 :         if (!IS_HIGHBIT_SET(c1))
     143              :         {
     144              :             /* easy for ASCII */
     145          180 :             *p++ = c1;
     146          180 :             mic++;
     147          180 :             len--;
     148              :         }
     149              :         else
     150              :         {
     151          207 :             int         l = pg_mule_mblen(mic);
     152              : 
     153          207 :             if (len < l)
     154              :             {
     155           54 :                 if (noError)
     156           27 :                     break;
     157           27 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
     158              :                                         len);
     159              :             }
     160          153 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
     161              :             {
     162           90 :                 if (noError)
     163           45 :                     break;
     164           45 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
     165              :                                            (const char *) mic, len);
     166              :             }
     167           63 :             *p++ = mic[1];
     168           63 :             mic += 2;
     169           63 :             len -= 2;
     170              :         }
     171              :     }
     172          105 :     *p = '\0';
     173              : 
     174          105 :     return mic - start;
     175              : }
     176              : 
     177              : 
     178              : /*
     179              :  * latin2mic_with_table: a generic single byte charset encoding
     180              :  * conversion from a local charset to the mule internal code.
     181              :  *
     182              :  * l points to the source string of length len
     183              :  * p is the output area (needs up to 2 * len + 1 bytes, including the final NUL)
     184              :  * lc is the mule character set id for the local encoding
     185              :  * encoding is the PG identifier for the local encoding
     186              :  * tab holds conversion entries for local-charset bytes 128 (0x80) and up,
     187              :  * indexed by (byte - 0x80).  Each entry in the table holds the corresponding
     188              :  * second byte of the mule internal code, or 0 if there is no equivalent code.
     189              :  *
     190              :  * Returns the number of input bytes consumed.  If noError is true, this can
     191              :  * be less than 'len'.
     192              :  */
     193              : int
     194           84 : latin2mic_with_table(const unsigned char *l,
     195              :                      unsigned char *p,
     196              :                      int len,
     197              :                      int lc,
     198              :                      int encoding,
     199              :                      const unsigned char *tab,
     200              :                      bool noError)
     201              : {
     202           84 :     const unsigned char *start = l;
     203              :     unsigned char c1,
     204              :                 c2;
     205              : 
     206          246 :     while (len > 0)
     207              :     {
     208          216 :         c1 = *l;
     209          216 :         if (c1 == 0)
     210              :         {
     211           54 :             if (noError)
     212           27 :                 break;
     213           27 :             report_invalid_encoding(encoding, (const char *) l, len);
     214              :         }
     215          162 :         if (!IS_HIGHBIT_SET(c1))
     216           63 :             *p++ = c1;
     217              :         else
     218              :         {
     219           99 :             c2 = tab[c1 - HIGHBIT];
     220           99 :             if (c2)
     221              :             {
     222           99 :                 *p++ = lc;
     223           99 :                 *p++ = c2;
     224              :             }
     225              :             else
     226              :             {
     227            0 :                 if (noError)
     228            0 :                     break;
     229            0 :                 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
     230              :                                            (const char *) l, len);
     231              :             }
     232              :         }
     233          162 :         l++;
     234          162 :         len--;
     235              :     }
     236           57 :     *p = '\0';
     237              : 
     238           57 :     return l - start;
     239              : }
     240              : 
     241              : /*
     242              :  * mic2latin_with_table: a generic single byte charset encoding
     243              :  * conversion from the mule internal code to a local charset.
     244              :  *
     245              :  * mic points to the source string of length len
     246              :  * p is the output area (needs up to len + 1 bytes, including the final NUL)
     247              :  * lc is the mule character set id for the local encoding
     248              :  * encoding is the PG identifier for the local encoding
     249              :  * tab holds conversion entries for the mule internal code's second byte,
     250              :  * indexed by (byte - 0x80).  Each entry in the table holds the corresponding
     251              :  * code point for the local charset, or 0 if there is no equivalent code.
     252              :  *
     253              :  * Returns the number of input bytes consumed.  If noError is true, this can
     254              :  * be less than 'len'.
     255              :  */
     256              : int
     257          174 : mic2latin_with_table(const unsigned char *mic,
     258              :                      unsigned char *p,
     259              :                      int len,
     260              :                      int lc,
     261              :                      int encoding,
     262              :                      const unsigned char *tab,
     263              :                      bool noError)
     264              : {
     265          174 :     const unsigned char *start = mic;
     266              :     unsigned char c1,
     267              :                 c2;
     268              : 
     269          408 :     while (len > 0)
     270              :     {
     271          378 :         c1 = *mic;
     272          378 :         if (c1 == 0)
     273              :         {
     274            0 :             if (noError)
     275            0 :                 break;
     276            0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
     277              :         }
     278          378 :         if (!IS_HIGHBIT_SET(c1))
     279              :         {
     280              :             /* easy for ASCII */
     281          171 :             *p++ = c1;
     282          171 :             mic++;
     283          171 :             len--;
     284              :         }
     285              :         else
     286              :         {
     287          207 :             int         l = pg_mule_mblen(mic);
     288              : 
     289          207 :             if (len < l)
     290              :             {
     291           54 :                 if (noError)
     292           27 :                     break;
     293           27 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
     294              :                                         len);
     295              :             }
     296          153 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
     297           63 :                 (c2 = tab[mic[1] - HIGHBIT]) == 0)
     298              :             {
     299           90 :                 if (noError)
     300           45 :                     break;
     301           45 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
     302              :                                            (const char *) mic, len);
     303              :                 break;          /* keep compiler quiet */
     304              :             }
     305           63 :             *p++ = c2;
     306           63 :             mic += 2;
     307           63 :             len -= 2;
     308              :         }
     309              :     }
     310          102 :     *p = '\0';
     311              : 
     312          102 :     return mic - start;
     313              : }
     314              : 
     315              : /*
     316              :  * comparison routine for bsearch() on the combined UTF8 -> local code map;
     317              :  * the key is a pair of uint32s compared against an entry's utf1 and utf2
     318              :  */
     319              : static int
     320          234 : compare3(const void *p1, const void *p2)
     321              : {
     322              :     uint32      s1,
     323              :                 s2,
     324              :                 d1,
     325              :                 d2;
     326              : 
     327          234 :     s1 = *(const uint32 *) p1;
     328          234 :     s2 = *((const uint32 *) p1 + 1);
     329          234 :     d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
     330          234 :     d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
     331          234 :     return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
     332              : }
     333              : 
     334              : /*
     335              :  * comparison routine for bsearch()
     336              :  * this routine is intended for local code -> combined UTF8
     337              :  */
     338              : static int
     339           81 : compare4(const void *p1, const void *p2)
     340              : {
     341              :     uint32      v1,
     342              :                 v2;
     343              : 
     344           81 :     v1 = *(const uint32 *) p1;
     345           81 :     v2 = ((const pg_local_to_utf_combined *) p2)->code;
     346           81 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
     347              : }
     348              : 
     349              : /*
     350              :  * store 32-bit character code into multibyte stream, most significant byte first, skipping zero bytes
     351              :  */
     352              : static inline unsigned char *
     353          615 : store_coded_char(unsigned char *dest, uint32 code)
     354              : {
     355          615 :     if (code & 0xff000000)
     356           63 :         *dest++ = code >> 24;
     357          615 :     if (code & 0x00ff0000)
     358          303 :         *dest++ = code >> 16;
     359          615 :     if (code & 0x0000ff00)
     360          552 :         *dest++ = code >> 8;
     361          615 :     if (code & 0x000000ff)
     362          615 :         *dest++ = code;
     363          615 :     return dest;
     364              : }
     365              : 
     366              : /*
     367              :  * Convert a character using a conversion radix tree.
     368              :  *
     369              :  * 'l' is the length of the input character in bytes.  Its bytes are passed
     370              :  * right-aligned in b1-b4 (e.g. a 2-byte character uses only b3 and b4).
     371              :  */
     372              : static inline uint32
     373         1038 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
     374              :                  int l,
     375              :                  unsigned char b1,
     376              :                  unsigned char b2,
     377              :                  unsigned char b3,
     378              :                  unsigned char b4)
     379              : {
     380         1038 :     if (l == 4)
     381              :     {
     382              :         /* 4-byte code */
     383              : 
     384              :         /* check code validity */
     385           45 :         if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
     386           45 :             b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
     387           45 :             b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
     388           45 :             b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
     389            0 :             return 0;
     390              : 
     391              :         /* perform lookup */
     392           45 :         if (rt->chars32)
     393              :         {
     394           45 :             uint32      idx = rt->b4root;
     395              : 
     396           45 :             idx = rt->chars32[b1 + idx - rt->b4_1_lower];
     397           45 :             idx = rt->chars32[b2 + idx - rt->b4_2_lower];
     398           45 :             idx = rt->chars32[b3 + idx - rt->b4_3_lower];
     399           45 :             return rt->chars32[b4 + idx - rt->b4_4_lower];
     400              :         }
     401              :         else
     402              :         {
     403            0 :             uint16      idx = rt->b4root;
     404              : 
     405            0 :             idx = rt->chars16[b1 + idx - rt->b4_1_lower];
     406            0 :             idx = rt->chars16[b2 + idx - rt->b4_2_lower];
     407            0 :             idx = rt->chars16[b3 + idx - rt->b4_3_lower];
     408            0 :             return rt->chars16[b4 + idx - rt->b4_4_lower];
     409              :         }
     410              :     }
     411          993 :     else if (l == 3)
     412              :     {
     413              :         /* 3-byte code */
     414              : 
     415              :         /* check code validity */
     416          474 :         if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
     417          150 :             b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
     418          150 :             b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
     419          324 :             return 0;
     420              : 
     421              :         /* perform lookup */
     422          150 :         if (rt->chars32)
     423              :         {
     424          150 :             uint32      idx = rt->b3root;
     425              : 
     426          150 :             idx = rt->chars32[b2 + idx - rt->b3_1_lower];
     427          150 :             idx = rt->chars32[b3 + idx - rt->b3_2_lower];
     428          150 :             return rt->chars32[b4 + idx - rt->b3_3_lower];
     429              :         }
     430              :         else
     431              :         {
     432            0 :             uint16      idx = rt->b3root;
     433              : 
     434            0 :             idx = rt->chars16[b2 + idx - rt->b3_1_lower];
     435            0 :             idx = rt->chars16[b3 + idx - rt->b3_2_lower];
     436            0 :             return rt->chars16[b4 + idx - rt->b3_3_lower];
     437              :         }
     438              :     }
     439          519 :     else if (l == 2)
     440              :     {
     441              :         /* 2-byte code */
     442              : 
     443              :         /* check code validity - both bytes */
     444          420 :         if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
     445          384 :             b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
     446           36 :             return 0;
     447              : 
     448              :         /* perform lookup */
     449          384 :         if (rt->chars32)
     450              :         {
     451          303 :             uint32      idx = rt->b2root;
     452              : 
     453          303 :             idx = rt->chars32[b3 + idx - rt->b2_1_lower];
     454          303 :             return rt->chars32[b4 + idx - rt->b2_2_lower];
     455              :         }
     456              :         else
     457              :         {
     458           81 :             uint16      idx = rt->b2root;
     459              : 
     460           81 :             idx = rt->chars16[b3 + idx - rt->b2_1_lower];
     461           81 :             return rt->chars16[b4 + idx - rt->b2_2_lower];
     462              :         }
     463              :     }
     464           99 :     else if (l == 1)
     465              :     {
     466              :         /* 1-byte code */
     467              : 
     468              :         /* check code validity */
     469           99 :         if (b4 < rt->b1_lower || b4 > rt->b1_upper)
     470            0 :             return 0;
     471              : 
     472              :         /* perform lookup */
     473           99 :         if (rt->chars32)
     474           99 :             return rt->chars32[b4 + rt->b1root - rt->b1_lower];
     475              :         else
     476            0 :             return rt->chars16[b4 + rt->b1root - rt->b1_lower];
     477              :     }
     478            0 :     return 0;                   /* shouldn't happen */
     479              : }
     480              : 
     481              : /*
     482              :  * UTF8 ---> local code
     483              :  *
     484              :  * utf: input string in UTF8 encoding (need not be null-terminated)
     485              :  * len: length of input string (in bytes)
     486              :  * iso: pointer to the output area (must be large enough!)
     487              :  *        (output string will be null-terminated)
     488              :  * map: conversion map for single characters
     489              :  * cmap: conversion map for combined characters
     490              :  *        (optional, pass NULL if none)
     491              :  * cmapsize: number of entries in the conversion map for combined characters
     492              :  *        (optional, pass 0 if none)
     493              :  * conv_func: algorithmic encoding conversion function
     494              :  *        (optional, pass NULL if none)
     495              :  * encoding: PG identifier for the local encoding
     496              :  *
     497              :  * For each character, the cmap (if provided) is consulted first; if no match,
     498              :  * the map is consulted next; if still no match, the conv_func (if provided)
     499              :  * is applied.  An error is raised if no match is found.
     500              :  *
     501              :  * See pg_wchar.h for more details about the data structures used here.
     502              :  *
     503              :  * Returns the number of input bytes consumed.  If noError is true, this can
     504              :  * be less than 'len'.
     505              :  */
     506              : int
     507         1143 : UtfToLocal(const unsigned char *utf, int len,
     508              :            unsigned char *iso,
     509              :            const pg_mb_radix_tree *map,
     510              :            const pg_utf_to_local_combined *cmap, int cmapsize,
     511              :            utf_local_conversion_func conv_func,
     512              :            int encoding, bool noError)
     513              : {
     514              :     uint32      iutf;
     515              :     int         l;
     516              :     const pg_utf_to_local_combined *cp;
     517         1143 :     const unsigned char *start = utf;
     518              : 
     519         1143 :     if (!PG_VALID_ENCODING(encoding))
     520            0 :         ereport(ERROR,
     521              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     522              :                  errmsg("invalid encoding number: %d", encoding)));
     523              : 
     524         3567 :     for (; len > 0; len -= l)
     525              :     {
     526         3234 :         unsigned char b1 = 0;
     527         3234 :         unsigned char b2 = 0;
     528         3234 :         unsigned char b3 = 0;
     529         3234 :         unsigned char b4 = 0;
     530              : 
     531              :         /* "break" cases all represent errors */
     532         3234 :         if (*utf == '\0')
     533           90 :             break;
     534              : 
     535         3144 :         l = pg_utf_mblen(utf);
     536         3144 :         if (len < l)
     537          108 :             break;
     538              : 
     539         3036 :         if (!pg_utf8_islegal(utf, l))
     540          180 :             break;
     541              : 
     542         2856 :         if (l == 1)
     543              :         {
     544              :             /* ASCII case is easy, assume it's one-to-one conversion */
     545         2148 :             *iso++ = *utf++;
     546         2148 :             continue;
     547              :         }
     548              : 
     549              :         /* collect coded char of length l */
     550          708 :         if (l == 2)
     551              :         {
     552          207 :             b3 = *utf++;
     553          207 :             b4 = *utf++;
     554              :         }
     555          501 :         else if (l == 3)
     556              :         {
     557          501 :             b2 = *utf++;
     558          501 :             b3 = *utf++;
     559          501 :             b4 = *utf++;
     560              :         }
     561            0 :         else if (l == 4)
     562              :         {
     563            0 :             b1 = *utf++;
     564            0 :             b2 = *utf++;
     565            0 :             b3 = *utf++;
     566            0 :             b4 = *utf++;
     567              :         }
     568              :         else
     569              :         {
     570            0 :             elog(ERROR, "unsupported character length %d", l);
     571              :             iutf = 0;           /* keep compiler quiet */
     572              :         }
     573          708 :         iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
     574              : 
     575              :         /* First, try with combined map if possible */
     576          708 :         if (cmap && len > l)
     577              :         {
     578           72 :             const unsigned char *utf_save = utf;
     579           72 :             int         len_save = len;
     580           72 :             int         l_save = l;
     581              : 
     582              :             /* collect next character, same as above */
     583           72 :             len -= l;
     584              : 
     585           72 :             l = pg_utf_mblen(utf);
     586           72 :             if (len < l)
     587              :             {
     588              :                 /* need more data to decide if this is a combined char */
     589           18 :                 utf -= l_save;
     590           18 :                 break;
     591              :             }
     592              : 
     593           54 :             if (!pg_utf8_islegal(utf, l))
     594              :             {
     595            0 :                 if (!noError)
     596            0 :                     report_invalid_encoding(PG_UTF8, (const char *) utf, len);
     597            0 :                 utf -= l_save;
     598            0 :                 break;
     599              :             }
     600              : 
     601              :             /* We assume ASCII character cannot be in combined map */
     602           54 :             if (l > 1)
     603              :             {
     604              :                 uint32      iutf2;
     605              :                 uint32      cutf[2];
     606              : 
     607           54 :                 if (l == 2)
     608              :                 {
     609           27 :                     iutf2 = *utf++ << 8;
     610           27 :                     iutf2 |= *utf++;
     611              :                 }
     612           27 :                 else if (l == 3)
     613              :                 {
     614           27 :                     iutf2 = *utf++ << 16;
     615           27 :                     iutf2 |= *utf++ << 8;
     616           27 :                     iutf2 |= *utf++;
     617              :                 }
     618            0 :                 else if (l == 4)
     619              :                 {
     620            0 :                     iutf2 = *utf++ << 24;
     621            0 :                     iutf2 |= *utf++ << 16;
     622            0 :                     iutf2 |= *utf++ << 8;
     623            0 :                     iutf2 |= *utf++;
     624              :                 }
     625              :                 else
     626              :                 {
     627            0 :                     elog(ERROR, "unsupported character length %d", l);
     628              :                     iutf2 = 0;  /* keep compiler quiet */
     629              :                 }
     630              : 
     631           54 :                 cutf[0] = iutf;
     632           54 :                 cutf[1] = iutf2;
     633              : 
     634           54 :                 cp = bsearch(cutf, cmap, cmapsize,
     635              :                              sizeof(pg_utf_to_local_combined), compare3);
     636              : 
     637           54 :                 if (cp)
     638              :                 {
     639            9 :                     iso = store_coded_char(iso, cp->code);
     640            9 :                     continue;
     641              :                 }
     642              :             }
     643              : 
     644              :             /* fail, so back up to reprocess second character next time */
     645           45 :             utf = utf_save;
     646           45 :             len = len_save;
     647           45 :             l = l_save;
     648              :         }
     649              : 
     650              :         /* Now check ordinary map */
     651          681 :         if (map)
     652              :         {
     653          681 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     654              : 
     655          681 :             if (converted)
     656              :             {
     657          231 :                 iso = store_coded_char(iso, converted);
     658          231 :                 continue;
     659              :             }
     660              :         }
     661              : 
     662              :         /* if there's a conversion function, try that */
     663          450 :         if (conv_func)
     664              :         {
     665           36 :             uint32      converted = (*conv_func) (iutf);
     666              : 
     667           36 :             if (converted)
     668              :             {
     669           36 :                 iso = store_coded_char(iso, converted);
     670           36 :                 continue;
     671              :             }
     672              :         }
     673              : 
     674              :         /* failed to translate this character */
     675          414 :         utf -= l;
     676          414 :         if (noError)
     677          207 :             break;
     678          207 :         report_untranslatable_char(PG_UTF8, encoding,
     679              :                                    (const char *) utf, len);
     680              :     }
     681              : 
     682              :     /* if we broke out of loop early, must be invalid input */
     683          936 :     if (len > 0 && !noError)
     684          198 :         report_invalid_encoding(PG_UTF8, (const char *) utf, len);
     685              : 
     686          738 :     *iso = '\0';
     687              : 
     688          738 :     return utf - start;
     689              : }
     690              : 
     691              : /*
     692              :  * local code ---> UTF8: convert a string from a local encoding to UTF-8
     693              :  *
     694              :  * iso: input string in local encoding (need not be null-terminated; an embedded NUL byte is treated as invalid input)
     695              :  * len: length of input string (in bytes)
     696              :  * utf: pointer to the output area (must be large enough!)
     697              :  *        (output string will be null-terminated)
     698              :  * map: conversion map for single characters
     699              :  * cmap: conversion map for combined characters
     700              :  *        (optional, pass NULL if none)
     701              :  * cmapsize: number of entries in the conversion map for combined characters
     702              :  *        (optional, pass 0 if none)
     703              :  * conv_func: algorithmic encoding conversion function
     704              :  *        (optional, pass NULL if none)
     705              :  * encoding: PG identifier for the local encoding
     706              :  * noError: if true, stop at a bad character instead of raising an error
     707              :  * ASCII bytes are copied through unchanged.  For each other character, the map is consulted first; if no match, the cmap
     708              :  * (if provided) is consulted next; if still no match, the conv_func
     709              :  * (if provided) is applied.  If nothing matches, an error is raised unless noError is true.
     710              :  *
     711              :  * See pg_wchar.h for more details about the data structures used here.
     712              :  *
     713              :  * Returns the number of input bytes consumed.  If noError is true, this can
     714              :  * be less than 'len'.
     715              :  */
     716              : int
     717          681 : LocalToUtf(const unsigned char *iso, int len,
     718              :            unsigned char *utf,
     719              :            const pg_mb_radix_tree *map,
     720              :            const pg_local_to_utf_combined *cmap, int cmapsize,
     721              :            utf_local_conversion_func conv_func,
     722              :            int encoding,
     723              :            bool noError)
     724              : {
     725              :     uint32      iiso;
     726              :     int         l;
     727              :     const pg_local_to_utf_combined *cp;
     728          681 :     const unsigned char *start = iso;   /* remembered to compute the consumed byte count */
     729              : 
     730          681 :     if (!PG_VALID_ENCODING(encoding))
     731            0 :         ereport(ERROR,
     732              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     733              :                  errmsg("invalid encoding number: %d", encoding)));
     734              : 
     735         3077 :     for (; len > 0; len -= l)   /* one character (l bytes) per iteration */
     736              :     {
     737         2810 :         unsigned char b1 = 0;
     738         2810 :         unsigned char b2 = 0;
     739         2810 :         unsigned char b3 = 0;
     740         2810 :         unsigned char b4 = 0;
     741              : 
     742              :         /* "break" cases all represent errors; a NUL byte in the input is one of them */
     743         2810 :         if (*iso == '\0')
     744          162 :             break;
     745              : 
     746         2648 :         if (!IS_HIGHBIT_SET(*iso))
     747              :         {
     748              :             /* ASCII case is easy, assume it's one-to-one conversion */
     749         2075 :             *utf++ = *iso++;
     750         2075 :             l = 1;
     751         2075 :             continue;
     752              :         }
     753              : 
     754          573 :         l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
     755          573 :         if (l < 0)              /* a negative length is treated as invalid input */
     756          216 :             break;
     757              : 
     758              :         /* collect coded char of length l, right-aligned so that its last byte is in b4 */
     759          357 :         if (l == 1)
     760           99 :             b4 = *iso++;
     761          258 :         else if (l == 2)
     762              :         {
     763          213 :             b3 = *iso++;
     764          213 :             b4 = *iso++;
     765              :         }
     766           45 :         else if (l == 3)
     767              :         {
     768            0 :             b2 = *iso++;
     769            0 :             b3 = *iso++;
     770            0 :             b4 = *iso++;
     771              :         }
     772           45 :         else if (l == 4)
     773              :         {
     774           45 :             b1 = *iso++;
     775           45 :             b2 = *iso++;
     776           45 :             b3 = *iso++;
     777           45 :             b4 = *iso++;
     778              :         }
     779              :         else
     780              :         {
     781            0 :             elog(ERROR, "unsupported character length %d", l);
     782              :             iiso = 0;           /* keep compiler quiet */
     783              :         }
     784          357 :         iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);    /* b1 is the most significant byte */
     785              : 
     786          357 :         if (map)
     787              :         {
     788          357 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     789              : 
     790          357 :             if (converted)      /* zero is treated as no match */
     791              :             {
     792          276 :                 utf = store_coded_char(utf, converted);
     793          276 :                 continue;
     794              :             }
     795              : 
     796              :             /* If there's a combined character map, try that (only reached when map is non-NULL and had no match) */
     797           81 :             if (cmap)
     798              :             {
     799           18 :                 cp = bsearch(&iiso, cmap, cmapsize,
     800              :                              sizeof(pg_local_to_utf_combined), compare4);
     801              : 
     802           18 :                 if (cp)         /* match: store both codes of the pair, utf1 then utf2 */
     803              :                 {
     804           18 :                     utf = store_coded_char(utf, cp->utf1);
     805           18 :                     utf = store_coded_char(utf, cp->utf2);
     806           18 :                     continue;
     807              :                 }
     808              :             }
     809              :         }
     810              : 
     811              :         /* if there's a conversion function, try that */
     812           63 :         if (conv_func)
     813              :         {
     814           45 :             uint32      converted = (*conv_func) (iiso);
     815              : 
     816           45 :             if (converted)      /* zero is treated as no match */
     817              :             {
     818           27 :                 utf = store_coded_char(utf, converted);
     819           27 :                 continue;
     820              :             }
     821              :         }
     822              : 
     823              :         /* failed to translate this character; back up iso to its first byte */
     824           36 :         iso -= l;
     825           36 :         if (noError)
     826           18 :             break;
     827           18 :         report_untranslatable_char(encoding, PG_UTF8,
     828              :                                    (const char *) iso, len);    /* NOTE(review): presumably does not return -- confirm */
     829              :     }
     830              : 
     831              :     /* if we broke out of loop early, must be invalid input (nothing is reported when noError is true) */
     832          663 :     if (len > 0 && !noError)
     833          186 :         report_invalid_encoding(encoding, (const char *) iso, len);
     834              : 
     835          477 :     *utf = '\0';        /* null-terminate the output */
     836              : 
     837          477 :     return iso - start; /* number of input bytes consumed */
     838              : }
        

Generated by: LCOV version 2.0-1