LCOV - code coverage report
Current view: top level - src/backend/utils/mb - conv.c (source / functions) Hit Total Coverage
Test: PostgreSQL 12beta2 Lines: 86 308 27.9 %
Date: 2019-06-19 14:06:47 Functions: 8 13 61.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  *    Utility functions for conversion procs.
       4             :  *
       5             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       6             :  * Portions Copyright (c) 1994, Regents of the University of California
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/backend/utils/mb/conv.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "postgres.h"
      14             : #include "mb/pg_wchar.h"
      15             : 
      16             : 
      17             : /*
      18             :  * local2local: a generic single byte charset encoding
      19             :  * conversion between two ASCII-superset encodings.
      20             :  *
      21             :  * l points to the source string of length len
      22             :  * p is the output area (must be large enough!)
      23             :  * src_encoding is the PG identifier for the source encoding
      24             :  * dest_encoding is the PG identifier for the target encoding
      25             :  * tab holds conversion entries for the source charset
      26             :  * starting from 128 (0x80). Each entry in the table holds the corresponding
      27             :  * code point for the target charset, or 0 if there is no equivalent code.
      28             :  */
      29             : void
      30          56 : local2local(const unsigned char *l,
      31             :             unsigned char *p,
      32             :             int len,
      33             :             int src_encoding,
      34             :             int dest_encoding,
      35             :             const unsigned char *tab)
      36             : {
      37             :     unsigned char c1,
      38             :                 c2;
      39             : 
      40         280 :     while (len > 0)
      41             :     {
      42         168 :         c1 = *l;
      43         168 :         if (c1 == 0)
      44           0 :             report_invalid_encoding(src_encoding, (const char *) l, len);
      45         168 :         if (!IS_HIGHBIT_SET(c1))
      46         168 :             *p++ = c1;
      47             :         else
      48             :         {
      49           0 :             c2 = tab[c1 - HIGHBIT];
      50           0 :             if (c2)
      51           0 :                 *p++ = c2;
      52             :             else
      53           0 :                 report_untranslatable_char(src_encoding, dest_encoding,
      54             :                                            (const char *) l, len);
      55             :         }
      56         168 :         l++;
      57         168 :         len--;
      58             :     }
      59          56 :     *p = '\0';
      60          56 : }
      61             : 
      62             : /*
      63             :  * LATINn ---> MIC when the charset's local codes map directly to MIC
      64             :  *
      65             :  * l points to the source string of length len
      66             :  * p is the output area (must be large enough!)
      67             :  * lc is the mule character set id for the local encoding
      68             :  * encoding is the PG identifier for the local encoding
      69             :  */
      70             : void
      71          20 : latin2mic(const unsigned char *l, unsigned char *p, int len,
      72             :           int lc, int encoding)
      73             : {
      74             :     int         c1;
      75             : 
      76         100 :     while (len > 0)
      77             :     {
      78          60 :         c1 = *l;
      79          60 :         if (c1 == 0)
      80           0 :             report_invalid_encoding(encoding, (const char *) l, len);
      81          60 :         if (IS_HIGHBIT_SET(c1))
      82           0 :             *p++ = lc;
      83          60 :         *p++ = c1;
      84          60 :         l++;
      85          60 :         len--;
      86             :     }
      87          20 :     *p = '\0';
      88          20 : }
      89             : 
      90             : /*
      91             :  * MIC ---> LATINn when the charset's local codes map directly to MIC
      92             :  *
      93             :  * mic points to the source string of length len
      94             :  * p is the output area (must be large enough!)
      95             :  * lc is the mule character set id for the local encoding
      96             :  * encoding is the PG identifier for the local encoding
      97             :  */
      98             : void
      99          20 : mic2latin(const unsigned char *mic, unsigned char *p, int len,
     100             :           int lc, int encoding)
     101             : {
     102             :     int         c1;
     103             : 
     104         100 :     while (len > 0)
     105             :     {
     106          60 :         c1 = *mic;
     107          60 :         if (c1 == 0)
     108           0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
     109          60 :         if (!IS_HIGHBIT_SET(c1))
     110             :         {
     111             :             /* easy for ASCII */
     112          60 :             *p++ = c1;
     113          60 :             mic++;
     114          60 :             len--;
     115             :         }
     116             :         else
     117             :         {
     118           0 :             int         l = pg_mic_mblen(mic);
     119             : 
     120           0 :             if (len < l)
     121           0 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
     122             :                                         len);
     123           0 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
     124           0 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
     125             :                                            (const char *) mic, len);
     126           0 :             *p++ = mic[1];
     127           0 :             mic += 2;
     128           0 :             len -= 2;
     129             :         }
     130             :     }
     131          20 :     *p = '\0';
     132          20 : }
     133             : 
     134             : 
     135             : /*
     136             :  * ASCII ---> MIC
     137             :  *
     138             :  * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
     139             :  * characters, here we must take a hard line because we don't know
     140             :  * the appropriate MIC equivalent.
     141             :  */
     142             : void
     143           4 : pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
     144             : {
     145             :     int         c1;
     146             : 
     147           8 :     while (len > 0)
     148             :     {
     149           0 :         c1 = *l;
     150           0 :         if (c1 == 0 || IS_HIGHBIT_SET(c1))
     151           0 :             report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
     152           0 :         *p++ = c1;
     153           0 :         l++;
     154           0 :         len--;
     155             :     }
     156           4 :     *p = '\0';
     157           4 : }
     158             : 
     159             : /*
     160             :  * MIC ---> ASCII
     161             :  */
     162             : void
     163           0 : pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
     164             : {
     165             :     int         c1;
     166             : 
     167           0 :     while (len > 0)
     168             :     {
     169           0 :         c1 = *mic;
     170           0 :         if (c1 == 0 || IS_HIGHBIT_SET(c1))
     171           0 :             report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
     172             :                                        (const char *) mic, len);
     173           0 :         *p++ = c1;
     174           0 :         mic++;
     175           0 :         len--;
     176             :     }
     177           0 :     *p = '\0';
     178           0 : }
     179             : 
     180             : /*
     181             :  * latin2mic_with_table: a generic single byte charset encoding
     182             :  * conversion from a local charset to the mule internal code.
     183             :  *
     184             :  * l points to the source string of length len
     185             :  * p is the output area (must be large enough!)
     186             :  * lc is the mule character set id for the local encoding
     187             :  * encoding is the PG identifier for the local encoding
     188             :  * tab holds conversion entries for the local charset
     189             :  * starting from 128 (0x80). Each entry in the table holds the corresponding
     190             :  * code point for the mule encoding, or 0 if there is no equivalent code.
     191             :  */
     192             : void
     193          16 : latin2mic_with_table(const unsigned char *l,
     194             :                      unsigned char *p,
     195             :                      int len,
     196             :                      int lc,
     197             :                      int encoding,
     198             :                      const unsigned char *tab)
     199             : {
     200             :     unsigned char c1,
     201             :                 c2;
     202             : 
     203          80 :     while (len > 0)
     204             :     {
     205          48 :         c1 = *l;
     206          48 :         if (c1 == 0)
     207           0 :             report_invalid_encoding(encoding, (const char *) l, len);
     208          48 :         if (!IS_HIGHBIT_SET(c1))
     209          48 :             *p++ = c1;
     210             :         else
     211             :         {
     212           0 :             c2 = tab[c1 - HIGHBIT];
     213           0 :             if (c2)
     214             :             {
     215           0 :                 *p++ = lc;
     216           0 :                 *p++ = c2;
     217             :             }
     218             :             else
     219           0 :                 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
     220             :                                            (const char *) l, len);
     221             :         }
     222          48 :         l++;
     223          48 :         len--;
     224             :     }
     225          16 :     *p = '\0';
     226          16 : }
     227             : 
     228             : /*
     229             :  * mic2latin_with_table: a generic single byte charset encoding
     230             :  * conversion from the mule internal code to a local charset.
     231             :  *
     232             :  * mic points to the source string of length len
     233             :  * p is the output area (must be large enough!)
     234             :  * lc is the mule character set id for the local encoding
     235             :  * encoding is the PG identifier for the local encoding
     236             :  * tab holds conversion entries for the mule internal code's second byte,
     237             :  * starting from 128 (0x80). Each entry in the table holds the corresponding
     238             :  * code point for the local charset, or 0 if there is no equivalent code.
     239             :  */
     240             : void
     241          16 : mic2latin_with_table(const unsigned char *mic,
     242             :                      unsigned char *p,
     243             :                      int len,
     244             :                      int lc,
     245             :                      int encoding,
     246             :                      const unsigned char *tab)
     247             : {
     248             :     unsigned char c1,
     249             :                 c2;
     250             : 
     251          80 :     while (len > 0)
     252             :     {
     253          48 :         c1 = *mic;
     254          48 :         if (c1 == 0)
     255           0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
     256          48 :         if (!IS_HIGHBIT_SET(c1))
     257             :         {
     258             :             /* easy for ASCII */
     259          48 :             *p++ = c1;
     260          48 :             mic++;
     261          48 :             len--;
     262             :         }
     263             :         else
     264             :         {
     265           0 :             int         l = pg_mic_mblen(mic);
     266             : 
     267           0 :             if (len < l)
     268           0 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
     269             :                                         len);
     270           0 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
     271           0 :                 (c2 = tab[mic[1] - HIGHBIT]) == 0)
     272             :             {
     273           0 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
     274             :                                            (const char *) mic, len);
     275             :                 break;          /* keep compiler quiet */
     276             :             }
     277           0 :             *p++ = c2;
     278           0 :             mic += 2;
     279           0 :             len -= 2;
     280             :         }
     281             :     }
     282          16 :     *p = '\0';
     283          16 : }
     284             : 
     285             : /*
     286             :  * comparison routine for bsearch()
     287             :  * this routine is intended for combined UTF8 -> local code
     288             :  */
     289             : static int
     290           0 : compare3(const void *p1, const void *p2)
     291             : {
     292             :     uint32      s1,
     293             :                 s2,
     294             :                 d1,
     295             :                 d2;
     296             : 
     297           0 :     s1 = *(const uint32 *) p1;
     298           0 :     s2 = *((const uint32 *) p1 + 1);
     299           0 :     d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
     300           0 :     d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
     301           0 :     return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
     302             : }
     303             : 
     304             : /*
     305             :  * comparison routine for bsearch()
     306             :  * this routine is intended for local code -> combined UTF8
     307             :  */
     308             : static int
     309           0 : compare4(const void *p1, const void *p2)
     310             : {
     311             :     uint32      v1,
     312             :                 v2;
     313             : 
     314           0 :     v1 = *(const uint32 *) p1;
     315           0 :     v2 = ((const pg_local_to_utf_combined *) p2)->code;
     316           0 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
     317             : }
     318             : 
     319             : /*
     320             :  * store 32bit character representation into multibyte stream
     321             :  */
     322             : static inline unsigned char *
     323           0 : store_coded_char(unsigned char *dest, uint32 code)
     324             : {
     325           0 :     if (code & 0xff000000)
     326           0 :         *dest++ = code >> 24;
     327           0 :     if (code & 0x00ff0000)
     328           0 :         *dest++ = code >> 16;
     329           0 :     if (code & 0x0000ff00)
     330           0 :         *dest++ = code >> 8;
     331           0 :     if (code & 0x000000ff)
     332           0 :         *dest++ = code;
     333           0 :     return dest;
     334             : }
     335             : 
     336             : /*
     337             :  * Convert a character using a conversion radix tree.
     338             :  *
     339             :  * 'l' is the length of the input character in bytes; its bytes are passed
     340             :  * right-aligned in b1-b4 (e.g. a 2-byte character uses only b3 and b4).
     341             :  */
     342             : static inline uint32
     343           0 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
     344             :                  int l,
     345             :                  unsigned char b1,
     346             :                  unsigned char b2,
     347             :                  unsigned char b3,
     348             :                  unsigned char b4)
     349             : {
     350           0 :     if (l == 4)
     351             :     {
     352             :         /* 4-byte code */
     353             : 
     354             :         /* check code validity */
     355           0 :         if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
     356           0 :             b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
     357           0 :             b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
     358           0 :             b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
     359           0 :             return 0;
     360             : 
     361             :         /* perform lookup */
     362           0 :         if (rt->chars32)
     363             :         {
     364           0 :             uint32      idx = rt->b4root;
     365             : 
     366           0 :             idx = rt->chars32[b1 + idx - rt->b4_1_lower];
     367           0 :             idx = rt->chars32[b2 + idx - rt->b4_2_lower];
     368           0 :             idx = rt->chars32[b3 + idx - rt->b4_3_lower];
     369           0 :             return rt->chars32[b4 + idx - rt->b4_4_lower];
     370             :         }
     371             :         else
     372             :         {
     373           0 :             uint16      idx = rt->b4root;
     374             : 
     375           0 :             idx = rt->chars16[b1 + idx - rt->b4_1_lower];
     376           0 :             idx = rt->chars16[b2 + idx - rt->b4_2_lower];
     377           0 :             idx = rt->chars16[b3 + idx - rt->b4_3_lower];
     378           0 :             return rt->chars16[b4 + idx - rt->b4_4_lower];
     379             :         }
     380             :     }
     381           0 :     else if (l == 3)
     382             :     {
     383             :         /* 3-byte code */
     384             : 
     385             :         /* check code validity */
     386           0 :         if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
     387           0 :             b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
     388           0 :             b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
     389           0 :             return 0;
     390             : 
     391             :         /* perform lookup */
     392           0 :         if (rt->chars32)
     393             :         {
     394           0 :             uint32      idx = rt->b3root;
     395             : 
     396           0 :             idx = rt->chars32[b2 + idx - rt->b3_1_lower];
     397           0 :             idx = rt->chars32[b3 + idx - rt->b3_2_lower];
     398           0 :             return rt->chars32[b4 + idx - rt->b3_3_lower];
     399             :         }
     400             :         else
     401             :         {
     402           0 :             uint16      idx = rt->b3root;
     403             : 
     404           0 :             idx = rt->chars16[b2 + idx - rt->b3_1_lower];
     405           0 :             idx = rt->chars16[b3 + idx - rt->b3_2_lower];
     406           0 :             return rt->chars16[b4 + idx - rt->b3_3_lower];
     407             :         }
     408             :     }
     409           0 :     else if (l == 2)
     410             :     {
     411             :         /* 2-byte code */
     412             : 
     413             :         /* check code validity */
     414           0 :         if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
     415           0 :             b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
     416           0 :             return 0;
     417             : 
     418             :         /* perform lookup */
     419           0 :         if (rt->chars32)
     420             :         {
     421           0 :             uint32      idx = rt->b2root;
     422             : 
     423           0 :             idx = rt->chars32[b3 + idx - rt->b2_1_lower];
     424           0 :             return rt->chars32[b4 + idx - rt->b2_2_lower];
     425             :         }
     426             :         else
     427             :         {
     428           0 :             uint16      idx = rt->b2root;
     429             : 
     430           0 :             idx = rt->chars16[b3 + idx - rt->b2_1_lower];
     431           0 :             return rt->chars16[b4 + idx - rt->b2_2_lower];
     432             :         }
     433             :     }
     434           0 :     else if (l == 1)
     435             :     {
     436             :         /* 1-byte code */
     437             : 
     438             :         /* check code validity */
     439           0 :         if (b4 < rt->b1_lower || b4 > rt->b1_upper)
     440           0 :             return 0;
     441             : 
     442             :         /* perform lookup */
     443           0 :         if (rt->chars32)
     444           0 :             return rt->chars32[b4 + rt->b1root - rt->b1_lower];
     445             :         else
     446           0 :             return rt->chars16[b4 + rt->b1root - rt->b1_lower];
     447             :     }
     448           0 :     return 0;                   /* shouldn't happen */
     449             : }
     450             : 
     451             : /*
     452             :  * UTF8 ---> local code
     453             :  *
     454             :  * utf: input string in UTF8 encoding (need not be null-terminated)
     455             :  * len: length of input string (in bytes)
     456             :  * iso: pointer to the output area (must be large enough!)
     457             :  *        (output string will be null-terminated)
     458             :  * map: conversion map for single characters
     459             :  * cmap: conversion map for combined characters
     460             :  *        (optional, pass NULL if none)
     461             :  * cmapsize: number of entries in the conversion map for combined characters
     462             :  *        (optional, pass 0 if none)
     463             :  * conv_func: algorithmic encoding conversion function
     464             :  *        (optional, pass NULL if none)
     465             :  * encoding: PG identifier for the local encoding
     466             :  *
     467             :  * For each character, the cmap (if provided) is consulted first; if no match,
     468             :  * the map is consulted next; if still no match, the conv_func (if provided)
     469             :  * is applied.  An error is raised if no match is found.
     470             :  *
     471             :  * See pg_wchar.h for more details about the data structures used here.
     472             :  */
     473             : void
     474         152 : UtfToLocal(const unsigned char *utf, int len,
     475             :            unsigned char *iso,
     476             :            const pg_mb_radix_tree *map,
     477             :            const pg_utf_to_local_combined *cmap, int cmapsize,
     478             :            utf_local_conversion_func conv_func,
     479             :            int encoding)
     480             : {
     481             :     uint32      iutf;
     482             :     int         l;
     483             :     const pg_utf_to_local_combined *cp;
     484             : 
     485         152 :     if (!PG_VALID_ENCODING(encoding))
     486           0 :         ereport(ERROR,
     487             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     488             :                  errmsg("invalid encoding number: %d", encoding)));
     489             : 
     490         608 :     for (; len > 0; len -= l)
     491             :     {
     492         456 :         unsigned char b1 = 0;
     493         456 :         unsigned char b2 = 0;
     494         456 :         unsigned char b3 = 0;
     495         456 :         unsigned char b4 = 0;
     496             : 
     497             :         /* "break" cases all represent errors */
     498         456 :         if (*utf == '\0')
     499           0 :             break;
     500             : 
     501         456 :         l = pg_utf_mblen(utf);
     502         456 :         if (len < l)
     503           0 :             break;
     504             : 
     505         456 :         if (!pg_utf8_islegal(utf, l))
     506           0 :             break;
     507             : 
     508         456 :         if (l == 1)
     509             :         {
     510             :             /* ASCII case is easy, assume it's one-to-one conversion */
     511         456 :             *iso++ = *utf++;
     512         456 :             continue;
     513             :         }
     514             : 
     515             :         /* collect coded char of length l */
     516           0 :         if (l == 2)
     517             :         {
     518           0 :             b3 = *utf++;
     519           0 :             b4 = *utf++;
     520             :         }
     521           0 :         else if (l == 3)
     522             :         {
     523           0 :             b2 = *utf++;
     524           0 :             b3 = *utf++;
     525           0 :             b4 = *utf++;
     526             :         }
     527           0 :         else if (l == 4)
     528             :         {
     529           0 :             b1 = *utf++;
     530           0 :             b2 = *utf++;
     531           0 :             b3 = *utf++;
     532           0 :             b4 = *utf++;
     533             :         }
     534             :         else
     535             :         {
     536           0 :             elog(ERROR, "unsupported character length %d", l);
     537             :             iutf = 0;           /* keep compiler quiet */
     538             :         }
     539           0 :         iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
     540             : 
     541             :         /* First, try with combined map if possible */
     542           0 :         if (cmap && len > l)
     543             :         {
     544           0 :             const unsigned char *utf_save = utf;
     545           0 :             int         len_save = len;
     546           0 :             int         l_save = l;
     547             : 
     548             :             /* collect next character, same as above */
     549           0 :             len -= l;
     550             : 
     551           0 :             l = pg_utf_mblen(utf);
     552           0 :             if (len < l)
     553           0 :                 break;
     554             : 
     555           0 :             if (!pg_utf8_islegal(utf, l))
     556           0 :                 break;
     557             : 
     558             :             /* We assume an ASCII character cannot be in the combined map */
     559           0 :             if (l > 1)
     560             :             {
     561             :                 uint32      iutf2;
     562             :                 uint32      cutf[2];
     563             : 
     564           0 :                 if (l == 2)
     565             :                 {
     566           0 :                     iutf2 = *utf++ << 8;
     567           0 :                     iutf2 |= *utf++;
     568             :                 }
     569           0 :                 else if (l == 3)
     570             :                 {
     571           0 :                     iutf2 = *utf++ << 16;
     572           0 :                     iutf2 |= *utf++ << 8;
     573           0 :                     iutf2 |= *utf++;
     574             :                 }
     575           0 :                 else if (l == 4)
     576             :                 {
     577           0 :                     iutf2 = *utf++ << 24;
     578           0 :                     iutf2 |= *utf++ << 16;
     579           0 :                     iutf2 |= *utf++ << 8;
     580           0 :                     iutf2 |= *utf++;
     581             :                 }
     582             :                 else
     583             :                 {
     584           0 :                     elog(ERROR, "unsupported character length %d", l);
     585             :                     iutf2 = 0;  /* keep compiler quiet */
     586             :                 }
     587             : 
     588           0 :                 cutf[0] = iutf;
     589           0 :                 cutf[1] = iutf2;
     590             : 
     591           0 :                 cp = bsearch(cutf, cmap, cmapsize,
     592             :                              sizeof(pg_utf_to_local_combined), compare3);
     593             : 
     594           0 :                 if (cp)
     595             :                 {
     596           0 :                     iso = store_coded_char(iso, cp->code);
     597           0 :                     continue;
     598             :                 }
     599             :             }
     600             : 
     601             :             /* fail, so back up to reprocess second character next time */
     602           0 :             utf = utf_save;
     603           0 :             len = len_save;
     604           0 :             l = l_save;
     605             :         }
     606             : 
     607             :         /* Now check ordinary map */
     608           0 :         if (map)
     609             :         {
     610           0 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     611             : 
     612           0 :             if (converted)
     613             :             {
     614           0 :                 iso = store_coded_char(iso, converted);
     615           0 :                 continue;
     616             :             }
     617             :         }
     618             : 
     619             :         /* if there's a conversion function, try that */
     620           0 :         if (conv_func)
     621             :         {
     622           0 :             uint32      converted = (*conv_func) (iutf);
     623             : 
     624           0 :             if (converted)
     625             :             {
     626           0 :                 iso = store_coded_char(iso, converted);
     627           0 :                 continue;
     628             :             }
     629             :         }
     630             : 
     631             :         /* failed to translate this character */
     632           0 :         report_untranslatable_char(PG_UTF8, encoding,
     633           0 :                                    (const char *) (utf - l), len);
     634             :     }
     635             : 
     636             :     /* if we broke out of loop early, must be invalid input */
     637         152 :     if (len > 0)
     638           0 :         report_invalid_encoding(PG_UTF8, (const char *) utf, len);
     639             : 
     640         152 :     *iso = '\0';
     641         152 : }
     642             : 
     643             : /*
     644             :  * local code ---> UTF8
     645             :  *
     646             :  * iso: input string in local encoding (need not be null-terminated)
     647             :  * len: length of input string (in bytes)
     648             :  * utf: pointer to the output area (must be large enough!)
     649             :  *        (output string will be null-terminated)
     650             :  * map: conversion map for single characters
     651             :  * cmap: conversion map for combined characters
     652             :  *        (optional, pass NULL if none)
     653             :  * cmapsize: number of entries in the conversion map for combined characters
     654             :  *        (optional, pass 0 if none)
     655             :  * conv_func: algorithmic encoding conversion function
     656             :  *        (optional, pass NULL if none)
     657             :  * encoding: PG identifier for the local encoding
     658             :  *
     659             :  * For each character, the map is consulted first; if no match, the cmap
     660             :  * (if provided) is consulted next; if still no match, the conv_func
     661             :  * (if provided) is applied.  An error is raised if no match is found.
     662             :  *
     663             :  * See pg_wchar.h for more details about the data structures used here.
     664             :  */
     665             : void
     666         152 : LocalToUtf(const unsigned char *iso, int len,
     667             :            unsigned char *utf,
     668             :            const pg_mb_radix_tree *map,
     669             :            const pg_local_to_utf_combined *cmap, int cmapsize,
     670             :            utf_local_conversion_func conv_func,
     671             :            int encoding)
     672             : {
     673             :     uint32      iiso;
     674             :     int         l;
     675             :     const pg_local_to_utf_combined *cp;
     676             : 
     677         152 :     if (!PG_VALID_ENCODING(encoding))
     678           0 :         ereport(ERROR,
     679             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     680             :                  errmsg("invalid encoding number: %d", encoding)));
     681             : 
     682         608 :     for (; len > 0; len -= l)
     683             :     {
     684         456 :         unsigned char b1 = 0;
     685         456 :         unsigned char b2 = 0;
     686         456 :         unsigned char b3 = 0;
     687         456 :         unsigned char b4 = 0;
     688             : 
     689             :         /* every "break" below leaves len > 0, which is reported after the loop */
     690         456 :         if (*iso == '\0')
     691           0 :             break;
     692             : 
     693         456 :         if (!IS_HIGHBIT_SET(*iso))
     694             :         {
     695             :             /* ASCII case is easy, assume it's one-to-one conversion */
     696         456 :             *utf++ = *iso++;
     697         456 :             l = 1;
     698         456 :             continue;
     699             :         }
     700             : 
     701           0 :         l = pg_encoding_verifymb(encoding, (const char *) iso, len);
     702           0 :         if (l < 0)
     703           0 :             break;
     704             : 
     705             :         /* collect coded char of length l, right-aligned so b4 is its last byte */
     706           0 :         if (l == 1)
     707           0 :             b4 = *iso++;
     708           0 :         else if (l == 2)
     709             :         {
     710           0 :             b3 = *iso++;
     711           0 :             b4 = *iso++;
     712             :         }
     713           0 :         else if (l == 3)
     714             :         {
     715           0 :             b2 = *iso++;
     716           0 :             b3 = *iso++;
     717           0 :             b4 = *iso++;
     718             :         }
     719           0 :         else if (l == 4)
     720             :         {
     721           0 :             b1 = *iso++;
     722           0 :             b2 = *iso++;
     723           0 :             b3 = *iso++;
     724           0 :             b4 = *iso++;
     725             :         }
     726             :         else
     727             :         {
     728           0 :             elog(ERROR, "unsupported character length %d", l);
     729             :             iiso = 0;           /* keep compiler quiet */
     730             :         }
     731           0 :         iiso = ((uint32) b1 << 24 | (uint32) b2 << 16 | (uint32) b3 << 8 | b4);   /* cast avoids signed-int overflow in b1 << 24 */
     732             : 
     733           0 :         if (map)
     734             :         {
     735           0 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     736             : 
     737           0 :             if (converted)
     738             :             {
     739           0 :                 utf = store_coded_char(utf, converted);
     740           0 :                 continue;
     741             :             }
     742             : 
     743             :             /* No match in map; try the combined map, if any (only reached when map is non-NULL) */
     744           0 :             if (cmap)
     745             :             {
     746           0 :                 cp = bsearch(&iiso, cmap, cmapsize,
     747             :                              sizeof(pg_local_to_utf_combined), compare4);
     748             : 
     749           0 :                 if (cp)
     750             :                 {
     751           0 :                     utf = store_coded_char(utf, cp->utf1);
     752           0 :                     utf = store_coded_char(utf, cp->utf2);
     753           0 :                     continue;
     754             :                 }
     755             :             }
     756             :         }
     757             : 
     758             :         /* if there's a conversion function, try that */
     759           0 :         if (conv_func)
     760             :         {
     761           0 :             uint32      converted = (*conv_func) (iiso);
     762             : 
     763           0 :             if (converted)
     764             :             {
     765           0 :                 utf = store_coded_char(utf, converted);
     766           0 :                 continue;
     767             :             }
     768             :         }
     769             : 
     770             :         /* failed to translate this character */
     771           0 :         report_untranslatable_char(encoding, PG_UTF8,
     772           0 :                                    (const char *) (iso - l), len);
     773             :     }
     774             : 
     775             :     /* if we broke out of loop early, must be invalid input */
     776         152 :     if (len > 0)
     777           0 :         report_invalid_encoding(encoding, (const char *) iso, len);
     778             : 
     779         152 :     *utf = '\0';
     780         152 : }

Generated by: LCOV version 1.13