LCOV - code coverage report
Current view: top level - src/backend/utils/mb - conv.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 274 325 84.3 %
Date: 2024-04-19 18:11:10 Functions: 11 11 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  *    Utility functions for conversion procs.
       4             :  *
       5             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       6             :  * Portions Copyright (c) 1994, Regents of the University of California
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/backend/utils/mb/conv.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "postgres.h"
      14             : #include "mb/pg_wchar.h"
      15             : 
      16             : 
      17             : /*
      18             :  * local2local: a generic single byte charset encoding
      19             :  * conversion between two ASCII-superset encodings.
      20             :  *
      21             :  * l points to the source string of length len
      22             :  * p is the output area (must be large enough!)
      23             :  * src_encoding is the PG identifier for the source encoding
      24             :  * dest_encoding is the PG identifier for the target encoding
      25             :  * tab holds conversion entries for the source charset
      26             :  * starting from 128 (0x80). each entry in the table holds the corresponding
      27             :  * code point for the target charset, or 0 if there is no equivalent code.
      28             :  *
      29             :  * Returns the number of input bytes consumed.  If noError is true, this can
      30             :  * be less than 'len'.
      31             :  */
      32             : int
      33         228 : local2local(const unsigned char *l,
      34             :             unsigned char *p,
      35             :             int len,
      36             :             int src_encoding,
      37             :             int dest_encoding,
      38             :             const unsigned char *tab,
      39             :             bool noError)
      40             : {
      41         228 :     const unsigned char *start = l;
      42             :     unsigned char c1,
      43             :                 c2;
      44             : 
      45         732 :     while (len > 0)
      46             :     {
      47         612 :         c1 = *l;
      48         612 :         if (c1 == 0)
      49             :         {
      50         108 :             if (noError)
      51          54 :                 break;
      52          54 :             report_invalid_encoding(src_encoding, (const char *) l, len);
      53             :         }
      54         504 :         if (!IS_HIGHBIT_SET(c1))
      55         306 :             *p++ = c1;
      56             :         else
      57             :         {
      58         198 :             c2 = tab[c1 - HIGHBIT];
      59         198 :             if (c2)
      60         198 :                 *p++ = c2;
      61             :             else
      62             :             {
      63           0 :                 if (noError)
      64           0 :                     break;
      65           0 :                 report_untranslatable_char(src_encoding, dest_encoding,
      66             :                                            (const char *) l, len);
      67             :             }
      68             :         }
      69         504 :         l++;
      70         504 :         len--;
      71             :     }
      72         174 :     *p = '\0';
      73             : 
      74         174 :     return l - start;
      75             : }
      76             : 
      77             : /*
      78             :  * LATINn ---> MIC when the charset's local codes map directly to MIC
      79             :  *
      80             :  * l points to the source string of length len
      81             :  * p is the output area (must be large enough!)
      82             :  * lc is the mule character set id for the local encoding
      83             :  * encoding is the PG identifier for the local encoding
      84             :  *
      85             :  * Returns the number of input bytes consumed.  If noError is true, this can
      86             :  * be less than 'len'.
      87             :  */
      88             : int
      89          30 : latin2mic(const unsigned char *l, unsigned char *p, int len,
      90             :           int lc, int encoding, bool noError)
      91             : {
      92          30 :     const unsigned char *start = l;
      93             :     int         c1;
      94             : 
      95         120 :     while (len > 0)
      96             :     {
      97          90 :         c1 = *l;
      98          90 :         if (c1 == 0)
      99             :         {
     100           0 :             if (noError)
     101           0 :                 break;
     102           0 :             report_invalid_encoding(encoding, (const char *) l, len);
     103             :         }
     104          90 :         if (IS_HIGHBIT_SET(c1))
     105           0 :             *p++ = lc;
     106          90 :         *p++ = c1;
     107          90 :         l++;
     108          90 :         len--;
     109             :     }
     110          30 :     *p = '\0';
     111             : 
     112          30 :     return l - start;
     113             : }
     114             : 
     115             : /*
     116             :  * MIC ---> LATINn when the charset's local codes map directly to MIC
     117             :  *
     118             :  * mic points to the source string of length len
     119             :  * p is the output area (must be large enough!)
     120             :  * lc is the mule character set id for the local encoding
     121             :  * encoding is the PG identifier for the local encoding
     122             :  *
     123             :  * Returns the number of input bytes consumed.  If noError is true, this can
     124             :  * be less than 'len'.
     125             :  */
     126             : int
     127         354 : mic2latin(const unsigned char *mic, unsigned char *p, int len,
     128             :           int lc, int encoding, bool noError)
     129             : {
     130         354 :     const unsigned char *start = mic;
     131             :     int         c1;
     132             : 
     133         840 :     while (len > 0)
     134             :     {
     135         774 :         c1 = *mic;
     136         774 :         if (c1 == 0)
     137             :         {
     138           0 :             if (noError)
     139           0 :                 break;
     140           0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
     141             :         }
     142         774 :         if (!IS_HIGHBIT_SET(c1))
     143             :         {
     144             :             /* easy for ASCII */
     145         360 :             *p++ = c1;
     146         360 :             mic++;
     147         360 :             len--;
     148             :         }
     149             :         else
     150             :         {
     151         414 :             int         l = pg_mule_mblen(mic);
     152             : 
     153         414 :             if (len < l)
     154             :             {
     155         108 :                 if (noError)
     156          54 :                     break;
     157          54 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
     158             :                                         len);
     159             :             }
     160         306 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
     161             :             {
     162         180 :                 if (noError)
     163          90 :                     break;
     164          90 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
     165             :                                            (const char *) mic, len);
     166             :             }
     167         126 :             *p++ = mic[1];
     168         126 :             mic += 2;
     169         126 :             len -= 2;
     170             :         }
     171             :     }
     172         210 :     *p = '\0';
     173             : 
     174         210 :     return mic - start;
     175             : }
     176             : 
     177             : 
     178             : /*
     179             :  * latin2mic_with_table: a generic single byte charset encoding
     180             :  * conversion from a local charset to the mule internal code.
     181             :  *
     182             :  * l points to the source string of length len
     183             :  * p is the output area (must be large enough!)
     184             :  * lc is the mule character set id for the local encoding
     185             :  * encoding is the PG identifier for the local encoding
     186             :  * tab holds conversion entries for the local charset
     187             :  * starting from 128 (0x80). each entry in the table holds the corresponding
     188             :  * code point for the mule encoding, or 0 if there is no equivalent code.
     189             :  *
     190             :  * Returns the number of input bytes consumed.  If noError is true, this can
     191             :  * be less than 'len'.
     192             :  */
     193             : int
     194         168 : latin2mic_with_table(const unsigned char *l,
     195             :                      unsigned char *p,
     196             :                      int len,
     197             :                      int lc,
     198             :                      int encoding,
     199             :                      const unsigned char *tab,
     200             :                      bool noError)
     201             : {
     202         168 :     const unsigned char *start = l;
     203             :     unsigned char c1,
     204             :                 c2;
     205             : 
     206         492 :     while (len > 0)
     207             :     {
     208         432 :         c1 = *l;
     209         432 :         if (c1 == 0)
     210             :         {
     211         108 :             if (noError)
     212          54 :                 break;
     213          54 :             report_invalid_encoding(encoding, (const char *) l, len);
     214             :         }
     215         324 :         if (!IS_HIGHBIT_SET(c1))
     216         126 :             *p++ = c1;
     217             :         else
     218             :         {
     219         198 :             c2 = tab[c1 - HIGHBIT];
     220         198 :             if (c2)
     221             :             {
     222         198 :                 *p++ = lc;
     223         198 :                 *p++ = c2;
     224             :             }
     225             :             else
     226             :             {
     227           0 :                 if (noError)
     228           0 :                     break;
     229           0 :                 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
     230             :                                            (const char *) l, len);
     231             :             }
     232             :         }
     233         324 :         l++;
     234         324 :         len--;
     235             :     }
     236         114 :     *p = '\0';
     237             : 
     238         114 :     return l - start;
     239             : }
     240             : 
     241             : /*
     242             :  * mic2latin_with_table: a generic single byte charset encoding
     243             :  * conversion from the mule internal code to a local charset.
     244             :  *
     245             :  * mic points to the source string of length len
     246             :  * p is the output area (must be large enough!)
     247             :  * lc is the mule character set id for the local encoding
     248             :  * encoding is the PG identifier for the local encoding
     249             :  * tab holds conversion entries for the mule internal code's second byte,
     250             :  * starting from 128 (0x80). each entry in the table holds the corresponding
     251             :  * code point for the local charset, or 0 if there is no equivalent code.
     252             :  *
     253             :  * Returns the number of input bytes consumed.  If noError is true, this can
     254             :  * be less than 'len'.
     255             :  */
     256             : int
     257         348 : mic2latin_with_table(const unsigned char *mic,
     258             :                      unsigned char *p,
     259             :                      int len,
     260             :                      int lc,
     261             :                      int encoding,
     262             :                      const unsigned char *tab,
     263             :                      bool noError)
     264             : {
     265         348 :     const unsigned char *start = mic;
     266             :     unsigned char c1,
     267             :                 c2;
     268             : 
     269         816 :     while (len > 0)
     270             :     {
     271         756 :         c1 = *mic;
     272         756 :         if (c1 == 0)
     273             :         {
     274           0 :             if (noError)
     275           0 :                 break;
     276           0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
     277             :         }
     278         756 :         if (!IS_HIGHBIT_SET(c1))
     279             :         {
     280             :             /* easy for ASCII */
     281         342 :             *p++ = c1;
     282         342 :             mic++;
     283         342 :             len--;
     284             :         }
     285             :         else
     286             :         {
     287         414 :             int         l = pg_mule_mblen(mic);
     288             : 
     289         414 :             if (len < l)
     290             :             {
     291         108 :                 if (noError)
     292          54 :                     break;
     293          54 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
     294             :                                         len);
     295             :             }
     296         306 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
     297         126 :                 (c2 = tab[mic[1] - HIGHBIT]) == 0)
     298             :             {
     299         180 :                 if (noError)
     300          90 :                     break;
     301          90 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
     302             :                                            (const char *) mic, len);
     303             :                 break;          /* keep compiler quiet */
     304             :             }
     305         126 :             *p++ = c2;
     306         126 :             mic += 2;
     307         126 :             len -= 2;
     308             :         }
     309             :     }
     310         204 :     *p = '\0';
     311             : 
     312         204 :     return mic - start;
     313             : }
     314             : 
     315             : /*
     316             :  * comparison routine for bsearch()
     317             :  * this routine is intended for combined UTF8 -> local code
     318             :  */
     319             : static int
     320         468 : compare3(const void *p1, const void *p2)
     321             : {
     322             :     uint32      s1,
     323             :                 s2,
     324             :                 d1,
     325             :                 d2;
     326             : 
     327         468 :     s1 = *(const uint32 *) p1;
     328         468 :     s2 = *((const uint32 *) p1 + 1);
     329         468 :     d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
     330         468 :     d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
     331         468 :     return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
     332             : }
     333             : 
     334             : /*
     335             :  * comparison routine for bsearch()
     336             :  * this routine is intended for local code -> combined UTF8
     337             :  */
     338             : static int
     339         162 : compare4(const void *p1, const void *p2)
     340             : {
     341             :     uint32      v1,
     342             :                 v2;
     343             : 
     344         162 :     v1 = *(const uint32 *) p1;
     345         162 :     v2 = ((const pg_local_to_utf_combined *) p2)->code;
     346         162 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
     347             : }
     348             : 
     349             : /*
     350             :  * store 32bit character representation into multibyte stream
     351             :  */
     352             : static inline unsigned char *
     353        1134 : store_coded_char(unsigned char *dest, uint32 code)
     354             : {
     355        1134 :     if (code & 0xff000000)
     356         126 :         *dest++ = code >> 24;
     357        1134 :     if (code & 0x00ff0000)
     358         522 :         *dest++ = code >> 16;
     359        1134 :     if (code & 0x0000ff00)
     360        1008 :         *dest++ = code >> 8;
     361        1134 :     if (code & 0x000000ff)
     362        1134 :         *dest++ = code;
     363        1134 :     return dest;
     364             : }
     365             : 
     366             : /*
     367             :  * Convert a character using a conversion radix tree.
     368             :  *
     369             :  * 'l' is the length of the input character in bytes, and b1-b4 are
     370             :  * the input character's bytes.
     371             :  */
     372             : static inline uint32
     373        1980 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
     374             :                  int l,
     375             :                  unsigned char b1,
     376             :                  unsigned char b2,
     377             :                  unsigned char b3,
     378             :                  unsigned char b4)
     379             : {
     380        1980 :     if (l == 4)
     381             :     {
     382             :         /* 4-byte code */
     383             : 
     384             :         /* check code validity */
     385          90 :         if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
     386          90 :             b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
     387          90 :             b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
     388          90 :             b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
     389           0 :             return 0;
     390             : 
     391             :         /* perform lookup */
     392          90 :         if (rt->chars32)
     393             :         {
     394          90 :             uint32      idx = rt->b4root;
     395             : 
     396          90 :             idx = rt->chars32[b1 + idx - rt->b4_1_lower];
     397          90 :             idx = rt->chars32[b2 + idx - rt->b4_2_lower];
     398          90 :             idx = rt->chars32[b3 + idx - rt->b4_3_lower];
     399          90 :             return rt->chars32[b4 + idx - rt->b4_4_lower];
     400             :         }
     401             :         else
     402             :         {
     403           0 :             uint16      idx = rt->b4root;
     404             : 
     405           0 :             idx = rt->chars16[b1 + idx - rt->b4_1_lower];
     406           0 :             idx = rt->chars16[b2 + idx - rt->b4_2_lower];
     407           0 :             idx = rt->chars16[b3 + idx - rt->b4_3_lower];
     408           0 :             return rt->chars16[b4 + idx - rt->b4_4_lower];
     409             :         }
     410             :     }
     411        1890 :     else if (l == 3)
     412             :     {
     413             :         /* 3-byte code */
     414             : 
     415             :         /* check code validity */
     416         936 :         if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
     417         288 :             b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
     418         288 :             b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
     419         648 :             return 0;
     420             : 
     421             :         /* perform lookup */
     422         288 :         if (rt->chars32)
     423             :         {
     424         288 :             uint32      idx = rt->b3root;
     425             : 
     426         288 :             idx = rt->chars32[b2 + idx - rt->b3_1_lower];
     427         288 :             idx = rt->chars32[b3 + idx - rt->b3_2_lower];
     428         288 :             return rt->chars32[b4 + idx - rt->b3_3_lower];
     429             :         }
     430             :         else
     431             :         {
     432           0 :             uint16      idx = rt->b3root;
     433             : 
     434           0 :             idx = rt->chars16[b2 + idx - rt->b3_1_lower];
     435           0 :             idx = rt->chars16[b3 + idx - rt->b3_2_lower];
     436           0 :             return rt->chars16[b4 + idx - rt->b3_3_lower];
     437             :         }
     438             :     }
     439         954 :     else if (l == 2)
     440             :     {
     441             :         /* 2-byte code */
     442             : 
     443             :         /* check code validity - first byte */
     444         756 :         if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
     445         684 :             b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
     446          72 :             return 0;
     447             : 
     448             :         /* perform lookup */
     449         684 :         if (rt->chars32)
     450             :         {
     451         522 :             uint32      idx = rt->b2root;
     452             : 
     453         522 :             idx = rt->chars32[b3 + idx - rt->b2_1_lower];
     454         522 :             return rt->chars32[b4 + idx - rt->b2_2_lower];
     455             :         }
     456             :         else
     457             :         {
     458         162 :             uint16      idx = rt->b2root;
     459             : 
     460         162 :             idx = rt->chars16[b3 + idx - rt->b2_1_lower];
     461         162 :             return rt->chars16[b4 + idx - rt->b2_2_lower];
     462             :         }
     463             :     }
     464         198 :     else if (l == 1)
     465             :     {
     466             :         /* 1-byte code */
     467             : 
     468             :         /* check code validity - first byte */
     469         198 :         if (b4 < rt->b1_lower || b4 > rt->b1_upper)
     470           0 :             return 0;
     471             : 
     472             :         /* perform lookup */
     473         198 :         if (rt->chars32)
     474         198 :             return rt->chars32[b4 + rt->b1root - rt->b1_lower];
     475             :         else
     476           0 :             return rt->chars16[b4 + rt->b1root - rt->b1_lower];
     477             :     }
     478           0 :     return 0;                   /* shouldn't happen */
     479             : }
     480             : 
     481             : /*
     482             :  * UTF8 ---> local code
     483             :  *
     484             :  * utf: input string in UTF8 encoding (need not be null-terminated)
     485             :  * len: length of input string (in bytes)
     486             :  * iso: pointer to the output area (must be large enough!)
     487             :  *        (output string will be null-terminated)
     488             :  * map: conversion map for single characters
     489             :  * cmap: conversion map for combined characters
     490             :  *        (optional, pass NULL if none)
     491             :  * cmapsize: number of entries in the conversion map for combined characters
     492             :  *        (optional, pass 0 if none)
     493             :  * conv_func: algorithmic encoding conversion function
     494             :  *        (optional, pass NULL if none)
     495             :  * encoding: PG identifier for the local encoding
     496             :  *
     497             :  * For each character, the cmap (if provided) is consulted first; if no match,
     498             :  * the map is consulted next; if still no match, the conv_func (if provided)
     499             :  * is applied.  An error is raised if no match is found.
     500             :  *
     501             :  * See pg_wchar.h for more details about the data structures used here.
     502             :  *
     503             :  * Returns the number of input bytes consumed.  If noError is true, this can
     504             :  * be less than 'len'.
     505             :  */
     506             : int
     507        2208 : UtfToLocal(const unsigned char *utf, int len,
     508             :            unsigned char *iso,
     509             :            const pg_mb_radix_tree *map,
     510             :            const pg_utf_to_local_combined *cmap, int cmapsize,
     511             :            utf_local_conversion_func conv_func,
     512             :            int encoding, bool noError)
     513             : {
     514             :     uint32      iutf;
     515             :     int         l;
     516             :     const pg_utf_to_local_combined *cp;
     517        2208 :     const unsigned char *start = utf;
     518             : 
     519        2208 :     if (!PG_VALID_ENCODING(encoding))
     520           0 :         ereport(ERROR,
     521             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     522             :                  errmsg("invalid encoding number: %d", encoding)));
     523             : 
     524        6060 :     for (; len > 0; len -= l)
     525             :     {
     526        5472 :         unsigned char b1 = 0;
     527        5472 :         unsigned char b2 = 0;
     528        5472 :         unsigned char b3 = 0;
     529        5472 :         unsigned char b4 = 0;
     530             : 
     531             :         /* "break" cases all represent errors */
     532        5472 :         if (*utf == '\0')
     533         180 :             break;
     534             : 
     535        5292 :         l = pg_utf_mblen(utf);
     536        5292 :         if (len < l)
     537         216 :             break;
     538             : 
     539        5076 :         if (!pg_utf8_islegal(utf, l))
     540         360 :             break;
     541             : 
     542        4716 :         if (l == 1)
     543             :         {
     544             :             /* ASCII case is easy, assume it's one-to-one conversion */
     545        3312 :             *iso++ = *utf++;
     546        3312 :             continue;
     547             :         }
     548             : 
     549             :         /* collect coded char of length l */
     550        1404 :         if (l == 2)
     551             :         {
     552         414 :             b3 = *utf++;
     553         414 :             b4 = *utf++;
     554             :         }
     555         990 :         else if (l == 3)
     556             :         {
     557         990 :             b2 = *utf++;
     558         990 :             b3 = *utf++;
     559         990 :             b4 = *utf++;
     560             :         }
     561           0 :         else if (l == 4)
     562             :         {
     563           0 :             b1 = *utf++;
     564           0 :             b2 = *utf++;
     565           0 :             b3 = *utf++;
     566           0 :             b4 = *utf++;
     567             :         }
     568             :         else
     569             :         {
     570           0 :             elog(ERROR, "unsupported character length %d", l);
     571             :             iutf = 0;           /* keep compiler quiet */
     572             :         }
     573        1404 :         iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
     574             : 
     575             :         /* First, try with combined map if possible */
     576        1404 :         if (cmap && len > l)
     577             :         {
     578         144 :             const unsigned char *utf_save = utf;
     579         144 :             int         len_save = len;
     580         144 :             int         l_save = l;
     581             : 
     582             :             /* collect next character, same as above */
     583         144 :             len -= l;
     584             : 
     585         144 :             l = pg_utf_mblen(utf);
     586         144 :             if (len < l)
     587             :             {
     588             :                 /* need more data to decide if this is a combined char */
     589          36 :                 utf -= l_save;
     590          36 :                 break;
     591             :             }
     592             : 
     593         108 :             if (!pg_utf8_islegal(utf, l))
     594             :             {
     595           0 :                 if (!noError)
     596           0 :                     report_invalid_encoding(PG_UTF8, (const char *) utf, len);
     597           0 :                 utf -= l_save;
     598           0 :                 break;
     599             :             }
     600             : 
     601             :             /* We assume ASCII character cannot be in combined map */
     602         108 :             if (l > 1)
     603             :             {
     604             :                 uint32      iutf2;
     605             :                 uint32      cutf[2];
     606             : 
     607         108 :                 if (l == 2)
     608             :                 {
     609          54 :                     iutf2 = *utf++ << 8;
     610          54 :                     iutf2 |= *utf++;
     611             :                 }
     612          54 :                 else if (l == 3)
     613             :                 {
     614          54 :                     iutf2 = *utf++ << 16;
     615          54 :                     iutf2 |= *utf++ << 8;
     616          54 :                     iutf2 |= *utf++;
     617             :                 }
     618           0 :                 else if (l == 4)
     619             :                 {
     620           0 :                     iutf2 = *utf++ << 24;
     621           0 :                     iutf2 |= *utf++ << 16;
     622           0 :                     iutf2 |= *utf++ << 8;
     623           0 :                     iutf2 |= *utf++;
     624             :                 }
     625             :                 else
     626             :                 {
     627           0 :                     elog(ERROR, "unsupported character length %d", l);
     628             :                     iutf2 = 0;  /* keep compiler quiet */
     629             :                 }
     630             : 
     631         108 :                 cutf[0] = iutf;
     632         108 :                 cutf[1] = iutf2;
     633             : 
     634         108 :                 cp = bsearch(cutf, cmap, cmapsize,
     635             :                              sizeof(pg_utf_to_local_combined), compare3);
     636             : 
     637         108 :                 if (cp)
     638             :                 {
     639          18 :                     iso = store_coded_char(iso, cp->code);
     640          18 :                     continue;
     641             :                 }
     642             :             }
     643             : 
     644             :             /* fail, so back up to reprocess second character next time */
     645          90 :             utf = utf_save;
     646          90 :             len = len_save;
     647          90 :             l = l_save;
     648             :         }
     649             : 
     650             :         /* Now check ordinary map */
     651        1350 :         if (map)
     652             :         {
     653        1350 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     654             : 
     655        1350 :             if (converted)
     656             :             {
     657         450 :                 iso = store_coded_char(iso, converted);
     658         450 :                 continue;
     659             :             }
     660             :         }
     661             : 
     662             :         /* if there's a conversion function, try that */
     663         900 :         if (conv_func)
     664             :         {
     665          72 :             uint32      converted = (*conv_func) (iutf);
     666             : 
     667          72 :             if (converted)
     668             :             {
     669          72 :                 iso = store_coded_char(iso, converted);
     670          72 :                 continue;
     671             :             }
     672             :         }
     673             : 
     674             :         /* failed to translate this character */
     675         828 :         utf -= l;
     676         828 :         if (noError)
     677         414 :             break;
     678         414 :         report_untranslatable_char(PG_UTF8, encoding,
     679             :                                    (const char *) utf, len);
     680             :     }
     681             : 
     682             :     /* if we broke out of loop early, must be invalid input */
     683        1794 :     if (len > 0 && !noError)
     684         396 :         report_invalid_encoding(PG_UTF8, (const char *) utf, len);
     685             : 
     686        1398 :     *iso = '\0';
     687             : 
     688        1398 :     return utf - start;
     689             : }
     690             : 
     691             : /*
     692             :  * local code ---> UTF8
     693             :  *
     694             :  * iso: input string in local encoding (need not be null-terminated)
     695             :  * len: length of input string (in bytes)
     696             :  * utf: pointer to the output area (must be large enough!)
     697             :  *        (output string will be null-terminated)
     698             :  * map: conversion map for single characters
     699             :  * cmap: conversion map for combined characters
     700             :  *        (optional, pass NULL if none)
     701             :  * cmapsize: number of entries in the conversion map for combined characters
     702             :  *        (optional, pass 0 if none)
     703             :  * conv_func: algorithmic encoding conversion function
     704             :  *        (optional, pass NULL if none)
     705             :  * encoding: PG identifier for the local encoding
     706             :  *
     707             :  * For each character, the map is consulted first; if no match, the cmap
     708             :  * (if provided) is consulted next; if still no match, the conv_func
     709             :  * (if provided) is applied.  Unless noError is true, an error is raised if no match is found.
     710             :  *
     711             :  * See pg_wchar.h for more details about the data structures used here.
     712             :  *
     713             :  * Returns the number of input bytes consumed.  If noError is true, this can
     714             :  * be less than 'len' (conversion stops at the first invalid or untranslatable character).
     715             :  */
     716             : int
     717        1236 : LocalToUtf(const unsigned char *iso, int len,
     718             :            unsigned char *utf,
     719             :            const pg_mb_radix_tree *map,
     720             :            const pg_local_to_utf_combined *cmap, int cmapsize,
     721             :            utf_local_conversion_func conv_func,
     722             :            int encoding,
     723             :            bool noError)
     724             : {
     725             :     uint32      iiso;
     726             :     int         l;
     727             :     const pg_local_to_utf_combined *cp;
     728        1236 :     const unsigned char *start = iso;   /* remembered to compute bytes consumed */
     729             : 
     730        1236 :     if (!PG_VALID_ENCODING(encoding))
     731           0 :         ereport(ERROR,
     732             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     733             :                  errmsg("invalid encoding number: %d", encoding)));
     734             : 
     735        4854 :     for (; len > 0; len -= l)
     736             :     {
     737        4374 :         unsigned char b1 = 0;
     738        4374 :         unsigned char b2 = 0;
     739        4374 :         unsigned char b3 = 0;
     740        4374 :         unsigned char b4 = 0;
     741             : 
     742             :         /* "break" cases all represent errors; they leave len > 0 for the check after the loop */
     743        4374 :         if (*iso == '\0')   /* embedded NUL byte: stop converting here */
     744         324 :             break;
     745             : 
     746        4050 :         if (!IS_HIGHBIT_SET(*iso))
     747             :         {
     748             :             /* ASCII case is easy, assume it's one-to-one conversion */
     749        3060 :             *utf++ = *iso++;
     750        3060 :             l = 1;
     751        3060 :             continue;
     752             :         }
     753             : 
     754         990 :         l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
     755         990 :         if (l < 0)  /* negative length: treated as invalid input */
     756         360 :             break;
     757             : 
     758             :         /* collect coded char of length l, right-aligned in b1..b4 (b4 is always the last byte) */
     759         630 :         if (l == 1)
     760         198 :             b4 = *iso++;
     761         432 :         else if (l == 2)
     762             :         {
     763         342 :             b3 = *iso++;
     764         342 :             b4 = *iso++;
     765             :         }
     766          90 :         else if (l == 3)
     767             :         {
     768           0 :             b2 = *iso++;
     769           0 :             b3 = *iso++;
     770           0 :             b4 = *iso++;
     771             :         }
     772          90 :         else if (l == 4)
     773             :         {
     774          90 :             b1 = *iso++;
     775          90 :             b2 = *iso++;
     776          90 :             b3 = *iso++;
     777          90 :             b4 = *iso++;
     778             :         }
     779             :         else
     780             :         {
     781           0 :             elog(ERROR, "unsupported character length %d", l);
     782             :             iiso = 0;           /* keep compiler quiet */
     783             :         }
     784         630 :         iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);    /* b1 most significant; NOTE(review): b1 promotes to int before << 24 - confirm no overflow concern */
     785             : 
     786         630 :         if (map)
     787             :         {
     788         630 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     789             : 
     790         630 :             if (converted)
     791             :             {
     792         468 :                 utf = store_coded_char(utf, converted);
     793         468 :                 continue;
     794             :             }
     795             : 
     796             :             /* If there's a combined character map, try that (only reached when map is non-NULL) */
     797         162 :             if (cmap)
     798             :             {
     799          36 :                 cp = bsearch(&iiso, cmap, cmapsize,
     800             :                              sizeof(pg_local_to_utf_combined), compare4);
     801             : 
     802          36 :                 if (cp)
     803             :                 {
     804          36 :                     utf = store_coded_char(utf, cp->utf1);  /* a combined entry emits two coded chars */
     805          36 :                     utf = store_coded_char(utf, cp->utf2);
     806          36 :                     continue;
     807             :                 }
     808             :             }
     809             :         }
     810             : 
     811             :         /* if there's a conversion function, try that */
     812         126 :         if (conv_func)
     813             :         {
     814          90 :             uint32      converted = (*conv_func) (iiso);
     815             : 
     816          90 :             if (converted)
     817             :             {
     818          54 :                 utf = store_coded_char(utf, converted);
     819          54 :                 continue;
     820             :             }
     821             :         }
     822             : 
     823             :         /* failed to translate this character; back up so iso points at its first byte */
     824          72 :         iso -= l;
     825          72 :         if (noError)
     826          36 :             break;
     827          36 :         report_untranslatable_char(encoding, PG_UTF8,
     828             :                                    (const char *) iso, len);
     829             :     }
     830             : 
     831             :     /* if we broke out of loop early, must be invalid input (not reported when noError is true) */
     832        1200 :     if (len > 0 && !noError)
     833         342 :         report_invalid_encoding(encoding, (const char *) iso, len);
     834             : 
     835         858 :     *utf = '\0';    /* null-terminate the output */
     836             : 
     837         858 :     return iso - start; /* number of input bytes consumed */
     838             : }

Generated by: LCOV version 1.14