LCOV - code coverage report
Current view: top level - src/backend/utils/mb - conv.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 83.9 % 236 198
Test Date: 2026-04-20 21:16:34 Functions: 100.0 % 7 7
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  *    Utility functions for conversion procs.
       4              :  *
       5              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       6              :  * Portions Copyright (c) 1994, Regents of the University of California
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    src/backend/utils/mb/conv.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : #include "postgres.h"
      14              : #include "mb/pg_wchar.h"
      15              : 
      16              : 
      17              : /*
      18              :  * local2local: a generic single byte charset encoding
      19              :  * conversion between two ASCII-superset encodings.
      20              :  *
      21              :  * l points to the source string of length len
      22              :  * p is the output area (must be large enough!)
      23              :  * src_encoding is the PG identifier for the source encoding
      24              :  * dest_encoding is the PG identifier for the target encoding
      25              :  * tab holds conversion entries for the source charset
      26              :  * starting from 128 (0x80). each entry in the table holds the corresponding
      27              :  * code point for the target charset, or 0 if there is no equivalent code.
      28              :  *
      29              :  * Returns the number of input bytes consumed.  If noError is true, this can
      30              :  * be less than 'len'.
      31              :  */
      32              : int
      33          152 : local2local(const unsigned char *l,
      34              :             unsigned char *p,
      35              :             int len,
      36              :             int src_encoding,
      37              :             int dest_encoding,
      38              :             const unsigned char *tab,
      39              :             bool noError)
      40              : {
      41          152 :     const unsigned char *start = l;
      42              :     unsigned char c1,
      43              :                 c2;
      44              : 
      45          488 :     while (len > 0)
      46              :     {
      47          408 :         c1 = *l;
      48          408 :         if (c1 == 0)
      49              :         {
      50           72 :             if (noError)
      51           36 :                 break;
      52           36 :             report_invalid_encoding(src_encoding, (const char *) l, len);
      53              :         }
      54          336 :         if (!IS_HIGHBIT_SET(c1))
      55          204 :             *p++ = c1;
      56              :         else
      57              :         {
      58          132 :             c2 = tab[c1 - HIGHBIT];
      59          132 :             if (c2)
      60          132 :                 *p++ = c2;
      61              :             else
      62              :             {
      63            0 :                 if (noError)
      64            0 :                     break;
      65            0 :                 report_untranslatable_char(src_encoding, dest_encoding,
      66              :                                            (const char *) l, len);
      67              :             }
      68              :         }
      69          336 :         l++;
      70          336 :         len--;
      71              :     }
      72          116 :     *p = '\0';
      73              : 
      74          116 :     return l - start;
      75              : }
      76              : 
      77              : /*
      78              :  * comparison routine for bsearch()
      79              :  * this routine is intended for combined UTF8 -> local code
      80              :  */
      81              : static int
      82          312 : compare3(const void *p1, const void *p2)
      83              : {
      84              :     uint32      s1,
      85              :                 s2,
      86              :                 d1,
      87              :                 d2;
      88              : 
      89          312 :     s1 = *(const uint32 *) p1;
      90          312 :     s2 = *((const uint32 *) p1 + 1);
      91          312 :     d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
      92          312 :     d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
      93          312 :     return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
      94              : }
      95              : 
      96              : /*
      97              :  * comparison routine for bsearch()
      98              :  * this routine is intended for local code -> combined UTF8
      99              :  */
     100              : static int
     101          108 : compare4(const void *p1, const void *p2)
     102              : {
     103              :     uint32      v1,
     104              :                 v2;
     105              : 
     106          108 :     v1 = *(const uint32 *) p1;
     107          108 :     v2 = ((const pg_local_to_utf_combined *) p2)->code;
     108          108 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
     109              : }
     110              : 
     111              : /*
     112              :  * store 32bit character representation into multibyte stream
     113              :  */
     114              : static inline unsigned char *
     115          820 : store_coded_char(unsigned char *dest, uint32 code)
     116              : {
     117          820 :     if (code & 0xff000000)
     118           84 :         *dest++ = code >> 24;
     119          820 :     if (code & 0x00ff0000)
     120          404 :         *dest++ = code >> 16;
     121          820 :     if (code & 0x0000ff00)
     122          736 :         *dest++ = code >> 8;
     123          820 :     if (code & 0x000000ff)
     124          820 :         *dest++ = code;
     125          820 :     return dest;
     126              : }
     127              : 
     128              : /*
     129              :  * Convert a character using a conversion radix tree.
     130              :  *
     131              :  * 'l' is the length of the input character in bytes, and b1-b4 are
     132              :  * the input character's bytes.
     133              :  */
     134              : static inline uint32
     135         1384 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
     136              :                  int l,
     137              :                  unsigned char b1,
     138              :                  unsigned char b2,
     139              :                  unsigned char b3,
     140              :                  unsigned char b4)
     141              : {
     142         1384 :     if (l == 4)
     143              :     {
     144              :         /* 4-byte code */
     145              : 
     146              :         /* check code validity */
     147           60 :         if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
     148           60 :             b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
     149           60 :             b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
     150           60 :             b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
     151            0 :             return 0;
     152              : 
     153              :         /* perform lookup */
     154           60 :         if (rt->chars32)
     155              :         {
     156           60 :             uint32      idx = rt->b4root;
     157              : 
     158           60 :             idx = rt->chars32[b1 + idx - rt->b4_1_lower];
     159           60 :             idx = rt->chars32[b2 + idx - rt->b4_2_lower];
     160           60 :             idx = rt->chars32[b3 + idx - rt->b4_3_lower];
     161           60 :             return rt->chars32[b4 + idx - rt->b4_4_lower];
     162              :         }
     163              :         else
     164              :         {
     165            0 :             uint16      idx = rt->b4root;
     166              : 
     167            0 :             idx = rt->chars16[b1 + idx - rt->b4_1_lower];
     168            0 :             idx = rt->chars16[b2 + idx - rt->b4_2_lower];
     169            0 :             idx = rt->chars16[b3 + idx - rt->b4_3_lower];
     170            0 :             return rt->chars16[b4 + idx - rt->b4_4_lower];
     171              :         }
     172              :     }
     173         1324 :     else if (l == 3)
     174              :     {
     175              :         /* 3-byte code */
     176              : 
     177              :         /* check code validity */
     178          632 :         if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
     179          200 :             b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
     180          200 :             b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
     181          432 :             return 0;
     182              : 
     183              :         /* perform lookup */
     184          200 :         if (rt->chars32)
     185              :         {
     186          200 :             uint32      idx = rt->b3root;
     187              : 
     188          200 :             idx = rt->chars32[b2 + idx - rt->b3_1_lower];
     189          200 :             idx = rt->chars32[b3 + idx - rt->b3_2_lower];
     190          200 :             return rt->chars32[b4 + idx - rt->b3_3_lower];
     191              :         }
     192              :         else
     193              :         {
     194            0 :             uint16      idx = rt->b3root;
     195              : 
     196            0 :             idx = rt->chars16[b2 + idx - rt->b3_1_lower];
     197            0 :             idx = rt->chars16[b3 + idx - rt->b3_2_lower];
     198            0 :             return rt->chars16[b4 + idx - rt->b3_3_lower];
     199              :         }
     200              :     }
     201          692 :     else if (l == 2)
     202              :     {
     203              :         /* 2-byte code */
     204              : 
     205              :         /* check code validity - first byte */
     206          560 :         if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
     207          512 :             b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
     208           48 :             return 0;
     209              : 
     210              :         /* perform lookup */
     211          512 :         if (rt->chars32)
     212              :         {
     213          404 :             uint32      idx = rt->b2root;
     214              : 
     215          404 :             idx = rt->chars32[b3 + idx - rt->b2_1_lower];
     216          404 :             return rt->chars32[b4 + idx - rt->b2_2_lower];
     217              :         }
     218              :         else
     219              :         {
     220          108 :             uint16      idx = rt->b2root;
     221              : 
     222          108 :             idx = rt->chars16[b3 + idx - rt->b2_1_lower];
     223          108 :             return rt->chars16[b4 + idx - rt->b2_2_lower];
     224              :         }
     225              :     }
     226          132 :     else if (l == 1)
     227              :     {
     228              :         /* 1-byte code */
     229              : 
     230              :         /* check code validity - first byte */
     231          132 :         if (b4 < rt->b1_lower || b4 > rt->b1_upper)
     232            0 :             return 0;
     233              : 
     234              :         /* perform lookup */
     235          132 :         if (rt->chars32)
     236          132 :             return rt->chars32[b4 + rt->b1root - rt->b1_lower];
     237              :         else
     238            0 :             return rt->chars16[b4 + rt->b1root - rt->b1_lower];
     239              :     }
     240            0 :     return 0;                   /* shouldn't happen */
     241              : }
     242              : 
     243              : /*
     244              :  * UTF8 ---> local code
     245              :  *
     246              :  * utf: input string in UTF8 encoding (need not be null-terminated)
     247              :  * len: length of input string (in bytes)
     248              :  * iso: pointer to the output area (must be large enough!)
     249              :  *        (output string will be null-terminated)
     250              :  * map: conversion map for single characters
     251              :  * cmap: conversion map for combined characters
     252              :  *        (optional, pass NULL if none)
     253              :  * cmapsize: number of entries in the conversion map for combined characters
     254              :  *        (optional, pass 0 if none)
     255              :  * conv_func: algorithmic encoding conversion function
     256              :  *        (optional, pass NULL if none)
     257              :  * encoding: PG identifier for the local encoding
     258              :  *
     259              :  * For each character, the cmap (if provided) is consulted first; if no match,
     260              :  * the map is consulted next; if still no match, the conv_func (if provided)
     261              :  * is applied.  An error is raised if no match is found.
     262              :  *
     263              :  * See pg_wchar.h for more details about the data structures used here.
     264              :  *
     265              :  * Returns the number of input bytes consumed.  If noError is true, this can
     266              :  * be less than 'len'.
     267              :  */
     268              : int
     269         1524 : UtfToLocal(const unsigned char *utf, int len,
     270              :            unsigned char *iso,
     271              :            const pg_mb_radix_tree *map,
     272              :            const pg_utf_to_local_combined *cmap, int cmapsize,
     273              :            utf_local_conversion_func conv_func,
     274              :            int encoding, bool noError)
     275              : {
     276              :     uint32      iutf;
     277              :     int         l;
     278              :     const pg_utf_to_local_combined *cp;
     279         1524 :     const unsigned char *start = utf;
     280              : 
     281         1524 :     if (!PG_VALID_ENCODING(encoding))
     282            0 :         ereport(ERROR,
     283              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     284              :                  errmsg("invalid encoding number: %d", encoding)));
     285              : 
     286         4756 :     for (; len > 0; len -= l)
     287              :     {
     288         4312 :         unsigned char b1 = 0;
     289         4312 :         unsigned char b2 = 0;
     290         4312 :         unsigned char b3 = 0;
     291         4312 :         unsigned char b4 = 0;
     292              : 
     293              :         /* "break" cases all represent errors */
     294         4312 :         if (*utf == '\0')
     295          120 :             break;
     296              : 
     297         4192 :         l = pg_utf_mblen(utf);
     298         4192 :         if (len < l)
     299          144 :             break;
     300              : 
     301         4048 :         if (!pg_utf8_islegal(utf, l))
     302          240 :             break;
     303              : 
     304         3808 :         if (l == 1)
     305              :         {
     306              :             /* ASCII case is easy, assume it's one-to-one conversion */
     307         2864 :             *iso++ = *utf++;
     308         2864 :             continue;
     309              :         }
     310              : 
     311              :         /* collect coded char of length l */
     312          944 :         if (l == 2)
     313              :         {
     314          276 :             b3 = *utf++;
     315          276 :             b4 = *utf++;
     316              :         }
     317          668 :         else if (l == 3)
     318              :         {
     319          668 :             b2 = *utf++;
     320          668 :             b3 = *utf++;
     321          668 :             b4 = *utf++;
     322              :         }
     323            0 :         else if (l == 4)
     324              :         {
     325            0 :             b1 = *utf++;
     326            0 :             b2 = *utf++;
     327            0 :             b3 = *utf++;
     328            0 :             b4 = *utf++;
     329              :         }
     330              :         else
     331              :         {
     332            0 :             elog(ERROR, "unsupported character length %d", l);
     333              :             iutf = 0;           /* keep compiler quiet */
     334              :         }
     335          944 :         iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
     336              : 
     337              :         /* First, try with combined map if possible */
     338          944 :         if (cmap && len > l)
     339              :         {
     340           96 :             const unsigned char *utf_save = utf;
     341           96 :             int         len_save = len;
     342           96 :             int         l_save = l;
     343              : 
     344              :             /* collect next character, same as above */
     345           96 :             len -= l;
     346              : 
     347           96 :             l = pg_utf_mblen(utf);
     348           96 :             if (len < l)
     349              :             {
     350              :                 /* need more data to decide if this is a combined char */
     351           24 :                 utf -= l_save;
     352           24 :                 break;
     353              :             }
     354              : 
     355           72 :             if (!pg_utf8_islegal(utf, l))
     356              :             {
     357            0 :                 if (!noError)
     358            0 :                     report_invalid_encoding(PG_UTF8, (const char *) utf, len);
     359            0 :                 utf -= l_save;
     360            0 :                 break;
     361              :             }
     362              : 
     363              :             /* We assume ASCII character cannot be in combined map */
     364           72 :             if (l > 1)
     365              :             {
     366              :                 uint32      iutf2;
     367              :                 uint32      cutf[2];
     368              : 
     369           72 :                 if (l == 2)
     370              :                 {
     371           36 :                     iutf2 = *utf++ << 8;
     372           36 :                     iutf2 |= *utf++;
     373              :                 }
     374           36 :                 else if (l == 3)
     375              :                 {
     376           36 :                     iutf2 = *utf++ << 16;
     377           36 :                     iutf2 |= *utf++ << 8;
     378           36 :                     iutf2 |= *utf++;
     379              :                 }
     380            0 :                 else if (l == 4)
     381              :                 {
     382            0 :                     iutf2 = *utf++ << 24;
     383            0 :                     iutf2 |= *utf++ << 16;
     384            0 :                     iutf2 |= *utf++ << 8;
     385            0 :                     iutf2 |= *utf++;
     386              :                 }
     387              :                 else
     388              :                 {
     389            0 :                     elog(ERROR, "unsupported character length %d", l);
     390              :                     iutf2 = 0;  /* keep compiler quiet */
     391              :                 }
     392              : 
     393           72 :                 cutf[0] = iutf;
     394           72 :                 cutf[1] = iutf2;
     395              : 
     396           72 :                 cp = bsearch(cutf, cmap, cmapsize,
     397              :                              sizeof(pg_utf_to_local_combined), compare3);
     398              : 
     399           72 :                 if (cp)
     400              :                 {
     401           12 :                     iso = store_coded_char(iso, cp->code);
     402           12 :                     continue;
     403              :                 }
     404              :             }
     405              : 
     406              :             /* fail, so back up to reprocess second character next time */
     407           60 :             utf = utf_save;
     408           60 :             len = len_save;
     409           60 :             l = l_save;
     410              :         }
     411              : 
     412              :         /* Now check ordinary map */
     413          908 :         if (map)
     414              :         {
     415          908 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     416              : 
     417          908 :             if (converted)
     418              :             {
     419          308 :                 iso = store_coded_char(iso, converted);
     420          308 :                 continue;
     421              :             }
     422              :         }
     423              : 
     424              :         /* if there's a conversion function, try that */
     425          600 :         if (conv_func)
     426              :         {
     427           48 :             uint32      converted = (*conv_func) (iutf);
     428              : 
     429           48 :             if (converted)
     430              :             {
     431           48 :                 iso = store_coded_char(iso, converted);
     432           48 :                 continue;
     433              :             }
     434              :         }
     435              : 
     436              :         /* failed to translate this character */
     437          552 :         utf -= l;
     438          552 :         if (noError)
     439          276 :             break;
     440          276 :         report_untranslatable_char(PG_UTF8, encoding,
     441              :                                    (const char *) utf, len);
     442              :     }
     443              : 
     444              :     /* if we broke out of loop early, must be invalid input */
     445         1248 :     if (len > 0 && !noError)
     446          264 :         report_invalid_encoding(PG_UTF8, (const char *) utf, len);
     447              : 
     448          984 :     *iso = '\0';
     449              : 
     450          984 :     return utf - start;
     451              : }
     452              : 
     453              : /*
     454              :  * local code ---> UTF8
     455              :  *
     456              :  * iso: input string in local encoding (need not be null-terminated)
     457              :  * len: length of input string (in bytes)
     458              :  * utf: pointer to the output area (must be large enough!)
     459              :  *        (output string will be null-terminated)
     460              :  * map: conversion map for single characters
     461              :  * cmap: conversion map for combined characters
     462              :  *        (optional, pass NULL if none)
     463              :  * cmapsize: number of entries in the conversion map for combined characters
     464              :  *        (optional, pass 0 if none)
     465              :  * conv_func: algorithmic encoding conversion function
     466              :  *        (optional, pass NULL if none)
     467              :  * encoding: PG identifier for the local encoding
     468              :  *
     469              :  * For each character, the map is consulted first; if no match, the cmap
     470              :  * (if provided) is consulted next; if still no match, the conv_func
     471              :  * (if provided) is applied.  An error is raised if no match is found.
     472              :  *
     473              :  * See pg_wchar.h for more details about the data structures used here.
     474              :  *
     475              :  * Returns the number of input bytes consumed.  If noError is true, this can
     476              :  * be less than 'len'.
     477              :  */
     478              : int
     479          908 : LocalToUtf(const unsigned char *iso, int len,
     480              :            unsigned char *utf,
     481              :            const pg_mb_radix_tree *map,
     482              :            const pg_local_to_utf_combined *cmap, int cmapsize,
     483              :            utf_local_conversion_func conv_func,
     484              :            int encoding,
     485              :            bool noError)
     486              : {
     487              :     uint32      iiso;
     488              :     int         l;
     489              :     const pg_local_to_utf_combined *cp;
     490          908 :     const unsigned char *start = iso;
     491              : 
     492          908 :     if (!PG_VALID_ENCODING(encoding))
     493            0 :         ereport(ERROR,
     494              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     495              :                  errmsg("invalid encoding number: %d", encoding)));
     496              : 
     497         4122 :     for (; len > 0; len -= l)
     498              :     {
     499         3766 :         unsigned char b1 = 0;
     500         3766 :         unsigned char b2 = 0;
     501         3766 :         unsigned char b3 = 0;
     502         3766 :         unsigned char b4 = 0;
     503              : 
     504              :         /* "break" cases all represent errors */
     505         3766 :         if (*iso == '\0')
     506          216 :             break;
     507              : 
     508         3550 :         if (!IS_HIGHBIT_SET(*iso))
     509              :         {
     510              :             /* ASCII case is easy, assume it's one-to-one conversion */
     511         2786 :             *utf++ = *iso++;
     512         2786 :             l = 1;
     513         2786 :             continue;
     514              :         }
     515              : 
     516          764 :         l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
     517          764 :         if (l < 0)
     518          288 :             break;
     519              : 
     520              :         /* collect coded char of length l */
     521          476 :         if (l == 1)
     522          132 :             b4 = *iso++;
     523          344 :         else if (l == 2)
     524              :         {
     525          284 :             b3 = *iso++;
     526          284 :             b4 = *iso++;
     527              :         }
     528           60 :         else if (l == 3)
     529              :         {
     530            0 :             b2 = *iso++;
     531            0 :             b3 = *iso++;
     532            0 :             b4 = *iso++;
     533              :         }
     534           60 :         else if (l == 4)
     535              :         {
     536           60 :             b1 = *iso++;
     537           60 :             b2 = *iso++;
     538           60 :             b3 = *iso++;
     539           60 :             b4 = *iso++;
     540              :         }
     541              :         else
     542              :         {
     543            0 :             elog(ERROR, "unsupported character length %d", l);
     544              :             iiso = 0;           /* keep compiler quiet */
     545              :         }
     546          476 :         iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
     547              : 
     548          476 :         if (map)
     549              :         {
     550          476 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     551              : 
     552          476 :             if (converted)
     553              :             {
     554          368 :                 utf = store_coded_char(utf, converted);
     555          368 :                 continue;
     556              :             }
     557              : 
     558              :             /* If there's a combined character map, try that */
     559          108 :             if (cmap)
     560              :             {
     561           24 :                 cp = bsearch(&iiso, cmap, cmapsize,
     562              :                              sizeof(pg_local_to_utf_combined), compare4);
     563              : 
     564           24 :                 if (cp)
     565              :                 {
     566           24 :                     utf = store_coded_char(utf, cp->utf1);
     567           24 :                     utf = store_coded_char(utf, cp->utf2);
     568           24 :                     continue;
     569              :                 }
     570              :             }
     571              :         }
     572              : 
     573              :         /* if there's a conversion function, try that */
     574           84 :         if (conv_func)
     575              :         {
     576           60 :             uint32      converted = (*conv_func) (iiso);
     577              : 
     578           60 :             if (converted)
     579              :             {
     580           36 :                 utf = store_coded_char(utf, converted);
     581           36 :                 continue;
     582              :             }
     583              :         }
     584              : 
     585              :         /* failed to translate this character */
     586           48 :         iso -= l;
     587           48 :         if (noError)
     588           24 :             break;
     589           24 :         report_untranslatable_char(encoding, PG_UTF8,
     590              :                                    (const char *) iso, len);
     591              :     }
     592              : 
     593              :     /* if we broke out of loop early, must be invalid input */
     594          884 :     if (len > 0 && !noError)
     595          248 :         report_invalid_encoding(encoding, (const char *) iso, len);
     596              : 
     597          636 :     *utf = '\0';
     598              : 
     599          636 :     return iso - start;
     600              : }
        

Generated by: LCOV version 2.0-1