LCOV - code coverage report
Current view: top level - src/backend/utils/mb/conversion_procs/utf8_and_gb18030 - utf8_and_gb18030.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 81.5 % 92 75
Test Date: 2026-02-28 08:14:42 Functions: 100.0 % 11 11
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  *    GB18030 <--> UTF8
       4              :  *
       5              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       6              :  * Portions Copyright (c) 1994, Regents of the University of California
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : 
      14              : #include "postgres.h"
      15              : #include "fmgr.h"
      16              : #include "mb/pg_wchar.h"
      17              : #include "../../Unicode/gb18030_to_utf8.map"
      18              : #include "../../Unicode/utf8_to_gb18030.map"
      19              : 
      20            9 : PG_MODULE_MAGIC_EXT(
      21              :                     .name = "utf8_and_gb18030",
      22              :                     .version = PG_VERSION
      23              : );
      24              : 
      25            9 : PG_FUNCTION_INFO_V1(gb18030_to_utf8);
      26            9 : PG_FUNCTION_INFO_V1(utf8_to_gb18030);
      27              : 
      28              : /*
      29              :  * Convert 4-byte GB18030 characters to and from a linear code space
      30              :  *
      31              :  * The first and third bytes can range from 0x81 to 0xfe (126 values),
      32              :  * while the second and fourth bytes can range from 0x30 to 0x39 (10 values).
      33              :  */
      34              : static inline uint32
      35           90 : gb_linear(uint32 gb)
      36              : {
      37           90 :     uint32      b0 = (gb & 0xff000000) >> 24;
      38           90 :     uint32      b1 = (gb & 0x00ff0000) >> 16;
      39           90 :     uint32      b2 = (gb & 0x0000ff00) >> 8;
      40           90 :     uint32      b3 = (gb & 0x000000ff);
      41              : 
      42           90 :     return b0 * 12600 + b1 * 1260 + b2 * 10 + b3 -
      43              :         (0x81 * 12600 + 0x30 * 1260 + 0x81 * 10 + 0x30);
      44              : }
      45              : 
      46              : static inline uint32
      47           36 : gb_unlinear(uint32 lin)
      48              : {
      49           36 :     uint32      r0 = 0x81 + lin / 12600;
      50           36 :     uint32      r1 = 0x30 + (lin / 1260) % 10;
      51           36 :     uint32      r2 = 0x81 + (lin / 10) % 126;
      52           36 :     uint32      r3 = 0x30 + lin % 10;
      53              : 
      54           36 :     return (r0 << 24) | (r1 << 16) | (r2 << 8) | r3;
      55              : }
      56              : 
      57              : /*
      58              :  * Convert word-formatted UTF8 to and from Unicode code points
      59              :  *
      60              :  * Probably this should be somewhere else ...
      61              :  */
      62              : static inline uint32
      63           27 : unicode_to_utf8word(uint32 c)
      64              : {
      65              :     uint32      word;
      66              : 
      67           27 :     if (c <= 0x7F)
      68              :     {
      69            0 :         word = c;
      70              :     }
      71           27 :     else if (c <= 0x7FF)
      72              :     {
      73            0 :         word = (0xC0 | ((c >> 6) & 0x1F)) << 8;
      74            0 :         word |= 0x80 | (c & 0x3F);
      75              :     }
      76           27 :     else if (c <= 0xFFFF)
      77              :     {
      78           27 :         word = (0xE0 | ((c >> 12) & 0x0F)) << 16;
      79           27 :         word |= (0x80 | ((c >> 6) & 0x3F)) << 8;
      80           27 :         word |= 0x80 | (c & 0x3F);
      81              :     }
      82              :     else
      83              :     {
      84            0 :         word = (0xF0 | ((c >> 18) & 0x07)) << 24;
      85            0 :         word |= (0x80 | ((c >> 12) & 0x3F)) << 16;
      86            0 :         word |= (0x80 | ((c >> 6) & 0x3F)) << 8;
      87            0 :         word |= 0x80 | (c & 0x3F);
      88              :     }
      89              : 
      90           27 :     return word;
      91              : }
      92              : 
      93              : static inline uint32
      94           36 : utf8word_to_unicode(uint32 c)
      95              : {
      96              :     uint32      ucs;
      97              : 
      98           36 :     if (c <= 0x7F)
      99              :     {
     100            0 :         ucs = c;
     101              :     }
     102           36 :     else if (c <= 0xFFFF)
     103              :     {
     104            0 :         ucs = ((c >> 8) & 0x1F) << 6;
     105            0 :         ucs |= c & 0x3F;
     106              :     }
     107           36 :     else if (c <= 0xFFFFFF)
     108              :     {
     109           36 :         ucs = ((c >> 16) & 0x0F) << 12;
     110           36 :         ucs |= ((c >> 8) & 0x3F) << 6;
     111           36 :         ucs |= c & 0x3F;
     112              :     }
     113              :     else
     114              :     {
     115            0 :         ucs = ((c >> 24) & 0x07) << 18;
     116            0 :         ucs |= ((c >> 16) & 0x3F) << 12;
     117            0 :         ucs |= ((c >> 8) & 0x3F) << 6;
     118            0 :         ucs |= c & 0x3F;
     119              :     }
     120              : 
     121           36 :     return ucs;
     122              : }
     123              : 
     124              : /*
     125              :  * Perform mapping of GB18030 ranges to UTF8
     126              :  *
     127              :  * General description, and the range we need to convert for U+10000 and up:
     128              :  * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/gb18030.html
     129              :  *
     130              :  * Ranges up to U+FFFF:
     131              :  * https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/ranges.txt
     132              :  *
     133              :  * All are ranges of 4-byte GB18030 codes.
     134              :  */
     135              : static uint32
     136           45 : conv_18030_to_utf8(uint32 code)
     137              : {
     138              : #define conv18030(minunicode, mincode, maxcode) \
     139              :     if (code >= mincode && code <= maxcode) \
     140              :         return unicode_to_utf8word(gb_linear(code) - gb_linear(mincode) + minunicode)
     141              : 
     142           45 :     conv18030(0x0452, 0x8130D330, 0x8136A531);
     143           45 :     conv18030(0x2643, 0x8137A839, 0x8138FD38);
     144           45 :     conv18030(0x361B, 0x8230A633, 0x8230F237);
     145           45 :     conv18030(0x3CE1, 0x8231D438, 0x8232AF32);
     146           45 :     conv18030(0x4160, 0x8232C937, 0x8232F837);
     147           45 :     conv18030(0x44D7, 0x8233A339, 0x8233C931);
     148           45 :     conv18030(0x478E, 0x8233E838, 0x82349638);
     149           45 :     conv18030(0x49B8, 0x8234A131, 0x8234E733);
     150           45 :     conv18030(0x9FA6, 0x82358F33, 0x8336C738);
     151           45 :     conv18030(0xE865, 0x8336D030, 0x84308534);
     152           45 :     conv18030(0xFA2A, 0x84309C38, 0x84318537);
     153           18 :     conv18030(0xFFE6, 0x8431A234, 0x8431A439);
     154           18 :     conv18030(0x10000, 0x90308130, 0xE3329A35);
     155              :     /* No mapping exists */
     156           18 :     return 0;
     157              : }
     158              : 
     159              : /*
     160              :  * Perform mapping of UTF8 ranges to GB18030
     161              :  */
     162              : static uint32
     163           36 : conv_utf8_to_18030(uint32 code)
     164              : {
     165           36 :     uint32      ucs = utf8word_to_unicode(code);
     166              : 
     167              : #define convutf8(minunicode, maxunicode, mincode) \
     168              :     if (ucs >= minunicode && ucs <= maxunicode) \
     169              :         return gb_unlinear(ucs - minunicode + gb_linear(mincode))
     170              : 
     171           36 :     convutf8(0x0452, 0x200F, 0x8130D330);
     172           36 :     convutf8(0x2643, 0x2E80, 0x8137A839);
     173           36 :     convutf8(0x361B, 0x3917, 0x8230A633);
     174           36 :     convutf8(0x3CE1, 0x4055, 0x8231D438);
     175           36 :     convutf8(0x4160, 0x4336, 0x8232C937);
     176           36 :     convutf8(0x44D7, 0x464B, 0x8233A339);
     177           36 :     convutf8(0x478E, 0x4946, 0x8233E838);
     178           36 :     convutf8(0x49B8, 0x4C76, 0x8234A131);
     179           36 :     convutf8(0x9FA6, 0xD7FF, 0x82358F33);
     180            9 :     convutf8(0xE865, 0xF92B, 0x8336D030);
     181            9 :     convutf8(0xFA2A, 0xFE2F, 0x84309C38);
     182            0 :     convutf8(0xFFE6, 0xFFFF, 0x8431A234);
     183            0 :     convutf8(0x10000, 0x10FFFF, 0x90308130);
     184              :     /* No mapping exists */
     185            0 :     return 0;
     186              : }
     187              : 
     188              : /* ----------
     189              :  * conv_proc(
     190              :  *      INTEGER,    -- source encoding id
     191              :  *      INTEGER,    -- destination encoding id
     192              :  *      CSTRING,    -- source string (null terminated C string)
     193              :  *      CSTRING,    -- destination string (null terminated C string)
     194              :  *      INTEGER,    -- source string length
     195              :  *      BOOL        -- if true, don't throw an error if conversion fails
     196              :  * ) returns INTEGER;
     197              :  *
     198              :  * Returns the number of bytes successfully converted.
     199              :  * ----------
     200              :  */
     201              : Datum
     202          153 : gb18030_to_utf8(PG_FUNCTION_ARGS)
     203              : {
     204          153 :     unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
     205          153 :     unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
     206          153 :     int         len = PG_GETARG_INT32(4);
     207          153 :     bool        noError = PG_GETARG_BOOL(5);
     208              :     int         converted;
     209              : 
     210          153 :     CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8);
     211              : 
     212          153 :     converted = LocalToUtf(src, len, dest,
     213              :                            &gb18030_to_unicode_tree,
     214              :                            NULL, 0,
     215              :                            conv_18030_to_utf8,
     216              :                            PG_GB18030,
     217              :                            noError);
     218              : 
     219           99 :     PG_RETURN_INT32(converted);
     220              : }
     221              : 
     222              : Datum
     223          174 : utf8_to_gb18030(PG_FUNCTION_ARGS)
     224              : {
     225          174 :     unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
     226          174 :     unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
     227          174 :     int         len = PG_GETARG_INT32(4);
     228          174 :     bool        noError = PG_GETARG_BOOL(5);
     229              :     int         converted;
     230              : 
     231          174 :     CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030);
     232              : 
     233          174 :     converted = UtfToLocal(src, len, dest,
     234              :                            &gb18030_from_unicode_tree,
     235              :                            NULL, 0,
     236              :                            conv_utf8_to_18030,
     237              :                            PG_GB18030,
     238              :                            noError);
     239              : 
     240          129 :     PG_RETURN_INT32(converted);
     241              : }
        

Generated by: LCOV version 2.0-1