LCOV - code coverage report
Current view: top level - src/backend/utils/mb/conversion_procs/utf8_and_gb18030 - utf8_and_gb18030.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 75 92 81.5 %
Date: 2025-04-01 16:15:31 Functions: 11 11 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  *    GB18030 <--> UTF8
       4             :  *
       5             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       6             :  * Portions Copyright (c) 1994, Regents of the University of California
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : 
      14             : #include "postgres.h"
      15             : #include "fmgr.h"
      16             : #include "mb/pg_wchar.h"
      17             : #include "../../Unicode/gb18030_to_utf8.map"
      18             : #include "../../Unicode/utf8_to_gb18030.map"
      19             : 
      20          12 : PG_MODULE_MAGIC_EXT(  /* module magic declaration carrying name and version fields */
      21             :                     .name = "utf8_and_gb18030",  /* same name as this source file */
      22             :                     .version = PG_VERSION  /* NOTE(review): presumably the server version string -- confirm */
      23             : );
      24             : 
      25          12 : PG_FUNCTION_INFO_V1(gb18030_to_utf8);  /* V1 function-info record for gb18030_to_utf8() defined below */
      26          12 : PG_FUNCTION_INFO_V1(utf8_to_gb18030);  /* V1 function-info record for utf8_to_gb18030() defined below */
      27             : 
      28             : /*
      29             :  * Convert 4-byte GB18030 characters to and from a linear code space
      30             :  *
      31             :  * The first and third bytes can range from 0x81 to 0xfe (126 values),
      32             :  * while the second and fourth bytes can range from 0x30 to 0x39 (10 values).
      33             :  */
      34             : static inline uint32  /* Map a 4-byte GB18030 code, packed first-byte-highest in gb, to its offset from code 0x81308130. */
      35         180 : gb_linear(uint32 gb)  /* no validation of the byte ranges is done here */
      36             : {
      37         180 :     uint32      b0 = (gb & 0xff000000) >> 24;  /* first byte */
      38         180 :     uint32      b1 = (gb & 0x00ff0000) >> 16;  /* second byte */
      39         180 :     uint32      b2 = (gb & 0x0000ff00) >> 8;  /* third byte */
      40         180 :     uint32      b3 = (gb & 0x000000ff);  /* fourth byte */
      41             : 
      42         180 :     return b0 * 12600 + b1 * 1260 + b2 * 10 + b3 -  /* place values: 12600 = 10*126*10, 1260 = 126*10, 10 */
      43             :         (0x81 * 12600 + 0x30 * 1260 + 0x81 * 10 + 0x30);  /* same weighted sum for bytes 0x81 0x30 0x81 0x30, so that code maps to 0 */
      44             : }
      45             : 
      46             : static inline uint32  /* Inverse of gb_linear(): rebuild the packed 4-byte code from a linear offset. */
      47          72 : gb_unlinear(uint32 lin)
      48             : {
      49          72 :     uint32      r0 = 0x81 + lin / 12600;  /* first byte; not reduced modulo 126 and not range-checked here */
      50          72 :     uint32      r1 = 0x30 + (lin / 1260) % 10;  /* second byte: one of 10 values starting at 0x30 */
      51          72 :     uint32      r2 = 0x81 + (lin / 10) % 126;  /* third byte: one of 126 values starting at 0x81 */
      52          72 :     uint32      r3 = 0x30 + lin % 10;  /* fourth byte: one of 10 values starting at 0x30 */
      53             : 
      54          72 :     return (r0 << 24) | (r1 << 16) | (r2 << 8) | r3;  /* pack with the first byte in the top 8 bits */
      55             : }
      56             : 
      57             : /*
      58             :  * Convert word-formatted UTF8 to and from Unicode code points
      59             :  *
      60             :  * Probably this should be somewhere else ...
      61             :  */
      62             : static inline uint32  /* Encode code point c as UTF-8 and return the 1-4 bytes packed into one word, last byte in the low 8 bits. */
      63          54 : unicode_to_utf8word(uint32 c)  /* c is not range-checked; the 4-byte branch keeps only its low 21 bits */
      64             : {
      65             :     uint32      word;
      66             : 
      67          54 :     if (c <= 0x7F)  /* one byte: the code point itself */
      68             :     {
      69           0 :         word = c;
      70             :     }
      71          54 :     else if (c <= 0x7FF)  /* two bytes: 110xxxxx 10xxxxxx */
      72             :     {
      73           0 :         word = (0xC0 | ((c >> 6) & 0x1F)) << 8;  /* lead byte holds bits 10..6 */
      74           0 :         word |= 0x80 | (c & 0x3F);  /* continuation byte holds bits 5..0 */
      75             :     }
      76          54 :     else if (c <= 0xFFFF)  /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
      77             :     {
      78          54 :         word = (0xE0 | ((c >> 12) & 0x0F)) << 16;  /* lead byte holds bits 15..12 */
      79          54 :         word |= (0x80 | ((c >> 6) & 0x3F)) << 8;  /* bits 11..6 */
      80          54 :         word |= 0x80 | (c & 0x3F);  /* bits 5..0 */
      81             :     }
      82             :     else  /* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
      83             :     {
      84           0 :         word = (0xF0 | ((c >> 18) & 0x07)) << 24;  /* lead byte holds bits 20..18 */
      85           0 :         word |= (0x80 | ((c >> 12) & 0x3F)) << 16;  /* bits 17..12 */
      86           0 :         word |= (0x80 | ((c >> 6) & 0x3F)) << 8;  /* bits 11..6 */
      87           0 :         word |= 0x80 | (c & 0x3F);  /* bits 5..0 */
      88             :     }
      89             : 
      90          54 :     return word;
      91             : }
      92             : 
      93             : static inline uint32  /* Decode a packed UTF-8 word (layout produced by unicode_to_utf8word) back to a code point. */
      94          72 : utf8word_to_unicode(uint32 c)  /* length is inferred from the magnitude of c; marker bits are masked off, not validated */
      95             : {
      96             :     uint32      ucs;
      97             : 
      98          72 :     if (c <= 0x7F)  /* one-byte word: the value is the code point */
      99             :     {
     100           0 :         ucs = c;
     101             :     }
     102          72 :     else if (c <= 0xFFFF)  /* word fits in two bytes */
     103             :     {
     104           0 :         ucs = ((c >> 8) & 0x1F) << 6;  /* 5 payload bits from the lead byte */
     105           0 :         ucs |= c & 0x3F;  /* 6 payload bits from the trailing byte */
     106             :     }
     107          72 :     else if (c <= 0xFFFFFF)  /* word fits in three bytes */
     108             :     {
     109          72 :         ucs = ((c >> 16) & 0x0F) << 12;  /* 4 payload bits from the lead byte */
     110          72 :         ucs |= ((c >> 8) & 0x3F) << 6;  /* 6 payload bits from the middle byte */
     111          72 :         ucs |= c & 0x3F;  /* 6 payload bits from the last byte */
     112             :     }
     113             :     else  /* word uses all four bytes */
     114             :     {
     115           0 :         ucs = ((c >> 24) & 0x07) << 18;  /* 3 payload bits from the lead byte */
     116           0 :         ucs |= ((c >> 16) & 0x3F) << 12;  /* 6 payload bits from the second byte */
     117           0 :         ucs |= ((c >> 8) & 0x3F) << 6;  /* 6 payload bits from the third byte */
     118           0 :         ucs |= c & 0x3F;  /* 6 payload bits from the last byte */
     119             :     }
     120             : 
     121          72 :     return ucs;
     122             : }
     123             : 
     124             : /*
     125             :  * Perform mapping of GB18030 ranges to UTF8
     126             :  *
     127             :  * The ranges we need to convert are specified in gb-18030-2000.xml.
     128             :  * All are ranges of 4-byte GB18030 codes.
     129             :  */
     130             : static uint32  /* Map a 4-byte GB18030 code in one of the listed ranges to a packed UTF-8 word; returns 0 if no range matches. */
     131          90 : conv_18030_to_utf8(uint32 code)  /* passed as a callback to LocalToUtf() in gb18030_to_utf8() */
     132             : {  /* conv18030 expands to a bare "if (...) return ..." that reads the parameter `code`; it is never #undef'd in this file */
     133             : #define conv18030(minunicode, mincode, maxcode) \
     134             :     if (code >= mincode && code <= maxcode) \
     135             :         return unicode_to_utf8word(gb_linear(code) - gb_linear(mincode) + minunicode)  /* linear offset within the range, added to the range's first code point */
     136             : 
     137          90 :     conv18030(0x0452, 0x8130D330, 0x8136A531);  /* arguments: first code point of the range, first GB18030 code, last GB18030 code */
     138          90 :     conv18030(0x2643, 0x8137A839, 0x8138FD38);
     139          90 :     conv18030(0x361B, 0x8230A633, 0x8230F237);
     140          90 :     conv18030(0x3CE1, 0x8231D438, 0x8232AF32);
     141          90 :     conv18030(0x4160, 0x8232C937, 0x8232F837);
     142          90 :     conv18030(0x44D7, 0x8233A339, 0x8233C931);
     143          90 :     conv18030(0x478E, 0x8233E838, 0x82349638);
     144          90 :     conv18030(0x49B8, 0x8234A131, 0x8234E733);
     145          90 :     conv18030(0x9FA6, 0x82358F33, 0x8336C738);
     146          90 :     conv18030(0xE865, 0x8336D030, 0x84308534);
     147          90 :     conv18030(0xFA2A, 0x84309C38, 0x84318537);
     148          36 :     conv18030(0xFFE6, 0x8431A234, 0x8431A439);
     149          36 :     conv18030(0x10000, 0x90308130, 0xE3329A35);  /* this range starts at code point 0x10000, so results use the 4-byte UTF-8 branch */
     150             :     /* No mapping exists */
     151          36 :     return 0;  /* 0 is the "not handled" result of this function */
     152             : }
     153             : 
     154             : /*
     155             :  * Perform mapping of UTF8 ranges to GB18030
     156             :  */
     157             : static uint32  /* Map a packed UTF-8 word whose code point lies in one of the listed ranges to a 4-byte GB18030 code; returns 0 if no range matches. */
     158          72 : conv_utf8_to_18030(uint32 code)  /* passed as a callback to UtfToLocal() in utf8_to_gb18030() */
     159             : {
     160          72 :     uint32      ucs = utf8word_to_unicode(code);  /* decode the packed UTF-8 word to a code point first */
     161             : 
     162             : #define convutf8(minunicode, maxunicode, mincode) \
     163             :     if (ucs >= minunicode && ucs <= maxunicode) \
     164             :         return gb_unlinear(ucs - minunicode + gb_linear(mincode))  /* offset within the Unicode range, added to the linear index of the range's first GB18030 code */
     165             : 
     166          72 :     convutf8(0x0452, 0x200F, 0x8130D330);  /* arguments: first code point, last code point, GB18030 code of the first code point */
     167          72 :     convutf8(0x2643, 0x2E80, 0x8137A839);
     168          72 :     convutf8(0x361B, 0x3917, 0x8230A633);
     169          72 :     convutf8(0x3CE1, 0x4055, 0x8231D438);
     170          72 :     convutf8(0x4160, 0x4336, 0x8232C937);
     171          72 :     convutf8(0x44D7, 0x464B, 0x8233A339);
     172          72 :     convutf8(0x478E, 0x4946, 0x8233E838);
     173          72 :     convutf8(0x49B8, 0x4C76, 0x8234A131);
     174          72 :     convutf8(0x9FA6, 0xD7FF, 0x82358F33);
     175          18 :     convutf8(0xE865, 0xF92B, 0x8336D030);
     176          18 :     convutf8(0xFA2A, 0xFE2F, 0x84309C38);
     177           0 :     convutf8(0xFFE6, 0xFFFF, 0x8431A234);
     178           0 :     convutf8(0x10000, 0x10FFFF, 0x90308130);  /* code points 0x10000 through 0x10FFFF */
     179             :     /* No mapping exists */
     180           0 :     return 0;  /* 0 is the "not handled" result of this function */
     181             : }
     182             : 
     183             : /* ----------
     184             :  * conv_proc(
     185             :  *      INTEGER,    -- source encoding id
     186             :  *      INTEGER,    -- destination encoding id
     187             :  *      CSTRING,    -- source string (null terminated C string)
     188             :  *      CSTRING,    -- destination string (null terminated C string)
     189             :  *      INTEGER,    -- source string length
     190             :  *      BOOL        -- if true, don't throw an error if conversion fails
     191             :  * ) returns INTEGER;
     192             :  *
     193             :  * Returns the number of bytes successfully converted.
     194             :  * ----------
     195             :  */
     196             : Datum  /* Conversion procedure GB18030 -> UTF8; returns the number of bytes successfully converted. */
     197         240 : gb18030_to_utf8(PG_FUNCTION_ARGS)
     198             : {
     199         240 :     unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);  /* argument 2: source string */
     200         240 :     unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);  /* argument 3: destination string */
     201         240 :     int         len = PG_GETARG_INT32(4);  /* argument 4: source string length */
     202         240 :     bool        noError = PG_GETARG_BOOL(5);  /* argument 5: if true, don't throw an error if conversion fails */
     203             :     int         converted;
     204             : 
     205         240 :     CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8);  /* NOTE(review): presumably validates the encoding ids in arguments 0 and 1 -- confirm */
     206             : 
     207         240 :     converted = LocalToUtf(src, len, dest,  /* LocalToUtf is defined elsewhere; argument meanings below are assumptions to confirm */
     208             :                            &gb18030_to_unicode_tree,  /* presumably the lookup tree from the included gb18030_to_utf8.map */
     209             :                            NULL, 0,  /* TODO confirm: looks like an unused optional table and its size */
     210             :                            conv_18030_to_utf8,  /* range-based conversion function defined above */
     211             :                            PG_GB18030,
     212             :                            noError);
     213             : 
     214         150 :     PG_RETURN_INT32(converted);
     215             : }
     216             : 
     217             : Datum  /* Conversion procedure UTF8 -> GB18030; returns the number of bytes successfully converted. */
     218         330 : utf8_to_gb18030(PG_FUNCTION_ARGS)
     219             : {
     220         330 :     unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);  /* argument 2: source string */
     221         330 :     unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);  /* argument 3: destination string */
     222         330 :     int         len = PG_GETARG_INT32(4);  /* argument 4: source string length */
     223         330 :     bool        noError = PG_GETARG_BOOL(5);  /* argument 5: if true, don't throw an error if conversion fails */
     224             :     int         converted;
     225             : 
     226         330 :     CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030);  /* NOTE(review): presumably validates the encoding ids in arguments 0 and 1 -- confirm */
     227             : 
     228         330 :     converted = UtfToLocal(src, len, dest,  /* UtfToLocal is defined elsewhere; argument meanings below are assumptions to confirm */
     229             :                            &gb18030_from_unicode_tree,  /* presumably the lookup tree from the included utf8_to_gb18030.map */
     230             :                            NULL, 0,  /* TODO confirm: looks like an unused optional table and its size */
     231             :                            conv_utf8_to_18030,  /* range-based conversion function defined above */
     232             :                            PG_GB18030,
     233             :                            noError);
     234             : 
     235         240 :     PG_RETURN_INT32(converted);
     236             : }

Generated by: LCOV version 1.14