Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * GB18030 <--> UTF8
4 : *
5 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 :
14 : #include "postgres.h"
15 : #include "fmgr.h"
16 : #include "mb/pg_wchar.h"
17 : #include "../../Unicode/gb18030_to_utf8.map"
18 : #include "../../Unicode/utf8_to_gb18030.map"
19 :
20 12 : PG_MODULE_MAGIC;
21 :
22 12 : PG_FUNCTION_INFO_V1(gb18030_to_utf8);
23 12 : PG_FUNCTION_INFO_V1(utf8_to_gb18030);
24 :
/*
 * Convert 4-byte GB18030 characters to and from a linear code space
 *
 * The first and third bytes can range from 0x81 to 0xfe (126 values),
 * while the second and fourth bytes can range from 0x30 to 0x39 (10 values).
 * A 4-byte code is therefore a mixed-radix number with radices
 * (-, 10, 126, 10), and the code 0x81308130 is position zero.
 */

/*
 * gb_linear
 *		Map a 4-byte GB18030 code to its position in the linear code space.
 *
 * gb holds the four bytes of the character, first byte in the most
 * significant position.  No range checking is done on the individual
 * bytes; the arithmetic is plain unsigned 32-bit arithmetic.
 */
static inline uint32
gb_linear(uint32 gb)
{
	uint32		first = (gb >> 24) & 0xff;
	uint32		second = (gb >> 16) & 0xff;
	uint32		third = (gb >> 8) & 0xff;
	uint32		fourth = gb & 0xff;

	/* positional value of the lowest code, 0x81 0x30 0x81 0x30 */
	uint32		origin = ((0x81 * 10 + 0x30) * 126 + 0x81) * 10 + 0x30;

	/* Horner form of first*12600 + second*1260 + third*10 + fourth */
	return ((first * 10 + second) * 126 + third) * 10 + fourth - origin;
}
42 :
/*
 * gb_unlinear
 *		Map a position in the linear 4-byte GB18030 code space back to the
 *		packed 4-byte code (first byte in the most significant position).
 *
 * The position is decomposed from the least significant digit upwards:
 * fourth byte (radix 10, base 0x30), third byte (radix 126, base 0x81),
 * second byte (radix 10, base 0x30); whatever remains is added to 0x81 to
 * form the first byte.
 */
static inline uint32
gb_unlinear(uint32 lin)
{
	uint32		rest = lin;
	uint32		gb;

	gb = 0x30 + rest % 10;
	rest /= 10;

	gb |= (0x81 + rest % 126) << 8;
	rest /= 126;

	gb |= (0x30 + rest % 10) << 16;
	rest /= 10;

	gb |= (0x81 + rest) << 24;

	return gb;
}
53 :
/*
 * Convert word-formatted UTF8 to and from Unicode code points
 *
 * Probably this should be somewhere else ...
 */

/*
 * unicode_to_utf8word
 *		Encode a Unicode code point as UTF8, with the resulting 1 to 4 bytes
 *		packed into a uint32 (last byte in the least significant position).
 *
 * The encoded length is chosen from the magnitude of c: up to 0x7F one
 * byte, up to 0x7FF two bytes, up to 0xFFFF three bytes, otherwise four.
 */
static inline uint32
unicode_to_utf8word(uint32 c)
{
	/* continuation bytes, counted from the end of the sequence */
	uint32		cont1 = 0x80 | (c & 0x3F);
	uint32		cont2 = 0x80 | ((c >> 6) & 0x3F);
	uint32		cont3 = 0x80 | ((c >> 12) & 0x3F);

	if (c <= 0x7F)
		return c;

	if (c <= 0x7FF)
		return ((0xC0 | ((c >> 6) & 0x1F)) << 8) | cont1;

	if (c <= 0xFFFF)
		return ((0xE0 | ((c >> 12) & 0x0F)) << 16) | (cont2 << 8) | cont1;

	return ((0xF0 | ((c >> 18) & 0x07)) << 24) |
		(cont3 << 16) | (cont2 << 8) | cont1;
}
89 :
/*
 * utf8word_to_unicode
 *		Decode a UTF8 sequence packed into a uint32 (last byte in the least
 *		significant position) into a Unicode code point.
 *
 * The sequence length is inferred from the magnitude of the word: values
 * up to 0x7F are one byte, up to 0xFFFF two bytes, up to 0xFFFFFF three
 * bytes, and anything larger four bytes.  Only the payload bits of each
 * byte are used; the marker bits are not validated.
 */
static inline uint32
utf8word_to_unicode(uint32 c)
{
	uint32		low6 = c & 0x3F;

	if (c <= 0x7F)
		return c;

	if (c <= 0xFFFF)
		return (((c >> 8) & 0x1F) << 6) | low6;

	if (c <= 0xFFFFFF)
		return (((c >> 16) & 0x0F) << 12) |
			(((c >> 8) & 0x3F) << 6) |
			low6;

	return (((c >> 24) & 0x07) << 18) |
		(((c >> 16) & 0x3F) << 12) |
		(((c >> 8) & 0x3F) << 6) |
		low6;
}
120 :
/*
 * Perform mapping of GB18030 ranges to UTF8
 *
 * The ranges we need to convert are specified in gb-18030-2000.xml.
 * All are ranges of 4-byte GB18030 codes.
 *
 * Within each range, consecutive positions of the linear GB18030 code space
 * (see gb_linear) correspond to consecutive Unicode code points starting at
 * the range's minunicode.
 *
 * code is a 4-byte GB18030 character packed into a uint32.  The result is
 * the UTF8 encoding of the mapped code point, packed into a uint32 by
 * unicode_to_utf8word, or 0 if code is not inside any of the ranges.
 *
 * The ranges are kept in a table rather than in a function-local macro, so
 * that no macro with a hidden "return" leaks into the rest of the file.
 */
static uint32
conv_18030_to_utf8(uint32 code)
{
	static const struct
	{
		uint32		minunicode; /* code point mapped to mincode */
		uint32		mincode;	/* first GB18030 code of the range */
		uint32		maxcode;	/* last GB18030 code of the range */
	}			ranges[] =
	{
		{0x0452, 0x8130D330, 0x8136A531},
		{0x2643, 0x8137A839, 0x8138FD38},
		{0x361B, 0x8230A633, 0x8230F237},
		{0x3CE1, 0x8231D438, 0x8232AF32},
		{0x4160, 0x8232C937, 0x8232F837},
		{0x44D7, 0x8233A339, 0x8233C931},
		{0x478E, 0x8233E838, 0x82349638},
		{0x49B8, 0x8234A131, 0x8234E733},
		{0x9FA6, 0x82358F33, 0x8336C738},
		{0xE865, 0x8336D030, 0x84308534},
		{0xFA2A, 0x84309C38, 0x84318537},
		{0xFFE6, 0x8431A234, 0x8431A439},
		{0x10000, 0x90308130, 0xE3329A35}
	};
	unsigned int i;

	/* Entries are tested in table order; the first containing range wins */
	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
	{
		if (code >= ranges[i].mincode && code <= ranges[i].maxcode)
			return unicode_to_utf8word(gb_linear(code) -
									   gb_linear(ranges[i].mincode) +
									   ranges[i].minunicode);
	}

	/* No mapping exists */
	return 0;
}
150 :
/*
 * Perform mapping of UTF8 ranges to GB18030
 *
 * code is a UTF8 sequence packed into a uint32; it is first decoded to a
 * Unicode code point with utf8word_to_unicode.  If that code point lies in
 * one of the ranges below, its offset from the start of the range is added
 * to the linear position of the range's first GB18030 code (see gb_linear)
 * and the sum is turned back into a 4-byte GB18030 code by gb_unlinear.
 *
 * Returns the packed 4-byte GB18030 code, or 0 if the code point is not
 * inside any of the ranges.
 *
 * The ranges are kept in a table rather than in a function-local macro, so
 * that no macro with a hidden "return" leaks into the rest of the file.
 */
static uint32
conv_utf8_to_18030(uint32 code)
{
	static const struct
	{
		uint32		minunicode; /* first code point of the range */
		uint32		maxunicode; /* last code point of the range */
		uint32		mincode;	/* GB18030 code mapped to minunicode */
	}			ranges[] =
	{
		{0x0452, 0x200F, 0x8130D330},
		{0x2643, 0x2E80, 0x8137A839},
		{0x361B, 0x3917, 0x8230A633},
		{0x3CE1, 0x4055, 0x8231D438},
		{0x4160, 0x4336, 0x8232C937},
		{0x44D7, 0x464B, 0x8233A339},
		{0x478E, 0x4946, 0x8233E838},
		{0x49B8, 0x4C76, 0x8234A131},
		{0x9FA6, 0xD7FF, 0x82358F33},
		{0xE865, 0xF92B, 0x8336D030},
		{0xFA2A, 0xFE2F, 0x84309C38},
		{0xFFE6, 0xFFFF, 0x8431A234},
		{0x10000, 0x10FFFF, 0x90308130}
	};
	uint32		ucs = utf8word_to_unicode(code);
	unsigned int i;

	/* Entries are tested in table order; the first containing range wins */
	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
	{
		if (ucs >= ranges[i].minunicode && ucs <= ranges[i].maxunicode)
			return gb_unlinear(ucs - ranges[i].minunicode +
							   gb_linear(ranges[i].mincode));
	}

	/* No mapping exists */
	return 0;
}
179 :
180 : /* ----------
181 : * conv_proc(
182 : * INTEGER, -- source encoding id
183 : * INTEGER, -- destination encoding id
184 : * CSTRING, -- source string (null terminated C string)
185 : * CSTRING, -- destination string (null terminated C string)
186 : * INTEGER, -- source string length
187 : * BOOL -- if true, don't throw an error if conversion fails
188 : * ) returns INTEGER;
189 : *
190 : * Returns the number of bytes successfully converted.
191 : * ----------
192 : */
193 : Datum
194 240 : gb18030_to_utf8(PG_FUNCTION_ARGS)
195 : {
196 240 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
197 240 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
198 240 : int len = PG_GETARG_INT32(4);
199 240 : bool noError = PG_GETARG_BOOL(5);
200 : int converted;
201 :
202 240 : CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8);
203 :
204 240 : converted = LocalToUtf(src, len, dest,
205 : &gb18030_to_unicode_tree,
206 : NULL, 0,
207 : conv_18030_to_utf8,
208 : PG_GB18030,
209 : noError);
210 :
211 150 : PG_RETURN_INT32(converted);
212 : }
213 :
214 : Datum
215 330 : utf8_to_gb18030(PG_FUNCTION_ARGS)
216 : {
217 330 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
218 330 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
219 330 : int len = PG_GETARG_INT32(4);
220 330 : bool noError = PG_GETARG_BOOL(5);
221 : int converted;
222 :
223 330 : CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030);
224 :
225 330 : converted = UtfToLocal(src, len, dest,
226 : &gb18030_from_unicode_tree,
227 : NULL, 0,
228 : conv_utf8_to_18030,
229 : PG_GB18030,
230 : noError);
231 :
232 240 : PG_RETURN_INT32(converted);
233 : }
|