Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * Utility functions for conversion procs.
4 : *
5 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/utils/mb/conv.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "postgres.h"
14 : #include "mb/pg_wchar.h"
15 :
16 :
17 : /*
18 : * local2local: a generic single byte charset encoding
19 : * conversion between two ASCII-superset encodings.
20 : *
21 : * l points to the source string of length len
22 : * p is the output area (must be large enough!)
23 : * src_encoding is the PG identifier for the source encoding
24 : * dest_encoding is the PG identifier for the target encoding
25 : * tab holds conversion entries for the source charset
26 : * starting from 128 (0x80). each entry in the table holds the corresponding
27 : * code point for the target charset, or 0 if there is no equivalent code.
28 : *
29 : * Returns the number of input bytes consumed. If noError is true, this can
30 : * be less than 'len'.
31 : */
32 : int
33 152 : local2local(const unsigned char *l,
34 : unsigned char *p,
35 : int len,
36 : int src_encoding,
37 : int dest_encoding,
38 : const unsigned char *tab,
39 : bool noError)
40 : {
41 152 : const unsigned char *start = l;
42 : unsigned char c1,
43 : c2;
44 :
45 488 : while (len > 0)
46 : {
47 408 : c1 = *l;
48 408 : if (c1 == 0)
49 : {
50 72 : if (noError)
51 36 : break;
52 36 : report_invalid_encoding(src_encoding, (const char *) l, len);
53 : }
54 336 : if (!IS_HIGHBIT_SET(c1))
55 204 : *p++ = c1;
56 : else
57 : {
58 132 : c2 = tab[c1 - HIGHBIT];
59 132 : if (c2)
60 132 : *p++ = c2;
61 : else
62 : {
63 0 : if (noError)
64 0 : break;
65 0 : report_untranslatable_char(src_encoding, dest_encoding,
66 : (const char *) l, len);
67 : }
68 : }
69 336 : l++;
70 336 : len--;
71 : }
72 116 : *p = '\0';
73 :
74 116 : return l - start;
75 : }
76 :
77 : /*
78 : * comparison routine for bsearch()
79 : * this routine is intended for combined UTF8 -> local code
80 : */
81 : static int
82 312 : compare3(const void *p1, const void *p2)
83 : {
84 : uint32 s1,
85 : s2,
86 : d1,
87 : d2;
88 :
89 312 : s1 = *(const uint32 *) p1;
90 312 : s2 = *((const uint32 *) p1 + 1);
91 312 : d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
92 312 : d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
93 312 : return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
94 : }
95 :
96 : /*
97 : * comparison routine for bsearch()
98 : * this routine is intended for local code -> combined UTF8
99 : */
100 : static int
101 108 : compare4(const void *p1, const void *p2)
102 : {
103 : uint32 v1,
104 : v2;
105 :
106 108 : v1 = *(const uint32 *) p1;
107 108 : v2 = ((const pg_local_to_utf_combined *) p2)->code;
108 108 : return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
109 : }
110 :
111 : /*
112 : * store 32bit character representation into multibyte stream
113 : */
114 : static inline unsigned char *
115 820 : store_coded_char(unsigned char *dest, uint32 code)
116 : {
117 820 : if (code & 0xff000000)
118 84 : *dest++ = code >> 24;
119 820 : if (code & 0x00ff0000)
120 404 : *dest++ = code >> 16;
121 820 : if (code & 0x0000ff00)
122 736 : *dest++ = code >> 8;
123 820 : if (code & 0x000000ff)
124 820 : *dest++ = code;
125 820 : return dest;
126 : }
127 :
128 : /*
129 : * Convert a character using a conversion radix tree.
130 : *
131 : * 'l' is the length of the input character in bytes, and b1-b4 are
132 : * the input character's bytes.
133 : */
134 : static inline uint32
135 1384 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
136 : int l,
137 : unsigned char b1,
138 : unsigned char b2,
139 : unsigned char b3,
140 : unsigned char b4)
141 : {
142 1384 : if (l == 4)
143 : {
144 : /* 4-byte code */
145 :
146 : /* check code validity */
147 60 : if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
148 60 : b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
149 60 : b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
150 60 : b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
151 0 : return 0;
152 :
153 : /* perform lookup */
154 60 : if (rt->chars32)
155 : {
156 60 : uint32 idx = rt->b4root;
157 :
158 60 : idx = rt->chars32[b1 + idx - rt->b4_1_lower];
159 60 : idx = rt->chars32[b2 + idx - rt->b4_2_lower];
160 60 : idx = rt->chars32[b3 + idx - rt->b4_3_lower];
161 60 : return rt->chars32[b4 + idx - rt->b4_4_lower];
162 : }
163 : else
164 : {
165 0 : uint16 idx = rt->b4root;
166 :
167 0 : idx = rt->chars16[b1 + idx - rt->b4_1_lower];
168 0 : idx = rt->chars16[b2 + idx - rt->b4_2_lower];
169 0 : idx = rt->chars16[b3 + idx - rt->b4_3_lower];
170 0 : return rt->chars16[b4 + idx - rt->b4_4_lower];
171 : }
172 : }
173 1324 : else if (l == 3)
174 : {
175 : /* 3-byte code */
176 :
177 : /* check code validity */
178 632 : if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
179 200 : b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
180 200 : b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
181 432 : return 0;
182 :
183 : /* perform lookup */
184 200 : if (rt->chars32)
185 : {
186 200 : uint32 idx = rt->b3root;
187 :
188 200 : idx = rt->chars32[b2 + idx - rt->b3_1_lower];
189 200 : idx = rt->chars32[b3 + idx - rt->b3_2_lower];
190 200 : return rt->chars32[b4 + idx - rt->b3_3_lower];
191 : }
192 : else
193 : {
194 0 : uint16 idx = rt->b3root;
195 :
196 0 : idx = rt->chars16[b2 + idx - rt->b3_1_lower];
197 0 : idx = rt->chars16[b3 + idx - rt->b3_2_lower];
198 0 : return rt->chars16[b4 + idx - rt->b3_3_lower];
199 : }
200 : }
201 692 : else if (l == 2)
202 : {
203 : /* 2-byte code */
204 :
205 : /* check code validity - first byte */
206 560 : if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
207 512 : b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
208 48 : return 0;
209 :
210 : /* perform lookup */
211 512 : if (rt->chars32)
212 : {
213 404 : uint32 idx = rt->b2root;
214 :
215 404 : idx = rt->chars32[b3 + idx - rt->b2_1_lower];
216 404 : return rt->chars32[b4 + idx - rt->b2_2_lower];
217 : }
218 : else
219 : {
220 108 : uint16 idx = rt->b2root;
221 :
222 108 : idx = rt->chars16[b3 + idx - rt->b2_1_lower];
223 108 : return rt->chars16[b4 + idx - rt->b2_2_lower];
224 : }
225 : }
226 132 : else if (l == 1)
227 : {
228 : /* 1-byte code */
229 :
230 : /* check code validity - first byte */
231 132 : if (b4 < rt->b1_lower || b4 > rt->b1_upper)
232 0 : return 0;
233 :
234 : /* perform lookup */
235 132 : if (rt->chars32)
236 132 : return rt->chars32[b4 + rt->b1root - rt->b1_lower];
237 : else
238 0 : return rt->chars16[b4 + rt->b1root - rt->b1_lower];
239 : }
240 0 : return 0; /* shouldn't happen */
241 : }
242 :
243 : /*
244 : * UTF8 ---> local code
245 : *
246 : * utf: input string in UTF8 encoding (need not be null-terminated)
247 : * len: length of input string (in bytes)
248 : * iso: pointer to the output area (must be large enough!)
249 : * (output string will be null-terminated)
250 : * map: conversion map for single characters
251 : * cmap: conversion map for combined characters
252 : * (optional, pass NULL if none)
253 : * cmapsize: number of entries in the conversion map for combined characters
254 : * (optional, pass 0 if none)
255 : * conv_func: algorithmic encoding conversion function
256 : * (optional, pass NULL if none)
257 : * encoding: PG identifier for the local encoding
258 : *
259 : * For each character, the cmap (if provided) is consulted first; if no match,
260 : * the map is consulted next; if still no match, the conv_func (if provided)
261 : * is applied. An error is raised if no match is found.
262 : *
263 : * See pg_wchar.h for more details about the data structures used here.
264 : *
265 : * Returns the number of input bytes consumed. If noError is true, this can
266 : * be less than 'len'.
267 : */
268 : int
269 1524 : UtfToLocal(const unsigned char *utf, int len,
270 : unsigned char *iso,
271 : const pg_mb_radix_tree *map,
272 : const pg_utf_to_local_combined *cmap, int cmapsize,
273 : utf_local_conversion_func conv_func,
274 : int encoding, bool noError)
275 : {
276 : uint32 iutf;
277 : int l;
278 : const pg_utf_to_local_combined *cp;
279 1524 : const unsigned char *start = utf;
280 :
281 1524 : if (!PG_VALID_ENCODING(encoding))
282 0 : ereport(ERROR,
283 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
284 : errmsg("invalid encoding number: %d", encoding)));
285 :
286 4756 : for (; len > 0; len -= l)
287 : {
288 4312 : unsigned char b1 = 0;
289 4312 : unsigned char b2 = 0;
290 4312 : unsigned char b3 = 0;
291 4312 : unsigned char b4 = 0;
292 :
293 : /* "break" cases all represent errors */
294 4312 : if (*utf == '\0')
295 120 : break;
296 :
297 4192 : l = pg_utf_mblen(utf);
298 4192 : if (len < l)
299 144 : break;
300 :
301 4048 : if (!pg_utf8_islegal(utf, l))
302 240 : break;
303 :
304 3808 : if (l == 1)
305 : {
306 : /* ASCII case is easy, assume it's one-to-one conversion */
307 2864 : *iso++ = *utf++;
308 2864 : continue;
309 : }
310 :
311 : /* collect coded char of length l */
312 944 : if (l == 2)
313 : {
314 276 : b3 = *utf++;
315 276 : b4 = *utf++;
316 : }
317 668 : else if (l == 3)
318 : {
319 668 : b2 = *utf++;
320 668 : b3 = *utf++;
321 668 : b4 = *utf++;
322 : }
323 0 : else if (l == 4)
324 : {
325 0 : b1 = *utf++;
326 0 : b2 = *utf++;
327 0 : b3 = *utf++;
328 0 : b4 = *utf++;
329 : }
330 : else
331 : {
332 0 : elog(ERROR, "unsupported character length %d", l);
333 : iutf = 0; /* keep compiler quiet */
334 : }
335 944 : iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
336 :
337 : /* First, try with combined map if possible */
338 944 : if (cmap && len > l)
339 : {
340 96 : const unsigned char *utf_save = utf;
341 96 : int len_save = len;
342 96 : int l_save = l;
343 :
344 : /* collect next character, same as above */
345 96 : len -= l;
346 :
347 96 : l = pg_utf_mblen(utf);
348 96 : if (len < l)
349 : {
350 : /* need more data to decide if this is a combined char */
351 24 : utf -= l_save;
352 24 : break;
353 : }
354 :
355 72 : if (!pg_utf8_islegal(utf, l))
356 : {
357 0 : if (!noError)
358 0 : report_invalid_encoding(PG_UTF8, (const char *) utf, len);
359 0 : utf -= l_save;
360 0 : break;
361 : }
362 :
363 : /* We assume ASCII character cannot be in combined map */
364 72 : if (l > 1)
365 : {
366 : uint32 iutf2;
367 : uint32 cutf[2];
368 :
369 72 : if (l == 2)
370 : {
371 36 : iutf2 = *utf++ << 8;
372 36 : iutf2 |= *utf++;
373 : }
374 36 : else if (l == 3)
375 : {
376 36 : iutf2 = *utf++ << 16;
377 36 : iutf2 |= *utf++ << 8;
378 36 : iutf2 |= *utf++;
379 : }
380 0 : else if (l == 4)
381 : {
382 0 : iutf2 = *utf++ << 24;
383 0 : iutf2 |= *utf++ << 16;
384 0 : iutf2 |= *utf++ << 8;
385 0 : iutf2 |= *utf++;
386 : }
387 : else
388 : {
389 0 : elog(ERROR, "unsupported character length %d", l);
390 : iutf2 = 0; /* keep compiler quiet */
391 : }
392 :
393 72 : cutf[0] = iutf;
394 72 : cutf[1] = iutf2;
395 :
396 72 : cp = bsearch(cutf, cmap, cmapsize,
397 : sizeof(pg_utf_to_local_combined), compare3);
398 :
399 72 : if (cp)
400 : {
401 12 : iso = store_coded_char(iso, cp->code);
402 12 : continue;
403 : }
404 : }
405 :
406 : /* fail, so back up to reprocess second character next time */
407 60 : utf = utf_save;
408 60 : len = len_save;
409 60 : l = l_save;
410 : }
411 :
412 : /* Now check ordinary map */
413 908 : if (map)
414 : {
415 908 : uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
416 :
417 908 : if (converted)
418 : {
419 308 : iso = store_coded_char(iso, converted);
420 308 : continue;
421 : }
422 : }
423 :
424 : /* if there's a conversion function, try that */
425 600 : if (conv_func)
426 : {
427 48 : uint32 converted = (*conv_func) (iutf);
428 :
429 48 : if (converted)
430 : {
431 48 : iso = store_coded_char(iso, converted);
432 48 : continue;
433 : }
434 : }
435 :
436 : /* failed to translate this character */
437 552 : utf -= l;
438 552 : if (noError)
439 276 : break;
440 276 : report_untranslatable_char(PG_UTF8, encoding,
441 : (const char *) utf, len);
442 : }
443 :
444 : /* if we broke out of loop early, must be invalid input */
445 1248 : if (len > 0 && !noError)
446 264 : report_invalid_encoding(PG_UTF8, (const char *) utf, len);
447 :
448 984 : *iso = '\0';
449 :
450 984 : return utf - start;
451 : }
452 :
453 : /*
454 : * local code ---> UTF8
455 : *
456 : * iso: input string in local encoding (need not be null-terminated)
457 : * len: length of input string (in bytes)
458 : * utf: pointer to the output area (must be large enough!)
459 : * (output string will be null-terminated)
460 : * map: conversion map for single characters
461 : * cmap: conversion map for combined characters
462 : * (optional, pass NULL if none)
463 : * cmapsize: number of entries in the conversion map for combined characters
464 : * (optional, pass 0 if none)
465 : * conv_func: algorithmic encoding conversion function
466 : * (optional, pass NULL if none)
467 : * encoding: PG identifier for the local encoding
468 : *
469 : * For each character, the map is consulted first; if no match, the cmap
470 : * (if provided) is consulted next; if still no match, the conv_func
471 : * (if provided) is applied. An error is raised if no match is found.
472 : *
473 : * See pg_wchar.h for more details about the data structures used here.
474 : *
475 : * Returns the number of input bytes consumed. If noError is true, this can
476 : * be less than 'len'.
477 : */
478 : int
479 908 : LocalToUtf(const unsigned char *iso, int len,
480 : unsigned char *utf,
481 : const pg_mb_radix_tree *map,
482 : const pg_local_to_utf_combined *cmap, int cmapsize,
483 : utf_local_conversion_func conv_func,
484 : int encoding,
485 : bool noError)
486 : {
487 : uint32 iiso;
488 : int l;
489 : const pg_local_to_utf_combined *cp;
490 908 : const unsigned char *start = iso;
491 :
492 908 : if (!PG_VALID_ENCODING(encoding))
493 0 : ereport(ERROR,
494 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
495 : errmsg("invalid encoding number: %d", encoding)));
496 :
497 4122 : for (; len > 0; len -= l)
498 : {
499 3766 : unsigned char b1 = 0;
500 3766 : unsigned char b2 = 0;
501 3766 : unsigned char b3 = 0;
502 3766 : unsigned char b4 = 0;
503 :
504 : /* "break" cases all represent errors */
505 3766 : if (*iso == '\0')
506 216 : break;
507 :
508 3550 : if (!IS_HIGHBIT_SET(*iso))
509 : {
510 : /* ASCII case is easy, assume it's one-to-one conversion */
511 2786 : *utf++ = *iso++;
512 2786 : l = 1;
513 2786 : continue;
514 : }
515 :
516 764 : l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
517 764 : if (l < 0)
518 288 : break;
519 :
520 : /* collect coded char of length l */
521 476 : if (l == 1)
522 132 : b4 = *iso++;
523 344 : else if (l == 2)
524 : {
525 284 : b3 = *iso++;
526 284 : b4 = *iso++;
527 : }
528 60 : else if (l == 3)
529 : {
530 0 : b2 = *iso++;
531 0 : b3 = *iso++;
532 0 : b4 = *iso++;
533 : }
534 60 : else if (l == 4)
535 : {
536 60 : b1 = *iso++;
537 60 : b2 = *iso++;
538 60 : b3 = *iso++;
539 60 : b4 = *iso++;
540 : }
541 : else
542 : {
543 0 : elog(ERROR, "unsupported character length %d", l);
544 : iiso = 0; /* keep compiler quiet */
545 : }
546 476 : iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
547 :
548 476 : if (map)
549 : {
550 476 : uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
551 :
552 476 : if (converted)
553 : {
554 368 : utf = store_coded_char(utf, converted);
555 368 : continue;
556 : }
557 :
558 : /* If there's a combined character map, try that */
559 108 : if (cmap)
560 : {
561 24 : cp = bsearch(&iiso, cmap, cmapsize,
562 : sizeof(pg_local_to_utf_combined), compare4);
563 :
564 24 : if (cp)
565 : {
566 24 : utf = store_coded_char(utf, cp->utf1);
567 24 : utf = store_coded_char(utf, cp->utf2);
568 24 : continue;
569 : }
570 : }
571 : }
572 :
573 : /* if there's a conversion function, try that */
574 84 : if (conv_func)
575 : {
576 60 : uint32 converted = (*conv_func) (iiso);
577 :
578 60 : if (converted)
579 : {
580 36 : utf = store_coded_char(utf, converted);
581 36 : continue;
582 : }
583 : }
584 :
585 : /* failed to translate this character */
586 48 : iso -= l;
587 48 : if (noError)
588 24 : break;
589 24 : report_untranslatable_char(encoding, PG_UTF8,
590 : (const char *) iso, len);
591 : }
592 :
593 : /* if we broke out of loop early, must be invalid input */
594 884 : if (len > 0 && !noError)
595 248 : report_invalid_encoding(encoding, (const char *) iso, len);
596 :
597 636 : *utf = '\0';
598 :
599 636 : return iso - start;
600 : }
|