Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * Utility functions for conversion procs.
4 : *
5 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/utils/mb/conv.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "postgres.h"
14 : #include "mb/pg_wchar.h"
15 :
16 :
17 : /*
18 : * local2local: a generic single byte charset encoding
19 : * conversion between two ASCII-superset encodings.
20 : *
21 : * l points to the source string of length len
22 : * p is the output area (must be large enough!)
23 : * src_encoding is the PG identifier for the source encoding
24 : * dest_encoding is the PG identifier for the target encoding
25 : * tab holds conversion entries for the source charset
26 : * starting from 128 (0x80). each entry in the table holds the corresponding
27 : * code point for the target charset, or 0 if there is no equivalent code.
28 : *
29 : * Returns the number of input bytes consumed. If noError is true, this can
30 : * be less than 'len'.
31 : */
32 : int
33 152 : local2local(const unsigned char *l,
34 : unsigned char *p,
35 : int len,
36 : int src_encoding,
37 : int dest_encoding,
38 : const unsigned char *tab,
39 : bool noError)
40 : {
41 152 : const unsigned char *start = l;
42 : unsigned char c1,
43 : c2;
44 :
45 488 : while (len > 0)
46 : {
47 408 : c1 = *l;
48 408 : if (c1 == 0)
49 : {
50 72 : if (noError)
51 36 : break;
52 36 : report_invalid_encoding(src_encoding, (const char *) l, len);
53 : }
54 336 : if (!IS_HIGHBIT_SET(c1))
55 204 : *p++ = c1;
56 : else
57 : {
58 132 : c2 = tab[c1 - HIGHBIT];
59 132 : if (c2)
60 132 : *p++ = c2;
61 : else
62 : {
63 0 : if (noError)
64 0 : break;
65 0 : report_untranslatable_char(src_encoding, dest_encoding,
66 : (const char *) l, len);
67 : }
68 : }
69 336 : l++;
70 336 : len--;
71 : }
72 116 : *p = '\0';
73 :
74 116 : return l - start;
75 : }
76 :
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l			source string; 'len' bytes of it are examined
 * p			output area (the caller must make it large enough!)
 * lc			mule character set id for the local encoding; it is emitted
 *				in front of every byte that has its high bit set
 * encoding		PG identifier for the local encoding
 * noError		if true, stop at the first zero byte instead of calling
 *				report_invalid_encoding()
 *
 * The output is always null-terminated.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding, bool noError)
{
	const unsigned char *const begin = l;

	/* the loop increment is skipped when we "break" on a zero byte */
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
		{
			if (noError)
				break;
			report_invalid_encoding(encoding, (const char *) l, len);
		}

		/* high-bit bytes become a two-byte sequence: charset id, then byte */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';

	return l - begin;
}
114 :
115 : /*
116 : * MIC ---> LATINn when the charset's local codes map directly to MIC
117 : *
118 : * mic points to the source string of length len
119 : * p is the output area (must be large enough!)
120 : * lc is the mule character set id for the local encoding
121 : * encoding is the PG identifier for the local encoding
122 : *
123 : * Returns the number of input bytes consumed. If noError is true, this can
124 : * be less than 'len'.
125 : */
126 : int
127 236 : mic2latin(const unsigned char *mic, unsigned char *p, int len,
128 : int lc, int encoding, bool noError)
129 : {
130 236 : const unsigned char *start = mic;
131 : int c1;
132 :
133 560 : while (len > 0)
134 : {
135 516 : c1 = *mic;
136 516 : if (c1 == 0)
137 : {
138 0 : if (noError)
139 0 : break;
140 0 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
141 : }
142 516 : if (!IS_HIGHBIT_SET(c1))
143 : {
144 : /* easy for ASCII */
145 240 : *p++ = c1;
146 240 : mic++;
147 240 : len--;
148 : }
149 : else
150 : {
151 276 : int l = pg_mule_mblen(mic);
152 :
153 276 : if (len < l)
154 : {
155 72 : if (noError)
156 36 : break;
157 36 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
158 : len);
159 : }
160 204 : if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
161 : {
162 120 : if (noError)
163 60 : break;
164 60 : report_untranslatable_char(PG_MULE_INTERNAL, encoding,
165 : (const char *) mic, len);
166 : }
167 84 : *p++ = mic[1];
168 84 : mic += 2;
169 84 : len -= 2;
170 : }
171 : }
172 140 : *p = '\0';
173 :
174 140 : return mic - start;
175 : }
176 :
177 :
178 : /*
179 : * latin2mic_with_table: a generic single byte charset encoding
180 : * conversion from a local charset to the mule internal code.
181 : *
182 : * l points to the source string of length len
183 : * p is the output area (must be large enough!)
184 : * lc is the mule character set id for the local encoding
185 : * encoding is the PG identifier for the local encoding
186 : * tab holds conversion entries for the local charset
187 : * starting from 128 (0x80). each entry in the table holds the corresponding
188 : * code point for the mule encoding, or 0 if there is no equivalent code.
189 : *
190 : * Returns the number of input bytes consumed. If noError is true, this can
191 : * be less than 'len'.
192 : */
193 : int
194 112 : latin2mic_with_table(const unsigned char *l,
195 : unsigned char *p,
196 : int len,
197 : int lc,
198 : int encoding,
199 : const unsigned char *tab,
200 : bool noError)
201 : {
202 112 : const unsigned char *start = l;
203 : unsigned char c1,
204 : c2;
205 :
206 328 : while (len > 0)
207 : {
208 288 : c1 = *l;
209 288 : if (c1 == 0)
210 : {
211 72 : if (noError)
212 36 : break;
213 36 : report_invalid_encoding(encoding, (const char *) l, len);
214 : }
215 216 : if (!IS_HIGHBIT_SET(c1))
216 84 : *p++ = c1;
217 : else
218 : {
219 132 : c2 = tab[c1 - HIGHBIT];
220 132 : if (c2)
221 : {
222 132 : *p++ = lc;
223 132 : *p++ = c2;
224 : }
225 : else
226 : {
227 0 : if (noError)
228 0 : break;
229 0 : report_untranslatable_char(encoding, PG_MULE_INTERNAL,
230 : (const char *) l, len);
231 : }
232 : }
233 216 : l++;
234 216 : len--;
235 : }
236 76 : *p = '\0';
237 :
238 76 : return l - start;
239 : }
240 :
241 : /*
242 : * mic2latin_with_table: a generic single byte charset encoding
243 : * conversion from the mule internal code to a local charset.
244 : *
245 : * mic points to the source string of length len
246 : * p is the output area (must be large enough!)
247 : * lc is the mule character set id for the local encoding
248 : * encoding is the PG identifier for the local encoding
249 : * tab holds conversion entries for the mule internal code's second byte,
250 : * starting from 128 (0x80). each entry in the table holds the corresponding
251 : * code point for the local charset, or 0 if there is no equivalent code.
252 : *
253 : * Returns the number of input bytes consumed. If noError is true, this can
254 : * be less than 'len'.
255 : */
256 : int
257 232 : mic2latin_with_table(const unsigned char *mic,
258 : unsigned char *p,
259 : int len,
260 : int lc,
261 : int encoding,
262 : const unsigned char *tab,
263 : bool noError)
264 : {
265 232 : const unsigned char *start = mic;
266 : unsigned char c1,
267 : c2;
268 :
269 544 : while (len > 0)
270 : {
271 504 : c1 = *mic;
272 504 : if (c1 == 0)
273 : {
274 0 : if (noError)
275 0 : break;
276 0 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
277 : }
278 504 : if (!IS_HIGHBIT_SET(c1))
279 : {
280 : /* easy for ASCII */
281 228 : *p++ = c1;
282 228 : mic++;
283 228 : len--;
284 : }
285 : else
286 : {
287 276 : int l = pg_mule_mblen(mic);
288 :
289 276 : if (len < l)
290 : {
291 72 : if (noError)
292 36 : break;
293 36 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
294 : len);
295 : }
296 204 : if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
297 84 : (c2 = tab[mic[1] - HIGHBIT]) == 0)
298 : {
299 120 : if (noError)
300 60 : break;
301 60 : report_untranslatable_char(PG_MULE_INTERNAL, encoding,
302 : (const char *) mic, len);
303 : break; /* keep compiler quiet */
304 : }
305 84 : *p++ = c2;
306 84 : mic += 2;
307 84 : len -= 2;
308 : }
309 : }
310 136 : *p = '\0';
311 :
312 136 : return mic - start;
313 : }
314 :
315 : /*
316 : * comparison routine for bsearch()
317 : * this routine is intended for combined UTF8 -> local code
318 : */
319 : static int
320 312 : compare3(const void *p1, const void *p2)
321 : {
322 : uint32 s1,
323 : s2,
324 : d1,
325 : d2;
326 :
327 312 : s1 = *(const uint32 *) p1;
328 312 : s2 = *((const uint32 *) p1 + 1);
329 312 : d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330 312 : d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331 312 : return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332 : }
333 :
334 : /*
335 : * comparison routine for bsearch()
336 : * this routine is intended for local code -> combined UTF8
337 : */
338 : static int
339 108 : compare4(const void *p1, const void *p2)
340 : {
341 : uint32 v1,
342 : v2;
343 :
344 108 : v1 = *(const uint32 *) p1;
345 108 : v2 = ((const pg_local_to_utf_combined *) p2)->code;
346 108 : return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347 : }
348 :
349 : /*
350 : * store 32bit character representation into multibyte stream
351 : */
352 : static inline unsigned char *
353 820 : store_coded_char(unsigned char *dest, uint32 code)
354 : {
355 820 : if (code & 0xff000000)
356 84 : *dest++ = code >> 24;
357 820 : if (code & 0x00ff0000)
358 404 : *dest++ = code >> 16;
359 820 : if (code & 0x0000ff00)
360 736 : *dest++ = code >> 8;
361 820 : if (code & 0x000000ff)
362 820 : *dest++ = code;
363 820 : return dest;
364 : }
365 :
366 : /*
367 : * Convert a character using a conversion radix tree.
368 : *
369 : * 'l' is the length of the input character in bytes, and b1-b4 are
370 : * the input character's bytes.
371 : */
372 : static inline uint32
373 1384 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
374 : int l,
375 : unsigned char b1,
376 : unsigned char b2,
377 : unsigned char b3,
378 : unsigned char b4)
379 : {
380 1384 : if (l == 4)
381 : {
382 : /* 4-byte code */
383 :
384 : /* check code validity */
385 60 : if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
386 60 : b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
387 60 : b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
388 60 : b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
389 0 : return 0;
390 :
391 : /* perform lookup */
392 60 : if (rt->chars32)
393 : {
394 60 : uint32 idx = rt->b4root;
395 :
396 60 : idx = rt->chars32[b1 + idx - rt->b4_1_lower];
397 60 : idx = rt->chars32[b2 + idx - rt->b4_2_lower];
398 60 : idx = rt->chars32[b3 + idx - rt->b4_3_lower];
399 60 : return rt->chars32[b4 + idx - rt->b4_4_lower];
400 : }
401 : else
402 : {
403 0 : uint16 idx = rt->b4root;
404 :
405 0 : idx = rt->chars16[b1 + idx - rt->b4_1_lower];
406 0 : idx = rt->chars16[b2 + idx - rt->b4_2_lower];
407 0 : idx = rt->chars16[b3 + idx - rt->b4_3_lower];
408 0 : return rt->chars16[b4 + idx - rt->b4_4_lower];
409 : }
410 : }
411 1324 : else if (l == 3)
412 : {
413 : /* 3-byte code */
414 :
415 : /* check code validity */
416 632 : if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
417 200 : b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
418 200 : b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
419 432 : return 0;
420 :
421 : /* perform lookup */
422 200 : if (rt->chars32)
423 : {
424 200 : uint32 idx = rt->b3root;
425 :
426 200 : idx = rt->chars32[b2 + idx - rt->b3_1_lower];
427 200 : idx = rt->chars32[b3 + idx - rt->b3_2_lower];
428 200 : return rt->chars32[b4 + idx - rt->b3_3_lower];
429 : }
430 : else
431 : {
432 0 : uint16 idx = rt->b3root;
433 :
434 0 : idx = rt->chars16[b2 + idx - rt->b3_1_lower];
435 0 : idx = rt->chars16[b3 + idx - rt->b3_2_lower];
436 0 : return rt->chars16[b4 + idx - rt->b3_3_lower];
437 : }
438 : }
439 692 : else if (l == 2)
440 : {
441 : /* 2-byte code */
442 :
443 : /* check code validity - first byte */
444 560 : if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
445 512 : b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
446 48 : return 0;
447 :
448 : /* perform lookup */
449 512 : if (rt->chars32)
450 : {
451 404 : uint32 idx = rt->b2root;
452 :
453 404 : idx = rt->chars32[b3 + idx - rt->b2_1_lower];
454 404 : return rt->chars32[b4 + idx - rt->b2_2_lower];
455 : }
456 : else
457 : {
458 108 : uint16 idx = rt->b2root;
459 :
460 108 : idx = rt->chars16[b3 + idx - rt->b2_1_lower];
461 108 : return rt->chars16[b4 + idx - rt->b2_2_lower];
462 : }
463 : }
464 132 : else if (l == 1)
465 : {
466 : /* 1-byte code */
467 :
468 : /* check code validity - first byte */
469 132 : if (b4 < rt->b1_lower || b4 > rt->b1_upper)
470 0 : return 0;
471 :
472 : /* perform lookup */
473 132 : if (rt->chars32)
474 132 : return rt->chars32[b4 + rt->b1root - rt->b1_lower];
475 : else
476 0 : return rt->chars16[b4 + rt->b1root - rt->b1_lower];
477 : }
478 0 : return 0; /* shouldn't happen */
479 : }
480 :
481 : /*
482 : * UTF8 ---> local code
483 : *
484 : * utf: input string in UTF8 encoding (need not be null-terminated)
485 : * len: length of input string (in bytes)
486 : * iso: pointer to the output area (must be large enough!)
487 : * (output string will be null-terminated)
488 : * map: conversion map for single characters
489 : * cmap: conversion map for combined characters
490 : * (optional, pass NULL if none)
491 : * cmapsize: number of entries in the conversion map for combined characters
492 : * (optional, pass 0 if none)
493 : * conv_func: algorithmic encoding conversion function
494 : * (optional, pass NULL if none)
495 : * encoding: PG identifier for the local encoding
496 : *
497 : * For each character, the cmap (if provided) is consulted first; if no match,
498 : * the map is consulted next; if still no match, the conv_func (if provided)
499 : * is applied. An error is raised if no match is found.
500 : *
501 : * See pg_wchar.h for more details about the data structures used here.
502 : *
503 : * Returns the number of input bytes consumed. If noError is true, this can
504 : * be less than 'len'.
505 : */
506 : int
507 1524 : UtfToLocal(const unsigned char *utf, int len,
508 : unsigned char *iso,
509 : const pg_mb_radix_tree *map,
510 : const pg_utf_to_local_combined *cmap, int cmapsize,
511 : utf_local_conversion_func conv_func,
512 : int encoding, bool noError)
513 : {
514 : uint32 iutf;
515 : int l;
516 : const pg_utf_to_local_combined *cp;
517 1524 : const unsigned char *start = utf;
518 :
519 1524 : if (!PG_VALID_ENCODING(encoding))
520 0 : ereport(ERROR,
521 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
522 : errmsg("invalid encoding number: %d", encoding)));
523 :
524 4756 : for (; len > 0; len -= l)
525 : {
526 4312 : unsigned char b1 = 0;
527 4312 : unsigned char b2 = 0;
528 4312 : unsigned char b3 = 0;
529 4312 : unsigned char b4 = 0;
530 :
531 : /* "break" cases all represent errors */
532 4312 : if (*utf == '\0')
533 120 : break;
534 :
535 4192 : l = pg_utf_mblen(utf);
536 4192 : if (len < l)
537 144 : break;
538 :
539 4048 : if (!pg_utf8_islegal(utf, l))
540 240 : break;
541 :
542 3808 : if (l == 1)
543 : {
544 : /* ASCII case is easy, assume it's one-to-one conversion */
545 2864 : *iso++ = *utf++;
546 2864 : continue;
547 : }
548 :
549 : /* collect coded char of length l */
550 944 : if (l == 2)
551 : {
552 276 : b3 = *utf++;
553 276 : b4 = *utf++;
554 : }
555 668 : else if (l == 3)
556 : {
557 668 : b2 = *utf++;
558 668 : b3 = *utf++;
559 668 : b4 = *utf++;
560 : }
561 0 : else if (l == 4)
562 : {
563 0 : b1 = *utf++;
564 0 : b2 = *utf++;
565 0 : b3 = *utf++;
566 0 : b4 = *utf++;
567 : }
568 : else
569 : {
570 0 : elog(ERROR, "unsupported character length %d", l);
571 : iutf = 0; /* keep compiler quiet */
572 : }
573 944 : iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
574 :
575 : /* First, try with combined map if possible */
576 944 : if (cmap && len > l)
577 : {
578 96 : const unsigned char *utf_save = utf;
579 96 : int len_save = len;
580 96 : int l_save = l;
581 :
582 : /* collect next character, same as above */
583 96 : len -= l;
584 :
585 96 : l = pg_utf_mblen(utf);
586 96 : if (len < l)
587 : {
588 : /* need more data to decide if this is a combined char */
589 24 : utf -= l_save;
590 24 : break;
591 : }
592 :
593 72 : if (!pg_utf8_islegal(utf, l))
594 : {
595 0 : if (!noError)
596 0 : report_invalid_encoding(PG_UTF8, (const char *) utf, len);
597 0 : utf -= l_save;
598 0 : break;
599 : }
600 :
601 : /* We assume ASCII character cannot be in combined map */
602 72 : if (l > 1)
603 : {
604 : uint32 iutf2;
605 : uint32 cutf[2];
606 :
607 72 : if (l == 2)
608 : {
609 36 : iutf2 = *utf++ << 8;
610 36 : iutf2 |= *utf++;
611 : }
612 36 : else if (l == 3)
613 : {
614 36 : iutf2 = *utf++ << 16;
615 36 : iutf2 |= *utf++ << 8;
616 36 : iutf2 |= *utf++;
617 : }
618 0 : else if (l == 4)
619 : {
620 0 : iutf2 = *utf++ << 24;
621 0 : iutf2 |= *utf++ << 16;
622 0 : iutf2 |= *utf++ << 8;
623 0 : iutf2 |= *utf++;
624 : }
625 : else
626 : {
627 0 : elog(ERROR, "unsupported character length %d", l);
628 : iutf2 = 0; /* keep compiler quiet */
629 : }
630 :
631 72 : cutf[0] = iutf;
632 72 : cutf[1] = iutf2;
633 :
634 72 : cp = bsearch(cutf, cmap, cmapsize,
635 : sizeof(pg_utf_to_local_combined), compare3);
636 :
637 72 : if (cp)
638 : {
639 12 : iso = store_coded_char(iso, cp->code);
640 12 : continue;
641 : }
642 : }
643 :
644 : /* fail, so back up to reprocess second character next time */
645 60 : utf = utf_save;
646 60 : len = len_save;
647 60 : l = l_save;
648 : }
649 :
650 : /* Now check ordinary map */
651 908 : if (map)
652 : {
653 908 : uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
654 :
655 908 : if (converted)
656 : {
657 308 : iso = store_coded_char(iso, converted);
658 308 : continue;
659 : }
660 : }
661 :
662 : /* if there's a conversion function, try that */
663 600 : if (conv_func)
664 : {
665 48 : uint32 converted = (*conv_func) (iutf);
666 :
667 48 : if (converted)
668 : {
669 48 : iso = store_coded_char(iso, converted);
670 48 : continue;
671 : }
672 : }
673 :
674 : /* failed to translate this character */
675 552 : utf -= l;
676 552 : if (noError)
677 276 : break;
678 276 : report_untranslatable_char(PG_UTF8, encoding,
679 : (const char *) utf, len);
680 : }
681 :
682 : /* if we broke out of loop early, must be invalid input */
683 1248 : if (len > 0 && !noError)
684 264 : report_invalid_encoding(PG_UTF8, (const char *) utf, len);
685 :
686 984 : *iso = '\0';
687 :
688 984 : return utf - start;
689 : }
690 :
691 : /*
692 : * local code ---> UTF8
693 : *
694 : * iso: input string in local encoding (need not be null-terminated)
695 : * len: length of input string (in bytes)
696 : * utf: pointer to the output area (must be large enough!)
697 : * (output string will be null-terminated)
698 : * map: conversion map for single characters
699 : * cmap: conversion map for combined characters
700 : * (optional, pass NULL if none)
701 : * cmapsize: number of entries in the conversion map for combined characters
702 : * (optional, pass 0 if none)
703 : * conv_func: algorithmic encoding conversion function
704 : * (optional, pass NULL if none)
705 : * encoding: PG identifier for the local encoding
706 : *
707 : * For each character, the map is consulted first; if no match, the cmap
708 : * (if provided) is consulted next; if still no match, the conv_func
709 : * (if provided) is applied. An error is raised if no match is found.
710 : *
711 : * See pg_wchar.h for more details about the data structures used here.
712 : *
713 : * Returns the number of input bytes consumed. If noError is true, this can
714 : * be less than 'len'.
715 : */
716 : int
717 908 : LocalToUtf(const unsigned char *iso, int len,
718 : unsigned char *utf,
719 : const pg_mb_radix_tree *map,
720 : const pg_local_to_utf_combined *cmap, int cmapsize,
721 : utf_local_conversion_func conv_func,
722 : int encoding,
723 : bool noError)
724 : {
725 : uint32 iiso;
726 : int l;
727 : const pg_local_to_utf_combined *cp;
728 908 : const unsigned char *start = iso;
729 :
730 908 : if (!PG_VALID_ENCODING(encoding))
731 0 : ereport(ERROR,
732 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
733 : errmsg("invalid encoding number: %d", encoding)));
734 :
735 4122 : for (; len > 0; len -= l)
736 : {
737 3766 : unsigned char b1 = 0;
738 3766 : unsigned char b2 = 0;
739 3766 : unsigned char b3 = 0;
740 3766 : unsigned char b4 = 0;
741 :
742 : /* "break" cases all represent errors */
743 3766 : if (*iso == '\0')
744 216 : break;
745 :
746 3550 : if (!IS_HIGHBIT_SET(*iso))
747 : {
748 : /* ASCII case is easy, assume it's one-to-one conversion */
749 2786 : *utf++ = *iso++;
750 2786 : l = 1;
751 2786 : continue;
752 : }
753 :
754 764 : l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
755 764 : if (l < 0)
756 288 : break;
757 :
758 : /* collect coded char of length l */
759 476 : if (l == 1)
760 132 : b4 = *iso++;
761 344 : else if (l == 2)
762 : {
763 284 : b3 = *iso++;
764 284 : b4 = *iso++;
765 : }
766 60 : else if (l == 3)
767 : {
768 0 : b2 = *iso++;
769 0 : b3 = *iso++;
770 0 : b4 = *iso++;
771 : }
772 60 : else if (l == 4)
773 : {
774 60 : b1 = *iso++;
775 60 : b2 = *iso++;
776 60 : b3 = *iso++;
777 60 : b4 = *iso++;
778 : }
779 : else
780 : {
781 0 : elog(ERROR, "unsupported character length %d", l);
782 : iiso = 0; /* keep compiler quiet */
783 : }
784 476 : iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
785 :
786 476 : if (map)
787 : {
788 476 : uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
789 :
790 476 : if (converted)
791 : {
792 368 : utf = store_coded_char(utf, converted);
793 368 : continue;
794 : }
795 :
796 : /* If there's a combined character map, try that */
797 108 : if (cmap)
798 : {
799 24 : cp = bsearch(&iiso, cmap, cmapsize,
800 : sizeof(pg_local_to_utf_combined), compare4);
801 :
802 24 : if (cp)
803 : {
804 24 : utf = store_coded_char(utf, cp->utf1);
805 24 : utf = store_coded_char(utf, cp->utf2);
806 24 : continue;
807 : }
808 : }
809 : }
810 :
811 : /* if there's a conversion function, try that */
812 84 : if (conv_func)
813 : {
814 60 : uint32 converted = (*conv_func) (iiso);
815 :
816 60 : if (converted)
817 : {
818 36 : utf = store_coded_char(utf, converted);
819 36 : continue;
820 : }
821 : }
822 :
823 : /* failed to translate this character */
824 48 : iso -= l;
825 48 : if (noError)
826 24 : break;
827 24 : report_untranslatable_char(encoding, PG_UTF8,
828 : (const char *) iso, len);
829 : }
830 :
831 : /* if we broke out of loop early, must be invalid input */
832 884 : if (len > 0 && !noError)
833 248 : report_invalid_encoding(encoding, (const char *) iso, len);
834 :
835 636 : *utf = '\0';
836 :
837 636 : return iso - start;
838 : }
|