Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * Utility functions for conversion procs.
4 : *
5 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/utils/mb/conv.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "postgres.h"
14 : #include "mb/pg_wchar.h"
15 :
16 :
17 : /*
18 : * local2local: a generic single byte charset encoding
19 : * conversion between two ASCII-superset encodings.
20 : *
21 : * l points to the source string of length len
22 : * p is the output area (must be large enough!)
23 : * src_encoding is the PG identifier for the source encoding
24 : * dest_encoding is the PG identifier for the target encoding
25 : * tab holds conversion entries for the source charset
26 : * starting from 128 (0x80). each entry in the table holds the corresponding
27 : * code point for the target charset, or 0 if there is no equivalent code.
28 : *
29 : * Returns the number of input bytes consumed. If noError is true, this can
30 : * be less than 'len'.
31 : */
32 : int
33 228 : local2local(const unsigned char *l,
34 : unsigned char *p,
35 : int len,
36 : int src_encoding,
37 : int dest_encoding,
38 : const unsigned char *tab,
39 : bool noError)
40 : {
41 228 : const unsigned char *start = l;
42 : unsigned char c1,
43 : c2;
44 :
45 732 : while (len > 0)
46 : {
47 612 : c1 = *l;
48 612 : if (c1 == 0)
49 : {
50 108 : if (noError)
51 54 : break;
52 54 : report_invalid_encoding(src_encoding, (const char *) l, len);
53 : }
54 504 : if (!IS_HIGHBIT_SET(c1))
55 306 : *p++ = c1;
56 : else
57 : {
58 198 : c2 = tab[c1 - HIGHBIT];
59 198 : if (c2)
60 198 : *p++ = c2;
61 : else
62 : {
63 0 : if (noError)
64 0 : break;
65 0 : report_untranslatable_char(src_encoding, dest_encoding,
66 : (const char *) l, len);
67 : }
68 : }
69 504 : l++;
70 504 : len--;
71 : }
72 174 : *p = '\0';
73 :
74 174 : return l - start;
75 : }
76 :
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l			source string, 'len' bytes long
 * p			output area (must be large enough!); up to two bytes are
 *				written per input byte, followed by a terminating NUL
 * len			number of source bytes
 * lc			mule character set id for the local encoding; emitted in
 *				front of every byte that has its high bit set
 * encoding		PG identifier for the local encoding
 * noError		if true, stop quietly at the first bad byte instead of
 *				reporting an error
 *
 * A zero byte in the input is treated as invalid.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding, bool noError)
{
	const unsigned char *const begin = l;

	for (; len > 0; l++, len--)
	{
		const int	ch = *l;

		if (ch == 0)
		{
			if (noError)
				break;
			report_invalid_encoding(encoding, (const char *) l, len);
		}

		/* non-ASCII bytes get the charset id as a leading byte */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';

	return l - begin;
}
114 :
115 : /*
116 : * MIC ---> LATINn when the charset's local codes map directly to MIC
117 : *
118 : * mic points to the source string of length len
119 : * p is the output area (must be large enough!)
120 : * lc is the mule character set id for the local encoding
121 : * encoding is the PG identifier for the local encoding
122 : *
123 : * Returns the number of input bytes consumed. If noError is true, this can
124 : * be less than 'len'.
125 : */
126 : int
127 354 : mic2latin(const unsigned char *mic, unsigned char *p, int len,
128 : int lc, int encoding, bool noError)
129 : {
130 354 : const unsigned char *start = mic;
131 : int c1;
132 :
133 840 : while (len > 0)
134 : {
135 774 : c1 = *mic;
136 774 : if (c1 == 0)
137 : {
138 0 : if (noError)
139 0 : break;
140 0 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
141 : }
142 774 : if (!IS_HIGHBIT_SET(c1))
143 : {
144 : /* easy for ASCII */
145 360 : *p++ = c1;
146 360 : mic++;
147 360 : len--;
148 : }
149 : else
150 : {
151 414 : int l = pg_mule_mblen(mic);
152 :
153 414 : if (len < l)
154 : {
155 108 : if (noError)
156 54 : break;
157 54 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
158 : len);
159 : }
160 306 : if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
161 : {
162 180 : if (noError)
163 90 : break;
164 90 : report_untranslatable_char(PG_MULE_INTERNAL, encoding,
165 : (const char *) mic, len);
166 : }
167 126 : *p++ = mic[1];
168 126 : mic += 2;
169 126 : len -= 2;
170 : }
171 : }
172 210 : *p = '\0';
173 :
174 210 : return mic - start;
175 : }
176 :
177 :
178 : /*
179 : * latin2mic_with_table: a generic single byte charset encoding
180 : * conversion from a local charset to the mule internal code.
181 : *
182 : * l points to the source string of length len
183 : * p is the output area (must be large enough!)
184 : * lc is the mule character set id for the local encoding
185 : * encoding is the PG identifier for the local encoding
186 : * tab holds conversion entries for the local charset
187 : * starting from 128 (0x80). each entry in the table holds the corresponding
188 : * code point for the mule encoding, or 0 if there is no equivalent code.
189 : *
190 : * Returns the number of input bytes consumed. If noError is true, this can
191 : * be less than 'len'.
192 : */
193 : int
194 168 : latin2mic_with_table(const unsigned char *l,
195 : unsigned char *p,
196 : int len,
197 : int lc,
198 : int encoding,
199 : const unsigned char *tab,
200 : bool noError)
201 : {
202 168 : const unsigned char *start = l;
203 : unsigned char c1,
204 : c2;
205 :
206 492 : while (len > 0)
207 : {
208 432 : c1 = *l;
209 432 : if (c1 == 0)
210 : {
211 108 : if (noError)
212 54 : break;
213 54 : report_invalid_encoding(encoding, (const char *) l, len);
214 : }
215 324 : if (!IS_HIGHBIT_SET(c1))
216 126 : *p++ = c1;
217 : else
218 : {
219 198 : c2 = tab[c1 - HIGHBIT];
220 198 : if (c2)
221 : {
222 198 : *p++ = lc;
223 198 : *p++ = c2;
224 : }
225 : else
226 : {
227 0 : if (noError)
228 0 : break;
229 0 : report_untranslatable_char(encoding, PG_MULE_INTERNAL,
230 : (const char *) l, len);
231 : }
232 : }
233 324 : l++;
234 324 : len--;
235 : }
236 114 : *p = '\0';
237 :
238 114 : return l - start;
239 : }
240 :
241 : /*
242 : * mic2latin_with_table: a generic single byte charset encoding
243 : * conversion from the mule internal code to a local charset.
244 : *
245 : * mic points to the source string of length len
246 : * p is the output area (must be large enough!)
247 : * lc is the mule character set id for the local encoding
248 : * encoding is the PG identifier for the local encoding
249 : * tab holds conversion entries for the mule internal code's second byte,
250 : * starting from 128 (0x80). each entry in the table holds the corresponding
251 : * code point for the local charset, or 0 if there is no equivalent code.
252 : *
253 : * Returns the number of input bytes consumed. If noError is true, this can
254 : * be less than 'len'.
255 : */
256 : int
257 348 : mic2latin_with_table(const unsigned char *mic,
258 : unsigned char *p,
259 : int len,
260 : int lc,
261 : int encoding,
262 : const unsigned char *tab,
263 : bool noError)
264 : {
265 348 : const unsigned char *start = mic;
266 : unsigned char c1,
267 : c2;
268 :
269 816 : while (len > 0)
270 : {
271 756 : c1 = *mic;
272 756 : if (c1 == 0)
273 : {
274 0 : if (noError)
275 0 : break;
276 0 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
277 : }
278 756 : if (!IS_HIGHBIT_SET(c1))
279 : {
280 : /* easy for ASCII */
281 342 : *p++ = c1;
282 342 : mic++;
283 342 : len--;
284 : }
285 : else
286 : {
287 414 : int l = pg_mule_mblen(mic);
288 :
289 414 : if (len < l)
290 : {
291 108 : if (noError)
292 54 : break;
293 54 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
294 : len);
295 : }
296 306 : if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
297 126 : (c2 = tab[mic[1] - HIGHBIT]) == 0)
298 : {
299 180 : if (noError)
300 90 : break;
301 90 : report_untranslatable_char(PG_MULE_INTERNAL, encoding,
302 : (const char *) mic, len);
303 : break; /* keep compiler quiet */
304 : }
305 126 : *p++ = c2;
306 126 : mic += 2;
307 126 : len -= 2;
308 : }
309 : }
310 204 : *p = '\0';
311 :
312 204 : return mic - start;
313 : }
314 :
315 : /*
316 : * comparison routine for bsearch()
317 : * this routine is intended for combined UTF8 -> local code
318 : */
319 : static int
320 468 : compare3(const void *p1, const void *p2)
321 : {
322 : uint32 s1,
323 : s2,
324 : d1,
325 : d2;
326 :
327 468 : s1 = *(const uint32 *) p1;
328 468 : s2 = *((const uint32 *) p1 + 1);
329 468 : d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330 468 : d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331 468 : return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332 : }
333 :
334 : /*
335 : * comparison routine for bsearch()
336 : * this routine is intended for local code -> combined UTF8
337 : */
338 : static int
339 162 : compare4(const void *p1, const void *p2)
340 : {
341 : uint32 v1,
342 : v2;
343 :
344 162 : v1 = *(const uint32 *) p1;
345 162 : v2 = ((const pg_local_to_utf_combined *) p2)->code;
346 162 : return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347 : }
348 :
349 : /*
350 : * store 32bit character representation into multibyte stream
351 : */
352 : static inline unsigned char *
353 1134 : store_coded_char(unsigned char *dest, uint32 code)
354 : {
355 1134 : if (code & 0xff000000)
356 126 : *dest++ = code >> 24;
357 1134 : if (code & 0x00ff0000)
358 522 : *dest++ = code >> 16;
359 1134 : if (code & 0x0000ff00)
360 1008 : *dest++ = code >> 8;
361 1134 : if (code & 0x000000ff)
362 1134 : *dest++ = code;
363 1134 : return dest;
364 : }
365 :
366 : /*
367 : * Convert a character using a conversion radix tree.
368 : *
369 : * 'l' is the length of the input character in bytes, and b1-b4 are
370 : * the input character's bytes.
371 : */
372 : static inline uint32
373 1980 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
374 : int l,
375 : unsigned char b1,
376 : unsigned char b2,
377 : unsigned char b3,
378 : unsigned char b4)
379 : {
380 1980 : if (l == 4)
381 : {
382 : /* 4-byte code */
383 :
384 : /* check code validity */
385 90 : if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
386 90 : b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
387 90 : b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
388 90 : b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
389 0 : return 0;
390 :
391 : /* perform lookup */
392 90 : if (rt->chars32)
393 : {
394 90 : uint32 idx = rt->b4root;
395 :
396 90 : idx = rt->chars32[b1 + idx - rt->b4_1_lower];
397 90 : idx = rt->chars32[b2 + idx - rt->b4_2_lower];
398 90 : idx = rt->chars32[b3 + idx - rt->b4_3_lower];
399 90 : return rt->chars32[b4 + idx - rt->b4_4_lower];
400 : }
401 : else
402 : {
403 0 : uint16 idx = rt->b4root;
404 :
405 0 : idx = rt->chars16[b1 + idx - rt->b4_1_lower];
406 0 : idx = rt->chars16[b2 + idx - rt->b4_2_lower];
407 0 : idx = rt->chars16[b3 + idx - rt->b4_3_lower];
408 0 : return rt->chars16[b4 + idx - rt->b4_4_lower];
409 : }
410 : }
411 1890 : else if (l == 3)
412 : {
413 : /* 3-byte code */
414 :
415 : /* check code validity */
416 936 : if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
417 288 : b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
418 288 : b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
419 648 : return 0;
420 :
421 : /* perform lookup */
422 288 : if (rt->chars32)
423 : {
424 288 : uint32 idx = rt->b3root;
425 :
426 288 : idx = rt->chars32[b2 + idx - rt->b3_1_lower];
427 288 : idx = rt->chars32[b3 + idx - rt->b3_2_lower];
428 288 : return rt->chars32[b4 + idx - rt->b3_3_lower];
429 : }
430 : else
431 : {
432 0 : uint16 idx = rt->b3root;
433 :
434 0 : idx = rt->chars16[b2 + idx - rt->b3_1_lower];
435 0 : idx = rt->chars16[b3 + idx - rt->b3_2_lower];
436 0 : return rt->chars16[b4 + idx - rt->b3_3_lower];
437 : }
438 : }
439 954 : else if (l == 2)
440 : {
441 : /* 2-byte code */
442 :
443 : /* check code validity - first byte */
444 756 : if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
445 684 : b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
446 72 : return 0;
447 :
448 : /* perform lookup */
449 684 : if (rt->chars32)
450 : {
451 522 : uint32 idx = rt->b2root;
452 :
453 522 : idx = rt->chars32[b3 + idx - rt->b2_1_lower];
454 522 : return rt->chars32[b4 + idx - rt->b2_2_lower];
455 : }
456 : else
457 : {
458 162 : uint16 idx = rt->b2root;
459 :
460 162 : idx = rt->chars16[b3 + idx - rt->b2_1_lower];
461 162 : return rt->chars16[b4 + idx - rt->b2_2_lower];
462 : }
463 : }
464 198 : else if (l == 1)
465 : {
466 : /* 1-byte code */
467 :
468 : /* check code validity - first byte */
469 198 : if (b4 < rt->b1_lower || b4 > rt->b1_upper)
470 0 : return 0;
471 :
472 : /* perform lookup */
473 198 : if (rt->chars32)
474 198 : return rt->chars32[b4 + rt->b1root - rt->b1_lower];
475 : else
476 0 : return rt->chars16[b4 + rt->b1root - rt->b1_lower];
477 : }
478 0 : return 0; /* shouldn't happen */
479 : }
480 :
481 : /*
482 : * UTF8 ---> local code
483 : *
484 : * utf: input string in UTF8 encoding (need not be null-terminated)
485 : * len: length of input string (in bytes)
486 : * iso: pointer to the output area (must be large enough!)
487 : (output string will be null-terminated)
488 : * map: conversion map for single characters
489 : * cmap: conversion map for combined characters
490 : * (optional, pass NULL if none)
491 : * cmapsize: number of entries in the conversion map for combined characters
492 : * (optional, pass 0 if none)
493 : * conv_func: algorithmic encoding conversion function
494 : * (optional, pass NULL if none)
495 : * encoding: PG identifier for the local encoding
496 : *
497 : * For each character, the cmap (if provided) is consulted first; if no match,
498 : * the map is consulted next; if still no match, the conv_func (if provided)
499 : * is applied. An error is raised if no match is found.
500 : *
501 : * See pg_wchar.h for more details about the data structures used here.
502 : *
503 : * Returns the number of input bytes consumed. If noError is true, this can
504 : * be less than 'len'.
505 : */
506 : int
507 2208 : UtfToLocal(const unsigned char *utf, int len,
508 : unsigned char *iso,
509 : const pg_mb_radix_tree *map,
510 : const pg_utf_to_local_combined *cmap, int cmapsize,
511 : utf_local_conversion_func conv_func,
512 : int encoding, bool noError)
513 : {
514 : uint32 iutf;
515 : int l;
516 : const pg_utf_to_local_combined *cp;
517 2208 : const unsigned char *start = utf;
518 :
519 2208 : if (!PG_VALID_ENCODING(encoding))
520 0 : ereport(ERROR,
521 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
522 : errmsg("invalid encoding number: %d", encoding)));
523 :
524 6060 : for (; len > 0; len -= l)
525 : {
526 5472 : unsigned char b1 = 0;
527 5472 : unsigned char b2 = 0;
528 5472 : unsigned char b3 = 0;
529 5472 : unsigned char b4 = 0;
530 :
531 : /* "break" cases all represent errors */
532 5472 : if (*utf == '\0')
533 180 : break;
534 :
535 5292 : l = pg_utf_mblen(utf);
536 5292 : if (len < l)
537 216 : break;
538 :
539 5076 : if (!pg_utf8_islegal(utf, l))
540 360 : break;
541 :
542 4716 : if (l == 1)
543 : {
544 : /* ASCII case is easy, assume it's one-to-one conversion */
545 3312 : *iso++ = *utf++;
546 3312 : continue;
547 : }
548 :
549 : /* collect coded char of length l */
550 1404 : if (l == 2)
551 : {
552 414 : b3 = *utf++;
553 414 : b4 = *utf++;
554 : }
555 990 : else if (l == 3)
556 : {
557 990 : b2 = *utf++;
558 990 : b3 = *utf++;
559 990 : b4 = *utf++;
560 : }
561 0 : else if (l == 4)
562 : {
563 0 : b1 = *utf++;
564 0 : b2 = *utf++;
565 0 : b3 = *utf++;
566 0 : b4 = *utf++;
567 : }
568 : else
569 : {
570 0 : elog(ERROR, "unsupported character length %d", l);
571 : iutf = 0; /* keep compiler quiet */
572 : }
573 1404 : iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
574 :
575 : /* First, try with combined map if possible */
576 1404 : if (cmap && len > l)
577 : {
578 144 : const unsigned char *utf_save = utf;
579 144 : int len_save = len;
580 144 : int l_save = l;
581 :
582 : /* collect next character, same as above */
583 144 : len -= l;
584 :
585 144 : l = pg_utf_mblen(utf);
586 144 : if (len < l)
587 : {
588 : /* need more data to decide if this is a combined char */
589 36 : utf -= l_save;
590 36 : break;
591 : }
592 :
593 108 : if (!pg_utf8_islegal(utf, l))
594 : {
595 0 : if (!noError)
596 0 : report_invalid_encoding(PG_UTF8, (const char *) utf, len);
597 0 : utf -= l_save;
598 0 : break;
599 : }
600 :
601 : /* We assume ASCII character cannot be in combined map */
602 108 : if (l > 1)
603 : {
604 : uint32 iutf2;
605 : uint32 cutf[2];
606 :
607 108 : if (l == 2)
608 : {
609 54 : iutf2 = *utf++ << 8;
610 54 : iutf2 |= *utf++;
611 : }
612 54 : else if (l == 3)
613 : {
614 54 : iutf2 = *utf++ << 16;
615 54 : iutf2 |= *utf++ << 8;
616 54 : iutf2 |= *utf++;
617 : }
618 0 : else if (l == 4)
619 : {
620 0 : iutf2 = *utf++ << 24;
621 0 : iutf2 |= *utf++ << 16;
622 0 : iutf2 |= *utf++ << 8;
623 0 : iutf2 |= *utf++;
624 : }
625 : else
626 : {
627 0 : elog(ERROR, "unsupported character length %d", l);
628 : iutf2 = 0; /* keep compiler quiet */
629 : }
630 :
631 108 : cutf[0] = iutf;
632 108 : cutf[1] = iutf2;
633 :
634 108 : cp = bsearch(cutf, cmap, cmapsize,
635 : sizeof(pg_utf_to_local_combined), compare3);
636 :
637 108 : if (cp)
638 : {
639 18 : iso = store_coded_char(iso, cp->code);
640 18 : continue;
641 : }
642 : }
643 :
644 : /* fail, so back up to reprocess second character next time */
645 90 : utf = utf_save;
646 90 : len = len_save;
647 90 : l = l_save;
648 : }
649 :
650 : /* Now check ordinary map */
651 1350 : if (map)
652 : {
653 1350 : uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
654 :
655 1350 : if (converted)
656 : {
657 450 : iso = store_coded_char(iso, converted);
658 450 : continue;
659 : }
660 : }
661 :
662 : /* if there's a conversion function, try that */
663 900 : if (conv_func)
664 : {
665 72 : uint32 converted = (*conv_func) (iutf);
666 :
667 72 : if (converted)
668 : {
669 72 : iso = store_coded_char(iso, converted);
670 72 : continue;
671 : }
672 : }
673 :
674 : /* failed to translate this character */
675 828 : utf -= l;
676 828 : if (noError)
677 414 : break;
678 414 : report_untranslatable_char(PG_UTF8, encoding,
679 : (const char *) utf, len);
680 : }
681 :
682 : /* if we broke out of loop early, must be invalid input */
683 1794 : if (len > 0 && !noError)
684 396 : report_invalid_encoding(PG_UTF8, (const char *) utf, len);
685 :
686 1398 : *iso = '\0';
687 :
688 1398 : return utf - start;
689 : }
690 :
691 : /*
692 : * local code ---> UTF8
693 : *
694 : * iso: input string in local encoding (need not be null-terminated)
695 : * len: length of input string (in bytes)
696 : * utf: pointer to the output area (must be large enough!)
697 : (output string will be null-terminated)
698 : * map: conversion map for single characters
699 : * cmap: conversion map for combined characters
700 : * (optional, pass NULL if none)
701 : * cmapsize: number of entries in the conversion map for combined characters
702 : * (optional, pass 0 if none)
703 : * conv_func: algorithmic encoding conversion function
704 : * (optional, pass NULL if none)
705 : * encoding: PG identifier for the local encoding
706 : *
707 : * For each character, the map is consulted first; if no match, the cmap
708 : * (if provided) is consulted next; if still no match, the conv_func
709 : * (if provided) is applied. An error is raised if no match is found.
710 : *
711 : * See pg_wchar.h for more details about the data structures used here.
712 : *
713 : * Returns the number of input bytes consumed. If noError is true, this can
714 : * be less than 'len'.
715 : */
716 : int
717 1236 : LocalToUtf(const unsigned char *iso, int len,
718 : unsigned char *utf,
719 : const pg_mb_radix_tree *map,
720 : const pg_local_to_utf_combined *cmap, int cmapsize,
721 : utf_local_conversion_func conv_func,
722 : int encoding,
723 : bool noError)
724 : {
725 : uint32 iiso;
726 : int l;
727 : const pg_local_to_utf_combined *cp;
728 1236 : const unsigned char *start = iso;
729 :
730 1236 : if (!PG_VALID_ENCODING(encoding))
731 0 : ereport(ERROR,
732 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
733 : errmsg("invalid encoding number: %d", encoding)));
734 :
735 4854 : for (; len > 0; len -= l)
736 : {
737 4374 : unsigned char b1 = 0;
738 4374 : unsigned char b2 = 0;
739 4374 : unsigned char b3 = 0;
740 4374 : unsigned char b4 = 0;
741 :
742 : /* "break" cases all represent errors */
743 4374 : if (*iso == '\0')
744 324 : break;
745 :
746 4050 : if (!IS_HIGHBIT_SET(*iso))
747 : {
748 : /* ASCII case is easy, assume it's one-to-one conversion */
749 3060 : *utf++ = *iso++;
750 3060 : l = 1;
751 3060 : continue;
752 : }
753 :
754 990 : l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
755 990 : if (l < 0)
756 360 : break;
757 :
758 : /* collect coded char of length l */
759 630 : if (l == 1)
760 198 : b4 = *iso++;
761 432 : else if (l == 2)
762 : {
763 342 : b3 = *iso++;
764 342 : b4 = *iso++;
765 : }
766 90 : else if (l == 3)
767 : {
768 0 : b2 = *iso++;
769 0 : b3 = *iso++;
770 0 : b4 = *iso++;
771 : }
772 90 : else if (l == 4)
773 : {
774 90 : b1 = *iso++;
775 90 : b2 = *iso++;
776 90 : b3 = *iso++;
777 90 : b4 = *iso++;
778 : }
779 : else
780 : {
781 0 : elog(ERROR, "unsupported character length %d", l);
782 : iiso = 0; /* keep compiler quiet */
783 : }
784 630 : iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
785 :
786 630 : if (map)
787 : {
788 630 : uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
789 :
790 630 : if (converted)
791 : {
792 468 : utf = store_coded_char(utf, converted);
793 468 : continue;
794 : }
795 :
796 : /* If there's a combined character map, try that */
797 162 : if (cmap)
798 : {
799 36 : cp = bsearch(&iiso, cmap, cmapsize,
800 : sizeof(pg_local_to_utf_combined), compare4);
801 :
802 36 : if (cp)
803 : {
804 36 : utf = store_coded_char(utf, cp->utf1);
805 36 : utf = store_coded_char(utf, cp->utf2);
806 36 : continue;
807 : }
808 : }
809 : }
810 :
811 : /* if there's a conversion function, try that */
812 126 : if (conv_func)
813 : {
814 90 : uint32 converted = (*conv_func) (iiso);
815 :
816 90 : if (converted)
817 : {
818 54 : utf = store_coded_char(utf, converted);
819 54 : continue;
820 : }
821 : }
822 :
823 : /* failed to translate this character */
824 72 : iso -= l;
825 72 : if (noError)
826 36 : break;
827 36 : report_untranslatable_char(encoding, PG_UTF8,
828 : (const char *) iso, len);
829 : }
830 :
831 : /* if we broke out of loop early, must be invalid input */
832 1200 : if (len > 0 && !noError)
833 342 : report_invalid_encoding(encoding, (const char *) iso, len);
834 :
835 858 : *utf = '\0';
836 :
837 858 : return iso - start;
838 : }
|