Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ISO8859_1 <--> UTF8
4 : *
5 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 :
14 : #include "postgres.h"
15 : #include "fmgr.h"
16 : #include "mb/pg_wchar.h"
17 :
18 119 : PG_MODULE_MAGIC_EXT( /* module magic block; carries the name and version fields set below */
19 : .name = "utf8_and_iso8859_1",
20 : .version = PG_VERSION
21 : );
22 :
23 30 : PG_FUNCTION_INFO_V1(iso8859_1_to_utf8); /* function-info declaration for the LATIN1 -> UTF8 converter in this file */
24 105 : PG_FUNCTION_INFO_V1(utf8_to_iso8859_1); /* function-info declaration for the UTF8 -> LATIN1 converter in this file */
25 :
26 : /* ----------
27 : * conv_proc(
28 : * INTEGER, -- source encoding id
29 : * INTEGER, -- destination encoding id
30 : * CSTRING, -- source string (null terminated C string)
31 : * CSTRING, -- destination string (null terminated C string)
32 : * INTEGER, -- source string length
33 : * BOOL -- if true, don't throw an error if conversion fails
34 : * ) returns INTEGER;
35 : *
36 : * Returns the number of bytes successfully converted.
37 : * ----------
38 : */
39 :
40 : Datum
41 80 : iso8859_1_to_utf8(PG_FUNCTION_ARGS)
42 : {
43 80 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
44 80 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
45 80 : int len = PG_GETARG_INT32(4);
46 80 : bool noError = PG_GETARG_BOOL(5);
47 80 : unsigned char *start = src;
48 : unsigned short c;
49 :
50 80 : CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN1, PG_UTF8);
51 :
52 3134 : while (len > 0)
53 : {
54 3054 : c = *src;
55 3054 : if (c == 0)
56 : {
57 0 : if (noError)
58 0 : break;
59 0 : report_invalid_encoding(PG_LATIN1, (const char *) src, len);
60 : }
61 3054 : if (!IS_HIGHBIT_SET(c))
62 3030 : *dest++ = c;
63 : else
64 : {
65 24 : *dest++ = (c >> 6) | 0xc0;
66 24 : *dest++ = (c & 0x003f) | HIGHBIT;
67 : }
68 3054 : src++;
69 3054 : len--;
70 : }
71 80 : *dest = '\0';
72 :
73 80 : PG_RETURN_INT32(src - start);
74 : }
75 :
76 : Datum
77 458 : utf8_to_iso8859_1(PG_FUNCTION_ARGS)
78 : {
79 458 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
80 458 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
81 458 : int len = PG_GETARG_INT32(4);
82 458 : bool noError = PG_GETARG_BOOL(5);
83 458 : unsigned char *start = src;
84 : unsigned short c,
85 : c1;
86 :
87 458 : CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_LATIN1);
88 :
89 2450 : while (len > 0)
90 : {
91 2256 : c = *src;
92 2256 : if (c == 0)
93 : {
94 24 : if (noError)
95 12 : break;
96 12 : report_invalid_encoding(PG_UTF8, (const char *) src, len);
97 : }
98 : /* fast path for ASCII-subset characters */
99 2232 : if (!IS_HIGHBIT_SET(c))
100 : {
101 1967 : *dest++ = c;
102 1967 : src++;
103 1967 : len--;
104 : }
105 : else
106 : {
107 265 : int l = pg_utf_mblen(src);
108 :
109 265 : if (l > len || !pg_utf8_islegal(src, l))
110 : {
111 72 : if (noError)
112 36 : break;
113 36 : report_invalid_encoding(PG_UTF8, (const char *) src, len);
114 : }
115 193 : if (l != 2)
116 : {
117 144 : if (noError)
118 72 : break;
119 72 : report_untranslatable_char(PG_UTF8, PG_LATIN1,
120 : (const char *) src, len);
121 : }
122 49 : c1 = src[1] & 0x3f;
123 49 : c = ((c & 0x1f) << 6) | c1;
124 49 : if (c >= 0x80 && c <= 0xff)
125 : {
126 25 : *dest++ = (unsigned char) c;
127 25 : src += 2;
128 25 : len -= 2;
129 : }
130 : else
131 : {
132 24 : if (noError)
133 12 : break;
134 12 : report_untranslatable_char(PG_UTF8, PG_LATIN1,
135 : (const char *) src, len);
136 : }
137 : }
138 : }
139 326 : *dest = '\0';
140 :
141 326 : PG_RETURN_INT32(src - start);
142 : }
|