Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ISO8859_1 <--> UTF8
4 : *
5 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 :
14 : #include "postgres.h"
15 : #include "fmgr.h"
16 : #include "mb/pg_wchar.h"
17 :
/* Module magic block; the extended form also records this module's name and version string. */
18 113 : PG_MODULE_MAGIC_EXT(
19 : .name = "utf8_and_iso8859_1",
20 : .version = PG_VERSION
21 : );
22 :
/* Function-manager (version 1) info declarations for the two conversion entry points defined in this file. */
23 24 : PG_FUNCTION_INFO_V1(iso8859_1_to_utf8);
24 102 : PG_FUNCTION_INFO_V1(utf8_to_iso8859_1);
25 :
26 : /* ----------
27 : * conv_proc(
28 : * INTEGER, -- source encoding id
29 : * INTEGER, -- destination encoding id
30 : * CSTRING, -- source string (null terminated C string)
31 : * CSTRING, -- destination string (null terminated C string)
32 : * INTEGER, -- source string length
33 : * BOOL -- if true, don't throw an error if conversion fails
34 : * ) returns INTEGER;
35 : *
36 : * Returns the number of bytes successfully converted.
37 : * ----------
38 : */
39 :
40 : Datum
41 65 : iso8859_1_to_utf8(PG_FUNCTION_ARGS)
42 : {
43 65 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
44 65 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
45 65 : int len = PG_GETARG_INT32(4);
46 65 : bool noError = PG_GETARG_BOOL(5);
47 65 : unsigned char *start = src;
48 : unsigned short c;
49 :
50 65 : CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN1, PG_UTF8);
51 :
52 2937 : while (len > 0)
53 : {
54 2872 : c = *src;
55 2872 : if (c == 0)
56 : {
57 0 : if (noError)
58 0 : break;
59 0 : report_invalid_encoding(PG_LATIN1, (const char *) src, len);
60 : }
61 2872 : if (!IS_HIGHBIT_SET(c))
62 2854 : *dest++ = c;
63 : else
64 : {
65 18 : *dest++ = (c >> 6) | 0xc0;
66 18 : *dest++ = (c & 0x003f) | HIGHBIT;
67 : }
68 2872 : src++;
69 2872 : len--;
70 : }
71 65 : *dest = '\0';
72 :
73 65 : PG_RETURN_INT32(src - start);
74 : }
75 :
76 : Datum
77 383 : utf8_to_iso8859_1(PG_FUNCTION_ARGS)
78 : {
79 383 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
80 383 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
81 383 : int len = PG_GETARG_INT32(4);
82 383 : bool noError = PG_GETARG_BOOL(5);
83 383 : unsigned char *start = src;
84 : unsigned short c,
85 : c1;
86 :
87 383 : CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_LATIN1);
88 :
89 2252 : while (len > 0)
90 : {
91 2067 : c = *src;
92 2067 : if (c == 0)
93 : {
94 18 : if (noError)
95 9 : break;
96 9 : report_invalid_encoding(PG_UTF8, (const char *) src, len);
97 : }
98 : /* fast path for ASCII-subset characters */
99 2049 : if (!IS_HIGHBIT_SET(c))
100 : {
101 1850 : *dest++ = c;
102 1850 : src++;
103 1850 : len--;
104 : }
105 : else
106 : {
107 199 : int l = pg_utf_mblen(src);
108 :
109 199 : if (l > len || !pg_utf8_islegal(src, l))
110 : {
111 54 : if (noError)
112 27 : break;
113 27 : report_invalid_encoding(PG_UTF8, (const char *) src, len);
114 : }
115 145 : if (l != 2)
116 : {
117 108 : if (noError)
118 54 : break;
119 54 : report_untranslatable_char(PG_UTF8, PG_LATIN1,
120 : (const char *) src, len);
121 : }
122 37 : c1 = src[1] & 0x3f;
123 37 : c = ((c & 0x1f) << 6) | c1;
124 37 : if (c >= 0x80 && c <= 0xff)
125 : {
126 19 : *dest++ = (unsigned char) c;
127 19 : src += 2;
128 19 : len -= 2;
129 : }
130 : else
131 : {
132 18 : if (noError)
133 9 : break;
134 9 : report_untranslatable_char(PG_UTF8, PG_LATIN1,
135 : (const char *) src, len);
136 : }
137 : }
138 : }
139 284 : *dest = '\0';
140 :
141 284 : PG_RETURN_INT32(src - start);
142 : }
|