Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * unicode_case.c
3 : * Unicode case mapping and case conversion.
4 : *
5 : * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
6 : *
7 : * IDENTIFICATION
8 : * src/common/unicode_case.c
9 : *
10 : *-------------------------------------------------------------------------
11 : */
12 : #ifndef FRONTEND
13 : #include "postgres.h"
14 : #else
15 : #include "postgres_fe.h"
16 : #endif
17 :
18 : #include "common/unicode_case.h"
19 : #include "common/unicode_case_table.h"
20 : #include "mb/pg_wchar.h"
21 :
22 : static const pg_case_map *find_case_map(pg_wchar ucs);
23 : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
24 : CaseKind str_casekind, WordBoundaryNext wbnext,
25 : void *wbstate);
26 :
27 : pg_wchar
28 372 : unicode_lowercase_simple(pg_wchar code)
29 : {
30 372 : const pg_case_map *map = find_case_map(code);
31 :
32 372 : return map ? map->simplemap[CaseLower] : code;
33 : }
34 :
35 : pg_wchar
36 0 : unicode_titlecase_simple(pg_wchar code)
37 : {
38 0 : const pg_case_map *map = find_case_map(code);
39 :
40 0 : return map ? map->simplemap[CaseTitle] : code;
41 : }
42 :
43 : pg_wchar
44 372 : unicode_uppercase_simple(pg_wchar code)
45 : {
46 372 : const pg_case_map *map = find_case_map(code);
47 :
48 372 : return map ? map->simplemap[CaseUpper] : code;
49 : }
50 :
51 : /*
52 : * unicode_strlower()
53 : *
54 : * Convert src to lowercase, and return the result length (not including
55 : * terminating NUL).
56 : *
57 : * String src must be encoded in UTF-8. If srclen < 0, src must be
58 : * NUL-terminated.
59 : *
60 : * Result string is stored in dst, truncating if larger than dstsize. If
61 : * dstsize is greater than the result length, dst will be NUL-terminated;
62 : * otherwise not.
63 : *
64 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
65 : * required buffer size before allocating.
66 : */
67 : size_t
68 2046 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
69 : {
70 2046 : return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
71 : }
72 :
73 : /*
74 : * unicode_strtitle()
75 : *
76 : * Convert src to titlecase, and return the result length (not including
77 : * terminating NUL).
78 : *
79 : * String src must be encoded in UTF-8. If srclen < 0, src must be
80 : * NUL-terminated.
81 : *
82 : * Result string is stored in dst, truncating if larger than dstsize. If
83 : * dstsize is greater than the result length, dst will be NUL-terminated;
84 : * otherwise not.
85 : *
86 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
87 : * required buffer size before allocating.
88 : *
89 : * Titlecasing requires knowledge about word boundaries, which is provided by
90 : * the callback wbnext. A word boundary is the offset of the start of a word
91 : * or the offset of the character immediately following a word.
92 : *
93 : * The caller is expected to initialize and free the callback state
94 : * wbstate. The callback should first return offset 0 for the first boundary;
95 : * then the offset of each subsequent word boundary; then the total length of
96 : * the string to indicate the final boundary.
97 : */
98 : size_t
99 86 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
100 : WordBoundaryNext wbnext, void *wbstate)
101 : {
102 86 : return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
103 : wbstate);
104 : }
105 :
106 : /*
107 : * unicode_strupper()
108 : *
109 : * Convert src to uppercase, and return the result length (not including
110 : * terminating NUL).
111 : *
112 : * String src must be encoded in UTF-8. If srclen < 0, src must be
113 : * NUL-terminated.
114 : *
115 : * Result string is stored in dst, truncating if larger than dstsize. If
116 : * dstsize is greater than the result length, dst will be NUL-terminated;
117 : * otherwise not.
118 : *
119 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
120 : * required buffer size before allocating.
121 : */
122 : size_t
123 316786 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
124 : {
125 316786 : return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
126 : }
127 :
128 : /*
129 : * If str_casekind is CaseLower or CaseUpper, map each character in the string
130 : * for which a mapping is available.
131 : *
132 : * If str_casekind is CaseTitle, maps characters found on a word boundary to
133 : * uppercase and other characters to lowercase.
134 : */
135 : static size_t
136 318918 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
137 : CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
138 : {
139 : /* character CaseKind varies while titlecasing */
140 318918 : CaseKind chr_casekind = str_casekind;
141 318918 : size_t srcoff = 0;
142 318918 : size_t result_len = 0;
143 318918 : size_t boundary = 0;
144 :
145 : Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
146 : (str_casekind != CaseTitle && !wbnext && !wbstate));
147 :
148 318918 : if (str_casekind == CaseTitle)
149 : {
150 86 : boundary = wbnext(wbstate);
151 : Assert(boundary == 0); /* start of text is always a boundary */
152 : }
153 :
154 919206 : while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
155 : {
156 600288 : pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
157 600288 : int u1len = unicode_utf8len(u1);
158 600288 : const pg_case_map *casemap = find_case_map(u1);
159 :
160 600288 : if (str_casekind == CaseTitle)
161 : {
162 666 : if (srcoff == boundary)
163 : {
164 258 : chr_casekind = CaseUpper;
165 258 : boundary = wbnext(wbstate);
166 : }
167 : else
168 408 : chr_casekind = CaseLower;
169 : }
170 :
171 : /* perform mapping, update result_len, and write to dst */
172 600288 : if (casemap)
173 : {
174 600192 : pg_wchar u2 = casemap->simplemap[chr_casekind];
175 600192 : pg_wchar u2len = unicode_utf8len(u2);
176 :
177 600192 : if (result_len + u2len <= dstsize)
178 600168 : unicode_to_utf8(u2, (unsigned char *) dst + result_len);
179 :
180 600192 : result_len += u2len;
181 : }
182 : else
183 : {
184 : /* no mapping; copy bytes from src */
185 96 : if (result_len + u1len <= dstsize)
186 96 : memcpy(dst + result_len, src + srcoff, u1len);
187 :
188 96 : result_len += u1len;
189 : }
190 :
191 600288 : srcoff += u1len;
192 : }
193 :
194 318918 : if (result_len < dstsize)
195 318882 : dst[result_len] = '\0';
196 :
197 318918 : return result_len;
198 : }
199 :
200 : /* find entry in simple case map, if any */
201 : static const pg_case_map *
202 601032 : find_case_map(pg_wchar ucs)
203 : {
204 : int min;
205 : int mid;
206 : int max;
207 :
208 : /* all chars <= 0x80 are stored in array for fast lookup */
209 : Assert(lengthof(case_map) >= 0x80);
210 601032 : if (ucs < 0x80)
211 : {
212 600012 : const pg_case_map *map = &case_map[ucs];
213 :
214 : Assert(map->codepoint == ucs);
215 600012 : return map;
216 : }
217 :
218 : /* otherwise, binary search */
219 1020 : min = 0x80;
220 1020 : max = lengthof(case_map) - 1;
221 10536 : while (max >= min)
222 : {
223 10440 : mid = (min + max) / 2;
224 10440 : if (ucs > case_map[mid].codepoint)
225 3150 : min = mid + 1;
226 7290 : else if (ucs < case_map[mid].codepoint)
227 6366 : max = mid - 1;
228 : else
229 924 : return &case_map[mid];
230 : }
231 :
232 96 : return NULL;
233 : }
|