Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * unicode_case.c
3 : * Unicode case mapping and case conversion.
4 : *
5 : * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
6 : *
7 : * IDENTIFICATION
8 : * src/common/unicode_case.c
9 : *
10 : *-------------------------------------------------------------------------
11 : */
12 : #ifndef FRONTEND
13 : #include "postgres.h"
14 : #else
15 : #include "postgres_fe.h"
16 : #endif
17 :
18 : #include "common/unicode_case.h"
19 : #include "common/unicode_case_table.h"
20 : #include "common/unicode_category.h"
21 : #include "mb/pg_wchar.h"
22 :
23 : static const pg_case_map *find_case_map(pg_wchar ucs);
24 : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
25 : CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
26 : void *wbstate);
27 : static bool check_special_conditions(int conditions, const char *str,
28 : size_t len, size_t offset);
29 :
30 : pg_wchar
31 528 : unicode_lowercase_simple(pg_wchar code)
32 : {
33 528 : const pg_case_map *map = find_case_map(code);
34 :
35 528 : return map ? map->simplemap[CaseLower] : code;
36 : }
37 :
38 : pg_wchar
39 0 : unicode_titlecase_simple(pg_wchar code)
40 : {
41 0 : const pg_case_map *map = find_case_map(code);
42 :
43 0 : return map ? map->simplemap[CaseTitle] : code;
44 : }
45 :
46 : pg_wchar
47 528 : unicode_uppercase_simple(pg_wchar code)
48 : {
49 528 : const pg_case_map *map = find_case_map(code);
50 :
51 528 : return map ? map->simplemap[CaseUpper] : code;
52 : }
53 :
54 : /*
55 : * unicode_strlower()
56 : *
57 : * Convert src to lowercase, and return the result length (not including
58 : * terminating NUL).
59 : *
60 : * String src must be encoded in UTF-8. If srclen < 0, src must be
61 : * NUL-terminated.
62 : *
63 : * Result string is stored in dst, truncating if larger than dstsize. If
64 : * dstsize is greater than the result length, dst will be NUL-terminated;
65 : * otherwise not.
66 : *
67 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
68 : * required buffer size before allocating.
69 : *
70 : * If full is true, use special case mappings if available and if the
71 : * conditions are satisfied.
72 : */
73 : size_t
74 11922 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
75 : bool full)
76 : {
77 11922 : return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
78 : NULL);
79 : }
80 :
81 : /*
82 : * unicode_strtitle()
83 : *
84 : * Convert src to titlecase, and return the result length (not including
85 : * terminating NUL).
86 : *
87 : * String src must be encoded in UTF-8. If srclen < 0, src must be
88 : * NUL-terminated.
89 : *
90 : * Result string is stored in dst, truncating if larger than dstsize. If
91 : * dstsize is greater than the result length, dst will be NUL-terminated;
92 : * otherwise not.
93 : *
94 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
95 : * required buffer size before allocating.
96 : *
97 : * If full is true, use special case mappings if available and if the
98 : * conditions are satisfied. Otherwise, use only simple mappings and use
99 : * uppercase instead of titlecase.
100 : *
101 : * Titlecasing requires knowledge about word boundaries, which is provided by
102 : * the callback wbnext. A word boundary is the offset of the start of a word
103 : * or the offset of the character immediately following a word.
104 : *
105 : * The caller is expected to initialize and free the callback state
106 : * wbstate. The callback should first return offset 0 for the first boundary;
107 : * then the offset of each subsequent word boundary; then the total length of
108 : * the string to indicate the final boundary.
109 : */
110 : size_t
111 170 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
112 : bool full, WordBoundaryNext wbnext, void *wbstate)
113 : {
114 170 : return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
115 : wbstate);
116 : }
117 :
118 : /*
119 : * unicode_strupper()
120 : *
121 : * Convert src to uppercase, and return the result length (not including
122 : * terminating NUL).
123 : *
124 : * String src must be encoded in UTF-8. If srclen < 0, src must be
125 : * NUL-terminated.
126 : *
127 : * Result string is stored in dst, truncating if larger than dstsize. If
128 : * dstsize is greater than the result length, dst will be NUL-terminated;
129 : * otherwise not.
130 : *
131 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
132 : * required buffer size before allocating.
133 : *
134 : * If full is true, use special case mappings if available and if the
135 : * conditions are satisfied.
136 : */
137 : size_t
138 316858 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
139 : bool full)
140 : {
141 316858 : return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
142 : NULL);
143 : }
144 :
145 : /*
146 : * Implement Unicode Default Case Conversion algorithm.
147 : *
148 : * If str_casekind is CaseLower or CaseUpper, map each character in the string
149 : * for which a mapping is available.
150 : *
151 : * If str_casekind is CaseTitle, maps characters found on a word boundary to
152 : * titlecase (or uppercase if full is false) and other characters to
153 : * lowercase. NB: does not currently implement the Unicode behavior in which
154 : * the word boundary is adjusted to the next Cased character. That behavior
155 : * could be implemented as an option, but it doesn't match the default
156 : * behavior of ICU, nor does it match the documented behavior of INITCAP().
157 : *
158 : * If full is true, use special mappings for relevant characters, which can
159 : * map a single codepoint to multiple codepoints, or depend on conditions.
160 : */
161 : static size_t
162 328950 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
163 : CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
164 : void *wbstate)
165 : {
166 : /* character CaseKind varies while titlecasing */
167 328950 : CaseKind chr_casekind = str_casekind;
168 328950 : size_t srcoff = 0;
169 328950 : size_t result_len = 0;
170 328950 : size_t boundary = 0;
171 :
172 : Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
173 : (str_casekind != CaseTitle && !wbnext && !wbstate));
174 :
175 328950 : if (str_casekind == CaseTitle)
176 : {
177 170 : boundary = wbnext(wbstate);
178 : Assert(boundary == 0); /* start of text is always a boundary */
179 : }
180 :
181 978330 : while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
182 : {
183 649380 : pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
184 649380 : int u1len = unicode_utf8len(u1);
185 649380 : const pg_case_map *casemap = find_case_map(u1);
186 649380 : const pg_special_case *special = NULL;
187 :
188 649380 : if (str_casekind == CaseTitle)
189 : {
190 1314 : if (srcoff == boundary)
191 : {
192 510 : chr_casekind = full ? CaseTitle : CaseUpper;
193 510 : boundary = wbnext(wbstate);
194 : }
195 : else
196 804 : chr_casekind = CaseLower;
197 : }
198 :
199 : /*
200 : * Find special case that matches the conditions, if any.
201 : *
202 : * Note: only a single special mapping per codepoint is currently
203 : * supported, though Unicode allows for multiple special mappings for
204 : * a single codepoint.
205 : */
206 649380 : if (full && casemap && casemap->special_case)
207 : {
208 126 : int16 conditions = casemap->special_case->conditions;
209 :
210 : Assert(casemap->special_case->codepoint == u1);
211 126 : if (check_special_conditions(conditions, src, srclen, srcoff))
212 96 : special = casemap->special_case;
213 : }
214 :
215 : /* perform mapping, update result_len, and write to dst */
216 649380 : if (special)
217 : {
218 228 : for (int i = 0; i < MAX_CASE_EXPANSION; i++)
219 : {
220 228 : pg_wchar u2 = special->map[chr_casekind][i];
221 228 : size_t u2len = unicode_utf8len(u2);
222 :
223 228 : if (u2 == '\0')
224 96 : break;
225 :
226 132 : if (result_len + u2len <= dstsize)
227 132 : unicode_to_utf8(u2, (unsigned char *) dst + result_len);
228 :
229 132 : result_len += u2len;
230 : }
231 : }
232 649284 : else if (casemap)
233 : {
234 649236 : pg_wchar u2 = casemap->simplemap[chr_casekind];
235 649236 : pg_wchar u2len = unicode_utf8len(u2);
236 :
237 649236 : if (result_len + u2len <= dstsize)
238 649188 : unicode_to_utf8(u2, (unsigned char *) dst + result_len);
239 :
240 649236 : result_len += u2len;
241 : }
242 : else
243 : {
244 : /* no mapping; copy bytes from src */
245 48 : if (result_len + u1len <= dstsize)
246 48 : memcpy(dst + result_len, src + srcoff, u1len);
247 :
248 48 : result_len += u1len;
249 : }
250 :
251 649380 : srcoff += u1len;
252 : }
253 :
254 328950 : if (result_len < dstsize)
255 328878 : dst[result_len] = '\0';
256 :
257 328950 : return result_len;
258 : }
259 :
260 : /*
261 : * Check that the condition matches Final_Sigma, described in Unicode Table
262 : * 3-17. The character at the given offset must be directly preceded by a
263 : * Cased character, and must not be directly followed by a Cased character.
264 : *
265 : * Case_Ignorable characters are ignored. NB: some characters may be both
266 : * Cased and Case_Ignorable, in which case they are ignored.
267 : */
268 : static bool
269 54 : check_final_sigma(const unsigned char *str, size_t len, size_t offset)
270 : {
271 : /* the start of the string is not preceded by a Cased character */
272 54 : if (offset == 0)
273 6 : return false;
274 :
275 : /* iterate backwards, looking for Cased character */
276 138 : for (int i = offset - 1; i >= 0; i--)
277 : {
278 138 : if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
279 : {
280 72 : pg_wchar curr = utf8_to_unicode(str + i);
281 :
282 72 : if (pg_u_prop_case_ignorable(curr))
283 24 : continue;
284 48 : else if (pg_u_prop_cased(curr))
285 42 : break;
286 : else
287 6 : return false;
288 : }
289 66 : else if ((str[i] & 0xC0) == 0x80)
290 66 : continue;
291 :
292 : Assert(false); /* invalid UTF-8 */
293 : }
294 :
295 : /* end of string is not followed by a Cased character */
296 42 : if (offset == len)
297 0 : return true;
298 :
299 : /* iterate forwards, looking for Cased character */
300 132 : for (int i = offset + 1; i < len && str[i] != '\0'; i++)
301 : {
302 114 : if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
303 : {
304 48 : pg_wchar curr = utf8_to_unicode(str + i);
305 :
306 48 : if (pg_u_prop_case_ignorable(curr))
307 24 : continue;
308 24 : else if (pg_u_prop_cased(curr))
309 18 : return false;
310 : else
311 6 : break;
312 : }
313 66 : else if ((str[i] & 0xC0) == 0x80)
314 66 : continue;
315 :
316 : Assert(false); /* invalid UTF-8 */
317 : }
318 :
319 24 : return true;
320 : }
321 :
322 : static bool
323 126 : check_special_conditions(int conditions, const char *str, size_t len,
324 : size_t offset)
325 : {
326 126 : if (conditions == 0)
327 72 : return true;
328 54 : else if (conditions == PG_U_FINAL_SIGMA)
329 54 : return check_final_sigma((unsigned char *) str, len, offset);
330 :
331 : /* no other conditions supported */
332 : Assert(false);
333 0 : return false;
334 : }
335 :
336 : /* find entry in simple case map, if any */
337 : static const pg_case_map *
338 650436 : find_case_map(pg_wchar ucs)
339 : {
340 : int min;
341 : int mid;
342 : int max;
343 :
344 : /* all chars <= 0x80 are stored in array for fast lookup */
345 : Assert(lengthof(case_map) >= 0x80);
346 650436 : if (ucs < 0x80)
347 : {
348 648294 : const pg_case_map *map = &case_map[ucs];
349 :
350 : Assert(map->codepoint == ucs);
351 648294 : return map;
352 : }
353 :
354 : /* otherwise, binary search */
355 2142 : min = 0x80;
356 2142 : max = lengthof(case_map) - 1;
357 19404 : while (max >= min)
358 : {
359 19356 : mid = (min + max) / 2;
360 19356 : if (ucs > case_map[mid].codepoint)
361 6330 : min = mid + 1;
362 13026 : else if (ucs < case_map[mid].codepoint)
363 10932 : max = mid - 1;
364 : else
365 2094 : return &case_map[mid];
366 : }
367 :
368 48 : return NULL;
369 : }
|