Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * unicode_case.c
3 : * Unicode case mapping and case conversion.
4 : *
5 : * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
6 : *
7 : * IDENTIFICATION
8 : * src/common/unicode_case.c
9 : *
10 : *-------------------------------------------------------------------------
11 : */
12 : #ifndef FRONTEND
13 : #include "postgres.h"
14 : #else
15 : #include "postgres_fe.h"
16 : #endif
17 :
18 : #include "common/unicode_case.h"
19 : #include "common/unicode_case_table.h"
20 : #include "common/unicode_category.h"
21 : #include "mb/pg_wchar.h"
22 :
23 : static const pg_case_map *find_case_map(pg_wchar ucs);
24 : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
25 : CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
26 : void *wbstate);
27 : static bool check_special_conditions(int conditions, const char *str,
28 : size_t len, size_t offset);
29 :
30 : pg_wchar
31 528 : unicode_lowercase_simple(pg_wchar code)
32 : {
33 528 : const pg_case_map *map = find_case_map(code);
34 :
35 528 : return map ? map->simplemap[CaseLower] : code;
36 : }
37 :
38 : pg_wchar
39 0 : unicode_titlecase_simple(pg_wchar code)
40 : {
41 0 : const pg_case_map *map = find_case_map(code);
42 :
43 0 : return map ? map->simplemap[CaseTitle] : code;
44 : }
45 :
46 : pg_wchar
47 528 : unicode_uppercase_simple(pg_wchar code)
48 : {
49 528 : const pg_case_map *map = find_case_map(code);
50 :
51 528 : return map ? map->simplemap[CaseUpper] : code;
52 : }
53 :
54 : pg_wchar
55 0 : unicode_casefold_simple(pg_wchar code)
56 : {
57 0 : const pg_case_map *map = find_case_map(code);
58 :
59 0 : return map ? map->simplemap[CaseFold] : code;
60 : }
61 :
62 : /*
63 : * unicode_strlower()
64 : *
65 : * Convert src to lowercase, and return the result length (not including
66 : * terminating NUL).
67 : *
68 : * String src must be encoded in UTF-8. If srclen < 0, src must be
69 : * NUL-terminated.
70 : *
71 : * Result string is stored in dst, truncating if larger than dstsize. If
72 : * dstsize is greater than the result length, dst will be NUL-terminated;
73 : * otherwise not.
74 : *
75 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
76 : * required buffer size before allocating.
77 : *
78 : * If full is true, use special case mappings if available and if the
79 : * conditions are satisfied.
80 : */
81 : size_t
82 11946 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
83 : bool full)
84 : {
85 11946 : return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
86 : NULL);
87 : }
88 :
89 : /*
90 : * unicode_strtitle()
91 : *
92 : * Convert src to titlecase, and return the result length (not including
93 : * terminating NUL).
94 : *
95 : * String src must be encoded in UTF-8. If srclen < 0, src must be
96 : * NUL-terminated.
97 : *
98 : * Result string is stored in dst, truncating if larger than dstsize. If
99 : * dstsize is greater than the result length, dst will be NUL-terminated;
100 : * otherwise not.
101 : *
102 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
103 : * required buffer size before allocating.
104 : *
105 : * If full is true, use special case mappings if available and if the
106 : * conditions are satisfied. Otherwise, use only simple mappings and use
107 : * uppercase instead of titlecase.
108 : *
109 : * Titlecasing requires knowledge about word boundaries, which is provided by
110 : * the callback wbnext. A word boundary is the offset of the start of a word
111 : * or the offset of the character immediately following a word.
112 : *
113 : * The caller is expected to initialize and free the callback state
114 : * wbstate. The callback should first return offset 0 for the first boundary;
115 : * then the offset of each subsequent word boundary; then the total length of
116 : * the string to indicate the final boundary.
117 : */
118 : size_t
119 170 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
120 : bool full, WordBoundaryNext wbnext, void *wbstate)
121 : {
122 170 : return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
123 : wbstate);
124 : }
125 :
126 : /*
127 : * unicode_strupper()
128 : *
129 : * Convert src to uppercase, and return the result length (not including
130 : * terminating NUL).
131 : *
132 : * String src must be encoded in UTF-8. If srclen < 0, src must be
133 : * NUL-terminated.
134 : *
135 : * Result string is stored in dst, truncating if larger than dstsize. If
136 : * dstsize is greater than the result length, dst will be NUL-terminated;
137 : * otherwise not.
138 : *
139 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
140 : * required buffer size before allocating.
141 : *
142 : * If full is true, use special case mappings if available and if the
143 : * conditions are satisfied.
144 : */
145 : size_t
146 316858 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
147 : bool full)
148 : {
149 316858 : return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
150 : NULL);
151 : }
152 :
153 : /*
154 : * unicode_strfold()
155 : *
156 : * Case fold src, and return the result length (not including terminating
157 : * NUL).
158 : *
159 : * String src must be encoded in UTF-8. If srclen < 0, src must be
160 : * NUL-terminated.
161 : *
162 : * Result string is stored in dst, truncating if larger than dstsize. If
163 : * dstsize is greater than the result length, dst will be NUL-terminated;
164 : * otherwise not.
165 : *
166 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
167 : * required buffer size before allocating.
168 : */
169 : size_t
170 12 : unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
171 : bool full)
172 : {
173 12 : return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
174 : NULL);
175 : }
176 :
177 : /*
178 : * Implement Unicode Default Case Conversion algorithm.
179 : *
180 : * If str_casekind is CaseLower or CaseUpper, map each character in the string
181 : * for which a mapping is available.
182 : *
183 : * If str_casekind is CaseTitle, maps characters found on a word boundary to
184 : * titlecase (or uppercase if full is false) and other characters to
185 : * lowercase. NB: does not currently implement the Unicode behavior in which
186 : * the word boundary is adjusted to the next Cased character. That behavior
187 : * could be implemented as an option, but it doesn't match the default
188 : * behavior of ICU, nor does it match the documented behavior of INITCAP().
189 : *
190 : * If full is true, use special mappings for relevant characters, which can
191 : * map a single codepoint to multiple codepoints, or depend on conditions.
192 : */
193 : static size_t
194 328986 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
195 : CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
196 : void *wbstate)
197 : {
198 : /* character CaseKind varies while titlecasing */
199 328986 : CaseKind chr_casekind = str_casekind;
200 328986 : size_t srcoff = 0;
201 328986 : size_t result_len = 0;
202 328986 : size_t boundary = 0;
203 :
204 : Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
205 : (str_casekind != CaseTitle && !wbnext && !wbstate));
206 :
207 328986 : if (str_casekind == CaseTitle)
208 : {
209 170 : boundary = wbnext(wbstate);
210 : Assert(boundary == 0); /* start of text is always a boundary */
211 : }
212 :
213 979402 : while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
214 : {
215 650416 : pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
216 650416 : int u1len = unicode_utf8len(u1);
217 650416 : const pg_case_map *casemap = find_case_map(u1);
218 650416 : const pg_special_case *special = NULL;
219 :
220 650416 : if (str_casekind == CaseTitle)
221 : {
222 1314 : if (srcoff == boundary)
223 : {
224 510 : chr_casekind = full ? CaseTitle : CaseUpper;
225 510 : boundary = wbnext(wbstate);
226 : }
227 : else
228 804 : chr_casekind = CaseLower;
229 : }
230 :
231 : /*
232 : * Find special case that matches the conditions, if any.
233 : *
234 : * Note: only a single special mapping per codepoint is currently
235 : * supported, though Unicode allows for multiple special mappings for
236 : * a single codepoint.
237 : */
238 650416 : if (full && casemap && casemap->special_case)
239 : {
240 150 : int16 conditions = casemap->special_case->conditions;
241 :
242 : Assert(casemap->special_case->codepoint == u1);
243 150 : if (check_special_conditions(conditions, src, srclen, srcoff))
244 114 : special = casemap->special_case;
245 : }
246 :
247 : /* perform mapping, update result_len, and write to dst */
248 650416 : if (special)
249 : {
250 282 : for (int i = 0; i < MAX_CASE_EXPANSION; i++)
251 : {
252 282 : pg_wchar u2 = special->map[chr_casekind][i];
253 282 : size_t u2len = unicode_utf8len(u2);
254 :
255 282 : if (u2 == '\0')
256 114 : break;
257 :
258 168 : if (result_len + u2len <= dstsize)
259 168 : unicode_to_utf8(u2, (unsigned char *) dst + result_len);
260 :
261 168 : result_len += u2len;
262 : }
263 : }
264 650302 : else if (casemap)
265 : {
266 650254 : pg_wchar u2 = casemap->simplemap[chr_casekind];
267 650254 : pg_wchar u2len = unicode_utf8len(u2);
268 :
269 650254 : if (result_len + u2len <= dstsize)
270 650206 : unicode_to_utf8(u2, (unsigned char *) dst + result_len);
271 :
272 650254 : result_len += u2len;
273 : }
274 : else
275 : {
276 : /* no mapping; copy bytes from src */
277 48 : if (result_len + u1len <= dstsize)
278 48 : memcpy(dst + result_len, src + srcoff, u1len);
279 :
280 48 : result_len += u1len;
281 : }
282 :
283 650416 : srcoff += u1len;
284 : }
285 :
286 328986 : if (result_len < dstsize)
287 328914 : dst[result_len] = '\0';
288 :
289 328986 : return result_len;
290 : }
291 :
292 : /*
293 : * Check that the condition matches Final_Sigma, described in Unicode Table
294 : * 3-17. The character at the given offset must be directly preceded by a
295 : * Cased character, and must not be directly followed by a Cased character.
296 : *
297 : * Case_Ignorable characters are ignored. NB: some characters may be both
298 : * Cased and Case_Ignorable, in which case they are ignored.
299 : */
300 : static bool
301 60 : check_final_sigma(const unsigned char *str, size_t len, size_t offset)
302 : {
303 : /* the start of the string is not preceded by a Cased character */
304 60 : if (offset == 0)
305 6 : return false;
306 :
307 : /* iterate backwards, looking for Cased character */
308 144 : for (int i = offset - 1; i >= 0; i--)
309 : {
310 144 : if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
311 : {
312 78 : pg_wchar curr = utf8_to_unicode(str + i);
313 :
314 78 : if (pg_u_prop_case_ignorable(curr))
315 24 : continue;
316 54 : else if (pg_u_prop_cased(curr))
317 42 : break;
318 : else
319 12 : return false;
320 : }
321 66 : else if ((str[i] & 0xC0) == 0x80)
322 66 : continue;
323 :
324 : Assert(false); /* invalid UTF-8 */
325 : }
326 :
327 : /* end of string is not followed by a Cased character */
328 42 : if (offset == len)
329 0 : return true;
330 :
331 : /* iterate forwards, looking for Cased character */
332 132 : for (int i = offset + 1; i < len && str[i] != '\0'; i++)
333 : {
334 114 : if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
335 : {
336 48 : pg_wchar curr = utf8_to_unicode(str + i);
337 :
338 48 : if (pg_u_prop_case_ignorable(curr))
339 24 : continue;
340 24 : else if (pg_u_prop_cased(curr))
341 18 : return false;
342 : else
343 6 : break;
344 : }
345 66 : else if ((str[i] & 0xC0) == 0x80)
346 66 : continue;
347 :
348 : Assert(false); /* invalid UTF-8 */
349 : }
350 :
351 24 : return true;
352 : }
353 :
354 : static bool
355 150 : check_special_conditions(int conditions, const char *str, size_t len,
356 : size_t offset)
357 : {
358 150 : if (conditions == 0)
359 90 : return true;
360 60 : else if (conditions == PG_U_FINAL_SIGMA)
361 60 : return check_final_sigma((unsigned char *) str, len, offset);
362 :
363 : /* no other conditions supported */
364 : Assert(false);
365 0 : return false;
366 : }
367 :
368 : /* find entry in simple case map, if any */
369 : static const pg_case_map *
370 651472 : find_case_map(pg_wchar ucs)
371 : {
372 : int min;
373 : int mid;
374 : int max;
375 :
376 : /* all chars <= 0x80 are stored in array for fast lookup */
377 : Assert(lengthof(case_map) >= 0x80);
378 651472 : if (ucs < 0x80)
379 : {
380 649210 : const pg_case_map *map = &case_map[ucs];
381 :
382 : Assert(map->codepoint == ucs);
383 649210 : return map;
384 : }
385 :
386 : /* otherwise, binary search */
387 2262 : min = 0x80;
388 2262 : max = lengthof(case_map) - 1;
389 20700 : while (max >= min)
390 : {
391 20652 : mid = (min + max) / 2;
392 20652 : if (ucs > case_map[mid].codepoint)
393 6774 : min = mid + 1;
394 13878 : else if (ucs < case_map[mid].codepoint)
395 11664 : max = mid - 1;
396 : else
397 2214 : return &case_map[mid];
398 : }
399 :
400 48 : return NULL;
401 : }
|