Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * unicode_case.c
3 : * Unicode case mapping and case conversion.
4 : *
5 : * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
6 : *
7 : * IDENTIFICATION
8 : * src/common/unicode_case.c
9 : *
10 : *-------------------------------------------------------------------------
11 : */
12 : #ifndef FRONTEND
13 : #include "postgres.h"
14 : #else
15 : #include "postgres_fe.h"
16 : #endif
17 :
18 : #include "common/unicode_case.h"
19 : #include "common/unicode_case_table.h"
20 : #include "common/unicode_category.h"
21 : #include "mb/pg_wchar.h"
22 :
23 : static const pg_case_map *find_case_map(pg_wchar ucs);
24 : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
25 : CaseKind str_casekind, WordBoundaryNext wbnext,
26 : void *wbstate);
27 :
28 : pg_wchar
29 372 : unicode_lowercase_simple(pg_wchar code)
30 : {
31 372 : const pg_case_map *map = find_case_map(code);
32 :
33 372 : return map ? map->simplemap[CaseLower] : code;
34 : }
35 :
36 : pg_wchar
37 0 : unicode_titlecase_simple(pg_wchar code)
38 : {
39 0 : const pg_case_map *map = find_case_map(code);
40 :
41 0 : return map ? map->simplemap[CaseTitle] : code;
42 : }
43 :
44 : pg_wchar
45 372 : unicode_uppercase_simple(pg_wchar code)
46 : {
47 372 : const pg_case_map *map = find_case_map(code);
48 :
49 372 : return map ? map->simplemap[CaseUpper] : code;
50 : }
51 :
52 : /*
53 : * unicode_strlower()
54 : *
55 : * Convert src to lowercase, and return the result length (not including
56 : * terminating NUL).
57 : *
58 : * String src must be encoded in UTF-8. If srclen < 0, src must be
59 : * NUL-terminated.
60 : *
61 : * Result string is stored in dst, truncating if larger than dstsize. If
62 : * dstsize is greater than the result length, dst will be NUL-terminated;
63 : * otherwise not.
64 : *
65 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
66 : * required buffer size before allocating.
67 : */
68 : size_t
69 2014 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
70 : {
71 2014 : return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
72 : }
73 :
74 : /*
75 : * unicode_strtitle()
76 : *
77 : * Convert src to titlecase, and return the result length (not including
78 : * terminating NUL).
79 : *
80 : * String src must be encoded in UTF-8. If srclen < 0, src must be
81 : * NUL-terminated.
82 : *
83 : * Result string is stored in dst, truncating if larger than dstsize. If
84 : * dstsize is greater than the result length, dst will be NUL-terminated;
85 : * otherwise not.
86 : *
87 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
88 : * required buffer size before allocating.
89 : *
90 : * Titlecasing requires knowledge about word boundaries, which is provided by
91 : * the callback wbnext. A word boundary is the offset of the start of a word
92 : * or the offset of the character immediately following a word.
93 : *
94 : * The caller is expected to initialize and free the callback state
95 : * wbstate. The callback should first return offset 0 for the first boundary;
96 : * then the offset of each subsequent word boundary; then the total length of
97 : * the string to indicate the final boundary.
98 : */
99 : size_t
100 86 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
101 : WordBoundaryNext wbnext, void *wbstate)
102 : {
103 86 : return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
104 : wbstate);
105 : }
106 :
107 : /*
108 : * unicode_strupper()
109 : *
110 : * Convert src to uppercase, and return the result length (not including
111 : * terminating NUL).
112 : *
113 : * String src must be encoded in UTF-8. If srclen < 0, src must be
114 : * NUL-terminated.
115 : *
116 : * Result string is stored in dst, truncating if larger than dstsize. If
117 : * dstsize is greater than the result length, dst will be NUL-terminated;
118 : * otherwise not.
119 : *
120 : * If dstsize is zero, dst may be NULL. This is useful for calculating the
121 : * required buffer size before allocating.
122 : */
123 : size_t
124 316786 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
125 : {
126 316786 : return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
127 : }
128 :
129 : /*
130 : * If str_casekind is CaseLower or CaseUpper, map each character in the string
131 : * for which a mapping is available.
132 : *
133 : * If str_casekind is CaseTitle, maps characters found on a word boundary to
134 : * uppercase and other characters to lowercase.
135 : */
136 : static size_t
137 318886 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
138 : CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
139 : {
140 : /* character CaseKind varies while titlecasing */
141 318886 : CaseKind chr_casekind = str_casekind;
142 318886 : size_t srcoff = 0;
143 318886 : size_t result_len = 0;
144 318886 : size_t boundary = 0;
145 :
146 : Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
147 : (str_casekind != CaseTitle && !wbnext && !wbstate));
148 :
149 318886 : if (str_casekind == CaseTitle)
150 : {
151 86 : boundary = wbnext(wbstate);
152 : Assert(boundary == 0); /* start of text is always a boundary */
153 : }
154 :
155 918522 : while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
156 : {
157 599636 : pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
158 599636 : int u1len = unicode_utf8len(u1);
159 599636 : const pg_case_map *casemap = find_case_map(u1);
160 :
161 599636 : if (str_casekind == CaseTitle)
162 : {
163 666 : if (srcoff == boundary)
164 : {
165 258 : chr_casekind = CaseUpper;
166 258 : boundary = wbnext(wbstate);
167 : }
168 : else
169 408 : chr_casekind = CaseLower;
170 : }
171 :
172 : /* perform mapping, update result_len, and write to dst */
173 599636 : if (casemap)
174 : {
175 599540 : pg_wchar u2 = casemap->simplemap[chr_casekind];
176 599540 : pg_wchar u2len = unicode_utf8len(u2);
177 :
178 599540 : if (result_len + u2len <= dstsize)
179 599516 : unicode_to_utf8(u2, (unsigned char *) dst + result_len);
180 :
181 599540 : result_len += u2len;
182 : }
183 : else
184 : {
185 : /* no mapping; copy bytes from src */
186 96 : if (result_len + u1len <= dstsize)
187 96 : memcpy(dst + result_len, src + srcoff, u1len);
188 :
189 96 : result_len += u1len;
190 : }
191 :
192 599636 : srcoff += u1len;
193 : }
194 :
195 318886 : if (result_len < dstsize)
196 318850 : dst[result_len] = '\0';
197 :
198 318886 : return result_len;
199 : }
200 :
201 : /* find entry in simple case map, if any */
202 : static const pg_case_map *
203 600380 : find_case_map(pg_wchar ucs)
204 : {
205 : int min;
206 : int mid;
207 : int max;
208 :
209 : /* all chars <= 0x80 are stored in array for fast lookup */
210 : Assert(lengthof(case_map) >= 0x80);
211 600380 : if (ucs < 0x80)
212 : {
213 599360 : const pg_case_map *map = &case_map[ucs];
214 :
215 : Assert(map->codepoint == ucs);
216 599360 : return map;
217 : }
218 :
219 : /* otherwise, binary search */
220 1020 : min = 0x80;
221 1020 : max = lengthof(case_map) - 1;
222 10536 : while (max >= min)
223 : {
224 10440 : mid = (min + max) / 2;
225 10440 : if (ucs > case_map[mid].codepoint)
226 3150 : min = mid + 1;
227 7290 : else if (ucs < case_map[mid].codepoint)
228 6366 : max = mid - 1;
229 : else
230 924 : return &case_map[mid];
231 : }
232 :
233 96 : return NULL;
234 : }
|