Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for builtin provider
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_builtin.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include "catalog/pg_database.h"
15 : #include "catalog/pg_collation.h"
16 : #include "common/unicode_case.h"
17 : #include "common/unicode_category.h"
18 : #include "miscadmin.h"
19 : #include "utils/builtins.h"
20 : #include "utils/pg_locale.h"
21 : #include "utils/syscache.h"
22 :
23 : extern pg_locale_t create_pg_locale_builtin(Oid collid,
24 : MemoryContext context);
25 : extern char *get_collation_actual_version_builtin(const char *collcollate);
26 :
/*
 * Iteration state for initcap_wbnext(), the word boundary callback handed to
 * unicode_strtitle() by strtitle_builtin().
 */
struct WordBoundaryState
{
	const char *str;			/* text being scanned */
	size_t		len;			/* scanning stops once offset reaches this */
	size_t		offset;			/* byte offset of the next character to look at */
	bool		posix;			/* forwarded to pg_u_isalnum() */
	bool		init;			/* false until the first boundary is reported */
	bool		prev_alnum;		/* pg_u_isalnum() result at the last boundary */
};
36 :
37 : /*
38 : * In UTF-8, pg_wchar is guaranteed to be the code point value.
39 : */
40 : static inline char32_t
41 225726 : to_char32(pg_wchar wc)
42 : {
43 : Assert(GetDatabaseEncoding() == PG_UTF8);
44 225726 : return (char32_t) wc;
45 : }
46 :
47 : static inline pg_wchar
48 1056 : to_pg_wchar(char32_t c32)
49 : {
50 : Assert(GetDatabaseEncoding() == PG_UTF8);
51 1056 : return (pg_wchar) c32;
52 : }
53 :
54 : /*
55 : * Simple word boundary iterator that draws boundaries each time the result of
56 : * pg_u_isalnum() changes.
57 : */
58 : static size_t
59 824 : initcap_wbnext(void *state)
60 : {
61 824 : struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
62 :
63 1700 : while (wbstate->offset < wbstate->len &&
64 1506 : wbstate->str[wbstate->offset] != '\0')
65 : {
66 1506 : char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
67 1506 : wbstate->offset);
68 1506 : bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
69 :
70 1506 : if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
71 : {
72 630 : size_t prev_offset = wbstate->offset;
73 :
74 630 : wbstate->init = true;
75 630 : wbstate->offset += unicode_utf8len(u);
76 630 : wbstate->prev_alnum = curr_alnum;
77 630 : return prev_offset;
78 : }
79 :
80 876 : wbstate->offset += unicode_utf8len(u);
81 : }
82 :
83 194 : return wbstate->len;
84 : }
85 :
86 : static size_t
87 12026 : strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
88 : pg_locale_t locale)
89 : {
90 24052 : return unicode_strlower(dest, destsize, src, srclen,
91 12026 : locale->builtin.casemap_full);
92 : }
93 :
94 : static size_t
95 194 : strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
96 : pg_locale_t locale)
97 : {
98 194 : struct WordBoundaryState wbstate = {
99 : .str = src,
100 : .len = srclen,
101 : .offset = 0,
102 194 : .posix = !locale->builtin.casemap_full,
103 : .init = false,
104 : .prev_alnum = false,
105 : };
106 :
107 388 : return unicode_strtitle(dest, destsize, src, srclen,
108 194 : locale->builtin.casemap_full,
109 : initcap_wbnext, &wbstate);
110 : }
111 :
112 : static size_t
113 316918 : strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
114 : pg_locale_t locale)
115 : {
116 633836 : return unicode_strupper(dest, destsize, src, srclen,
117 316918 : locale->builtin.casemap_full);
118 : }
119 :
120 : static size_t
121 12 : strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
122 : pg_locale_t locale)
123 : {
124 24 : return unicode_strfold(dest, destsize, src, srclen,
125 12 : locale->builtin.casemap_full);
126 : }
127 :
128 : static bool
129 78016 : wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
130 : {
131 78016 : return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
132 : }
133 :
134 : static bool
135 39654 : wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
136 : {
137 39654 : return pg_u_isalpha(to_char32(wc));
138 : }
139 :
140 : static bool
141 41218 : wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
142 : {
143 41218 : return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
144 : }
145 :
146 : static bool
147 24576 : wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
148 : {
149 24576 : return pg_u_isupper(to_char32(wc));
150 : }
151 :
152 : static bool
153 0 : wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
154 : {
155 0 : return pg_u_islower(to_char32(wc));
156 : }
157 :
158 : static bool
159 0 : wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
160 : {
161 0 : return pg_u_isgraph(to_char32(wc));
162 : }
163 :
164 : static bool
165 0 : wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
166 : {
167 0 : return pg_u_isprint(to_char32(wc));
168 : }
169 :
170 : static bool
171 24576 : wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
172 : {
173 24576 : return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
174 : }
175 :
176 : static bool
177 16624 : wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
178 : {
179 16624 : return pg_u_isspace(to_char32(wc));
180 : }
181 :
182 : static bool
183 6 : wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
184 : {
185 6 : return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
186 : }
187 :
188 : static bool
189 0 : char_is_cased_builtin(char ch, pg_locale_t locale)
190 : {
191 0 : return IS_HIGHBIT_SET(ch) ||
192 0 : (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
193 : }
194 :
195 : static pg_wchar
196 528 : wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
197 : {
198 528 : return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
199 : }
200 :
201 : static pg_wchar
202 528 : wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
203 : {
204 528 : return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
205 : }
206 :
207 : static const struct ctype_methods ctype_methods_builtin = {
208 : .strlower = strlower_builtin,
209 : .strtitle = strtitle_builtin,
210 : .strupper = strupper_builtin,
211 : .strfold = strfold_builtin,
212 : .wc_isdigit = wc_isdigit_builtin,
213 : .wc_isalpha = wc_isalpha_builtin,
214 : .wc_isalnum = wc_isalnum_builtin,
215 : .wc_isupper = wc_isupper_builtin,
216 : .wc_islower = wc_islower_builtin,
217 : .wc_isgraph = wc_isgraph_builtin,
218 : .wc_isprint = wc_isprint_builtin,
219 : .wc_ispunct = wc_ispunct_builtin,
220 : .wc_isspace = wc_isspace_builtin,
221 : .wc_isxdigit = wc_isxdigit_builtin,
222 : .char_is_cased = char_is_cased_builtin,
223 : .wc_tolower = wc_tolower_builtin,
224 : .wc_toupper = wc_toupper_builtin,
225 : };
226 :
227 : pg_locale_t
228 1836 : create_pg_locale_builtin(Oid collid, MemoryContext context)
229 : {
230 : const char *locstr;
231 : pg_locale_t result;
232 :
233 1836 : if (collid == DEFAULT_COLLATION_OID)
234 : {
235 : HeapTuple tp;
236 : Datum datum;
237 :
238 1784 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
239 1784 : if (!HeapTupleIsValid(tp))
240 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
241 1784 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
242 : Anum_pg_database_datlocale);
243 1784 : locstr = TextDatumGetCString(datum);
244 1784 : ReleaseSysCache(tp);
245 : }
246 : else
247 : {
248 : HeapTuple tp;
249 : Datum datum;
250 :
251 52 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
252 52 : if (!HeapTupleIsValid(tp))
253 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
254 52 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
255 : Anum_pg_collation_colllocale);
256 52 : locstr = TextDatumGetCString(datum);
257 52 : ReleaseSysCache(tp);
258 : }
259 :
260 1836 : builtin_validate_locale(GetDatabaseEncoding(), locstr);
261 :
262 1836 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
263 :
264 1836 : result->builtin.locale = MemoryContextStrdup(context, locstr);
265 1836 : result->builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
266 1836 : result->deterministic = true;
267 1836 : result->collate_is_c = true;
268 1836 : result->ctype_is_c = (strcmp(locstr, "C") == 0);
269 1836 : if (!result->ctype_is_c)
270 1804 : result->ctype = &ctype_methods_builtin;
271 :
272 1836 : return result;
273 : }
274 :
275 : char *
276 1904 : get_collation_actual_version_builtin(const char *collcollate)
277 : {
278 : /*
279 : * The only two supported locales (C and C.UTF-8) are both based on memcmp
280 : * and are not expected to change, but track the version anyway.
281 : *
282 : * Note that the character semantics may change for some locales, but the
283 : * collation version only tracks changes to sort order.
284 : */
285 1904 : if (strcmp(collcollate, "C") == 0)
286 62 : return "1";
287 1842 : else if (strcmp(collcollate, "C.UTF-8") == 0)
288 1822 : return "1";
289 20 : else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
290 20 : return "1";
291 : else
292 0 : ereport(ERROR,
293 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
294 : errmsg("invalid locale name \"%s\" for builtin provider",
295 : collcollate)));
296 :
297 : return NULL; /* keep compiler quiet */
298 : }
|