Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for builtin provider
4 : *
5 : * Portions Copyright (c) 2002-2026, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_builtin.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include "catalog/pg_database.h"
15 : #include "catalog/pg_collation.h"
16 : #include "common/unicode_case.h"
17 : #include "common/unicode_category.h"
18 : #include "miscadmin.h"
19 : #include "utils/builtins.h"
20 : #include "utils/pg_locale.h"
21 : #include "utils/syscache.h"
22 :
23 : extern pg_locale_t create_pg_locale_builtin(Oid collid,
24 : MemoryContext context);
25 : extern char *get_collation_actual_version_builtin(const char *collcollate);
26 :
/*
 * Iteration state for initcap_wbnext(); strtitle_builtin() fills it in and
 * hands it to unicode_strtitle() as the opaque callback argument.
 */
struct WordBoundaryState
{
	const char *str;			/* string being scanned */
	size_t		len;			/* scan stops once offset reaches this */
	size_t		offset;			/* byte offset of the next character to read */
	bool		posix;			/* passed through to pg_u_isalnum() */
	bool		init;			/* true once the first boundary was returned */
	bool		prev_alnum;		/* pg_u_isalnum() result at the last boundary */
};
36 :
37 : /*
38 : * In UTF-8, pg_wchar is guaranteed to be the code point value.
39 : */
40 : static inline char32_t
41 129459 : to_char32(pg_wchar wc)
42 : {
43 : Assert(GetDatabaseEncoding() == PG_UTF8);
44 129459 : return (char32_t) wc;
45 : }
46 :
47 : static inline pg_wchar
48 650 : to_pg_wchar(char32_t c32)
49 : {
50 : Assert(GetDatabaseEncoding() == PG_UTF8);
51 650 : return (pg_wchar) c32;
52 : }
53 :
54 : /*
55 : * Simple word boundary iterator that draws boundaries each time the result of
56 : * pg_u_isalnum() changes.
57 : */
58 : static size_t
59 564 : initcap_wbnext(void *state)
60 : {
61 564 : struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
62 :
63 1165 : while (wbstate->offset < wbstate->len)
64 : {
65 1032 : char32_t u = utf8_to_unicode((const unsigned char *) wbstate->str +
66 1032 : wbstate->offset);
67 1032 : bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
68 :
69 1032 : if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
70 : {
71 431 : size_t prev_offset = wbstate->offset;
72 :
73 431 : wbstate->init = true;
74 431 : wbstate->offset += unicode_utf8len(u);
75 431 : wbstate->prev_alnum = curr_alnum;
76 431 : return prev_offset;
77 : }
78 :
79 601 : wbstate->offset += unicode_utf8len(u);
80 : }
81 :
82 133 : return wbstate->len;
83 : }
84 :
85 : static size_t
86 6312 : strlower_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
87 : pg_locale_t locale)
88 : {
89 12624 : return unicode_strlower(dest, destsize, src, srclen,
90 6312 : locale->builtin.casemap_full);
91 : }
92 :
93 : static size_t
94 133 : strtitle_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
95 : pg_locale_t locale)
96 : {
97 133 : struct WordBoundaryState wbstate = {
98 : .str = src,
99 : .len = srclen,
100 : .offset = 0,
101 133 : .posix = !locale->builtin.casemap_full,
102 : .init = false,
103 : .prev_alnum = false,
104 : };
105 :
106 266 : return unicode_strtitle(dest, destsize, src, srclen,
107 133 : locale->builtin.casemap_full,
108 : initcap_wbnext, &wbstate);
109 : }
110 :
111 : static size_t
112 158561 : strupper_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
113 : pg_locale_t locale)
114 : {
115 317122 : return unicode_strupper(dest, destsize, src, srclen,
116 158561 : locale->builtin.casemap_full);
117 : }
118 :
119 : static size_t
120 10 : strfold_builtin(char *dest, size_t destsize, const char *src, size_t srclen,
121 : pg_locale_t locale)
122 : {
123 20 : return unicode_strfold(dest, destsize, src, srclen,
124 10 : locale->builtin.casemap_full);
125 : }
126 :
127 : static bool
128 43117 : wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
129 : {
130 43117 : return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
131 : }
132 :
133 : static bool
134 19901 : wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
135 : {
136 19901 : return pg_u_isalpha(to_char32(wc));
137 : }
138 :
139 : static bool
140 24708 : wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
141 : {
142 24708 : return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
143 : }
144 :
145 : static bool
146 16384 : wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
147 : {
148 16384 : return pg_u_isupper(to_char32(wc));
149 : }
150 :
151 : static bool
152 0 : wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
153 : {
154 0 : return pg_u_islower(to_char32(wc));
155 : }
156 :
157 : static bool
158 0 : wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
159 : {
160 0 : return pg_u_isgraph(to_char32(wc));
161 : }
162 :
163 : static bool
164 0 : wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
165 : {
166 0 : return pg_u_isprint(to_char32(wc));
167 : }
168 :
169 : static bool
170 16384 : wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
171 : {
172 16384 : return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
173 : }
174 :
175 : static bool
176 8312 : wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
177 : {
178 8312 : return pg_u_isspace(to_char32(wc));
179 : }
180 :
181 : static bool
182 3 : wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
183 : {
184 3 : return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
185 : }
186 :
187 : static bool
188 0 : wc_iscased_builtin(pg_wchar wc, pg_locale_t locale)
189 : {
190 0 : return pg_u_prop_cased(to_char32(wc));
191 : }
192 :
193 : static pg_wchar
194 325 : wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
195 : {
196 325 : return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
197 : }
198 :
199 : static pg_wchar
200 325 : wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
201 : {
202 325 : return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
203 : }
204 :
205 : static const struct ctype_methods ctype_methods_builtin = {
206 : .strlower = strlower_builtin,
207 : .strtitle = strtitle_builtin,
208 : .strupper = strupper_builtin,
209 : .strfold = strfold_builtin,
210 : /* uses plain ASCII semantics for historical reasons */
211 : .downcase_ident = NULL,
212 : .wc_isdigit = wc_isdigit_builtin,
213 : .wc_isalpha = wc_isalpha_builtin,
214 : .wc_isalnum = wc_isalnum_builtin,
215 : .wc_isupper = wc_isupper_builtin,
216 : .wc_islower = wc_islower_builtin,
217 : .wc_isgraph = wc_isgraph_builtin,
218 : .wc_isprint = wc_isprint_builtin,
219 : .wc_ispunct = wc_ispunct_builtin,
220 : .wc_isspace = wc_isspace_builtin,
221 : .wc_isxdigit = wc_isxdigit_builtin,
222 : .wc_iscased = wc_iscased_builtin,
223 : .wc_tolower = wc_tolower_builtin,
224 : .wc_toupper = wc_toupper_builtin,
225 : };
226 :
227 : pg_locale_t
228 971 : create_pg_locale_builtin(Oid collid, MemoryContext context)
229 : {
230 : const char *locstr;
231 : pg_locale_t result;
232 :
233 971 : if (collid == DEFAULT_COLLATION_OID)
234 : {
235 : HeapTuple tp;
236 : Datum datum;
237 :
238 933 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
239 933 : if (!HeapTupleIsValid(tp))
240 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
241 933 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
242 : Anum_pg_database_datlocale);
243 933 : locstr = TextDatumGetCString(datum);
244 933 : ReleaseSysCache(tp);
245 : }
246 : else
247 : {
248 : HeapTuple tp;
249 : Datum datum;
250 :
251 38 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
252 38 : if (!HeapTupleIsValid(tp))
253 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
254 38 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
255 : Anum_pg_collation_colllocale);
256 38 : locstr = TextDatumGetCString(datum);
257 38 : ReleaseSysCache(tp);
258 : }
259 :
260 971 : builtin_validate_locale(GetDatabaseEncoding(), locstr);
261 :
262 971 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
263 :
264 971 : result->builtin.locale = MemoryContextStrdup(context, locstr);
265 971 : result->builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
266 971 : result->deterministic = true;
267 971 : result->collate_is_c = true;
268 971 : result->ctype_is_c = (strcmp(locstr, "C") == 0);
269 971 : if (!result->ctype_is_c)
270 948 : result->ctype = &ctype_methods_builtin;
271 :
272 971 : return result;
273 : }
274 :
275 : char *
276 1014 : get_collation_actual_version_builtin(const char *collcollate)
277 : {
278 : /*
279 : * The only two supported locales (C and C.UTF-8) are both based on memcmp
280 : * and are not expected to change, but track the version anyway.
281 : *
282 : * Note that the character semantics may change for some locales, but the
283 : * collation version only tracks changes to sort order.
284 : */
285 1014 : if (strcmp(collcollate, "C") == 0)
286 44 : return "1";
287 970 : else if (strcmp(collcollate, "C.UTF-8") == 0)
288 957 : return "1";
289 13 : else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
290 13 : return "1";
291 : else
292 0 : ereport(ERROR,
293 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
294 : errmsg("invalid locale name \"%s\" for builtin provider",
295 : collcollate)));
296 :
297 : return NULL; /* keep compiler quiet */
298 : }
|