Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for libc
4 : *
5 : * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_libc.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include "catalog/pg_collation.h"
15 : #include "mb/pg_wchar.h"
16 : #include "utils/formatting.h"
17 : #include "utils/pg_locale.h"
18 :
19 : /*
20 : * Size of stack buffer to use for string transformations, used to avoid heap
21 : * allocations in typical cases. This should be large enough that most strings
22 : * will fit, but small enough that we feel comfortable putting it on the
23 : * stack.
24 : */
25 : #define TEXTBUFLEN 1024
26 :
27 : extern locale_t make_libc_collator(const char *collate,
28 : const char *ctype);
29 : extern int strncoll_libc(const char *arg1, ssize_t len1,
30 : const char *arg2, ssize_t len2,
31 : pg_locale_t locale);
32 : extern size_t strnxfrm_libc(char *dest, size_t destsize,
33 : const char *src, ssize_t srclen,
34 : pg_locale_t locale);
35 :
36 : static void report_newlocale_failure(const char *localename);
37 :
38 : #ifdef WIN32
39 : static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
40 : const char *arg2, ssize_t len2,
41 : pg_locale_t locale);
42 : #endif
43 :
44 : /*
45 : * Create a locale_t with the given collation and ctype.
46 : *
47 : * The "C" and "POSIX" locales are not actually handled by libc, so return
48 : * NULL.
49 : *
50 : * Ensure that no path leaks a locale_t.
51 : */
52 : locale_t
53 30176 : make_libc_collator(const char *collate, const char *ctype)
54 : {
55 30176 : locale_t loc = 0;
56 :
57 30176 : if (strcmp(collate, ctype) == 0)
58 : {
59 30176 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
60 : {
61 : /* Normal case where they're the same */
62 25886 : errno = 0;
63 : #ifndef WIN32
64 25886 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
65 : NULL);
66 : #else
67 : loc = _create_locale(LC_ALL, collate);
68 : #endif
69 25886 : if (!loc)
70 0 : report_newlocale_failure(collate);
71 : }
72 : }
73 : else
74 : {
75 : #ifndef WIN32
76 : /* We need two newlocale() steps */
77 0 : locale_t loc1 = 0;
78 :
79 0 : if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
80 : {
81 0 : errno = 0;
82 0 : loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
83 0 : if (!loc1)
84 0 : report_newlocale_failure(collate);
85 : }
86 :
87 0 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
88 : {
89 0 : errno = 0;
90 0 : loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
91 0 : if (!loc)
92 : {
93 0 : if (loc1)
94 0 : freelocale(loc1);
95 0 : report_newlocale_failure(ctype);
96 : }
97 : }
98 : else
99 0 : loc = loc1;
100 : #else
101 :
102 : /*
103 : * XXX The _create_locale() API doesn't appear to support this. Could
104 : * perhaps be worked around by changing pg_locale_t to contain two
105 : * separate fields.
106 : */
107 : ereport(ERROR,
108 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
109 : errmsg("collations with different collate and ctype values are not supported on this platform")));
110 : #endif
111 : }
112 :
113 30176 : return loc;
114 : }
115 :
116 : /*
117 : * strncoll_libc
118 : *
119 : * NUL-terminate arguments, if necessary, and pass to strcoll_l().
120 : *
121 : * An input string length of -1 means that it's already NUL-terminated.
122 : */
123 : int
124 24784284 : strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
125 : pg_locale_t locale)
126 : {
127 : char sbuf[TEXTBUFLEN];
128 24784284 : char *buf = sbuf;
129 24784284 : size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
130 24784284 : size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
131 : const char *arg1n;
132 : const char *arg2n;
133 : int result;
134 :
135 : Assert(locale->provider == COLLPROVIDER_LIBC);
136 :
137 : #ifdef WIN32
138 : /* check for this case before doing the work for nul-termination */
139 : if (GetDatabaseEncoding() == PG_UTF8)
140 : return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
141 : #endif /* WIN32 */
142 :
143 24784284 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
144 360 : buf = palloc(bufsize1 + bufsize2);
145 :
146 : /* nul-terminate arguments if necessary */
147 24784284 : if (len1 == -1)
148 : {
149 22748714 : arg1n = arg1;
150 : }
151 : else
152 : {
153 2035570 : char *buf1 = buf;
154 :
155 2035570 : memcpy(buf1, arg1, len1);
156 2035570 : buf1[len1] = '\0';
157 2035570 : arg1n = buf1;
158 : }
159 :
160 24784284 : if (len2 == -1)
161 : {
162 22748714 : arg2n = arg2;
163 : }
164 : else
165 : {
166 2035570 : char *buf2 = buf + bufsize1;
167 :
168 2035570 : memcpy(buf2, arg2, len2);
169 2035570 : buf2[len2] = '\0';
170 2035570 : arg2n = buf2;
171 : }
172 :
173 24784284 : result = strcoll_l(arg1n, arg2n, locale->info.lt);
174 :
175 24784284 : if (buf != sbuf)
176 360 : pfree(buf);
177 :
178 24784284 : return result;
179 : }
180 :
181 : /*
182 : * strnxfrm_libc
183 : *
184 : * NUL-terminate src, if necessary, and pass to strxfrm_l().
185 : *
186 : * A source length of -1 means that it's already NUL-terminated.
187 : */
188 : size_t
189 144 : strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
190 : pg_locale_t locale)
191 : {
192 : char sbuf[TEXTBUFLEN];
193 144 : char *buf = sbuf;
194 144 : size_t bufsize = srclen + 1;
195 : size_t result;
196 :
197 : Assert(locale->provider == COLLPROVIDER_LIBC);
198 :
199 144 : if (srclen == -1)
200 144 : return strxfrm_l(dest, src, destsize, locale->info.lt);
201 :
202 0 : if (bufsize > TEXTBUFLEN)
203 0 : buf = palloc(bufsize);
204 :
205 : /* nul-terminate argument */
206 0 : memcpy(buf, src, srclen);
207 0 : buf[srclen] = '\0';
208 :
209 0 : result = strxfrm_l(dest, buf, destsize, locale->info.lt);
210 :
211 0 : if (buf != sbuf)
212 0 : pfree(buf);
213 :
214 : /* if dest is defined, it should be nul-terminated */
215 : Assert(result >= destsize || dest[result] == '\0');
216 :
217 0 : return result;
218 : }
219 :
/*
 * strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
 * invoke wcscoll_l().
 *
 * An input string length of -1 means that it's NUL-terminated.
 *
 * Each input is converted into its own region of a shared scratch buffer
 * sized at two bytes per input byte plus a two-byte terminator.  The buffer
 * is on the stack unless the combined size exceeds TEXTBUFLEN.
 *
 * Returns the wcscoll_l() result; conversion or comparison failures are
 * reported with ereport(ERROR).
 */
#ifdef WIN32
static int
strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
						 ssize_t len2, pg_locale_t locale)
{
	char		sbuf[TEXTBUFLEN];
	char	   *buf = sbuf;
	char	   *a1p,
			   *a2p;
	size_t		a1len;
	size_t		a2len;
	int			r;
	int			result;

	Assert(locale->provider == COLLPROVIDER_LIBC);
	Assert(GetDatabaseEncoding() == PG_UTF8);

	if (len1 == -1)
		len1 = strlen(arg1);
	if (len2 == -1)
		len2 = strlen(arg2);

	/*
	 * Size the buffers using size_t rather than int.  With int arithmetic,
	 * two sufficiently long inputs could make the sum below wrap to a
	 * negative value, which would skip the palloc() and let the conversions
	 * run past the end of the fixed-size stack buffer.
	 */
	a1len = (size_t) len1 * 2 + 2;
	a2len = (size_t) len2 * 2 + 2;

	if (a1len + a2len > TEXTBUFLEN)
		buf = palloc(a1len + a2len);

	a1p = buf;
	a2p = buf + a1len;

	/* API does not work for zero-length input */
	if (len1 == 0)
		r = 0;
	else
	{
		/* the Win32 API takes its lengths as int */
		r = MultiByteToWideChar(CP_UTF8, 0, arg1, (int) len1,
								(LPWSTR) a1p, (int) (a1len / 2));
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	/* MultiByteToWideChar() does not terminate the output for us */
	((LPWSTR) a1p)[r] = 0;

	if (len2 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg2, (int) len2,
								(LPWSTR) a2p, (int) (a2len / 2));
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a2p)[r] = 0;

	/* clear errno so that %m below reflects only the comparison */
	errno = 0;
	result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (buf != sbuf)
		pfree(buf);

	return result;
}
#endif							/* WIN32 */
298 :
299 : /* simple subroutine for reporting errors from newlocale() */
300 : static void
301 0 : report_newlocale_failure(const char *localename)
302 : {
303 : int save_errno;
304 :
305 : /*
306 : * Windows doesn't provide any useful error indication from
307 : * _create_locale(), and BSD-derived platforms don't seem to feel they
308 : * need to set errno either (even though POSIX is pretty clear that
309 : * newlocale should do so). So, if errno hasn't been set, assume ENOENT
310 : * is what to report.
311 : */
312 0 : if (errno == 0)
313 0 : errno = ENOENT;
314 :
315 : /*
316 : * ENOENT means "no such locale", not "no such file", so clarify that
317 : * errno with an errdetail message.
318 : */
319 0 : save_errno = errno; /* auxiliary funcs might change errno */
320 0 : ereport(ERROR,
321 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
322 : errmsg("could not create locale \"%s\": %m",
323 : localename),
324 : (save_errno == ENOENT ?
325 : errdetail("The operating system could not find any locale data for the locale name \"%s\".",
326 : localename) : 0)));
327 : }
328 :
/*
 * POSIX doesn't define _l-variants of these functions, but several systems
 * have them.  Where the platform lacks one, a replacement is supplied here.
 */
#ifndef HAVE_MBSTOWCS_L
/*
 * Replacement mbstowcs_l(): convert src to wide characters under loc,
 * storing at most n of them at dest.  Returns whatever the underlying
 * conversion routine returns.
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _mbstowcs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nconverted;

	/* switch to loc just for the conversion, then switch back */
	prev_locale = uselocale(loc);
	nconverted = mbstowcs(dest, src, n);
	uselocale(prev_locale);

	return nconverted;
#endif
}
#endif
#ifndef HAVE_WCSTOMBS_L
/*
 * Replacement wcstombs_l(): convert src to a multibyte string under loc,
 * storing at most n bytes at dest.  Returns whatever the underlying
 * conversion routine returns.
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _wcstombs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nconverted;

	/* switch to loc just for the conversion, then switch back */
	prev_locale = uselocale(loc);
	nconverted = wcstombs(dest, src, n);
	uselocale(prev_locale);

	return nconverted;
#endif
}
#endif
365 :
366 : /*
367 : * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
368 : * Therefore we keep them here rather than with the mbutils code.
369 : */
370 :
371 : /*
372 : * wchar2char --- convert wide characters to multibyte format
373 : *
374 : * This has the same API as the standard wcstombs_l() function; in particular,
375 : * tolen is the maximum number of bytes to store at *to, and *from must be
376 : * zero-terminated. The output will be zero-terminated iff there is room.
377 : */
378 : size_t
379 1138778 : wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
380 : {
381 : size_t result;
382 :
383 1138778 : if (tolen == 0)
384 0 : return 0;
385 :
386 : #ifdef WIN32
387 :
388 : /*
389 : * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
390 : * for some reason mbstowcs and wcstombs won't do this for us, so we use
391 : * MultiByteToWideChar().
392 : */
393 : if (GetDatabaseEncoding() == PG_UTF8)
394 : {
395 : result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
396 : NULL, NULL);
397 : /* A zero return is failure */
398 : if (result <= 0)
399 : result = -1;
400 : else
401 : {
402 : Assert(result <= tolen);
403 : /* Microsoft counts the zero terminator in the result */
404 : result--;
405 : }
406 : }
407 : else
408 : #endif /* WIN32 */
409 1138778 : if (locale == (pg_locale_t) 0)
410 : {
411 : /* Use wcstombs directly for the default locale */
412 273996 : result = wcstombs(to, from, tolen);
413 : }
414 : else
415 : {
416 : /* Use wcstombs_l for nondefault locales */
417 864782 : result = wcstombs_l(to, from, tolen, locale->info.lt);
418 : }
419 :
420 1138778 : return result;
421 : }
422 :
423 : /*
424 : * char2wchar --- convert multibyte characters to wide characters
425 : *
426 : * This has almost the API of mbstowcs_l(), except that *from need not be
427 : * null-terminated; instead, the number of input bytes is specified as
428 : * fromlen. Also, we ereport() rather than returning -1 for invalid
429 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
430 : * The output will be zero-terminated iff there is room.
431 : */
432 : size_t
433 1154654 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
434 : pg_locale_t locale)
435 : {
436 : size_t result;
437 :
438 1154654 : if (tolen == 0)
439 0 : return 0;
440 :
441 : #ifdef WIN32
442 : /* See WIN32 "Unicode" comment above */
443 : if (GetDatabaseEncoding() == PG_UTF8)
444 : {
445 : /* Win32 API does not work for zero-length input */
446 : if (fromlen == 0)
447 : result = 0;
448 : else
449 : {
450 : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
451 : /* A zero return is failure */
452 : if (result == 0)
453 : result = -1;
454 : }
455 :
456 : if (result != -1)
457 : {
458 : Assert(result < tolen);
459 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
460 : to[result] = 0;
461 : }
462 : }
463 : else
464 : #endif /* WIN32 */
465 : {
466 : /* mbstowcs requires ending '\0' */
467 1154654 : char *str = pnstrdup(from, fromlen);
468 :
469 1154654 : if (locale == (pg_locale_t) 0)
470 : {
471 : /* Use mbstowcs directly for the default locale */
472 289872 : result = mbstowcs(to, str, tolen);
473 : }
474 : else
475 : {
476 : /* Use mbstowcs_l for nondefault locales */
477 864782 : result = mbstowcs_l(to, str, tolen, locale->info.lt);
478 : }
479 :
480 1154654 : pfree(str);
481 : }
482 :
483 1154654 : if (result == -1)
484 : {
485 : /*
486 : * Invalid multibyte character encountered. We try to give a useful
487 : * error message by letting pg_verifymbstr check the string. But it's
488 : * possible that the string is OK to us, and not OK to mbstowcs ---
489 : * this suggests that the LC_CTYPE locale is different from the
490 : * database encoding. Give a generic error message if pg_verifymbstr
491 : * can't find anything wrong.
492 : */
493 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
494 : /* but if it does ... */
495 0 : ereport(ERROR,
496 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
497 : errmsg("invalid multibyte character for locale"),
498 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
499 : }
500 :
501 1154654 : return result;
502 : }
|