LCOV - code coverage report
Current view: top level - src/backend/utils/adt - pg_locale_libc.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 60 92 65.2 %
Date: 2024-11-21 08:14:44 Functions: 7 8 87.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-----------------------------------------------------------------------
       2             :  *
       3             :  * PostgreSQL locale utilities for libc
       4             :  *
       5             :  * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
       6             :  *
       7             :  * src/backend/utils/adt/pg_locale_libc.c
       8             :  *
       9             :  *-----------------------------------------------------------------------
      10             :  */
      11             : 
      12             : #include "postgres.h"
      13             : 
      14             : #include "catalog/pg_collation.h"
      15             : #include "mb/pg_wchar.h"
      16             : #include "utils/formatting.h"
      17             : #include "utils/pg_locale.h"
      18             : 
      19             : /*
      20             :  * Size in bytes of the stack buffer to use for string transformations, used to avoid heap
      21             :  * allocations in typical cases. This should be large enough that most strings
      22             :  * will fit, but small enough that we feel comfortable putting it on the
      23             :  * stack.  Inputs that do not fit are copied into a palloc'd buffer instead.
      24             :  */
      25             : #define     TEXTBUFLEN          1024
      26             : 
      27             : extern locale_t make_libc_collator(const char *collate,
      28             :                                    const char *ctype);  /* result is 0 if no libc locale was created */
      29             : extern int  strncoll_libc(const char *arg1, ssize_t len1,
      30             :                           const char *arg2, ssize_t len2,
      31             :                           pg_locale_t locale);  /* a length of -1 means NUL-terminated */
      32             : extern size_t strnxfrm_libc(char *dest, size_t destsize,
      33             :                             const char *src, ssize_t srclen,
      34             :                             pg_locale_t locale);    /* srclen of -1 means NUL-terminated */
      35             : 
      36             : static void report_newlocale_failure(const char *localename);   /* reports via ereport(ERROR) */
      37             : 
      38             : #ifdef WIN32
      39             : static int  strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
      40             :                                      const char *arg2, ssize_t len2,
      41             :                                      pg_locale_t locale);
      42             : #endif                          /* WIN32 */
      43             : 
      44             : /*
      45             :  * make_libc_collator --- create a locale_t with the given collation and ctype.
      46             :  *
      47             :  * The "C" and "POSIX" locales are not actually handled by libc, so no libc
      48             :  * locale is created for them; if nothing else was requested, return NULL (0).
      49             :  *
      50             :  * Ensure that no path leaks a locale_t.  Failures are reported through report_newlocale_failure().
      51             :  */
      52             : locale_t
      53       30176 : make_libc_collator(const char *collate, const char *ctype)
      54             : {
      55       30176 :     locale_t    loc = 0;    /* stays 0 unless a libc locale is created below */
      56             : 
      57       30176 :     if (strcmp(collate, ctype) == 0)
      58             :     {
      59       30176 :         if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
      60             :         {
      61             :             /* Normal case where they're the same */
      62       25886 :             errno = 0;  /* lets report_newlocale_failure() detect an unset errno */
      63             : #ifndef WIN32
      64       25886 :             loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
      65             :                             NULL);
      66             : #else
      67             :             loc = _create_locale(LC_ALL, collate);
      68             : #endif
      69       25886 :             if (!loc)
      70           0 :                 report_newlocale_failure(collate);
      71             :         }
      72             :     }
      73             :     else
      74             :     {
      75             : #ifndef WIN32
      76             :         /* We need two newlocale() steps */
      77           0 :         locale_t    loc1 = 0;   /* collate-only locale; handed to the ctype newlocale() call below */
      78             : 
      79           0 :         if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
      80             :         {
      81           0 :             errno = 0;
      82           0 :             loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
      83           0 :             if (!loc1)
      84           0 :                 report_newlocale_failure(collate);
      85             :         }
      86             : 
      87           0 :         if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
      88             :         {
      89           0 :             errno = 0;
      90           0 :             loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
      91           0 :             if (!loc)
      92             :             {
      93           0 :                 if (loc1)
      94           0 :                     freelocale(loc1);   /* don't leak the first-step locale before reporting */
      95           0 :                 report_newlocale_failure(ctype);
      96             :             }
      97             :         }
      98             :         else
      99           0 :             loc = loc1;     /* ctype is "C"/"POSIX": result is the collate-only locale, possibly 0 */
     100             : #else
     101             : 
     102             :         /*
     103             :          * XXX The _create_locale() API doesn't appear to support this. Could
     104             :          * perhaps be worked around by changing pg_locale_t to contain two
     105             :          * separate fields.
     106             :          */
     107             :         ereport(ERROR,
     108             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     109             :                  errmsg("collations with different collate and ctype values are not supported on this platform")));
     110             : #endif
     111             :     }
     112             : 
     113       30176 :     return loc;
     114             : }
     115             : 
     116             : /*
     117             :  * strncoll_libc
     118             :  *
     119             :  * NUL-terminate arguments, if necessary, and pass to strcoll_l(), returning its result.
     120             :  *
     121             :  * An input string length of -1 means that it's already NUL-terminated; such an argument is passed through without copying.
     122             :  */
     123             : int
     124    24784284 : strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
     125             :               pg_locale_t locale)
     126             : {
     127             :     char        sbuf[TEXTBUFLEN];
     128    24784284 :     char       *buf = sbuf;
     129    24784284 :     size_t      bufsize1 = (len1 == -1) ? 0 : len1 + 1; /* bytes for a NUL-terminated copy; 0 if no copy */
     130    24784284 :     size_t      bufsize2 = (len2 == -1) ? 0 : len2 + 1;
     131             :     const char *arg1n;
     132             :     const char *arg2n;
     133             :     int         result;
     134             : 
     135             :     Assert(locale->provider == COLLPROVIDER_LIBC);
     136             : 
     137             : #ifdef WIN32
     138             :     /* check for this case before doing the work for nul-termination */
     139             :     if (GetDatabaseEncoding() == PG_UTF8)
     140             :         return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
     141             : #endif                          /* WIN32 */
     142             : 
     143    24784284 :     if (bufsize1 + bufsize2 > TEXTBUFLEN)   /* both copies share a single buffer */
     144         360 :         buf = palloc(bufsize1 + bufsize2);
     145             : 
     146             :     /* nul-terminate arguments if necessary */
     147    24784284 :     if (len1 == -1)
     148             :     {
     149    22748714 :         arg1n = arg1;
     150             :     }
     151             :     else
     152             :     {
     153     2035570 :         char       *buf1 = buf;
     154             : 
     155     2035570 :         memcpy(buf1, arg1, len1);
     156     2035570 :         buf1[len1] = '\0';
     157     2035570 :         arg1n = buf1;
     158             :     }
     159             : 
     160    24784284 :     if (len2 == -1)
     161             :     {
     162    22748714 :         arg2n = arg2;
     163             :     }
     164             :     else
     165             :     {
     166     2035570 :         char       *buf2 = buf + bufsize1;  /* second copy starts right after the first */
     167             : 
     168     2035570 :         memcpy(buf2, arg2, len2);
     169     2035570 :         buf2[len2] = '\0';
     170     2035570 :         arg2n = buf2;
     171             :     }
     172             : 
     173    24784284 :     result = strcoll_l(arg1n, arg2n, locale->info.lt);
     174             : 
     175    24784284 :     if (buf != sbuf)    /* true only if the buffer was palloc'd above */
     176         360 :         pfree(buf);
     177             : 
     178    24784284 :     return result;
     179             : }
     180             : 
     181             : /*
     182             :  * strnxfrm_libc
     183             :  *
     184             :  * NUL-terminate src, if necessary, and pass to strxfrm_l(), returning its result.
     185             :  *
     186             :  * A source length of -1 means that it's already NUL-terminated.
     187             :  */
     188             : size_t
     189         144 : strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
     190             :               pg_locale_t locale)
     191             : {
     192             :     char        sbuf[TEXTBUFLEN];
     193         144 :     char       *buf = sbuf;
     194         144 :     size_t      bufsize = srclen + 1;   /* src bytes plus terminating NUL; not used when srclen is -1 */
     195             :     size_t      result;
     196             : 
     197             :     Assert(locale->provider == COLLPROVIDER_LIBC);
     198             : 
     199         144 :     if (srclen == -1)   /* already NUL-terminated: no copy needed */
     200         144 :         return strxfrm_l(dest, src, destsize, locale->info.lt);
     201             : 
     202           0 :     if (bufsize > TEXTBUFLEN)
     203           0 :         buf = palloc(bufsize);
     204             : 
     205             :     /* nul-terminate argument */
     206           0 :     memcpy(buf, src, srclen);
     207           0 :     buf[srclen] = '\0';
     208             : 
     209           0 :     result = strxfrm_l(dest, buf, destsize, locale->info.lt);
     210             : 
     211           0 :     if (buf != sbuf)    /* true only if the buffer was palloc'd above */
     212           0 :         pfree(buf);
     213             : 
     214             :     /* if dest is defined, it should be nul-terminated */
     215             :     Assert(result >= destsize || dest[result] == '\0');
     216             : 
     217           0 :     return result;
     218             : }
     219             : 
     220             : /*
     221             :  * strncoll_libc_win32_utf8
     222             :  *
     223             :  * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
     224             :  * invoke wcscoll_l(), returning its result.  Conversion or comparison failure raises an ERROR.
     225             :  *
     226             :  * An input string length of -1 means that it's NUL-terminated.
     227             :  */
     228             : #ifdef WIN32
     229             : static int
     230             : strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
     231             :                          ssize_t len2, pg_locale_t locale)
     232             : {
     233             :     char        sbuf[TEXTBUFLEN];
     234             :     char       *buf = sbuf;
     235             :     char       *a1p,
     236             :                *a2p;
     237             :     int         a1len;
     238             :     int         a2len;
     239             :     int         r;
     240             :     int         result;
     241             : 
     242             :     Assert(locale->provider == COLLPROVIDER_LIBC);
     243             :     Assert(GetDatabaseEncoding() == PG_UTF8);
     244             : 
     245             :     if (len1 == -1)
     246             :         len1 = strlen(arg1);
     247             :     if (len2 == -1)
     248             :         len2 = strlen(arg2);
     249             : 
     250             :     a1len = len1 * 2 + 2;   /* buffer size in bytes; a1len / 2 wide chars are offered to the converter */
     251             :     a2len = len2 * 2 + 2;
     252             : 
     253             :     if (a1len + a2len > TEXTBUFLEN) /* both wide-char buffers share one allocation */
     254             :         buf = palloc(a1len + a2len);
     255             : 
     256             :     a1p = buf;
     257             :     a2p = buf + a1len;
     258             : 
     259             :     /* API does not work for zero-length input */
     260             :     if (len1 == 0)
     261             :         r = 0;
     262             :     else
     263             :     {
     264             :         r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
     265             :                                 (LPWSTR) a1p, a1len / 2);
     266             :         if (!r)
     267             :             ereport(ERROR,
     268             :                     (errmsg("could not convert string to UTF-16: error code %lu",
     269             :                             GetLastError())));
     270             :     }
     271             :     ((LPWSTR) a1p)[r] = 0;  /* terminate after the r converted wide chars (r is 0 for empty input) */
     272             : 
     273             :     if (len2 == 0)
     274             :         r = 0;
     275             :     else
     276             :     {
     277             :         r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
     278             :                                 (LPWSTR) a2p, a2len / 2);
     279             :         if (!r)
     280             :             ereport(ERROR,
     281             :                     (errmsg("could not convert string to UTF-16: error code %lu",
     282             :                             GetLastError())));
     283             :     }
     284             :     ((LPWSTR) a2p)[r] = 0;
     285             : 
     286             :     errno = 0;  /* cleared first because the error message below prints it with %m */
     287             :     result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
     288             :     if (result == 2147483647)   /* _NLSCMPERROR; missing from mingw headers */
     289             :         ereport(ERROR,
     290             :                 (errmsg("could not compare Unicode strings: %m")));
     291             : 
     292             :     if (buf != sbuf)    /* true only if the buffer was palloc'd above */
     293             :         pfree(buf);
     294             : 
     295             :     return result;
     296             : }
     297             : #endif                          /* WIN32 */
     298             : 
     299             : /* simple subroutine for reporting errors from newlocale() or _create_locale(); callers in this file reset errno to 0 before the failing call */
     300             : static void
     301           0 : report_newlocale_failure(const char *localename)
     302             : {
     303             :     int         save_errno;
     304             : 
     305             :     /*
     306             :      * Windows doesn't provide any useful error indication from
     307             :      * _create_locale(), and BSD-derived platforms don't seem to feel they
     308             :      * need to set errno either (even though POSIX is pretty clear that
     309             :      * newlocale should do so).  So, if errno hasn't been set, assume ENOENT
     310             :      * is what to report.
     311             :      */
     312           0 :     if (errno == 0)
     313           0 :         errno = ENOENT;
     314             : 
     315             :     /*
     316             :      * ENOENT means "no such locale", not "no such file", so clarify that
     317             :      * errno with an errdetail message.
     318             :      */
     319           0 :     save_errno = errno;         /* auxiliary funcs might change errno */
     320           0 :     ereport(ERROR,
     321             :             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     322             :              errmsg("could not create locale \"%s\": %m",
     323             :                     localename),
     324             :              (save_errno == ENOENT ?    /* the detail is attached only for the "no such locale" case */
     325             :               errdetail("The operating system could not find any locale data for the locale name \"%s\".",
     326             :                         localename) : 0)));
     327             : }
     328             : 
     329             : /*
     330             :  * POSIX doesn't define _l-variants of these functions, but several systems
     331             :  * have them.  We provide our own replacements here, compiled only when HAVE_MBSTOWCS_L / HAVE_WCSTOMBS_L is not defined.
     332             :  */
     333             : #ifndef HAVE_MBSTOWCS_L
     334             : static size_t
     335      864782 : mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
     336             : {
     337             : #ifdef WIN32
     338             :     return _mbstowcs_l(dest, src, n, loc);
     339             : #else
     340             :     size_t      result;
     341      864782 :     locale_t    save_locale = uselocale(loc);   /* keep uselocale()'s result so it can be restored below */
     342             : 
     343      864782 :     result = mbstowcs(dest, src, n);
     344      864782 :     uselocale(save_locale);     /* put back the saved locale before returning */
     345      864782 :     return result;
     346             : #endif
     347             : }
     348             : #endif
     349             : #ifndef HAVE_WCSTOMBS_L /* replacement wcstombs_l() for systems that lack one */
     350             : static size_t
     351      864782 : wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
     352             : {
     353             : #ifdef WIN32
     354             :     return _wcstombs_l(dest, src, n, loc);
     355             : #else
     356             :     size_t      result;
     357      864782 :     locale_t    save_locale = uselocale(loc);   /* keep uselocale()'s result so it can be restored below */
     358             : 
     359      864782 :     result = wcstombs(dest, src, n);
     360      864782 :     uselocale(save_locale);     /* put back the saved locale before returning */
     361      864782 :     return result;
     362             : #endif
     363             : }
     364             : #endif
     365             : 
     366             : /*
     367             :  * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
     368             :  * Therefore we keep them here rather than with the mbutils code.
     369             :  */
     370             : 
     371             : /*
     372             :  * wchar2char --- convert wide characters to multibyte format
     373             :  *
     374             :  * This has the same API as the standard wcstombs_l() function; in particular,
     375             :  * tolen is the maximum number of bytes to store at *to, and *from must be
     376             :  * zero-terminated.  The output will be zero-terminated iff there is room.  Returns 0 without converting anything if tolen is 0.
     377             :  */
     378             : size_t
     379     1138778 : wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
     380             : {
     381             :     size_t      result;
     382             : 
     383     1138778 :     if (tolen == 0)
     384           0 :         return 0;
     385             : 
     386             : #ifdef WIN32
     387             : 
     388             :     /*
     389             :      * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
     390             :      * for some reason mbstowcs and wcstombs won't do this for us, so we use
     391             :      * WideCharToMultiByte() here (and MultiByteToWideChar() in char2wchar()).
     392             :      */
     393             :     if (GetDatabaseEncoding() == PG_UTF8)
     394             :     {
     395             :         result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
     396             :                                      NULL, NULL);
     397             :         /* A zero return is failure */
     398             :         if (result <= 0)    /* result is a size_t, so this only matches 0 */
     399             :             result = -1;    /* wraps to (size_t) -1 */
     400             :         else
     401             :         {
     402             :             Assert(result <= tolen);
     403             :             /* Microsoft counts the zero terminator in the result */
     404             :             result--;
     405             :         }
     406             :     }
     407             :     else
     408             : #endif                          /* WIN32 */
     409     1138778 :     if (locale == (pg_locale_t) 0)
     410             :     {
     411             :         /* Use wcstombs directly for the default locale */
     412      273996 :         result = wcstombs(to, from, tolen);
     413             :     }
     414             :     else
     415             :     {
     416             :         /* Use wcstombs_l for nondefault locales */
     417      864782 :         result = wcstombs_l(to, from, tolen, locale->info.lt);
     418             :     }
     419             : 
     420     1138778 :     return result;
     421             : }
     422             : 
     423             : /*
     424             :  * char2wchar --- convert multibyte characters to wide characters
     425             :  *
     426             :  * This has almost the API of mbstowcs_l(), except that *from need not be
     427             :  * null-terminated; instead, the number of input bytes is specified as
     428             :  * fromlen.  Also, we ereport() rather than returning -1 for invalid
     429             :  * input encoding.  tolen is the maximum number of wchar_t's to store at *to.
     430             :  * The output will be zero-terminated iff there is room.  Returns 0 without converting anything if tolen is 0.
     431             :  */
     432             : size_t
     433     1154654 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
     434             :            pg_locale_t locale)
     435             : {
     436             :     size_t      result;
     437             : 
     438     1154654 :     if (tolen == 0)
     439           0 :         return 0;
     440             : 
     441             : #ifdef WIN32
     442             :     /* See WIN32 "Unicode" comment above */
     443             :     if (GetDatabaseEncoding() == PG_UTF8)
     444             :     {
     445             :         /* Win32 API does not work for zero-length input */
     446             :         if (fromlen == 0)
     447             :             result = 0;
     448             :         else
     449             :         {
     450             :             result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);   /* tolen - 1 leaves room for the terminator added below */
     451             :             /* A zero return is failure */
     452             :             if (result == 0)
     453             :                 result = -1;    /* wraps to (size_t) -1; handled by the common error path below */
     454             :         }
     455             : 
     456             :         if (result != -1)
     457             :         {
     458             :             Assert(result < tolen);
     459             :             /* Append trailing null wchar (MultiByteToWideChar() does not) */
     460             :             to[result] = 0;
     461             :         }
     462             :     }
     463             :     else
     464             : #endif                          /* WIN32 */
     465             :     {
     466             :         /* mbstowcs requires ending '\0', so work on a NUL-terminated copy of the input */
     467     1154654 :         char       *str = pnstrdup(from, fromlen);
     468             : 
     469     1154654 :         if (locale == (pg_locale_t) 0)
     470             :         {
     471             :             /* Use mbstowcs directly for the default locale */
     472      289872 :             result = mbstowcs(to, str, tolen);
     473             :         }
     474             :         else
     475             :         {
     476             :             /* Use mbstowcs_l for nondefault locales */
     477      864782 :             result = mbstowcs_l(to, str, tolen, locale->info.lt);
     478             :         }
     479             : 
     480     1154654 :         pfree(str);
     481             :     }
     482             : 
     483     1154654 :     if (result == -1)
     484             :     {
     485             :         /*
     486             :          * Invalid multibyte character encountered.  We try to give a useful
     487             :          * error message by letting pg_verifymbstr check the string.  But it's
     488             :          * possible that the string is OK to us, and not OK to mbstowcs ---
     489             :          * this suggests that the LC_CTYPE locale is different from the
     490             :          * database encoding.  Give a generic error message if pg_verifymbstr
     491             :          * can't find anything wrong.
     492             :          */
     493           0 :         pg_verifymbstr(from, fromlen, false);   /* might not return */
     494             :         /* but if it does ... */
     495           0 :         ereport(ERROR,
     496             :                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     497             :                  errmsg("invalid multibyte character for locale"),
     498             :                  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
     499             :     }
     500             : 
     501     1154654 :     return result;
     502             : }

Generated by: LCOV version 1.14