LCOV - PostgreSQL 18devel - src/port/chklocale.c

LCOV - code coverage report

Current view:	top level - src/port - chklocale.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 18devel	Lines:	17	25	68.0 %
Date:	2025-01-18 05:15:39	Functions:	1	1	100.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * chklocale.c
       4             :  *      Functions for handling locale-related info
       5             :  *
       6             :  *
       7             :  * Copyright (c) 1996-2025, PostgreSQL Global Development Group
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/port/chklocale.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #ifndef FRONTEND
      17             : #include "postgres.h"
      18             : #else
      19             : #include "postgres_fe.h"
      20             : #endif
      21             : 
      22             : #ifndef WIN32
      23             : #include <langinfo.h>
      24             : #endif
      25             : 
      26             : #include "mb/pg_wchar.h"
      27             : 
      28             : 
      29             : /*
      30             :  * This table needs to recognize all the CODESET spellings for supported
      31             :  * backend encodings, as well as frontend-only encodings where possible
      32             :  * (the latter case is currently only needed for initdb to recognize
      33             :  * error situations).  On Windows, we rely on entries for codepage
      34             :  * numbers (CPnnn).
      35             :  *
      36             :  * Note that we search the table with pg_strcasecmp(), so variant
      37             :  * capitalizations don't need their own entries.
      38             :  */
      39             : struct encoding_match
      40             : {
      41             :     enum pg_enc pg_enc_code;
      42             :     const char *system_enc_name;
      43             : };
      44             : 
      45             : static const struct encoding_match encoding_match_list[] = {
      46             :     {PG_EUC_JP, "EUC-JP"},
      47             :     {PG_EUC_JP, "eucJP"},
      48             :     {PG_EUC_JP, "IBM-eucJP"},
      49             :     {PG_EUC_JP, "sdeckanji"},
      50             :     {PG_EUC_JP, "CP20932"},
      51             : 
      52             :     {PG_EUC_CN, "EUC-CN"},
      53             :     {PG_EUC_CN, "eucCN"},
      54             :     {PG_EUC_CN, "IBM-eucCN"},
      55             :     {PG_EUC_CN, "GB2312"},
      56             :     {PG_EUC_CN, "dechanzi"},
      57             :     {PG_EUC_CN, "CP20936"},
      58             : 
      59             :     {PG_EUC_KR, "EUC-KR"},
      60             :     {PG_EUC_KR, "eucKR"},
      61             :     {PG_EUC_KR, "IBM-eucKR"},
      62             :     {PG_EUC_KR, "deckorean"},
      63             :     {PG_EUC_KR, "5601"},
      64             :     {PG_EUC_KR, "CP51949"},
      65             : 
      66             :     {PG_EUC_TW, "EUC-TW"},
      67             :     {PG_EUC_TW, "eucTW"},
      68             :     {PG_EUC_TW, "IBM-eucTW"},
      69             :     {PG_EUC_TW, "cns11643"},
      70             :     /* No codepage for EUC-TW ? */
      71             : 
      72             :     {PG_UTF8, "UTF-8"},
      73             :     {PG_UTF8, "utf8"},
      74             :     {PG_UTF8, "CP65001"},
      75             : 
      76             :     {PG_LATIN1, "ISO-8859-1"},
      77             :     {PG_LATIN1, "ISO8859-1"},
      78             :     {PG_LATIN1, "iso88591"},
      79             :     {PG_LATIN1, "CP28591"},
      80             : 
      81             :     {PG_LATIN2, "ISO-8859-2"},
      82             :     {PG_LATIN2, "ISO8859-2"},
      83             :     {PG_LATIN2, "iso88592"},
      84             :     {PG_LATIN2, "CP28592"},
      85             : 
      86             :     {PG_LATIN3, "ISO-8859-3"},
      87             :     {PG_LATIN3, "ISO8859-3"},
      88             :     {PG_LATIN3, "iso88593"},
      89             :     {PG_LATIN3, "CP28593"},
      90             : 
      91             :     {PG_LATIN4, "ISO-8859-4"},
      92             :     {PG_LATIN4, "ISO8859-4"},
      93             :     {PG_LATIN4, "iso88594"},
      94             :     {PG_LATIN4, "CP28594"},
      95             : 
      96             :     {PG_LATIN5, "ISO-8859-9"},
      97             :     {PG_LATIN5, "ISO8859-9"},
      98             :     {PG_LATIN5, "iso88599"},
      99             :     {PG_LATIN5, "CP28599"},
     100             : 
     101             :     {PG_LATIN6, "ISO-8859-10"},
     102             :     {PG_LATIN6, "ISO8859-10"},
     103             :     {PG_LATIN6, "iso885910"},
     104             : 
     105             :     {PG_LATIN7, "ISO-8859-13"},
     106             :     {PG_LATIN7, "ISO8859-13"},
     107             :     {PG_LATIN7, "iso885913"},
     108             : 
     109             :     {PG_LATIN8, "ISO-8859-14"},
     110             :     {PG_LATIN8, "ISO8859-14"},
     111             :     {PG_LATIN8, "iso885914"},
     112             : 
     113             :     {PG_LATIN9, "ISO-8859-15"},
     114             :     {PG_LATIN9, "ISO8859-15"},
     115             :     {PG_LATIN9, "iso885915"},
     116             :     {PG_LATIN9, "CP28605"},
     117             : 
     118             :     {PG_LATIN10, "ISO-8859-16"},
     119             :     {PG_LATIN10, "ISO8859-16"},
     120             :     {PG_LATIN10, "iso885916"},
     121             : 
     122             :     {PG_KOI8R, "KOI8-R"},
     123             :     {PG_KOI8R, "CP20866"},
     124             : 
     125             :     {PG_KOI8U, "KOI8-U"},
     126             :     {PG_KOI8U, "CP21866"},
     127             : 
     128             :     {PG_WIN866, "CP866"},
     129             :     {PG_WIN874, "CP874"},
     130             :     {PG_WIN1250, "CP1250"},
     131             :     {PG_WIN1251, "CP1251"},
     132             :     {PG_WIN1251, "ansi-1251"},
     133             :     {PG_WIN1252, "CP1252"},
     134             :     {PG_WIN1253, "CP1253"},
     135             :     {PG_WIN1254, "CP1254"},
     136             :     {PG_WIN1255, "CP1255"},
     137             :     {PG_WIN1256, "CP1256"},
     138             :     {PG_WIN1257, "CP1257"},
     139             :     {PG_WIN1258, "CP1258"},
     140             : 
     141             :     {PG_ISO_8859_5, "ISO-8859-5"},
     142             :     {PG_ISO_8859_5, "ISO8859-5"},
     143             :     {PG_ISO_8859_5, "iso88595"},
     144             :     {PG_ISO_8859_5, "CP28595"},
     145             : 
     146             :     {PG_ISO_8859_6, "ISO-8859-6"},
     147             :     {PG_ISO_8859_6, "ISO8859-6"},
     148             :     {PG_ISO_8859_6, "iso88596"},
     149             :     {PG_ISO_8859_6, "CP28596"},
     150             : 
     151             :     {PG_ISO_8859_7, "ISO-8859-7"},
     152             :     {PG_ISO_8859_7, "ISO8859-7"},
     153             :     {PG_ISO_8859_7, "iso88597"},
     154             :     {PG_ISO_8859_7, "CP28597"},
     155             : 
     156             :     {PG_ISO_8859_8, "ISO-8859-8"},
     157             :     {PG_ISO_8859_8, "ISO8859-8"},
     158             :     {PG_ISO_8859_8, "iso88598"},
     159             :     {PG_ISO_8859_8, "CP28598"},
     160             : 
     161             :     {PG_SJIS, "SJIS"},
     162             :     {PG_SJIS, "PCK"},
     163             :     {PG_SJIS, "CP932"},
     164             :     {PG_SJIS, "SHIFT_JIS"},
     165             : 
     166             :     {PG_BIG5, "BIG5"},
     167             :     {PG_BIG5, "BIG5HKSCS"},
     168             :     {PG_BIG5, "Big5-HKSCS"},
     169             :     {PG_BIG5, "CP950"},
     170             : 
     171             :     {PG_GBK, "GBK"},
     172             :     {PG_GBK, "CP936"},
     173             : 
     174             :     {PG_UHC, "UHC"},
     175             :     {PG_UHC, "CP949"},
     176             : 
     177             :     {PG_JOHAB, "JOHAB"},
     178             :     {PG_JOHAB, "CP1361"},
     179             : 
     180             :     {PG_GB18030, "GB18030"},
     181             :     {PG_GB18030, "CP54936"},
     182             : 
     183             :     {PG_SHIFT_JIS_2004, "SJIS_2004"},
     184             : 
     185             :     {PG_SQL_ASCII, "US-ASCII"},
     186             : 
     187             :     {PG_SQL_ASCII, NULL}        /* end marker */
     188             : };
     189             : 
     190             : #ifdef WIN32
     191             : /*
     192             :  * On Windows, use CP<code page number> instead of CODESET.
     193             :  *
     194             :  * This routine uses GetLocaleInfoEx() to parse short locale names like
     195             :  * "de-DE", "fr-FR", etc.  If those cannot be parsed correctly process falls
     196             :  * back to the pre-VS-2010 manual parsing done with using
     197             :  * <Language>_<Country>.<CodePage> as a base.
     198             :  *
     199             :  * Returns a malloc()'d string for the caller to free.
     200             :  */
     201             : static char *
     202             : win32_get_codeset(const char *ctype)
     203             : {
     204             :     char       *r = NULL;
     205             :     char       *codepage;
     206             :     uint32      cp;
     207             :     WCHAR       wctype[LOCALE_NAME_MAX_LENGTH];
     208             : 
     209             :     memset(wctype, 0, sizeof(wctype));
     210             :     MultiByteToWideChar(CP_ACP, 0, ctype, -1, wctype, LOCALE_NAME_MAX_LENGTH);
     211             : 
     212             :     if (GetLocaleInfoEx(wctype,
     213             :                         LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
     214             :                         (LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
     215             :     {
     216             :         r = malloc(16);         /* excess */
     217             :         if (r != NULL)
     218             :         {
     219             :             /*
     220             :              * If the return value is CP_ACP that means no ANSI code page is
     221             :              * available, so only Unicode can be used for the locale.
     222             :              */
     223             :             if (cp == CP_ACP)
     224             :                 strcpy(r, "utf8");
     225             :             else
     226             :                 sprintf(r, "CP%u", cp);
     227             :         }
     228             :     }
     229             :     else
     230             :     {
     231             :         /*
     232             :          * Locale format on Win32 is <Language>_<Country>.<CodePage>.  For
     233             :          * example, English_United States.1252.  If we see digits after the
     234             :          * last dot, assume it's a codepage number.  Otherwise, we might be
     235             :          * dealing with a Unix-style locale string; Windows' setlocale() will
     236             :          * take those even though GetLocaleInfoEx() won't, so we end up here.
     237             :          * In that case, just return what's after the last dot and hope we can
     238             :          * find it in our table.
     239             :          */
     240             :         codepage = strrchr(ctype, '.');
     241             :         if (codepage != NULL)
     242             :         {
     243             :             size_t      ln;
     244             : 
     245             :             codepage++;
     246             :             ln = strlen(codepage);
     247             :             r = malloc(ln + 3);
     248             :             if (r != NULL)
     249             :             {
     250             :                 if (strspn(codepage, "0123456789") == ln)
     251             :                     sprintf(r, "CP%s", codepage);
     252             :                 else
     253             :                     strcpy(r, codepage);
     254             :             }
     255             :         }
     256             :     }
     257             : 
     258             :     return r;
     259             : }
     260             : 
     261             : #ifndef FRONTEND
     262             : /*
     263             :  * Given a Windows code page identifier, find the corresponding PostgreSQL
     264             :  * encoding.  Issue a warning and return -1 if none found.
     265             :  */
     266             : int
     267             : pg_codepage_to_encoding(UINT cp)
     268             : {
     269             :     char        sys[16];
     270             :     int         i;
     271             : 
     272             :     sprintf(sys, "CP%u", cp);
     273             : 
     274             :     /* Check the table */
     275             :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     276             :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     277             :             return encoding_match_list[i].pg_enc_code;
     278             : 
     279             :     ereport(WARNING,
     280             :             (errmsg("could not determine encoding for codeset \"%s\"", sys)));
     281             : 
     282             :     return -1;
     283             : }
     284             : #endif
     285             : #endif                          /* WIN32 */
     286             : 
     287             : /*
     288             :  * Given a setting for LC_CTYPE, return the Postgres ID of the associated
     289             :  * encoding, if we can determine it.  Return -1 if we can't determine it.
     290             :  *
     291             :  * Pass in NULL to get the encoding for the current locale setting.
     292             :  * Pass "" to get the encoding selected by the server's environment.
     293             :  *
     294             :  * If the result is PG_SQL_ASCII, callers should treat it as being compatible
     295             :  * with any desired encoding.
     296             :  *
     297             :  * If running in the backend and write_message is false, this function must
     298             :  * cope with the possibility that elog() and palloc() are not yet usable.
     299             :  */
     300             : int
     301       36542 : pg_get_encoding_from_locale(const char *ctype, bool write_message)
     302             : {
     303             :     char       *sys;
     304             :     int         i;
     305             : 
     306             : #ifndef WIN32
     307             :     locale_t    loc;
     308             : #endif
     309             : 
     310             :     /* Get the CODESET property, and also LC_CTYPE if not passed in */
     311       36542 :     if (!ctype)
     312       34366 :         ctype = setlocale(LC_CTYPE, NULL);
     313             : 
     314             : 
     315             :     /* If locale is C or POSIX, we can allow all encodings */
     316       71456 :     if (pg_strcasecmp(ctype, "C") == 0 ||
     317       34914 :         pg_strcasecmp(ctype, "POSIX") == 0)
     318        1762 :         return PG_SQL_ASCII;
     319             : 
     320             : 
     321             : #ifndef WIN32
     322       34780 :     loc = newlocale(LC_CTYPE_MASK, ctype, (locale_t) 0);
     323       34780 :     if (loc == (locale_t) 0)
     324           0 :         return -1;              /* bogus ctype passed in? */
     325             : 
     326       34780 :     sys = nl_langinfo_l(CODESET, loc);
     327       34780 :     if (sys)
     328       34780 :         sys = strdup(sys);
     329             : 
     330       34780 :     freelocale(loc);
     331             : #else
     332             :     sys = win32_get_codeset(ctype);
     333             : #endif
     334             : 
     335       34780 :     if (!sys)
     336           0 :         return -1;              /* out of memory; unlikely */
     337             : 
     338             :     /* Check the table */
     339      765160 :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     340             :     {
     341      765160 :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     342             :         {
     343       34780 :             free(sys);
     344       34780 :             return encoding_match_list[i].pg_enc_code;
     345             :         }
     346             :     }
     347             : 
     348             :     /* Special-case kluges for particular platforms go here */
     349             : 
     350             : #ifdef __darwin__
     351             : 
     352             :     /*
     353             :      * Current macOS has many locales that report an empty string for CODESET,
     354             :      * but they all seem to actually use UTF-8.
     355             :      */
     356             :     if (strlen(sys) == 0)
     357             :     {
     358             :         free(sys);
     359             :         return PG_UTF8;
     360             :     }
     361             : #endif
     362             : 
     363             :     /*
     364             :      * We print a warning if we got a CODESET string but couldn't recognize
     365             :      * it.  This means we need another entry in the table.
     366             :      */
     367           0 :     if (write_message)
     368             :     {
     369             : #ifdef FRONTEND
     370           0 :         fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
     371             :                 ctype, sys);
     372             :         /* keep newline separate so there's only one translatable string */
     373           0 :         fputc('\n', stderr);
     374             : #else
     375           0 :         ereport(WARNING,
     376             :                 (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
     377             :                         ctype, sys)));
     378             : #endif
     379             :     }
     380             : 
     381           0 :     free(sys);
     382           0 :     return -1;
     383             : }

Generated by: LCOV version 1.14