LCOV - code coverage report
Current view: top level - src/port - chklocale.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 68.0 % 25 17
Test Date: 2026-03-03 10:15:07 Functions: 100.0 % 1 1
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * chklocale.c
       4              :  *      Functions for handling locale-related info
       5              :  *
       6              :  *
       7              :  * Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/port/chklocale.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : 
      16              : #ifndef FRONTEND
      17              : #include "postgres.h"
      18              : #else
      19              : #include "postgres_fe.h"
      20              : #endif
      21              : 
      22              : #ifndef WIN32
      23              : #include <langinfo.h>
      24              : #endif
      25              : 
      26              : #include "mb/pg_wchar.h"
      27              : 
      28              : 
      29              : /*
      30              :  * This table needs to recognize all the CODESET spellings for supported
      31              :  * backend encodings, as well as frontend-only encodings where possible
      32              :  * (the latter case is currently only needed for initdb to recognize
      33              :  * error situations).  On Windows, we rely on entries for codepage
      34              :  * numbers (CPnnn).
      35              :  *
      36              :  * Note that we search the table with pg_strcasecmp(), so variant
      37              :  * capitalizations don't need their own entries.
      38              :  */
      39              : struct encoding_match
      40              : {
      41              :     enum pg_enc pg_enc_code;
      42              :     const char *system_enc_name;
      43              : };
      44              : 
      45              : static const struct encoding_match encoding_match_list[] = {
      46              :     {PG_EUC_JP, "EUC-JP"},
      47              :     {PG_EUC_JP, "eucJP"},
      48              :     {PG_EUC_JP, "IBM-eucJP"},
      49              :     {PG_EUC_JP, "sdeckanji"},
      50              :     {PG_EUC_JP, "CP20932"},
      51              : 
      52              :     {PG_EUC_CN, "EUC-CN"},
      53              :     {PG_EUC_CN, "eucCN"},
      54              :     {PG_EUC_CN, "IBM-eucCN"},
      55              :     {PG_EUC_CN, "GB2312"},
      56              :     {PG_EUC_CN, "dechanzi"},
      57              :     {PG_EUC_CN, "CP20936"},
      58              : 
      59              :     {PG_EUC_KR, "EUC-KR"},
      60              :     {PG_EUC_KR, "eucKR"},
      61              :     {PG_EUC_KR, "IBM-eucKR"},
      62              :     {PG_EUC_KR, "deckorean"},
      63              :     {PG_EUC_KR, "5601"},
      64              :     {PG_EUC_KR, "CP51949"},
      65              : 
      66              :     {PG_EUC_TW, "EUC-TW"},
      67              :     {PG_EUC_TW, "eucTW"},
      68              :     {PG_EUC_TW, "IBM-eucTW"},
      69              :     {PG_EUC_TW, "cns11643"},
      70              :     /* No codepage for EUC-TW ? */
      71              : 
      72              :     {PG_UTF8, "UTF-8"},
      73              :     {PG_UTF8, "utf8"},
      74              :     {PG_UTF8, "CP65001"},
      75              : 
      76              :     {PG_LATIN1, "ISO-8859-1"},
      77              :     {PG_LATIN1, "ISO8859-1"},
      78              :     {PG_LATIN1, "iso88591"},
      79              :     {PG_LATIN1, "CP28591"},
      80              : 
      81              :     {PG_LATIN2, "ISO-8859-2"},
      82              :     {PG_LATIN2, "ISO8859-2"},
      83              :     {PG_LATIN2, "iso88592"},
      84              :     {PG_LATIN2, "CP28592"},
      85              : 
      86              :     {PG_LATIN3, "ISO-8859-3"},
      87              :     {PG_LATIN3, "ISO8859-3"},
      88              :     {PG_LATIN3, "iso88593"},
      89              :     {PG_LATIN3, "CP28593"},
      90              : 
      91              :     {PG_LATIN4, "ISO-8859-4"},
      92              :     {PG_LATIN4, "ISO8859-4"},
      93              :     {PG_LATIN4, "iso88594"},
      94              :     {PG_LATIN4, "CP28594"},
      95              : 
      96              :     {PG_LATIN5, "ISO-8859-9"},
      97              :     {PG_LATIN5, "ISO8859-9"},
      98              :     {PG_LATIN5, "iso88599"},
      99              :     {PG_LATIN5, "CP28599"},
     100              : 
     101              :     {PG_LATIN6, "ISO-8859-10"},
     102              :     {PG_LATIN6, "ISO8859-10"},
     103              :     {PG_LATIN6, "iso885910"},
     104              : 
     105              :     {PG_LATIN7, "ISO-8859-13"},
     106              :     {PG_LATIN7, "ISO8859-13"},
     107              :     {PG_LATIN7, "iso885913"},
     108              : 
     109              :     {PG_LATIN8, "ISO-8859-14"},
     110              :     {PG_LATIN8, "ISO8859-14"},
     111              :     {PG_LATIN8, "iso885914"},
     112              : 
     113              :     {PG_LATIN9, "ISO-8859-15"},
     114              :     {PG_LATIN9, "ISO8859-15"},
     115              :     {PG_LATIN9, "iso885915"},
     116              :     {PG_LATIN9, "CP28605"},
     117              : 
     118              :     {PG_LATIN10, "ISO-8859-16"},
     119              :     {PG_LATIN10, "ISO8859-16"},
     120              :     {PG_LATIN10, "iso885916"},
     121              : 
     122              :     {PG_KOI8R, "KOI8-R"},
     123              :     {PG_KOI8R, "CP20866"},
     124              : 
     125              :     {PG_KOI8U, "KOI8-U"},
     126              :     {PG_KOI8U, "CP21866"},
     127              : 
     128              :     {PG_WIN866, "CP866"},
     129              :     {PG_WIN874, "CP874"},
     130              :     {PG_WIN1250, "CP1250"},
     131              :     {PG_WIN1251, "CP1251"},
     132              :     {PG_WIN1251, "ansi-1251"},
     133              :     {PG_WIN1252, "CP1252"},
     134              :     {PG_WIN1253, "CP1253"},
     135              :     {PG_WIN1254, "CP1254"},
     136              :     {PG_WIN1255, "CP1255"},
     137              :     {PG_WIN1256, "CP1256"},
     138              :     {PG_WIN1257, "CP1257"},
     139              :     {PG_WIN1258, "CP1258"},
     140              : 
     141              :     {PG_ISO_8859_5, "ISO-8859-5"},
     142              :     {PG_ISO_8859_5, "ISO8859-5"},
     143              :     {PG_ISO_8859_5, "iso88595"},
     144              :     {PG_ISO_8859_5, "CP28595"},
     145              : 
     146              :     {PG_ISO_8859_6, "ISO-8859-6"},
     147              :     {PG_ISO_8859_6, "ISO8859-6"},
     148              :     {PG_ISO_8859_6, "iso88596"},
     149              :     {PG_ISO_8859_6, "CP28596"},
     150              : 
     151              :     {PG_ISO_8859_7, "ISO-8859-7"},
     152              :     {PG_ISO_8859_7, "ISO8859-7"},
     153              :     {PG_ISO_8859_7, "iso88597"},
     154              :     {PG_ISO_8859_7, "CP28597"},
     155              : 
     156              :     {PG_ISO_8859_8, "ISO-8859-8"},
     157              :     {PG_ISO_8859_8, "ISO8859-8"},
     158              :     {PG_ISO_8859_8, "iso88598"},
     159              :     {PG_ISO_8859_8, "CP28598"},
     160              : 
     161              :     {PG_SJIS, "SJIS"},
     162              :     {PG_SJIS, "PCK"},
     163              :     {PG_SJIS, "CP932"},
     164              :     {PG_SJIS, "SHIFT_JIS"},
     165              : 
     166              :     {PG_BIG5, "BIG5"},
     167              :     {PG_BIG5, "BIG5HKSCS"},
     168              :     {PG_BIG5, "Big5-HKSCS"},
     169              :     {PG_BIG5, "CP950"},
     170              : 
     171              :     {PG_GBK, "GBK"},
     172              :     {PG_GBK, "CP936"},
     173              : 
     174              :     {PG_UHC, "UHC"},
     175              :     {PG_UHC, "CP949"},
     176              : 
     177              :     {PG_JOHAB, "JOHAB"},
     178              :     {PG_JOHAB, "CP1361"},
     179              : 
     180              :     {PG_GB18030, "GB18030"},
     181              :     {PG_GB18030, "CP54936"},
     182              : 
     183              :     {PG_SHIFT_JIS_2004, "SJIS_2004"},
     184              : 
     185              :     {PG_SQL_ASCII, "US-ASCII"},
     186              : 
     187              :     {PG_SQL_ASCII, NULL}        /* end marker */
     188              : };
     189              : 
     190              : #ifdef WIN32
     191              : /*
     192              :  * On Windows, use CP<code page number> instead of CODESET.
     193              :  *
     194              :  * This routine uses GetLocaleInfoEx() to parse short locale names like
     195              :  * "de-DE", "fr-FR", etc.  If those cannot be parsed correctly process falls
     196              :  * back to the pre-VS-2010 manual parsing done with using
     197              :  * <Language>_<Country>.<CodePage> as a base.
     198              :  *
     199              :  * Returns a malloc()'d string for the caller to free.
     200              :  */
     201              : static char *
     202              : win32_get_codeset(const char *ctype)
     203              : {
     204              :     char       *r = NULL;
     205              :     const char *codepage;
     206              :     uint32      cp;
     207              :     WCHAR       wctype[LOCALE_NAME_MAX_LENGTH];
     208              : 
     209              :     memset(wctype, 0, sizeof(wctype));
     210              :     MultiByteToWideChar(CP_ACP, 0, ctype, -1, wctype, LOCALE_NAME_MAX_LENGTH);
     211              : 
     212              :     if (GetLocaleInfoEx(wctype,
     213              :                         LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
     214              :                         (LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
     215              :     {
     216              :         r = malloc(16);         /* excess */
     217              :         if (r != NULL)
     218              :         {
     219              :             /*
     220              :              * If the return value is CP_ACP that means no ANSI code page is
     221              :              * available, so only Unicode can be used for the locale.
     222              :              */
     223              :             if (cp == CP_ACP)
     224              :                 strcpy(r, "utf8");
     225              :             else
     226              :                 sprintf(r, "CP%u", cp);
     227              :         }
     228              :     }
     229              :     else
     230              :     {
     231              :         /*
     232              :          * Locale format on Win32 is <Language>_<Country>.<CodePage>.  For
     233              :          * example, English_United States.1252.  If we see digits after the
     234              :          * last dot, assume it's a codepage number.  Otherwise, we might be
     235              :          * dealing with a Unix-style locale string; Windows' setlocale() will
     236              :          * take those even though GetLocaleInfoEx() won't, so we end up here.
     237              :          * In that case, just return what's after the last dot and hope we can
     238              :          * find it in our table.
     239              :          */
     240              :         codepage = strrchr(ctype, '.');
     241              :         if (codepage != NULL)
     242              :         {
     243              :             size_t      ln;
     244              : 
     245              :             codepage++;
     246              :             ln = strlen(codepage);
     247              :             r = malloc(ln + 3);
     248              :             if (r != NULL)
     249              :             {
     250              :                 if (strspn(codepage, "0123456789") == ln)
     251              :                     sprintf(r, "CP%s", codepage);
     252              :                 else
     253              :                     strcpy(r, codepage);
     254              :             }
     255              :         }
     256              :     }
     257              : 
     258              :     return r;
     259              : }
     260              : 
     261              : #ifndef FRONTEND
     262              : /*
     263              :  * Given a Windows code page identifier, find the corresponding PostgreSQL
     264              :  * encoding.  Issue a warning and return -1 if none found.
     265              :  */
     266              : int
     267              : pg_codepage_to_encoding(UINT cp)
     268              : {
     269              :     char        sys[16];
     270              :     int         i;
     271              : 
     272              :     sprintf(sys, "CP%u", cp);
     273              : 
     274              :     /* Check the table */
     275              :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     276              :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     277              :             return encoding_match_list[i].pg_enc_code;
     278              : 
     279              :     ereport(WARNING,
     280              :             (errmsg("could not determine encoding for codeset \"%s\"", sys)));
     281              : 
     282              :     return -1;
     283              : }
     284              : #endif
     285              : #endif                          /* WIN32 */
     286              : 
     287              : /*
     288              :  * Given a setting for LC_CTYPE, return the Postgres ID of the associated
     289              :  * encoding, if we can determine it.  Return -1 if we can't determine it.
     290              :  *
     291              :  * Pass in NULL to get the encoding for the current locale setting.
     292              :  * Pass "" to get the encoding selected by the server's environment.
     293              :  *
     294              :  * If the result is PG_SQL_ASCII, callers should treat it as being compatible
     295              :  * with any desired encoding.
     296              :  *
     297              :  * If running in the backend and write_message is false, this function must
     298              :  * cope with the possibility that elog() and palloc() are not yet usable.
     299              :  */
     300              : int
     301        20581 : pg_get_encoding_from_locale(const char *ctype, bool write_message)
     302              : {
     303              :     char       *sys;
     304              :     int         i;
     305              : 
     306              : #ifndef WIN32
     307              :     locale_t    loc;
     308              : #endif
     309              : 
     310              :     /* Get the CODESET property, and also LC_CTYPE if not passed in */
     311        20581 :     if (!ctype)
     312        19303 :         ctype = setlocale(LC_CTYPE, NULL);
     313              : 
     314              : 
     315              :     /* If locale is C or POSIX, we can allow all encodings */
     316        40342 :     if (pg_strcasecmp(ctype, "C") == 0 ||
     317        19761 :         pg_strcasecmp(ctype, "POSIX") == 0)
     318          893 :         return PG_SQL_ASCII;
     319              : 
     320              : 
     321              : #ifndef WIN32
     322        19688 :     loc = newlocale(LC_CTYPE_MASK, ctype, (locale_t) 0);
     323        19688 :     if (loc == (locale_t) 0)
     324            0 :         return -1;              /* bogus ctype passed in? */
     325              : 
     326        19688 :     sys = nl_langinfo_l(CODESET, loc);
     327        19688 :     if (sys)
     328        19688 :         sys = strdup(sys);
     329              : 
     330        19688 :     freelocale(loc);
     331              : #else
     332              :     sys = win32_get_codeset(ctype);
     333              : #endif
     334              : 
     335        19688 :     if (!sys)
     336            0 :         return -1;              /* out of memory; unlikely */
     337              : 
     338              :     /* Check the table */
     339       433136 :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     340              :     {
     341       433136 :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     342              :         {
     343        19688 :             free(sys);
     344        19688 :             return encoding_match_list[i].pg_enc_code;
     345              :         }
     346              :     }
     347              : 
     348              :     /* Special-case kluges for particular platforms go here */
     349              : 
     350              : #ifdef __darwin__
     351              : 
     352              :     /*
     353              :      * Current macOS has many locales that report an empty string for CODESET,
     354              :      * but they all seem to actually use UTF-8.
     355              :      */
     356              :     if (strlen(sys) == 0)
     357              :     {
     358              :         free(sys);
     359              :         return PG_UTF8;
     360              :     }
     361              : #endif
     362              : 
     363              :     /*
     364              :      * We print a warning if we got a CODESET string but couldn't recognize
     365              :      * it.  This means we need another entry in the table.
     366              :      */
     367            0 :     if (write_message)
     368              :     {
     369              : #ifdef FRONTEND
     370            0 :         fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
     371              :                 ctype, sys);
     372              :         /* keep newline separate so there's only one translatable string */
     373            0 :         fputc('\n', stderr);
     374              : #else
     375            0 :         ereport(WARNING,
     376              :                 (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
     377              :                         ctype, sys)));
     378              : #endif
     379              :     }
     380              : 
     381            0 :     free(sys);
     382            0 :     return -1;
     383              : }
        

Generated by: LCOV version 2.0-1