LCOV - code coverage report
Current view: top level - src/port - chklocale.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 29 41 70.7 %
Date: 2024-04-25 05:11:31 Functions: 1 1 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * chklocale.c
       4             :  *      Functions for handling locale-related info
       5             :  *
       6             :  *
       7             :  * Copyright (c) 1996-2024, PostgreSQL Global Development Group
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/port/chklocale.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #ifndef FRONTEND
      17             : #include "postgres.h"
      18             : #else
      19             : #include "postgres_fe.h"
      20             : #endif
      21             : 
      22             : #ifdef HAVE_LANGINFO_H
      23             : #include <langinfo.h>
      24             : #endif
      25             : 
      26             : #include "mb/pg_wchar.h"
      27             : 
      28             : 
      29             : /*
      30             :  * This table needs to recognize all the CODESET spellings for supported
      31             :  * backend encodings, as well as frontend-only encodings where possible
      32             :  * (the latter case is currently only needed for initdb to recognize
      33             :  * error situations).  On Windows, we rely on entries for codepage
      34             :  * numbers (CPnnn).
      35             :  *
      36             :  * Note that we search the table with pg_strcasecmp(), so variant
      37             :  * capitalizations don't need their own entries.
      38             :  */
      39             : struct encoding_match   /* one row of the codeset-name lookup table */
      40             : {
      41             :     enum pg_enc pg_enc_code;    /* PostgreSQL encoding this name maps to */
      42             :     const char *system_enc_name;    /* system codeset or "CPnnn" spelling; NULL ends the table */
      43             : };
      44             : 
      45             : static const struct encoding_match encoding_match_list[] = {   /* scanned in order; first pg_strcasecmp() match wins */
      46             :     {PG_EUC_JP, "EUC-JP"},
      47             :     {PG_EUC_JP, "eucJP"},
      48             :     {PG_EUC_JP, "IBM-eucJP"},
      49             :     {PG_EUC_JP, "sdeckanji"},
      50             :     {PG_EUC_JP, "CP20932"},
      51             : 
      52             :     {PG_EUC_CN, "EUC-CN"},
      53             :     {PG_EUC_CN, "eucCN"},
      54             :     {PG_EUC_CN, "IBM-eucCN"},
      55             :     {PG_EUC_CN, "GB2312"},
      56             :     {PG_EUC_CN, "dechanzi"},
      57             :     {PG_EUC_CN, "CP20936"},
      58             : 
      59             :     {PG_EUC_KR, "EUC-KR"},
      60             :     {PG_EUC_KR, "eucKR"},
      61             :     {PG_EUC_KR, "IBM-eucKR"},
      62             :     {PG_EUC_KR, "deckorean"},
      63             :     {PG_EUC_KR, "5601"},
      64             :     {PG_EUC_KR, "CP51949"},
      65             : 
      66             :     {PG_EUC_TW, "EUC-TW"},
      67             :     {PG_EUC_TW, "eucTW"},
      68             :     {PG_EUC_TW, "IBM-eucTW"},
      69             :     {PG_EUC_TW, "cns11643"},
      70             :     /* No codepage for EUC-TW ? */
      71             : 
      72             :     {PG_UTF8, "UTF-8"},
      73             :     {PG_UTF8, "utf8"},
      74             :     {PG_UTF8, "CP65001"},
      75             : 
      76             :     {PG_LATIN1, "ISO-8859-1"},
      77             :     {PG_LATIN1, "ISO8859-1"},
      78             :     {PG_LATIN1, "iso88591"},
      79             :     {PG_LATIN1, "CP28591"},
      80             : 
      81             :     {PG_LATIN2, "ISO-8859-2"},
      82             :     {PG_LATIN2, "ISO8859-2"},
      83             :     {PG_LATIN2, "iso88592"},
      84             :     {PG_LATIN2, "CP28592"},
      85             : 
      86             :     {PG_LATIN3, "ISO-8859-3"},
      87             :     {PG_LATIN3, "ISO8859-3"},
      88             :     {PG_LATIN3, "iso88593"},
      89             :     {PG_LATIN3, "CP28593"},
      90             : 
      91             :     {PG_LATIN4, "ISO-8859-4"},
      92             :     {PG_LATIN4, "ISO8859-4"},
      93             :     {PG_LATIN4, "iso88594"},
      94             :     {PG_LATIN4, "CP28594"},
      95             : 
      96             :     {PG_LATIN5, "ISO-8859-9"},
      97             :     {PG_LATIN5, "ISO8859-9"},
      98             :     {PG_LATIN5, "iso88599"},
      99             :     {PG_LATIN5, "CP28599"},
     100             : 
     101             :     {PG_LATIN6, "ISO-8859-10"},
     102             :     {PG_LATIN6, "ISO8859-10"},
     103             :     {PG_LATIN6, "iso885910"},
     104             : 
     105             :     {PG_LATIN7, "ISO-8859-13"},
     106             :     {PG_LATIN7, "ISO8859-13"},
     107             :     {PG_LATIN7, "iso885913"},
     108             : 
     109             :     {PG_LATIN8, "ISO-8859-14"},
     110             :     {PG_LATIN8, "ISO8859-14"},
     111             :     {PG_LATIN8, "iso885914"},
     112             : 
     113             :     {PG_LATIN9, "ISO-8859-15"},
     114             :     {PG_LATIN9, "ISO8859-15"},
     115             :     {PG_LATIN9, "iso885915"},
     116             :     {PG_LATIN9, "CP28605"},
     117             : 
     118             :     {PG_LATIN10, "ISO-8859-16"},
     119             :     {PG_LATIN10, "ISO8859-16"},
     120             :     {PG_LATIN10, "iso885916"},
     121             : 
     122             :     {PG_KOI8R, "KOI8-R"},
     123             :     {PG_KOI8R, "CP20866"},
     124             : 
     125             :     {PG_KOI8U, "KOI8-U"},
     126             :     {PG_KOI8U, "CP21866"},
     127             : 
     128             :     {PG_WIN866, "CP866"},
     129             :     {PG_WIN874, "CP874"},
     130             :     {PG_WIN1250, "CP1250"},
     131             :     {PG_WIN1251, "CP1251"},
     132             :     {PG_WIN1251, "ansi-1251"},
     133             :     {PG_WIN1252, "CP1252"},
     134             :     {PG_WIN1253, "CP1253"},
     135             :     {PG_WIN1254, "CP1254"},
     136             :     {PG_WIN1255, "CP1255"},
     137             :     {PG_WIN1256, "CP1256"},
     138             :     {PG_WIN1257, "CP1257"},
     139             :     {PG_WIN1258, "CP1258"},
     140             : 
     141             :     {PG_ISO_8859_5, "ISO-8859-5"},
     142             :     {PG_ISO_8859_5, "ISO8859-5"},
     143             :     {PG_ISO_8859_5, "iso88595"},
     144             :     {PG_ISO_8859_5, "CP28595"},
     145             : 
     146             :     {PG_ISO_8859_6, "ISO-8859-6"},
     147             :     {PG_ISO_8859_6, "ISO8859-6"},
     148             :     {PG_ISO_8859_6, "iso88596"},
     149             :     {PG_ISO_8859_6, "CP28596"},
     150             : 
     151             :     {PG_ISO_8859_7, "ISO-8859-7"},
     152             :     {PG_ISO_8859_7, "ISO8859-7"},
     153             :     {PG_ISO_8859_7, "iso88597"},
     154             :     {PG_ISO_8859_7, "CP28597"},
     155             : 
     156             :     {PG_ISO_8859_8, "ISO-8859-8"},
     157             :     {PG_ISO_8859_8, "ISO8859-8"},
     158             :     {PG_ISO_8859_8, "iso88598"},
     159             :     {PG_ISO_8859_8, "CP28598"},
     160             : 
     161             :     {PG_SJIS, "SJIS"},
     162             :     {PG_SJIS, "PCK"},
     163             :     {PG_SJIS, "CP932"},
     164             :     {PG_SJIS, "SHIFT_JIS"},
     165             : 
     166             :     {PG_BIG5, "BIG5"},
     167             :     {PG_BIG5, "BIG5HKSCS"},
     168             :     {PG_BIG5, "Big5-HKSCS"},
     169             :     {PG_BIG5, "CP950"},
     170             : 
     171             :     {PG_GBK, "GBK"},
     172             :     {PG_GBK, "CP936"},
     173             : 
     174             :     {PG_UHC, "UHC"},
     175             :     {PG_UHC, "CP949"},
     176             : 
     177             :     {PG_JOHAB, "JOHAB"},
     178             :     {PG_JOHAB, "CP1361"},
     179             : 
     180             :     {PG_GB18030, "GB18030"},
     181             :     {PG_GB18030, "CP54936"},
     182             : 
     183             :     {PG_SHIFT_JIS_2004, "SJIS_2004"},
     184             : 
     185             :     {PG_SQL_ASCII, "US-ASCII"},
     186             : 
     187             :     {PG_SQL_ASCII, NULL}        /* end marker: a NULL name stops the scan */
     188             : };
     189             : 
     190             : #ifdef WIN32
     191             : /*
     192             :  * On Windows, use CP<code page number> instead of the nl_langinfo() result
     193             :  *
     194             :  * This routine uses GetLocaleInfoEx() to parse short locale names like
     195             :  * "de-DE", "fr-FR", etc.  If those cannot be parsed correctly, processing
     196             :  * falls back to the pre-VS-2010 manual parsing, which uses
     197             :  * <Language>_<Country>.<CodePage> as a base.
     198             :  *
     199             :  * Returns a malloc()'d string for the caller to free, or NULL on failure.
     200             :  */
     201             : static char *
     202             : win32_langinfo(const char *ctype)
     203             : {
     204             :     char       *r = NULL;   /* result; remains NULL on any failure */
     205             :     char       *codepage;
     206             : 
     207             : #if defined(_MSC_VER)
     208             :     uint32      cp;
     209             :     WCHAR       wctype[LOCALE_NAME_MAX_LENGTH];
     210             : 
     211             :     memset(wctype, 0, sizeof(wctype));
     212             :     MultiByteToWideChar(CP_ACP, 0, ctype, -1, wctype, LOCALE_NAME_MAX_LENGTH);
     213             :     /* NOTE(review): the MultiByteToWideChar() result is not checked */
     214             :     if (GetLocaleInfoEx(wctype,
     215             :                         LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
     216             :                         (LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
     217             :     {
     218             :         r = malloc(16);         /* excess for "CP" + digits + NUL */
     219             :         if (r != NULL)
     220             :         {
     221             :             /*
     222             :              * If the return value is CP_ACP that means no ANSI code page is
     223             :              * available, so only Unicode can be used for the locale.
     224             :              */
     225             :             if (cp == CP_ACP)
     226             :                 strcpy(r, "utf8");
     227             :             else
     228             :                 sprintf(r, "CP%u", cp);
     229             :         }
     230             :     }
     231             :     else
     232             : #endif                          /* _MSC_VER; without it the block below always runs */
     233             :     {
     234             :         /*
     235             :          * Locale format on Win32 is <Language>_<Country>.<CodePage>.  For
     236             :          * example, English_United States.1252.  If we see digits after the
     237             :          * last dot, assume it's a codepage number.  Otherwise, we might be
     238             :          * dealing with a Unix-style locale string; Windows' setlocale() will
     239             :          * take those even though GetLocaleInfoEx() won't, so we end up here.
     240             :          * In that case, just return what's after the last dot and hope we can
     241             :          * find it in our table.
     242             :          */
     243             :         codepage = strrchr(ctype, '.');
     244             :         if (codepage != NULL)
     245             :         {
     246             :             size_t      ln;
     247             : 
     248             :             codepage++;         /* step past the dot */
     249             :             ln = strlen(codepage);
     250             :             r = malloc(ln + 3); /* "CP" prefix + ln chars + NUL */
     251             :             if (r != NULL)
     252             :             {
     253             :                 if (strspn(codepage, "0123456789") == ln)
     254             :                     sprintf(r, "CP%s", codepage);
     255             :                 else
     256             :                     strcpy(r, codepage);
     257             :             }
     258             :         }
     259             :     }
     260             : 
     261             :     return r;
     262             : }
     263             : 
     264             : #ifndef FRONTEND
     265             : /*
     266             :  * Given a Windows code page identifier, find the corresponding PostgreSQL
     267             :  * encoding.  Issue a warning and return -1 if none found.
     268             :  */
     269             : int
     270             : pg_codepage_to_encoding(UINT cp)
     271             : {
     272             :     char        sys[16];    /* holds "CP" + decimal code page number */
     273             :     int         i;
     274             : 
     275             :     sprintf(sys, "CP%u", cp);
     276             : 
     277             :     /* Check the table; names are compared with pg_strcasecmp() */
     278             :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     279             :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     280             :             return encoding_match_list[i].pg_enc_code;
     281             :     /* No entry matched: warn, quoting the "CPnnn" string we searched for */
     282             :     ereport(WARNING,
     283             :             (errmsg("could not determine encoding for codeset \"%s\"", sys)));
     284             : 
     285             :     return -1;
     286             : }
     287             : #endif
     288             : #endif                          /* WIN32 */
     289             : 
     290             : #if (defined(HAVE_LANGINFO_H) && defined(CODESET)) || defined(WIN32)
     291             : 
     292             : /*
     293             :  * Given a setting for LC_CTYPE, return the Postgres ID of the associated
     294             :  * encoding, if we can determine it.  Return -1 if we can't determine it.
     295             :  *
     296             :  * Pass in NULL to get the encoding for the current locale setting.
     297             :  * Pass "" to get the encoding selected by the server's environment.
     298             :  *
     299             :  * If the result is PG_SQL_ASCII, callers should treat it as being compatible
     300             :  * with any desired encoding.
     301             :  *
     302             :  * If running in the backend and write_message is false, this function must
     303             :  * cope with the possibility that elog() and palloc() are not yet usable.
     304             :  */
     305             : int
     306       29074 : pg_get_encoding_from_locale(const char *ctype, bool write_message)
     307             : {
     308             :     char       *sys;        /* malloc'd codeset name; freed before returning */
     309             :     int         i;
     310             : 
     311             :     /* Get the CODESET property, and also LC_CTYPE if not passed in */
     312       29074 :     if (ctype)
     313             :     {
     314             :         char       *save;   /* copy of the LC_CTYPE setting in force on entry */
     315             :         char       *name;   /* locale name as reported back by setlocale() */
     316             : 
     317             :         /* If locale is C or POSIX, we can allow all encodings */
     318        3330 :         if (pg_strcasecmp(ctype, "C") == 0 ||
     319        1312 :             pg_strcasecmp(ctype, "POSIX") == 0)
     320         830 :             return PG_SQL_ASCII;
     321             : 
     322        1188 :         save = setlocale(LC_CTYPE, NULL);
     323        1188 :         if (!save)
     324           0 :             return -1;          /* setlocale() broken? */
     325             :         /* must copy result, or it might change after setlocale */
     326        1188 :         save = strdup(save);
     327        1188 :         if (!save)
     328           0 :             return -1;          /* out of memory; unlikely */
     329             :         /* temporarily install the requested LC_CTYPE; it is restored below */
     330        1188 :         name = setlocale(LC_CTYPE, ctype);
     331        1188 :         if (!name)
     332             :         {
     333           0 :             free(save);
     334           0 :             return -1;          /* bogus ctype passed in? */
     335             :         }
     336             : 
     337             : #ifndef WIN32
     338        1188 :         sys = nl_langinfo(CODESET);
     339        1188 :         if (sys)
     340        1188 :             sys = strdup(sys);
     341             : #else
     342             :         sys = win32_langinfo(name);
     343             : #endif
     344             :         /* put back the LC_CTYPE setting that was in force on entry */
     345        1188 :         setlocale(LC_CTYPE, save);
     346        1188 :         free(save);
     347             :     }
     348             :     else
     349             :     {
     350             :         /* much easier... */
     351       27056 :         ctype = setlocale(LC_CTYPE, NULL);
     352       27056 :         if (!ctype)
     353           0 :             return -1;          /* setlocale() broken? */
     354             : 
     355             :         /* If locale is C or POSIX, we can allow all encodings */
     356       53280 :         if (pg_strcasecmp(ctype, "C") == 0 ||
     357       26224 :             pg_strcasecmp(ctype, "POSIX") == 0)
     358         832 :             return PG_SQL_ASCII;
     359             : 
     360             : #ifndef WIN32
     361       26224 :         sys = nl_langinfo(CODESET);
     362       26224 :         if (sys)
     363       26224 :             sys = strdup(sys);
     364             : #else
     365             :         sys = win32_langinfo(ctype);
     366             : #endif
     367             :     }
     368             : 
     369       27412 :     if (!sys)
     370           0 :         return -1;              /* no codeset name obtained, or out of memory */
     371             : 
     372             :     /* Check the table */
     373      603064 :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     374             :     {
     375      603064 :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     376             :         {
     377       27412 :             free(sys);
     378       27412 :             return encoding_match_list[i].pg_enc_code;
     379             :         }
     380             :     }
     381             : 
     382             :     /* Special-case kluges for particular platforms go here */
     383             : 
     384             : #ifdef __darwin__
     385             : 
     386             :     /*
     387             :      * Current macOS has many locales that report an empty string for CODESET,
     388             :      * but they all seem to actually use UTF-8.
     389             :      */
     390             :     if (strlen(sys) == 0)
     391             :     {
     392             :         free(sys);
     393             :         return PG_UTF8;
     394             :     }
     395             : #endif
     396             : 
     397             :     /*
     398             :      * We print a warning if we got a CODESET string but couldn't recognize
     399             :      * it.  This means we need another entry in the table.
     400             :      */
     401           0 :     if (write_message)
     402             :     {
     403             : #ifdef FRONTEND
     404           0 :         fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
     405             :                 ctype, sys);
     406             :         /* keep newline separate so there's only one translatable string */
     407           0 :         fputc('\n', stderr);
     408             : #else
     409           0 :         ereport(WARNING,
     410             :                 (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
     411             :                         ctype, sys)));
     412             : #endif
     413             :     }
     414             : 
     415           0 :     free(sys);
     416           0 :     return -1;
     417             : }
     418             : #else                           /* (HAVE_LANGINFO_H && CODESET) || WIN32 */
     419             : 
     420             : /*
     421             :  * stub if no multi-language platform support
     422             :  *
     423             :  * Note: we could return -1 here, but that would have the effect of
     424             :  * forcing users to specify an encoding to initdb on such platforms.
     425             :  * It seems better to silently default to SQL_ASCII.
     426             :  */
     427             : int
     428             : pg_get_encoding_from_locale(const char *ctype, bool write_message)
     429             : {
     430             :     return PG_SQL_ASCII;        /* ctype and write_message are ignored */
     431             : }
     432             : 
     433             : #endif                          /* (HAVE_LANGINFO_H && CODESET) || WIN32 */

Generated by: LCOV version 1.14