LCOV - code coverage report
Current view: top level - src/port - chklocale.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13beta1 Lines: 29 41 70.7 %
Date: 2020-05-25 05:06:35 Functions: 1 1 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * chklocale.c
       4             :  *      Functions for handling locale-related info
       5             :  *
       6             :  *
       7             :  * Copyright (c) 1996-2020, PostgreSQL Global Development Group
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/port/chklocale.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #ifndef FRONTEND
      17             : #include "postgres.h"
      18             : #else
      19             : #include "postgres_fe.h"
      20             : #endif
      21             : 
      22             : #ifdef HAVE_LANGINFO_H
      23             : #include <langinfo.h>
      24             : #endif
      25             : 
      26             : #include "mb/pg_wchar.h"
      27             : 
      28             : 
      29             : /*
      30             :  * This table needs to recognize all the CODESET spellings for supported
      31             :  * backend encodings, as well as frontend-only encodings where possible
      32             :  * (the latter case is currently only needed for initdb to recognize
      33             :  * error situations).  On Windows, we rely on entries for codepage
      34             :  * numbers (CPnnn).
      35             :  *
      36             :  * Note that we search the table with pg_strcasecmp(), so variant
      37             :  * capitalizations don't need their own entries.
      38             :  */
      39             : struct encoding_match           /* one system codeset name -> encoding */
      40             : {
      41             :     enum pg_enc pg_enc_code;    /* PostgreSQL encoding the name maps to */
      42             :     const char *system_enc_name;    /* system spelling; NULL in end marker */
      43             : };
      44             : 
      45             : static const struct encoding_match encoding_match_list[] = {   /* searched front to back */
      46             :     {PG_EUC_JP, "EUC-JP"},
      47             :     {PG_EUC_JP, "eucJP"},
      48             :     {PG_EUC_JP, "IBM-eucJP"},
      49             :     {PG_EUC_JP, "sdeckanji"},
      50             :     {PG_EUC_JP, "CP20932"},
      51             : 
      52             :     {PG_EUC_CN, "EUC-CN"},
      53             :     {PG_EUC_CN, "eucCN"},
      54             :     {PG_EUC_CN, "IBM-eucCN"},
      55             :     {PG_EUC_CN, "GB2312"},
      56             :     {PG_EUC_CN, "dechanzi"},
      57             :     {PG_EUC_CN, "CP20936"},
      58             : 
      59             :     {PG_EUC_KR, "EUC-KR"},
      60             :     {PG_EUC_KR, "eucKR"},
      61             :     {PG_EUC_KR, "IBM-eucKR"},
      62             :     {PG_EUC_KR, "deckorean"},
      63             :     {PG_EUC_KR, "5601"},
      64             :     {PG_EUC_KR, "CP51949"},
      65             : 
      66             :     {PG_EUC_TW, "EUC-TW"},
      67             :     {PG_EUC_TW, "eucTW"},
      68             :     {PG_EUC_TW, "IBM-eucTW"},
      69             :     {PG_EUC_TW, "cns11643"},
      70             :     /* No CPnnn (Windows codepage) entry known for EUC-TW? */
      71             : 
      72             :     {PG_UTF8, "UTF-8"},
      73             :     {PG_UTF8, "utf8"},
      74             :     {PG_UTF8, "CP65001"},
      75             : 
      76             :     {PG_LATIN1, "ISO-8859-1"},
      77             :     {PG_LATIN1, "ISO8859-1"},
      78             :     {PG_LATIN1, "iso88591"},
      79             :     {PG_LATIN1, "CP28591"},
      80             : 
      81             :     {PG_LATIN2, "ISO-8859-2"},
      82             :     {PG_LATIN2, "ISO8859-2"},
      83             :     {PG_LATIN2, "iso88592"},
      84             :     {PG_LATIN2, "CP28592"},
      85             : 
      86             :     {PG_LATIN3, "ISO-8859-3"},
      87             :     {PG_LATIN3, "ISO8859-3"},
      88             :     {PG_LATIN3, "iso88593"},
      89             :     {PG_LATIN3, "CP28593"},
      90             : 
      91             :     {PG_LATIN4, "ISO-8859-4"},
      92             :     {PG_LATIN4, "ISO8859-4"},
      93             :     {PG_LATIN4, "iso88594"},
      94             :     {PG_LATIN4, "CP28594"},
      95             : 
      96             :     {PG_LATIN5, "ISO-8859-9"},
      97             :     {PG_LATIN5, "ISO8859-9"},
      98             :     {PG_LATIN5, "iso88599"},
      99             :     {PG_LATIN5, "CP28599"},
     100             : 
     101             :     {PG_LATIN6, "ISO-8859-10"},
     102             :     {PG_LATIN6, "ISO8859-10"},
     103             :     {PG_LATIN6, "iso885910"},
     104             : 
     105             :     {PG_LATIN7, "ISO-8859-13"},
     106             :     {PG_LATIN7, "ISO8859-13"},
     107             :     {PG_LATIN7, "iso885913"},
     108             : 
     109             :     {PG_LATIN8, "ISO-8859-14"},
     110             :     {PG_LATIN8, "ISO8859-14"},
     111             :     {PG_LATIN8, "iso885914"},
     112             : 
     113             :     {PG_LATIN9, "ISO-8859-15"},
     114             :     {PG_LATIN9, "ISO8859-15"},
     115             :     {PG_LATIN9, "iso885915"},
     116             :     {PG_LATIN9, "CP28605"},
     117             : 
     118             :     {PG_LATIN10, "ISO-8859-16"},
     119             :     {PG_LATIN10, "ISO8859-16"},
     120             :     {PG_LATIN10, "iso885916"},
     121             : 
     122             :     {PG_KOI8R, "KOI8-R"},
     123             :     {PG_KOI8R, "CP20866"},
     124             : 
     125             :     {PG_KOI8U, "KOI8-U"},
     126             :     {PG_KOI8U, "CP21866"},
     127             : 
     128             :     {PG_WIN866, "CP866"},
     129             :     {PG_WIN874, "CP874"},
     130             :     {PG_WIN1250, "CP1250"},
     131             :     {PG_WIN1251, "CP1251"},
     132             :     {PG_WIN1251, "ansi-1251"},
     133             :     {PG_WIN1252, "CP1252"},
     134             :     {PG_WIN1253, "CP1253"},
     135             :     {PG_WIN1254, "CP1254"},
     136             :     {PG_WIN1255, "CP1255"},
     137             :     {PG_WIN1256, "CP1256"},
     138             :     {PG_WIN1257, "CP1257"},
     139             :     {PG_WIN1258, "CP1258"},
     140             : 
     141             :     {PG_ISO_8859_5, "ISO-8859-5"},
     142             :     {PG_ISO_8859_5, "ISO8859-5"},
     143             :     {PG_ISO_8859_5, "iso88595"},
     144             :     {PG_ISO_8859_5, "CP28595"},
     145             : 
     146             :     {PG_ISO_8859_6, "ISO-8859-6"},
     147             :     {PG_ISO_8859_6, "ISO8859-6"},
     148             :     {PG_ISO_8859_6, "iso88596"},
     149             :     {PG_ISO_8859_6, "CP28596"},
     150             : 
     151             :     {PG_ISO_8859_7, "ISO-8859-7"},
     152             :     {PG_ISO_8859_7, "ISO8859-7"},
     153             :     {PG_ISO_8859_7, "iso88597"},
     154             :     {PG_ISO_8859_7, "CP28597"},
     155             : 
     156             :     {PG_ISO_8859_8, "ISO-8859-8"},
     157             :     {PG_ISO_8859_8, "ISO8859-8"},
     158             :     {PG_ISO_8859_8, "iso88598"},
     159             :     {PG_ISO_8859_8, "CP28598"},
     160             : 
     161             :     {PG_SJIS, "SJIS"},
     162             :     {PG_SJIS, "PCK"},
     163             :     {PG_SJIS, "CP932"},
     164             :     {PG_SJIS, "SHIFT_JIS"},
     165             : 
     166             :     {PG_BIG5, "BIG5"},
     167             :     {PG_BIG5, "BIG5HKSCS"},
     168             :     {PG_BIG5, "Big5-HKSCS"},
     169             :     {PG_BIG5, "CP950"},
     170             : 
     171             :     {PG_GBK, "GBK"},
     172             :     {PG_GBK, "CP936"},
     173             : 
     174             :     {PG_UHC, "UHC"},
     175             :     {PG_UHC, "CP949"},
     176             : 
     177             :     {PG_JOHAB, "JOHAB"},
     178             :     {PG_JOHAB, "CP1361"},
     179             : 
     180             :     {PG_GB18030, "GB18030"},
     181             :     {PG_GB18030, "CP54936"},
     182             : 
     183             :     {PG_SHIFT_JIS_2004, "SJIS_2004"},
     184             : 
     185             :     {PG_SQL_ASCII, "US-ASCII"},
     186             : 
     187             :     {PG_SQL_ASCII, NULL}        /* end marker: NULL name stops the search loops */
     188             : };
     189             : 
     190             : #ifdef WIN32
     191             : /*
     192             :  * On Windows, use CP<code page number> instead of the nl_langinfo() result
     193             :  *
     194             :  * Visual Studio 2012 expanded the set of valid LC_CTYPE values, so have its
     195             :  * locale machinery determine the code page.  See comments at IsoLocaleName().
     196             :  * For other compilers, follow the locale's predictable format.
     197             :  *
     198             :  * Visual Studio 2015 should still be able to do the same, but the declaration
     199             :  * of lc_codepage is missing in _locale_t, causing this code compilation to
     200             :  * fail, hence this falls back instead on GetLocaleInfoEx. VS 2015 may be an
     201             :  * exception and post-VS2015 versions should be able to handle properly the
     202             :  * codepage number using _create_locale(). So, instead of the same logic as
     203             :  * VS 2012 and VS 2013, this routine uses GetLocaleInfoEx to parse short
     204             :  * locale names like "de-DE", "fr-FR", etc.  If those cannot be parsed, the
     205             :  * routine falls back to the pre-VS-2010 manual parsing, which assumes the
     206             :  * <Language>_<Country>.<CodePage> format.
     207             :  *
     208             :  * Returns a malloc()'d string for the caller to free, or NULL on failure.
     209             :  */
     210             : static char *
     211             : win32_langinfo(const char *ctype)
     212             : {
     213             :     char       *r = NULL;   /* stays NULL unless some branch below succeeds */
     214             : 
     215             : #if defined(_MSC_VER) && (_MSC_VER < 1900)
     216             :     _locale_t   loct = NULL;
     217             : 
     218             :     loct = _create_locale(LC_CTYPE, ctype);
     219             :     if (loct != NULL)
     220             :     {
     221             :         r = malloc(16);         /* excess */
     222             :         if (r != NULL)
     223             :             sprintf(r, "CP%u", loct->locinfo->lc_codepage);
     224             :         _free_locale(loct);
     225             :     }
     226             : #else
     227             :     char       *codepage;
     228             : 
     229             : #if defined(_MSC_VER) && (_MSC_VER >= 1900)
     230             :     uint32      cp;
     231             :     WCHAR       wctype[LOCALE_NAME_MAX_LENGTH];
     232             : 
     233             :     memset(wctype, 0, sizeof(wctype));
     234             :     MultiByteToWideChar(CP_ACP, 0, ctype, -1, wctype, LOCALE_NAME_MAX_LENGTH);
     235             : 
     236             :     if (GetLocaleInfoEx(wctype,
     237             :                         LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
     238             :                         (LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
     239             :     {
     240             :         r = malloc(16);         /* excess: "CP" + 10 digits + NUL needs 13 */
     241             :         if (r != NULL)
     242             :         {
     243             :             /*
     244             :              * If the return value is CP_ACP that means no ANSI code page is
     245             :              * available, so only Unicode can be used for the locale.
     246             :              */
     247             :             if (cp == CP_ACP)
     248             :                 strcpy(r, "utf8");
     249             :             else
     250             :                 sprintf(r, "CP%u", cp);
     251             :         }
     252             :     }
     253             :     else    /* pairs with the block after #endif when this #if is compiled in */
     254             : #endif
     255             :     {
     256             :         /*
     257             :          * Locale format on Win32 is <Language>_<Country>.<CodePage>.  For
     258             :          * example, English_United States.1252.  If we see digits after the
     259             :          * last dot, assume it's a codepage number.  Otherwise, we might be
     260             :          * dealing with a Unix-style locale string; Windows' setlocale() will
     261             :          * take those even though GetLocaleInfoEx() won't, so we end up here.
     262             :          * In that case, just return what's after the last dot and hope we can
     263             :          * find it in our table.
     264             :          */
     265             :         codepage = strrchr(ctype, '.');
     266             :         if (codepage != NULL)
     267             :         {
     268             :             size_t      ln;
     269             : 
     270             :             codepage++;     /* skip the dot itself */
     271             :             ln = strlen(codepage);
     272             :             r = malloc(ln + 3);     /* "CP" prefix + suffix + NUL */
     273             :             if (r != NULL)
     274             :             {
     275             :                 if (strspn(codepage, "0123456789") == ln)
     276             :                     sprintf(r, "CP%s", codepage);
     277             :                 else
     278             :                     strcpy(r, codepage);
     279             :             }
     280             :         }
     281             : 
     282             :     }
     283             : #endif
     284             : 
     285             :     return r;
     286             : }
     287             : 
     288             : #ifndef FRONTEND
     289             : /*
     290             :  * Given a Windows code page identifier, find the corresponding PostgreSQL
     291             :  * encoding.  Issue a warning and return -1 if none found.
     292             :  */
     293             : int
     294             : pg_codepage_to_encoding(UINT cp)
     295             : {
     296             :     char        sys[16];    /* holds the "CP<number>" spelling of cp */
     297             :     int         i;
     298             : 
     299             :     sprintf(sys, "CP%u", cp);
     300             : 
     301             :     /* Check the table; first case-insensitive name match wins */
     302             :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     303             :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     304             :             return encoding_match_list[i].pg_enc_code;
     305             : 
     306             :     ereport(WARNING,
     307             :             (errmsg("could not determine encoding for codeset \"%s\"", sys)));
     308             : 
     309             :     return -1;
     310             : }
     311             : #endif
     312             : #endif                          /* WIN32 */
     313             : 
     314             : #if (defined(HAVE_LANGINFO_H) && defined(CODESET)) || defined(WIN32)
     315             : 
     316             : /*
     317             :  * Given a setting for LC_CTYPE, return the Postgres ID of the associated
     318             :  * encoding, if we can determine it.  Return -1 if we can't determine it.
     319             :  *
     320             :  * Pass in NULL to get the encoding for the current locale setting.
     321             :  * Pass "" to get the encoding selected by the server's environment.
     322             :  *
     323             :  * If the result is PG_SQL_ASCII, callers should treat it as being compatible
     324             :  * with any desired encoding.
     325             :  *
     326             :  * If running in the backend and write_message is false, this function must
     327             :  * cope with the possibility that elog() and palloc() are not yet usable.
     328             :  */
     329             : int
     330       19284 : pg_get_encoding_from_locale(const char *ctype, bool write_message)
     331             : {
     332             :     char       *sys;
     333             :     int         i;
     334             : 
     335             :     /* Get the CODESET property, and also LC_CTYPE if not passed in */
     336       19284 :     if (ctype)
     337             :     {
     338             :         char       *save;
     339             :         char       *name;
     340             : 
     341             :         /* If locale is C or POSIX, we can allow all encodings */
     342        8458 :         if (pg_strcasecmp(ctype, "C") == 0 ||
     343        3890 :             pg_strcasecmp(ctype, "POSIX") == 0)
     344        1044 :             return PG_SQL_ASCII;
     345             : 
     346        3524 :         save = setlocale(LC_CTYPE, NULL);
     347        3524 :         if (!save)
     348           0 :             return -1;          /* setlocale() broken? */
     349             :         /* must copy result, or it might change after setlocale */
     350        3524 :         save = strdup(save);
     351        3524 :         if (!save)
     352           0 :             return -1;          /* out of memory; unlikely */
     353             : 
     354        3524 :         name = setlocale(LC_CTYPE, ctype);     /* temporarily switch LC_CTYPE */
     355        3524 :         if (!name)
     356             :         {
     357           0 :             free(save);
     358           0 :             return -1;          /* bogus ctype passed in? */
     359             :         }
     360             : 
     361             : #ifndef WIN32
     362        3524 :         sys = nl_langinfo(CODESET);
     363        3524 :         if (sys)
     364        3524 :             sys = strdup(sys);  /* own copy, free()'d before returning */
     365             : #else
     366             :         sys = win32_langinfo(name);
     367             : #endif
     368             : 
     369        3524 :         setlocale(LC_CTYPE, save);     /* restore the saved LC_CTYPE */
     370        3524 :         free(save);
     371             :     }
     372             :     else
     373             :     {
     374             :         /* No locale switch needed: query the current LC_CTYPE setting */
     375       14716 :         ctype = setlocale(LC_CTYPE, NULL);
     376       14716 :         if (!ctype)
     377           0 :             return -1;          /* setlocale() broken? */
     378             : 
     379             :         /* If locale is C or POSIX, we can allow all encodings */
     380       29356 :         if (pg_strcasecmp(ctype, "C") == 0 ||
     381       14640 :             pg_strcasecmp(ctype, "POSIX") == 0)
     382          76 :             return PG_SQL_ASCII;
     383             : 
     384             : #ifndef WIN32
     385       14640 :         sys = nl_langinfo(CODESET);
     386       14640 :         if (sys)
     387       14640 :             sys = strdup(sys);  /* own copy, free()'d before returning */
     388             : #else
     389             :         sys = win32_langinfo(ctype);
     390             : #endif
     391             :     }
     392             : 
     393       18164 :     if (!sys)
     394           0 :         return -1;              /* no codeset obtained, or out of memory */
     395             : 
     396             :     /* Check the table; first case-insensitive name match wins */
     397      399608 :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     398             :     {
     399      399608 :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     400             :         {
     401       18164 :             free(sys);
     402       18164 :             return encoding_match_list[i].pg_enc_code;
     403             :         }
     404             :     }
     405             : 
     406             :     /* Special-case kluges for particular platforms go here */
     407             : 
     408             : #ifdef __darwin__
     409             : 
     410             :     /*
     411             :      * Current macOS has many locales that report an empty string for CODESET,
     412             :      * but they all seem to actually use UTF-8.
     413             :      */
     414             :     if (strlen(sys) == 0)
     415             :     {
     416             :         free(sys);
     417             :         return PG_UTF8;
     418             :     }
     419             : #endif
     420             : 
     421             :     /*
     422             :      * We print a warning if we got a CODESET string but couldn't recognize
     423             :      * it.  This means we need another entry in the table.
     424             :      */
     425           0 :     if (write_message)
     426             :     {
     427             : #ifdef FRONTEND
     428           0 :         fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
     429             :                 ctype, sys);
     430             :         /* keep newline separate so there's only one translatable string */
     431           0 :         fputc('\n', stderr);
     432             : #else
     433           0 :         ereport(WARNING,
     434             :                 (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
     435             :                         ctype, sys)));
     436             : #endif
     437             :     }
     438             : 
     439           0 :     free(sys);
     440           0 :     return -1;
     441             : }
     442             : #else                           /* (HAVE_LANGINFO_H && CODESET) || WIN32 */
     443             : 
     444             : /*
     445             :  * stub if no multi-language platform support
     446             :  *
     447             :  * Note: we could return -1 here, but that would have the effect of
     448             :  * forcing users to specify an encoding to initdb on such platforms.
     449             :  * It seems better to silently default to SQL_ASCII.
     450             :  */
     451             : int
     452             : pg_get_encoding_from_locale(const char *ctype, bool write_message)
     453             : {
     454             :     return PG_SQL_ASCII;    /* both arguments are ignored in this stub */
     455             : }
     456             : 
     457             : #endif                          /* (HAVE_LANGINFO_H && CODESET) || WIN32 */

Generated by: LCOV version 1.13