LCOV - code coverage report
Current view: top level - src/backend/utils/mb - mbutils.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 228 299 76.3 %
Date: 2019-09-22 07:07:17 Functions: 30 39 76.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * mbutils.c
       4             :  *    This file contains functions for encoding conversion.
       5             :  *
       6             :  * The string-conversion functions in this file share some API quirks.
       7             :  * Note the following:
       8             :  *
       9             :  * The functions return a palloc'd, null-terminated string if conversion
      10             :  * is required.  However, if no conversion is performed, the given source
      11             :  * string pointer is returned as-is.
      12             :  *
      13             :  * Although the presence of a length argument means that callers can pass
      14             :  * non-null-terminated strings, care is required because the same string
      15             :  * will be passed back if no conversion occurs.  Such callers *must* check
      16             :  * whether result == src and handle that case differently.
      17             :  *
      18             :  * If the source and destination encodings are the same, the source string
      19             :  * is returned without any verification; it's assumed to be valid data.
      20             :  * If that might not be the case, the caller is responsible for validating
      21             :  * the string using a separate call to pg_verify_mbstr().  Whenever the
      22             :  * source and destination encodings are different, the functions ensure that
      23             :  * the result is validly encoded according to the destination encoding.
      24             :  *
      25             :  *
      26             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
      27             :  * Portions Copyright (c) 1994, Regents of the University of California
      28             :  *
      29             :  *
      30             :  * IDENTIFICATION
      31             :  *    src/backend/utils/mb/mbutils.c
      32             :  *
      33             :  *-------------------------------------------------------------------------
      34             :  */
      35             : #include "postgres.h"
      36             : 
      37             : #include "access/xact.h"
      38             : #include "catalog/namespace.h"
      39             : #include "mb/pg_wchar.h"
      40             : #include "utils/builtins.h"
      41             : #include "utils/memutils.h"
      42             : #include "utils/syscache.h"
      43             : 
      44             : /*
      45             :  * We maintain a simple linked list caching the fmgr lookup info for the
      46             :  * currently selected conversion functions, as well as any that have been
      47             :  * selected previously in the current session.  (We remember previous
      48             :  * settings because we must be able to restore a previous setting during
      49             :  * transaction rollback, without doing any fresh catalog accesses.)
      50             :  *
      51             :  * Since we'll never release this data, we just keep it in TopMemoryContext.
      52             :  */
      53             : typedef struct ConvProcInfo
      54             : {
      55             :     int         s_encoding;     /* server and client encoding IDs */
      56             :     int         c_encoding;
      57             :     FmgrInfo    to_server_info; /* lookup info for conversion procs */
      58             :     FmgrInfo    to_client_info;
      59             : } ConvProcInfo;
      60             : 
      61             : static List *ConvProcList = NIL;    /* List of ConvProcInfo */
      62             : 
      63             : /*
      64             :  * These variables point to the currently active conversion functions,
      65             :  * or are NULL when no conversion is needed.
      66             :  */
      67             : static FmgrInfo *ToServerConvProc = NULL;
      68             : static FmgrInfo *ToClientConvProc = NULL;
      69             : 
      70             : /*
      71             :  * These variables track the currently-selected encodings.
      72             :  */
      73             : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      74             : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      75             : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      76             : 
      77             : /*
      78             :  * During backend startup we can't set client encoding because we (a)
      79             :  * can't look up the conversion functions, and (b) may not know the database
      80             :  * encoding yet either.  So SetClientEncoding() just accepts anything and
      81             :  * remembers it for InitializeClientEncoding() to apply later.
      82             :  */
      83             : static bool backend_startup_complete = false;
      84             : static int  pending_client_encoding = PG_SQL_ASCII;
      85             : 
      86             : 
      87             : /* Internal functions */
      88             : static char *perform_default_encoding_conversion(const char *src,
      89             :                                                  int len, bool is_client_to_server);
      90             : static int  cliplen(const char *str, int len, int limit);
      91             : 
      92             : 
      93             : /*
      94             :  * Prepare for a future call to SetClientEncoding.  Success should mean
      95             :  * that SetClientEncoding is guaranteed to succeed for this encoding request.
      96             :  *
      97             :  * (But note that success before backend_startup_complete does not guarantee
      98             :  * success after ...)
      99             :  *
     100             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     101             :  */
     102             : int
     103       24036 : PrepareClientEncoding(int encoding)
     104             : {
     105             :     int         current_server_encoding;
     106             :     ListCell   *lc;
     107             : 
     108       24036 :     if (!PG_VALID_FE_ENCODING(encoding))
     109           0 :         return -1;
     110             : 
     111             :     /* Can't do anything during startup, per notes above */
     112       24036 :     if (!backend_startup_complete)
     113       11510 :         return 0;
     114             : 
     115       12526 :     current_server_encoding = GetDatabaseEncoding();
     116             : 
     117             :     /*
     118             :      * Check for cases that require no conversion function.
     119             :      */
     120       12526 :     if (current_server_encoding == encoding ||
     121        3278 :         current_server_encoding == PG_SQL_ASCII ||
     122             :         encoding == PG_SQL_ASCII)
     123       12520 :         return 0;
     124             : 
     125           6 :     if (IsTransactionState())
     126             :     {
     127             :         /*
     128             :          * If we're in a live transaction, it's safe to access the catalogs,
     129             :          * so look up the functions.  We repeat the lookup even if the info is
     130             :          * already cached, so that we can react to changes in the contents of
     131             :          * pg_conversion.
     132             :          */
     133             :         Oid         to_server_proc,
     134             :                     to_client_proc;
     135             :         ConvProcInfo *convinfo;
     136             :         MemoryContext oldcontext;
     137             : 
     138           6 :         to_server_proc = FindDefaultConversionProc(encoding,
     139             :                                                    current_server_encoding);
     140           6 :         if (!OidIsValid(to_server_proc))
     141           0 :             return -1;
     142           6 :         to_client_proc = FindDefaultConversionProc(current_server_encoding,
     143             :                                                    encoding);
     144           6 :         if (!OidIsValid(to_client_proc))
     145           0 :             return -1;
     146             : 
     147             :         /*
     148             :          * Load the fmgr info into TopMemoryContext (could still fail here)
     149             :          */
     150           6 :         convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
     151             :                                                        sizeof(ConvProcInfo));
     152           6 :         convinfo->s_encoding = current_server_encoding;
     153           6 :         convinfo->c_encoding = encoding;
     154           6 :         fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
     155             :                       TopMemoryContext);
     156           6 :         fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
     157             :                       TopMemoryContext);
     158             : 
     159             :         /* Attach new info to head of list */
     160           6 :         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
     161           6 :         ConvProcList = lcons(convinfo, ConvProcList);
     162           6 :         MemoryContextSwitchTo(oldcontext);
     163             : 
     164             :         /*
     165             :          * We cannot yet remove any older entry for the same encoding pair,
     166             :          * since it could still be in use.  SetClientEncoding will clean up.
     167             :          */
     168             : 
     169           6 :         return 0;               /* success */
     170             :     }
     171             :     else
     172             :     {
     173             :         /*
     174             :          * If we're not in a live transaction, the only thing we can do is
     175             :          * restore a previous setting using the cache.  This covers all
     176             :          * transaction-rollback cases.  The only case it might not work for is
     177             :          * trying to change client_encoding on the fly by editing
     178             :          * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
     179             :          * thing to do anyway.
     180             :          */
     181           0 :         foreach(lc, ConvProcList)
     182             :         {
     183           0 :             ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
     184             : 
     185           0 :             if (oldinfo->s_encoding == current_server_encoding &&
     186           0 :                 oldinfo->c_encoding == encoding)
     187           0 :                 return 0;
     188             :         }
     189             : 
     190           0 :         return -1;              /* it's not cached, so fail */
     191             :     }
     192             : }
     193             : 
     194             : /*
     195             :  * Set the active client encoding and set up the conversion-function pointers.
     196             :  * PrepareClientEncoding should have been called previously for this encoding.
     197             :  *
     198             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     199             :  */
     200             : int
     201       20074 : SetClientEncoding(int encoding)
     202             : {
     203             :     int         current_server_encoding;
     204             :     bool        found;
     205             :     ListCell   *lc;
     206             : 
     207       20074 :     if (!PG_VALID_FE_ENCODING(encoding))
     208           0 :         return -1;
     209             : 
     210             :     /* Can't do anything during startup, per notes above */
     211       20074 :     if (!backend_startup_complete)
     212             :     {
     213        9164 :         pending_client_encoding = encoding;
     214        9164 :         return 0;
     215             :     }
     216             : 
     217       10910 :     current_server_encoding = GetDatabaseEncoding();
     218             : 
     219             :     /*
     220             :      * Check for cases that require no conversion function.
     221             :      */
     222       10910 :     if (current_server_encoding == encoding ||
     223        1662 :         current_server_encoding == PG_SQL_ASCII ||
     224             :         encoding == PG_SQL_ASCII)
     225             :     {
     226       10904 :         ClientEncoding = &pg_enc2name_tbl[encoding];
     227       10904 :         ToServerConvProc = NULL;
     228       10904 :         ToClientConvProc = NULL;
     229       10904 :         return 0;
     230             :     }
     231             : 
     232             :     /*
     233             :      * Search the cache for the entry previously prepared by
     234             :      * PrepareClientEncoding; if there isn't one, we lose.  While at it,
     235             :      * release any duplicate entries so that repeated Prepare/Set cycles don't
     236             :      * leak memory.
     237             :      */
     238           6 :     found = false;
     239          12 :     foreach(lc, ConvProcList)
     240             :     {
     241           6 :         ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
     242             : 
     243          12 :         if (convinfo->s_encoding == current_server_encoding &&
     244           6 :             convinfo->c_encoding == encoding)
     245             :         {
     246           6 :             if (!found)
     247             :             {
     248             :                 /* Found newest entry, so set up */
     249           6 :                 ClientEncoding = &pg_enc2name_tbl[encoding];
     250           6 :                 ToServerConvProc = &convinfo->to_server_info;
     251           6 :                 ToClientConvProc = &convinfo->to_client_info;
     252           6 :                 found = true;
     253             :             }
     254             :             else
     255             :             {
     256             :                 /* Duplicate entry, release it */
     257           0 :                 ConvProcList = foreach_delete_current(ConvProcList, lc);
     258           0 :                 pfree(convinfo);
     259             :             }
     260             :         }
     261             :     }
     262             : 
     263           6 :     if (found)
     264           6 :         return 0;               /* success */
     265             :     else
     266           0 :         return -1;              /* it's not cached, so fail */
     267             : }
     268             : 
     269             : /*
     270             :  * Initialize client encoding conversions.
     271             :  *      Called from InitPostgres() once during backend startup.
     272             :  */
     273             : void
     274        9136 : InitializeClientEncoding(void)
     275             : {
     276             :     Assert(!backend_startup_complete);
     277        9136 :     backend_startup_complete = true;
     278             : 
     279       18272 :     if (PrepareClientEncoding(pending_client_encoding) < 0 ||
     280        9136 :         SetClientEncoding(pending_client_encoding) < 0)
     281             :     {
     282             :         /*
     283             :          * Oops, the requested conversion is not available. We couldn't fail
     284             :          * before, but we can now.
     285             :          */
     286           0 :         ereport(FATAL,
     287             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     288             :                  errmsg("conversion between %s and %s is not supported",
     289             :                         pg_enc2name_tbl[pending_client_encoding].name,
     290             :                         GetDatabaseEncodingName())));
     291             :     }
     292        9136 : }
     293             : 
     294             : /*
     295             :  * returns the current client encoding
     296             :  */
     297             : int
     298        4516 : pg_get_client_encoding(void)
     299             : {
     300        4516 :     return ClientEncoding->encoding;
     301             : }
     302             : 
     303             : /*
     304             :  * returns the current client encoding name
     305             :  */
     306             : const char *
     307           0 : pg_get_client_encoding_name(void)
     308             : {
     309           0 :     return ClientEncoding->name;
     310             : }
     311             : 
     312             : /*
     313             :  * Convert src string to another encoding (general case).
     314             :  *
     315             :  * See the notes about string conversion functions at the top of this file.
     316             :  */
     317             : unsigned char *
     318        1154 : pg_do_encoding_conversion(unsigned char *src, int len,
     319             :                           int src_encoding, int dest_encoding)
     320             : {
     321             :     unsigned char *result;
     322             :     Oid         proc;
     323             : 
     324        1154 :     if (len <= 0)
     325           4 :         return src;             /* empty string is always valid */
     326             : 
     327        1150 :     if (src_encoding == dest_encoding)
     328         622 :         return src;             /* no conversion required, assume valid */
     329             : 
     330         528 :     if (dest_encoding == PG_SQL_ASCII)
     331           0 :         return src;             /* any string is valid in SQL_ASCII */
     332             : 
     333         528 :     if (src_encoding == PG_SQL_ASCII)
     334             :     {
     335             :         /* No conversion is possible, but we must validate the result */
     336          16 :         (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
     337          16 :         return src;
     338             :     }
     339             : 
     340         512 :     if (!IsTransactionState())  /* shouldn't happen */
     341           0 :         elog(ERROR, "cannot perform encoding conversion outside a transaction");
     342             : 
     343         512 :     proc = FindDefaultConversionProc(src_encoding, dest_encoding);
     344         512 :     if (!OidIsValid(proc))
     345           0 :         ereport(ERROR,
     346             :                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
     347             :                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
     348             :                         pg_encoding_to_char(src_encoding),
     349             :                         pg_encoding_to_char(dest_encoding))));
     350             : 
     351             :     /*
     352             :      * Allocate space for conversion result, being wary of integer overflow
     353             :      */
     354         512 :     if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
     355           0 :         ereport(ERROR,
     356             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     357             :                  errmsg("out of memory"),
     358             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     359             :                            len)));
     360             : 
     361         512 :     result = palloc(len * MAX_CONVERSION_GROWTH + 1);
     362             : 
     363         512 :     OidFunctionCall5(proc,
     364             :                      Int32GetDatum(src_encoding),
     365             :                      Int32GetDatum(dest_encoding),
     366             :                      CStringGetDatum(src),
     367             :                      CStringGetDatum(result),
     368             :                      Int32GetDatum(len));
     369         512 :     return result;
     370             : }
     371             : 
     372             : /*
     373             :  * Convert string to encoding encoding_name. The source
     374             :  * encoding is the DB encoding.
     375             :  *
     376             :  * BYTEA convert_to(TEXT string, NAME encoding_name) */
     377             : Datum
     378           0 : pg_convert_to(PG_FUNCTION_ARGS)
     379             : {
     380           0 :     Datum       string = PG_GETARG_DATUM(0);
     381           0 :     Datum       dest_encoding_name = PG_GETARG_DATUM(1);
     382           0 :     Datum       src_encoding_name = DirectFunctionCall1(namein,
     383             :                                                         CStringGetDatum(DatabaseEncoding->name));
     384             :     Datum       result;
     385             : 
     386             :     /*
     387             :      * pg_convert expects a bytea as its first argument. We're passing it a
     388             :      * text argument here, relying on the fact that they are both in fact
     389             :      * varlena types, and thus structurally identical.
     390             :      */
     391           0 :     result = DirectFunctionCall3(pg_convert, string,
     392             :                                  src_encoding_name, dest_encoding_name);
     393             : 
     394           0 :     PG_RETURN_DATUM(result);
     395             : }
     396             : 
     397             : /*
     398             :  * Convert string from encoding encoding_name. The destination
     399             :  * encoding is the DB encoding.
     400             :  *
     401             :  * TEXT convert_from(BYTEA string, NAME encoding_name) */
     402             : Datum
     403          20 : pg_convert_from(PG_FUNCTION_ARGS)
     404             : {
     405          20 :     Datum       string = PG_GETARG_DATUM(0);
     406          20 :     Datum       src_encoding_name = PG_GETARG_DATUM(1);
     407          20 :     Datum       dest_encoding_name = DirectFunctionCall1(namein,
     408             :                                                          CStringGetDatum(DatabaseEncoding->name));
     409             :     Datum       result;
     410             : 
     411          20 :     result = DirectFunctionCall3(pg_convert, string,
     412             :                                  src_encoding_name, dest_encoding_name);
     413             : 
     414             :     /*
     415             :      * pg_convert returns a bytea, which we in turn return as text, relying on
     416             :      * the fact that they are both in fact varlena types, and thus
     417             :      * structurally identical. Although not all bytea values are valid text,
     418             :      * in this case it will be because we've told pg_convert to return one
     419             :      * that is valid as text in the current database encoding.
     420             :      */
     421          20 :     PG_RETURN_DATUM(result);
     422             : }
     423             : 
     424             : /*
     425             :  * Convert string between two arbitrary encodings.
     426             :  *
     427             :  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
     428             :  */
     429             : Datum
     430         532 : pg_convert(PG_FUNCTION_ARGS)
     431             : {
     432         532 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     433         532 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     434         532 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     435         532 :     char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
     436         532 :     int         dest_encoding = pg_char_to_encoding(dest_encoding_name);
     437             :     const char *src_str;
     438             :     char       *dest_str;
     439             :     bytea      *retval;
     440             :     int         len;
     441             : 
     442         532 :     if (src_encoding < 0)
     443           0 :         ereport(ERROR,
     444             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     445             :                  errmsg("invalid source encoding name \"%s\"",
     446             :                         src_encoding_name)));
     447         532 :     if (dest_encoding < 0)
     448           0 :         ereport(ERROR,
     449             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     450             :                  errmsg("invalid destination encoding name \"%s\"",
     451             :                         dest_encoding_name)));
     452             : 
     453             :     /* make sure that source string is valid */
     454         532 :     len = VARSIZE_ANY_EXHDR(string);
     455         532 :     src_str = VARDATA_ANY(string);
     456         532 :     pg_verify_mbstr_len(src_encoding, src_str, len, false);
     457             : 
     458             :     /* perform conversion */
     459         532 :     dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
     460             :                                                   len,
     461             :                                                   src_encoding,
     462             :                                                   dest_encoding);
     463             : 
     464             :     /* update len if conversion actually happened */
     465         532 :     if (dest_str != src_str)
     466         512 :         len = strlen(dest_str);
     467             : 
     468             :     /*
     469             :      * build bytea data type structure.
     470             :      */
     471         532 :     retval = (bytea *) palloc(len + VARHDRSZ);
     472         532 :     SET_VARSIZE(retval, len + VARHDRSZ);
     473         532 :     memcpy(VARDATA(retval), dest_str, len);
     474             : 
     475         532 :     if (dest_str != src_str)
     476         512 :         pfree(dest_str);
     477             : 
     478             :     /* free memory if allocated by the toaster */
     479         532 :     PG_FREE_IF_COPY(string, 0);
     480             : 
     481         532 :     PG_RETURN_BYTEA_P(retval);
     482             : }
     483             : 
     484             : /*
     485             :  * get the length of the string considered as text in the specified
     486             :  * encoding. Raises an error if the data is not valid in that
     487             :  * encoding.
     488             :  *
     489             :  * INT4 length (BYTEA string, NAME src_encoding_name)
     490             :  */
     491             : Datum
     492           0 : length_in_encoding(PG_FUNCTION_ARGS)
     493             : {
     494           0 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     495           0 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     496           0 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     497             :     const char *src_str;
     498             :     int         len;
     499             :     int         retval;
     500             : 
     501           0 :     if (src_encoding < 0)
     502           0 :         ereport(ERROR,
     503             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     504             :                  errmsg("invalid encoding name \"%s\"",
     505             :                         src_encoding_name)));
     506             : 
     507           0 :     len = VARSIZE_ANY_EXHDR(string);
     508           0 :     src_str = VARDATA_ANY(string);
     509             : 
     510           0 :     retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
     511             : 
     512           0 :     PG_RETURN_INT32(retval);
     513             : }
     514             : 
     515             : /*
     516             :  * Get maximum multibyte character length in the specified encoding.
     517             :  *
     518             :  * Note encoding is specified numerically, not by name as above.
     519             :  */
     520             : Datum
     521           0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
     522             : {
     523           0 :     int         encoding = PG_GETARG_INT32(0);
     524             : 
     525           0 :     if (PG_VALID_ENCODING(encoding))
     526           0 :         PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
     527             :     else
     528           0 :         PG_RETURN_NULL();
     529             : }
     530             : 
     531             : /*
     532             :  * Convert client encoding to server encoding.
     533             :  *
     534             :  * See the notes about string conversion functions at the top of this file.
     535             :  */
     536             : char *
     537      569004 : pg_client_to_server(const char *s, int len)
     538             : {
     539      569004 :     return pg_any_to_server(s, len, ClientEncoding->encoding);
     540             : }
     541             : 
     542             : /*
     543             :  * Convert "len" bytes at "s" from the given encoding to server encoding.
     544             :  *
     545             :  * See the notes about string conversion functions at the top of this file.
     546             :  *
     547             :  * Unlike the other string conversion functions, this will apply validation
     548             :  * even if encoding == DatabaseEncoding->encoding.  This is because this is
     549             :  * used to process data coming in from outside the database, and we never
     550             :  * want to just assume validity.  Only input with len <= 0 goes unchecked.
     551             :  */
     552             : char *
     553     2907212 : pg_any_to_server(const char *s, int len, int encoding)
     554             : {
     555     2907212 :     if (len <= 0)
     556       91992 :         return unconstify(char *, s);   /* empty string is always valid */
     557             : 
     558     2815220 :     if (encoding == DatabaseEncoding->encoding ||
     559             :         encoding == PG_SQL_ASCII)
     560             :     {
     561             :         /*
     562             :          * No conversion is needed, but we must still validate the data
     563             :          * (always against the server encoding); "s" itself is returned.
     564     2815198 :         (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
     565     2815196 :         return unconstify(char *, s);
     566             :     }
     567             : 
     568          22 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     569             :     {
     570             :         /*
     571             :          * No conversion is possible, but we must still validate the data,
     572             :          * because the client-side code might have done string escaping using
     573             :          * the selected client_encoding.  If the client encoding is ASCII-safe
     574             :          * then we just do a straight validation under that encoding.  For an
     575             :          * ASCII-unsafe encoding we have a problem: we dare not pass such data
     576             :          * to the parser but we have no way to convert it.  We compromise by
     577             :          * rejecting the data if it contains any non-ASCII characters.
     578             :          */
     579           2 :         if (PG_VALID_BE_ENCODING(encoding))
     580           2 :             (void) pg_verify_mbstr(encoding, s, len, false);
     581             :         else
     582             :         {
     583             :             int         i;
     584             : 
     585           0 :             for (i = 0; i < len; i++)
     586             :             {
     587           0 :                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))   /* NUL or non-ASCII byte */
     588           0 :                     ereport(ERROR,
     589             :                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     590             :                              errmsg("invalid byte value for encoding \"%s\": 0x%02x",
     591             :                                     pg_enc2name_tbl[PG_SQL_ASCII].name,
     592             :                                     (unsigned char) s[i])));
     593             :             }
     594             :         }
     595           2 :         return unconstify(char *, s);
     596             :     }
     597             : 
     598             :     /* Fast path if we can use cached conversion function */
     599          20 :     if (encoding == ClientEncoding->encoding)
     600          20 :         return perform_default_encoding_conversion(s, len, true);
     601             : 
     602             :     /* General case ... will not work outside transactions */
     603           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     604             :                                               len,
     605             :                                               encoding,
     606           0 :                                               DatabaseEncoding->encoding);
     607             : }
     608             : 
     609             : /*
     610             :  * Convert "len" bytes at "s" from server encoding to the client encoding,
     611             :  * by way of pg_server_to_any() with ClientEncoding->encoding as the target.
     612             :  * See the notes about string conversion functions at the top of this file.
     613             :  */
     614             : char *
     615     9006250 : pg_server_to_client(const char *s, int len)
     616             : {
     617     9006250 :     return pg_server_to_any(s, len, ClientEncoding->encoding);
     618             : }
     619             : 
     620             : /*
     621             :  * Convert "len" bytes at "s" from server encoding to the given encoding.
     622             :  * The input is returned unverified when len <= 0 or when the target is
     623             :  * the server encoding or SQL_ASCII.  See the notes at the top of this file.
     624             :  */
     625             : char *
     626    13587902 : pg_server_to_any(const char *s, int len, int encoding)
     627             : {
     628    13587902 :     if (len <= 0)
     629      109966 :         return unconstify(char *, s);   /* empty string is always valid */
     630             : 
     631    13477936 :     if (encoding == DatabaseEncoding->encoding ||
     632             :         encoding == PG_SQL_ASCII)
     633    13477622 :         return unconstify(char *, s);   /* assume data is valid */
     634             : 
     635         314 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     636             :     {
     637             :         /* No conversion is possible, but we must validate the result under the target encoding */
     638          88 :         (void) pg_verify_mbstr(encoding, s, len, false);
     639          88 :         return unconstify(char *, s);
     640             :     }
     641             : 
     642             :     /* Fast path if we can use cached conversion function */
     643         226 :     if (encoding == ClientEncoding->encoding)
     644         226 :         return perform_default_encoding_conversion(s, len, false);
     645             : 
     646             :     /* General case ... will not work outside transactions */
     647           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     648             :                                               len,
     649           0 :                                               DatabaseEncoding->encoding,
     650             :                                               encoding);
     651             : }
     652             : 
     653             : /*
     654             :  *  Perform default encoding conversion using the cached FmgrInfo. Since
     655             :  *  this function does not access the database at all, it is safe to call
     656             :  *  outside transactions.  If the conversion has not been set up by
     657             :  *  SetClientEncoding(), no conversion is performed and "src" is returned.
     658             :  */
     659             : static char *
     660         246 : perform_default_encoding_conversion(const char *src, int len,
     661             :                                     bool is_client_to_server)
     662             : {
     663             :     char       *result;
     664             :     int         src_encoding,
     665             :                 dest_encoding;
     666             :     FmgrInfo   *flinfo;
     667             : 
     668         246 :     if (is_client_to_server)
     669             :     {
     670          20 :         src_encoding = ClientEncoding->encoding;
     671          20 :         dest_encoding = DatabaseEncoding->encoding;
     672          20 :         flinfo = ToServerConvProc;
     673             :     }
     674             :     else
     675             :     {
     676         226 :         src_encoding = DatabaseEncoding->encoding;
     677         226 :         dest_encoding = ClientEncoding->encoding;
     678         226 :         flinfo = ToClientConvProc;
     679             :     }
     680             : 
     681         246 :     if (flinfo == NULL)
     682           0 :         return unconstify(char *, src);   /* no cached conversion function: hand back the input */
     683             : 
     684             :     /*
     685             :      * Allocate space for conversion result, being wary of integer overflow
     686             :      */
     687         246 :     if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
     688           0 :         ereport(ERROR,
     689             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     690             :                  errmsg("out of memory"),
     691             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     692             :                            len)));
     693             : 
     694         246 :     result = palloc(len * MAX_CONVERSION_GROWTH + 1);
     695             : 
     696         246 :     FunctionCall5(flinfo,   /* args: source enc, dest enc, input, output buffer, input length */
     697             :                   Int32GetDatum(src_encoding),
     698             :                   Int32GetDatum(dest_encoding),
     699             :                   CStringGetDatum(src),
     700             :                   CStringGetDatum(result),
     701             :                   Int32GetDatum(len));
     702         246 :     return result;
     703             : }
     704             : 
     705             : 
     706             : /* convert a null-terminated multibyte string (database encoding) to pg_wchar form */
     707             : int
     708           0 : pg_mb2wchar(const char *from, pg_wchar *to)
     709             : {
     710           0 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
     711             : }
     712             : 
     713             : /* convert "len" bytes of a multibyte string (database encoding) to pg_wchar form */
     714             : int
     715      610024 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
     716             : {
     717      610024 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     718             : }
     719             : 
     720             : /* same as pg_mb2wchar_with_len, but for the specified encoding instead of the database encoding */
     721             : int
     722       12094 : pg_encoding_mb2wchar_with_len(int encoding,
     723             :                               const char *from, pg_wchar *to, int len)
     724             : {
     725       12094 :     return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     726             : }
     727             : 
     728             : /* convert a wchar string, measured with pg_wchar_strlen(), to multibyte in the database encoding */
     729             : int
     730           0 : pg_wchar2mb(const pg_wchar *from, char *to)
     731             : {
     732           0 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
     733             : }
     734             : 
     735             : /* convert "len" wchars of a wchar string to multibyte in the database encoding */
     736             : int
     737     1008242 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
     738             : {
     739     1008242 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
     740             : }
     741             : 
     742             : /* same as pg_wchar2mb_with_len, but for the specified encoding instead of the database encoding */
     743             : int
     744           0 : pg_encoding_wchar2mb_with_len(int encoding,
     745             :                               const pg_wchar *from, char *to, int len)
     746             : {
     747           0 :     return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
     748             : }
     749             : 
     750             : /* returns the byte length of the multibyte character at mbstr, per the database encoding */
     751             : int
     752   122959306 : pg_mblen(const char *mbstr)
     753             : {
     754   122959306 :     return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
     755             : }
     756             : 
     757             : /* returns the display length of the multibyte character at mbstr, per the database encoding */
     758             : int
     759        4356 : pg_dsplen(const char *mbstr)
     760             : {
     761        4356 :     return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
     762             : }
     763             : 
     764             : /* returns the length (counted in wchars) of a null-terminated multibyte string */
     765             : int
     766         468 : pg_mbstrlen(const char *mbstr)
     767             : {
     768         468 :     int         len = 0;
     769             : 
     770             :     /* optimization for single byte encoding: the byte count is the answer */
     771         468 :     if (pg_database_encoding_max_length() == 1)
     772           0 :         return strlen(mbstr);
     773             : 
     774        1552 :     while (*mbstr)
     775             :     {
     776         616 :         mbstr += pg_mblen(mbstr);   /* step over one whole character */
     777         616 :         len++;
     778             :     }
     779         468 :     return len;
     780             : }
     781             : 
     782             : /* returns the length (counted in wchars) of a multibyte string that is
     783             :  * "limit" bytes long (not necessarily null-terminated)
     784             :  */
     785             : int
     786      926558 : pg_mbstrlen_with_len(const char *mbstr, int limit)
     787             : {
     788      926558 :     int         len = 0;
     789             : 
     790             :     /* optimization for single byte encoding (note: does not look for a NUL) */
     791      926558 :     if (pg_database_encoding_max_length() == 1)
     792         192 :         return limit;
     793             : 
     794   105874650 :     while (limit > 0 && *mbstr)   /* stop at the byte limit or at a NUL byte */
     795             :     {
     796   104021918 :         int         l = pg_mblen(mbstr);
     797             : 
     798   104021918 :         limit -= l;
     799   104021918 :         mbstr += l;
     800   104021918 :         len++;
     801             :     }
     802      926366 :     return len;
     803             : }
     804             : 
     805             : /*
     806             :  * Returns the byte length of the leading part of a multibyte string
     807             :  * (not necessarily null-terminated, "len" bytes long) that is no longer
     808             :  * than "limit" bytes, as measured in the database encoding.
     809             :  * This function does not break a multibyte character boundary.
     810             :  */
     811             : int
     812       35072 : pg_mbcliplen(const char *mbstr, int len, int limit)
     813             : {
     814       35072 :     return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
     815             :                                  len, limit);
     816             : }
     817             : 
     818             : /*
     819             :  * pg_mbcliplen with specified encoding: clip to at most "limit" bytes
     820             :  */
     821             : int
     822       35072 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
     823             :                       int len, int limit)
     824             : {
     825             :     mblen_converter mblen_fn;
     826       35072 :     int         clen = 0;
     827             :     int         l;
     828             : 
     829             :     /* optimization for single byte encoding */
     830       35072 :     if (pg_encoding_max_length(encoding) == 1)
     831        4652 :         return cliplen(mbstr, len, limit);
     832             : 
     833       30420 :     mblen_fn = pg_wchar_table[encoding].mblen;
     834             : 
     835      410648 :     while (len > 0 && *mbstr)
     836             :     {
     837      363482 :         l = (*mblen_fn) ((const unsigned char *) mbstr);
     838      363482 :         if ((clen + l) > limit)   /* next character would overshoot the limit */
     839          32 :             break;
     840      363450 :         clen += l;
     841      363450 :         if (clen == limit)   /* limit reached exactly; nothing more can fit */
     842       13642 :             break;
     843      349808 :         len -= l;
     844      349808 :         mbstr += l;
     845             :     }
     846       30420 :     return clen;
     847             : }
     848             : 
     849             : /*
     850             :  * Similar to pg_mbcliplen except the limit parameter specifies the
     851             :  * character length, not the byte length.  The result is still in bytes.
     852             :  */
     853             : int
     854         168 : pg_mbcharcliplen(const char *mbstr, int len, int limit)
     855             : {
     856         168 :     int         clen = 0;
     857         168 :     int         nch = 0;
     858             :     int         l;
     859             : 
     860             :     /* optimization for single byte encoding */
     861         168 :     if (pg_database_encoding_max_length() == 1)
     862           0 :         return cliplen(mbstr, len, limit);
     863             : 
     864         956 :     while (len > 0 && *mbstr)
     865             :     {
     866         776 :         l = pg_mblen(mbstr);
     867         776 :         nch++;
     868         776 :         if (nch > limit)   /* this character would exceed the character limit */
     869         156 :             break;
     870         620 :         clen += l;
     871         620 :         len -= l;
     872         620 :         mbstr += l;
     873             :     }
     874         168 :     return clen;
     875             : }
     876             : 
     877             : /* mbcliplen for any single-byte encoding: counts bytes up to Min(len, limit) or the first NUL */
     878             : static int
     879        4652 : cliplen(const char *str, int len, int limit)
     880             : {
     881        4652 :     int         l = 0;
     882             : 
     883        4652 :     len = Min(len, limit);
     884       44596 :     while (l < len && str[l])
     885       35292 :         l++;
     886        4652 :     return l;
     887             : }
     888             : 
     889             : void
     890        8534 : SetDatabaseEncoding(int encoding)
     891             : {   /* make DatabaseEncoding point at the pg_enc2name_tbl entry for "encoding" */
     892        8534 :     if (!PG_VALID_BE_ENCODING(encoding))
     893           0 :         elog(ERROR, "invalid database encoding: %d", encoding);
     894             : 
     895        8534 :     DatabaseEncoding = &pg_enc2name_tbl[encoding];
     896             :     Assert(DatabaseEncoding->encoding == encoding);   /* table is expected to be indexed by encoding ID */
     897        8534 : }
     898             : 
     899             : void
     900       11092 : SetMessageEncoding(int encoding)
     901             : {   /* make MessageEncoding point at the pg_enc2name_tbl entry for "encoding" */
     902             :     /* Some calls happen before we can elog()!  Hence only an Assert here. */
     903             :     Assert(PG_VALID_ENCODING(encoding));
     904             : 
     905       11092 :     MessageEncoding = &pg_enc2name_tbl[encoding];
     906             :     Assert(MessageEncoding->encoding == encoding);
     907       11092 : }
     908             : 
     909             : #ifdef ENABLE_NLS
     910             : /*
     911             :  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
     912             :  * codeset.  Returns false for MULE_INTERNAL, an encoding unknown to gettext,
     913             :  * and for gettext-internal failures like out-of-memory; true on success.
     914             :  */
     915             : static bool
     916         254 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
     917             : {
     918         254 :     bool        elog_ok = (CurrentMemoryContext != NULL);   /* else report via write_stderr() */
     919             :     int         i;
     920             : 
     921         712 :     for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)   /* table ends with a NULL name */
     922             :     {
     923         712 :         if (pg_enc2gettext_tbl[i].encoding == encoding)
     924             :         {
     925         254 :             if (bind_textdomain_codeset(domainname,
     926             :                                         pg_enc2gettext_tbl[i].name) != NULL)
     927         254 :                 return true;
     928             : 
     929           0 :             if (elog_ok)
     930           0 :                 elog(LOG, "bind_textdomain_codeset failed");
     931             :             else
     932           0 :                 write_stderr("bind_textdomain_codeset failed");
     933             : 
     934           0 :             break;
     935             :         }
     936             :     }
     937             : 
     938           0 :     return false;
     939             : }
     940             : 
     941             : /*
     942             :  * Bind a gettext message domain to the codeset corresponding to the database
     943             :  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
     944             :  * Return the MessageEncoding implied by the new settings.
     945             :  *
     946             :  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
     947             :  * When that matches the database encoding, we don't need to do anything.  In
     948             :  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
     949             :  * database encoding, except for the C locale.  (On Windows, we also permit a
     950             :  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
     951             :  * gettext to the right codeset.
     952             :  *
     953             :  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
     954             :  * convenient departure for software that passes the strings to Windows ANSI
     955             :  * APIs, but we don't do that.  Compel gettext to use database encoding or,
     956             :  * failing that, the LC_CTYPE encoding as it would on other platforms.
     957             :  *
     958             :  * This function may be called before elog() and palloc() are usable (see elog_ok).
     959             :  */
     960             : int
     961       13354 : pg_bind_textdomain_codeset(const char *domainname)
     962             : {
     963       13354 :     bool        elog_ok = (CurrentMemoryContext != NULL);
     964       13354 :     int         encoding = GetDatabaseEncoding();
     965             :     int         new_msgenc;
     966             : 
     967             : #ifndef WIN32
     968       13354 :     const char *ctype = setlocale(LC_CTYPE, NULL);   /* query only; locale is not changed */
     969             : 
     970       13354 :     if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
     971             : #endif
     972         584 :         if (encoding != PG_SQL_ASCII &&
     973         254 :             raw_pg_bind_textdomain_codeset(domainname, encoding))
     974         254 :             return encoding;
     975             : 
     976       13100 :     new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
     977       13100 :     if (new_msgenc < 0)
     978           0 :         new_msgenc = PG_SQL_ASCII;   /* negative result: fall back to SQL_ASCII */
     979             : 
     980             : #ifdef WIN32
     981             :     if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
     982             :         /* On failure, the old message encoding remains valid. */
     983             :         return GetMessageEncoding();
     984             : #endif
     985             : 
     986       13100 :     return new_msgenc;
     987             : }
     988             : #endif
     989             : 
     990             : /*
     991             :  * The database encoding, also called the server encoding, represents the
     992             :  * encoding of data stored in text-like data types.  Affected types include
     993             :  * cstring, text, varchar, name, xml, and json.  Returns its encoding ID.
     994             :  */
     995             : int
     996    13049318 : GetDatabaseEncoding(void)
     997             : {
     998    13049318 :     return DatabaseEncoding->encoding;
     999             : }
    1000             : 
    1001             : const char *
    1002       17460 : GetDatabaseEncodingName(void)
    1003             : {   /* returns the name field of the current DatabaseEncoding entry (not a copy) */
    1004       17460 :     return DatabaseEncoding->name;
    1005             : }
    1006             : 
    1007             : Datum
    1008          10 : getdatabaseencoding(PG_FUNCTION_ARGS)
    1009             : {   /* fmgr-style function: database encoding name, converted to a Datum through namein() */
    1010          10 :     return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
    1011             : }
    1012             : 
    1013             : Datum
    1014           0 : pg_client_encoding(PG_FUNCTION_ARGS)
    1015             : {   /* fmgr-style function: client encoding name, converted to a Datum through namein() */
    1016           0 :     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
    1017             : }
    1018             : 
    1019             : /*
    1020             :  * Returns the encoding ID in which gettext() returns messages.  This often
    1021             :  * matches the database encoding, but it differs for SQL_ASCII databases, for
    1022             :  * processes not attached to a database, and under a database encoding lacking
    1023             :  * iconv support (MULE_INTERNAL).
    1024             :  */
    1025             : int
    1026           0 : GetMessageEncoding(void)
    1027             : {
    1028           0 :     return MessageEncoding->encoding;
    1029             : }
    1030             : 
    1031             : #ifdef WIN32
    1032             : /*
    1033             :  * Convert "len" bytes at "str" from MessageEncoding to a palloc'ed,
    1034             :  * null-terminated utf16 string; its length is also stored in *utf16len if not
    1035             :  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
    1036             :  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
    1037             :  */
    1038             : WCHAR *
    1039             : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
    1040             : {
    1041             :     int         msgenc = GetMessageEncoding();
    1042             :     WCHAR      *utf16;
    1043             :     int         dstlen;
    1044             :     UINT        codepage;
    1045             : 
    1046             :     if (msgenc == PG_SQL_ASCII)
    1047             :         /* No conversion is possible, and SQL_ASCII is never utf16. */
    1048             :         return NULL;
    1049             : 
    1050             :     codepage = pg_enc2name_tbl[msgenc].codepage;
    1051             : 
    1052             :     /*
    1053             :      * Use MultiByteToWideChar directly if there is a corresponding codepage,
    1054             :      * or double conversion through UTF8 if not.  Double conversion is needed,
    1055             :      * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
    1056             :      */
    1057             :     if (codepage != 0)
    1058             :     {
    1059             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));   /* len WCHARs plus terminator */
    1060             :         dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
    1061             :         utf16[dstlen] = (WCHAR) 0;
    1062             :     }
    1063             :     else
    1064             :     {
    1065             :         char       *utf8;
    1066             : 
    1067             :         /*
    1068             :          * XXX pg_do_encoding_conversion() requires a transaction.  In the
    1069             :          * absence of one, hope for the input to be valid UTF8.
    1070             :          */
    1071             :         if (IsTransactionState())
    1072             :         {
    1073             :             utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
    1074             :                                                       len,
    1075             :                                                       msgenc,
    1076             :                                                       PG_UTF8);
    1077             :             if (utf8 != str)   /* a new string was built, so its length may differ */
    1078             :                 len = strlen(utf8);
    1079             :         }
    1080             :         else
    1081             :             utf8 = (char *) str;
    1082             : 
    1083             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));   /* len WCHARs plus terminator */
    1084             :         dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
    1085             :         utf16[dstlen] = (WCHAR) 0;
    1086             : 
    1087             :         if (utf8 != str)   /* free only the intermediate copy, never the caller's string */
    1088             :             pfree(utf8);
    1089             :     }
    1090             : 
    1091             :     if (dstlen == 0 && len > 0)   /* nothing converted from non-empty input */
    1092             :     {
    1093             :         pfree(utf16);
    1094             :         return NULL;            /* error */
    1095             :     }
    1096             : 
    1097             :     if (utf16len)
    1098             :         *utf16len = dstlen;
    1099             :     return utf16;
    1100             : }
    1101             : 
    1102             : #endif

Generated by: LCOV version 1.13