LCOV - code coverage report
Current view: top level - src/backend/utils/mb - mbutils.c (source / functions) Hit Total Coverage
Test: PostgreSQL 12beta2 Lines: 232 303 76.6 %
Date: 2019-06-18 07:06:57 Functions: 30 39 76.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * mbutils.c
       4             :  *    This file contains functions for encoding conversion.
       5             :  *
       6             :  * The string-conversion functions in this file share some API quirks.
       7             :  * Note the following:
       8             :  *
       9             :  * The functions return a palloc'd, null-terminated string if conversion
      10             :  * is required.  However, if no conversion is performed, the given source
      11             :  * string pointer is returned as-is.
      12             :  *
      13             :  * Although the presence of a length argument means that callers can pass
      14             :  * non-null-terminated strings, care is required because the same string
      15             :  * will be passed back if no conversion occurs.  Such callers *must* check
      16             :  * whether result == src and handle that case differently.
      17             :  *
      18             :  * If the source and destination encodings are the same, the source string
      19             :  * is returned without any verification; it's assumed to be valid data.
      20             :  * If that might not be the case, the caller is responsible for validating
      21             :  * the string using a separate call to pg_verify_mbstr().  Whenever the
      22             :  * source and destination encodings are different, the functions ensure that
      23             :  * the result is validly encoded according to the destination encoding.
      24             :  *
      25             :  *
      26             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
      27             :  * Portions Copyright (c) 1994, Regents of the University of California
      28             :  *
      29             :  *
      30             :  * IDENTIFICATION
      31             :  *    src/backend/utils/mb/mbutils.c
      32             :  *
      33             :  *-------------------------------------------------------------------------
      34             :  */
      35             : #include "postgres.h"
      36             : 
      37             : #include "access/xact.h"
      38             : #include "catalog/namespace.h"
      39             : #include "mb/pg_wchar.h"
      40             : #include "utils/builtins.h"
      41             : #include "utils/memutils.h"
      42             : #include "utils/syscache.h"
      43             : 
      44             : /*
      45             :  * We maintain a simple linked list caching the fmgr lookup info for the
      46             :  * currently selected conversion functions, as well as any that have been
      47             :  * selected previously in the current session.  (We remember previous
      48             :  * settings because we must be able to restore a previous setting during
      49             :  * transaction rollback, without doing any fresh catalog accesses.)
      50             :  *
      51             :  * Since we'll never release this data, we just keep it in TopMemoryContext.
      52             :  */
      53             : typedef struct ConvProcInfo
      54             : {
      55             :     int         s_encoding;     /* server and client encoding IDs */
      56             :     int         c_encoding;
      57             :     FmgrInfo    to_server_info; /* lookup info for conversion procs */
      58             :     FmgrInfo    to_client_info;
      59             : } ConvProcInfo;
      60             : 
      61             : static List *ConvProcList = NIL;    /* List of ConvProcInfo */
      62             : 
      63             : /*
      64             :  * These variables point to the currently active conversion functions,
      65             :  * or are NULL when no conversion is needed.
      66             :  */
      67             : static FmgrInfo *ToServerConvProc = NULL;
      68             : static FmgrInfo *ToClientConvProc = NULL;
      69             : 
      70             : /*
      71             :  * These variables track the currently-selected encodings.
      72             :  */
      73             : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      74             : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      75             : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      76             : 
      77             : /*
      78             :  * During backend startup we can't set client encoding because we (a)
      79             :  * can't look up the conversion functions, and (b) may not know the database
      80             :  * encoding yet either.  So SetClientEncoding() just accepts anything and
      81             :  * remembers it for InitializeClientEncoding() to apply later.
      82             :  */
      83             : static bool backend_startup_complete = false;
      84             : static int  pending_client_encoding = PG_SQL_ASCII;
      85             : 
      86             : 
      87             : /* Internal functions */
      88             : static char *perform_default_encoding_conversion(const char *src,
      89             :                                                  int len, bool is_client_to_server);
      90             : static int  cliplen(const char *str, int len, int limit);
      91             : 
      92             : 
      93             : /*
      94             :  * Prepare for a future call to SetClientEncoding.  Success should mean
      95             :  * that SetClientEncoding is guaranteed to succeed for this encoding request.
      96             :  *
      97             :  * (But note that success before backend_startup_complete does not guarantee
      98             :  * success after ...)
      99             :  *
     100             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     101             :  */
     102             : int
     103       23900 : PrepareClientEncoding(int encoding)
     104             : {
     105             :     int         current_server_encoding;
     106             :     ListCell   *lc;
     107             : 
     108       23900 :     if (!PG_VALID_FE_ENCODING(encoding))
     109           0 :         return -1;
     110             : 
     111             :     /* Can't do anything during startup, per notes above */
     112       23900 :     if (!backend_startup_complete)
     113       11420 :         return 0;
     114             : 
     115       12480 :     current_server_encoding = GetDatabaseEncoding();
     116             : 
     117             :     /*
     118             :      * Check for cases that require no conversion function.
     119             :      */
     120       12480 :     if (current_server_encoding == encoding ||
     121        3330 :         current_server_encoding == PG_SQL_ASCII ||
     122             :         encoding == PG_SQL_ASCII)
     123       12474 :         return 0;
     124             : 
     125           6 :     if (IsTransactionState())
     126             :     {
     127             :         /*
     128             :          * If we're in a live transaction, it's safe to access the catalogs,
     129             :          * so look up the functions.  We repeat the lookup even if the info is
     130             :          * already cached, so that we can react to changes in the contents of
     131             :          * pg_conversion.
     132             :          */
     133             :         Oid         to_server_proc,
     134             :                     to_client_proc;
     135             :         ConvProcInfo *convinfo;
     136             :         MemoryContext oldcontext;
     137             : 
     138           6 :         to_server_proc = FindDefaultConversionProc(encoding,
     139             :                                                    current_server_encoding);
     140           6 :         if (!OidIsValid(to_server_proc))
     141           0 :             return -1;
     142           6 :         to_client_proc = FindDefaultConversionProc(current_server_encoding,
     143             :                                                    encoding);
     144           6 :         if (!OidIsValid(to_client_proc))
     145           0 :             return -1;
     146             : 
     147             :         /*
     148             :          * Load the fmgr info into TopMemoryContext (could still fail here)
     149             :          */
     150           6 :         convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
     151             :                                                        sizeof(ConvProcInfo));
     152           6 :         convinfo->s_encoding = current_server_encoding;
     153           6 :         convinfo->c_encoding = encoding;
     154           6 :         fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
     155             :                       TopMemoryContext);
     156           6 :         fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
     157             :                       TopMemoryContext);
     158             : 
     159             :         /* Attach new info to head of list */
     160           6 :         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
     161           6 :         ConvProcList = lcons(convinfo, ConvProcList);
     162           6 :         MemoryContextSwitchTo(oldcontext);
     163             : 
     164             :         /*
     165             :          * We cannot yet remove any older entry for the same encoding pair,
     166             :          * since it could still be in use.  SetClientEncoding will clean up.
     167             :          */
     168             : 
     169           6 :         return 0;               /* success */
     170             :     }
     171             :     else
     172             :     {
     173             :         /*
     174             :          * If we're not in a live transaction, the only thing we can do is
     175             :          * restore a previous setting using the cache.  This covers all
     176             :          * transaction-rollback cases.  The only case it might not work for is
     177             :          * trying to change client_encoding on the fly by editing
     178             :          * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
     179             :          * thing to do anyway.
     180             :          */
     181           0 :         foreach(lc, ConvProcList)
     182             :         {
     183           0 :             ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
     184             : 
     185           0 :             if (oldinfo->s_encoding == current_server_encoding &&
     186           0 :                 oldinfo->c_encoding == encoding)
     187           0 :                 return 0;
     188             :         }
     189             : 
     190           0 :         return -1;              /* it's not cached, so fail */
     191             :     }
     192             : }
     193             : 
     194             : /*
     195             :  * Set the active client encoding and set up the conversion-function pointers.
     196             :  * PrepareClientEncoding should have been called previously for this encoding.
     197             :  *
     198             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     199             :  */
     200             : int
     201       19862 : SetClientEncoding(int encoding)
     202             : {
     203             :     int         current_server_encoding;
     204             :     bool        found;
     205             :     ListCell   *lc;
     206             :     ListCell   *prev;
     207             :     ListCell   *next;
     208             : 
     209       19862 :     if (!PG_VALID_FE_ENCODING(encoding))
     210           0 :         return -1;
     211             : 
     212             :     /* Can't do anything during startup, per notes above */
     213       19862 :     if (!backend_startup_complete)
     214             :     {
     215        9024 :         pending_client_encoding = encoding;
     216        9024 :         return 0;
     217             :     }
     218             : 
     219       10838 :     current_server_encoding = GetDatabaseEncoding();
     220             : 
     221             :     /*
     222             :      * Check for cases that require no conversion function.
     223             :      */
     224       10838 :     if (current_server_encoding == encoding ||
     225        1688 :         current_server_encoding == PG_SQL_ASCII ||
     226             :         encoding == PG_SQL_ASCII)
     227             :     {
     228       10832 :         ClientEncoding = &pg_enc2name_tbl[encoding];
     229       10832 :         ToServerConvProc = NULL;
     230       10832 :         ToClientConvProc = NULL;
     231       10832 :         return 0;
     232             :     }
     233             : 
     234             :     /*
     235             :      * Search the cache for the entry previously prepared by
     236             :      * PrepareClientEncoding; if there isn't one, we lose.  While at it,
     237             :      * release any duplicate entries so that repeated Prepare/Set cycles don't
     238             :      * leak memory.
     239             :      */
     240           6 :     found = false;
     241           6 :     prev = NULL;
     242          12 :     for (lc = list_head(ConvProcList); lc; lc = next)
     243             :     {
     244           6 :         ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
     245             : 
     246           6 :         next = lnext(lc);
     247             : 
     248          12 :         if (convinfo->s_encoding == current_server_encoding &&
     249           6 :             convinfo->c_encoding == encoding)
     250             :         {
     251           6 :             if (!found)
     252             :             {
     253             :                 /* Found newest entry, so set up */
     254           6 :                 ClientEncoding = &pg_enc2name_tbl[encoding];
     255           6 :                 ToServerConvProc = &convinfo->to_server_info;
     256           6 :                 ToClientConvProc = &convinfo->to_client_info;
     257           6 :                 found = true;
     258             :             }
     259             :             else
     260             :             {
     261             :                 /* Duplicate entry, release it */
     262           0 :                 ConvProcList = list_delete_cell(ConvProcList, lc, prev);
     263           0 :                 pfree(convinfo);
     264           0 :                 continue;       /* prev mustn't advance */
     265             :             }
     266             :         }
     267             : 
     268           6 :         prev = lc;
     269             :     }
     270             : 
     271           6 :     if (found)
     272           6 :         return 0;               /* success */
     273             :     else
     274           0 :         return -1;              /* it's not cached, so fail */
     275             : }
     276             : 
     277             : /*
     278             :  * Initialize client encoding conversions.
     279             :  *      Called from InitPostgres() once during backend startup.
     280             :  */
     281             : void
     282        9030 : InitializeClientEncoding(void)
     283             : {
     284             :     Assert(!backend_startup_complete);
     285        9030 :     backend_startup_complete = true;
     286             : 
     287       18060 :     if (PrepareClientEncoding(pending_client_encoding) < 0 ||
     288        9030 :         SetClientEncoding(pending_client_encoding) < 0)
     289             :     {
     290             :         /*
     291             :          * Oops, the requested conversion is not available. We couldn't fail
     292             :          * before, but we can now.
     293             :          */
     294           0 :         ereport(FATAL,
     295             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     296             :                  errmsg("conversion between %s and %s is not supported",
     297             :                         pg_enc2name_tbl[pending_client_encoding].name,
     298             :                         GetDatabaseEncodingName())));
     299             :     }
     300        9030 : }
     301             : 
     302             : /*
     303             :  * returns the current client encoding
     304             :  */
     305             : int
     306        4582 : pg_get_client_encoding(void)
     307             : {
     308        4582 :     return ClientEncoding->encoding;
     309             : }
     310             : 
     311             : /*
     312             :  * returns the current client encoding name
     313             :  */
     314             : const char *
     315           0 : pg_get_client_encoding_name(void)
     316             : {
     317           0 :     return ClientEncoding->name;
     318             : }
     319             : 
     320             : /*
     321             :  * Convert src string to another encoding (general case).
     322             :  *
     323             :  * See the notes about string conversion functions at the top of this file.
     324             :  */
     325             : unsigned char *
     326        1170 : pg_do_encoding_conversion(unsigned char *src, int len,
     327             :                           int src_encoding, int dest_encoding)
     328             : {
     329             :     unsigned char *result;
     330             :     Oid         proc;
     331             : 
     332        1170 :     if (len <= 0)
     333           4 :         return src;             /* empty string is always valid */
     334             : 
     335        1166 :     if (src_encoding == dest_encoding)
     336         622 :         return src;             /* no conversion required, assume valid */
     337             : 
     338         544 :     if (dest_encoding == PG_SQL_ASCII)
     339           8 :         return src;             /* any string is valid in SQL_ASCII */
     340             : 
     341         536 :     if (src_encoding == PG_SQL_ASCII)
     342             :     {
     343             :         /* No conversion is possible, but we must validate the result */
     344          24 :         (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
     345          24 :         return src;
     346             :     }
     347             : 
     348         512 :     if (!IsTransactionState())  /* shouldn't happen */
     349           0 :         elog(ERROR, "cannot perform encoding conversion outside a transaction");
     350             : 
     351         512 :     proc = FindDefaultConversionProc(src_encoding, dest_encoding);
     352         512 :     if (!OidIsValid(proc))
     353           0 :         ereport(ERROR,
     354             :                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
     355             :                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
     356             :                         pg_encoding_to_char(src_encoding),
     357             :                         pg_encoding_to_char(dest_encoding))));
     358             : 
     359             :     /*
     360             :      * Allocate space for conversion result, being wary of integer overflow
     361             :      */
     362         512 :     if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
     363           0 :         ereport(ERROR,
     364             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     365             :                  errmsg("out of memory"),
     366             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     367             :                            len)));
     368             : 
     369         512 :     result = palloc(len * MAX_CONVERSION_GROWTH + 1);
     370             : 
     371         512 :     OidFunctionCall5(proc,
     372             :                      Int32GetDatum(src_encoding),
     373             :                      Int32GetDatum(dest_encoding),
     374             :                      CStringGetDatum(src),
     375             :                      CStringGetDatum(result),
     376             :                      Int32GetDatum(len));
     377         512 :     return result;
     378             : }
     379             : 
     380             : /*
     381             :  * Convert string to encoding encoding_name. The source
     382             :  * encoding is the DB encoding.
     383             :  *
     384             :  * BYTEA convert_to(TEXT string, NAME encoding_name) */
     385             : Datum
     386           0 : pg_convert_to(PG_FUNCTION_ARGS)
     387             : {
     388           0 :     Datum       string = PG_GETARG_DATUM(0);
     389           0 :     Datum       dest_encoding_name = PG_GETARG_DATUM(1);
     390           0 :     Datum       src_encoding_name = DirectFunctionCall1(namein,
     391             :                                                         CStringGetDatum(DatabaseEncoding->name));
     392             :     Datum       result;
     393             : 
     394             :     /*
     395             :      * pg_convert expects a bytea as its first argument. We're passing it a
     396             :      * text argument here, relying on the fact that they are both in fact
     397             :      * varlena types, and thus structurally identical.
     398             :      */
     399           0 :     result = DirectFunctionCall3(pg_convert, string,
     400             :                                  src_encoding_name, dest_encoding_name);
     401             : 
     402           0 :     PG_RETURN_DATUM(result);
     403             : }
     404             : 
     405             : /*
     406             :  * Convert string from encoding encoding_name. The destination
     407             :  * encoding is the DB encoding.
     408             :  *
     409             :  * TEXT convert_from(BYTEA string, NAME encoding_name) */
     410             : Datum
     411          20 : pg_convert_from(PG_FUNCTION_ARGS)
     412             : {
     413          20 :     Datum       string = PG_GETARG_DATUM(0);
     414          20 :     Datum       src_encoding_name = PG_GETARG_DATUM(1);
     415          20 :     Datum       dest_encoding_name = DirectFunctionCall1(namein,
     416             :                                                          CStringGetDatum(DatabaseEncoding->name));
     417             :     Datum       result;
     418             : 
     419          20 :     result = DirectFunctionCall3(pg_convert, string,
     420             :                                  src_encoding_name, dest_encoding_name);
     421             : 
     422             :     /*
     423             :      * pg_convert returns a bytea, which we in turn return as text, relying on
     424             :      * the fact that they are both in fact varlena types, and thus
     425             :      * structurally identical. Although not all bytea values are valid text,
     426             :      * in this case it will be because we've told pg_convert to return one
     427             :      * that is valid as text in the current database encoding.
     428             :      */
     429          20 :     PG_RETURN_DATUM(result);
     430             : }
     431             : 
     432             : /*
     433             :  * Convert string between two arbitrary encodings.
     434             :  *
     435             :  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
     436             :  */
     437             : Datum
     438         548 : pg_convert(PG_FUNCTION_ARGS)
     439             : {
     440         548 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     441         548 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     442         548 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     443         548 :     char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
     444         548 :     int         dest_encoding = pg_char_to_encoding(dest_encoding_name);
     445             :     const char *src_str;
     446             :     char       *dest_str;
     447             :     bytea      *retval;
     448             :     int         len;
     449             : 
     450         548 :     if (src_encoding < 0)
     451           0 :         ereport(ERROR,
     452             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     453             :                  errmsg("invalid source encoding name \"%s\"",
     454             :                         src_encoding_name)));
     455         548 :     if (dest_encoding < 0)
     456           0 :         ereport(ERROR,
     457             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     458             :                  errmsg("invalid destination encoding name \"%s\"",
     459             :                         dest_encoding_name)));
     460             : 
     461             :     /* make sure that source string is valid */
     462         548 :     len = VARSIZE_ANY_EXHDR(string);
     463         548 :     src_str = VARDATA_ANY(string);
     464         548 :     pg_verify_mbstr_len(src_encoding, src_str, len, false);
     465             : 
     466             :     /* perform conversion */
     467         548 :     dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
     468             :                                                   len,
     469             :                                                   src_encoding,
     470             :                                                   dest_encoding);
     471             : 
     472             :     /* update len if conversion actually happened */
     473         548 :     if (dest_str != src_str)
     474         512 :         len = strlen(dest_str);
     475             : 
     476             :     /*
     477             :      * build bytea data type structure.
     478             :      */
     479         548 :     retval = (bytea *) palloc(len + VARHDRSZ);
     480         548 :     SET_VARSIZE(retval, len + VARHDRSZ);
     481         548 :     memcpy(VARDATA(retval), dest_str, len);
     482             : 
     483         548 :     if (dest_str != src_str)
     484         512 :         pfree(dest_str);
     485             : 
     486             :     /* free memory if allocated by the toaster */
     487         548 :     PG_FREE_IF_COPY(string, 0);
     488             : 
     489         548 :     PG_RETURN_BYTEA_P(retval);
     490             : }
     491             : 
     492             : /*
     493             :  * get the length of the string considered as text in the specified
     494             :  * encoding. Raises an error if the data is not valid in that
     495             :  * encoding.
     496             :  *
     497             :  * INT4 length (BYTEA string, NAME src_encoding_name)
     498             :  */
     499             : Datum
     500           0 : length_in_encoding(PG_FUNCTION_ARGS)
     501             : {
     502           0 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     503           0 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     504           0 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     505             :     const char *src_str;
     506             :     int         len;
     507             :     int         retval;
     508             : 
     509           0 :     if (src_encoding < 0)
     510           0 :         ereport(ERROR,
     511             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     512             :                  errmsg("invalid encoding name \"%s\"",
     513             :                         src_encoding_name)));
     514             : 
     515           0 :     len = VARSIZE_ANY_EXHDR(string);
     516           0 :     src_str = VARDATA_ANY(string);
     517             : 
     518           0 :     retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
     519             : 
     520           0 :     PG_RETURN_INT32(retval);
     521             : }
     522             : 
     523             : /*
     524             :  * Get maximum multibyte character length in the specified encoding.
     525             :  *
     526             :  * Note encoding is specified numerically, not by name as above.
     527             :  */
     528             : Datum
     529           0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
     530             : {
     531           0 :     int         encoding = PG_GETARG_INT32(0);
     532             : 
     533           0 :     if (PG_VALID_ENCODING(encoding))
     534           0 :         PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
     535             :     else
     536           0 :         PG_RETURN_NULL();
     537             : }
     538             : 
     539             : /*
     540             :  * Convert client encoding to server encoding.
     541             :  *
     542             :  * See the notes about string conversion functions at the top of this file.
     543             :  */
     544             : char *
     545      564874 : pg_client_to_server(const char *s, int len)
     546             : {
     547      564874 :     return pg_any_to_server(s, len, ClientEncoding->encoding);
     548             : }
     549             : 
     550             : /*
     551             :  * Convert any encoding to server encoding.
     552             :  *
     553             :  * See the notes about string conversion functions at the top of this file.
     554             :  *
     555             :  * Unlike the other string conversion functions, this will apply validation
     556             :  * even if encoding == DatabaseEncoding->encoding.  This is because this is
     557             :  * used to process data coming in from outside the database, and we never
     558             :  * want to just assume validity.
     559             :  */
     560             : char *
     561     2891636 : pg_any_to_server(const char *s, int len, int encoding)
     562             : {
     563     2891636 :     if (len <= 0)
     564       90750 :         return unconstify(char *, s);   /* empty string is always valid */
     565             : 
     566     2800886 :     if (encoding == DatabaseEncoding->encoding ||
     567             :         encoding == PG_SQL_ASCII)
     568             :     {
     569             :         /*
     570             :          * No conversion is needed, but we must still validate the data.
     571             :          */
     572     2800864 :         (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
     573     2800862 :         return unconstify(char *, s);
     574             :     }
     575             : 
     576          22 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     577             :     {
     578             :         /*
     579             :          * No conversion is possible, but we must still validate the data,
     580             :          * because the client-side code might have done string escaping using
     581             :          * the selected client_encoding.  If the client encoding is ASCII-safe
     582             :          * then we just do a straight validation under that encoding.  For an
     583             :          * ASCII-unsafe encoding we have a problem: we dare not pass such data
     584             :          * to the parser but we have no way to convert it.  We compromise by
     585             :          * rejecting the data if it contains any non-ASCII characters.
     586             :          */
     587           2 :         if (PG_VALID_BE_ENCODING(encoding))
     588           2 :             (void) pg_verify_mbstr(encoding, s, len, false);
     589             :         else
     590             :         {
     591             :             int         i;
     592             : 
     593           0 :             for (i = 0; i < len; i++)
     594             :             {
     595           0 :                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
     596           0 :                     ereport(ERROR,
     597             :                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     598             :                              errmsg("invalid byte value for encoding \"%s\": 0x%02x",
     599             :                                     pg_enc2name_tbl[PG_SQL_ASCII].name,
     600             :                                     (unsigned char) s[i])));
     601             :             }
     602             :         }
     603           2 :         return unconstify(char *, s);
     604             :     }
     605             : 
     606             :     /* Fast path if we can use cached conversion function */
     607          20 :     if (encoding == ClientEncoding->encoding)
     608          20 :         return perform_default_encoding_conversion(s, len, true);
     609             : 
     610             :     /* General case ... will not work outside transactions */
     611           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     612             :                                               len,
     613             :                                               encoding,
     614           0 :                                               DatabaseEncoding->encoding);
     615             : }
     616             : 
     617             : /*
     618             :  * Convert server encoding to client encoding.
     619             :  *
     620             :  * See the notes about string conversion functions at the top of this file.
     621             :  */
     622             : char *
     623     9123590 : pg_server_to_client(const char *s, int len)
     624             : {
     625     9123590 :     return pg_server_to_any(s, len, ClientEncoding->encoding);
     626             : }
     627             : 
     628             : /*
     629             :  * Convert server encoding to any encoding.
     630             :  *
     631             :  * See the notes about string conversion functions at the top of this file.
     632             :  */
     633             : char *
     634    13478338 : pg_server_to_any(const char *s, int len, int encoding)
     635             : {
     636    13478338 :     if (len <= 0)
     637      109610 :         return unconstify(char *, s);   /* empty string is always valid */
     638             : 
     639    13368728 :     if (encoding == DatabaseEncoding->encoding ||
     640             :         encoding == PG_SQL_ASCII)
     641    13368414 :         return unconstify(char *, s);   /* assume data is valid */
     642             : 
     643         314 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     644             :     {
     645             :         /* No conversion is possible, but we must validate the result */
     646          88 :         (void) pg_verify_mbstr(encoding, s, len, false);
     647          88 :         return unconstify(char *, s);
     648             :     }
     649             : 
     650             :     /* Fast path if we can use cached conversion function */
     651         226 :     if (encoding == ClientEncoding->encoding)
     652         226 :         return perform_default_encoding_conversion(s, len, false);
     653             : 
     654             :     /* General case ... will not work outside transactions */
     655           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     656             :                                               len,
     657           0 :                                               DatabaseEncoding->encoding,
     658             :                                               encoding);
     659             : }
     660             : 
     661             : /*
     662             :  *  Perform default encoding conversion using cached FmgrInfo. Since
     663             :  *  this function does not access database at all, it is safe to call
     664             :  *  outside transactions.  If the conversion has not been set up by
     665             :  *  SetClientEncoding(), no conversion is performed.
     666             :  */
     667             : static char *
     668         246 : perform_default_encoding_conversion(const char *src, int len,
     669             :                                     bool is_client_to_server)
     670             : {
     671             :     char       *result;
     672             :     int         src_encoding,
     673             :                 dest_encoding;
     674             :     FmgrInfo   *flinfo;
     675             : 
     676         246 :     if (is_client_to_server)
     677             :     {
     678          20 :         src_encoding = ClientEncoding->encoding;
     679          20 :         dest_encoding = DatabaseEncoding->encoding;
     680          20 :         flinfo = ToServerConvProc;
     681             :     }
     682             :     else
     683             :     {
     684         226 :         src_encoding = DatabaseEncoding->encoding;
     685         226 :         dest_encoding = ClientEncoding->encoding;
     686         226 :         flinfo = ToClientConvProc;
     687             :     }
     688             : 
     689         246 :     if (flinfo == NULL)
     690           0 :         return unconstify(char *, src);
     691             : 
     692             :     /*
     693             :      * Allocate space for conversion result, being wary of integer overflow
     694             :      */
     695         246 :     if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
     696           0 :         ereport(ERROR,
     697             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     698             :                  errmsg("out of memory"),
     699             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     700             :                            len)));
     701             : 
     702         246 :     result = palloc(len * MAX_CONVERSION_GROWTH + 1);
     703             : 
     704         246 :     FunctionCall5(flinfo,
     705             :                   Int32GetDatum(src_encoding),
     706             :                   Int32GetDatum(dest_encoding),
     707             :                   CStringGetDatum(src),
     708             :                   CStringGetDatum(result),
     709             :                   Int32GetDatum(len));
     710         246 :     return result;
     711             : }
     712             : 
     713             : 
     714             : /* convert a multibyte string to a wchar */
     715             : int
     716           0 : pg_mb2wchar(const char *from, pg_wchar *to)
     717             : {
     718           0 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
     719             : }
     720             : 
     721             : /* convert a multibyte string to a wchar with a limited length */
     722             : int
     723      607414 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
     724             : {
     725      607414 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     726             : }
     727             : 
     728             : /* same, with any encoding */
     729             : int
     730       12094 : pg_encoding_mb2wchar_with_len(int encoding,
     731             :                               const char *from, pg_wchar *to, int len)
     732             : {
     733       12094 :     return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     734             : }
     735             : 
     736             : /* convert a wchar string to a multibyte */
     737             : int
     738           0 : pg_wchar2mb(const pg_wchar *from, char *to)
     739             : {
     740           0 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
     741             : }
     742             : 
     743             : /* convert a wchar string to a multibyte with a limited length */
     744             : int
     745     1008066 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
     746             : {
     747     1008066 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
     748             : }
     749             : 
     750             : /* same, with any encoding */
     751             : int
     752           0 : pg_encoding_wchar2mb_with_len(int encoding,
     753             :                               const pg_wchar *from, char *to, int len)
     754             : {
     755           0 :     return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
     756             : }
     757             : 
     758             : /* returns the byte length of a multibyte character */
     759             : int
     760   122892166 : pg_mblen(const char *mbstr)
     761             : {
     762   122892166 :     return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
     763             : }
     764             : 
     765             : /* returns the display length of a multibyte character */
     766             : int
     767        4356 : pg_dsplen(const char *mbstr)
     768             : {
     769        4356 :     return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
     770             : }
     771             : 
     772             : /* returns the length (counted in wchars) of a multibyte string */
     773             : int
     774         468 : pg_mbstrlen(const char *mbstr)
     775             : {
     776         468 :     int         len = 0;
     777             : 
     778             :     /* optimization for single byte encoding */
     779         468 :     if (pg_database_encoding_max_length() == 1)
     780           0 :         return strlen(mbstr);
     781             : 
     782        1552 :     while (*mbstr)
     783             :     {
     784         616 :         mbstr += pg_mblen(mbstr);
     785         616 :         len++;
     786             :     }
     787         468 :     return len;
     788             : }
     789             : 
     790             : /* returns the length (counted in wchars) of a multibyte string
     791             :  * (not necessarily NULL terminated)
     792             :  */
     793             : int
     794      926708 : pg_mbstrlen_with_len(const char *mbstr, int limit)
     795             : {
     796      926708 :     int         len = 0;
     797             : 
     798             :     /* optimization for single byte encoding */
     799      926708 :     if (pg_database_encoding_max_length() == 1)
     800         192 :         return limit;
     801             : 
     802   105817844 :     while (limit > 0 && *mbstr)
     803             :     {
     804   103964812 :         int         l = pg_mblen(mbstr);
     805             : 
     806   103964812 :         limit -= l;
     807   103964812 :         mbstr += l;
     808   103964812 :         len++;
     809             :     }
     810      926516 :     return len;
     811             : }
     812             : 
     813             : /*
     814             :  * returns the byte length of a multibyte string
     815             :  * (not necessarily NULL terminated)
     816             :  * that is no longer than limit.
     817             :  * this function does not break multibyte character boundary.
     818             :  */
     819             : int
     820       34378 : pg_mbcliplen(const char *mbstr, int len, int limit)
     821             : {
     822       34378 :     return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
     823             :                                  len, limit);
     824             : }
     825             : 
     826             : /*
     827             :  * pg_mbcliplen with specified encoding
     828             :  */
     829             : int
     830       34378 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
     831             :                       int len, int limit)
     832             : {
     833             :     mblen_converter mblen_fn;
     834       34378 :     int         clen = 0;
     835             :     int         l;
     836             : 
     837             :     /* optimization for single byte encoding */
     838       34378 :     if (pg_encoding_max_length(encoding) == 1)
     839        4702 :         return cliplen(mbstr, len, limit);
     840             : 
     841       29676 :     mblen_fn = pg_wchar_table[encoding].mblen;
     842             : 
     843      377970 :     while (len > 0 && *mbstr)
     844             :     {
     845      331968 :         l = (*mblen_fn) ((const unsigned char *) mbstr);
     846      331968 :         if ((clen + l) > limit)
     847          32 :             break;
     848      331936 :         clen += l;
     849      331936 :         if (clen == limit)
     850       13318 :             break;
     851      318618 :         len -= l;
     852      318618 :         mbstr += l;
     853             :     }
     854       29676 :     return clen;
     855             : }
     856             : 
     857             : /*
     858             :  * Similar to pg_mbcliplen except the limit parameter specifies the
     859             :  * character length, not the byte length.
     860             :  */
     861             : int
     862         168 : pg_mbcharcliplen(const char *mbstr, int len, int limit)
     863             : {
     864         168 :     int         clen = 0;
     865         168 :     int         nch = 0;
     866             :     int         l;
     867             : 
     868             :     /* optimization for single byte encoding */
     869         168 :     if (pg_database_encoding_max_length() == 1)
     870           0 :         return cliplen(mbstr, len, limit);
     871             : 
     872         956 :     while (len > 0 && *mbstr)
     873             :     {
     874         776 :         l = pg_mblen(mbstr);
     875         776 :         nch++;
     876         776 :         if (nch > limit)
     877         156 :             break;
     878         620 :         clen += l;
     879         620 :         len -= l;
     880         620 :         mbstr += l;
     881             :     }
     882         168 :     return clen;
     883             : }
     884             : 
     885             : /* mbcliplen for any single-byte encoding */
     886             : static int
     887        4702 : cliplen(const char *str, int len, int limit)
     888             : {
     889        4702 :     int         l = 0;
     890             : 
     891        4702 :     len = Min(len, limit);
     892       36196 :     while (l < len && str[l])
     893       26792 :         l++;
     894        4702 :     return l;
     895             : }
     896             : 
     897             : void
     898        8428 : SetDatabaseEncoding(int encoding)
     899             : {
     900        8428 :     if (!PG_VALID_BE_ENCODING(encoding))
     901           0 :         elog(ERROR, "invalid database encoding: %d", encoding);
     902             : 
     903        8428 :     DatabaseEncoding = &pg_enc2name_tbl[encoding];
     904             :     Assert(DatabaseEncoding->encoding == encoding);
     905        8428 : }
     906             : 
     907             : void
     908       10978 : SetMessageEncoding(int encoding)
     909             : {
     910             :     /* Some calls happen before we can elog()! */
     911             :     Assert(PG_VALID_ENCODING(encoding));
     912             : 
     913       10978 :     MessageEncoding = &pg_enc2name_tbl[encoding];
     914             :     Assert(MessageEncoding->encoding == encoding);
     915       10978 : }
     916             : 
     917             : #ifdef ENABLE_NLS
     918             : /*
     919             :  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
     920             :  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
     921             :  * fail for gettext-internal causes like out-of-memory.
     922             :  */
     923             : static bool
     924         268 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
     925             : {
     926         268 :     bool        elog_ok = (CurrentMemoryContext != NULL);
     927             :     int         i;
     928             : 
     929         756 :     for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
     930             :     {
     931         756 :         if (pg_enc2gettext_tbl[i].encoding == encoding)
     932             :         {
     933         268 :             if (bind_textdomain_codeset(domainname,
     934             :                                         pg_enc2gettext_tbl[i].name) != NULL)
     935         268 :                 return true;
     936             : 
     937           0 :             if (elog_ok)
     938           0 :                 elog(LOG, "bind_textdomain_codeset failed");
     939             :             else
     940           0 :                 write_stderr("bind_textdomain_codeset failed");
     941             : 
     942           0 :             break;
     943             :         }
     944             :     }
     945             : 
     946           0 :     return false;
     947             : }
     948             : 
     949             : /*
     950             :  * Bind a gettext message domain to the codeset corresponding to the database
     951             :  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
     952             :  * Return the MessageEncoding implied by the new settings.
     953             :  *
     954             :  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
     955             :  * When that matches the database encoding, we don't need to do anything.  In
     956             :  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
     957             :  * database encoding, except for the C locale.  (On Windows, we also permit a
     958             :  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
     959             :  * gettext to the right codeset.
     960             :  *
     961             :  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
     962             :  * convenient departure for software that passes the strings to Windows ANSI
     963             :  * APIs, but we don't do that.  Compel gettext to use database encoding or,
     964             :  * failing that, the LC_CTYPE encoding as it would on other platforms.
     965             :  *
     966             :  * This function is called before elog() and palloc() are usable.
     967             :  */
     968             : int
     969       13270 : pg_bind_textdomain_codeset(const char *domainname)
     970             : {
     971       13270 :     bool        elog_ok = (CurrentMemoryContext != NULL);
     972       13270 :     int         encoding = GetDatabaseEncoding();
     973             :     int         new_msgenc;
     974             : 
     975             : #ifndef WIN32
     976       13270 :     const char *ctype = setlocale(LC_CTYPE, NULL);
     977             : 
     978       13270 :     if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
     979             : #endif
     980         624 :         if (encoding != PG_SQL_ASCII &&
     981         268 :             raw_pg_bind_textdomain_codeset(domainname, encoding))
     982         268 :             return encoding;
     983             : 
     984       13002 :     new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
     985       13002 :     if (new_msgenc < 0)
     986           0 :         new_msgenc = PG_SQL_ASCII;
     987             : 
     988             : #ifdef WIN32
     989             :     if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
     990             :         /* On failure, the old message encoding remains valid. */
     991             :         return GetMessageEncoding();
     992             : #endif
     993             : 
     994       13002 :     return new_msgenc;
     995             : }
     996             : #endif
     997             : 
     998             : /*
     999             :  * The database encoding, also called the server encoding, represents the
    1000             :  * encoding of data stored in text-like data types.  Affected types include
    1001             :  * cstring, text, varchar, name, xml, and json.
    1002             :  */
    1003             : int
    1004    13033466 : GetDatabaseEncoding(void)
    1005             : {
    1006    13033466 :     return DatabaseEncoding->encoding;
    1007             : }
    1008             : 
    1009             : const char *
    1010       17246 : GetDatabaseEncodingName(void)
    1011             : {
    1012       17246 :     return DatabaseEncoding->name;
    1013             : }
    1014             : 
    1015             : Datum
    1016           2 : getdatabaseencoding(PG_FUNCTION_ARGS)
    1017             : {
    1018           2 :     return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
    1019             : }
    1020             : 
    1021             : Datum
    1022           0 : pg_client_encoding(PG_FUNCTION_ARGS)
    1023             : {
    1024           0 :     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
    1025             : }
    1026             : 
    1027             : /*
    1028             :  * gettext() returns messages in this encoding.  This often matches the
    1029             :  * database encoding, but it differs for SQL_ASCII databases, for processes
    1030             :  * not attached to a database, and under a database encoding lacking iconv
    1031             :  * support (MULE_INTERNAL).
    1032             :  */
    1033             : int
    1034           0 : GetMessageEncoding(void)
    1035             : {
    1036           0 :     return MessageEncoding->encoding;
    1037             : }
    1038             : 
    1039             : #ifdef WIN32
    1040             : /*
    1041             :  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
    1042             :  * string. The character length is also passed to utf16len if not
    1043             :  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
    1044             :  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
    1045             :  */
    1046             : WCHAR *
    1047             : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
    1048             : {
    1049             :     int         msgenc = GetMessageEncoding();
    1050             :     WCHAR      *utf16;
    1051             :     int         dstlen;
    1052             :     UINT        codepage;
    1053             : 
    1054             :     if (msgenc == PG_SQL_ASCII)
    1055             :         /* No conversion is possible, and SQL_ASCII is never utf16. */
    1056             :         return NULL;
    1057             : 
    1058             :     codepage = pg_enc2name_tbl[msgenc].codepage;
    1059             : 
    1060             :     /*
    1061             :      * Use MultiByteToWideChar directly if there is a corresponding codepage,
    1062             :      * or double conversion through UTF8 if not.  Double conversion is needed,
    1063             :      * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
    1064             :      */
    1065             :     if (codepage != 0)
    1066             :     {
    1067             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1068             :         dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
    1069             :         utf16[dstlen] = (WCHAR) 0;
    1070             :     }
    1071             :     else
    1072             :     {
    1073             :         char       *utf8;
    1074             : 
    1075             :         /*
    1076             :          * XXX pg_do_encoding_conversion() requires a transaction.  In the
    1077             :          * absence of one, hope for the input to be valid UTF8.
    1078             :          */
    1079             :         if (IsTransactionState())
    1080             :         {
    1081             :             utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
    1082             :                                                       len,
    1083             :                                                       msgenc,
    1084             :                                                       PG_UTF8);
    1085             :             if (utf8 != str)
    1086             :                 len = strlen(utf8);
    1087             :         }
    1088             :         else
    1089             :             utf8 = (char *) str;
    1090             : 
    1091             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1092             :         dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
    1093             :         utf16[dstlen] = (WCHAR) 0;
    1094             : 
    1095             :         if (utf8 != str)
    1096             :             pfree(utf8);
    1097             :     }
    1098             : 
    1099             :     if (dstlen == 0 && len > 0)
    1100             :     {
    1101             :         pfree(utf16);
    1102             :         return NULL;            /* error */
    1103             :     }
    1104             : 
    1105             :     if (utf16len)
    1106             :         *utf16len = dstlen;
    1107             :     return utf16;
    1108             : }
    1109             : 
    1110             : #endif

Generated by: LCOV version 1.13