LCOV - code coverage report
Current view: top level - src/backend/utils/mb - mbutils.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13beta1 Lines: 321 502 63.9 %
Date: 2020-05-29 00:07:09 Functions: 42 53 79.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * mbutils.c
       4             :  *    This file contains functions for encoding conversion.
       5             :  *
       6             :  * The string-conversion functions in this file share some API quirks.
       7             :  * Note the following:
       8             :  *
       9             :  * The functions return a palloc'd, null-terminated string if conversion
      10             :  * is required.  However, if no conversion is performed, the given source
      11             :  * string pointer is returned as-is.
      12             :  *
      13             :  * Although the presence of a length argument means that callers can pass
      14             :  * non-null-terminated strings, care is required because the same string
      15             :  * will be passed back if no conversion occurs.  Such callers *must* check
      16             :  * whether result == src and handle that case differently.
      17             :  *
      18             :  * If the source and destination encodings are the same, the source string
      19             :  * is returned without any verification; it's assumed to be valid data.
      20             :  * If that might not be the case, the caller is responsible for validating
      21             :  * the string using a separate call to pg_verify_mbstr().  Whenever the
      22             :  * source and destination encodings are different, the functions ensure that
      23             :  * the result is validly encoded according to the destination encoding.
      24             :  *
      25             :  *
      26             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
      27             :  * Portions Copyright (c) 1994, Regents of the University of California
      28             :  *
      29             :  *
      30             :  * IDENTIFICATION
      31             :  *    src/backend/utils/mb/mbutils.c
      32             :  *
      33             :  *-------------------------------------------------------------------------
      34             :  */
      35             : #include "postgres.h"
      36             : 
      37             : #include "access/xact.h"
      38             : #include "catalog/namespace.h"
      39             : #include "mb/pg_wchar.h"
      40             : #include "utils/builtins.h"
      41             : #include "utils/memutils.h"
      42             : #include "utils/syscache.h"
      43             : 
      44             : /*
      45             :  * We maintain a simple linked list caching the fmgr lookup info for the
      46             :  * currently selected conversion functions, as well as any that have been
      47             :  * selected previously in the current session.  (We remember previous
      48             :  * settings because we must be able to restore a previous setting during
      49             :  * transaction rollback, without doing any fresh catalog accesses.)
      50             :  *
      51             :  * Since we'll never release this data, we just keep it in TopMemoryContext.
      52             :  */
      53             : typedef struct ConvProcInfo
      54             : {
      55             :     int         s_encoding;     /* server (database) encoding ID */
      56             :     int         c_encoding;     /* client encoding ID */
      57             :     FmgrInfo    to_server_info; /* fmgr info: client-to-server conversion */
      58             :     FmgrInfo    to_client_info; /* fmgr info: server-to-client conversion */
      59             : } ConvProcInfo;
      60             : 
      61             : static List *ConvProcList = NIL;    /* List of ConvProcInfo, newest first */
      62             : 
      63             : /*
      64             :  * These variables point to the currently active conversion functions,
      65             :  * or are NULL when no conversion is needed.  SetClientEncoding() sets them.
      66             :  */
      67             : static FmgrInfo *ToServerConvProc = NULL;
      68             : static FmgrInfo *ToClientConvProc = NULL;
      69             : 
      70             : /*
      71             :  * This variable stores the conversion function to convert from UTF-8
      72             :  * to the server encoding.  It's NULL if the server encoding is UTF-8 or
      73             :  * SQL_ASCII, or if we lack a conversion function for this.
      74             :  */
      75             : static FmgrInfo *Utf8ToServerConvProc = NULL;
      76             : 
      77             : /*
      78             :  * These variables track the currently-selected encodings (all start as SQL_ASCII).
      79             :  */
      80             : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      81             : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      82             : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      83             : 
      84             : /*
      85             :  * During backend startup we can't set client encoding because we (a)
      86             :  * can't look up the conversion functions, and (b) may not know the database
      87             :  * encoding yet either.  So SetClientEncoding() just accepts anything and
      88             :  * remembers it for InitializeClientEncoding() to apply later.
      89             :  */
      90             : static bool backend_startup_complete = false;   /* set by InitializeClientEncoding() */
      91             : static int  pending_client_encoding = PG_SQL_ASCII; /* saved by SetClientEncoding() */
      92             : 
      93             : 
      94             : /* Internal functions */
      95             : static char *perform_default_encoding_conversion(const char *src,
      96             :                                                  int len, bool is_client_to_server);
      97             : static int  cliplen(const char *str, int len, int limit);
      98             : 
      99             : 
     100             : /*
     101             :  * Prepare for a future call to SetClientEncoding.  Success should mean
     102             :  * that SetClientEncoding is guaranteed to succeed for this encoding request.
     103             :  *
     104             :  * (But note that success before backend_startup_complete does not guarantee
     105             :  * success after ...)
     106             :  *
     107             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     108             :  */
     109             : int
     110       26678 : PrepareClientEncoding(int encoding)
     111             : {
     112             :     int         current_server_encoding;
     113             :     ListCell   *lc;
     114             : 
     115       26678 :     if (!PG_VALID_FE_ENCODING(encoding))
     116           0 :         return -1;              /* bad encoding ID */
     117             : 
     118             :     /* Can't do anything during startup, per notes above */
     119       26678 :     if (!backend_startup_complete)
     120       13042 :         return 0;               /* real check happens after startup */
     121             : 
     122       13636 :     current_server_encoding = GetDatabaseEncoding();
     123             : 
     124             :     /*
     125             :      * Check for cases that require no conversion function.
     126             :      */
     127       13636 :     if (current_server_encoding == encoding ||
     128        3186 :         current_server_encoding == PG_SQL_ASCII ||
     129             :         encoding == PG_SQL_ASCII)
     130       13630 :         return 0;               /* no conversion procs needed */
     131             : 
     132           6 :     if (IsTransactionState())
     133             :     {
     134             :         /*
     135             :          * If we're in a live transaction, it's safe to access the catalogs,
     136             :          * so look up the functions.  We repeat the lookup even if the info is
     137             :          * already cached, so that we can react to changes in the contents of
     138             :          * pg_conversion.
     139             :          */
     140             :         Oid         to_server_proc,
     141             :                     to_client_proc;
     142             :         ConvProcInfo *convinfo;
     143             :         MemoryContext oldcontext;
     144             : 
     145           6 :         to_server_proc = FindDefaultConversionProc(encoding,
     146             :                                                    current_server_encoding);
     147           6 :         if (!OidIsValid(to_server_proc))
     148           0 :             return -1;          /* no client-to-server conversion */
     149           6 :         to_client_proc = FindDefaultConversionProc(current_server_encoding,
     150             :                                                    encoding);
     151           6 :         if (!OidIsValid(to_client_proc))
     152           0 :             return -1;          /* no server-to-client conversion */
     153             : 
     154             :         /*
     155             :          * Load the fmgr info into TopMemoryContext (could still fail here)
     156             :          */
     157           6 :         convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
     158             :                                                        sizeof(ConvProcInfo));
     159           6 :         convinfo->s_encoding = current_server_encoding;
     160           6 :         convinfo->c_encoding = encoding;
     161           6 :         fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
     162             :                       TopMemoryContext);
     163           6 :         fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
     164             :                       TopMemoryContext);
     165             : 
     166             :         /* Attach new info to head of list */
     167           6 :         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
     168           6 :         ConvProcList = lcons(convinfo, ConvProcList);
     169           6 :         MemoryContextSwitchTo(oldcontext);
     170             : 
     171             :         /*
     172             :          * We cannot yet remove any older entry for the same encoding pair,
     173             :          * since it could still be in use.  SetClientEncoding will clean up.
     174             :          */
     175             : 
     176           6 :         return 0;               /* success */
     177             :     }
     178             :     else
     179             :     {
     180             :         /*
     181             :          * If we're not in a live transaction, the only thing we can do is
     182             :          * restore a previous setting using the cache.  This covers all
     183             :          * transaction-rollback cases.  The only case it might not work for is
     184             :          * trying to change client_encoding on the fly by editing
     185             :          * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
     186             :          * thing to do anyway.
     187             :          */
     188           0 :         foreach(lc, ConvProcList)
     189             :         {
     190           0 :             ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
     191             : 
     192           0 :             if (oldinfo->s_encoding == current_server_encoding &&
     193           0 :                 oldinfo->c_encoding == encoding)
     194           0 :                 return 0;       /* usable cached entry exists */
     195             :         }
     196             : 
     197           0 :         return -1;              /* it's not cached, so fail */
     198             :     }
     199             : }
     200             : 
     201             : /*
     202             :  * Set the active client encoding and set up the conversion-function pointers.
     203             :  * PrepareClientEncoding should have been called previously for this encoding.
     204             :  *
     205             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     206             :  */
     207             : int
     208       22804 : SetClientEncoding(int encoding)
     209             : {
     210             :     int         current_server_encoding;
     211             :     bool        found;
     212             :     ListCell   *lc;
     213             : 
     214       22804 :     if (!PG_VALID_FE_ENCODING(encoding))
     215           0 :         return -1;              /* bad encoding ID */
     216             : 
     217             :     /* Can't do anything during startup, per notes above */
     218       22804 :     if (!backend_startup_complete)
     219             :     {
     220       10738 :         pending_client_encoding = encoding; /* applied by InitializeClientEncoding() */
     221       10738 :         return 0;
     222             :     }
     223             : 
     224       12066 :     current_server_encoding = GetDatabaseEncoding();
     225             : 
     226             :     /*
     227             :      * Check for cases that require no conversion function.
     228             :      */
     229       12066 :     if (current_server_encoding == encoding ||
     230        1616 :         current_server_encoding == PG_SQL_ASCII ||
     231             :         encoding == PG_SQL_ASCII)
     232             :     {
     233       12060 :         ClientEncoding = &pg_enc2name_tbl[encoding];
     234       12060 :         ToServerConvProc = NULL;    /* NULL means no conversion */
     235       12060 :         ToClientConvProc = NULL;
     236       12060 :         return 0;
     237             :     }
     238             : 
     239             :     /*
     240             :      * Search the cache for the entry previously prepared by
     241             :      * PrepareClientEncoding; if there isn't one, we lose.  While at it,
     242             :      * release any duplicate entries so that repeated Prepare/Set cycles don't
     243             :      * leak memory.
     244             :      */
     245           6 :     found = false;
     246          12 :     foreach(lc, ConvProcList)
     247             :     {
     248           6 :         ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
     249             : 
     250           6 :         if (convinfo->s_encoding == current_server_encoding &&
     251           6 :             convinfo->c_encoding == encoding)
     252             :         {
     253           6 :             if (!found)
     254             :             {
     255             :                 /* Found newest entry, so set up */
     256           6 :                 ClientEncoding = &pg_enc2name_tbl[encoding];
     257           6 :                 ToServerConvProc = &convinfo->to_server_info;
     258           6 :                 ToClientConvProc = &convinfo->to_client_info;
     259           6 :                 found = true;
     260             :             }
     261             :             else
     262             :             {
     263             :                 /* Duplicate entry, release it */
     264           0 :                 ConvProcList = foreach_delete_current(ConvProcList, lc);
     265           0 :                 pfree(convinfo);    /* active pointers now use the newer entry */
     266             :             }
     267             :         }
     268             :     }
     269             : 
     270           6 :     if (found)
     271           6 :         return 0;               /* success */
     272             :     else
     273           0 :         return -1;              /* it's not cached, so fail */
     274             : }
     275             : 
     276             : /*
     277             :  * Initialize client encoding conversions.
     278             :  *      Called from InitPostgres() once during backend startup.
     279             :  */
     280             : void
     281       10334 : InitializeClientEncoding(void)
     282             : {
     283             :     int         current_server_encoding;
     284             : 
     285             :     Assert(!backend_startup_complete);
     286       10334 :     backend_startup_complete = true;    /* from here on, Prepare/Set do real work */
     287             : 
     288       20668 :     if (PrepareClientEncoding(pending_client_encoding) < 0 ||
     289       10334 :         SetClientEncoding(pending_client_encoding) < 0)
     290             :     {
     291             :         /*
     292             :          * Oops, the requested conversion is not available. We couldn't fail
     293             :          * before, but we can now.
     294             :          */
     295           0 :         ereport(FATAL,
     296             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     297             :                  errmsg("conversion between %s and %s is not supported",
     298             :                         pg_enc2name_tbl[pending_client_encoding].name,
     299             :                         GetDatabaseEncodingName())));
     300             :     }
     301             : 
     302             :     /*
     303             :      * Also look up the UTF8-to-server conversion function if needed.  Since
     304             :      * the server encoding is fixed within any one backend process, we don't
     305             :      * have to do this more than once.
     306             :      */
     307       10334 :     current_server_encoding = GetDatabaseEncoding();
     308       10334 :     if (current_server_encoding != PG_UTF8 &&
     309             :         current_server_encoding != PG_SQL_ASCII)
     310             :     {
     311             :         Oid         utf8_to_server_proc;
     312             : 
     313             :         Assert(IsTransactionState());
     314             :         utf8_to_server_proc =
     315         194 :             FindDefaultConversionProc(PG_UTF8,
     316             :                                       current_server_encoding);
     317             :         /* If there's no such conversion, just leave the pointer as NULL */
     318         194 :         if (OidIsValid(utf8_to_server_proc))
     319             :         {
     320             :             FmgrInfo   *finfo;
     321             : 
     322         194 :             finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
     323             :                                                     sizeof(FmgrInfo));
     324         194 :             fmgr_info_cxt(utf8_to_server_proc, finfo,
     325             :                           TopMemoryContext);
     326             :             /* Set Utf8ToServerConvProc only after data is fully valid */
     327         194 :             Utf8ToServerConvProc = finfo;
     328             :         }
     329             :     }
     330       10334 : }
     331             : 
     332             : /*
     333             :  * Returns the numeric ID of the current client encoding (ClientEncoding->encoding).
     334             :  */
     335             : int
     336        4100 : pg_get_client_encoding(void)
     337             : {
     338        4100 :     return ClientEncoding->encoding;
     339             : }
     340             : 
     341             : /*
     342             :  * Returns the name of the current client encoding (ClientEncoding->name, not a copy).
     343             :  */
     344             : const char *
     345           0 : pg_get_client_encoding_name(void)
     346             : {
     347           0 :     return ClientEncoding->name;
     348             : }
     349             : 
     350             : /*
     351             :  * Convert len bytes of src from src_encoding to dest_encoding (general case).
     352             :  *
     353             :  * See the notes about string conversion functions at the top of this file.
     354             :  */
     355             : unsigned char *
     356        1158 : pg_do_encoding_conversion(unsigned char *src, int len,
     357             :                           int src_encoding, int dest_encoding)
     358             : {
     359             :     unsigned char *result;
     360             :     Oid         proc;
     361             : 
     362        1158 :     if (len <= 0)
     363           4 :         return src;             /* empty string is always valid */
     364             : 
     365        1154 :     if (src_encoding == dest_encoding)
     366         626 :         return src;             /* no conversion required, assume valid */
     367             : 
     368         528 :     if (dest_encoding == PG_SQL_ASCII)
     369           0 :         return src;             /* any string is valid in SQL_ASCII */
     370             : 
     371         528 :     if (src_encoding == PG_SQL_ASCII)
     372             :     {
     373             :         /* No conversion is possible, but we must validate the result */
     374          16 :         (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
     375          16 :         return src;
     376             :     }
     377             : 
     378         512 :     if (!IsTransactionState())  /* shouldn't happen */
     379           0 :         elog(ERROR, "cannot perform encoding conversion outside a transaction");
     380             : 
     381         512 :     proc = FindDefaultConversionProc(src_encoding, dest_encoding);
     382         512 :     if (!OidIsValid(proc))
     383           0 :         ereport(ERROR,
     384             :                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
     385             :                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
     386             :                         pg_encoding_to_char(src_encoding),
     387             :                         pg_encoding_to_char(dest_encoding))));
     388             : 
     389             :     /*
     390             :      * Allocate space for conversion result, being wary of integer overflow.
     391             :      *
     392             :      * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
     393             :      * required space, so it might exceed MaxAllocSize even though the result
     394             :      * would actually fit.  We do not want to hand back a result string that
     395             :      * exceeds MaxAllocSize, because callers might not cope gracefully --- but
     396             :      * if we just allocate more than that, and don't use it, that's fine.
     397             :      */
     398         512 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     399           0 :         ereport(ERROR,
     400             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     401             :                  errmsg("out of memory"),
     402             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     403             :                            len)));
     404             : 
     405             :     result = (unsigned char *)
     406         512 :         MemoryContextAllocHuge(CurrentMemoryContext,
     407         512 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);   /* +1 presumably for the trailing NUL */
     408             : 
     409         512 :     OidFunctionCall5(proc,
     410             :                      Int32GetDatum(src_encoding),
     411             :                      Int32GetDatum(dest_encoding),
     412             :                      CStringGetDatum(src),
     413             :                      CStringGetDatum(result),
     414             :                      Int32GetDatum(len));
     415             : 
     416             :     /*
     417             :      * If the result is large, it's worth repalloc'ing to release any extra
     418             :      * space we asked for.  The cutoff here is somewhat arbitrary, but we
     419             :      * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
     420             :      */
     421         512 :     if (len > 1000000)
     422             :     {
     423           0 :         Size        resultlen = strlen((char *) result);   /* assumes the conversion proc NUL-terminated result */
     424             : 
     425           0 :         if (resultlen >= MaxAllocSize)
     426           0 :             ereport(ERROR,
     427             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     428             :                      errmsg("out of memory"),
     429             :                      errdetail("String of %d bytes is too long for encoding conversion.",
     430             :                                len)));
     431             : 
     432           0 :         result = (unsigned char *) repalloc(result, resultlen + 1);
     433             :     }
     434             : 
     435         512 :     return result;              /* allocated in CurrentMemoryContext */
     436             : }
     437             : 
     438             : /*
     439             :  * Convert string to encoding encoding_name. The source
     440             :  * encoding is the DB encoding.
     441             :  *
     442             :  * BYTEA convert_to(TEXT string, NAME encoding_name) */
     443             : Datum
     444           0 : pg_convert_to(PG_FUNCTION_ARGS)
     445             : {
     446           0 :     Datum       string = PG_GETARG_DATUM(0);    /* text input, passed on as-is */
     447           0 :     Datum       dest_encoding_name = PG_GETARG_DATUM(1);    /* target encoding */
     448           0 :     Datum       src_encoding_name = DirectFunctionCall1(namein,
     449             :                                                         CStringGetDatum(DatabaseEncoding->name));
     450             :     Datum       result;
     451             : 
     452             :     /*
     453             :      * pg_convert expects a bytea as its first argument. We're passing it a
     454             :      * text argument here, relying on the fact that they are both in fact
     455             :      * varlena types, and thus structurally identical.
     456             :      */
     457           0 :     result = DirectFunctionCall3(pg_convert, string,
     458             :                                  src_encoding_name, dest_encoding_name);
     459             : 
     460           0 :     PG_RETURN_DATUM(result);    /* bytea produced by pg_convert */
     461             : }
     462             : 
     463             : /*
     464             :  * Convert string from encoding encoding_name. The destination
     465             :  * encoding is the DB encoding.
     466             :  *
     467             :  * TEXT convert_from(BYTEA string, NAME encoding_name) */
     468             : Datum
     469          20 : pg_convert_from(PG_FUNCTION_ARGS)
     470             : {
     471          20 :     Datum       string = PG_GETARG_DATUM(0);    /* bytea input, passed on as-is */
     472          20 :     Datum       src_encoding_name = PG_GETARG_DATUM(1); /* source encoding */
     473          20 :     Datum       dest_encoding_name = DirectFunctionCall1(namein,
     474             :                                                          CStringGetDatum(DatabaseEncoding->name));
     475             :     Datum       result;
     476             : 
     477          20 :     result = DirectFunctionCall3(pg_convert, string,
     478             :                                  src_encoding_name, dest_encoding_name);
     479             : 
     480             :     /*
     481             :      * pg_convert returns a bytea, which we in turn return as text, relying on
     482             :      * the fact that they are both in fact varlena types, and thus
     483             :      * structurally identical. Although not all bytea values are valid text,
     484             :      * in this case it will be because we've told pg_convert to return one
     485             :      * that is valid as text in the current database encoding.
     486             :      */
     487          20 :     PG_RETURN_DATUM(result);
     488             : }
     489             : 
     490             : /*
     491             :  * Convert string between two arbitrary encodings.
     492             :  *
     493             :  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
     494             :  */
     495             : Datum
     496         532 : pg_convert(PG_FUNCTION_ARGS)
     497             : {
     498         532 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     499         532 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     500         532 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     501         532 :     char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
     502         532 :     int         dest_encoding = pg_char_to_encoding(dest_encoding_name);
     503             :     const char *src_str;
     504             :     char       *dest_str;
     505             :     bytea      *retval;
     506             :     int         len;
     507             : 
     508         532 :     if (src_encoding < 0)
     509           0 :         ereport(ERROR,
     510             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     511             :                  errmsg("invalid source encoding name \"%s\"",
     512             :                         src_encoding_name)));
     513         532 :     if (dest_encoding < 0)
     514           0 :         ereport(ERROR,
     515             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     516             :                  errmsg("invalid destination encoding name \"%s\"",
     517             :                         dest_encoding_name)));
     518             : 
     519             :     /* make sure that source string is valid */
     520         532 :     len = VARSIZE_ANY_EXHDR(string);
     521         532 :     src_str = VARDATA_ANY(string);
     522         532 :     pg_verify_mbstr_len(src_encoding, src_str, len, false);    /* return value unused; validation only */
     523             : 
     524             :     /* perform conversion */
     525         532 :     dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
     526             :                                                   len,
     527             :                                                   src_encoding,
     528             :                                                   dest_encoding);
     529             : 
     530             :     /* update len if conversion actually happened */
     531         532 :     if (dest_str != src_str)
     532         512 :         len = strlen(dest_str); /* converted result is NUL-terminated */
     533             : 
     534             :     /*
     535             :      * build bytea data type structure.
     536             :      */
     537         532 :     retval = (bytea *) palloc(len + VARHDRSZ);
     538         532 :     SET_VARSIZE(retval, len + VARHDRSZ);
     539         532 :     memcpy(VARDATA(retval), dest_str, len); /* payload bytes only, no NUL */
     540             : 
     541         532 :     if (dest_str != src_str)
     542         512 :         pfree(dest_str);        /* buffer came from pg_do_encoding_conversion */
     543             : 
     544             :     /* free memory if allocated by the toaster */
     545         532 :     PG_FREE_IF_COPY(string, 0);
     546             : 
     547         532 :     PG_RETURN_BYTEA_P(retval);
     548             : }
     549             : 
     550             : /*
     551             :  * Get the length of the string considered as text in the specified
     552             :  * encoding.  Raises an error if the encoding name is not recognized
     553             :  * or if the data is not valid in that encoding.
     554             :  *
     555             :  * INT4 length (BYTEA string, NAME src_encoding_name)
     556             :  */
     557             : Datum
     558           0 : length_in_encoding(PG_FUNCTION_ARGS)
     559             : {
     560           0 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     561           0 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     562           0 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     563             :     const char *src_str;
     564             :     int         len;
     565             :     int         retval;
     566             : 
     567           0 :     if (src_encoding < 0)
     568           0 :         ereport(ERROR,
     569             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     570             :                  errmsg("invalid encoding name \"%s\"",
     571             :                         src_encoding_name)));
     572             : 
     573           0 :     len = VARSIZE_ANY_EXHDR(string);
     574           0 :     src_str = VARDATA_ANY(string);
     575             :     /* one call both checks the data and yields the value we return */
     576           0 :     retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
     577             : 
     578           0 :     PG_RETURN_INT32(retval);
     579             : }
     580             : 
     581             : /*
     582             :  * Get maximum multibyte character length in the specified encoding.
     583             :  * Returns SQL NULL (not an error) if the encoding ID is not valid.
     584             :  * Note encoding is specified numerically, not by name as above.
     585             :  */
     586             : Datum
     587           0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
     588             : {
     589           0 :     int         encoding = PG_GETARG_INT32(0);
     590             : 
     591           0 :     if (PG_VALID_ENCODING(encoding))
     592           0 :         PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
     593             :     else
     594           0 :         PG_RETURN_NULL();
     595             : }
     596             : 
     597             : /*
     598             :  * Convert a string from the current client encoding to the server encoding.
     599             :  * This is pg_any_to_server() with ClientEncoding->encoding as the source.
     600             :  * See the notes about string conversion functions at the top of this file.
     601             :  */
     602             : char *
     603      604198 : pg_client_to_server(const char *s, int len)
     604             : {
     605      604198 :     return pg_any_to_server(s, len, ClientEncoding->encoding);
     606             : }
     607             : 
     608             : /*
     609             :  * Convert a string in the given encoding to the server encoding.
     610             :  *
     611             :  * See the notes about string conversion functions at the top of this file.
     612             :  *
     613             :  * Unlike the other string conversion functions, this will apply validation
     614             :  * even if encoding == DatabaseEncoding->encoding.  This is because this is
     615             :  * used to process data coming in from outside the database, and we never
     616             :  * want to just assume validity.
     617             :  */
     618             : char *
     619     1775464 : pg_any_to_server(const char *s, int len, int encoding)
     620             : {
     621     1775464 :     if (len <= 0)
     622       92098 :         return unconstify(char *, s);   /* empty string is always valid */
     623             : 
     624     1683366 :     if (encoding == DatabaseEncoding->encoding ||
     625             :         encoding == PG_SQL_ASCII)
     626             :     {
     627             :         /*
     628             :          * No conversion is needed, but we must still validate the data.
     629             :          * Note that validation uses the database encoding in both cases.
     630     1683344 :          */
     631     1683342 :         (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
     632             :         return unconstify(char *, s);
     633             :     }
     634             : 
     674             : 
     675             : /*
     676             :  * Convert server encoding to client encoding.
     677             :  *
     678             :  * See the notes about string conversion functions at the top of this file.
     679             :  */
     680             : char *
     681     9402970 : pg_server_to_client(const char *s, int len)
     682             : {
     683     9402970 :     return pg_server_to_any(s, len, ClientEncoding->encoding);
     684             : }
     685             : 
     686             : /*
     687             :  * Convert server encoding to any encoding.
     688             :  *
     689             :  * See the notes about string conversion functions at the top of this file.
     690             :  */
     691             : char *
     692    13985438 : pg_server_to_any(const char *s, int len, int encoding)
     693             : {
     694    13985438 :     if (len <= 0)
     695      112054 :         return unconstify(char *, s);   /* empty string is always valid */
     696             : 
     697    13873384 :     if (encoding == DatabaseEncoding->encoding ||
     698             :         encoding == PG_SQL_ASCII)
     699    13872978 :         return unconstify(char *, s);   /* assume data is valid */
     700             : 
     701         406 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     702             :     {
     703             :         /* No conversion is possible, but we must validate the result */
     704         176 :         (void) pg_verify_mbstr(encoding, s, len, false);
     705         176 :         return unconstify(char *, s);
     706             :     }
     707             : 
     708             :     /* Fast path if we can use cached conversion function */
     709         230 :     if (encoding == ClientEncoding->encoding)
     710         230 :         return perform_default_encoding_conversion(s, len, false);
     711             : 
     712             :     /* General case ... will not work outside transactions */
     713           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     714             :                                               len,
     715           0 :                                               DatabaseEncoding->encoding,
     716             :                                               encoding);
     717             : }
     718             : 
     719             : /*
     720             :  *  Perform default encoding conversion using cached FmgrInfo. Since
     721             :  *  this function does not access database at all, it is safe to call
     722             :  *  outside transactions.  If the conversion has not been set up by
     723             :  *  SetClientEncoding(), no conversion is performed.
     724             :  */
     725             : static char *
     726         250 : perform_default_encoding_conversion(const char *src, int len,
     727             :                                     bool is_client_to_server)
     728             : {
     729             :     char       *result;
     730             :     int         src_encoding,
     731             :                 dest_encoding;
     732             :     FmgrInfo   *flinfo;
     733             : 
     734         250 :     if (is_client_to_server)
     735             :     {
     736          20 :         src_encoding = ClientEncoding->encoding;
     737          20 :         dest_encoding = DatabaseEncoding->encoding;
     738          20 :         flinfo = ToServerConvProc;
     739             :     }
     740             :     else
     741             :     {
     742         230 :         src_encoding = DatabaseEncoding->encoding;
     743         230 :         dest_encoding = ClientEncoding->encoding;
     744         230 :         flinfo = ToClientConvProc;
     745             :     }
     746             : 
     747         250 :     if (flinfo == NULL)
     748           0 :         return unconstify(char *, src);
     749             : 
     750             :     /*
     751             :      * Allocate space for conversion result, being wary of integer overflow.
     752             :      * See comments in pg_do_encoding_conversion.
     753             :      */
     754         250 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     755           0 :         ereport(ERROR,
     756             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     757             :                  errmsg("out of memory"),
     758             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     759             :                            len)));
     760             : 
     761             :     result = (char *)
     762         250 :         MemoryContextAllocHuge(CurrentMemoryContext,
     763         250 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);
     764             : 
     765         250 :     FunctionCall5(flinfo,
     766             :                   Int32GetDatum(src_encoding),
     767             :                   Int32GetDatum(dest_encoding),
     768             :                   CStringGetDatum(src),
     769             :                   CStringGetDatum(result),
     770             :                   Int32GetDatum(len));
     771             : 
     772             :     /*
     773             :      * Release extra space if there might be a lot --- see comments in
     774             :      * pg_do_encoding_conversion.
     775             :      */
     776         250 :     if (len > 1000000)
     777             :     {
     778           0 :         Size        resultlen = strlen(result);
     779             : 
     780           0 :         if (resultlen >= MaxAllocSize)
     781           0 :             ereport(ERROR,
     782             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     783             :                      errmsg("out of memory"),
     784             :                      errdetail("String of %d bytes is too long for encoding conversion.",
     785             :                                len)));
     786             : 
     787           0 :         result = (char *) repalloc(result, resultlen + 1);
     788             :     }
     789             : 
     790         250 :     return result;
     791             : }
     792             : 
     793             : /*
     794             :  * Convert a single Unicode code point into a string in the server encoding.
     795             :  *
     796             :  * The code point given by "c" is converted and stored at *s, which must
     797             :  * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
     798             :  * The output will have a trailing '\0'.  Throws error if the conversion
     799             :  * cannot be performed.
     800             :  *
     801             :  * Note that this relies on having previously looked up any required
     802             :  * conversion function.  That's partly for speed but mostly because the parser
     803             :  * may call this outside any transaction, or in an aborted transaction.
     804             :  */
     805             : void
     806         350 : pg_unicode_to_server(pg_wchar c, unsigned char *s)
     807             : {
     808             :     unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
     809             :     int         c_as_utf8_len;
     810             :     int         server_encoding;
     811             : 
     812             :     /*
     813             :      * Complain if invalid Unicode code point.  The choice of errcode here is
     814             :      * debatable, but really our caller should have checked this anyway.
     815             :      */
     816         350 :     if (!is_valid_unicode_codepoint(c))
     817           0 :         ereport(ERROR,
     818             :                 (errcode(ERRCODE_SYNTAX_ERROR),
     819             :                  errmsg("invalid Unicode code point")));
     820             : 
     821             :     /* Otherwise, if it's in ASCII range, conversion is trivial */
     822         350 :     if (c <= 0x7F)
     823             :     {
     824         156 :         s[0] = (unsigned char) c;
     825         156 :         s[1] = '\0';
     826         350 :         return;
     827             :     }
     828             : 
     829             :     /* If the server encoding is UTF-8, we just need to reformat the code */
     830         194 :     server_encoding = GetDatabaseEncoding();
     831         194 :     if (server_encoding == PG_UTF8)
     832             :     {
     833         194 :         unicode_to_utf8(c, s);
     834         194 :         s[pg_utf_mblen(s)] = '\0';
     835         194 :         return;
     836             :     }
     837             : 
     838             :     /* For all other cases, we must have a conversion function available */
     839           0 :     if (Utf8ToServerConvProc == NULL)
     840           0 :         ereport(ERROR,
     841             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     842             :                  errmsg("conversion between %s and %s is not supported",
     843             :                         pg_enc2name_tbl[PG_UTF8].name,
     844             :                         GetDatabaseEncodingName())));
     845             : 
     846             :     /* Construct UTF-8 source string */
     847           0 :     unicode_to_utf8(c, c_as_utf8);
     848           0 :     c_as_utf8_len = pg_utf_mblen(c_as_utf8);
     849           0 :     c_as_utf8[c_as_utf8_len] = '\0';
     850             : 
     851             :     /* Convert, or throw error if we can't */
     852           0 :     FunctionCall5(Utf8ToServerConvProc,
     853             :                   Int32GetDatum(PG_UTF8),
     854             :                   Int32GetDatum(server_encoding),
     855             :                   CStringGetDatum(c_as_utf8),
     856             :                   CStringGetDatum(s),
     857             :                   Int32GetDatum(c_as_utf8_len));
     858             : }
     859             : 
     860             : 
     861             : /* convert a multibyte string to a wchar */
     862             : int
     863           0 : pg_mb2wchar(const char *from, pg_wchar *to)
     864             : {
     865           0 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
     866             : }
     867             : 
     868             : /* convert a multibyte string to a wchar with a limited length */
     869             : int
     870      620808 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
     871             : {
     872      620808 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     873             : }
     874             : 
     875             : /* same, with any encoding */
     876             : int
     877       12094 : pg_encoding_mb2wchar_with_len(int encoding,
     878             :                               const char *from, pg_wchar *to, int len)
     879             : {
     880       12094 :     return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     881             : }
     882             : 
     883             : /* convert a wchar string to a multibyte */
     884             : int
     885           0 : pg_wchar2mb(const pg_wchar *from, char *to)
     886             : {
     887           0 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
     888             : }
     889             : 
     890             : /* convert a wchar string to a multibyte with a limited length */
     891             : int
     892     1009756 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
     893             : {
     894     1009756 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
     895             : }
     896             : 
     897             : /* same, with any encoding */
     898             : int
     899           0 : pg_encoding_wchar2mb_with_len(int encoding,
     900             :                               const pg_wchar *from, char *to, int len)
     901             : {
     902           0 :     return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
     903             : }
     904             : 
     905             : /* returns the byte length of a multibyte character */
     906             : int
     907   128310446 : pg_mblen(const char *mbstr)
     908             : {
     909   128310446 :     return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
     910             : }
     911             : 
     912             : /* returns the display length of a multibyte character */
     913             : int
     914        4560 : pg_dsplen(const char *mbstr)
     915             : {
     916        4560 :     return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
     917             : }
     918             : 
     919             : /* returns the length (counted in wchars) of a multibyte string */
     920             : int
     921         468 : pg_mbstrlen(const char *mbstr)
     922             : {
     923         468 :     int         len = 0;
     924             : 
     925             :     /* optimization for single byte encoding */
     926         468 :     if (pg_database_encoding_max_length() == 1)
     927           0 :         return strlen(mbstr);
     928             : 
     929        1084 :     while (*mbstr)
     930             :     {
     931         616 :         mbstr += pg_mblen(mbstr);
     932         616 :         len++;
     933             :     }
     934         468 :     return len;
     935             : }
     936             : 
     937             : /* returns the length (counted in wchars) of a multibyte string
     938             :  * (not necessarily NULL terminated)
     939             :  */
     940             : int
     941      729186 : pg_mbstrlen_with_len(const char *mbstr, int limit)
     942             : {
     943      729186 :     int         len = 0;
     944             : 
     945             :     /* optimization for single byte encoding */
     946      729186 :     if (pg_database_encoding_max_length() == 1)
     947         192 :         return limit;
     948             : 
     949   106785514 :     while (limit > 0 && *mbstr)
     950             :     {
     951   106056520 :         int         l = pg_mblen(mbstr);
     952             : 
     953   106056520 :         limit -= l;
     954   106056520 :         mbstr += l;
     955   106056520 :         len++;
     956             :     }
     957      728994 :     return len;
     958             : }
     959             : 
     960             : /*
     961             :  * returns the byte length of a multibyte string
     962             :  * (not necessarily NULL terminated)
     963             :  * that is no longer than limit.
     964             :  * this function does not break multibyte character boundary.
     965             :  */
     966             : int
     967       39768 : pg_mbcliplen(const char *mbstr, int len, int limit)
     968             : {
     969       39768 :     return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
     970             :                                  len, limit);
     971             : }
     972             : 
     973             : /*
     974             :  * pg_mbcliplen with specified encoding
     975             :  */
     976             : int
     977       39768 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
     978             :                       int len, int limit)
     979             : {
     980             :     mblen_converter mblen_fn;
     981       39768 :     int         clen = 0;
     982             :     int         l;
     983             : 
     984             :     /* optimization for single byte encoding */
     985       39768 :     if (pg_encoding_max_length(encoding) == 1)
     986        5106 :         return cliplen(mbstr, len, limit);
     987             : 
     988       34662 :     mblen_fn = pg_wchar_table[encoding].mblen;
     989             : 
     990      442972 :     while (len > 0 && *mbstr)
     991             :     {
     992      423512 :         l = (*mblen_fn) ((const unsigned char *) mbstr);
     993      423512 :         if ((clen + l) > limit)
     994          48 :             break;
     995      423464 :         clen += l;
     996      423464 :         if (clen == limit)
     997       15154 :             break;
     998      408310 :         len -= l;
     999      408310 :         mbstr += l;
    1000             :     }
    1001       34662 :     return clen;
    1002             : }
    1003             : 
    1004             : /*
    1005             :  * Similar to pg_mbcliplen except the limit parameter specifies the
    1006             :  * character length, not the byte length.
    1007             :  */
    1008             : int
    1009         168 : pg_mbcharcliplen(const char *mbstr, int len, int limit)
    1010             : {
    1011         168 :     int         clen = 0;
    1012         168 :     int         nch = 0;
    1013             :     int         l;
    1014             : 
    1015             :     /* optimization for single byte encoding */
    1016         168 :     if (pg_database_encoding_max_length() == 1)
    1017           0 :         return cliplen(mbstr, len, limit);
    1018             : 
    1019         788 :     while (len > 0 && *mbstr)
    1020             :     {
    1021         776 :         l = pg_mblen(mbstr);
    1022         776 :         nch++;
    1023         776 :         if (nch > limit)
    1024         156 :             break;
    1025         620 :         clen += l;
    1026         620 :         len -= l;
    1027         620 :         mbstr += l;
    1028             :     }
    1029         168 :     return clen;
    1030             : }
    1031             : 
    1032             : /* mbcliplen for any single-byte encoding */
    1033             : static int
    1034        5106 : cliplen(const char *str, int len, int limit)
    1035             : {
    1036        5106 :     int         l = 0;
    1037             : 
    1038        5106 :     len = Min(len, limit);
    1039       44130 :     while (l < len && str[l])
    1040       39024 :         l++;
    1041        5106 :     return l;
    1042             : }
    1043             : 
    1044             : void
    1045        9574 : SetDatabaseEncoding(int encoding)
    1046             : {
    1047        9574 :     if (!PG_VALID_BE_ENCODING(encoding))
    1048           0 :         elog(ERROR, "invalid database encoding: %d", encoding);
    1049             : 
    1050        9574 :     DatabaseEncoding = &pg_enc2name_tbl[encoding];
    1051             :     Assert(DatabaseEncoding->encoding == encoding);
    1052        9574 : }
    1053             : 
    1054             : void
    1055       12652 : SetMessageEncoding(int encoding)
    1056             : {
    1057             :     /* Some calls happen before we can elog()! */
    1058             :     Assert(PG_VALID_ENCODING(encoding));
    1059             : 
    1060       12652 :     MessageEncoding = &pg_enc2name_tbl[encoding];
    1061             :     Assert(MessageEncoding->encoding == encoding);
    1062       12652 : }
    1063             : 
    1064             : #ifdef ENABLE_NLS
    1065             : /*
    1066             :  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
    1067             :  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
    1068             :  * fail for gettext-internal causes like out-of-memory.
    1069             :  */
    1070             : static bool
    1071         254 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
    1072             : {
    1073         254 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1074             :     int         i;
    1075             : 
    1076         712 :     for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
    1077             :     {
    1078         712 :         if (pg_enc2gettext_tbl[i].encoding == encoding)
    1079             :         {
    1080         254 :             if (bind_textdomain_codeset(domainname,
    1081             :                                         pg_enc2gettext_tbl[i].name) != NULL)
    1082         254 :                 return true;
    1083             : 
    1084           0 :             if (elog_ok)
    1085           0 :                 elog(LOG, "bind_textdomain_codeset failed");
    1086             :             else
    1087           0 :                 write_stderr("bind_textdomain_codeset failed");
    1088             : 
    1089           0 :             break;
    1090             :         }
    1091             :     }
    1092             : 
    1093           0 :     return false;
    1094             : }
    1095             : 
    1096             : /*
    1097             :  * Bind a gettext message domain to the codeset corresponding to the database
    1098             :  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
    1099             :  * Return the MessageEncoding implied by the new settings.
    1100             :  *
    1101             :  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
    1102             :  * When that matches the database encoding, we don't need to do anything.  In
    1103             :  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
    1104             :  * database encoding, except for the C locale.  (On Windows, we also permit a
    1105             :  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
    1106             :  * gettext to the right codeset.
    1107             :  *
    1108             :  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
    1109             :  * convenient departure for software that passes the strings to Windows ANSI
    1110             :  * APIs, but we don't do that.  Compel gettext to use database encoding or,
    1111             :  * failing that, the LC_CTYPE encoding as it would on other platforms.
    1112             :  *
    1113             :  * This function is called before elog() and palloc() are usable.
    1114             :  */
    1115             : int
    1116       14928 : pg_bind_textdomain_codeset(const char *domainname)
    1117             : {
    1118       14928 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1119       14928 :     int         encoding = GetDatabaseEncoding();
    1120             :     int         new_msgenc;
    1121             : 
    1122             : #ifndef WIN32
    1123       14928 :     const char *ctype = setlocale(LC_CTYPE, NULL);
    1124             : 
    1125       14928 :     if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
    1126             : #endif
    1127         584 :         if (encoding != PG_SQL_ASCII &&
    1128         254 :             raw_pg_bind_textdomain_codeset(domainname, encoding))
    1129         254 :             return encoding;
    1130             : 
    1131       14674 :     new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
    1132       14674 :     if (new_msgenc < 0)
    1133           0 :         new_msgenc = PG_SQL_ASCII;
    1134             : 
    1135             : #ifdef WIN32
    1136             :     if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
    1137             :         /* On failure, the old message encoding remains valid. */
    1138             :         return GetMessageEncoding();
    1139             : #endif
    1140             : 
    1141       14674 :     return new_msgenc;
    1142             : }
    1143             : #endif
    1144             : 
    1145             : /*
    1146             :  * The database encoding, also called the server encoding, represents the
    1147             :  * encoding of data stored in text-like data types.  Affected types include
    1148             :  * cstring, text, varchar, name, xml, and json.
    1149             :  */
    1150             : int
    1151    13875480 : GetDatabaseEncoding(void)
    1152             : {
    1153    13875480 :     return DatabaseEncoding->encoding;
    1154             : }
    1155             : 
    1156             : const char *
    1157       19720 : GetDatabaseEncodingName(void)
    1158             : {
    1159       19720 :     return DatabaseEncoding->name;
    1160             : }
    1161             : 
    1162             : Datum
    1163          30 : getdatabaseencoding(PG_FUNCTION_ARGS)
    1164             : {
    1165          30 :     return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
    1166             : }
    1167             : 
    1168             : Datum
    1169           0 : pg_client_encoding(PG_FUNCTION_ARGS)
    1170             : {
    1171           0 :     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
    1172             : }
    1173             : 
    1174             : Datum
    1175           8 : PG_char_to_encoding(PG_FUNCTION_ARGS)
    1176             : {
    1177           8 :     Name        s = PG_GETARG_NAME(0);
    1178             : 
    1179           8 :     PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
    1180             : }
    1181             : 
    1182             : Datum
    1183        2204 : PG_encoding_to_char(PG_FUNCTION_ARGS)
    1184             : {
    1185        2204 :     int32       encoding = PG_GETARG_INT32(0);
    1186        2204 :     const char *encoding_name = pg_encoding_to_char(encoding);
    1187             : 
    1188        2204 :     return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
    1189             : }
    1190             : 
    1191             : /*
    1192             :  * gettext() returns messages in this encoding.  This often matches the
    1193             :  * database encoding, but it differs for SQL_ASCII databases, for processes
    1194             :  * not attached to a database, and under a database encoding lacking iconv
    1195             :  * support (MULE_INTERNAL).
    1196             :  */
    1197             : int
    1198           0 : GetMessageEncoding(void)
    1199             : {
    1200           0 :     return MessageEncoding->encoding;
    1201             : }
    1202             : 
    1203             : 
    1204             : /*
    1205             :  * Generic character incrementer function.
    1206             :  *
    1207             :  * Not knowing anything about the properties of the encoding in use, we just
    1208             :  * keep incrementing the last byte until we get a validly-encoded result,
    1209             :  * or we run out of values to try.  We don't bother to try incrementing
    1210             :  * higher-order bytes, so there's no growth in runtime for wider characters.
    1211             :  * (If we did try to do that, we'd need to consider the likelihood that 255
    1212             :  * is not a valid final byte in the encoding.)
    1213             :  */
    1214             : static bool
    1215          58 : pg_generic_charinc(unsigned char *charptr, int len)
    1216             : {
    1217          58 :     unsigned char *lastbyte = charptr + len - 1;
    1218             :     mbverifier  mbverify;
    1219             : 
    1220             :     /* We can just invoke the character verifier directly. */
    1221          58 :     mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
    1222             : 
    1223          58 :     while (*lastbyte < (unsigned char) 255)
    1224             :     {
    1225          58 :         (*lastbyte)++;
    1226          58 :         if ((*mbverify) (charptr, len) == len)
    1227          58 :             return true;
    1228             :     }
    1229             : 
    1230           0 :     return false;
    1231             : }
    1232             : 
    1233             : /*
    1234             :  * UTF-8 character incrementer function.
    1235             :  *
    1236             :  * For a one-byte character less than 0x7F, we just increment the byte.
    1237             :  *
    1238             :  * For a multibyte character, every byte but the first must fall between 0x80
    1239             :  * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
    1240             :  * the last byte that's not already at its maximum value.  If we can't find a
    1241             :  * byte that's less than the maximum allowable value, we simply fail.  We also
    1242             :  * need some special-case logic to skip regions used for surrogate pair
    1243             :  * handling, as those should not occur in valid UTF-8.
    1244             :  *
    1245             :  * Note that we don't reset lower-order bytes back to their minimums, since
    1246             :  * we can't afford to make an exhaustive search (see make_greater_string).
    1247             :  */
    1248             : static bool
    1249        1390 : pg_utf8_increment(unsigned char *charptr, int length)
    1250             : {
    1251             :     unsigned char a;
    1252             :     unsigned char limit;
    1253             : 
    1254        1390 :     switch (length)
    1255             :     {
    1256           0 :         default:
    1257             :             /* reject lengths 5 and 6 for now */
    1258           0 :             return false;
    1259           0 :         case 4:
    1260           0 :             a = charptr[3];
    1261           0 :             if (a < 0xBF)
    1262             :             {
    1263           0 :                 charptr[3]++;
    1264           0 :                 break;
    1265             :             }
    1266             :             /* FALL THRU */
    1267             :         case 3:
    1268           0 :             a = charptr[2];
    1269           0 :             if (a < 0xBF)
    1270             :             {
    1271           0 :                 charptr[2]++;
    1272           0 :                 break;
    1273             :             }
    1274             :             /* FALL THRU */
    1275             :         case 2:
    1276           0 :             a = charptr[1];
    1277           0 :             switch (*charptr)
    1278             :             {
    1279           0 :                 case 0xED:
    1280           0 :                     limit = 0x9F;
    1281           0 :                     break;
    1282           0 :                 case 0xF4:
    1283           0 :                     limit = 0x8F;
    1284           0 :                     break;
    1285           0 :                 default:
    1286           0 :                     limit = 0xBF;
    1287           0 :                     break;
    1288             :             }
    1289           0 :             if (a < limit)
    1290             :             {
    1291           0 :                 charptr[1]++;
    1292           0 :                 break;
    1293             :             }
    1294             :             /* FALL THRU */
    1295             :         case 1:
    1296        1390 :             a = *charptr;
    1297        1390 :             if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
    1298           0 :                 return false;
    1299        1390 :             charptr[0]++;
    1300        1390 :             break;
    1301             :     }
    1302             : 
    1303        1390 :     return true;
    1304             : }
    1305             : 
    1306             : /*
    1307             :  * EUC-JP character incrementer function.
    1308             :  *
    1309             :  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
    1310             :  * representing JIS X 0201 characters with the second byte ranging between
    1311             :  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
    1312             :  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
    1313             :  *
    1314             :  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
    1315             :  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
    1316             :  * is incremented if possible, otherwise the second-to-last byte.
    1317             :  *
    1318             :  * If the sequence starts with a value other than the above and its MSB
    1319             :  * is set, it must be a two-byte sequence representing JIS X 0208 characters
    1320             :  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
    1321             :  * incremented if possible, otherwise the second-to-last byte.
    1322             :  *
    1323             :  * Otherwise, the sequence is a single-byte ASCII character. It is
    1324             :  * incremented up to 0x7f.
    1325             :  */
    1326             : static bool
    1327           0 : pg_eucjp_increment(unsigned char *charptr, int length)
    1328             : {
    1329             :     unsigned char c1,
    1330             :                 c2;
    1331             :     int         i;
    1332             : 
    1333           0 :     c1 = *charptr;
    1334             : 
    1335           0 :     switch (c1)
    1336             :     {
    1337           0 :         case SS2:               /* JIS X 0201 */
    1338           0 :             if (length != 2)
    1339           0 :                 return false;
    1340             : 
    1341           0 :             c2 = charptr[1];
    1342             : 
    1343           0 :             if (c2 >= 0xdf)
    1344           0 :                 charptr[0] = charptr[1] = 0xa1;
    1345           0 :             else if (c2 < 0xa1)
    1346           0 :                 charptr[1] = 0xa1;
    1347             :             else
    1348           0 :                 charptr[1]++;
    1349           0 :             break;
    1350             : 
    1351           0 :         case SS3:               /* JIS X 0212 */
    1352           0 :             if (length != 3)
    1353           0 :                 return false;
    1354             : 
    1355           0 :             for (i = 2; i > 0; i--)
    1356             :             {
    1357           0 :                 c2 = charptr[i];
    1358           0 :                 if (c2 < 0xa1)
    1359             :                 {
    1360           0 :                     charptr[i] = 0xa1;
    1361           0 :                     return true;
    1362             :                 }
    1363           0 :                 else if (c2 < 0xfe)
    1364             :                 {
    1365           0 :                     charptr[i]++;
    1366           0 :                     return true;
    1367             :                 }
    1368             :             }
    1369             : 
    1370             :             /* Out of 3-byte code region */
    1371           0 :             return false;
    1372             : 
    1373           0 :         default:
    1374           0 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1375             :             {
    1376           0 :                 if (length != 2)
    1377           0 :                     return false;
    1378             : 
    1379           0 :                 for (i = 1; i >= 0; i--)
    1380             :                 {
    1381           0 :                     c2 = charptr[i];
    1382           0 :                     if (c2 < 0xa1)
    1383             :                     {
    1384           0 :                         charptr[i] = 0xa1;
    1385           0 :                         return true;
    1386             :                     }
    1387           0 :                     else if (c2 < 0xfe)
    1388             :                     {
    1389           0 :                         charptr[i]++;
    1390           0 :                         return true;
    1391             :                     }
    1392             :                 }
    1393             : 
    1394             :                 /* Out of 2 byte code region */
    1395           0 :                 return false;
    1396             :             }
    1397             :             else
    1398             :             {                   /* ASCII, single byte */
    1399           0 :                 if (c1 > 0x7e)
    1400           0 :                     return false;
    1401           0 :                 (*charptr)++;
    1402             :             }
    1403           0 :             break;
    1404             :     }
    1405             : 
    1406           0 :     return true;
    1407             : }
    1408             : 
    1409             : /*
    1410             :  * get the character incrementer for the encoding for the current database
    1411             :  */
    1412             : mbcharacter_incrementer
    1413        1448 : pg_database_encoding_character_incrementer(void)
    1414             : {
    1415             :     /*
    1416             :      * Eventually it might be best to add a field to pg_wchar_table[], but for
    1417             :      * now we just use a switch.
    1418             :      */
    1419        1448 :     switch (GetDatabaseEncoding())
    1420             :     {
    1421        1390 :         case PG_UTF8:
    1422        1390 :             return pg_utf8_increment;
    1423             : 
    1424           0 :         case PG_EUC_JP:
    1425           0 :             return pg_eucjp_increment;
    1426             : 
    1427          58 :         default:
    1428          58 :             return pg_generic_charinc;
    1429             :     }
    1430             : }
    1431             : 
    1432             : /*
    1433             :  * fetch maximum length of the encoding for the current database
    1434             :  */
    1435             : int
    1436    10370478 : pg_database_encoding_max_length(void)
    1437             : {
    1438    10370478 :     return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
    1439             : }
    1440             : 
    1441             : /*
    1442             :  * Verify mbstr to make sure that it is validly encoded in the current
    1443             :  * database encoding.  Otherwise same as pg_verify_mbstr().
    1444             :  */
    1445             : bool
    1446        4380 : pg_verifymbstr(const char *mbstr, int len, bool noError)
    1447             : {
    1448             :     return
    1449        4380 :         pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
    1450             : }
    1451             : 
    1452             : /*
    1453             :  * Verify mbstr to make sure that it is validly encoded in the specified
    1454             :  * encoding.
    1455             :  */
    1456             : bool
    1457     1826642 : pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
    1458             : {
    1459     1826642 :     return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
    1460             : }
    1461             : 
    1462             : /*
    1463             :  * Verify mbstr to make sure that it is validly encoded in the specified
    1464             :  * encoding.
    1465             :  *
    1466             :  * mbstr is not necessarily zero terminated; length of mbstr is
    1467             :  * specified by len.
    1468             :  *
    1469             :  * If OK, return length of string in the encoding.
    1470             :  * If a problem is found, return -1 when noError is
    1471             :  * true; when noError is false, ereport() a descriptive message.
    1472             :  */
    1473             : int
    1474     1832256 : pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
    1475             : {
    1476             :     mbverifier  mbverify;
    1477             :     int         mb_len;
    1478             : 
    1479             :     Assert(PG_VALID_ENCODING(encoding));
    1480             : 
    1481             :     /*
    1482             :      * In single-byte encodings, we need only reject nulls (\0).
    1483             :      */
    1484     1832256 :     if (pg_encoding_max_length(encoding) <= 1)
    1485             :     {
    1486       38596 :         const char *nullpos = memchr(mbstr, 0, len);
    1487             : 
    1488       38596 :         if (nullpos == NULL)
    1489       38596 :             return len;
    1490           0 :         if (noError)
    1491           0 :             return -1;
    1492           0 :         report_invalid_encoding(encoding, nullpos, 1);
    1493             :     }
    1494             : 
    1495             :     /* fetch function pointer just once */
    1496     1793660 :     mbverify = pg_wchar_table[encoding].mbverify;
    1497             : 
    1498     1793660 :     mb_len = 0;
    1499             : 
    1500   146185476 :     while (len > 0)
    1501             :     {
    1502             :         int         l;
    1503             : 
    1504             :         /* fast path for ASCII-subset characters */
    1505   144391818 :         if (!IS_HIGHBIT_SET(*mbstr))
    1506             :         {
    1507   144377072 :             if (*mbstr != '\0')
    1508             :             {
    1509   144377070 :                 mb_len++;
    1510   144377070 :                 mbstr++;
    1511   144377070 :                 len--;
    1512   144377070 :                 continue;
    1513             :             }
    1514           2 :             if (noError)
    1515           0 :                 return -1;
    1516           2 :             report_invalid_encoding(encoding, mbstr, len);
    1517             :         }
    1518             : 
    1519       14746 :         l = (*mbverify) ((const unsigned char *) mbstr, len);
    1520             : 
    1521       14746 :         if (l < 0)
    1522             :         {
    1523           0 :             if (noError)
    1524           0 :                 return -1;
    1525           0 :             report_invalid_encoding(encoding, mbstr, len);
    1526             :         }
    1527             : 
    1528       14746 :         mbstr += l;
    1529       14746 :         len -= l;
    1530       14746 :         mb_len++;
    1531             :     }
    1532     1793658 :     return mb_len;
    1533             : }
    1534             : 
    1535             : /*
    1536             :  * check_encoding_conversion_args: check arguments of a conversion function
    1537             :  *
    1538             :  * "expected" arguments can be either an encoding ID or -1 to indicate that
    1539             :  * the caller will check whether it accepts the ID.
    1540             :  *
    1541             :  * Note: the errors here are not really user-facing, so elog instead of
    1542             :  * ereport seems sufficient.  Also, we trust that the "expected" encoding
    1543             :  * arguments are valid encoding IDs, but we don't trust the actuals.
    1544             :  */
    1545             : void
    1546         806 : check_encoding_conversion_args(int src_encoding,
    1547             :                                int dest_encoding,
    1548             :                                int len,
    1549             :                                int expected_src_encoding,
    1550             :                                int expected_dest_encoding)
    1551             : {
    1552         806 :     if (!PG_VALID_ENCODING(src_encoding))
    1553           0 :         elog(ERROR, "invalid source encoding ID: %d", src_encoding);
    1554         806 :     if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
    1555           0 :         elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
    1556             :              pg_enc2name_tbl[expected_src_encoding].name,
    1557             :              pg_enc2name_tbl[src_encoding].name);
    1558         806 :     if (!PG_VALID_ENCODING(dest_encoding))
    1559           0 :         elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
    1560         806 :     if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
    1561           0 :         elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
    1562             :              pg_enc2name_tbl[expected_dest_encoding].name,
    1563             :              pg_enc2name_tbl[dest_encoding].name);
    1564         806 :     if (len < 0)
    1565           0 :         elog(ERROR, "encoding conversion length must not be negative");
    1566         806 : }
    1567             : 
    1568             : /*
    1569             :  * report_invalid_encoding: complain about invalid multibyte character
    1570             :  *
    1571             :  * note: len is remaining length of string, not length of character;
    1572             :  * len must be greater than zero, as we always examine the first byte.
    1573             :  */
    1574             : void
    1575           2 : report_invalid_encoding(int encoding, const char *mbstr, int len)
    1576             : {
    1577           2 :     int         l = pg_encoding_mblen(encoding, mbstr);
    1578             :     char        buf[8 * 5 + 1];
    1579           2 :     char       *p = buf;
    1580             :     int         j,
    1581             :                 jlimit;
    1582             : 
    1583           2 :     jlimit = Min(l, len);
    1584           2 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1585             : 
    1586           4 :     for (j = 0; j < jlimit; j++)
    1587             :     {
    1588           2 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    1589           2 :         if (j < jlimit - 1)
    1590           0 :             p += sprintf(p, " ");
    1591             :     }
    1592             : 
    1593           2 :     ereport(ERROR,
    1594             :             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
    1595             :              errmsg("invalid byte sequence for encoding \"%s\": %s",
    1596             :                     pg_enc2name_tbl[encoding].name,
    1597             :                     buf)));
    1598             : }
    1599             : 
    1600             : /*
    1601             :  * report_untranslatable_char: complain about untranslatable character
    1602             :  *
    1603             :  * note: len is remaining length of string, not length of character;
    1604             :  * len must be greater than zero, as we always examine the first byte.
    1605             :  */
    1606             : void
    1607           0 : report_untranslatable_char(int src_encoding, int dest_encoding,
    1608             :                            const char *mbstr, int len)
    1609             : {
    1610           0 :     int         l = pg_encoding_mblen(src_encoding, mbstr);
    1611             :     char        buf[8 * 5 + 1];
    1612           0 :     char       *p = buf;
    1613             :     int         j,
    1614             :                 jlimit;
    1615             : 
    1616           0 :     jlimit = Min(l, len);
    1617           0 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1618             : 
    1619           0 :     for (j = 0; j < jlimit; j++)
    1620             :     {
    1621           0 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    1622           0 :         if (j < jlimit - 1)
    1623           0 :             p += sprintf(p, " ");
    1624             :     }
    1625             : 
    1626           0 :     ereport(ERROR,
    1627             :             (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
    1628             :              errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
    1629             :                     buf,
    1630             :                     pg_enc2name_tbl[src_encoding].name,
    1631             :                     pg_enc2name_tbl[dest_encoding].name)));
    1632             : }
    1633             : 
    1634             : 
    1635             : #ifdef WIN32
    1636             : /*
    1637             :  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
    1638             :  * string. The character length is also passed to utf16len if not
    1639             :  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
    1640             :  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
    1641             :  */
    1642             : WCHAR *
    1643             : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
    1644             : {
    1645             :     int         msgenc = GetMessageEncoding();
    1646             :     WCHAR      *utf16;
    1647             :     int         dstlen;
    1648             :     UINT        codepage;
    1649             : 
    1650             :     if (msgenc == PG_SQL_ASCII)
    1651             :         /* No conversion is possible, and SQL_ASCII is never utf16. */
    1652             :         return NULL;
    1653             : 
    1654             :     codepage = pg_enc2name_tbl[msgenc].codepage;
    1655             : 
    1656             :     /*
    1657             :      * Use MultiByteToWideChar directly if there is a corresponding codepage,
    1658             :      * or double conversion through UTF8 if not.  Double conversion is needed,
    1659             :      * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
    1660             :      */
    1661             :     if (codepage != 0)
    1662             :     {
    1663             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1664             :         dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
    1665             :         utf16[dstlen] = (WCHAR) 0;
    1666             :     }
    1667             :     else
    1668             :     {
    1669             :         char       *utf8;
    1670             : 
    1671             :         /*
    1672             :          * XXX pg_do_encoding_conversion() requires a transaction.  In the
    1673             :          * absence of one, hope for the input to be valid UTF8.
    1674             :          */
    1675             :         if (IsTransactionState())
    1676             :         {
    1677             :             utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
    1678             :                                                       len,
    1679             :                                                       msgenc,
    1680             :                                                       PG_UTF8);
    1681             :             if (utf8 != str)
    1682             :                 len = strlen(utf8);
    1683             :         }
    1684             :         else
    1685             :             utf8 = (char *) str;
    1686             : 
    1687             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1688             :         dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
    1689             :         utf16[dstlen] = (WCHAR) 0;
    1690             : 
    1691             :         if (utf8 != str)
    1692             :             pfree(utf8);
    1693             :     }
    1694             : 
    1695             :     if (dstlen == 0 && len > 0)
    1696             :     {
    1697             :         pfree(utf16);
    1698             :         return NULL;            /* error */
    1699             :     }
    1700             : 
    1701             :     if (utf16len)
    1702             :         *utf16len = dstlen;
    1703             :     return utf16;
    1704             : }
    1705             : 
    1706             : #endif                          /* WIN32 */

Generated by: LCOV version 1.13