LCOV - PostgreSQL 19devel - src/backend/utils/mb/mbutils.c

LCOV - code coverage report

Current view:	top level - src/backend/utils/mb - mbutils.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 19devel	Lines:	338	531	63.7 %
Date:	2025-08-09 08:18:06	Functions:	45	55	81.8 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * mbutils.c
       4             :  *    This file contains functions for encoding conversion.
       5             :  *
       6             :  * The string-conversion functions in this file share some API quirks.
       7             :  * Note the following:
       8             :  *
       9             :  * The functions return a palloc'd, null-terminated string if conversion
      10             :  * is required.  However, if no conversion is performed, the given source
      11             :  * string pointer is returned as-is.
      12             :  *
      13             :  * Although the presence of a length argument means that callers can pass
      14             :  * non-null-terminated strings, care is required because the same string
      15             :  * will be passed back if no conversion occurs.  Such callers *must* check
      16             :  * whether result == src and handle that case differently.
      17             :  *
      18             :  * If the source and destination encodings are the same, the source string
      19             :  * is returned without any verification; it's assumed to be valid data.
      20             :  * If that might not be the case, the caller is responsible for validating
      21             :  * the string using a separate call to pg_verify_mbstr().  Whenever the
      22             :  * source and destination encodings are different, the functions ensure that
      23             :  * the result is validly encoded according to the destination encoding.
      24             :  *
      25             :  *
      26             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      27             :  * Portions Copyright (c) 1994, Regents of the University of California
      28             :  *
      29             :  *
      30             :  * IDENTIFICATION
      31             :  *    src/backend/utils/mb/mbutils.c
      32             :  *
      33             :  *-------------------------------------------------------------------------
      34             :  */
      35             : #include "postgres.h"
      36             : 
      37             : #include "access/xact.h"
      38             : #include "catalog/namespace.h"
      39             : #include "mb/pg_wchar.h"
      40             : #include "utils/fmgrprotos.h"
      41             : #include "utils/memutils.h"
      42             : #include "utils/relcache.h"
      43             : #include "varatt.h"
      44             : 
      45             : /*
      46             :  * We maintain a simple linked list caching the fmgr lookup info for the
      47             :  * currently selected conversion functions, as well as any that have been
      48             :  * selected previously in the current session.  (We remember previous
      49             :  * settings because we must be able to restore a previous setting during
      50             :  * transaction rollback, without doing any fresh catalog accesses.)
      51             :  *
      52             :  * Since we'll never release this data, we just keep it in TopMemoryContext.
      53             :  */
      54             : typedef struct ConvProcInfo
      55             : {
      56             :     int         s_encoding;     /* server and client encoding IDs */
      57             :     int         c_encoding;
      58             :     FmgrInfo    to_server_info; /* lookup info for conversion procs */
      59             :     FmgrInfo    to_client_info;
      60             : } ConvProcInfo;
      61             : 
      62             : static List *ConvProcList = NIL;    /* List of ConvProcInfo */
      63             : 
      64             : /*
      65             :  * These variables point to the currently active conversion functions,
      66             :  * or are NULL when no conversion is needed.
      67             :  */
      68             : static FmgrInfo *ToServerConvProc = NULL;
      69             : static FmgrInfo *ToClientConvProc = NULL;
      70             : 
      71             : /*
      72             :  * This variable stores the conversion function to convert from UTF-8
      73             :  * to the server encoding.  It's NULL if the server encoding *is* UTF-8,
      74             :  * or if we lack a conversion function for this.
      75             :  */
      76             : static FmgrInfo *Utf8ToServerConvProc = NULL;
      77             : 
      78             : /*
      79             :  * These variables track the currently-selected encodings.
      80             :  */
      81             : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      82             : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      83             : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      84             : 
      85             : /*
      86             :  * During backend startup we can't set client encoding because we (a)
      87             :  * can't look up the conversion functions, and (b) may not know the database
      88             :  * encoding yet either.  So SetClientEncoding() just accepts anything and
      89             :  * remembers it for InitializeClientEncoding() to apply later.
      90             :  */
      91             : static bool backend_startup_complete = false;
      92             : static int  pending_client_encoding = PG_SQL_ASCII;
      93             : 
      94             : 
      95             : /* Internal functions */
      96             : static char *perform_default_encoding_conversion(const char *src,
      97             :                                                  int len, bool is_client_to_server);
      98             : static int  cliplen(const char *str, int len, int limit);
      99             : 
     100             : 
     101             : /*
     102             :  * Prepare for a future call to SetClientEncoding.  Success should mean
     103             :  * that SetClientEncoding is guaranteed to succeed for this encoding request.
     104             :  *
     105             :  * (But note that success before backend_startup_complete does not guarantee
     106             :  * success after ...)
     107             :  *
     108             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     109             :  */
     110             : int
     111       67724 : PrepareClientEncoding(int encoding)
     112             : {
     113             :     int         current_server_encoding;
     114             :     ListCell   *lc;
     115             : 
     116       67724 :     if (!PG_VALID_FE_ENCODING(encoding))
     117           0 :         return -1;
     118             : 
     119             :     /* Can't do anything during startup, per notes above */
     120       67724 :     if (!backend_startup_complete)
     121       34198 :         return 0;
     122             : 
     123       33526 :     current_server_encoding = GetDatabaseEncoding();
     124             : 
     125             :     /*
     126             :      * Check for cases that require no conversion function.
     127             :      */
     128       33526 :     if (current_server_encoding == encoding ||
     129        2774 :         current_server_encoding == PG_SQL_ASCII ||
     130             :         encoding == PG_SQL_ASCII)
     131       33506 :         return 0;
     132             : 
     133          20 :     if (IsTransactionState())
     134             :     {
     135             :         /*
     136             :          * If we're in a live transaction, it's safe to access the catalogs,
     137             :          * so look up the functions.  We repeat the lookup even if the info is
     138             :          * already cached, so that we can react to changes in the contents of
     139             :          * pg_conversion.
     140             :          */
     141             :         Oid         to_server_proc,
     142             :                     to_client_proc;
     143             :         ConvProcInfo *convinfo;
     144             :         MemoryContext oldcontext;
     145             : 
     146          20 :         to_server_proc = FindDefaultConversionProc(encoding,
     147             :                                                    current_server_encoding);
     148          20 :         if (!OidIsValid(to_server_proc))
     149           0 :             return -1;
     150          20 :         to_client_proc = FindDefaultConversionProc(current_server_encoding,
     151             :                                                    encoding);
     152          20 :         if (!OidIsValid(to_client_proc))
     153           0 :             return -1;
     154             : 
     155             :         /*
     156             :          * Load the fmgr info into TopMemoryContext (could still fail here)
     157             :          */
     158          20 :         convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
     159             :                                                        sizeof(ConvProcInfo));
     160          20 :         convinfo->s_encoding = current_server_encoding;
     161          20 :         convinfo->c_encoding = encoding;
     162          20 :         fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
     163             :                       TopMemoryContext);
     164          20 :         fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
     165             :                       TopMemoryContext);
     166             : 
     167             :         /* Attach new info to head of list */
     168          20 :         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
     169          20 :         ConvProcList = lcons(convinfo, ConvProcList);
     170          20 :         MemoryContextSwitchTo(oldcontext);
     171             : 
     172             :         /*
     173             :          * We cannot yet remove any older entry for the same encoding pair,
     174             :          * since it could still be in use.  SetClientEncoding will clean up.
     175             :          */
     176             : 
     177          20 :         return 0;               /* success */
     178             :     }
     179             :     else
     180             :     {
     181             :         /*
     182             :          * If we're not in a live transaction, the only thing we can do is
     183             :          * restore a previous setting using the cache.  This covers all
     184             :          * transaction-rollback cases.  The only case it might not work for is
     185             :          * trying to change client_encoding on the fly by editing
     186             :          * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
     187             :          * thing to do anyway.
     188             :          */
     189           0 :         foreach(lc, ConvProcList)
     190             :         {
     191           0 :             ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
     192             : 
     193           0 :             if (oldinfo->s_encoding == current_server_encoding &&
     194           0 :                 oldinfo->c_encoding == encoding)
     195           0 :                 return 0;
     196             :         }
     197             : 
     198           0 :         return -1;              /* it's not cached, so fail */
     199             :     }
     200             : }
     201             : 
     202             : /*
     203             :  * Set the active client encoding and set up the conversion-function pointers.
     204             :  * PrepareClientEncoding should have been called previously for this encoding.
     205             :  *
     206             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     207             :  */
     208             : int
     209       70282 : SetClientEncoding(int encoding)
     210             : {
     211             :     int         current_server_encoding;
     212             :     bool        found;
     213             :     ListCell   *lc;
     214             : 
     215       70282 :     if (!PG_VALID_FE_ENCODING(encoding))
     216           0 :         return -1;
     217             : 
     218             :     /* Can't do anything during startup, per notes above */
     219       70282 :     if (!backend_startup_complete)
     220             :     {
     221       34020 :         pending_client_encoding = encoding;
     222       34020 :         return 0;
     223             :     }
     224             : 
     225       36262 :     current_server_encoding = GetDatabaseEncoding();
     226             : 
     227             :     /*
     228             :      * Check for cases that require no conversion function.
     229             :      */
     230       36262 :     if (current_server_encoding == encoding ||
     231        2774 :         current_server_encoding == PG_SQL_ASCII ||
     232             :         encoding == PG_SQL_ASCII)
     233             :     {
     234       36242 :         ClientEncoding = &pg_enc2name_tbl[encoding];
     235       36242 :         ToServerConvProc = NULL;
     236       36242 :         ToClientConvProc = NULL;
     237       36242 :         return 0;
     238             :     }
     239             : 
     240             :     /*
     241             :      * Search the cache for the entry previously prepared by
     242             :      * PrepareClientEncoding; if there isn't one, we lose.  While at it,
     243             :      * release any duplicate entries so that repeated Prepare/Set cycles don't
     244             :      * leak memory.
     245             :      */
     246          20 :     found = false;
     247          46 :     foreach(lc, ConvProcList)
     248             :     {
     249          26 :         ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
     250             : 
     251          26 :         if (convinfo->s_encoding == current_server_encoding &&
     252          26 :             convinfo->c_encoding == encoding)
     253             :         {
     254          20 :             if (!found)
     255             :             {
     256             :                 /* Found newest entry, so set up */
     257          20 :                 ClientEncoding = &pg_enc2name_tbl[encoding];
     258          20 :                 ToServerConvProc = &convinfo->to_server_info;
     259          20 :                 ToClientConvProc = &convinfo->to_client_info;
     260          20 :                 found = true;
     261             :             }
     262             :             else
     263             :             {
     264             :                 /* Duplicate entry, release it */
     265           0 :                 ConvProcList = foreach_delete_current(ConvProcList, lc);
     266           0 :                 pfree(convinfo);
     267             :             }
     268             :         }
     269             :     }
     270             : 
     271          20 :     if (found)
     272          20 :         return 0;               /* success */
     273             :     else
     274           0 :         return -1;              /* it's not cached, so fail */
     275             : }
     276             : 
     277             : /*
     278             :  * Initialize client encoding conversions.
     279             :  *      Called from InitPostgres() once during backend startup.
     280             :  */
     281             : void
     282       32882 : InitializeClientEncoding(void)
     283             : {
     284             :     int         current_server_encoding;
     285             : 
     286             :     Assert(!backend_startup_complete);
     287       32882 :     backend_startup_complete = true;
     288             : 
     289       65764 :     if (PrepareClientEncoding(pending_client_encoding) < 0 ||
     290       32882 :         SetClientEncoding(pending_client_encoding) < 0)
     291             :     {
     292             :         /*
     293             :          * Oops, the requested conversion is not available. We couldn't fail
     294             :          * before, but we can now.
     295             :          */
     296           0 :         ereport(FATAL,
     297             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     298             :                  errmsg("conversion between %s and %s is not supported",
     299             :                         pg_enc2name_tbl[pending_client_encoding].name,
     300             :                         GetDatabaseEncodingName())));
     301             :     }
     302             : 
     303             :     /*
     304             :      * Also look up the UTF8-to-server conversion function if needed.  Since
     305             :      * the server encoding is fixed within any one backend process, we don't
     306             :      * have to do this more than once.
     307             :      */
     308       32882 :     current_server_encoding = GetDatabaseEncoding();
     309       32882 :     if (current_server_encoding != PG_UTF8 &&
     310             :         current_server_encoding != PG_SQL_ASCII)
     311             :     {
     312             :         Oid         utf8_to_server_proc;
     313             : 
     314         202 :         AssertCouldGetRelation();
     315             :         utf8_to_server_proc =
     316         202 :             FindDefaultConversionProc(PG_UTF8,
     317             :                                       current_server_encoding);
     318             :         /* If there's no such conversion, just leave the pointer as NULL */
     319         202 :         if (OidIsValid(utf8_to_server_proc))
     320             :         {
     321             :             FmgrInfo   *finfo;
     322             : 
     323         202 :             finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
     324             :                                                     sizeof(FmgrInfo));
     325         202 :             fmgr_info_cxt(utf8_to_server_proc, finfo,
     326             :                           TopMemoryContext);
     327             :             /* Set Utf8ToServerConvProc only after data is fully valid */
     328         202 :             Utf8ToServerConvProc = finfo;
     329             :         }
     330             :     }
     331       32882 : }
     332             : 
     333             : /*
     334             :  * returns the current client encoding
     335             :  */
     336             : int
     337       11244 : pg_get_client_encoding(void)
     338             : {
     339       11244 :     return ClientEncoding->encoding;
     340             : }
     341             : 
     342             : /*
     343             :  * returns the current client encoding name
     344             :  */
     345             : const char *
     346           0 : pg_get_client_encoding_name(void)
     347             : {
     348           0 :     return ClientEncoding->name;
     349             : }
     350             : 
     351             : /*
     352             :  * Convert src string to another encoding (general case).
     353             :  *
     354             :  * See the notes about string conversion functions at the top of this file.
     355             :  */
     356             : unsigned char *
     357        3014 : pg_do_encoding_conversion(unsigned char *src, int len,
     358             :                           int src_encoding, int dest_encoding)
     359             : {
     360             :     unsigned char *result;
     361             :     Oid         proc;
     362             : 
     363        3014 :     if (len <= 0)
     364          30 :         return src;             /* empty string is always valid */
     365             : 
     366        2984 :     if (src_encoding == dest_encoding)
     367        2200 :         return src;             /* no conversion required, assume valid */
     368             : 
     369         784 :     if (dest_encoding == PG_SQL_ASCII)
     370           0 :         return src;             /* any string is valid in SQL_ASCII */
     371             : 
     372         784 :     if (src_encoding == PG_SQL_ASCII)
     373             :     {
     374             :         /* No conversion is possible, but we must validate the result */
     375          16 :         (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
     376          16 :         return src;
     377             :     }
     378             : 
     379         768 :     if (!IsTransactionState())  /* shouldn't happen */
     380           0 :         elog(ERROR, "cannot perform encoding conversion outside a transaction");
     381             : 
     382         768 :     proc = FindDefaultConversionProc(src_encoding, dest_encoding);
     383         768 :     if (!OidIsValid(proc))
     384           0 :         ereport(ERROR,
     385             :                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
     386             :                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
     387             :                         pg_encoding_to_char(src_encoding),
     388             :                         pg_encoding_to_char(dest_encoding))));
     389             : 
     390             :     /*
     391             :      * Allocate space for conversion result, being wary of integer overflow.
     392             :      *
     393             :      * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
     394             :      * required space, so it might exceed MaxAllocSize even though the result
     395             :      * would actually fit.  We do not want to hand back a result string that
     396             :      * exceeds MaxAllocSize, because callers might not cope gracefully --- but
     397             :      * if we just allocate more than that, and don't use it, that's fine.
     398             :      */
     399         768 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     400           0 :         ereport(ERROR,
     401             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     402             :                  errmsg("out of memory"),
     403             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     404             :                            len)));
     405             : 
     406             :     result = (unsigned char *)
     407         768 :         MemoryContextAllocHuge(CurrentMemoryContext,
     408         768 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);
     409             : 
     410         768 :     (void) OidFunctionCall6(proc,
     411             :                             Int32GetDatum(src_encoding),
     412             :                             Int32GetDatum(dest_encoding),
     413             :                             CStringGetDatum((char *) src),
     414             :                             CStringGetDatum((char *) result),
     415             :                             Int32GetDatum(len),
     416             :                             BoolGetDatum(false));
     417             : 
     418             :     /*
     419             :      * If the result is large, it's worth repalloc'ing to release any extra
     420             :      * space we asked for.  The cutoff here is somewhat arbitrary, but we
     421             :      * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
     422             :      */
     423         768 :     if (len > 1000000)
     424             :     {
     425           0 :         Size        resultlen = strlen((char *) result);
     426             : 
     427           0 :         if (resultlen >= MaxAllocSize)
     428           0 :             ereport(ERROR,
     429             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     430             :                      errmsg("out of memory"),
     431             :                      errdetail("String of %d bytes is too long for encoding conversion.",
     432             :                                len)));
     433             : 
     434           0 :         result = (unsigned char *) repalloc(result, resultlen + 1);
     435             :     }
     436             : 
     437         768 :     return result;
     438             : }
     439             : 
     440             : /*
     441             :  * Convert src string to another encoding.
     442             :  *
     443             :  * This function has a different API than the other conversion functions.
     444             :  * The caller should've looked up the conversion function using
     445             :  * FindDefaultConversionProc().  Unlike the other functions, the converted
     446             :  * result is not palloc'd.  It is written to the caller-supplied buffer
     447             :  * instead.
     448             :  *
     449             :  * src_encoding   - encoding to convert from
     450             :  * dest_encoding  - encoding to convert to
     451             :  * src, srclen    - input buffer and its length in bytes
     452             :  * dest, destlen  - destination buffer and its size in bytes
     453             :  *
     454             :  * The output is null-terminated.
     455             :  *
     456             :  * If destlen < srclen * MAX_CONVERSION_GROWTH + 1, the converted output
     457             :  * wouldn't necessarily fit in the output buffer, and the function will not
     458             :  * convert the whole input.
     459             :  *
     460             :  * TODO: The conversion function interface is not great.  Firstly, it
     461             :  * would be nice to pass through the destination buffer size to the
     462             :  * conversion function, so that if you pass a shorter destination buffer, it
     463             :  * could still continue to fill up the whole buffer.  Currently, we have to
     464             :  * assume worst case expansion and stop the conversion short, even if there
     465             :  * is in fact space left in the destination buffer.  Secondly, it would be
     466             :  * nice to return the number of bytes written to the caller, to avoid a call
     467             :  * to strlen().
     468             :  */
     469             : int
     470        5790 : pg_do_encoding_conversion_buf(Oid proc,
     471             :                               int src_encoding,
     472             :                               int dest_encoding,
     473             :                               unsigned char *src, int srclen,
     474             :                               unsigned char *dest, int destlen,
     475             :                               bool noError)
     476             : {
     477             :     Datum       result;
     478             : 
     479             :     /*
     480             :      * If the destination buffer is not large enough to hold the result in the
     481             :      * worst case, limit the input size passed to the conversion function.
     482             :      */
     483        5790 :     if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
     484        5742 :         srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
     485             : 
     486        5790 :     result = OidFunctionCall6(proc,
     487             :                               Int32GetDatum(src_encoding),
     488             :                               Int32GetDatum(dest_encoding),
     489             :                               CStringGetDatum((char *) src),
     490             :                               CStringGetDatum((char *) dest),
     491             :                               Int32GetDatum(srclen),
     492             :                               BoolGetDatum(noError));
     493        3420 :     return DatumGetInt32(result);
     494             : }
     495             : 
     496             : /*
     497             :  * Convert string to encoding encoding_name. The source
     498             :  * encoding is the DB encoding.
     499             :  *
     500             :  * BYTEA convert_to(TEXT string, NAME encoding_name) */
     501             : Datum
     502         396 : pg_convert_to(PG_FUNCTION_ARGS)
     503             : {
     504         396 :     Datum       string = PG_GETARG_DATUM(0);
     505         396 :     Datum       dest_encoding_name = PG_GETARG_DATUM(1);
     506         396 :     Datum       src_encoding_name = DirectFunctionCall1(namein,
     507             :                                                         CStringGetDatum(DatabaseEncoding->name));
     508             :     Datum       result;
     509             : 
     510             :     /*
     511             :      * pg_convert expects a bytea as its first argument. We're passing it a
     512             :      * text argument here, relying on the fact that they are both in fact
     513             :      * varlena types, and thus structurally identical.
     514             :      */
     515         396 :     result = DirectFunctionCall3(pg_convert, string,
     516             :                                  src_encoding_name, dest_encoding_name);
     517             : 
     518         396 :     PG_RETURN_DATUM(result);
     519             : }
     520             : 
     521             : /*
     522             :  * Convert string from encoding encoding_name. The destination
     523             :  * encoding is the DB encoding.
     524             :  *
     525             :  * TEXT convert_from(BYTEA string, NAME encoding_name) */
     526             : Datum
     527         580 : pg_convert_from(PG_FUNCTION_ARGS)
     528             : {
     529         580 :     Datum       string = PG_GETARG_DATUM(0);
     530         580 :     Datum       src_encoding_name = PG_GETARG_DATUM(1);
     531         580 :     Datum       dest_encoding_name = DirectFunctionCall1(namein,
     532             :                                                          CStringGetDatum(DatabaseEncoding->name));
     533             :     Datum       result;
     534             : 
     535         580 :     result = DirectFunctionCall3(pg_convert, string,
     536             :                                  src_encoding_name, dest_encoding_name);
     537             : 
     538             :     /*
     539             :      * pg_convert returns a bytea, which we in turn return as text, relying on
     540             :      * the fact that they are both in fact varlena types, and thus
     541             :      * structurally identical. Although not all bytea values are valid text,
     542             :      * in this case it will be because we've told pg_convert to return one
     543             :      * that is valid as text in the current database encoding.
     544             :      */
     545         574 :     PG_RETURN_DATUM(result);
     546             : }
     547             : 
     548             : /*
     549             :  * Convert string between two arbitrary encodings.
     550             :  *
     551             :  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
     552             :  */
     553             : Datum
     554        1744 : pg_convert(PG_FUNCTION_ARGS)
     555             : {
     556        1744 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     557        1744 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     558        1744 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     559        1744 :     char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
     560        1744 :     int         dest_encoding = pg_char_to_encoding(dest_encoding_name);
     561             :     const char *src_str;
     562             :     char       *dest_str;
     563             :     bytea      *retval;
     564             :     int         len;
     565             : 
     566        1744 :     if (src_encoding < 0)
     567           0 :         ereport(ERROR,
     568             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     569             :                  errmsg("invalid source encoding name \"%s\"",
     570             :                         src_encoding_name)));
     571        1744 :     if (dest_encoding < 0)
     572           0 :         ereport(ERROR,
     573             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     574             :                  errmsg("invalid destination encoding name \"%s\"",
     575             :                         dest_encoding_name)));
     576             : 
     577             :     /* make sure that source string is valid */
     578        1744 :     len = VARSIZE_ANY_EXHDR(string);
     579        1744 :     src_str = VARDATA_ANY(string);
     580        1744 :     (void) pg_verify_mbstr(src_encoding, src_str, len, false);
     581             : 
     582             :     /* perform conversion */
     583        1738 :     dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
     584             :                                                   len,
     585             :                                                   src_encoding,
     586             :                                                   dest_encoding);
     587             : 
     588             : 
     589             :     /* return source string if no conversion happened */
     590        1738 :     if (dest_str == src_str)
     591         970 :         PG_RETURN_BYTEA_P(string);
     592             : 
     593             :     /*
     594             :      * build bytea data type structure.
     595             :      */
     596         768 :     len = strlen(dest_str);
     597         768 :     retval = (bytea *) palloc(len + VARHDRSZ);
     598         768 :     SET_VARSIZE(retval, len + VARHDRSZ);
     599         768 :     memcpy(VARDATA(retval), dest_str, len);
     600         768 :     pfree(dest_str);
     601             : 
     602             :     /* free memory if allocated by the toaster */
     603         768 :     PG_FREE_IF_COPY(string, 0);
     604             : 
     605         768 :     PG_RETURN_BYTEA_P(retval);
     606             : }
     607             : 
     608             : /*
     609             :  * get the length of the string considered as text in the specified
     610             :  * encoding. Raises an error if the data is not valid in that
     611             :  * encoding.
     612             :  *
     613             :  * INT4 length (BYTEA string, NAME src_encoding_name)
     614             :  */
     615             : Datum
     616           0 : length_in_encoding(PG_FUNCTION_ARGS)
     617             : {
     618           0 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     619           0 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     620           0 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     621             :     const char *src_str;
     622             :     int         len;
     623             :     int         retval;
     624             : 
     625           0 :     if (src_encoding < 0)
     626           0 :         ereport(ERROR,
     627             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     628             :                  errmsg("invalid encoding name \"%s\"",
     629             :                         src_encoding_name)));
     630             : 
     631           0 :     len = VARSIZE_ANY_EXHDR(string);
     632           0 :     src_str = VARDATA_ANY(string);
     633             : 
     634           0 :     retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
     635             : 
     636           0 :     PG_RETURN_INT32(retval);
     637             : }
     638             : 
     639             : /*
     640             :  * Get maximum multibyte character length in the specified encoding.
     641             :  *
     642             :  * Note encoding is specified numerically, not by name as above.
     643             :  */
     644             : Datum
     645           0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
     646             : {
     647           0 :     int         encoding = PG_GETARG_INT32(0);
     648             : 
     649           0 :     if (PG_VALID_ENCODING(encoding))
     650           0 :         PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
     651             :     else
     652           0 :         PG_RETURN_NULL();
     653             : }
     654             : 
     655             : /*
     656             :  * Convert client encoding to server encoding.
     657             :  *
     658             :  * See the notes about string conversion functions at the top of this file.
     659             :  */
     660             : char *
     661      801918 : pg_client_to_server(const char *s, int len)
     662             : {
     663      801918 :     return pg_any_to_server(s, len, ClientEncoding->encoding);
     664             : }
     665             : 
     666             : /*
     667             :  * Convert any encoding to server encoding.
     668             :  *
     669             :  * See the notes about string conversion functions at the top of this file.
     670             :  *
     671             :  * Unlike the other string conversion functions, this will apply validation
     672             :  * even if encoding == DatabaseEncoding->encoding.  This is because this is
     673             :  * used to process data coming in from outside the database, and we never
     674             :  * want to just assume validity.
     675             :  */
     676             : char *
     677      887218 : pg_any_to_server(const char *s, int len, int encoding)
     678             : {
     679      887218 :     if (len <= 0)
     680       79270 :         return unconstify(char *, s);   /* empty string is always valid */
     681             : 
     682      807948 :     if (encoding == DatabaseEncoding->encoding ||
     683             :         encoding == PG_SQL_ASCII)
     684             :     {
     685             :         /*
     686             :          * No conversion is needed, but we must still validate the data.
     687             :          */
     688      807580 :         (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
     689      807578 :         return unconstify(char *, s);
     690             :     }
     691             : 
     692         368 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     693             :     {
     694             :         /*
     695             :          * No conversion is possible, but we must still validate the data,
     696             :          * because the client-side code might have done string escaping using
     697             :          * the selected client_encoding.  If the client encoding is ASCII-safe
     698             :          * then we just do a straight validation under that encoding.  For an
     699             :          * ASCII-unsafe encoding we have a problem: we dare not pass such data
     700             :          * to the parser but we have no way to convert it.  We compromise by
     701             :          * rejecting the data if it contains any non-ASCII characters.
     702             :          */
     703         308 :         if (PG_VALID_BE_ENCODING(encoding))
     704         248 :             (void) pg_verify_mbstr(encoding, s, len, false);
     705             :         else
     706             :         {
     707             :             int         i;
     708             : 
     709        1908 :             for (i = 0; i < len; i++)
     710             :             {
     711        1848 :                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
     712           0 :                     ereport(ERROR,
     713             :                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     714             :                              errmsg("invalid byte value for encoding \"%s\": 0x%02x",
     715             :                                     pg_enc2name_tbl[PG_SQL_ASCII].name,
     716             :                                     (unsigned char) s[i])));
     717             :             }
     718             :         }
     719         308 :         return unconstify(char *, s);
     720             :     }
     721             : 
     722             :     /* Fast path if we can use cached conversion function */
     723          60 :     if (encoding == ClientEncoding->encoding)
     724          60 :         return perform_default_encoding_conversion(s, len, true);
     725             : 
     726             :     /* General case ... will not work outside transactions */
     727           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     728             :                                               len,
     729             :                                               encoding,
     730           0 :                                               DatabaseEncoding->encoding);
     731             : }
     732             : 
     733             : /*
     734             :  * Convert server encoding to client encoding.
     735             :  *
     736             :  * See the notes about string conversion functions at the top of this file.
     737             :  */
     738             : char *
     739    35689120 : pg_server_to_client(const char *s, int len)
     740             : {
     741    35689120 :     return pg_server_to_any(s, len, ClientEncoding->encoding);
     742             : }
     743             : 
     744             : /*
     745             :  * Convert server encoding to any encoding.
     746             :  *
     747             :  * See the notes about string conversion functions at the top of this file.
     748             :  */
     749             : char *
     750    35727870 : pg_server_to_any(const char *s, int len, int encoding)
     751             : {
     752    35727870 :     if (len <= 0)
     753      263830 :         return unconstify(char *, s);   /* empty string is always valid */
     754             : 
     755    35464040 :     if (encoding == DatabaseEncoding->encoding ||
     756             :         encoding == PG_SQL_ASCII)
     757    35463488 :         return unconstify(char *, s);   /* assume data is valid */
     758             : 
     759         552 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     760             :     {
     761             :         /* No conversion is possible, but we must validate the result */
     762         168 :         (void) pg_verify_mbstr(encoding, s, len, false);
     763         168 :         return unconstify(char *, s);
     764             :     }
     765             : 
     766             :     /* Fast path if we can use cached conversion function */
     767         384 :     if (encoding == ClientEncoding->encoding)
     768         384 :         return perform_default_encoding_conversion(s, len, false);
     769             : 
     770             :     /* General case ... will not work outside transactions */
     771           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     772             :                                               len,
     773           0 :                                               DatabaseEncoding->encoding,
     774             :                                               encoding);
     775             : }
     776             : 
     777             : /*
     778             :  *  Perform default encoding conversion using cached FmgrInfo. Since
     779             :  *  this function does not access database at all, it is safe to call
     780             :  *  outside transactions.  If the conversion has not been set up by
     781             :  *  SetClientEncoding(), no conversion is performed.
     782             :  */
     783             : static char *
     784         444 : perform_default_encoding_conversion(const char *src, int len,
     785             :                                     bool is_client_to_server)
     786             : {
     787             :     char       *result;
     788             :     int         src_encoding,
     789             :                 dest_encoding;
     790             :     FmgrInfo   *flinfo;
     791             : 
     792         444 :     if (is_client_to_server)
     793             :     {
     794          60 :         src_encoding = ClientEncoding->encoding;
     795          60 :         dest_encoding = DatabaseEncoding->encoding;
     796          60 :         flinfo = ToServerConvProc;
     797             :     }
     798             :     else
     799             :     {
     800         384 :         src_encoding = DatabaseEncoding->encoding;
     801         384 :         dest_encoding = ClientEncoding->encoding;
     802         384 :         flinfo = ToClientConvProc;
     803             :     }
     804             : 
     805         444 :     if (flinfo == NULL)
     806           0 :         return unconstify(char *, src);
     807             : 
     808             :     /*
     809             :      * Allocate space for conversion result, being wary of integer overflow.
     810             :      * See comments in pg_do_encoding_conversion.
     811             :      */
     812         444 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     813           0 :         ereport(ERROR,
     814             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     815             :                  errmsg("out of memory"),
     816             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     817             :                            len)));
     818             : 
     819             :     result = (char *)
     820         444 :         MemoryContextAllocHuge(CurrentMemoryContext,
     821         444 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);
     822             : 
     823         444 :     FunctionCall6(flinfo,
     824             :                   Int32GetDatum(src_encoding),
     825             :                   Int32GetDatum(dest_encoding),
     826             :                   CStringGetDatum(src),
     827             :                   CStringGetDatum(result),
     828             :                   Int32GetDatum(len),
     829             :                   BoolGetDatum(false));
     830             : 
     831             :     /*
     832             :      * Release extra space if there might be a lot --- see comments in
     833             :      * pg_do_encoding_conversion.
     834             :      */
     835         444 :     if (len > 1000000)
     836             :     {
     837           0 :         Size        resultlen = strlen(result);
     838             : 
     839           0 :         if (resultlen >= MaxAllocSize)
     840           0 :             ereport(ERROR,
     841             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     842             :                      errmsg("out of memory"),
     843             :                      errdetail("String of %d bytes is too long for encoding conversion.",
     844             :                                len)));
     845             : 
     846           0 :         result = (char *) repalloc(result, resultlen + 1);
     847             :     }
     848             : 
     849         444 :     return result;
     850             : }
     851             : 
     852             : /*
     853             :  * Convert a single Unicode code point into a string in the server encoding.
     854             :  *
     855             :  * The code point given by "c" is converted and stored at *s, which must
     856             :  * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
     857             :  * The output will have a trailing '\0'.  Throws error if the conversion
     858             :  * cannot be performed.
     859             :  *
     860             :  * Note that this relies on having previously looked up any required
     861             :  * conversion function.  That's partly for speed but mostly because the parser
     862             :  * may call this outside any transaction, or in an aborted transaction.
     863             :  */
     864             : void
     865        1010 : pg_unicode_to_server(pg_wchar c, unsigned char *s)
     866             : {
     867             :     unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
     868             :     int         c_as_utf8_len;
     869             :     int         server_encoding;
     870             : 
     871             :     /*
     872             :      * Complain if invalid Unicode code point.  The choice of errcode here is
     873             :      * debatable, but really our caller should have checked this anyway.
     874             :      */
     875        1010 :     if (!is_valid_unicode_codepoint(c))
     876           0 :         ereport(ERROR,
     877             :                 (errcode(ERRCODE_SYNTAX_ERROR),
     878             :                  errmsg("invalid Unicode code point")));
     879             : 
     880             :     /* Otherwise, if it's in ASCII range, conversion is trivial */
     881        1010 :     if (c <= 0x7F)
     882             :     {
     883         352 :         s[0] = (unsigned char) c;
     884         352 :         s[1] = '\0';
     885        1010 :         return;
     886             :     }
     887             : 
     888             :     /* If the server encoding is UTF-8, we just need to reformat the code */
     889         658 :     server_encoding = GetDatabaseEncoding();
     890         658 :     if (server_encoding == PG_UTF8)
     891             :     {
     892         658 :         unicode_to_utf8(c, s);
     893         658 :         s[pg_utf_mblen(s)] = '\0';
     894         658 :         return;
     895             :     }
     896             : 
     897             :     /* For all other cases, we must have a conversion function available */
     898           0 :     if (Utf8ToServerConvProc == NULL)
     899           0 :         ereport(ERROR,
     900             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     901             :                  errmsg("conversion between %s and %s is not supported",
     902             :                         pg_enc2name_tbl[PG_UTF8].name,
     903             :                         GetDatabaseEncodingName())));
     904             : 
     905             :     /* Construct UTF-8 source string */
     906           0 :     unicode_to_utf8(c, c_as_utf8);
     907           0 :     c_as_utf8_len = pg_utf_mblen(c_as_utf8);
     908           0 :     c_as_utf8[c_as_utf8_len] = '\0';
     909             : 
     910             :     /* Convert, or throw error if we can't */
     911           0 :     FunctionCall6(Utf8ToServerConvProc,
     912             :                   Int32GetDatum(PG_UTF8),
     913             :                   Int32GetDatum(server_encoding),
     914             :                   CStringGetDatum((char *) c_as_utf8),
     915             :                   CStringGetDatum((char *) s),
     916             :                   Int32GetDatum(c_as_utf8_len),
     917             :                   BoolGetDatum(false));
     918             : }
     919             : 
     920             : /*
     921             :  * Convert a single Unicode code point into a string in the server encoding.
     922             :  *
     923             :  * Same as pg_unicode_to_server(), except that we don't throw errors,
     924             :  * but simply return false on conversion failure.
     925             :  */
     926             : bool
     927          84 : pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
     928             : {
     929             :     unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
     930             :     int         c_as_utf8_len;
     931             :     int         converted_len;
     932             :     int         server_encoding;
     933             : 
     934             :     /* Fail if invalid Unicode code point */
     935          84 :     if (!is_valid_unicode_codepoint(c))
     936           0 :         return false;
     937             : 
     938             :     /* Otherwise, if it's in ASCII range, conversion is trivial */
     939          84 :     if (c <= 0x7F)
     940             :     {
     941          24 :         s[0] = (unsigned char) c;
     942          24 :         s[1] = '\0';
     943          24 :         return true;
     944             :     }
     945             : 
     946             :     /* If the server encoding is UTF-8, we just need to reformat the code */
     947          60 :     server_encoding = GetDatabaseEncoding();
     948          60 :     if (server_encoding == PG_UTF8)
     949             :     {
     950          60 :         unicode_to_utf8(c, s);
     951          60 :         s[pg_utf_mblen(s)] = '\0';
     952          60 :         return true;
     953             :     }
     954             : 
     955             :     /* For all other cases, we must have a conversion function available */
     956           0 :     if (Utf8ToServerConvProc == NULL)
     957           0 :         return false;
     958             : 
     959             :     /* Construct UTF-8 source string */
     960           0 :     unicode_to_utf8(c, c_as_utf8);
     961           0 :     c_as_utf8_len = pg_utf_mblen(c_as_utf8);
     962           0 :     c_as_utf8[c_as_utf8_len] = '\0';
     963             : 
     964             :     /* Convert, but without throwing error if we can't */
     965           0 :     converted_len = DatumGetInt32(FunctionCall6(Utf8ToServerConvProc,
     966             :                                                 Int32GetDatum(PG_UTF8),
     967             :                                                 Int32GetDatum(server_encoding),
     968             :                                                 CStringGetDatum((char *) c_as_utf8),
     969             :                                                 CStringGetDatum((char *) s),
     970             :                                                 Int32GetDatum(c_as_utf8_len),
     971             :                                                 BoolGetDatum(true)));
     972             : 
     973             :     /* Conversion was successful iff it consumed the whole input */
     974           0 :     return (converted_len == c_as_utf8_len);
     975             : }
     976             : 
     977             : 
     978             : /* convert a multibyte string to a wchar */
     979             : int
     980           0 : pg_mb2wchar(const char *from, pg_wchar *to)
     981             : {
     982           0 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
     983             : }
     984             : 
     985             : /* convert a multibyte string to a wchar with a limited length */
     986             : int
     987     7127556 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
     988             : {
     989     7127556 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     990             : }
     991             : 
     992             : /* same, with any encoding */
     993             : int
     994       18280 : pg_encoding_mb2wchar_with_len(int encoding,
     995             :                               const char *from, pg_wchar *to, int len)
     996             : {
     997       18280 :     return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     998             : }
     999             : 
    1000             : /* convert a wchar string to a multibyte */
    1001             : int
    1002           0 : pg_wchar2mb(const pg_wchar *from, char *to)
    1003             : {
    1004           0 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
    1005             : }
    1006             : 
    1007             : /* convert a wchar string to a multibyte with a limited length */
    1008             : int
    1009     1115446 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
    1010             : {
    1011     1115446 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
    1012             : }
    1013             : 
    1014             : /* same, with any encoding */
    1015             : int
    1016           0 : pg_encoding_wchar2mb_with_len(int encoding,
    1017             :                               const pg_wchar *from, char *to, int len)
    1018             : {
    1019           0 :     return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
    1020             : }
    1021             : 
    1022             : /* returns the byte length of a multibyte character */
    1023             : int
    1024   253098260 : pg_mblen(const char *mbstr)
    1025             : {
    1026   253098260 :     return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1027             : }
    1028             : 
    1029             : /* returns the display length of a multibyte character */
    1030             : int
    1031        8724 : pg_dsplen(const char *mbstr)
    1032             : {
    1033        8724 :     return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
    1034             : }
    1035             : 
    1036             : /* returns the length (counted in wchars) of a multibyte string */
    1037             : int
    1038         702 : pg_mbstrlen(const char *mbstr)
    1039             : {
    1040         702 :     int         len = 0;
    1041             : 
    1042             :     /* optimization for single byte encoding */
    1043         702 :     if (pg_database_encoding_max_length() == 1)
    1044           0 :         return strlen(mbstr);
    1045             : 
    1046        1626 :     while (*mbstr)
    1047             :     {
    1048         924 :         mbstr += pg_mblen(mbstr);
    1049         924 :         len++;
    1050             :     }
    1051         702 :     return len;
    1052             : }
    1053             : 
    1054             : /* returns the length (counted in wchars) of a multibyte string
    1055             :  * (not necessarily NULL terminated)
    1056             :  */
    1057             : int
    1058     1605078 : pg_mbstrlen_with_len(const char *mbstr, int limit)
    1059             : {
    1060     1605078 :     int         len = 0;
    1061             : 
    1062             :     /* optimization for single byte encoding */
    1063     1605078 :     if (pg_database_encoding_max_length() == 1)
    1064      400014 :         return limit;
    1065             : 
    1066   221229530 :     while (limit > 0 && *mbstr)
    1067             :     {
    1068   220024466 :         int         l = pg_mblen(mbstr);
    1069             : 
    1070   220024466 :         limit -= l;
    1071   220024466 :         mbstr += l;
    1072   220024466 :         len++;
    1073             :     }
    1074     1205064 :     return len;
    1075             : }
    1076             : 
    1077             : /*
    1078             :  * returns the byte length of a multibyte string
    1079             :  * (not necessarily NULL terminated)
    1080             :  * that is no longer than limit.
    1081             :  * this function does not break multibyte character boundary.
    1082             :  */
    1083             : int
    1084      309616 : pg_mbcliplen(const char *mbstr, int len, int limit)
    1085             : {
    1086      309616 :     return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
    1087             :                                  len, limit);
    1088             : }
    1089             : 
    1090             : /*
    1091             :  * pg_mbcliplen with specified encoding; string must be valid in encoding
    1092             :  */
    1093             : int
    1094      309616 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
    1095             :                       int len, int limit)
    1096             : {
    1097             :     mblen_converter mblen_fn;
    1098      309616 :     int         clen = 0;
    1099             :     int         l;
    1100             : 
    1101             :     /* optimization for single byte encoding */
    1102      309616 :     if (pg_encoding_max_length(encoding) == 1)
    1103       37162 :         return cliplen(mbstr, len, limit);
    1104             : 
    1105      272454 :     mblen_fn = pg_wchar_table[encoding].mblen;
    1106             : 
    1107     2943332 :     while (len > 0 && *mbstr)
    1108             :     {
    1109     2804022 :         l = (*mblen_fn) ((const unsigned char *) mbstr);
    1110     2804022 :         if ((clen + l) > limit)
    1111          94 :             break;
    1112     2803928 :         clen += l;
    1113     2803928 :         if (clen == limit)
    1114      133050 :             break;
    1115     2670878 :         len -= l;
    1116     2670878 :         mbstr += l;
    1117             :     }
    1118      272454 :     return clen;
    1119             : }
    1120             : 
    1121             : /*
    1122             :  * Similar to pg_mbcliplen except the limit parameter specifies the
    1123             :  * character length, not the byte length.
                        *
                        * Returns the number of bytes occupied by at most "limit" characters at
                        * the start of mbstr, measured with pg_mblen().  Scanning stops once
                        * "len" bytes have been consumed or a NUL byte is reached.
    1124             :  */
    1125             : int
    1126         528 : pg_mbcharcliplen(const char *mbstr, int len, int limit)
    1127             : {
    1128         528 :     int         clen = 0;
    1129         528 :     int         nch = 0;
    1130             :     int         l;
    1131             : 
    1132             :     /* optimization for single byte encoding */
    1133         528 :     if (pg_database_encoding_max_length() == 1)
    1134           0 :         return cliplen(mbstr, len, limit);
    1135             : 
    1136        2328 :     while (len > 0 && *mbstr)
    1137             :     {
    1138        2310 :         l = pg_mblen(mbstr);
                               /* count this character first; quit if it is one too many */
    1139        2310 :         nch++;
    1140        2310 :         if (nch > limit)
    1141         510 :             break;
    1142        1800 :         clen += l;
    1143        1800 :         len -= l;
    1144        1800 :         mbstr += l;
    1145             :     }
    1146         528 :     return clen;
    1147             : }
    1148             : 
    1149             : /* mbcliplen for any single-byte encoding */
                       /*
                        * Returns the number of bytes preceding the first NUL in str, capped at
                        * the smaller of "len" and "limit".
                        */
    1150             : static int
    1151       37162 : cliplen(const char *str, int len, int limit)
    1152             : {
    1153       37162 :     int         l = 0;
    1154             : 
                           /* never look further than either bound allows */
    1155       37162 :     len = Min(len, limit);
    1156      289172 :     while (l < len && str[l])
    1157      252010 :         l++;
    1158       37162 :     return l;
    1159             : }
    1160             : 
                       /*
                        * Make "encoding" the database encoding by pointing DatabaseEncoding at
                        * its pg_enc2name_tbl[] entry.
                        *
                        * Raises elog(ERROR) if PG_VALID_BE_ENCODING() rejects the ID.
                        */
    1161             : void
    1162       31872 : SetDatabaseEncoding(int encoding)
    1163             : {
    1164       31872 :     if (!PG_VALID_BE_ENCODING(encoding))
    1165           0 :         elog(ERROR, "invalid database encoding: %d", encoding);
    1166             : 
    1167       31872 :     DatabaseEncoding = &pg_enc2name_tbl[encoding];
                           /* the table must be indexable by encoding ID */
    1168             :     Assert(DatabaseEncoding->encoding == encoding);
    1169       31872 : }
    1170             : 
                       /*
                        * Make "encoding" the message encoding by pointing MessageEncoding at
                        * its pg_enc2name_tbl[] entry.
                        *
                        * Unlike SetDatabaseEncoding(), the ID is checked only by assertion,
                        * because this can run before elog() is usable.
                        */
    1171             : void
    1172       35594 : SetMessageEncoding(int encoding)
    1173             : {
    1174             :     /* Some calls happen before we can elog()! */
    1175             :     Assert(PG_VALID_ENCODING(encoding));
    1176             : 
    1177       35594 :     MessageEncoding = &pg_enc2name_tbl[encoding];
                           /* the table must be indexable by encoding ID */
    1178             :     Assert(MessageEncoding->encoding == encoding);
    1179       35594 : }
    1180             : 
    1181             : #ifdef ENABLE_NLS
    1182             : /*
    1183             :  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
    1184             :  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
    1185             :  * fail for gettext-internal causes like out-of-memory.
                        *
                        * Returns true if the codeset was bound.  Returns false, silently, when
                        * "encoding" is not a valid ID or has no entry in pg_enc2gettext_tbl[].
                        * Returns false after logging when bind_textdomain_codeset() itself
                        * returns NULL; the message goes through elog(LOG) if a memory context
                        * exists, and to stderr otherwise.
    1186             :  */
    1187             : static bool
    1188        3146 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
    1189             : {
                           /* elog() is only attempted once CurrentMemoryContext has been set up */
    1190        3146 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1191             : 
                           /* validate the ID before using it as a table index */
    1192        3146 :     if (!PG_VALID_ENCODING(encoding) || pg_enc2gettext_tbl[encoding] == NULL)
    1193           0 :         return false;
    1194             : 
    1195        3146 :     if (bind_textdomain_codeset(domainname,
    1196             :                                 pg_enc2gettext_tbl[encoding]) != NULL)
    1197        3146 :         return true;
    1198             : 
    1199           0 :     if (elog_ok)
    1200           0 :         elog(LOG, "bind_textdomain_codeset failed");
    1201             :     else
    1202           0 :         write_stderr("bind_textdomain_codeset failed");
    1203             : 
    1204           0 :     return false;
    1205             : }
    1206             : 
    1207             : /*
    1208             :  * Bind a gettext message domain to the codeset corresponding to the database
    1209             :  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
    1210             :  * Return the MessageEncoding implied by the new settings.
    1211             :  *
    1212             :  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
    1213             :  * When that matches the database encoding, we don't need to do anything.  In
    1214             :  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
    1215             :  * database encoding, except for the C locale.  (On Windows, we also permit a
    1216             :  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
    1217             :  * gettext to the right codeset.
    1218             :  *
    1219             :  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
    1220             :  * convenient departure for software that passes the strings to Windows ANSI
    1221             :  * APIs, but we don't do that.  Compel gettext to use database encoding or,
    1222             :  * failing that, the LC_CTYPE encoding as it would on other platforms.
    1223             :  *
    1224             :  * This function is called before elog() and palloc() are usable.
                        *
                        * Note that this function only performs the binding and reports the
                        * resulting encoding ID; it does not itself call SetMessageEncoding().
    1225             :  */
    1226             : int
    1227       39256 : pg_bind_textdomain_codeset(const char *domainname)
    1228             : {
    1229       39256 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1230       39256 :     int         encoding = GetDatabaseEncoding();
    1231             :     int         new_msgenc;
    1232             : 
                           /*
                            * Outside Windows, the explicit bind to the database encoding is tried
                            * only under the C/POSIX locale.  On Windows the locale test is compiled
                            * out, so the bind below is attempted unconditionally.
                            */
    1233             : #ifndef WIN32
    1234       39256 :     const char *ctype = setlocale(LC_CTYPE, NULL);
    1235             : 
    1236       39256 :     if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
    1237             : #endif
    1238        7070 :         if (encoding != PG_SQL_ASCII &&
    1239        3146 :             raw_pg_bind_textdomain_codeset(domainname, encoding))
    1240        3146 :             return encoding;
    1241             : 
                           /*
                            * Otherwise use the encoding that pg_get_encoding_from_locale() reports,
                            * treating a negative (failure) result as SQL_ASCII.
                            */
    1242       36110 :     new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
    1243       36110 :     if (new_msgenc < 0)
    1244           0 :         new_msgenc = PG_SQL_ASCII;
    1245             : 
    1246             : #ifdef WIN32
    1247             :     if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
    1248             :         /* On failure, the old message encoding remains valid. */
    1249             :         return GetMessageEncoding();
    1250             : #endif
    1251             : 
    1252       36110 :     return new_msgenc;
    1253             : }
    1254             : #endif
    1255             : 
    1256             : /*
    1257             :  * The database encoding, also called the server encoding, represents the
    1258             :  * encoding of data stored in text-like data types.  Affected types include
    1259             :  * cstring, text, varchar, name, xml, and json.
                        *
                        * Returns the encoding ID of the entry DatabaseEncoding currently
                        * points at.
    1260             :  */
    1261             : int
    1262    14986190 : GetDatabaseEncoding(void)
    1263             : {
    1264    14986190 :     return DatabaseEncoding->encoding;
    1265             : }
    1266             : 
                       /*
                        * Returns the name string of the current database encoding.  The pointer
                        * refers to the encoding's table entry, so the caller must not free it.
                        */
    1267             : const char *
    1268       66674 : GetDatabaseEncodingName(void)
    1269             : {
    1270       66674 :     return DatabaseEncoding->name;
    1271             : }
    1272             : 
                       /*
                        * Function-manager wrapper: returns the database encoding's name as the
                        * Datum produced by passing it through namein().
                        */
    1273             : Datum
    1274          88 : getdatabaseencoding(PG_FUNCTION_ARGS)
    1275             : {
    1276          88 :     return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
    1277             : }
    1278             : 
                       /*
                        * Function-manager wrapper: returns the client encoding's name as the
                        * Datum produced by passing it through namein().
                        */
    1279             : Datum
    1280           0 : pg_client_encoding(PG_FUNCTION_ARGS)
    1281             : {
    1282           0 :     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
    1283             : }
    1284             : 
                       /*
                        * Function-manager wrapper around pg_char_to_encoding(): takes an
                        * encoding name (argument 0, of type Name) and returns whatever int32
                        * that function yields for it.
                        */
    1285             : Datum
    1286          36 : PG_char_to_encoding(PG_FUNCTION_ARGS)
    1287             : {
    1288          36 :     Name        s = PG_GETARG_NAME(0);
    1289             : 
    1290          36 :     PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
    1291             : }
    1292             : 
                       /*
                        * Function-manager wrapper around pg_encoding_to_char(): takes an int32
                        * encoding ID (argument 0) and returns the corresponding name string as
                        * the Datum produced by namein().
                        */
    1293             : Datum
    1294        4912 : PG_encoding_to_char(PG_FUNCTION_ARGS)
    1295             : {
    1296        4912 :     int32       encoding = PG_GETARG_INT32(0);
    1297        4912 :     const char *encoding_name = pg_encoding_to_char(encoding);
    1298             : 
    1299        4912 :     return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
    1300             : }
    1301             : 
    1302             : /*
    1303             :  * gettext() returns messages in this encoding.  This often matches the
    1304             :  * database encoding, but it differs for SQL_ASCII databases, for processes
    1305             :  * not attached to a database, and under a database encoding lacking iconv
    1306             :  * support (MULE_INTERNAL).
                        *
                        * Returns the encoding ID of the entry MessageEncoding currently
                        * points at.
    1307             :  */
    1308             : int
    1309           0 : GetMessageEncoding(void)
    1310             : {
    1311           0 :     return MessageEncoding->encoding;
    1312             : }
    1313             : 
    1314             : 
    1315             : /*
    1316             :  * Generic character incrementer function.
    1317             :  *
    1318             :  * Not knowing anything about the properties of the encoding in use, we just
    1319             :  * keep incrementing the last byte until we get a validly-encoded result,
    1320             :  * or we run out of values to try.  We don't bother to try incrementing
    1321             :  * higher-order bytes, so there's no growth in runtime for wider characters.
    1322             :  * (If we did try to do that, we'd need to consider the likelihood that 255
    1323             :  * is not a valid final byte in the encoding.)
                        *
                        * charptr/len describe one character, which is modified in place.
                        * Returns true once the database encoding's character verifier accepts
                        * the modified bytes as a character of exactly "len" bytes.  Returns
                        * false if the last byte reaches 255 first; in that case the last byte
                        * is left at 255 rather than restored.
    1324             :  */
    1325             : static bool
    1326         104 : pg_generic_charinc(unsigned char *charptr, int len)
    1327             : {
    1328         104 :     unsigned char *lastbyte = charptr + len - 1;
    1329             :     mbchar_verifier mbverify;
    1330             : 
    1331             :     /* We can just invoke the character verifier directly. */
    1332         104 :     mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
    1333             : 
    1334         104 :     while (*lastbyte < (unsigned char) 255)
    1335             :     {
    1336         104 :         (*lastbyte)++;
                               /* accept only if the bytes still form one character of the same length */
    1337         104 :         if ((*mbverify) (charptr, len) == len)
    1338         104 :             return true;
    1339             :     }
    1340             : 
    1341           0 :     return false;
    1342             : }
    1343             : 
    1344             : /*
    1345             :  * UTF-8 character incrementer function.
    1346             :  *
    1347             :  * For a one-byte character less than 0x7F, we just increment the byte.
    1348             :  *
    1349             :  * For a multibyte character, every byte but the first must fall between 0x80
    1350             :  * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
    1351             :  * the last byte that's not already at its maximum value.  If we can't find a
    1352             :  * byte that's less than the maximum allowable value, we simply fail.  We also
    1353             :  * need some special-case logic to skip regions used for surrogate pair
    1354             :  * handling, as those should not occur in valid UTF-8.
    1355             :  *
    1356             :  * Note that we don't reset lower-order bytes back to their minimums, since
    1357             :  * we can't afford to make an exhaustive search (see make_greater_string).
                        *
                        * charptr/length describe one character, which is modified in place.
                        * Returns true if some byte was incremented, false if none could be (in
                        * which case the bytes are left untouched).
                        *
                        * The switch enters at the case matching the character's length and
                        * falls through toward the lead byte whenever the byte examined is
                        * already at its maximum.
    1358             :  */
    1359             : static bool
    1360        3416 : pg_utf8_increment(unsigned char *charptr, int length)
    1361             : {
    1362             :     unsigned char a;
    1363             :     unsigned char limit;
    1364             : 
    1365        3416 :     switch (length)
    1366             :     {
    1367           0 :         default:
    1368             :             /* reject lengths 5 and 6 for now */
    1369           0 :             return false;
    1370           0 :         case 4:
                                   /* fourth byte: a continuation byte, maximum 0xBF */
    1371           0 :             a = charptr[3];
    1372           0 :             if (a < 0xBF)
    1373             :             {
    1374           0 :                 charptr[3]++;
    1375           0 :                 break;
    1376             :             }
    1377             :             /* FALL THRU */
    1378             :         case 3:
                                   /* third byte: a continuation byte, maximum 0xBF */
    1379           0 :             a = charptr[2];
    1380           0 :             if (a < 0xBF)
    1381             :             {
    1382           0 :                 charptr[2]++;
    1383           0 :                 break;
    1384             :             }
    1385             :             /* FALL THRU */
    1386             :         case 2:
                                   /* second byte: its maximum depends on the lead byte */
    1387           0 :             a = charptr[1];
    1388           0 :             switch (*charptr)
    1389             :             {
    1390           0 :                 case 0xED:
                                           /* anything above 0x9F here would land in the surrogate range */
    1391           0 :                     limit = 0x9F;
    1392           0 :                     break;
    1393           0 :                 case 0xF4:
                                           /* anything above 0x8F here would exceed U+10FFFF */
    1394           0 :                     limit = 0x8F;
    1395           0 :                     break;
    1396           0 :                 default:
    1397           0 :                     limit = 0xBF;
    1398           0 :                     break;
    1399             :             }
    1400           0 :             if (a < limit)
    1401             :             {
    1402           0 :                 charptr[1]++;
    1403           0 :                 break;
    1404             :             }
    1405             :             /* FALL THRU */
    1406             :         case 1:
                                   /*
                                    * Lead byte.  0x7F, 0xDF, 0xEF and 0xF4 are the largest lead
                                    * bytes of 1-, 2-, 3- and 4-byte characters respectively, so
                                    * incrementing any of them would change the character's length
                                    * class (or leave UTF-8 altogether); give up instead.
                                    */
    1407        3416 :             a = *charptr;
    1408        3416 :             if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
    1409           0 :                 return false;
    1410        3416 :             charptr[0]++;
    1411        3416 :             break;
    1412             :     }
    1413             : 
    1414        3416 :     return true;
    1415             : }
    1416             : 
    1417             : /*
    1418             :  * EUC-JP character incrementer function.
    1419             :  *
    1420             :  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
    1421             :  * representing JIS X 0201 characters with the second byte ranging between
    1422             :  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
    1423             :  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
    1424             :  *
    1425             :  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
    1426             :  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
    1427             :  * is incremented if possible, otherwise the second-to-last byte.
    1428             :  *
    1429             :  * If the sequence starts with a value other than the above and its MSB
    1430             :  * is set, it must be a two-byte sequence representing JIS X 0208 characters
    1431             :  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
    1432             :  * incremented if possible, otherwise the second-to-last byte.
    1433             :  *
    1434             :  * Otherwise, the sequence is a single-byte ASCII character. It is
    1435             :  * incremented up to 0x7f.
                        *
                        * charptr/length describe one character, which is modified in place.
                        * Returns true if the character was changed to a larger one, false if
                        * "length" does not match the lead byte or no byte could be raised.
                        *
                        * In every multibyte case a byte found below 0xa1 is raised directly to
                        * 0xa1, the bottom of the valid range, rather than incremented by one.
    1436             :  */
    1437             : static bool
    1438           0 : pg_eucjp_increment(unsigned char *charptr, int length)
    1439             : {
    1440             :     unsigned char c1,
    1441             :                 c2;
    1442             :     int         i;
    1443             : 
    1444           0 :     c1 = *charptr;
    1445             : 
    1446           0 :     switch (c1)
    1447             :     {
    1448           0 :         case SS2:               /* JIS X 0201 */
    1449           0 :             if (length != 2)
    1450           0 :                 return false;
    1451             : 
    1452           0 :             c2 = charptr[1];
    1453             : 
                                   /*
                                    * Past the top of the SS2 range: replace the lead byte too, which
                                    * turns the result into a plain two-byte (non-SS2) sequence.
                                    */
    1454           0 :             if (c2 >= 0xdf)
    1455           0 :                 charptr[0] = charptr[1] = 0xa1;
    1456           0 :             else if (c2 < 0xa1)
    1457           0 :                 charptr[1] = 0xa1;
    1458             :             else
    1459           0 :                 charptr[1]++;
    1460           0 :             break;
    1461             : 
    1462           0 :         case SS3:               /* JIS X 0212 */
    1463           0 :             if (length != 3)
    1464           0 :                 return false;
    1465             : 
                                   /* try the last byte, then the middle one; the SS3 lead is never touched */
    1466           0 :             for (i = 2; i > 0; i--)
    1467             :             {
    1468           0 :                 c2 = charptr[i];
    1469           0 :                 if (c2 < 0xa1)
    1470             :                 {
    1471           0 :                     charptr[i] = 0xa1;
    1472           0 :                     return true;
    1473             :                 }
    1474           0 :                 else if (c2 < 0xfe)
    1475             :                 {
    1476           0 :                     charptr[i]++;
    1477           0 :                     return true;
    1478             :                 }
    1479             :             }
    1480             : 
    1481             :             /* Out of 3-byte code region */
    1482           0 :             return false;
    1483             : 
    1484           0 :         default:
    1485           0 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1486             :             {
    1487           0 :                 if (length != 2)
    1488           0 :                     return false;
    1489             : 
                                       /* try the second byte, then the lead byte itself */
    1490           0 :                 for (i = 1; i >= 0; i--)
    1491             :                 {
    1492           0 :                     c2 = charptr[i];
    1493           0 :                     if (c2 < 0xa1)
    1494             :                     {
    1495           0 :                         charptr[i] = 0xa1;
    1496           0 :                         return true;
    1497             :                     }
    1498           0 :                     else if (c2 < 0xfe)
    1499             :                     {
    1500           0 :                         charptr[i]++;
    1501           0 :                         return true;
    1502             :                     }
    1503             :                 }
    1504             : 
    1505             :                 /* Out of 2 byte code region */
    1506           0 :                 return false;
    1507             :             }
    1508             :             else
    1509             :             {                   /* ASCII, single byte */
                                       /* 0x7f is the last single-byte value; it cannot be raised */
    1510           0 :                 if (c1 > 0x7e)
    1511           0 :                     return false;
    1512           0 :                 (*charptr)++;
    1513             :             }
    1514           0 :             break;
    1515             :     }
    1516             : 
    1517           0 :     return true;
    1518             : }
    1519             : 
    1520             : /*
    1521             :  * get the character incrementer for the encoding for the current database
                        *
                        * UTF8 and EUC_JP have dedicated incrementers; every other encoding
                        * gets the generic one that only varies the last byte.
    1522             :  */
    1523             : mbcharacter_incrementer
    1524        3520 : pg_database_encoding_character_incrementer(void)
    1525             : {
    1526             :     /*
    1527             :      * Eventually it might be best to add a field to pg_wchar_table[], but for
    1528             :      * now we just use a switch.
    1529             :      */
    1530        3520 :     switch (GetDatabaseEncoding())
    1531             :     {
    1532        3416 :         case PG_UTF8:
    1533        3416 :             return pg_utf8_increment;
    1534             : 
    1535           0 :         case PG_EUC_JP:
    1536           0 :             return pg_eucjp_increment;
    1537             : 
    1538         104 :         default:
    1539         104 :             return pg_generic_charinc;
    1540             :     }
    1541             : }
    1542             : 
    1543             : /*
    1544             :  * fetch maximum length of the encoding for the current database
                        *
                        * This is the maxmblen field of the database encoding's pg_wchar_table[]
                        * entry, i.e. the largest number of bytes one character can occupy.
    1545             :  */
    1546             : int
    1547    12800606 : pg_database_encoding_max_length(void)
    1548             : {
    1549    12800606 :     return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
    1550             : }
    1551             : 
    1552             : /*
    1553             :  * Verify mbstr to make sure that it is validly encoded in the current
    1554             :  * database encoding.  Otherwise same as pg_verify_mbstr().
                        *
                        * Returns true if all "len" bytes are valid.  On invalid input, returns
                        * false when noError is true and raises an error otherwise.
    1555             :  */
    1556             : bool
    1557        4446 : pg_verifymbstr(const char *mbstr, int len, bool noError)
    1558             : {
    1559        4446 :     return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
    1560             : }
    1561             : 
    1562             : /*
    1563             :  * Verify mbstr to make sure that it is validly encoded in the specified
    1564             :  * encoding.
                        *
                        * mbstr need not be NUL-terminated; exactly "len" bytes are checked.
                        *
                        * Returns true if all "len" bytes are valid.  On invalid input, returns
                        * false when noError is true; otherwise report_invalid_encoding() raises
                        * an error describing the first offending bytes.
    1565             :  */
    1566             : bool
    1567     1128170 : pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
    1568             : {
    1569             :     int         oklen;
    1570             : 
    1571             :     Assert(PG_VALID_ENCODING(encoding));
    1572             : 
                           /* oklen is the length of the valid prefix; anything short of len is a failure */
    1573     1128170 :     oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
    1574     1128170 :     if (oklen != len)
    1575             :     {
    1576           8 :         if (noError)
    1577           0 :             return false;
                               /* does not return: it ends in ereport(ERROR) */
    1578           8 :         report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
    1579             :     }
    1580     1128162 :     return true;
    1581             : }
    1582             : 
    1583             : /*
    1584             :  * Verify mbstr to make sure that it is validly encoded in the specified
    1585             :  * encoding.
    1586             :  *
    1587             :  * mbstr is not necessarily zero terminated; length of mbstr is
    1588             :  * specified by len.
    1589             :  *
    1590             :  * If OK, return length of string in the encoding.
                        * (That length is a count of characters, not bytes.)
    1591             :  * If a problem is found, return -1 when noError is
    1592             :  * true; when noError is false, ereport() a descriptive message.
    1593             :  *
    1594             :  * Note: We cannot use the faster encoding-specific mbverifystr() function
    1595             :  * here, because we need to count the number of characters in the string.
                        *
                        * An embedded NUL byte counts as a problem in every encoding.
    1596             :  */
    1597             : int
    1598           0 : pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
    1599             : {
    1600             :     mbchar_verifier mbverifychar;
    1601             :     int         mb_len;
    1602             : 
    1603             :     Assert(PG_VALID_ENCODING(encoding));
    1604             : 
    1605             :     /*
    1606             :      * In single-byte encodings, we need only reject nulls (\0).
    1607             :      */
    1608           0 :     if (pg_encoding_max_length(encoding) <= 1)
    1609             :     {
    1610           0 :         const char *nullpos = memchr(mbstr, 0, len);
    1611             : 
                               /* no NUL found: every byte is one character */
    1612           0 :         if (nullpos == NULL)
    1613           0 :             return len;
    1614           0 :         if (noError)
    1615           0 :             return -1;
                               /* does not return: it ends in ereport(ERROR) */
    1616           0 :         report_invalid_encoding(encoding, nullpos, 1);
    1617             :     }
    1618             : 
    1619             :     /* fetch function pointer just once */
    1620           0 :     mbverifychar = pg_wchar_table[encoding].mbverifychar;
    1621             : 
    1622           0 :     mb_len = 0;
    1623             : 
    1624           0 :     while (len > 0)
    1625             :     {
    1626             :         int         l;
    1627             : 
    1628             :         /* fast path for ASCII-subset characters */
    1629           0 :         if (!IS_HIGHBIT_SET(*mbstr))
    1630             :         {
    1631           0 :             if (*mbstr != '\0')
    1632             :             {
    1633           0 :                 mb_len++;
    1634           0 :                 mbstr++;
    1635           0 :                 len--;
    1636           0 :                 continue;
    1637             :             }
                                   /* a NUL byte inside the counted range is invalid */
    1638           0 :             if (noError)
    1639           0 :                 return -1;
    1640           0 :             report_invalid_encoding(encoding, mbstr, len);
    1641             :         }
    1642             : 
                               /* multibyte character: the verifier yields its length, or a negative value */
    1643           0 :         l = (*mbverifychar) ((const unsigned char *) mbstr, len);
    1644             : 
    1645           0 :         if (l < 0)
    1646             :         {
    1647           0 :             if (noError)
    1648           0 :                 return -1;
    1649           0 :             report_invalid_encoding(encoding, mbstr, len);
    1650             :         }
    1651             : 
    1652           0 :         mbstr += l;
    1653           0 :         len -= l;
    1654           0 :         mb_len++;
    1655             :     }
    1656           0 :     return mb_len;
    1657             : }
    1658             : 
    1659             : /*
    1660             :  * check_encoding_conversion_args: check arguments of a conversion function
    1661             :  *
    1662             :  * "expected" arguments can be either an encoding ID or -1 to indicate that
    1663             :  * the caller will check whether it accepts the ID.
    1664             :  *
    1665             :  * Note: the errors here are not really user-facing, so elog instead of
    1666             :  * ereport seems sufficient.  Also, we trust that the "expected" encoding
    1667             :  * arguments are valid encoding IDs, but we don't trust the actuals.
                        *
                        * Raises elog(ERROR) if either actual encoding ID is invalid, if an
                        * actual ID differs from a non-negative expected ID, or if len is
                        * negative.  Returns normally otherwise.
                        *
                        * Each actual ID is validated before it is used to index
                        * pg_enc2name_tbl[] for the mismatch message.
    1668             :  */
    1669             : void
    1670        7066 : check_encoding_conversion_args(int src_encoding,
    1671             :                                int dest_encoding,
    1672             :                                int len,
    1673             :                                int expected_src_encoding,
    1674             :                                int expected_dest_encoding)
    1675             : {
    1676        7066 :     if (!PG_VALID_ENCODING(src_encoding))
    1677           0 :         elog(ERROR, "invalid source encoding ID: %d", src_encoding);
    1678        7066 :     if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
    1679           0 :         elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
    1680             :              pg_enc2name_tbl[expected_src_encoding].name,
    1681             :              pg_enc2name_tbl[src_encoding].name);
    1682        7066 :     if (!PG_VALID_ENCODING(dest_encoding))
    1683           0 :         elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
    1684        7066 :     if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
    1685           0 :         elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
    1686             :              pg_enc2name_tbl[expected_dest_encoding].name,
    1687             :              pg_enc2name_tbl[dest_encoding].name);
    1688        7066 :     if (len < 0)
    1689           0 :         elog(ERROR, "encoding conversion length must not be negative");
    1690        7066 : }
    1691             : 
    1692             : /*
    1693             :  * report_invalid_encoding: complain about invalid multibyte character
    1694             :  *
    1695             :  * note: len is remaining length of string, not length of character;
    1696             :  * len must be greater than zero (or we'd neglect initializing "buf").
                        *
                        * Always raises ERRCODE_CHARACTER_NOT_IN_REPERTOIRE via ereport(ERROR).
                        * The message shows the offending bytes in hex ("0xNN", separated by
                        * spaces): as many as pg_encoding_mblen_or_incomplete() reports for the
                        * character at mbstr, but never more than "len" and never more than 8.
    1697             :  */
    1698             : void
    1699        2990 : report_invalid_encoding(int encoding, const char *mbstr, int len)
    1700             : {
    1701        2990 :     int         l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
                           /* room for 8 bytes at up to 5 output characters each, plus the terminator */
    1702             :     char        buf[8 * 5 + 1];
    1703        2990 :     char       *p = buf;
    1704             :     int         j,
    1705             :                 jlimit;
    1706             : 
    1707        2990 :     jlimit = Min(l, len);
    1708        2990 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1709             : 
    1710        9226 :     for (j = 0; j < jlimit; j++)
    1711             :     {
    1712        6236 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
                               /* separator between bytes, but none after the last one */
    1713        6236 :         if (j < jlimit - 1)
    1714        3246 :             p += sprintf(p, " ");
    1715             :     }
    1716             : 
    1717        2990 :     ereport(ERROR,
    1718             :             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
    1719             :              errmsg("invalid byte sequence for encoding \"%s\": %s",
    1720             :                     pg_enc2name_tbl[encoding].name,
    1721             :                     buf)));
    1722             : }
    1723             : 
    1724             : /*
    1725             :  * report_untranslatable_char: complain about untranslatable character
    1726             :  *
    1727             :  * note: len is remaining length of string, not length of character;
    1728             :  * len must be greater than zero (or we'd neglect initializing "buf").
                        *
                        * Always raises ERRCODE_UNTRANSLATABLE_CHARACTER via ereport(ERROR).
                        * The message shows the character's bytes in hex ("0xNN", separated by
                        * spaces), limited to "len" and to 8 bytes, together with the names of
                        * the source and destination encodings.
    1729             :  */
    1730             : void
    1731         936 : report_untranslatable_char(int src_encoding, int dest_encoding,
    1732             :                            const char *mbstr, int len)
    1733             : {
    1734             :     int         l;
                           /* room for 8 bytes at up to 5 output characters each, plus the terminator */
    1735             :     char        buf[8 * 5 + 1];
    1736         936 :     char       *p = buf;
    1737             :     int         j,
    1738             :                 jlimit;
    1739             : 
    1740             :     /*
    1741             :      * We probably could use plain pg_encoding_mblen(), because
    1742             :      * gb18030_to_utf8() verifies before it converts.  All conversions should.
    1743             :      * For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs.  Even
    1744             :      * so, be defensive, since a buggy conversion might pass invalid data.
    1745             :      * This is not a performance-critical path.
    1746             :      */
    1747         936 :     l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len);
    1748         936 :     jlimit = Min(l, len);
    1749         936 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1750             : 
    1751        3528 :     for (j = 0; j < jlimit; j++)
    1752             :     {
    1753        2592 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
                               /* separator between bytes, but none after the last one */
    1754        2592 :         if (j < jlimit - 1)
    1755        1656 :             p += sprintf(p, " ");
    1756             :     }
    1757             : 
    1758         936 :     ereport(ERROR,
    1759             :             (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
    1760             :              errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
    1761             :                     buf,
    1762             :                     pg_enc2name_tbl[src_encoding].name,
    1763             :                     pg_enc2name_tbl[dest_encoding].name)));
    1764             : }
    1765             : 
    1766             : 
    1767             : #ifdef WIN32
    1768             : /*
    1769             :  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
    1770             :  * string. The character length is also passed to utf16len if not
    1771             :  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
    1772             :  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
    1773             :  */
    1774             : WCHAR *
    1775             : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
    1776             : {
    1777             :     int         msgenc = GetMessageEncoding();
    1778             :     WCHAR      *utf16;
    1779             :     int         dstlen;
    1780             :     UINT        codepage;
    1781             : 
    1782             :     if (msgenc == PG_SQL_ASCII)
    1783             :         /* No conversion is possible, and SQL_ASCII is never utf16. */
    1784             :         return NULL;
    1785             : 
    1786             :     codepage = pg_enc2name_tbl[msgenc].codepage;
    1787             : 
    1788             :     /*
    1789             :      * Use MultiByteToWideChar directly if there is a corresponding codepage,
    1790             :      * or double conversion through UTF8 if not.  Double conversion is needed,
    1791             :      * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
    1792             :      */
    1793             :     if (codepage != 0)
    1794             :     {
    1795             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1796             :         dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
    1797             :         utf16[dstlen] = (WCHAR) 0;
    1798             :     }
    1799             :     else
    1800             :     {
    1801             :         char       *utf8;
    1802             : 
    1803             :         /*
    1804             :          * XXX pg_do_encoding_conversion() requires a transaction.  In the
    1805             :          * absence of one, hope for the input to be valid UTF8.
    1806             :          */
    1807             :         if (IsTransactionState())
    1808             :         {
    1809             :             utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
    1810             :                                                       len,
    1811             :                                                       msgenc,
    1812             :                                                       PG_UTF8);
    1813             :             if (utf8 != str)
    1814             :                 len = strlen(utf8);
    1815             :         }
    1816             :         else
    1817             :             utf8 = (char *) str;
    1818             : 
    1819             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1820             :         dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
    1821             :         utf16[dstlen] = (WCHAR) 0;
    1822             : 
    1823             :         if (utf8 != str)
    1824             :             pfree(utf8);
    1825             :     }
    1826             : 
    1827             :     if (dstlen == 0 && len > 0)
    1828             :     {
    1829             :         pfree(utf16);
    1830             :         return NULL;            /* error */
    1831             :     }
    1832             : 
    1833             :     if (utf16len)
    1834             :         *utf16len = dstlen;
    1835             :     return utf16;
    1836             : }
    1837             : 
    1838             : #endif                          /* WIN32 */

Generated by: LCOV version 1.16