LCOV - code coverage report
Current view: top level - src/backend/utils/mb - mbutils.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 364 555 65.6 %
Date: 2026-02-10 12:18:23 Functions: 51 61 83.6 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * mbutils.c
       4             :  *    This file contains functions for encoding conversion.
       5             :  *
       6             :  * The string-conversion functions in this file share some API quirks.
       7             :  * Note the following:
       8             :  *
       9             :  * The functions return a palloc'd, null-terminated string if conversion
      10             :  * is required.  However, if no conversion is performed, the given source
      11             :  * string pointer is returned as-is.
      12             :  *
      13             :  * Although the presence of a length argument means that callers can pass
      14             :  * non-null-terminated strings, care is required because the same string
      15             :  * will be passed back if no conversion occurs.  Such callers *must* check
      16             :  * whether result == src and handle that case differently.
      17             :  *
      18             :  * If the source and destination encodings are the same, the source string
      19             :  * is returned without any verification; it's assumed to be valid data.
      20             :  * If that might not be the case, the caller is responsible for validating
      21             :  * the string using a separate call to pg_verify_mbstr().  Whenever the
      22             :  * source and destination encodings are different, the functions ensure that
      23             :  * the result is validly encoded according to the destination encoding.
      24             :  *
      25             :  *
      26             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      27             :  * Portions Copyright (c) 1994, Regents of the University of California
      28             :  *
      29             :  *
      30             :  * IDENTIFICATION
      31             :  *    src/backend/utils/mb/mbutils.c
      32             :  *
      33             :  *-------------------------------------------------------------------------
      34             :  */
      35             : #include "postgres.h"
      36             : 
      37             : #include "access/xact.h"
      38             : #include "catalog/namespace.h"
      39             : #include "mb/pg_wchar.h"
      40             : #include "utils/fmgrprotos.h"
      41             : #include "utils/memdebug.h"
      42             : #include "utils/memutils.h"
      43             : #include "utils/relcache.h"
      44             : #include "varatt.h"
      45             : 
      46             : /*
      47             :  * We maintain a simple linked list caching the fmgr lookup info for the
      48             :  * currently selected conversion functions, as well as any that have been
      49             :  * selected previously in the current session.  (We remember previous
      50             :  * settings because we must be able to restore a previous setting during
      51             :  * transaction rollback, without doing any fresh catalog accesses.)
      52             :  *
      53             :  * Since we'll never release this data, we just keep it in TopMemoryContext.
      54             :  */
      55             : typedef struct ConvProcInfo
      56             : {
      57             :     int         s_encoding;     /* server (database) encoding ID */
      58             :     int         c_encoding;     /* client encoding ID */
      59             :     FmgrInfo    to_server_info; /* fmgr lookup info, client -> server proc */
      60             :     FmgrInfo    to_client_info; /* fmgr lookup info, server -> client proc */
      61             : } ConvProcInfo;
      62             : 
      63             : static List *ConvProcList = NIL;    /* List of ConvProcInfo, newest entry first */
      64             : 
      65             : /*
      66             :  * These variables point to the currently active conversion functions,
      67             :  * or are NULL when no conversion is needed.
      68             :  */
      69             : static FmgrInfo *ToServerConvProc = NULL;   /* client -> server direction */
      70             : static FmgrInfo *ToClientConvProc = NULL;   /* server -> client direction */
      71             : 
      72             : /*
      73             :  * This variable stores the conversion function to convert from UTF-8
      74             :  * to the server encoding.  It's NULL if the server encoding is UTF-8 or
      75             :  * SQL_ASCII, or if we lack a conversion function for this.
      76             :  */
      77             : static FmgrInfo *Utf8ToServerConvProc = NULL;
      78             : 
      79             : /*
      80             :  * These variables track the currently-selected encodings.
      81             :  */
      82             : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      83             : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      84             : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      85             : 
      86             : /*
      87             :  * During backend startup we can't set client encoding because we (a)
      88             :  * can't look up the conversion functions, and (b) may not know the database
      89             :  * encoding yet either.  So SetClientEncoding() just accepts any valid encoding
      90             :  * and remembers it for InitializeClientEncoding() to apply later.
      91             :  */
      92             : static bool backend_startup_complete = false;   /* set by InitializeClientEncoding() */
      93             : static int  pending_client_encoding = PG_SQL_ASCII; /* saved by SetClientEncoding() */
      94             : 
      95             : 
      96             : /* Forward declarations of file-local (static) helper functions */
      97             : static char *perform_default_encoding_conversion(const char *src,
      98             :                                                  int len, bool is_client_to_server);
      99             : static int  cliplen(const char *str, int len, int limit);
     100             : 
     101             : pg_noreturn
     102             : static void report_invalid_encoding_int(int encoding, const char *mbstr,
     103             :                                         int mblen, int len);
     104             : 
     105             : pg_noreturn
     106             : static void report_invalid_encoding_db(const char *mbstr, int mblen, int len);
     107             : 
     108             : 
     109             : /*
     110             :  * Prepare for a future call to SetClientEncoding.  Success should mean
     111             :  * that SetClientEncoding is guaranteed to succeed for this encoding request.
     112             :  *
     113             :  * (But note that success before backend_startup_complete does not guarantee
     114             :  * success after ...)
     115             :  *
     116             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     117             :  */
     118             : int
     119       72078 : PrepareClientEncoding(int encoding)
     120             : {
     121             :     int         current_server_encoding;
     122             :     ListCell   *lc;             /* cursor for the ConvProcList scan below */
     123             : 
     124       72078 :     if (!PG_VALID_FE_ENCODING(encoding))
     125           0 :         return -1;              /* bad encoding */
     126             : 
     127             :     /* Can't do anything during startup, per notes above */
     128       72078 :     if (!backend_startup_complete)
     129       36386 :         return 0;
     130             : 
     131       35692 :     current_server_encoding = GetDatabaseEncoding();
     132             : 
     133             :     /*
     134             :      * No conversion function is needed if the encodings match or either is SQL_ASCII.
     135             :      */
     136       35692 :     if (current_server_encoding == encoding ||
     137        3010 :         current_server_encoding == PG_SQL_ASCII ||
     138             :         encoding == PG_SQL_ASCII)
     139       35672 :         return 0;
     140             : 
     141          20 :     if (IsTransactionState())
     142             :     {
     143             :         /*
     144             :          * If we're in a live transaction, it's safe to access the catalogs,
     145             :          * so look up the functions.  We repeat the lookup even if the info is
     146             :          * already cached, so that we can react to changes in the contents of
     147             :          * pg_conversion.
     148             :          */
     149             :         Oid         to_server_proc,
     150             :                     to_client_proc;
     151             :         ConvProcInfo *convinfo;
     152             :         MemoryContext oldcontext;
     153             : 
     154          20 :         to_server_proc = FindDefaultConversionProc(encoding,
     155             :                                                    current_server_encoding);
     156          20 :         if (!OidIsValid(to_server_proc))
     157           0 :             return -1;          /* no client -> server conversion found */
     158          20 :         to_client_proc = FindDefaultConversionProc(current_server_encoding,
     159             :                                                    encoding);
     160          20 :         if (!OidIsValid(to_client_proc))
     161           0 :             return -1;          /* no server -> client conversion found */
     162             : 
     163             :         /*
     164             :          * Load the fmgr info into TopMemoryContext (could still fail here)
     165             :          */
     166          20 :         convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
     167             :                                                        sizeof(ConvProcInfo));
     168          20 :         convinfo->s_encoding = current_server_encoding;
     169          20 :         convinfo->c_encoding = encoding;
     170          20 :         fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
     171             :                       TopMemoryContext);
     172          20 :         fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
     173             :                       TopMemoryContext);
     174             : 
     175             :         /* Attach new info to head of list, building the cell in TopMemoryContext */
     176          20 :         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
     177          20 :         ConvProcList = lcons(convinfo, ConvProcList);
     178          20 :         MemoryContextSwitchTo(oldcontext);
     179             : 
     180             :         /*
     181             :          * We cannot yet remove any older entry for the same encoding pair,
     182             :          * since it could still be in use.  SetClientEncoding will clean up.
     183             :          */
     184             : 
     185          20 :         return 0;               /* success */
     186             :     }
     187             :     else
     188             :     {
     189             :         /*
     190             :          * If we're not in a live transaction, the only thing we can do is
     191             :          * restore a previous setting using the cache.  This covers all
     192             :          * transaction-rollback cases.  The only case it might not work for is
     193             :          * trying to change client_encoding on the fly by editing
     194             :          * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
     195             :          * thing to do anyway.
     196             :          */
     197           0 :         foreach(lc, ConvProcList)
     198             :         {
     199           0 :             ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
     200             : 
     201           0 :             if (oldinfo->s_encoding == current_server_encoding &&
     202           0 :                 oldinfo->c_encoding == encoding)
     203           0 :                 return 0;       /* a cached entry for this pair exists */
     204             :         }
     205             : 
     206           0 :         return -1;              /* it's not cached, so fail */
     207             :     }
     208             : }
     209             : 
     210             : /*
     211             :  * Set the active client encoding and set up the conversion-function pointers.
     212             :  * PrepareClientEncoding should have been called previously for this encoding.
     213             :  *
     214             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     215             :  */
     216             : int
     217       74862 : SetClientEncoding(int encoding)
     218             : {
     219             :     int         current_server_encoding;
     220             :     bool        found;          /* true once the newest matching entry is seen */
     221             :     ListCell   *lc;
     222             : 
     223       74862 :     if (!PG_VALID_FE_ENCODING(encoding))
     224           0 :         return -1;              /* bad encoding */
     225             : 
     226             :     /* Can't do anything during startup, per notes above */
     227       74862 :     if (!backend_startup_complete)
     228             :     {
     229       36210 :         pending_client_encoding = encoding; /* InitializeClientEncoding() applies it */
     230       36210 :         return 0;
     231             :     }
     232             : 
     233       38652 :     current_server_encoding = GetDatabaseEncoding();
     234             : 
     235             :     /*
     236             :      * No conversion function is needed if the encodings match or either is SQL_ASCII.
     237             :      */
     238       38652 :     if (current_server_encoding == encoding ||
     239        3010 :         current_server_encoding == PG_SQL_ASCII ||
     240             :         encoding == PG_SQL_ASCII)
     241             :     {
     242       38632 :         ClientEncoding = &pg_enc2name_tbl[encoding];
     243       38632 :         ToServerConvProc = NULL;    /* NULL means no conversion, see declarations */
     244       38632 :         ToClientConvProc = NULL;
     245       38632 :         return 0;
     246             :     }
     247             : 
     248             :     /*
     249             :      * Search the cache for the entry previously prepared by
     250             :      * PrepareClientEncoding; if there isn't one, we lose.  While at it,
     251             :      * release any duplicate entries so that repeated Prepare/Set cycles don't
     252             :      * leak memory.
     253             :      */
     254          20 :     found = false;
     255          46 :     foreach(lc, ConvProcList)
     256             :     {
     257          26 :         ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
     258             : 
     259          26 :         if (convinfo->s_encoding == current_server_encoding &&
     260          26 :             convinfo->c_encoding == encoding)
     261             :         {
     262          20 :             if (!found)
     263             :             {
     264             :                 /* First match is the newest entry (new ones go on the head), so set up */
     265          20 :                 ClientEncoding = &pg_enc2name_tbl[encoding];
     266          20 :                 ToServerConvProc = &convinfo->to_server_info;
     267          20 :                 ToClientConvProc = &convinfo->to_client_info;
     268          20 :                 found = true;
     269             :             }
     270             :             else
     271             :             {
     272             :                 /* Older duplicate for the same pair: unlink it, then free it */
     273           0 :                 ConvProcList = foreach_delete_current(ConvProcList, lc);
     274           0 :                 pfree(convinfo);
     275             :             }
     276             :         }
     277             :     }
     278             : 
     279          20 :     if (found)
     280          20 :         return 0;               /* success */
     281             :     else
     282           0 :         return -1;              /* it's not cached, so fail */
     283             : }
     284             : 
     285             : /*
     286             :  * Initialize client encoding conversions.
     287             :  *      Called from InitPostgres() once during backend startup.
     288             :  */
     289             : void
     290       34962 : InitializeClientEncoding(void)
     291             : {
     292             :     int         current_server_encoding;
     293             : 
     294             :     Assert(!backend_startup_complete);  /* expected to run only once */
     295       34962 :     backend_startup_complete = true;    /* Prepare/SetClientEncoding stop deferring */
     296             : 
     297       69924 :     if (PrepareClientEncoding(pending_client_encoding) < 0 ||
     298       34962 :         SetClientEncoding(pending_client_encoding) < 0)
     299             :     {
     300             :         /*
     301             :          * Oops, the requested conversion is not available. We couldn't fail
     302             :          * before, but we can now.
     303             :          */
     304           0 :         ereport(FATAL,
     305             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     306             :                  errmsg("conversion between %s and %s is not supported",
     307             :                         pg_enc2name_tbl[pending_client_encoding].name,
     308             :                         GetDatabaseEncodingName())));
     309             :     }
     310             : 
     311             :     /*
     312             :      * Also look up the UTF8-to-server conversion function if needed.  Since
     313             :      * the server encoding is fixed within any one backend process, we don't
     314             :      * have to do this more than once.
     315             :      */
     316       34962 :     current_server_encoding = GetDatabaseEncoding();
     317       34962 :     if (current_server_encoding != PG_UTF8 &&
     318             :         current_server_encoding != PG_SQL_ASCII)
     319             :     {
     320             :         Oid         utf8_to_server_proc;
     321             : 
     322         198 :         AssertCouldGetRelation();
     323             :         utf8_to_server_proc =
     324         198 :             FindDefaultConversionProc(PG_UTF8,
     325             :                                       current_server_encoding);
     326             :         /* If there's no such conversion, just leave the pointer as NULL */
     327         198 :         if (OidIsValid(utf8_to_server_proc))
     328             :         {
     329             :             FmgrInfo   *finfo;
     330             : 
     331         198 :             finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
     332             :                                                     sizeof(FmgrInfo));
     333         198 :             fmgr_info_cxt(utf8_to_server_proc, finfo,
     334             :                           TopMemoryContext);
     335             :             /* Publish the pointer last, only after finfo is completely filled in */
     336         198 :             Utf8ToServerConvProc = finfo;
     337             :         }
     338             :     }
     339       34962 : }
     340             : 
     341             : /*
     342             :  * Returns the ID of the current client encoding (ClientEncoding->encoding).
     343             :  */
     344             : int
     345       11042 : pg_get_client_encoding(void)
     346             : {
     347       11042 :     return ClientEncoding->encoding;
     348             : }
     349             : 
     350             : /*
     351             :  * Returns the name of the current client encoding; the string is not a copy.
     352             :  */
     353             : const char *
     354           0 : pg_get_client_encoding_name(void)
     355             : {
     356           0 :     return ClientEncoding->name;
     357             : }
     358             : 
     359             : /*
     360             :  * Convert src string to another encoding (general case).
     361             :  *
     362             :  * See the notes about string conversion functions at the top of this file.
     363             :  */
     364             : unsigned char *
     365        3050 : pg_do_encoding_conversion(unsigned char *src, int len,
     366             :                           int src_encoding, int dest_encoding)
     367             : {
     368             :     unsigned char *result;
     369             :     Oid         proc;           /* default conversion function for this pair */
     370             : 
     371        3050 :     if (len <= 0)
     372          36 :         return src;             /* empty string is always valid */
     373             : 
     374        3014 :     if (src_encoding == dest_encoding)
     375        2200 :         return src;             /* no conversion required, assume valid */
     376             : 
     377         814 :     if (dest_encoding == PG_SQL_ASCII)
     378           0 :         return src;             /* any string is valid in SQL_ASCII */
     379             : 
     380         814 :     if (src_encoding == PG_SQL_ASCII)
     381             :     {
     382             :         /* No conversion is possible, but we must validate the result */
     383          16 :         (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
     384          16 :         return src;
     385             :     }
     386             : 
     387         798 :     if (!IsTransactionState())  /* shouldn't happen */
     388           0 :         elog(ERROR, "cannot perform encoding conversion outside a transaction");
     389             : 
     390         798 :     proc = FindDefaultConversionProc(src_encoding, dest_encoding);
     391         798 :     if (!OidIsValid(proc))
     392           0 :         ereport(ERROR,
     393             :                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
     394             :                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
     395             :                         pg_encoding_to_char(src_encoding),
     396             :                         pg_encoding_to_char(dest_encoding))));
     397             : 
     398             :     /*
     399             :      * Allocate space for conversion result, being wary of integer overflow.
     400             :      *
     401             :      * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
     402             :      * required space, so it might exceed MaxAllocSize even though the result
     403             :      * would actually fit.  We do not want to hand back a result string that
     404             :      * exceeds MaxAllocSize, because callers might not cope gracefully --- but
     405             :      * if we just allocate more than that, and don't use it, that's fine.
     406             :      */
     407         798 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     408           0 :         ereport(ERROR,
     409             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     410             :                  errmsg("out of memory"),
     411             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     412             :                            len)));
     413             : 
     414             :     result = (unsigned char *)
     415         798 :         MemoryContextAllocHuge(CurrentMemoryContext,
     416         798 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);    /* +1 for the terminator */
     417             : 
     418         798 :     (void) OidFunctionCall6(proc,
     419             :                             Int32GetDatum(src_encoding),
     420             :                             Int32GetDatum(dest_encoding),
     421             :                             CStringGetDatum((char *) src),
     422             :                             CStringGetDatum((char *) result),
     423             :                             Int32GetDatum(len),
     424             :                             BoolGetDatum(false));   /* same slot as noError in the _buf variant */
     425             : 
     426             :     /*
     427             :      * If the result is large, it's worth repalloc'ing to release any extra
     428             :      * space we asked for.  The cutoff here is somewhat arbitrary, but we
     429             :      * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
     430             :      */
     431         798 :     if (len > 1000000)
     432             :     {
     433           0 :         Size        resultlen = strlen((char *) result);
     434             : 
     435           0 :         if (resultlen >= MaxAllocSize)
     436           0 :             ereport(ERROR,
     437             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     438             :                      errmsg("out of memory"),
     439             :                      errdetail("String of %d bytes is too long for encoding conversion.",
     440             :                                len)));
     441             : 
     442           0 :         result = (unsigned char *) repalloc(result, resultlen + 1); /* shrink to fit */
     443             :     }
     444             : 
     445         798 :     return result;
     446             : }
     447             : 
     448             : /*
     449             :  * Convert src string to another encoding.
     450             :  *
     451             :  * This function has a different API than the other conversion functions.
     452             :  * The caller should've looked up the conversion function using
     453             :  * FindDefaultConversionProc().  Unlike the other functions, the converted
     454             :  * result is not palloc'd.  It is written to the caller-supplied buffer
     455             :  * instead.
     456             :  *
     457             :  * src_encoding   - encoding to convert from
     458             :  * dest_encoding  - encoding to convert to
     459             :  * src, srclen    - input buffer and its length in bytes
     460             :  * dest, destlen  - destination buffer and its size in bytes
     461             :  *
     462             :  * The output is null-terminated.
     463             :  *
     464             :  * If destlen < srclen * MAX_CONVERSION_GROWTH + 1, the converted output
     465             :  * wouldn't necessarily fit in the output buffer, and the function will not
     466             :  * convert the whole input.
     467             :  *
     468             :  * TODO: The conversion function interface is not great.  Firstly, it
     469             :  * would be nice to pass through the destination buffer size to the
     470             :  * conversion function, so that if you pass a shorter destination buffer, it
     471             :  * could still continue to fill up the whole buffer.  Currently, we have to
     472             :  * assume worst case expansion and stop the conversion short, even if there
     473             :  * is in fact space left in the destination buffer.  Secondly, it would be
     474             :  * nice to return the number of bytes written to the caller, to avoid a call
     475             :  * to strlen().
     476             :  */
     477             : int
     478        5820 : pg_do_encoding_conversion_buf(Oid proc,
     479             :                               int src_encoding,
     480             :                               int dest_encoding,
     481             :                               unsigned char *src, int srclen,
     482             :                               unsigned char *dest, int destlen,
     483             :                               bool noError) /* forwarded unchanged to the conversion proc */
     484             : {
     485             :     Datum       result;
     486             : 
     487             :     /*
     488             :      * Cap the input so the worst-case output fits in dest.  NOTE(review): assumes
     489             :      * destlen >= 1; a smaller value would wrap in the unsigned division -- confirm.
     490             :      */
     491        5820 :     if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
     492        5760 :         srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
     493             : 
     494        5820 :     result = OidFunctionCall6(proc,
     495             :                               Int32GetDatum(src_encoding),
     496             :                               Int32GetDatum(dest_encoding),
     497             :                               CStringGetDatum((char *) src),
     498             :                               CStringGetDatum((char *) dest),
     499             :                               Int32GetDatum(srclen),
     500             :                               BoolGetDatum(noError));
     501        3450 :     return DatumGetInt32(result);   /* int32 result of the conversion proc, passed through */
     502             : }
     503             : 
     504             : /*
     505             :  * Convert string to encoding encoding_name. The source
     506             :  * encoding is the DB encoding.
     507             :  *
     508             :  * BYTEA convert_to(TEXT string, NAME encoding_name)
     509             :  */
     510             : Datum
     511         408 : pg_convert_to(PG_FUNCTION_ARGS)
     512             : {
     513         408 :     Datum       string = PG_GETARG_DATUM(0);    /* arg 0, kept as a raw Datum */
     514         408 :     Datum       dest_encoding_name = PG_GETARG_DATUM(1);    /* arg 1, target encoding */
     515         408 :     Datum       src_encoding_name = DirectFunctionCall1(namein,
     516             :                                                         CStringGetDatum(DatabaseEncoding->name));   /* source = DB encoding */
     517             :     Datum       result;
     518             : 
     519             :     /*
     520             :      * pg_convert expects a bytea as its first argument. We're passing it a
     521             :      * text argument here, relying on the fact that they are both in fact
     522             :      * varlena types, and thus structurally identical.
     523             :      */
     524         408 :     result = DirectFunctionCall3(pg_convert, string,
     525             :                                  src_encoding_name, dest_encoding_name);
     526             : 
     527         402 :     PG_RETURN_DATUM(result);    /* pg_convert's result is handed back unchanged */
     528             : }
     529             : 
     530             : /*
     531             :  * Convert string from encoding encoding_name. The destination
     532             :  * encoding is the DB encoding.
     533             :  *
     534             :  * TEXT convert_from(BYTEA string, NAME encoding_name)
     535             :  */
     536             : Datum
     537         592 : pg_convert_from(PG_FUNCTION_ARGS)
     538             : {
     539         592 :     Datum       string = PG_GETARG_DATUM(0);    /* arg 0, kept as a raw Datum */
     540         592 :     Datum       src_encoding_name = PG_GETARG_DATUM(1); /* arg 1, source encoding */
     541         592 :     Datum       dest_encoding_name = DirectFunctionCall1(namein,
     542             :                                                          CStringGetDatum(DatabaseEncoding->name));  /* target = DB encoding */
     543             :     Datum       result;
     544             : 
     545         592 :     result = DirectFunctionCall3(pg_convert, string,
     546             :                                  src_encoding_name, dest_encoding_name);
     547             : 
     548             :     /*
     549             :      * pg_convert returns a bytea, which we in turn return as text, relying on
     550             :      * the fact that they are both in fact varlena types, and thus
     551             :      * structurally identical. Although not all bytea values are valid text,
     552             :      * in this case it will be because we've told pg_convert to return one
     553             :      * that is valid as text in the current database encoding.
     554             :      */
     555         586 :     PG_RETURN_DATUM(result);
     556             : }
     557             : 
     558             : /*
     559             :  * Convert string between two arbitrary encodings.
     560             :  *
     561             :  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
     562             :  */
     563             : Datum
     564        1768 : pg_convert(PG_FUNCTION_ARGS)
     565             : {
     566        1768 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     567        1768 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     568        1768 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     569        1768 :     char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
     570        1768 :     int         dest_encoding = pg_char_to_encoding(dest_encoding_name);
     571             :     const char *src_str;
     572             :     char       *dest_str;
     573             :     bytea      *retval;
     574             :     int         len;
     575             : 
     576        1768 :     if (src_encoding < 0)
     577           0 :         ereport(ERROR,
     578             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     579             :                  errmsg("invalid source encoding name \"%s\"",
     580             :                         src_encoding_name)));
     581        1768 :     if (dest_encoding < 0)
     582           0 :         ereport(ERROR,
     583             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     584             :                  errmsg("invalid destination encoding name \"%s\"",
     585             :                         dest_encoding_name)));
     586             : 
     587             :     /* make sure that source string is valid */
     588        1768 :     len = VARSIZE_ANY_EXHDR(string);
     589        1768 :     src_str = VARDATA_ANY(string);
     590        1768 :     (void) pg_verify_mbstr(src_encoding, src_str, len, false);
     591             : 
     592             :     /* perform conversion */
     593        1756 :     dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
     594             :                                                   len,
     595             :                                                   src_encoding,
     596             :                                                   dest_encoding);
     597             : 
     598             : 
     599             :     /* return source string if no conversion happened */
     600        1756 :     if (dest_str == src_str)
     601         976 :         PG_RETURN_BYTEA_P(string);
     602             : 
     603             :     /*
     604             :      * build bytea data type structure.
     605             :      */
     606         780 :     len = strlen(dest_str);
     607         780 :     retval = (bytea *) palloc(len + VARHDRSZ);
     608         780 :     SET_VARSIZE(retval, len + VARHDRSZ);
     609         780 :     memcpy(VARDATA(retval), dest_str, len);
     610         780 :     pfree(dest_str);
     611             : 
     612             :     /* free memory if allocated by the toaster */
     613         780 :     PG_FREE_IF_COPY(string, 0);
     614             : 
     615         780 :     PG_RETURN_BYTEA_P(retval);
     616             : }
     617             : 
     618             : /*
     619             :  * get the length of the string considered as text in the specified
     620             :  * encoding. Raises an error if the data is not valid in that
     621             :  * encoding.
     622             :  *
     623             :  * INT4 length (BYTEA string, NAME src_encoding_name)
     624             :  */
     625             : Datum
     626           0 : length_in_encoding(PG_FUNCTION_ARGS)
     627             : {
     628           0 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     629           0 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     630           0 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     631             :     const char *src_str;
     632             :     int         len;
     633             :     int         retval;
     634             : 
     635           0 :     if (src_encoding < 0)
     636           0 :         ereport(ERROR,
     637             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     638             :                  errmsg("invalid encoding name \"%s\"",
     639             :                         src_encoding_name)));
     640             : 
     641           0 :     len = VARSIZE_ANY_EXHDR(string);
     642           0 :     src_str = VARDATA_ANY(string);
     643             : 
     644           0 :     retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
     645             : 
     646           0 :     PG_RETURN_INT32(retval);
     647             : }
     648             : 
     649             : /*
     650             :  * Get maximum multibyte character length in the specified encoding.
     651             :  *
     652             :  * Note encoding is specified numerically, not by name as above.
     653             :  */
     654             : Datum
     655           0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
     656             : {
     657           0 :     int         encoding = PG_GETARG_INT32(0);
     658             : 
     659           0 :     if (PG_VALID_ENCODING(encoding))
     660           0 :         PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
     661             :     else
     662           0 :         PG_RETURN_NULL();
     663             : }
     664             : 
     665             : /*
     666             :  * Convert client encoding to server encoding.
     667             :  *
     668             :  * See the notes about string conversion functions at the top of this file.
     669             :  */
     670             : char *
     671      848680 : pg_client_to_server(const char *s, int len)
     672             : {
     673      848680 :     return pg_any_to_server(s, len, ClientEncoding->encoding);
     674             : }
     675             : 
     676             : /*
     677             :  * Convert any encoding to server encoding.
     678             :  *
     679             :  * See the notes about string conversion functions at the top of this file.
     680             :  *
     681             :  * Unlike the other string conversion functions, this will apply validation
     682             :  * even if encoding == DatabaseEncoding->encoding.  This is because this is
     683             :  * used to process data coming in from outside the database, and we never
     684             :  * want to just assume validity.
     685             :  */
     686             : char *
     687      936566 : pg_any_to_server(const char *s, int len, int encoding)
     688             : {
     689      936566 :     if (len <= 0)
     690       80508 :         return unconstify(char *, s);   /* empty string is always valid */
     691             : 
     692      856058 :     if (encoding == DatabaseEncoding->encoding ||
     693             :         encoding == PG_SQL_ASCII)
     694             :     {
     695             :         /*
     696             :          * No conversion is needed, but we must still validate the data.
     697             :          */
     698      855690 :         (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
     699      855688 :         return unconstify(char *, s);
     700             :     }
     701             : 
     702         368 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     703             :     {
     704             :         /*
     705             :          * No conversion is possible, but we must still validate the data,
     706             :          * because the client-side code might have done string escaping using
     707             :          * the selected client_encoding.  If the client encoding is ASCII-safe
     708             :          * then we just do a straight validation under that encoding.  For an
     709             :          * ASCII-unsafe encoding we have a problem: we dare not pass such data
     710             :          * to the parser but we have no way to convert it.  We compromise by
     711             :          * rejecting the data if it contains any non-ASCII characters.
     712             :          */
     713         308 :         if (PG_VALID_BE_ENCODING(encoding))
     714         248 :             (void) pg_verify_mbstr(encoding, s, len, false);
     715             :         else
     716             :         {
     717             :             int         i;
     718             : 
     719        1908 :             for (i = 0; i < len; i++)
     720             :             {
     721        1848 :                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
     722           0 :                     ereport(ERROR,
     723             :                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     724             :                              errmsg("invalid byte value for encoding \"%s\": 0x%02x",
     725             :                                     pg_enc2name_tbl[PG_SQL_ASCII].name,
     726             :                                     (unsigned char) s[i])));
     727             :             }
     728             :         }
     729         308 :         return unconstify(char *, s);
     730             :     }
     731             : 
     732             :     /* Fast path if we can use cached conversion function */
     733          60 :     if (encoding == ClientEncoding->encoding)
     734          60 :         return perform_default_encoding_conversion(s, len, true);
     735             : 
     736             :     /* General case ... will not work outside transactions */
     737           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     738             :                                               len,
     739             :                                               encoding,
     740           0 :                                               DatabaseEncoding->encoding);
     741             : }
     742             : 
     743             : /*
     744             :  * Convert server encoding to client encoding.
     745             :  *
     746             :  * See the notes about string conversion functions at the top of this file.
     747             :  */
     748             : char *
     749    37101632 : pg_server_to_client(const char *s, int len)
     750             : {
     751    37101632 :     return pg_server_to_any(s, len, ClientEncoding->encoding);
     752             : }
     753             : 
     754             : /*
     755             :  * Convert server encoding to any encoding.
     756             :  *
     757             :  * See the notes about string conversion functions at the top of this file.
     758             :  */
     759             : char *
     760    37140446 : pg_server_to_any(const char *s, int len, int encoding)
     761             : {
     762    37140446 :     if (len <= 0)
     763      265534 :         return unconstify(char *, s);   /* empty string is always valid */
     764             : 
     765    36874912 :     if (encoding == DatabaseEncoding->encoding ||
     766             :         encoding == PG_SQL_ASCII)
     767    36874338 :         return unconstify(char *, s);   /* assume data is valid */
     768             : 
     769         574 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     770             :     {
     771             :         /* No conversion is possible, but we must validate the result */
     772         168 :         (void) pg_verify_mbstr(encoding, s, len, false);
     773         168 :         return unconstify(char *, s);
     774             :     }
     775             : 
     776             :     /* Fast path if we can use cached conversion function */
     777         406 :     if (encoding == ClientEncoding->encoding)
     778         388 :         return perform_default_encoding_conversion(s, len, false);
     779             : 
     780             :     /* General case ... will not work outside transactions */
     781          18 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     782             :                                               len,
     783          18 :                                               DatabaseEncoding->encoding,
     784             :                                               encoding);
     785             : }
     786             : 
     787             : /*
     788             :  *  Perform default encoding conversion using cached FmgrInfo. Since
     789             :  *  this function does not access database at all, it is safe to call
     790             :  *  outside transactions.  If the conversion has not been set up by
     791             :  *  SetClientEncoding(), no conversion is performed.
     792             :  */
     793             : static char *
     794         448 : perform_default_encoding_conversion(const char *src, int len,
     795             :                                     bool is_client_to_server)
     796             : {
     797             :     char       *result;
     798             :     int         src_encoding,
     799             :                 dest_encoding;
     800             :     FmgrInfo   *flinfo;
     801             : 
     802         448 :     if (is_client_to_server)
     803             :     {
     804          60 :         src_encoding = ClientEncoding->encoding;
     805          60 :         dest_encoding = DatabaseEncoding->encoding;
     806          60 :         flinfo = ToServerConvProc;
     807             :     }
     808             :     else
     809             :     {
     810         388 :         src_encoding = DatabaseEncoding->encoding;
     811         388 :         dest_encoding = ClientEncoding->encoding;
     812         388 :         flinfo = ToClientConvProc;
     813             :     }
     814             : 
     815         448 :     if (flinfo == NULL)
     816           0 :         return unconstify(char *, src);
     817             : 
     818             :     /*
     819             :      * Allocate space for conversion result, being wary of integer overflow.
     820             :      * See comments in pg_do_encoding_conversion.
     821             :      */
     822         448 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     823           0 :         ereport(ERROR,
     824             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     825             :                  errmsg("out of memory"),
     826             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     827             :                            len)));
     828             : 
     829             :     result = (char *)
     830         448 :         MemoryContextAllocHuge(CurrentMemoryContext,
     831         448 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);
     832             : 
     833         448 :     FunctionCall6(flinfo,
     834             :                   Int32GetDatum(src_encoding),
     835             :                   Int32GetDatum(dest_encoding),
     836             :                   CStringGetDatum(src),
     837             :                   CStringGetDatum(result),
     838             :                   Int32GetDatum(len),
     839             :                   BoolGetDatum(false));
     840             : 
     841             :     /*
     842             :      * Release extra space if there might be a lot --- see comments in
     843             :      * pg_do_encoding_conversion.
     844             :      */
     845         448 :     if (len > 1000000)
     846             :     {
     847           0 :         Size        resultlen = strlen(result);
     848             : 
     849           0 :         if (resultlen >= MaxAllocSize)
     850           0 :             ereport(ERROR,
     851             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     852             :                      errmsg("out of memory"),
     853             :                      errdetail("String of %d bytes is too long for encoding conversion.",
     854             :                                len)));
     855             : 
     856           0 :         result = (char *) repalloc(result, resultlen + 1);
     857             :     }
     858             : 
     859         448 :     return result;
     860             : }
     861             : 
     862             : /*
     863             :  * Convert a single Unicode code point into a string in the server encoding.
     864             :  *
     865             :  * The code point given by "c" is converted and stored at *s, which must
     866             :  * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
     867             :  * The output will have a trailing '\0'.  Throws error if the conversion
     868             :  * cannot be performed.
     869             :  *
     870             :  * Note that this relies on having previously looked up any required
     871             :  * conversion function.  That's partly for speed but mostly because the parser
     872             :  * may call this outside any transaction, or in an aborted transaction.
     873             :  */
     874             : void
     875        1046 : pg_unicode_to_server(char32_t c, unsigned char *s)
     876             : {
     877             :     unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
     878             :     int         c_as_utf8_len;
     879             :     int         server_encoding;
     880             : 
     881             :     /*
     882             :      * Complain if invalid Unicode code point.  The choice of errcode here is
     883             :      * debatable, but really our caller should have checked this anyway.
     884             :      */
     885        1046 :     if (!is_valid_unicode_codepoint(c))
     886           0 :         ereport(ERROR,
     887             :                 (errcode(ERRCODE_SYNTAX_ERROR),
     888             :                  errmsg("invalid Unicode code point")));
     889             : 
     890             :     /* Otherwise, if it's in ASCII range, conversion is trivial */
     891        1046 :     if (c <= 0x7F)
     892             :     {
     893         352 :         s[0] = (unsigned char) c;
     894         352 :         s[1] = '\0';
     895        1046 :         return;
     896             :     }
     897             : 
     898             :     /* If the server encoding is UTF-8, we just need to reformat the code */
     899         694 :     server_encoding = GetDatabaseEncoding();
     900         694 :     if (server_encoding == PG_UTF8)
     901             :     {
     902         694 :         unicode_to_utf8(c, s);
     903         694 :         s[pg_utf_mblen(s)] = '\0';
     904         694 :         return;
     905             :     }
     906             : 
     907             :     /* For all other cases, we must have a conversion function available */
     908           0 :     if (Utf8ToServerConvProc == NULL)
     909           0 :         ereport(ERROR,
     910             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     911             :                  errmsg("conversion between %s and %s is not supported",
     912             :                         pg_enc2name_tbl[PG_UTF8].name,
     913             :                         GetDatabaseEncodingName())));
     914             : 
     915             :     /* Construct UTF-8 source string */
     916           0 :     unicode_to_utf8(c, c_as_utf8);
     917           0 :     c_as_utf8_len = pg_utf_mblen(c_as_utf8);
     918           0 :     c_as_utf8[c_as_utf8_len] = '\0';
     919             : 
     920             :     /* Convert, or throw error if we can't */
     921           0 :     FunctionCall6(Utf8ToServerConvProc,
     922             :                   Int32GetDatum(PG_UTF8),
     923             :                   Int32GetDatum(server_encoding),
     924             :                   CStringGetDatum((char *) c_as_utf8),
     925             :                   CStringGetDatum((char *) s),
     926             :                   Int32GetDatum(c_as_utf8_len),
     927             :                   BoolGetDatum(false));
     928             : }
     929             : 
     930             : /*
     931             :  * Convert a single Unicode code point into a string in the server encoding.
     932             :  *
     933             :  * Same as pg_unicode_to_server(), except that we don't throw errors,
     934             :  * but simply return false on conversion failure.
     935             :  */
     936             : bool
     937          84 : pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
     938             : {
     939             :     unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
     940             :     int         c_as_utf8_len;
     941             :     int         converted_len;
     942             :     int         server_encoding;
     943             : 
     944             :     /* Fail if invalid Unicode code point */
     945          84 :     if (!is_valid_unicode_codepoint(c))
     946           0 :         return false;
     947             : 
     948             :     /* Otherwise, if it's in ASCII range, conversion is trivial */
     949          84 :     if (c <= 0x7F)
     950             :     {
     951          24 :         s[0] = (unsigned char) c;
     952          24 :         s[1] = '\0';
     953          24 :         return true;
     954             :     }
     955             : 
     956             :     /* If the server encoding is UTF-8, we just need to reformat the code */
     957          60 :     server_encoding = GetDatabaseEncoding();
     958          60 :     if (server_encoding == PG_UTF8)
     959             :     {
     960          60 :         unicode_to_utf8(c, s);
     961          60 :         s[pg_utf_mblen(s)] = '\0';
     962          60 :         return true;
     963             :     }
     964             : 
     965             :     /* For all other cases, we must have a conversion function available */
     966           0 :     if (Utf8ToServerConvProc == NULL)
     967           0 :         return false;
     968             : 
     969             :     /* Construct UTF-8 source string */
     970           0 :     unicode_to_utf8(c, c_as_utf8);
     971           0 :     c_as_utf8_len = pg_utf_mblen(c_as_utf8);
     972           0 :     c_as_utf8[c_as_utf8_len] = '\0';
     973             : 
     974             :     /* Convert, but without throwing error if we can't */
     975           0 :     converted_len = DatumGetInt32(FunctionCall6(Utf8ToServerConvProc,
     976             :                                                 Int32GetDatum(PG_UTF8),
     977             :                                                 Int32GetDatum(server_encoding),
     978             :                                                 CStringGetDatum((char *) c_as_utf8),
     979             :                                                 CStringGetDatum((char *) s),
     980             :                                                 Int32GetDatum(c_as_utf8_len),
     981             :                                                 BoolGetDatum(true)));
     982             : 
     983             :     /* Conversion was successful iff it consumed the whole input */
     984           0 :     return (converted_len == c_as_utf8_len);
     985             : }
     986             : 
     987             : 
     988             : /* convert a multibyte string to a wchar */
     989             : int
     990           0 : pg_mb2wchar(const char *from, pg_wchar *to)
     991             : {
     992           0 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
     993             : }
     994             : 
     995             : /* convert a multibyte string to a wchar with a limited length */
     996             : int
     997    10222728 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
     998             : {
     999    10222728 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
    1000             : }
    1001             : 
    1002             : /* same, with any encoding */
    1003             : int
    1004       18616 : pg_encoding_mb2wchar_with_len(int encoding,
    1005             :                               const char *from, pg_wchar *to, int len)
    1006             : {
    1007       18616 :     return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
    1008             : }
    1009             : 
    1010             : /* convert a wchar string to a multibyte */
    1011             : int
    1012           0 : pg_wchar2mb(const pg_wchar *from, char *to)
    1013             : {
    1014           0 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
    1015             : }
    1016             : 
    1017             : /* convert a wchar string to a multibyte with a limited length */
    1018             : int
    1019     1116212 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
    1020             : {
    1021     1116212 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
    1022             : }
    1023             : 
    1024             : /* same, with any encoding */
    1025             : int
    1026         192 : pg_encoding_wchar2mb_with_len(int encoding,
    1027             :                               const pg_wchar *from, char *to, int len)
    1028             : {
    1029         192 :     return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
    1030             : }
    1031             : 
    1032             : /*
    1033             :  * Returns the byte length of a multibyte character sequence in a
    1034             :  * null-terminated string.  Raises an illegal byte sequence error if the
    1035             :  * sequence would hit a null terminator.
    1036             :  *
    1037             :  * The caller is expected to have checked for a terminator at *mbstr == 0
    1038             :  * before calling, but some callers want 1 in that case, so this function
    1039             :  * continues that tradition.
    1040             :  *
    1041             :  * This must only be used for strings that have a null-terminator to enable
    1042             :  * bounds detection.
    1043             :  */
    1044             : int
    1045     4203206 : pg_mblen_cstr(const char *mbstr)
    1046             : {
    1047     4203206 :     int         length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1048             : 
    1049             :     /*
    1050             :      * The .mblen functions return 1 when given a pointer to a terminator.
    1051             :      * Some callers depend on that, so we tolerate it for now.  Well-behaved
    1052             :      * callers check the leading byte for a terminator *before* calling.
    1053             :      */
    1054     4228046 :     for (int i = 1; i < length; ++i)
    1055       24846 :         if (unlikely(mbstr[i] == 0))
    1056           6 :             report_invalid_encoding_db(mbstr, length, i);
    1057             : 
    1058             :     /*
    1059             :      * String should be NUL-terminated, but checking that would make typical
    1060             :      * callers O(N^2), tripling Valgrind check-world time.  Unless
    1061             :      * VALGRIND_EXPENSIVE, check 1 byte after each actual character.  (If we
    1062             :      * found a character, not a terminator, the next byte must be a terminator
    1063             :      * or the start of the next character.)  If the caller iterates the whole
    1064             :      * string, the last call will diagnose a missing terminator.
    1065             :      */
    1066     4203200 :     if (mbstr[0] != '\0')
    1067             :     {
    1068             : #ifdef VALGRIND_EXPENSIVE
    1069             :         VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr));
    1070             : #else
    1071             :         VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1);
    1072             : #endif
    1073             :     }
    1074             : 
    1075     4203200 :     return length;
    1076             : }
    1077             : 
    1078             : /*
    1079             :  * Returns the byte length of a multibyte character sequence bounded by a range
    1080             :  * [mbstr, end) of at least one byte in size.  Raises an illegal byte sequence
    1081             :  * error if the sequence would exceed the range.
    1082             :  */
    1083             : int
    1084     5524330 : pg_mblen_range(const char *mbstr, const char *end)
    1085             : {
    1086     5524330 :     int         length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1087             : 
    1088             :     Assert(end > mbstr);
    1089             : #ifdef VALGRIND_EXPENSIVE
    1090             :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr);
    1091             : #else
    1092             :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
    1093             : #endif
    1094             : 
    1095     5524330 :     if (unlikely(mbstr + length > end))
    1096          12 :         report_invalid_encoding_db(mbstr, length, end - mbstr);
    1097             : 
    1098     5524318 :     return length;
    1099             : }
    1100             : 
    1101             : /*
    1102             :  * Returns the byte length of a multibyte character sequence bounded by a range
    1103             :  * extending for 'limit' bytes, which must be at least one.  Raises an illegal
    1104             :  * byte sequence error if the sequence would exceed the range.
    1105             :  */
    1106             : int
    1107   220471946 : pg_mblen_with_len(const char *mbstr, int limit)
    1108             : {
    1109   220471946 :     int         length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1110             : 
    1111             :     Assert(limit >= 1);
    1112             : #ifdef VALGRIND_EXPENSIVE
    1113             :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit);
    1114             : #else
    1115             :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
    1116             : #endif
    1117             : 
    1118   220471946 :     if (unlikely(length > limit))
    1119          18 :         report_invalid_encoding_db(mbstr, length, limit);
    1120             : 
    1121   220471928 :     return length;
    1122             : }
    1123             : 
    1124             : 
    1125             : /*
    1126             :  * Returns the length of a multibyte character sequence, without any
    1127             :  * validation of bounds.
    1128             :  *
    1129             :  * PLEASE NOTE:  This function can only be used safely if the caller has
    1130             :  * already verified the input string, since otherwise there is a risk of
    1131             :  * overrunning the buffer if the string is invalid.  A prior call to a
    1132             :  * pg_mbstrlen* function suffices.
    1133             :  */
    1134             : int
    1135    21399420 : pg_mblen_unbounded(const char *mbstr)
    1136             : {
    1137    21399420 :     int         length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1138             : 
    1139             :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
    1140             : 
    1141    21399420 :     return length;
    1142             : }
    1143             : 
    1144             : /*
    1145             :  * Historical name for pg_mblen_unbounded().  Should not be used and will be
    1146             :  * removed in a later version.
    1147             :  */
    1148             : int
    1149           0 : pg_mblen(const char *mbstr)
    1150             : {
    1151           0 :     return pg_mblen_unbounded(mbstr);
    1152             : }
    1153             : 
    1154             : /* returns the display length of a multibyte character */
    1155             : int
    1156        8724 : pg_dsplen(const char *mbstr)
    1157             : {
    1158        8724 :     return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
    1159             : }
    1160             : 
    1161             : /* returns the length (counted in wchars) of a multibyte string */
    1162             : int
    1163         702 : pg_mbstrlen(const char *mbstr)
    1164             : {
    1165         702 :     int         len = 0;
    1166             : 
    1167             :     /* optimization for single byte encoding */
    1168         702 :     if (pg_database_encoding_max_length() == 1)
    1169           0 :         return strlen(mbstr);
    1170             : 
    1171        1626 :     while (*mbstr)
    1172             :     {
    1173         924 :         mbstr += pg_mblen_cstr(mbstr);
    1174         924 :         len++;
    1175             :     }
    1176         702 :     return len;
    1177             : }
    1178             : 
    1179             : /* returns the length (counted in wchars) of a multibyte string
    1180             :  * (stops at the first of "limit" or a NUL)
    1181             :  */
    1182             : int
    1183     1610884 : pg_mbstrlen_with_len(const char *mbstr, int limit)
    1184             : {
    1185     1610884 :     int         len = 0;
    1186             : 
    1187             :     /* optimization for single byte encoding */
    1188     1610884 :     if (pg_database_encoding_max_length() == 1)
    1189      400014 :         return limit;
    1190             : 
    1191   221678832 :     while (limit > 0 && *mbstr)
    1192             :     {
    1193   220467974 :         int         l = pg_mblen_with_len(mbstr, limit);
    1194             : 
    1195   220467962 :         limit -= l;
    1196   220467962 :         mbstr += l;
    1197   220467962 :         len++;
    1198             :     }
    1199     1210858 :     return len;
    1200             : }
    1201             : 
    1202             : /*
    1203             :  * returns the byte length of a multibyte string
    1204             :  * (not necessarily NULL terminated)
    1205             :  * that is no longer than limit.
    1206             :  * this function does not break multibyte character boundary.
    1207             :  */
    1208             : int
    1209      330534 : pg_mbcliplen(const char *mbstr, int len, int limit)
    1210             : {
    1211      330534 :     return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
    1212             :                                  len, limit);
    1213             : }
    1214             : 
    1215             : /*
    1216             :  * pg_mbcliplen with specified encoding; string must be valid in encoding
    1217             :  */
    1218             : int
    1219      330534 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
    1220             :                       int len, int limit)
    1221             : {
    1222             :     mblen_converter mblen_fn;
    1223      330534 :     int         clen = 0;
    1224             :     int         l;
    1225             : 
    1226             :     /* optimization for single byte encoding */
    1227      330534 :     if (pg_encoding_max_length(encoding) == 1)
    1228       38490 :         return cliplen(mbstr, len, limit);
    1229             : 
    1230      292044 :     mblen_fn = pg_wchar_table[encoding].mblen;
    1231             : 
    1232     3219170 :     while (len > 0 && *mbstr)
    1233             :     {
    1234     3067838 :         l = (*mblen_fn) ((const unsigned char *) mbstr);
    1235     3067838 :         if ((clen + l) > limit)
    1236          94 :             break;
    1237     3067744 :         clen += l;
    1238     3067744 :         if (clen == limit)
    1239      140618 :             break;
    1240     2927126 :         len -= l;
    1241     2927126 :         mbstr += l;
    1242             :     }
    1243      292044 :     return clen;
    1244             : }
    1245             : 
    1246             : /*
    1247             :  * Similar to pg_mbcliplen except the limit parameter specifies the
    1248             :  * character length, not the byte length.
    1249             :  */
    1250             : int
    1251         528 : pg_mbcharcliplen(const char *mbstr, int len, int limit)
    1252             : {
    1253         528 :     int         clen = 0;
    1254         528 :     int         nch = 0;
    1255             :     int         l;
    1256             : 
    1257             :     /* optimization for single byte encoding */
    1258         528 :     if (pg_database_encoding_max_length() == 1)
    1259           0 :         return cliplen(mbstr, len, limit);
    1260             : 
    1261        2328 :     while (len > 0 && *mbstr)
    1262             :     {
    1263        2310 :         l = pg_mblen_with_len(mbstr, len);
    1264        2310 :         nch++;
    1265        2310 :         if (nch > limit)
    1266         510 :             break;
    1267        1800 :         clen += l;
    1268        1800 :         len -= l;
    1269        1800 :         mbstr += l;
    1270             :     }
    1271         528 :     return clen;
    1272             : }
    1273             : 
    1274             : /* mbcliplen for any single-byte encoding */
    1275             : static int
    1276       38490 : cliplen(const char *str, int len, int limit)
    1277             : {
    1278       38490 :     int         l = 0;
    1279             : 
    1280       38490 :     len = Min(len, limit);
    1281      294640 :     while (l < len && str[l])
    1282      256150 :         l++;
    1283       38490 :     return l;
    1284             : }
    1285             : 
    1286             : void
    1287       33912 : SetDatabaseEncoding(int encoding)
    1288             : {
    1289       33912 :     if (!PG_VALID_BE_ENCODING(encoding))
    1290           0 :         elog(ERROR, "invalid database encoding: %d", encoding);
    1291             : 
    1292       33912 :     DatabaseEncoding = &pg_enc2name_tbl[encoding];
    1293             :     Assert(DatabaseEncoding->encoding == encoding);
    1294       33912 : }
    1295             : 
    1296             : void
    1297       37890 : SetMessageEncoding(int encoding)
    1298             : {
    1299             :     /* Some calls happen before we can elog()! */
    1300             :     Assert(PG_VALID_ENCODING(encoding));
    1301             : 
    1302       37890 :     MessageEncoding = &pg_enc2name_tbl[encoding];
    1303             :     Assert(MessageEncoding->encoding == encoding);
    1304       37890 : }
    1305             : 
    1306             : #ifdef ENABLE_NLS
    1307             : /*
    1308             :  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
    1309             :  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
    1310             :  * fail for gettext-internal causes like out-of-memory.
    1311             :  */
    1312             : static bool
    1313        3268 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
    1314             : {
    1315        3268 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1316             : 
    1317        3268 :     if (!PG_VALID_ENCODING(encoding) || pg_enc2gettext_tbl[encoding] == NULL)
    1318           0 :         return false;
    1319             : 
    1320        3268 :     if (bind_textdomain_codeset(domainname,
    1321             :                                 pg_enc2gettext_tbl[encoding]) != NULL)
    1322        3268 :         return true;
    1323             : 
    1324           0 :     if (elog_ok)
    1325           0 :         elog(LOG, "bind_textdomain_codeset failed");
    1326             :     else
    1327           0 :         write_stderr("bind_textdomain_codeset failed");
    1328             : 
    1329           0 :     return false;
    1330             : }
    1331             : 
    1332             : /*
    1333             :  * Bind a gettext message domain to the codeset corresponding to the database
    1334             :  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
    1335             :  * Return the MessageEncoding implied by the new settings.
    1336             :  *
    1337             :  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
    1338             :  * When that matches the database encoding, we don't need to do anything.  In
    1339             :  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
    1340             :  * database encoding, except for the C locale.  (On Windows, we also permit a
    1341             :  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
    1342             :  * gettext to the right codeset.
    1343             :  *
    1344             :  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
    1345             :  * convenient departure for software that passes the strings to Windows ANSI
    1346             :  * APIs, but we don't do that.  Compel gettext to use database encoding or,
    1347             :  * failing that, the LC_CTYPE encoding as it would on other platforms.
    1348             :  *
    1349             :  * This function is called before elog() and palloc() are usable.
    1350             :  */
    1351             : int
    1352       41668 : pg_bind_textdomain_codeset(const char *domainname)
    1353             : {
    1354       41668 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1355       41668 :     int         encoding = GetDatabaseEncoding();
    1356             :     int         new_msgenc;
    1357             : 
    1358             : #ifndef WIN32
    1359       41668 :     const char *ctype = setlocale(LC_CTYPE, NULL);
    1360             : 
    1361       41668 :     if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
    1362             : #endif
    1363        7326 :         if (encoding != PG_SQL_ASCII &&
    1364        3268 :             raw_pg_bind_textdomain_codeset(domainname, encoding))
    1365        3268 :             return encoding;
    1366             : 
    1367       38400 :     new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
    1368       38400 :     if (new_msgenc < 0)
    1369           0 :         new_msgenc = PG_SQL_ASCII;
    1370             : 
    1371             : #ifdef WIN32
    1372             :     if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
    1373             :         /* On failure, the old message encoding remains valid. */
    1374             :         return GetMessageEncoding();
    1375             : #endif
    1376             : 
    1377       38400 :     return new_msgenc;
    1378             : }
    1379             : #endif
    1380             : 
    1381             : /*
    1382             :  * The database encoding, also called the server encoding, represents the
    1383             :  * encoding of data stored in text-like data types.  Affected types include
    1384             :  * cstring, text, varchar, name, xml, and json.
    1385             :  */
    1386             : int
    1387     8489002 : GetDatabaseEncoding(void)
    1388             : {
    1389     8489002 :     return DatabaseEncoding->encoding;
    1390             : }
    1391             : 
    1392             : const char *
    1393       71044 : GetDatabaseEncodingName(void)
    1394             : {
    1395       71044 :     return DatabaseEncoding->name;
    1396             : }
    1397             : 
    1398             : Datum
    1399         102 : getdatabaseencoding(PG_FUNCTION_ARGS)
    1400             : {
    1401         102 :     return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
    1402             : }
    1403             : 
    1404             : Datum
    1405           0 : pg_client_encoding(PG_FUNCTION_ARGS)
    1406             : {
    1407           0 :     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
    1408             : }
    1409             : 
    1410             : Datum
    1411          36 : PG_char_to_encoding(PG_FUNCTION_ARGS)
    1412             : {
    1413          36 :     Name        s = PG_GETARG_NAME(0);
    1414             : 
    1415          36 :     PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
    1416             : }
    1417             : 
    1418             : Datum
    1419        4910 : PG_encoding_to_char(PG_FUNCTION_ARGS)
    1420             : {
    1421        4910 :     int32       encoding = PG_GETARG_INT32(0);
    1422        4910 :     const char *encoding_name = pg_encoding_to_char(encoding);
    1423             : 
    1424        4910 :     return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
    1425             : }
    1426             : 
    1427             : /*
    1428             :  * gettext() returns messages in this encoding.  This often matches the
    1429             :  * database encoding, but it differs for SQL_ASCII databases, for processes
    1430             :  * not attached to a database, and under a database encoding lacking iconv
    1431             :  * support (MULE_INTERNAL).
    1432             :  */
    1433             : int
    1434           0 : GetMessageEncoding(void)
    1435             : {
    1436           0 :     return MessageEncoding->encoding;
    1437             : }
    1438             : 
    1439             : 
    1440             : /*
    1441             :  * Generic character incrementer function.
    1442             :  *
    1443             :  * Not knowing anything about the properties of the encoding in use, we just
    1444             :  * keep incrementing the last byte until we get a validly-encoded result,
    1445             :  * or we run out of values to try.  We don't bother to try incrementing
    1446             :  * higher-order bytes, so there's no growth in runtime for wider characters.
    1447             :  * (If we did try to do that, we'd need to consider the likelihood that 255
    1448             :  * is not a valid final byte in the encoding.)
    1449             :  */
    1450             : static bool
    1451         104 : pg_generic_charinc(unsigned char *charptr, int len)
    1452             : {
    1453         104 :     unsigned char *lastbyte = charptr + len - 1;
    1454             :     mbchar_verifier mbverify;
    1455             : 
    1456             :     /* We can just invoke the character verifier directly. */
    1457         104 :     mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
    1458             : 
    1459         104 :     while (*lastbyte < (unsigned char) 255)
    1460             :     {
    1461         104 :         (*lastbyte)++;
    1462         104 :         if ((*mbverify) (charptr, len) == len)
    1463         104 :             return true;
    1464             :     }
    1465             : 
    1466           0 :     return false;
    1467             : }
    1468             : 
    1469             : /*
    1470             :  * UTF-8 character incrementer function.
    1471             :  *
    1472             :  * For a one-byte character less than 0x7F, we just increment the byte.
    1473             :  *
    1474             :  * For a multibyte character, every byte but the first must fall between 0x80
    1475             :  * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
    1476             :  * the last byte that's not already at its maximum value.  If we can't find a
    1477             :  * byte that's less than the maximum allowable value, we simply fail.  We also
    1478             :  * need some special-case logic to skip regions used for surrogate pair
    1479             :  * handling, as those should not occur in valid UTF-8.
    1480             :  *
    1481             :  * Note that we don't reset lower-order bytes back to their minimums, since
    1482             :  * we can't afford to make an exhaustive search (see make_greater_string).
    1483             :  */
    1484             : static bool
    1485        3482 : pg_utf8_increment(unsigned char *charptr, int length)
    1486             : {
    1487             :     unsigned char a;
    1488             :     unsigned char limit;
    1489             : 
    1490        3482 :     switch (length)
    1491             :     {
    1492           0 :         default:
    1493             :             /* reject lengths 5 and 6 for now */
    1494           0 :             return false;
    1495           0 :         case 4:
    1496           0 :             a = charptr[3];
    1497           0 :             if (a < 0xBF)
    1498             :             {
    1499           0 :                 charptr[3]++;
    1500           0 :                 break;
    1501             :             }
    1502             :             /* FALL THRU */
    1503             :         case 3:
    1504           0 :             a = charptr[2];
    1505           0 :             if (a < 0xBF)
    1506             :             {
    1507           0 :                 charptr[2]++;
    1508           0 :                 break;
    1509             :             }
    1510             :             /* FALL THRU */
    1511             :         case 2:
    1512           0 :             a = charptr[1];
    1513           0 :             switch (*charptr)
    1514             :             {
    1515           0 :                 case 0xED:
    1516           0 :                     limit = 0x9F;
    1517           0 :                     break;
    1518           0 :                 case 0xF4:
    1519           0 :                     limit = 0x8F;
    1520           0 :                     break;
    1521           0 :                 default:
    1522           0 :                     limit = 0xBF;
    1523           0 :                     break;
    1524             :             }
    1525           0 :             if (a < limit)
    1526             :             {
    1527           0 :                 charptr[1]++;
    1528           0 :                 break;
    1529             :             }
    1530             :             /* FALL THRU */
    1531             :         case 1:
    1532        3482 :             a = *charptr;
    1533        3482 :             if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
    1534           0 :                 return false;
    1535        3482 :             charptr[0]++;
    1536        3482 :             break;
    1537             :     }
    1538             : 
    1539        3482 :     return true;
    1540             : }
    1541             : 
    1542             : /*
    1543             :  * EUC-JP character incrementer function.
    1544             :  *
    1545             :  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
    1546             :  * representing JIS X 0201 characters with the second byte ranging between
    1547             :  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
    1548             :  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
    1549             :  *
    1550             :  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
    1551             :  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
    1552             :  * is incremented if possible, otherwise the second-to-last byte.
    1553             :  *
    1554             :  * If the sequence starts with a value other than the above and its MSB
    1555             :  * is set, it must be a two-byte sequence representing JIS X 0208 characters
    1556             :  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
    1557             :  * incremented if possible, otherwise the second-to-last byte.
    1558             :  *
    1559             :  * Otherwise, the sequence is a single-byte ASCII character. It is
    1560             :  * incremented up to 0x7f.
    1561             :  */
    1562             : static bool
    1563           0 : pg_eucjp_increment(unsigned char *charptr, int length)
    1564             : {
    1565             :     unsigned char c1,
    1566             :                 c2;
    1567             :     int         i;
    1568             : 
    1569           0 :     c1 = *charptr;
    1570             : 
    1571           0 :     switch (c1)
    1572             :     {
    1573           0 :         case SS2:               /* JIS X 0201 */
    1574           0 :             if (length != 2)
    1575           0 :                 return false;
    1576             : 
    1577           0 :             c2 = charptr[1];
    1578             : 
    1579           0 :             if (c2 >= 0xdf)
    1580           0 :                 charptr[0] = charptr[1] = 0xa1;
    1581           0 :             else if (c2 < 0xa1)
    1582           0 :                 charptr[1] = 0xa1;
    1583             :             else
    1584           0 :                 charptr[1]++;
    1585           0 :             break;
    1586             : 
    1587           0 :         case SS3:               /* JIS X 0212 */
    1588           0 :             if (length != 3)
    1589           0 :                 return false;
    1590             : 
    1591           0 :             for (i = 2; i > 0; i--)
    1592             :             {
    1593           0 :                 c2 = charptr[i];
    1594           0 :                 if (c2 < 0xa1)
    1595             :                 {
    1596           0 :                     charptr[i] = 0xa1;
    1597           0 :                     return true;
    1598             :                 }
    1599           0 :                 else if (c2 < 0xfe)
    1600             :                 {
    1601           0 :                     charptr[i]++;
    1602           0 :                     return true;
    1603             :                 }
    1604             :             }
    1605             : 
    1606             :             /* Out of 3-byte code region */
    1607           0 :             return false;
    1608             : 
    1609           0 :         default:
    1610           0 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1611             :             {
    1612           0 :                 if (length != 2)
    1613           0 :                     return false;
    1614             : 
    1615           0 :                 for (i = 1; i >= 0; i--)
    1616             :                 {
    1617           0 :                     c2 = charptr[i];
    1618           0 :                     if (c2 < 0xa1)
    1619             :                     {
    1620           0 :                         charptr[i] = 0xa1;
    1621           0 :                         return true;
    1622             :                     }
    1623           0 :                     else if (c2 < 0xfe)
    1624             :                     {
    1625           0 :                         charptr[i]++;
    1626           0 :                         return true;
    1627             :                     }
    1628             :                 }
    1629             : 
    1630             :                 /* Out of 2 byte code region */
    1631           0 :                 return false;
    1632             :             }
    1633             :             else
    1634             :             {                   /* ASCII, single byte */
    1635           0 :                 if (c1 > 0x7e)
    1636           0 :                     return false;
    1637           0 :                 (*charptr)++;
    1638             :             }
    1639           0 :             break;
    1640             :     }
    1641             : 
    1642           0 :     return true;
    1643             : }
    1644             : 
    1645             : /*
    1646             :  * get the character incrementer for the encoding for the current database
    1647             :  */
    1648             : mbcharacter_incrementer
    1649        3586 : pg_database_encoding_character_incrementer(void)
    1650             : {
    1651             :     /*
    1652             :      * Eventually it might be best to add a field to pg_wchar_table[], but for
    1653             :      * now we just use a switch.
    1654             :      */
    1655        3586 :     switch (GetDatabaseEncoding())
    1656             :     {
    1657        3482 :         case PG_UTF8:
    1658        3482 :             return pg_utf8_increment;
    1659             : 
    1660           0 :         case PG_EUC_JP:
    1661           0 :             return pg_eucjp_increment;
    1662             : 
    1663         104 :         default:
    1664         104 :             return pg_generic_charinc;
    1665             :     }
    1666             : }
    1667             : 
    1668             : /*
    1669             :  * fetch maximum length of the encoding for the current database
    1670             :  */
    1671             : int
    1672     5998312 : pg_database_encoding_max_length(void)
    1673             : {
    1674     5998312 :     return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
    1675             : }
    1676             : 
    1677             : /*
    1678             :  * Verify mbstr to make sure that it is validly encoded in the current
    1679             :  * database encoding.  Otherwise same as pg_verify_mbstr().
    1680             :  */
    1681             : bool
    1682        4584 : pg_verifymbstr(const char *mbstr, int len, bool noError)
    1683             : {
    1684        4584 :     return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
    1685             : }
    1686             : 
    1687             : /*
    1688             :  * Verify mbstr to make sure that it is validly encoded in the specified
    1689             :  * encoding.
    1690             :  */
    1691             : bool
    1692     1193168 : pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
    1693             : {
    1694             :     int         oklen;
    1695             : 
    1696             :     Assert(PG_VALID_ENCODING(encoding));
    1697             : 
    1698     1193168 :     oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
    1699     1193168 :     if (oklen != len)
    1700             :     {
    1701          16 :         if (noError)
    1702           0 :             return false;
    1703          16 :         report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
    1704             :     }
    1705     1193152 :     return true;
    1706             : }
    1707             : 
    1708             : /*
    1709             :  * Verify mbstr to make sure that it is validly encoded in the specified
    1710             :  * encoding.
    1711             :  *
    1712             :  * mbstr is not necessarily zero terminated; length of mbstr is
    1713             :  * specified by len.
    1714             :  *
    1715             :  * If OK, return length of string in the encoding.
    1716             :  * If a problem is found, return -1 when noError is
    1717             :  * true; when noError is false, ereport() a descriptive message.
    1718             :  *
    1719             :  * Note: We cannot use the faster encoding-specific mbverifystr() function
    1720             :  * here, because we need to count the number of characters in the string.
    1721             :  */
    1722             : int
    1723           0 : pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
    1724             : {
    1725             :     mbchar_verifier mbverifychar;
    1726             :     int         mb_len;
    1727             : 
    1728             :     Assert(PG_VALID_ENCODING(encoding));
    1729             : 
    1730             :     /*
    1731             :      * In single-byte encodings, we need only reject nulls (\0).
    1732             :      */
    1733           0 :     if (pg_encoding_max_length(encoding) <= 1)
    1734             :     {
    1735           0 :         const char *nullpos = memchr(mbstr, 0, len);
    1736             : 
    1737           0 :         if (nullpos == NULL)
    1738           0 :             return len;
    1739           0 :         if (noError)
    1740           0 :             return -1;
    1741           0 :         report_invalid_encoding(encoding, nullpos, 1);
    1742             :     }
    1743             : 
    1744             :     /* fetch function pointer just once */
    1745           0 :     mbverifychar = pg_wchar_table[encoding].mbverifychar;
    1746             : 
    1747           0 :     mb_len = 0;
    1748             : 
    1749           0 :     while (len > 0)
    1750             :     {
    1751             :         int         l;
    1752             : 
    1753             :         /* fast path for ASCII-subset characters */
    1754           0 :         if (!IS_HIGHBIT_SET(*mbstr))
    1755             :         {
    1756           0 :             if (*mbstr != '\0')
    1757             :             {
    1758           0 :                 mb_len++;
    1759           0 :                 mbstr++;
    1760           0 :                 len--;
    1761           0 :                 continue;
    1762             :             }
    1763           0 :             if (noError)
    1764           0 :                 return -1;
    1765           0 :             report_invalid_encoding(encoding, mbstr, len);
    1766             :         }
    1767             : 
    1768           0 :         l = (*mbverifychar) ((const unsigned char *) mbstr, len);
    1769             : 
    1770           0 :         if (l < 0)
    1771             :         {
    1772           0 :             if (noError)
    1773           0 :                 return -1;
    1774           0 :             report_invalid_encoding(encoding, mbstr, len);
    1775             :         }
    1776             : 
    1777           0 :         mbstr += l;
    1778           0 :         len -= l;
    1779           0 :         mb_len++;
    1780             :     }
    1781           0 :     return mb_len;
    1782             : }
    1783             : 
    1784             : /*
    1785             :  * check_encoding_conversion_args: check arguments of a conversion function
    1786             :  *
    1787             :  * "expected" arguments can be either an encoding ID or -1 to indicate that
    1788             :  * the caller will check whether it accepts the ID.
    1789             :  *
    1790             :  * Note: the errors here are not really user-facing, so elog instead of
    1791             :  * ereport seems sufficient.  Also, we trust that the "expected" encoding
    1792             :  * arguments are valid encoding IDs, but we don't trust the actuals.
    1793             :  */
    1794             : void
    1795        7130 : check_encoding_conversion_args(int src_encoding,
    1796             :                                int dest_encoding,
    1797             :                                int len,
    1798             :                                int expected_src_encoding,
    1799             :                                int expected_dest_encoding)
    1800             : {
    1801        7130 :     if (!PG_VALID_ENCODING(src_encoding))
    1802           0 :         elog(ERROR, "invalid source encoding ID: %d", src_encoding);
    1803        7130 :     if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
    1804           0 :         elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
    1805             :              pg_enc2name_tbl[expected_src_encoding].name,
    1806             :              pg_enc2name_tbl[src_encoding].name);
    1807        7130 :     if (!PG_VALID_ENCODING(dest_encoding))
    1808           0 :         elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
    1809        7130 :     if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
    1810           0 :         elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
    1811             :              pg_enc2name_tbl[expected_dest_encoding].name,
    1812             :              pg_enc2name_tbl[dest_encoding].name);
    1813        7130 :     if (len < 0)
    1814           0 :         elog(ERROR, "encoding conversion length must not be negative");
    1815        7130 : }
    1816             : 
    1817             : /*
    1818             :  * report_invalid_encoding: complain about invalid multibyte character
    1819             :  *
    1820             :  * note: len is remaining length of string, not length of character;
    1821             :  * len must be greater than zero (or we'd neglect initializing "buf").
    1822             :  */
    1823             : void
    1824        2998 : report_invalid_encoding(int encoding, const char *mbstr, int len)
    1825             : {
    1826        2998 :     int         l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
    1827             : 
    1828        2998 :     report_invalid_encoding_int(encoding, mbstr, l, len);
    1829             : }
    1830             : 
    1831             : static void
    1832        3034 : report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
    1833             : {
    1834             :     char        buf[8 * 5 + 1];
    1835        3034 :     char       *p = buf;
    1836             :     int         j,
    1837             :                 jlimit;
    1838             : 
    1839        3034 :     jlimit = Min(mblen, len);
    1840        3034 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1841             : 
    1842        9320 :     for (j = 0; j < jlimit; j++)
    1843             :     {
    1844        6286 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    1845        6286 :         if (j < jlimit - 1)
    1846        3252 :             p += sprintf(p, " ");
    1847             :     }
    1848             : 
    1849        3034 :     ereport(ERROR,
    1850             :             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
    1851             :              errmsg("invalid byte sequence for encoding \"%s\": %s",
    1852             :                     pg_enc2name_tbl[encoding].name,
    1853             :                     buf)));
    1854             : }
    1855             : 
    1856             : static void
    1857          36 : report_invalid_encoding_db(const char *mbstr, int mblen, int len)
    1858             : {
    1859          36 :     report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len);
    1860             : }
    1861             : 
    1862             : /*
    1863             :  * report_untranslatable_char: complain about untranslatable character
    1864             :  *
    1865             :  * note: len is remaining length of string, not length of character;
    1866             :  * len must be greater than zero (or we'd neglect initializing "buf").
    1867             :  */
    1868             : void
    1869         936 : report_untranslatable_char(int src_encoding, int dest_encoding,
    1870             :                            const char *mbstr, int len)
    1871             : {
    1872             :     int         l;
    1873             :     char        buf[8 * 5 + 1];
    1874         936 :     char       *p = buf;
    1875             :     int         j,
    1876             :                 jlimit;
    1877             : 
    1878             :     /*
    1879             :      * We probably could use plain pg_encoding_mblen(), because
    1880             :      * gb18030_to_utf8() verifies before it converts.  All conversions should.
    1881             :      * For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs.  Even
    1882             :      * so, be defensive, since a buggy conversion might pass invalid data.
    1883             :      * This is not a performance-critical path.
    1884             :      */
    1885         936 :     l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len);
    1886         936 :     jlimit = Min(l, len);
    1887         936 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1888             : 
    1889        3528 :     for (j = 0; j < jlimit; j++)
    1890             :     {
    1891        2592 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    1892        2592 :         if (j < jlimit - 1)
    1893        1656 :             p += sprintf(p, " ");
    1894             :     }
    1895             : 
    1896         936 :     ereport(ERROR,
    1897             :             (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
    1898             :              errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
    1899             :                     buf,
    1900             :                     pg_enc2name_tbl[src_encoding].name,
    1901             :                     pg_enc2name_tbl[dest_encoding].name)));
    1902             : }
    1903             : 
    1904             : 
    1905             : #ifdef WIN32
    1906             : /*
    1907             :  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
    1908             :  * string. The character length is also passed to utf16len if not
    1909             :  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
    1910             :  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
    1911             :  */
    1912             : WCHAR *
    1913             : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
    1914             : {
    1915             :     int         msgenc = GetMessageEncoding();
    1916             :     WCHAR      *utf16;
    1917             :     int         dstlen;
    1918             :     UINT        codepage;
    1919             : 
    1920             :     if (msgenc == PG_SQL_ASCII)
    1921             :         /* No conversion is possible, and SQL_ASCII is never utf16. */
    1922             :         return NULL;
    1923             : 
    1924             :     codepage = pg_enc2name_tbl[msgenc].codepage;
    1925             : 
    1926             :     /*
    1927             :      * Use MultiByteToWideChar directly if there is a corresponding codepage,
    1928             :      * or double conversion through UTF8 if not.  Double conversion is needed,
    1929             :      * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
    1930             :      */
    1931             :     if (codepage != 0)
    1932             :     {
    1933             :         utf16 = palloc_array(WCHAR, len + 1);
    1934             :         dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
    1935             :         utf16[dstlen] = (WCHAR) 0;
    1936             :     }
    1937             :     else
    1938             :     {
    1939             :         char       *utf8;
    1940             : 
    1941             :         /*
    1942             :          * XXX pg_do_encoding_conversion() requires a transaction.  In the
    1943             :          * absence of one, hope for the input to be valid UTF8.
    1944             :          */
    1945             :         if (IsTransactionState())
    1946             :         {
    1947             :             utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
    1948             :                                                       len,
    1949             :                                                       msgenc,
    1950             :                                                       PG_UTF8);
    1951             :             if (utf8 != str)
    1952             :                 len = strlen(utf8);
    1953             :         }
    1954             :         else
    1955             :             utf8 = (char *) str;
    1956             : 
    1957             :         utf16 = palloc_array(WCHAR, len + 1);
    1958             :         dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
    1959             :         utf16[dstlen] = (WCHAR) 0;
    1960             : 
    1961             :         if (utf8 != str)
    1962             :             pfree(utf8);
    1963             :     }
    1964             : 
    1965             :     if (dstlen == 0 && len > 0)
    1966             :     {
    1967             :         pfree(utf16);
    1968             :         return NULL;            /* error */
    1969             :     }
    1970             : 
    1971             :     if (utf16len)
    1972             :         *utf16len = dstlen;
    1973             :     return utf16;
    1974             : }
    1975             : 
    1976             : #endif                          /* WIN32 */

Generated by: LCOV version 1.16