LCOV - code coverage report
Current view: top level - src/backend/utils/mb - mbutils.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 65.6 % 555 364
Test Date: 2026-03-03 06:14:53 Functions: 83.6 % 61 51
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * mbutils.c
       4              :  *    This file contains functions for encoding conversion.
       5              :  *
       6              :  * The string-conversion functions in this file share some API quirks.
       7              :  * Note the following:
       8              :  *
       9              :  * The functions return a palloc'd, null-terminated string if conversion
      10              :  * is required.  However, if no conversion is performed, the given source
      11              :  * string pointer is returned as-is.
      12              :  *
      13              :  * Although the presence of a length argument means that callers can pass
      14              :  * non-null-terminated strings, care is required because the same string
      15              :  * will be passed back if no conversion occurs.  Such callers *must* check
      16              :  * whether result == src and handle that case differently.
      17              :  *
      18              :  * If the source and destination encodings are the same, the source string
      19              :  * is returned without any verification; it's assumed to be valid data.
      20              :  * If that might not be the case, the caller is responsible for validating
      21              :  * the string using a separate call to pg_verify_mbstr().  Whenever the
      22              :  * source and destination encodings are different, the functions ensure that
      23              :  * the result is validly encoded according to the destination encoding.
      24              :  *
      25              :  *
      26              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      27              :  * Portions Copyright (c) 1994, Regents of the University of California
      28              :  *
      29              :  *
      30              :  * IDENTIFICATION
      31              :  *    src/backend/utils/mb/mbutils.c
      32              :  *
      33              :  *-------------------------------------------------------------------------
      34              :  */
      35              : #include "postgres.h"
      36              : 
      37              : #include "access/xact.h"
      38              : #include "catalog/namespace.h"
      39              : #include "mb/pg_wchar.h"
      40              : #include "utils/fmgrprotos.h"
      41              : #include "utils/memdebug.h"
      42              : #include "utils/memutils.h"
      43              : #include "utils/relcache.h"
      44              : #include "varatt.h"
      45              : 
      46              : /*
      47              :  * We maintain a simple linked list caching the fmgr lookup info for the
      48              :  * currently selected conversion functions, as well as any that have been
      49              :  * selected previously in the current session.  (We remember previous
      50              :  * settings because we must be able to restore a previous setting during
      51              :  * transaction rollback, without doing any fresh catalog accesses.)
      52              :  *
      53              :  * Since we'll never release this data, we just keep it in TopMemoryContext.
      54              :  */
      55              : typedef struct ConvProcInfo
      56              : {
      57              :     int         s_encoding;     /* server and client encoding IDs */
      58              :     int         c_encoding;
      59              :     FmgrInfo    to_server_info; /* lookup info for conversion procs */
      60              :     FmgrInfo    to_client_info;
      61              : } ConvProcInfo;
      62              : 
      63              : static List *ConvProcList = NIL;    /* List of ConvProcInfo */
      64              : 
      65              : /*
      66              :  * These variables point to the currently active conversion functions,
      67              :  * or are NULL when no conversion is needed.
      68              :  */
      69              : static FmgrInfo *ToServerConvProc = NULL;
      70              : static FmgrInfo *ToClientConvProc = NULL;
      71              : 
      72              : /*
      73              :  * This variable stores the conversion function to convert from UTF-8
      74              :  * to the server encoding.  It's NULL if the server encoding *is* UTF-8,
      75              :  * or if we lack a conversion function for this.
      76              :  */
      77              : static FmgrInfo *Utf8ToServerConvProc = NULL;
      78              : 
      79              : /*
      80              :  * These variables track the currently-selected encodings.
      81              :  */
      82              : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      83              : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      84              : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      85              : 
      86              : /*
      87              :  * During backend startup we can't set client encoding because we (a)
      88              :  * can't look up the conversion functions, and (b) may not know the database
      89              :  * encoding yet either.  So SetClientEncoding() just accepts anything and
      90              :  * remembers it for InitializeClientEncoding() to apply later.
      91              :  */
      92              : static bool backend_startup_complete = false;
      93              : static int  pending_client_encoding = PG_SQL_ASCII;
      94              : 
      95              : 
      96              : /* Internal functions */
      97              : static char *perform_default_encoding_conversion(const char *src,
      98              :                                                  int len, bool is_client_to_server);
      99              : static int  cliplen(const char *str, int len, int limit);
     100              : 
     101              : pg_noreturn
     102              : static void report_invalid_encoding_int(int encoding, const char *mbstr,
     103              :                                         int mblen, int len);
     104              : 
     105              : pg_noreturn
     106              : static void report_invalid_encoding_db(const char *mbstr, int mblen, int len);
     107              : 
     108              : 
     109              : /*
     110              :  * Prepare for a future call to SetClientEncoding.  Success should mean
     111              :  * that SetClientEncoding is guaranteed to succeed for this encoding request.
     112              :  *
     113              :  * (But note that success before backend_startup_complete does not guarantee
     114              :  * success after ...)
     115              :  *
     116              :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     117              :  */
     118              : int
     119        36865 : PrepareClientEncoding(int encoding)
     120              : {
     121              :     int         current_server_encoding;
     122              :     ListCell   *lc;
     123              : 
     124        36865 :     if (!PG_VALID_FE_ENCODING(encoding))
     125            0 :         return -1;
     126              : 
     127              :     /* Can't do anything during startup, per notes above */
     128        36865 :     if (!backend_startup_complete)
     129        18626 :         return 0;
     130              : 
     131        18239 :     current_server_encoding = GetDatabaseEncoding();
     132              : 
     133              :     /*
     134              :      * Check for cases that require no conversion function.
     135              :      */
     136        18239 :     if (current_server_encoding == encoding ||
     137         1499 :         current_server_encoding == PG_SQL_ASCII ||
     138              :         encoding == PG_SQL_ASCII)
     139        18229 :         return 0;
     140              : 
     141           10 :     if (IsTransactionState())
     142              :     {
     143              :         /*
     144              :          * If we're in a live transaction, it's safe to access the catalogs,
     145              :          * so look up the functions.  We repeat the lookup even if the info is
     146              :          * already cached, so that we can react to changes in the contents of
     147              :          * pg_conversion.
     148              :          */
     149              :         Oid         to_server_proc,
     150              :                     to_client_proc;
     151              :         ConvProcInfo *convinfo;
     152              :         MemoryContext oldcontext;
     153              : 
     154           10 :         to_server_proc = FindDefaultConversionProc(encoding,
     155              :                                                    current_server_encoding);
     156           10 :         if (!OidIsValid(to_server_proc))
     157            0 :             return -1;
     158           10 :         to_client_proc = FindDefaultConversionProc(current_server_encoding,
     159              :                                                    encoding);
     160           10 :         if (!OidIsValid(to_client_proc))
     161            0 :             return -1;
     162              : 
     163              :         /*
     164              :          * Load the fmgr info into TopMemoryContext (could still fail here)
     165              :          */
     166           10 :         convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
     167              :                                                        sizeof(ConvProcInfo));
     168           10 :         convinfo->s_encoding = current_server_encoding;
     169           10 :         convinfo->c_encoding = encoding;
     170           10 :         fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
     171              :                       TopMemoryContext);
     172           10 :         fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
     173              :                       TopMemoryContext);
     174              : 
     175              :         /* Attach new info to head of list */
     176           10 :         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
     177           10 :         ConvProcList = lcons(convinfo, ConvProcList);
     178           10 :         MemoryContextSwitchTo(oldcontext);
     179              : 
     180              :         /*
     181              :          * We cannot yet remove any older entry for the same encoding pair,
     182              :          * since it could still be in use.  SetClientEncoding will clean up.
     183              :          */
     184              : 
     185           10 :         return 0;               /* success */
     186              :     }
     187              :     else
     188              :     {
     189              :         /*
     190              :          * If we're not in a live transaction, the only thing we can do is
     191              :          * restore a previous setting using the cache.  This covers all
     192              :          * transaction-rollback cases.  The only case it might not work for is
     193              :          * trying to change client_encoding on the fly by editing
     194              :          * postgresql.conf and SIGHUP'ing, which would probably be a stupid
     195              :          * thing to do anyway.
     196              :          */
     197            0 :         foreach(lc, ConvProcList)
     198              :         {
     199            0 :             ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
     200              : 
     201            0 :             if (oldinfo->s_encoding == current_server_encoding &&
     202            0 :                 oldinfo->c_encoding == encoding)
     203            0 :                 return 0;
     204              :         }
     205              : 
     206            0 :         return -1;              /* it's not cached, so fail */
     207              :     }
     208              : }
     209              : 
     210              : /*
     211              :  * Set the active client encoding and set up the conversion-function pointers.
     212              :  * PrepareClientEncoding should have been called previously for this encoding.
     213              :  *
     214              :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     215              :  */
     216              : int
     217        38257 : SetClientEncoding(int encoding)
     218              : {
     219              :     int         current_server_encoding;
     220              :     bool        found;
     221              :     ListCell   *lc;
     222              : 
     223        38257 :     if (!PG_VALID_FE_ENCODING(encoding))
     224            0 :         return -1;
     225              : 
     226              :     /* Can't do anything during startup, per notes above */
     227        38257 :     if (!backend_startup_complete)
     228              :     {
     229        18544 :         pending_client_encoding = encoding;
     230        18544 :         return 0;
     231              :     }
     232              : 
     233        19713 :     current_server_encoding = GetDatabaseEncoding();
     234              : 
     235              :     /*
     236              :      * Check for cases that require no conversion function.
     237              :      */
     238        19713 :     if (current_server_encoding == encoding ||
     239         1499 :         current_server_encoding == PG_SQL_ASCII ||
     240              :         encoding == PG_SQL_ASCII)
     241              :     {
     242        19703 :         ClientEncoding = &pg_enc2name_tbl[encoding];
     243        19703 :         ToServerConvProc = NULL;
     244        19703 :         ToClientConvProc = NULL;
     245        19703 :         return 0;
     246              :     }
     247              : 
     248              :     /*
     249              :      * Search the cache for the entry previously prepared by
     250              :      * PrepareClientEncoding; if there isn't one, we lose.  While at it,
     251              :      * release any duplicate entries so that repeated Prepare/Set cycles don't
     252              :      * leak memory.
     253              :      */
     254           10 :     found = false;
     255           23 :     foreach(lc, ConvProcList)
     256              :     {
     257           13 :         ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
     258              : 
     259           13 :         if (convinfo->s_encoding == current_server_encoding &&
     260           13 :             convinfo->c_encoding == encoding)
     261              :         {
     262           10 :             if (!found)
     263              :             {
     264              :                 /* Found newest entry, so set up */
     265           10 :                 ClientEncoding = &pg_enc2name_tbl[encoding];
     266           10 :                 ToServerConvProc = &convinfo->to_server_info;
     267           10 :                 ToClientConvProc = &convinfo->to_client_info;
     268           10 :                 found = true;
     269              :             }
     270              :             else
     271              :             {
     272              :                 /* Duplicate entry, release it */
     273            0 :                 ConvProcList = foreach_delete_current(ConvProcList, lc);
     274            0 :                 pfree(convinfo);
     275              :             }
     276              :         }
     277              :     }
     278              : 
     279           10 :     if (found)
     280           10 :         return 0;               /* success */
     281              :     else
     282            0 :         return -1;              /* it's not cached, so fail */
     283              : }
     284              : 
     285              : /*
     286              :  * Initialize client encoding conversions.
     287              :  *      Called from InitPostgres() once during backend startup.
     288              :  */
     289              : void
     290        17885 : InitializeClientEncoding(void)
     291              : {
     292              :     int         current_server_encoding;
     293              : 
     294              :     Assert(!backend_startup_complete);
     295        17885 :     backend_startup_complete = true;
     296              : 
     297        35770 :     if (PrepareClientEncoding(pending_client_encoding) < 0 ||
     298        17885 :         SetClientEncoding(pending_client_encoding) < 0)
     299              :     {
     300              :         /*
     301              :          * Oops, the requested conversion is not available. We couldn't fail
     302              :          * before, but we can now.
     303              :          */
     304            0 :         ereport(FATAL,
     305              :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     306              :                  errmsg("conversion between %s and %s is not supported",
     307              :                         pg_enc2name_tbl[pending_client_encoding].name,
     308              :                         GetDatabaseEncodingName())));
     309              :     }
     310              : 
     311              :     /*
     312              :      * Also look up the UTF8-to-server conversion function if needed.  Since
     313              :      * the server encoding is fixed within any one backend process, we don't
     314              :      * have to do this more than once.
     315              :      */
     316        17885 :     current_server_encoding = GetDatabaseEncoding();
     317        17885 :     if (current_server_encoding != PG_UTF8 &&
     318              :         current_server_encoding != PG_SQL_ASCII)
     319              :     {
     320              :         Oid         utf8_to_server_proc;
     321              : 
     322           93 :         AssertCouldGetRelation();
     323              :         utf8_to_server_proc =
     324           93 :             FindDefaultConversionProc(PG_UTF8,
     325              :                                       current_server_encoding);
     326              :         /* If there's no such conversion, just leave the pointer as NULL */
     327           93 :         if (OidIsValid(utf8_to_server_proc))
     328              :         {
     329              :             FmgrInfo   *finfo;
     330              : 
     331           93 :             finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
     332              :                                                     sizeof(FmgrInfo));
     333           93 :             fmgr_info_cxt(utf8_to_server_proc, finfo,
     334              :                           TopMemoryContext);
     335              :             /* Set Utf8ToServerConvProc only after data is fully valid */
     336           93 :             Utf8ToServerConvProc = finfo;
     337              :         }
     338              :     }
     339        17885 : }
     340              : 
     341              : /*
     342              :  * returns the encoding ID of the current client encoding
     343              :  */
     344              : int
     345         5705 : pg_get_client_encoding(void)
     346              : {
     347         5705 :     return ClientEncoding->encoding;
     348              : }
     349              : 
     350              : /*
     351              :  * returns the current client encoding name
     352              :  */
     353              : const char *
     354            0 : pg_get_client_encoding_name(void)
     355              : {
     356            0 :     return ClientEncoding->name;
     357              : }
     358              : 
     359              : /*
     360              :  * Convert src string to another encoding (general case).
     361              :  *
     362              :  * See the notes about string conversion functions at the top of this file.
     363              :  */
     364              : unsigned char *
     365         1525 : pg_do_encoding_conversion(unsigned char *src, int len,
     366              :                           int src_encoding, int dest_encoding)
     367              : {
     368              :     unsigned char *result;
     369              :     Oid         proc;
     370              : 
     371         1525 :     if (len <= 0)
     372           18 :         return src;             /* empty string is always valid */
     373              : 
     374         1507 :     if (src_encoding == dest_encoding)
     375         1100 :         return src;             /* no conversion required, assume valid */
     376              : 
     377          407 :     if (dest_encoding == PG_SQL_ASCII)
     378            0 :         return src;             /* any string is valid in SQL_ASCII */
     379              : 
     380          407 :     if (src_encoding == PG_SQL_ASCII)
     381              :     {
     382              :         /* No conversion is possible, but we must validate the result */
     383            8 :         (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
     384            8 :         return src;
     385              :     }
     386              : 
     387          399 :     if (!IsTransactionState())  /* shouldn't happen */
     388            0 :         elog(ERROR, "cannot perform encoding conversion outside a transaction");
     389              : 
     390          399 :     proc = FindDefaultConversionProc(src_encoding, dest_encoding);
     391          399 :     if (!OidIsValid(proc))
     392            0 :         ereport(ERROR,
     393              :                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
     394              :                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
     395              :                         pg_encoding_to_char(src_encoding),
     396              :                         pg_encoding_to_char(dest_encoding))));
     397              : 
     398              :     /*
     399              :      * Allocate space for conversion result, being wary of integer overflow.
     400              :      *
     401              :      * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
     402              :      * required space, so it might exceed MaxAllocSize even though the result
     403              :      * would actually fit.  We do not want to hand back a result string that
     404              :      * exceeds MaxAllocSize, because callers might not cope gracefully --- but
     405              :      * if we just allocate more than that, and don't use it, that's fine.
     406              :      */
     407          399 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     408            0 :         ereport(ERROR,
     409              :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     410              :                  errmsg("out of memory"),
     411              :                  errdetail("String of %d bytes is too long for encoding conversion.",
     412              :                            len)));
     413              : 
     414              :     result = (unsigned char *)
     415          399 :         MemoryContextAllocHuge(CurrentMemoryContext,
     416          399 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);
     417              : 
     418          399 :     (void) OidFunctionCall6(proc,
     419              :                             Int32GetDatum(src_encoding),
     420              :                             Int32GetDatum(dest_encoding),
     421              :                             CStringGetDatum((char *) src),
     422              :                             CStringGetDatum((char *) result),
     423              :                             Int32GetDatum(len),
     424              :                             BoolGetDatum(false));
     425              : 
     426              :     /*
     427              :      * If the result is large, it's worth repalloc'ing to release any extra
     428              :      * space we asked for.  The cutoff here is somewhat arbitrary, but we
     429              :      * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
     430              :      */
     431          399 :     if (len > 1000000)
     432              :     {
     433            0 :         Size        resultlen = strlen((char *) result);
     434              : 
     435            0 :         if (resultlen >= MaxAllocSize)
     436            0 :             ereport(ERROR,
     437              :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     438              :                      errmsg("out of memory"),
     439              :                      errdetail("String of %d bytes is too long for encoding conversion.",
     440              :                                len)));
     441              : 
     442            0 :         result = (unsigned char *) repalloc(result, resultlen + 1);
     443              :     }
     444              : 
     445          399 :     return result;
     446              : }
     447              : 
     448              : /*
     449              :  * Convert src string to another encoding.
     450              :  *
     451              :  * This function has a different API than the other conversion functions.
     452              :  * The caller should've looked up the conversion function using
     453              :  * FindDefaultConversionProc().  Unlike the other functions, the converted
     454              :  * result is not palloc'd.  It is written to the caller-supplied buffer
     455              :  * instead.
     456              :  *
     457              :  * src_encoding   - encoding to convert from
     458              :  * dest_encoding  - encoding to convert to
     459              :  * src, srclen    - input buffer and its length in bytes
     460              :  * dest, destlen  - destination buffer and its size in bytes
     461              :  *
     462              :  * The output is null-terminated.
     463              :  *
     464              :  * If destlen < srclen * MAX_CONVERSION_GROWTH + 1, the converted output
     465              :  * wouldn't necessarily fit in the output buffer, and the function will not
     466              :  * convert the whole input.
     467              :  *
     468              :  * TODO: The conversion function interface is not great.  Firstly, it
     469              :  * would be nice to pass through the destination buffer size to the
     470              :  * conversion function, so that if you pass a shorter destination buffer, it
     471              :  * could still continue to fill up the whole buffer.  Currently, we have to
     472              :  * assume worst case expansion and stop the conversion short, even if there
     473              :  * is in fact space left in the destination buffer.  Secondly, it would be
     474              :  * nice to return the number of bytes written to the caller, to avoid a call
     475              :  * to strlen().
     476              :  */
     477              : int
     478         2910 : pg_do_encoding_conversion_buf(Oid proc,
     479              :                               int src_encoding,
     480              :                               int dest_encoding,
     481              :                               unsigned char *src, int srclen,
     482              :                               unsigned char *dest, int destlen,
     483              :                               bool noError)
     484              : {
     485              :     Datum       result;
     486              : 
     487              :     /*
     488              :      * If the destination buffer is not large enough to hold the result in the
     489              :      * worst case, limit the input size passed to the conversion function.
     490              :      */
     491         2910 :     if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
     492         2880 :         srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
     493              : 
     494         2910 :     result = OidFunctionCall6(proc,
     495              :                               Int32GetDatum(src_encoding),
     496              :                               Int32GetDatum(dest_encoding),
     497              :                               CStringGetDatum((char *) src),
     498              :                               CStringGetDatum((char *) dest),
     499              :                               Int32GetDatum(srclen),
     500              :                               BoolGetDatum(noError));
     501         1725 :     return DatumGetInt32(result);
     502              : }
     503              : 
     504              : /*
     505              :  * Convert string to encoding encoding_name. The source
     506              :  * encoding is the DB encoding.
     507              :  *
     508              :  * BYTEA convert_to(TEXT string, NAME encoding_name)
     509              :  */
     510              : Datum
     511          204 : pg_convert_to(PG_FUNCTION_ARGS)
     512              : {
     513          204 :     Datum       string = PG_GETARG_DATUM(0);
     514          204 :     Datum       dest_encoding_name = PG_GETARG_DATUM(1);
     515          204 :     Datum       src_encoding_name = DirectFunctionCall1(namein,
     516              :                                                         CStringGetDatum(DatabaseEncoding->name));
     517              :     Datum       result;
     518              : 
     519              :     /*
     520              :      * pg_convert expects a bytea as its first argument. We're passing it a
     521              :      * text argument here, relying on the fact that they are both in fact
     522              :      * varlena types, and thus structurally identical.
     523              :      */
     524          204 :     result = DirectFunctionCall3(pg_convert, string,
     525              :                                  src_encoding_name, dest_encoding_name);
     526              : 
     527          201 :     PG_RETURN_DATUM(result);
     528              : }
     529              : 
     530              : /*
     531              :  * Convert string from encoding encoding_name. The destination
     532              :  * encoding is the DB encoding.
     533              :  *
     534              :  * TEXT convert_from(BYTEA string, NAME encoding_name)
     535              :  */
     536              : Datum
     537          296 : pg_convert_from(PG_FUNCTION_ARGS)
     538              : {
     539          296 :     Datum       string = PG_GETARG_DATUM(0);
     540          296 :     Datum       src_encoding_name = PG_GETARG_DATUM(1);
     541          296 :     Datum       dest_encoding_name = DirectFunctionCall1(namein,
     542              :                                                          CStringGetDatum(DatabaseEncoding->name));
     543              :     Datum       result;
     544              : 
     545          296 :     result = DirectFunctionCall3(pg_convert, string,
     546              :                                  src_encoding_name, dest_encoding_name);
     547              : 
     548              :     /*
     549              :      * pg_convert returns a bytea, which we in turn return as text, relying on
     550              :      * the fact that they are both in fact varlena types, and thus
     551              :      * structurally identical. Although not all bytea values are valid text,
     552              :      * in this case it will be because we've told pg_convert to return one
     553              :      * that is valid as text in the current database encoding.
     554              :      */
     555          293 :     PG_RETURN_DATUM(result);
     556              : }
     557              : 
     558              : /*
     559              :  * Convert string between two arbitrary encodings.
     560              :  *
     561              :  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
     562              :  */
     563              : Datum
     564          884 : pg_convert(PG_FUNCTION_ARGS)
     565              : {
     566          884 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     567          884 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     568          884 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     569          884 :     char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
     570          884 :     int         dest_encoding = pg_char_to_encoding(dest_encoding_name);
     571              :     const char *src_str;
     572              :     char       *dest_str;
     573              :     bytea      *retval;
     574              :     int         len;
     575              : 
     576          884 :     if (src_encoding < 0)
     577            0 :         ereport(ERROR,
     578              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     579              :                  errmsg("invalid source encoding name \"%s\"",
     580              :                         src_encoding_name)));
     581          884 :     if (dest_encoding < 0)
     582            0 :         ereport(ERROR,
     583              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     584              :                  errmsg("invalid destination encoding name \"%s\"",
     585              :                         dest_encoding_name)));
     586              : 
     587              :     /* make sure that source string is valid */
     588          884 :     len = VARSIZE_ANY_EXHDR(string);
     589          884 :     src_str = VARDATA_ANY(string);
     590          884 :     (void) pg_verify_mbstr(src_encoding, src_str, len, false);
     591              : 
     592              :     /* perform conversion */
     593          878 :     dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
     594              :                                                   len,
     595              :                                                   src_encoding,
     596              :                                                   dest_encoding);
     597              : 
     598              : 
     599              :     /* return source string if no conversion happened */
     600          878 :     if (dest_str == src_str)
     601          488 :         PG_RETURN_BYTEA_P(string);
     602              : 
     603              :     /*
     604              :      * build bytea data type structure.
     605              :      */
     606          390 :     len = strlen(dest_str);
     607          390 :     retval = (bytea *) palloc(len + VARHDRSZ);
     608          390 :     SET_VARSIZE(retval, len + VARHDRSZ);
     609          390 :     memcpy(VARDATA(retval), dest_str, len);
     610          390 :     pfree(dest_str);
     611              : 
     612              :     /* free memory if allocated by the toaster */
     613          390 :     PG_FREE_IF_COPY(string, 0);
     614              : 
     615          390 :     PG_RETURN_BYTEA_P(retval);
     616              : }
     617              : 
     618              : /*
     619              :  * get the length of the string considered as text in the specified
     620              :  * encoding. Raises an error if the data is not valid in that
     621              :  * encoding.
     622              :  *
     623              :  * INT4 length (BYTEA string, NAME src_encoding_name)
     624              :  */
     625              : Datum
     626            0 : length_in_encoding(PG_FUNCTION_ARGS)
     627              : {
     628            0 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     629            0 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     630            0 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     631              :     const char *src_str;
     632              :     int         len;
     633              :     int         retval;
     634              : 
     635            0 :     if (src_encoding < 0)
     636            0 :         ereport(ERROR,
     637              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     638              :                  errmsg("invalid encoding name \"%s\"",
     639              :                         src_encoding_name)));
     640              : 
     641            0 :     len = VARSIZE_ANY_EXHDR(string);
     642            0 :     src_str = VARDATA_ANY(string);
     643              : 
     644            0 :     retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
     645              : 
     646            0 :     PG_RETURN_INT32(retval);
     647              : }
     648              : 
     649              : /*
     650              :  * Get maximum multibyte character length in the specified encoding.
     651              :  *
     652              :  * Note encoding is specified numerically, not by name as above.
     653              :  */
     654              : Datum
     655            0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
     656              : {
     657            0 :     int         encoding = PG_GETARG_INT32(0);
     658              : 
     659            0 :     if (PG_VALID_ENCODING(encoding))
     660            0 :         PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
     661              :     else
     662            0 :         PG_RETURN_NULL();
     663              : }
     664              : 
     665              : /*
     666              :  * Convert client encoding to server encoding.
     667              :  *
     668              :  * See the notes about string conversion functions at the top of this file.
     669              :  */
     670              : char *
     671       430033 : pg_client_to_server(const char *s, int len)
     672              : {
     673       430033 :     return pg_any_to_server(s, len, ClientEncoding->encoding);
     674              : }
     675              : 
     676              : /*
     677              :  * Convert any encoding to server encoding.
     678              :  *
     679              :  * See the notes about string conversion functions at the top of this file.
     680              :  *
     681              :  * Unlike the other string conversion functions, this will apply validation
     682              :  * even if encoding == DatabaseEncoding->encoding.  This is because this is
     683              :  * used to process data coming in from outside the database, and we never
     684              :  * want to just assume validity.
     685              :  */
     686              : char *
     687       474467 : pg_any_to_server(const char *s, int len, int encoding)
     688              : {
     689       474467 :     if (len <= 0)
     690        40675 :         return unconstify(char *, s);   /* empty string is always valid */
     691              : 
     692       433792 :     if (encoding == DatabaseEncoding->encoding ||
     693              :         encoding == PG_SQL_ASCII)
     694              :     {
     695              :         /*
     696              :          * No conversion is needed, but we must still validate the data.
     697              :          */
     698       433608 :         (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
     699       433607 :         return unconstify(char *, s);
     700              :     }
     701              : 
     702          184 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     703              :     {
     704              :         /*
     705              :          * No conversion is possible, but we must still validate the data,
     706              :          * because the client-side code might have done string escaping using
     707              :          * the selected client_encoding.  If the client encoding is ASCII-safe
     708              :          * then we just do a straight validation under that encoding.  For an
     709              :          * ASCII-unsafe encoding we have a problem: we dare not pass such data
     710              :          * to the parser but we have no way to convert it.  We compromise by
     711              :          * rejecting the data if it contains any non-ASCII characters.
     712              :          */
     713          154 :         if (PG_VALID_BE_ENCODING(encoding))
     714          124 :             (void) pg_verify_mbstr(encoding, s, len, false);
     715              :         else
     716              :         {
     717              :             int         i;
     718              : 
     719          954 :             for (i = 0; i < len; i++)
     720              :             {
     721          924 :                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
     722            0 :                     ereport(ERROR,
     723              :                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     724              :                              errmsg("invalid byte value for encoding \"%s\": 0x%02x",
     725              :                                     pg_enc2name_tbl[PG_SQL_ASCII].name,
     726              :                                     (unsigned char) s[i])));
     727              :             }
     728              :         }
     729          154 :         return unconstify(char *, s);
     730              :     }
     731              : 
     732              :     /* Fast path if we can use cached conversion function */
     733           30 :     if (encoding == ClientEncoding->encoding)
     734           30 :         return perform_default_encoding_conversion(s, len, true);
     735              : 
     736              :     /* General case ... will not work outside transactions */
     737            0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     738              :                                               len,
     739              :                                               encoding,
     740            0 :                                               DatabaseEncoding->encoding);
     741              : }
     742              : 
     743              : /*
     744              :  * Convert server encoding to client encoding.
     745              :  *
     746              :  * See the notes about string conversion functions at the top of this file.
     747              :  */
     748              : char *
     749     22698264 : pg_server_to_client(const char *s, int len)
     750              : {
     751     22698264 :     return pg_server_to_any(s, len, ClientEncoding->encoding);
     752              : }
     753              : 
     754              : /*
     755              :  * Convert server encoding to any encoding.
     756              :  *
     757              :  * See the notes about string conversion functions at the top of this file.
     758              :  */
     759              : char *
     760     22717618 : pg_server_to_any(const char *s, int len, int encoding)
     761              : {
     762     22717618 :     if (len <= 0)
     763       134743 :         return unconstify(char *, s);   /* empty string is always valid */
     764              : 
     765     22582875 :     if (encoding == DatabaseEncoding->encoding ||
     766              :         encoding == PG_SQL_ASCII)
     767     22582588 :         return unconstify(char *, s);   /* assume data is valid */
     768              : 
     769          287 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     770              :     {
     771              :         /* No conversion is possible, but we must validate the result */
     772           84 :         (void) pg_verify_mbstr(encoding, s, len, false);
     773           84 :         return unconstify(char *, s);
     774              :     }
     775              : 
     776              :     /* Fast path if we can use cached conversion function */
     777          203 :     if (encoding == ClientEncoding->encoding)
     778          194 :         return perform_default_encoding_conversion(s, len, false);
     779              : 
     780              :     /* General case ... will not work outside transactions */
     781            9 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     782              :                                               len,
     783            9 :                                               DatabaseEncoding->encoding,
     784              :                                               encoding);
     785              : }
     786              : 
     787              : /*
     788              :  *  Perform default encoding conversion using cached FmgrInfo. Since
     789              :  *  this function does not access database at all, it is safe to call
     790              :  *  outside transactions.  If the conversion has not been set up by
     791              :  *  SetClientEncoding(), no conversion is performed.
     792              :  */
     793              : static char *
     794          224 : perform_default_encoding_conversion(const char *src, int len,
     795              :                                     bool is_client_to_server)
     796              : {
     797              :     char       *result;
     798              :     int         src_encoding,
     799              :                 dest_encoding;
     800              :     FmgrInfo   *flinfo;
     801              : 
     802          224 :     if (is_client_to_server)
     803              :     {
     804           30 :         src_encoding = ClientEncoding->encoding;
     805           30 :         dest_encoding = DatabaseEncoding->encoding;
     806           30 :         flinfo = ToServerConvProc;
     807              :     }
     808              :     else
     809              :     {
     810          194 :         src_encoding = DatabaseEncoding->encoding;
     811          194 :         dest_encoding = ClientEncoding->encoding;
     812          194 :         flinfo = ToClientConvProc;
     813              :     }
     814              : 
     815          224 :     if (flinfo == NULL)
     816            0 :         return unconstify(char *, src);
     817              : 
     818              :     /*
     819              :      * Allocate space for conversion result, being wary of integer overflow.
     820              :      * See comments in pg_do_encoding_conversion.
     821              :      */
     822          224 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     823            0 :         ereport(ERROR,
     824              :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     825              :                  errmsg("out of memory"),
     826              :                  errdetail("String of %d bytes is too long for encoding conversion.",
     827              :                            len)));
     828              : 
     829              :     result = (char *)
     830          224 :         MemoryContextAllocHuge(CurrentMemoryContext,
     831          224 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);
     832              : 
     833          224 :     FunctionCall6(flinfo,
     834              :                   Int32GetDatum(src_encoding),
     835              :                   Int32GetDatum(dest_encoding),
     836              :                   CStringGetDatum(src),
     837              :                   CStringGetDatum(result),
     838              :                   Int32GetDatum(len),
     839              :                   BoolGetDatum(false));
     840              : 
     841              :     /*
     842              :      * Release extra space if there might be a lot --- see comments in
     843              :      * pg_do_encoding_conversion.
     844              :      */
     845          224 :     if (len > 1000000)
     846              :     {
     847            0 :         Size        resultlen = strlen(result);
     848              : 
     849            0 :         if (resultlen >= MaxAllocSize)
     850            0 :             ereport(ERROR,
     851              :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     852              :                      errmsg("out of memory"),
     853              :                      errdetail("String of %d bytes is too long for encoding conversion.",
     854              :                                len)));
     855              : 
     856            0 :         result = (char *) repalloc(result, resultlen + 1);
     857              :     }
     858              : 
     859          224 :     return result;
     860              : }
     861              : 
     862              : /*
     863              :  * Convert a single Unicode code point into a string in the server encoding.
     864              :  *
     865              :  * The code point given by "c" is converted and stored at *s, which must
     866              :  * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
     867              :  * The output will have a trailing '\0'.  Throws error if the conversion
     868              :  * cannot be performed.
     869              :  *
     870              :  * Note that this relies on having previously looked up any required
     871              :  * conversion function.  That's partly for speed but mostly because the parser
     872              :  * may call this outside any transaction, or in an aborted transaction.
     873              :  */
     874              : void
     875          529 : pg_unicode_to_server(char32_t c, unsigned char *s)
     876              : {
     877              :     unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
     878              :     int         c_as_utf8_len;
     879              :     int         server_encoding;
     880              : 
     881              :     /*
     882              :      * Complain if invalid Unicode code point.  The choice of errcode here is
     883              :      * debatable, but really our caller should have checked this anyway.
     884              :      */
     885          529 :     if (!is_valid_unicode_codepoint(c))
     886            0 :         ereport(ERROR,
     887              :                 (errcode(ERRCODE_SYNTAX_ERROR),
     888              :                  errmsg("invalid Unicode code point")));
     889              : 
     890              :     /* Otherwise, if it's in ASCII range, conversion is trivial */
     891          529 :     if (c <= 0x7F)
     892              :     {
     893          176 :         s[0] = (unsigned char) c;
     894          176 :         s[1] = '\0';
     895          529 :         return;
     896              :     }
     897              : 
     898              :     /* If the server encoding is UTF-8, we just need to reformat the code */
     899          353 :     server_encoding = GetDatabaseEncoding();
     900          353 :     if (server_encoding == PG_UTF8)
     901              :     {
     902          353 :         unicode_to_utf8(c, s);
     903          353 :         s[pg_utf_mblen(s)] = '\0';
     904          353 :         return;
     905              :     }
     906              : 
     907              :     /* For all other cases, we must have a conversion function available */
     908            0 :     if (Utf8ToServerConvProc == NULL)
     909            0 :         ereport(ERROR,
     910              :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     911              :                  errmsg("conversion between %s and %s is not supported",
     912              :                         pg_enc2name_tbl[PG_UTF8].name,
     913              :                         GetDatabaseEncodingName())));
     914              : 
     915              :     /* Construct UTF-8 source string */
     916            0 :     unicode_to_utf8(c, c_as_utf8);
     917            0 :     c_as_utf8_len = pg_utf_mblen(c_as_utf8);
     918            0 :     c_as_utf8[c_as_utf8_len] = '\0';
     919              : 
     920              :     /* Convert, or throw error if we can't */
     921            0 :     FunctionCall6(Utf8ToServerConvProc,
     922              :                   Int32GetDatum(PG_UTF8),
     923              :                   Int32GetDatum(server_encoding),
     924              :                   CStringGetDatum((char *) c_as_utf8),
     925              :                   CStringGetDatum((char *) s),
     926              :                   Int32GetDatum(c_as_utf8_len),
     927              :                   BoolGetDatum(false));
     928              : }
     929              : 
     930              : /*
     931              :  * Convert a single Unicode code point into a string in the server encoding.
     932              :  *
     933              :  * Same as pg_unicode_to_server(), except that we don't throw errors,
     934              :  * but simply return false on conversion failure.
     935              :  */
     936              : bool
     937           42 : pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
     938              : {
     939              :     unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
     940              :     int         c_as_utf8_len;
     941              :     int         converted_len;
     942              :     int         server_encoding;
     943              : 
     944              :     /* Fail if invalid Unicode code point */
     945           42 :     if (!is_valid_unicode_codepoint(c))
     946            0 :         return false;
     947              : 
     948              :     /* Otherwise, if it's in ASCII range, conversion is trivial */
     949           42 :     if (c <= 0x7F)
     950              :     {
     951           12 :         s[0] = (unsigned char) c;
     952           12 :         s[1] = '\0';
     953           12 :         return true;
     954              :     }
     955              : 
     956              :     /* If the server encoding is UTF-8, we just need to reformat the code */
     957           30 :     server_encoding = GetDatabaseEncoding();
     958           30 :     if (server_encoding == PG_UTF8)
     959              :     {
     960           30 :         unicode_to_utf8(c, s);
     961           30 :         s[pg_utf_mblen(s)] = '\0';
     962           30 :         return true;
     963              :     }
     964              : 
     965              :     /* For all other cases, we must have a conversion function available */
     966            0 :     if (Utf8ToServerConvProc == NULL)
     967            0 :         return false;
     968              : 
     969              :     /* Construct UTF-8 source string */
     970            0 :     unicode_to_utf8(c, c_as_utf8);
     971            0 :     c_as_utf8_len = pg_utf_mblen(c_as_utf8);
     972            0 :     c_as_utf8[c_as_utf8_len] = '\0';
     973              : 
     974              :     /* Convert, but without throwing error if we can't */
     975            0 :     converted_len = DatumGetInt32(FunctionCall6(Utf8ToServerConvProc,
     976              :                                                 Int32GetDatum(PG_UTF8),
     977              :                                                 Int32GetDatum(server_encoding),
     978              :                                                 CStringGetDatum((char *) c_as_utf8),
     979              :                                                 CStringGetDatum((char *) s),
     980              :                                                 Int32GetDatum(c_as_utf8_len),
     981              :                                                 BoolGetDatum(true)));
     982              : 
     983              :     /* Conversion was successful iff it consumed the whole input */
     984            0 :     return (converted_len == c_as_utf8_len);
     985              : }
     986              : 
     987              : 
     988              : /* convert a multibyte string to a wchar */
     989              : int
     990            0 : pg_mb2wchar(const char *from, pg_wchar *to)
     991              : {
     992            0 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
     993              : }
     994              : 
     995              : /* convert a multibyte string to a wchar with a limited length */
     996              : int
     997      5106031 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
     998              : {
     999      5106031 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
    1000              : }
    1001              : 
    1002              : /* same, with any encoding */
    1003              : int
    1004         9308 : pg_encoding_mb2wchar_with_len(int encoding,
    1005              :                               const char *from, pg_wchar *to, int len)
    1006              : {
    1007         9308 :     return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
    1008              : }
    1009              : 
    1010              : /* convert a wchar string to a multibyte */
    1011              : int
    1012            0 : pg_wchar2mb(const pg_wchar *from, char *to)
    1013              : {
    1014            0 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
    1015              : }
    1016              : 
    1017              : /* convert a wchar string to a multibyte with a limited length */
    1018              : int
    1019       558160 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
    1020              : {
    1021       558160 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
    1022              : }
    1023              : 
    1024              : /* same, with any encoding */
    1025              : int
    1026           96 : pg_encoding_wchar2mb_with_len(int encoding,
    1027              :                               const pg_wchar *from, char *to, int len)
    1028              : {
    1029           96 :     return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
    1030              : }
    1031              : 
    1032              : /*
    1033              :  * Returns the byte length of a multibyte character sequence in a
    1034              :  * null-terminated string.  Raises an illegal byte sequence error if the
    1035              :  * sequence would hit a null terminator.
    1036              :  *
    1037              :  * The caller is expected to have checked for a terminator at *mbstr == 0
    1038              :  * before calling, but some callers want 1 in that case, so this function
    1039              :  * continues that tradition.
    1040              :  *
    1041              :  * This must only be used for strings that have a null-terminator to enable
    1042              :  * bounds detection.
    1043              :  */
    1044              : int
    1045      2103088 : pg_mblen_cstr(const char *mbstr)
    1046              : {
    1047      2103088 :     int         length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1048              : 
    1049              :     /*
    1050              :      * The .mblen functions return 1 when given a pointer to a terminator.
    1051              :      * Some callers depend on that, so we tolerate it for now.  Well-behaved
    1052              :      * callers check the leading byte for a terminator *before* calling.
    1053              :      */
    1054      2115508 :     for (int i = 1; i < length; ++i)
    1055        12423 :         if (unlikely(mbstr[i] == 0))
    1056            3 :             report_invalid_encoding_db(mbstr, length, i);
    1057              : 
    1058              :     /*
    1059              :      * String should be NUL-terminated, but checking that would make typical
    1060              :      * callers O(N^2), tripling Valgrind check-world time.  Unless
    1061              :      * VALGRIND_EXPENSIVE, check 1 byte after each actual character.  (If we
    1062              :      * found a character, not a terminator, the next byte must be a terminator
    1063              :      * or the start of the next character.)  If the caller iterates the whole
    1064              :      * string, the last call will diagnose a missing terminator.
    1065              :      */
    1066      2103085 :     if (mbstr[0] != '\0')
    1067              :     {
    1068              : #ifdef VALGRIND_EXPENSIVE
    1069              :         VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr));
    1070              : #else
    1071              :         VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1);
    1072              : #endif
    1073              :     }
    1074              : 
    1075      2103085 :     return length;
    1076              : }
    1077              : 
    1078              : /*
    1079              :  * Returns the byte length of a multibyte character sequence bounded by a range
    1080              :  * [mbstr, end) of at least one byte in size.  Raises an illegal byte sequence
    1081              :  * error if the sequence would exceed the range.
    1082              :  */
    1083              : int
    1084      2764394 : pg_mblen_range(const char *mbstr, const char *end)
    1085              : {
    1086      2764394 :     int         length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1087              : 
    1088              :     Assert(end > mbstr);
    1089              : 
    1090      2764394 :     if (unlikely(mbstr + length > end))
    1091            6 :         report_invalid_encoding_db(mbstr, length, end - mbstr);
    1092              : 
    1093              : #ifdef VALGRIND_EXPENSIVE
    1094              :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr);
    1095              : #else
    1096              :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
    1097              : #endif
    1098              : 
    1099      2764388 :     return length;
    1100              : }
    1101              : 
    1102              : /*
    1103              :  * Returns the byte length of a multibyte character sequence bounded by a range
    1104              :  * extending for 'limit' bytes, which must be at least one.  Raises an illegal
    1105              :  * byte sequence error if the sequence would exceed the range.
    1106              :  */
    1107              : int
    1108     27747805 : pg_mblen_with_len(const char *mbstr, int limit)
    1109              : {
    1110     27747805 :     int         length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1111              : 
    1112              :     Assert(limit >= 1);
    1113              : 
    1114     27747805 :     if (unlikely(length > limit))
    1115           12 :         report_invalid_encoding_db(mbstr, length, limit);
    1116              : 
    1117              : #ifdef VALGRIND_EXPENSIVE
    1118              :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit);
    1119              : #else
    1120              :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
    1121              : #endif
    1122              : 
    1123     27747793 :     return length;
    1124              : }
    1125              : 
    1126              : 
    1127              : /*
    1128              :  * Returns the length of a multibyte character sequence, without any
    1129              :  * validation of bounds.
    1130              :  *
    1131              :  * PLEASE NOTE:  This function can only be used safely if the caller has
    1132              :  * already verified the input string, since otherwise there is a risk of
    1133              :  * overrunning the buffer if the string is invalid.  A prior call to a
    1134              :  * pg_mbstrlen* function suffices.
    1135              :  */
    1136              : int
    1137     10722239 : pg_mblen_unbounded(const char *mbstr)
    1138              : {
    1139     10722239 :     int         length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1140              : 
    1141              :     VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
    1142              : 
    1143     10722239 :     return length;
    1144              : }
    1145              : 
    /*
     * pg_mblen
     *
     * Historical spelling of pg_mblen_unbounded(), to which it simply forwards.
     * Should not be used in new code; slated for removal in a later version.
     */
    int
    pg_mblen(const char *mbstr)
    {
        int         nbytes = pg_mblen_unbounded(mbstr);

        return nbytes;
    }
    1155              : 
    1156              : /* returns the display length of a multibyte character */
    1157              : int
    1158         4362 : pg_dsplen(const char *mbstr)
    1159              : {
    1160         4362 :     return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
    1161              : }
    1162              : 
    /*
     * pg_mbstrlen
     *
     * Returns the length, counted in characters (wchars), of the NUL-terminated
     * multibyte string mbstr.
     */
    int
    pg_mbstrlen(const char *mbstr)
    {
        const char *cursor;
        int         nchars = 0;

        /* In a single-byte encoding, characters and bytes coincide. */
        if (pg_database_encoding_max_length() == 1)
            return strlen(mbstr);

        /* Otherwise walk the string one character at a time. */
        for (cursor = mbstr; *cursor != '\0'; cursor += pg_mblen_cstr(cursor))
            nchars++;

        return nchars;
    }
    1180              : 
    /*
     * pg_mbstrlen_with_len
     *
     * Returns the length, counted in characters (wchars), of a multibyte string
     * that is at most 'limit' bytes long.  In a multibyte encoding the scan
     * stops at whichever comes first: 'limit' bytes consumed, or a NUL byte.
     *
     * Note that in a single-byte encoding 'limit' is returned as-is, without
     * looking for a NUL.
     */
    int
    pg_mbstrlen_with_len(const char *mbstr, int limit)
    {
        const char *cursor = mbstr;
        int         remaining = limit;
        int         nchars;

        /* optimization for single byte encoding */
        if (pg_database_encoding_max_length() == 1)
            return limit;

        for (nchars = 0; remaining > 0 && *cursor != '\0'; nchars++)
        {
            /* Bounded lookup: complains if the character overruns the range. */
            int         step = pg_mblen_with_len(cursor, remaining);

            cursor += step;
            remaining -= step;
        }

        return nchars;
    }
    1203              : 
    1204              : /*
    1205              :  * returns the byte length of a multibyte string
    1206              :  * (not necessarily NULL terminated)
    1207              :  * that is no longer than limit.
    1208              :  * this function does not break multibyte character boundary.
    1209              :  */
    1210              : int
    1211       164257 : pg_mbcliplen(const char *mbstr, int len, int limit)
    1212              : {
    1213       164257 :     return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
    1214              :                                  len, limit);
    1215              : }
    1216              : 
    1217              : /*
    1218              :  * pg_mbcliplen with specified encoding; string must be valid in encoding
    1219              :  */
    1220              : int
    1221       164257 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
    1222              :                       int len, int limit)
    1223              : {
    1224              :     mblen_converter mblen_fn;
    1225       164257 :     int         clen = 0;
    1226              :     int         l;
    1227              : 
    1228              :     /* optimization for single byte encoding */
    1229       164257 :     if (pg_encoding_max_length(encoding) == 1)
    1230        19317 :         return cliplen(mbstr, len, limit);
    1231              : 
    1232       144940 :     mblen_fn = pg_wchar_table[encoding].mblen;
    1233              : 
    1234      1567908 :     while (len > 0 && *mbstr)
    1235              :     {
    1236      1493571 :         l = (*mblen_fn) ((const unsigned char *) mbstr);
    1237      1493571 :         if ((clen + l) > limit)
    1238           47 :             break;
    1239      1493524 :         clen += l;
    1240      1493524 :         if (clen == limit)
    1241        70556 :             break;
    1242      1422968 :         len -= l;
    1243      1422968 :         mbstr += l;
    1244              :     }
    1245       144940 :     return clen;
    1246              : }
    1247              : 
    /*
     * pg_mbcharcliplen
     *
     * Like pg_mbcliplen(), except that 'limit' is a number of characters rather
     * than a number of bytes.  The return value is still a byte length.
     */
    int
    pg_mbcharcliplen(const char *mbstr, int len, int limit)
    {
        int         nbytes = 0;
        int         nchars = 0;

        /* optimization for single byte encoding */
        if (pg_database_encoding_max_length() == 1)
            return cliplen(mbstr, len, limit);

        while (len > 0 && *mbstr)
        {
            /* Bounded lookup: complains if the character overruns 'len'. */
            int         step = pg_mblen_with_len(mbstr, len);

            /* Stop once 'limit' characters have already been accepted. */
            if (nchars >= limit)
                break;
            nchars++;

            nbytes += step;
            mbstr += step;
            len -= step;
        }

        return nbytes;
    }
    1275              : 
    /*
     * cliplen
     *
     * mbcliplen for any single-byte encoding: returns the number of bytes at
     * the start of str before the first NUL, capped at the smaller of 'len'
     * and 'limit'.
     */
    static int
    cliplen(const char *str, int len, int limit)
    {
        int         bound = (len < limit) ? len : limit;
        int         n;

        for (n = 0; n < bound; n++)
        {
            if (str[n] == '\0')
                break;
        }

        return n;
    }
    1287              : 
    1288              : void
    1289        17356 : SetDatabaseEncoding(int encoding)
    1290              : {
    1291        17356 :     if (!PG_VALID_BE_ENCODING(encoding))
    1292            0 :         elog(ERROR, "invalid database encoding: %d", encoding);
    1293              : 
    1294        17356 :     DatabaseEncoding = &pg_enc2name_tbl[encoding];
    1295              :     Assert(DatabaseEncoding->encoding == encoding);
    1296        17356 : }
    1297              : 
    1298              : void
    1299        19366 : SetMessageEncoding(int encoding)
    1300              : {
    1301              :     /* Some calls happen before we can elog()! */
    1302              :     Assert(PG_VALID_ENCODING(encoding));
    1303              : 
    1304        19366 :     MessageEncoding = &pg_enc2name_tbl[encoding];
    1305              :     Assert(MessageEncoding->encoding == encoding);
    1306        19366 : }
    1307              : 
    1308              : #ifdef ENABLE_NLS
    1309              : /*
    1310              :  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
    1311              :  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
    1312              :  * fail for gettext-internal causes like out-of-memory.
    1313              :  */
    1314              : static bool
    1315         1626 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
    1316              : {
    1317         1626 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1318              : 
    1319         1626 :     if (!PG_VALID_ENCODING(encoding) || pg_enc2gettext_tbl[encoding] == NULL)
    1320            0 :         return false;
    1321              : 
    1322         1626 :     if (bind_textdomain_codeset(domainname,
    1323              :                                 pg_enc2gettext_tbl[encoding]) != NULL)
    1324         1626 :         return true;
    1325              : 
    1326            0 :     if (elog_ok)
    1327            0 :         elog(LOG, "bind_textdomain_codeset failed");
    1328              :     else
    1329            0 :         write_stderr("bind_textdomain_codeset failed");
    1330              : 
    1331            0 :     return false;
    1332              : }
    1333              : 
    1334              : /*
    1335              :  * Bind a gettext message domain to the codeset corresponding to the database
    1336              :  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
    1337              :  * Return the MessageEncoding implied by the new settings.
    1338              :  *
    1339              :  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
    1340              :  * When that matches the database encoding, we don't need to do anything.  In
    1341              :  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
    1342              :  * database encoding, except for the C locale.  (On Windows, we also permit a
    1343              :  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
    1344              :  * gettext to the right codeset.
    1345              :  *
    1346              :  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
    1347              :  * convenient departure for software that passes the strings to Windows ANSI
    1348              :  * APIs, but we don't do that.  Compel gettext to use database encoding or,
    1349              :  * failing that, the LC_CTYPE encoding as it would on other platforms.
    1350              :  *
    1351              :  * This function is called before elog() and palloc() are usable.
    1352              :  */
    1353              : int
    1354        21254 : pg_bind_textdomain_codeset(const char *domainname)
    1355              : {
    1356        21254 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1357        21254 :     int         encoding = GetDatabaseEncoding();
    1358              :     int         new_msgenc;
    1359              : 
    1360              : #ifndef WIN32
    1361        21254 :     const char *ctype = setlocale(LC_CTYPE, NULL);
    1362              : 
    1363        21254 :     if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
    1364              : #endif
    1365         3647 :         if (encoding != PG_SQL_ASCII &&
    1366         1626 :             raw_pg_bind_textdomain_codeset(domainname, encoding))
    1367         1626 :             return encoding;
    1368              : 
    1369        19628 :     new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
    1370        19628 :     if (new_msgenc < 0)
    1371            0 :         new_msgenc = PG_SQL_ASCII;
    1372              : 
    1373              : #ifdef WIN32
    1374              :     if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
    1375              :         /* On failure, the old message encoding remains valid. */
    1376              :         return GetMessageEncoding();
    1377              : #endif
    1378              : 
    1379        19628 :     return new_msgenc;
    1380              : }
    1381              : #endif
    1382              : 
    1383              : /*
    1384              :  * The database encoding, also called the server encoding, represents the
    1385              :  * encoding of data stored in text-like data types.  Affected types include
    1386              :  * cstring, text, varchar, name, xml, and json.
    1387              :  */
    1388              : int
    1389      3895550 : GetDatabaseEncoding(void)
    1390              : {
    1391      3895550 :     return DatabaseEncoding->encoding;
    1392              : }
    1393              : 
    1394              : const char *
    1395        36337 : GetDatabaseEncodingName(void)
    1396              : {
    1397        36337 :     return DatabaseEncoding->name;
    1398              : }
    1399              : 
    1400              : Datum
    1401           54 : getdatabaseencoding(PG_FUNCTION_ARGS)
    1402              : {
    1403           54 :     return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
    1404              : }
    1405              : 
    1406              : Datum
    1407            0 : pg_client_encoding(PG_FUNCTION_ARGS)
    1408              : {
    1409            0 :     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
    1410              : }
    1411              : 
    1412              : Datum
    1413           18 : PG_char_to_encoding(PG_FUNCTION_ARGS)
    1414              : {
    1415           18 :     Name        s = PG_GETARG_NAME(0);
    1416              : 
    1417           18 :     PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
    1418              : }
    1419              : 
    1420              : Datum
    1421         2518 : PG_encoding_to_char(PG_FUNCTION_ARGS)
    1422              : {
    1423         2518 :     int32       encoding = PG_GETARG_INT32(0);
    1424         2518 :     const char *encoding_name = pg_encoding_to_char(encoding);
    1425              : 
    1426         2518 :     return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
    1427              : }
    1428              : 
    1429              : /*
    1430              :  * gettext() returns messages in this encoding.  This often matches the
    1431              :  * database encoding, but it differs for SQL_ASCII databases, for processes
    1432              :  * not attached to a database, and under a database encoding lacking iconv
    1433              :  * support (MULE_INTERNAL).
    1434              :  */
    1435              : int
    1436            0 : GetMessageEncoding(void)
    1437              : {
    1438            0 :     return MessageEncoding->encoding;
    1439              : }
    1440              : 
    1441              : 
    1442              : /*
    1443              :  * Generic character incrementer function.
    1444              :  *
    1445              :  * Not knowing anything about the properties of the encoding in use, we just
    1446              :  * keep incrementing the last byte until we get a validly-encoded result,
    1447              :  * or we run out of values to try.  We don't bother to try incrementing
    1448              :  * higher-order bytes, so there's no growth in runtime for wider characters.
    1449              :  * (If we did try to do that, we'd need to consider the likelihood that 255
    1450              :  * is not a valid final byte in the encoding.)
    1451              :  */
    1452              : static bool
    1453           48 : pg_generic_charinc(unsigned char *charptr, int len)
    1454              : {
    1455           48 :     unsigned char *lastbyte = charptr + len - 1;
    1456              :     mbchar_verifier mbverify;
    1457              : 
    1458              :     /* We can just invoke the character verifier directly. */
    1459           48 :     mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
    1460              : 
    1461           48 :     while (*lastbyte < (unsigned char) 255)
    1462              :     {
    1463           48 :         (*lastbyte)++;
    1464           48 :         if ((*mbverify) (charptr, len) == len)
    1465           48 :             return true;
    1466              :     }
    1467              : 
    1468            0 :     return false;
    1469              : }
    1470              : 
    /*
     * pg_utf8_increment
     *
     * UTF-8 character incrementer.
     *
     * A one-byte character below 0x7F is simply incremented.
     *
     * In a multibyte character, every byte after the first lies between 0x80 and
     * 0xBF, and the first byte lies between 0xC0 and 0xF4.  Working from the
     * last byte towards the first, we increment the first byte we find that is
     * not already at its maximum; if there is none, we fail.  The second byte
     * has tighter maxima after lead bytes 0xED and 0xF4, which keeps us out of
     * the surrogate-pair region and above-range code points, neither of which
     * may appear in valid UTF-8.
     *
     * Lower-order bytes are not reset to their minimums, because an exhaustive
     * search is not affordable (see make_greater_string).
     *
     * Returns true if the character was incremented in place, false otherwise.
     */
    static bool
    pg_utf8_increment(unsigned char *charptr, int length)
    {
        unsigned char lead;

        /* reject lengths 5 and 6 for now (and anything else out of range) */
        if (length < 1 || length > 4)
            return false;

        /* Fourth byte, if present and not yet at its maximum. */
        if (length >= 4 && charptr[3] < 0xBF)
        {
            charptr[3]++;
            return true;
        }

        /* Third byte, likewise. */
        if (length >= 3 && charptr[2] < 0xBF)
        {
            charptr[2]++;
            return true;
        }

        /* Second byte: its maximum depends on the lead byte. */
        if (length >= 2)
        {
            unsigned char second_max;

            if (charptr[0] == 0xED)
                second_max = 0x9F;
            else if (charptr[0] == 0xF4)
                second_max = 0x8F;
            else
                second_max = 0xBF;

            if (charptr[1] < second_max)
            {
                charptr[1]++;
                return true;
            }
        }

        /* Everything else is maxed out; try the lead byte itself. */
        lead = charptr[0];
        if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
            return false;

        charptr[0]++;
        return true;
    }
    1543              : 
    1544              : /*
    1545              :  * EUC-JP character incrementer function.
    1546              :  *
    1547              :  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
    1548              :  * representing JIS X 0201 characters with the second byte ranging between
    1549              :  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
    1550              :  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
    1551              :  *
    1552              :  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
    1553              :  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
    1554              :  * is incremented if possible, otherwise the second-to-last byte.
    1555              :  *
    1556              :  * If the sequence starts with a value other than the above and its MSB
    1557              :  * is set, it must be a two-byte sequence representing JIS X 0208 characters
    1558              :  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
    1559              :  * incremented if possible, otherwise the second-to-last byte.
    1560              :  *
    1561              :  * Otherwise, the sequence is a single-byte ASCII character. It is
    1562              :  * incremented up to 0x7f.
    1563              :  */
    1564              : static bool
    1565            0 : pg_eucjp_increment(unsigned char *charptr, int length)
    1566              : {
    1567              :     unsigned char c1,
    1568              :                 c2;
    1569              :     int         i;
    1570              : 
    1571            0 :     c1 = *charptr;
    1572              : 
    1573            0 :     switch (c1)
    1574              :     {
    1575            0 :         case SS2:               /* JIS X 0201 */
    1576            0 :             if (length != 2)
    1577            0 :                 return false;
    1578              : 
    1579            0 :             c2 = charptr[1];
    1580              : 
    1581            0 :             if (c2 >= 0xdf)
    1582            0 :                 charptr[0] = charptr[1] = 0xa1;
    1583            0 :             else if (c2 < 0xa1)
    1584            0 :                 charptr[1] = 0xa1;
    1585              :             else
    1586            0 :                 charptr[1]++;
    1587            0 :             break;
    1588              : 
    1589            0 :         case SS3:               /* JIS X 0212 */
    1590            0 :             if (length != 3)
    1591            0 :                 return false;
    1592              : 
    1593            0 :             for (i = 2; i > 0; i--)
    1594              :             {
    1595            0 :                 c2 = charptr[i];
    1596            0 :                 if (c2 < 0xa1)
    1597              :                 {
    1598            0 :                     charptr[i] = 0xa1;
    1599            0 :                     return true;
    1600              :                 }
    1601            0 :                 else if (c2 < 0xfe)
    1602              :                 {
    1603            0 :                     charptr[i]++;
    1604            0 :                     return true;
    1605              :                 }
    1606              :             }
    1607              : 
    1608              :             /* Out of 3-byte code region */
    1609            0 :             return false;
    1610              : 
    1611            0 :         default:
    1612            0 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1613              :             {
    1614            0 :                 if (length != 2)
    1615            0 :                     return false;
    1616              : 
    1617            0 :                 for (i = 1; i >= 0; i--)
    1618              :                 {
    1619            0 :                     c2 = charptr[i];
    1620            0 :                     if (c2 < 0xa1)
    1621              :                     {
    1622            0 :                         charptr[i] = 0xa1;
    1623            0 :                         return true;
    1624              :                     }
    1625            0 :                     else if (c2 < 0xfe)
    1626              :                     {
    1627            0 :                         charptr[i]++;
    1628            0 :                         return true;
    1629              :                     }
    1630              :                 }
    1631              : 
    1632              :                 /* Out of 2 byte code region */
    1633            0 :                 return false;
    1634              :             }
    1635              :             else
    1636              :             {                   /* ASCII, single byte */
    1637            0 :                 if (c1 > 0x7e)
    1638            0 :                     return false;
    1639            0 :                 (*charptr)++;
    1640              :             }
    1641            0 :             break;
    1642              :     }
    1643              : 
    1644            0 :     return true;
    1645              : }
    1646              : 
    1647              : /*
    1648              :  * get the character incrementer for the encoding for the current database
    1649              :  */
    1650              : mbcharacter_incrementer
    1651         1865 : pg_database_encoding_character_incrementer(void)
    1652              : {
    1653              :     /*
    1654              :      * Eventually it might be best to add a field to pg_wchar_table[], but for
    1655              :      * now we just use a switch.
    1656              :      */
    1657         1865 :     switch (GetDatabaseEncoding())
    1658              :     {
    1659         1817 :         case PG_UTF8:
    1660         1817 :             return pg_utf8_increment;
    1661              : 
    1662            0 :         case PG_EUC_JP:
    1663            0 :             return pg_eucjp_increment;
    1664              : 
    1665           48 :         default:
    1666           48 :             return pg_generic_charinc;
    1667              :     }
    1668              : }
    1669              : 
    1670              : /*
    1671              :  * fetch maximum length of the encoding for the current database
    1672              :  */
    1673              : int
    1674      2647656 : pg_database_encoding_max_length(void)
    1675              : {
    1676      2647656 :     return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
    1677              : }
    1678              : 
    /*
     * pg_verifymbstr
     *
     * Check that mbstr (len bytes) is validly encoded in the current database
     * encoding.  Apart from supplying the encoding, this behaves exactly like
     * pg_verify_mbstr().
     */
    bool
    pg_verifymbstr(const char *mbstr, int len, bool noError)
    {
        int         db_encoding = GetDatabaseEncoding();

        return pg_verify_mbstr(db_encoding, mbstr, len, noError);
    }
    1688              : 
    1689              : /*
    1690              :  * Verify mbstr to make sure that it is validly encoded in the specified
    1691              :  * encoding.
    1692              :  */
    1693              : bool
    1694       602243 : pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
    1695              : {
    1696              :     int         oklen;
    1697              : 
    1698              :     Assert(PG_VALID_ENCODING(encoding));
    1699              : 
    1700       602243 :     oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
    1701       602243 :     if (oklen != len)
    1702              :     {
    1703            8 :         if (noError)
    1704            0 :             return false;
    1705            8 :         report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
    1706              :     }
    1707       602235 :     return true;
    1708              : }
    1709              : 
    1710              : /*
    1711              :  * Verify mbstr to make sure that it is validly encoded in the specified
    1712              :  * encoding.
    1713              :  *
    1714              :  * mbstr is not necessarily zero terminated; length of mbstr is
    1715              :  * specified by len.
    1716              :  *
    1717              :  * If OK, return length of string in the encoding.
    1718              :  * If a problem is found, return -1 when noError is
    1719              :  * true; when noError is false, ereport() a descriptive message.
    1720              :  *
    1721              :  * Note: We cannot use the faster encoding-specific mbverifystr() function
    1722              :  * here, because we need to count the number of characters in the string.
    1723              :  */
    1724              : int
    1725            0 : pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
    1726              : {
    1727              :     mbchar_verifier mbverifychar;
    1728              :     int         mb_len;
    1729              : 
    1730              :     Assert(PG_VALID_ENCODING(encoding));
    1731              : 
    1732              :     /*
    1733              :      * In single-byte encodings, we need only reject nulls (\0).
    1734              :      */
    1735            0 :     if (pg_encoding_max_length(encoding) <= 1)
    1736              :     {
    1737            0 :         const char *nullpos = memchr(mbstr, 0, len);
    1738              : 
    1739            0 :         if (nullpos == NULL)
    1740            0 :             return len;
    1741            0 :         if (noError)
    1742            0 :             return -1;
    1743            0 :         report_invalid_encoding(encoding, nullpos, 1);
    1744              :     }
    1745              : 
    1746              :     /* fetch function pointer just once */
    1747            0 :     mbverifychar = pg_wchar_table[encoding].mbverifychar;
    1748              : 
    1749            0 :     mb_len = 0;
    1750              : 
    1751            0 :     while (len > 0)
    1752              :     {
    1753              :         int         l;
    1754              : 
    1755              :         /* fast path for ASCII-subset characters */
    1756            0 :         if (!IS_HIGHBIT_SET(*mbstr))
    1757              :         {
    1758            0 :             if (*mbstr != '\0')
    1759              :             {
    1760            0 :                 mb_len++;
    1761            0 :                 mbstr++;
    1762            0 :                 len--;
    1763            0 :                 continue;
    1764              :             }
    1765            0 :             if (noError)
    1766            0 :                 return -1;
    1767            0 :             report_invalid_encoding(encoding, mbstr, len);
    1768              :         }
    1769              : 
    1770            0 :         l = (*mbverifychar) ((const unsigned char *) mbstr, len);
    1771              : 
    1772            0 :         if (l < 0)
    1773              :         {
    1774            0 :             if (noError)
    1775            0 :                 return -1;
    1776            0 :             report_invalid_encoding(encoding, mbstr, len);
    1777              :         }
    1778              : 
    1779            0 :         mbstr += l;
    1780            0 :         len -= l;
    1781            0 :         mb_len++;
    1782              :     }
    1783            0 :     return mb_len;
    1784              : }
    1785              : 
    1786              : /*
    1787              :  * check_encoding_conversion_args: check arguments of a conversion function
    1788              :  *
    1789              :  * "expected" arguments can be either an encoding ID or -1 to indicate that
    1790              :  * the caller will check whether it accepts the ID.
    1791              :  *
    1792              :  * Note: the errors here are not really user-facing, so elog instead of
    1793              :  * ereport seems sufficient.  Also, we trust that the "expected" encoding
    1794              :  * arguments are valid encoding IDs, but we don't trust the actuals.
    1795              :  */
    1796              : void
    1797         3565 : check_encoding_conversion_args(int src_encoding,
    1798              :                                int dest_encoding,
    1799              :                                int len,
    1800              :                                int expected_src_encoding,
    1801              :                                int expected_dest_encoding)
    1802              : {
    1803         3565 :     if (!PG_VALID_ENCODING(src_encoding))
    1804            0 :         elog(ERROR, "invalid source encoding ID: %d", src_encoding);
    1805         3565 :     if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
    1806            0 :         elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
    1807              :              pg_enc2name_tbl[expected_src_encoding].name,
    1808              :              pg_enc2name_tbl[src_encoding].name);
    1809         3565 :     if (!PG_VALID_ENCODING(dest_encoding))
    1810            0 :         elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
    1811         3565 :     if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
    1812            0 :         elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
    1813              :              pg_enc2name_tbl[expected_dest_encoding].name,
    1814              :              pg_enc2name_tbl[dest_encoding].name);
    1815         3565 :     if (len < 0)
    1816            0 :         elog(ERROR, "encoding conversion length must not be negative");
    1817         3565 : }
    1818              : 
    /*
     * report_invalid_encoding: raise an error about an invalid multibyte
     * character.
     *
     * "len" is the number of bytes remaining in the string starting at mbstr,
     * not the length of the offending character; the character length is
     * obtained from pg_encoding_mblen_or_incomplete().  Callers must pass
     * len > 0.
     */
    void
    report_invalid_encoding(int encoding, const char *mbstr, int len)
    {
        report_invalid_encoding_int(encoding,
                                    mbstr,
                                    pg_encoding_mblen_or_incomplete(encoding, mbstr, len),
                                    len);
    }
    1832              : 
    1833              : static void
    1834         1520 : report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
    1835              : {
    1836              :     char        buf[8 * 5 + 1];
    1837         1520 :     char       *p = buf;
    1838              :     int         j,
    1839              :                 jlimit;
    1840              : 
    1841         1520 :     jlimit = Min(mblen, len);
    1842         1520 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1843              : 
    1844         4669 :     for (j = 0; j < jlimit; j++)
    1845              :     {
    1846         3149 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    1847         3149 :         if (j < jlimit - 1)
    1848         1629 :             p += sprintf(p, " ");
    1849              :     }
    1850              : 
    1851         1520 :     ereport(ERROR,
    1852              :             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
    1853              :              errmsg("invalid byte sequence for encoding \"%s\": %s",
    1854              :                     pg_enc2name_tbl[encoding].name,
    1855              :                     buf)));
    1856              : }
    1857              : 
    /*
     * report_invalid_encoding_db: same as report_invalid_encoding_int(), but
     * always reports against the current database encoding.
     */
    static void
    report_invalid_encoding_db(const char *mbstr, int mblen, int len)
    {
        int         db_encoding = GetDatabaseEncoding();

        report_invalid_encoding_int(db_encoding, mbstr, mblen, len);
    }
    1863              : 
    1864              : /*
    1865              :  * report_untranslatable_char: complain about untranslatable character
    1866              :  *
    1867              :  * note: len is remaining length of string, not length of character;
    1868              :  * len must be greater than zero (or we'd neglect initializing "buf").
    1869              :  */
    1870              : void
    1871          468 : report_untranslatable_char(int src_encoding, int dest_encoding,
    1872              :                            const char *mbstr, int len)
    1873              : {
    1874              :     int         l;
    1875              :     char        buf[8 * 5 + 1];
    1876          468 :     char       *p = buf;
    1877              :     int         j,
    1878              :                 jlimit;
    1879              : 
    1880              :     /*
    1881              :      * We probably could use plain pg_encoding_mblen(), because
    1882              :      * gb18030_to_utf8() verifies before it converts.  All conversions should.
    1883              :      * For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs.  Even
    1884              :      * so, be defensive, since a buggy conversion might pass invalid data.
    1885              :      * This is not a performance-critical path.
    1886              :      */
    1887          468 :     l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len);
    1888          468 :     jlimit = Min(l, len);
    1889          468 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1890              : 
    1891         1764 :     for (j = 0; j < jlimit; j++)
    1892              :     {
    1893         1296 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    1894         1296 :         if (j < jlimit - 1)
    1895          828 :             p += sprintf(p, " ");
    1896              :     }
    1897              : 
    1898          468 :     ereport(ERROR,
    1899              :             (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
    1900              :              errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
    1901              :                     buf,
    1902              :                     pg_enc2name_tbl[src_encoding].name,
    1903              :                     pg_enc2name_tbl[dest_encoding].name)));
    1904              : }
    1905              : 
    1906              : 
    1907              : #ifdef WIN32
    1908              : /*
    1909              :  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
    1910              :  * string. The character length is also passed to utf16len if not
    1911              :  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
    1912              :  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
    1913              :  */
    1914              : WCHAR *
    1915              : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
    1916              : {
    1917              :     int         msgenc = GetMessageEncoding();
    1918              :     WCHAR      *utf16;
    1919              :     int         dstlen;
    1920              :     UINT        codepage;
    1921              : 
    1922              :     if (msgenc == PG_SQL_ASCII)
    1923              :         /* No conversion is possible, and SQL_ASCII is never utf16. */
    1924              :         return NULL;
    1925              : 
    1926              :     codepage = pg_enc2name_tbl[msgenc].codepage;
    1927              : 
    1928              :     /*
    1929              :      * Use MultiByteToWideChar directly if there is a corresponding codepage,
    1930              :      * or double conversion through UTF8 if not.  Double conversion is needed,
    1931              :      * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
    1932              :      */
    1933              :     if (codepage != 0)
    1934              :     {
    1935              :         utf16 = palloc_array(WCHAR, len + 1);
    1936              :         dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
    1937              :         utf16[dstlen] = (WCHAR) 0;
    1938              :     }
    1939              :     else
    1940              :     {
    1941              :         char       *utf8;
    1942              : 
    1943              :         /*
    1944              :          * XXX pg_do_encoding_conversion() requires a transaction.  In the
    1945              :          * absence of one, hope for the input to be valid UTF8.
    1946              :          */
    1947              :         if (IsTransactionState())
    1948              :         {
    1949              :             utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
    1950              :                                                       len,
    1951              :                                                       msgenc,
    1952              :                                                       PG_UTF8);
    1953              :             if (utf8 != str)
    1954              :                 len = strlen(utf8);
    1955              :         }
    1956              :         else
    1957              :             utf8 = (char *) str;
    1958              : 
    1959              :         utf16 = palloc_array(WCHAR, len + 1);
    1960              :         dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
    1961              :         utf16[dstlen] = (WCHAR) 0;
    1962              : 
    1963              :         if (utf8 != str)
    1964              :             pfree(utf8);
    1965              :     }
    1966              : 
    1967              :     if (dstlen == 0 && len > 0)
    1968              :     {
    1969              :         pfree(utf16);
    1970              :         return NULL;            /* error */
    1971              :     }
    1972              : 
    1973              :     if (utf16len)
    1974              :         *utf16len = dstlen;
    1975              :     return utf16;
    1976              : }
    1977              : 
    1978              : #endif                          /* WIN32 */
        

Generated by: LCOV version 2.0-1