Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * mbutils.c
4 : * This file contains functions for encoding conversion.
5 : *
6 : * The string-conversion functions in this file share some API quirks.
7 : * Note the following:
8 : *
9 : * The functions return a palloc'd, null-terminated string if conversion
10 : * is required. However, if no conversion is performed, the given source
11 : * string pointer is returned as-is.
12 : *
13 : * Although the presence of a length argument means that callers can pass
14 : * non-null-terminated strings, care is required because the same string
15 : * will be passed back if no conversion occurs. Such callers *must* check
16 : * whether result == src and handle that case differently.
17 : *
18 : * If the source and destination encodings are the same, the source string
19 : * is returned without any verification; it's assumed to be valid data.
20 : * If that might not be the case, the caller is responsible for validating
21 : * the string using a separate call to pg_verify_mbstr(). Whenever the
22 : * source and destination encodings are different, the functions ensure that
23 : * the result is validly encoded according to the destination encoding.
24 : *
25 : *
26 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : *
30 : * IDENTIFICATION
31 : * src/backend/utils/mb/mbutils.c
32 : *
33 : *-------------------------------------------------------------------------
34 : */
35 : #include "postgres.h"
36 :
37 : #include "access/xact.h"
38 : #include "catalog/namespace.h"
39 : #include "mb/pg_wchar.h"
40 : #include "utils/fmgrprotos.h"
41 : #include "utils/memdebug.h"
42 : #include "utils/memutils.h"
43 : #include "utils/relcache.h"
44 : #include "varatt.h"
45 :
46 : /*
47 : * We maintain a simple linked list caching the fmgr lookup info for the
48 : * currently selected conversion functions, as well as any that have been
49 : * selected previously in the current session. (We remember previous
50 : * settings because we must be able to restore a previous setting during
51 : * transaction rollback, without doing any fresh catalog accesses.)
52 : *
53 : * Since we'll never release this data, we just keep it in TopMemoryContext.
54 : */
55 : typedef struct ConvProcInfo
56 : {
57 : int s_encoding; /* server (database) encoding ID of this cached pair */
58 : int c_encoding; /* client encoding ID of this cached pair */
59 : FmgrInfo to_server_info; /* fmgr lookup info for the client-to-server conversion proc */
60 : FmgrInfo to_client_info; /* fmgr lookup info for the server-to-client conversion proc */
61 : } ConvProcInfo;
62 :
63 : static List *ConvProcList = NIL; /* List of ConvProcInfo; PrepareClientEncoding() adds new entries at the head */
64 :
65 : /*
66 : * These variables point to the currently active conversion functions,
67 : * or are NULL when no conversion is needed.
68 : */
69 : static FmgrInfo *ToServerConvProc = NULL; /* client -> server; installed by SetClientEncoding() */
70 : static FmgrInfo *ToClientConvProc = NULL; /* server -> client; installed by SetClientEncoding() */
71 :
72 : /*
73 : * This variable stores the conversion function to convert from UTF-8
74 : * to the server encoding. It's NULL if the server encoding *is* UTF-8,
75 : * or if we lack a conversion function for this.
76 : */
77 : static FmgrInfo *Utf8ToServerConvProc = NULL; /* looked up once, in InitializeClientEncoding() */
78 :
79 : /*
80 : * These variables track the currently-selected encodings.
81 : */
82 : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII]; /* updated by SetClientEncoding() */
83 : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII]; /* server encoding; NOTE(review): its setter is not visible in this part of the file */
84 : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII]; /* NOTE(review): not referenced in the visible part of the file */
85 :
86 : /*
87 : * During backend startup we can't set client encoding because we (a)
88 : * can't look up the conversion functions, and (b) may not know the database
89 : * encoding yet either. So SetClientEncoding() just accepts anything and
90 : * remembers it for InitializeClientEncoding() to apply later.
91 : */
92 : static bool backend_startup_complete = false; /* set to true by InitializeClientEncoding() */
93 : static int pending_client_encoding = PG_SQL_ASCII; /* request remembered by SetClientEncoding() while startup is incomplete */
94 :
95 :
96 : /* Internal functions */
97 : static char *perform_default_encoding_conversion(const char *src,
98 : int len, bool is_client_to_server);
99 : static int cliplen(const char *str, int len, int limit);
100 :
101 : pg_noreturn
102 : static void report_invalid_encoding_int(int encoding, const char *mbstr,
103 : int mblen, int len);
104 :
105 : pg_noreturn
106 : static void report_invalid_encoding_db(const char *mbstr, int mblen, int len);
107 :
108 :
109 : /*
110 : * Prepare for a future call to SetClientEncoding. Success should mean
111 : * that SetClientEncoding is guaranteed to succeed for this encoding request.
112 : *
113 : * (But note that success before backend_startup_complete does not guarantee
114 : * success after ...)
115 : *
116 : * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
117 : */
118 : int
119 72078 : PrepareClientEncoding(int encoding)
120 : {
121 : int current_server_encoding;
122 : ListCell *lc;
123 :
124 72078 : if (!PG_VALID_FE_ENCODING(encoding))
125 0 : return -1;
126 :
127 : /* Can't do anything during startup, per notes above */
128 72078 : if (!backend_startup_complete)
129 36386 : return 0;
130 :
131 35692 : current_server_encoding = GetDatabaseEncoding();
132 :
133 : /*
134 : * Check for cases that require no conversion function.
135 : */
136 35692 : if (current_server_encoding == encoding ||
137 3010 : current_server_encoding == PG_SQL_ASCII ||
138 : encoding == PG_SQL_ASCII)
139 35672 : return 0;
140 :
141 20 : if (IsTransactionState())
142 : {
143 : /*
144 : * If we're in a live transaction, it's safe to access the catalogs,
145 : * so look up the functions. We repeat the lookup even if the info is
146 : * already cached, so that we can react to changes in the contents of
147 : * pg_conversion.
148 : */
149 : Oid to_server_proc,
150 : to_client_proc;
151 : ConvProcInfo *convinfo;
152 : MemoryContext oldcontext;
153 :
154 20 : to_server_proc = FindDefaultConversionProc(encoding,
155 : current_server_encoding);
156 20 : if (!OidIsValid(to_server_proc))
157 0 : return -1;
158 20 : to_client_proc = FindDefaultConversionProc(current_server_encoding,
159 : encoding);
160 20 : if (!OidIsValid(to_client_proc))
161 0 : return -1;
162 :
163 : /*
164 : * Load the fmgr info into TopMemoryContext (could still fail here)
165 : */
166 20 : convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
167 : sizeof(ConvProcInfo));
168 20 : convinfo->s_encoding = current_server_encoding;
169 20 : convinfo->c_encoding = encoding;
170 20 : fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
171 : TopMemoryContext);
172 20 : fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
173 : TopMemoryContext);
174 :
175 : /* Attach new info to head of list */
176 20 : oldcontext = MemoryContextSwitchTo(TopMemoryContext);
177 20 : ConvProcList = lcons(convinfo, ConvProcList);
178 20 : MemoryContextSwitchTo(oldcontext);
179 :
180 : /*
181 : * We cannot yet remove any older entry for the same encoding pair,
182 : * since it could still be in use. SetClientEncoding will clean up.
183 : */
184 :
185 20 : return 0; /* success */
186 : }
187 : else
188 : {
189 : /*
190 : * If we're not in a live transaction, the only thing we can do is
191 : * restore a previous setting using the cache. This covers all
192 : * transaction-rollback cases. The only case it might not work for is
193 : * trying to change client_encoding on the fly by editing
194 : * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
195 : * thing to do anyway.
196 : */
197 0 : foreach(lc, ConvProcList)
198 : {
199 0 : ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
200 :
201 0 : if (oldinfo->s_encoding == current_server_encoding &&
202 0 : oldinfo->c_encoding == encoding)
203 0 : return 0;
204 : }
205 :
206 0 : return -1; /* it's not cached, so fail */
207 : }
208 : }
209 :
210 : /*
211 : * Set the active client encoding and set up the conversion-function pointers.
212 : * PrepareClientEncoding should have been called previously for this encoding.
213 : *
214 : * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
215 : */
216 : int
217 74862 : SetClientEncoding(int encoding)
218 : {
219 : int current_server_encoding;
220 : bool found;
221 : ListCell *lc;
222 :
223 74862 : if (!PG_VALID_FE_ENCODING(encoding))
224 0 : return -1;
225 :
226 : /* Can't do anything during startup, per notes above */
227 74862 : if (!backend_startup_complete)
228 : {
229 36210 : pending_client_encoding = encoding;
230 36210 : return 0;
231 : }
232 :
233 38652 : current_server_encoding = GetDatabaseEncoding();
234 :
235 : /*
236 : * Check for cases that require no conversion function.
237 : */
238 38652 : if (current_server_encoding == encoding ||
239 3010 : current_server_encoding == PG_SQL_ASCII ||
240 : encoding == PG_SQL_ASCII)
241 : {
242 38632 : ClientEncoding = &pg_enc2name_tbl[encoding];
243 38632 : ToServerConvProc = NULL;
244 38632 : ToClientConvProc = NULL;
245 38632 : return 0;
246 : }
247 :
248 : /*
249 : * Search the cache for the entry previously prepared by
250 : * PrepareClientEncoding; if there isn't one, we lose. While at it,
251 : * release any duplicate entries so that repeated Prepare/Set cycles don't
252 : * leak memory.
253 : */
254 20 : found = false;
255 46 : foreach(lc, ConvProcList)
256 : {
257 26 : ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
258 :
259 26 : if (convinfo->s_encoding == current_server_encoding &&
260 26 : convinfo->c_encoding == encoding)
261 : {
262 20 : if (!found)
263 : {
264 : /* Found newest entry, so set up */
265 20 : ClientEncoding = &pg_enc2name_tbl[encoding];
266 20 : ToServerConvProc = &convinfo->to_server_info;
267 20 : ToClientConvProc = &convinfo->to_client_info;
268 20 : found = true;
269 : }
270 : else
271 : {
272 : /* Duplicate entry, release it */
273 0 : ConvProcList = foreach_delete_current(ConvProcList, lc);
274 0 : pfree(convinfo);
275 : }
276 : }
277 : }
278 :
279 20 : if (found)
280 20 : return 0; /* success */
281 : else
282 0 : return -1; /* it's not cached, so fail */
283 : }
284 :
285 : /*
286 : * Initialize client encoding conversions.
287 : * Called from InitPostgres() once during backend startup.
288 : */
289 : void
290 34962 : InitializeClientEncoding(void)
291 : {
292 : int current_server_encoding;
293 :
294 : Assert(!backend_startup_complete);
295 34962 : backend_startup_complete = true;
296 :
297 69924 : if (PrepareClientEncoding(pending_client_encoding) < 0 ||
298 34962 : SetClientEncoding(pending_client_encoding) < 0)
299 : {
300 : /*
301 : * Oops, the requested conversion is not available. We couldn't fail
302 : * before, but we can now.
303 : */
304 0 : ereport(FATAL,
305 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
306 : errmsg("conversion between %s and %s is not supported",
307 : pg_enc2name_tbl[pending_client_encoding].name,
308 : GetDatabaseEncodingName())));
309 : }
310 :
311 : /*
312 : * Also look up the UTF8-to-server conversion function if needed. Since
313 : * the server encoding is fixed within any one backend process, we don't
314 : * have to do this more than once.
315 : */
316 34962 : current_server_encoding = GetDatabaseEncoding();
317 34962 : if (current_server_encoding != PG_UTF8 &&
318 : current_server_encoding != PG_SQL_ASCII)
319 : {
320 : Oid utf8_to_server_proc;
321 :
322 198 : AssertCouldGetRelation();
323 : utf8_to_server_proc =
324 198 : FindDefaultConversionProc(PG_UTF8,
325 : current_server_encoding);
326 : /* If there's no such conversion, just leave the pointer as NULL */
327 198 : if (OidIsValid(utf8_to_server_proc))
328 : {
329 : FmgrInfo *finfo;
330 :
331 198 : finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
332 : sizeof(FmgrInfo));
333 198 : fmgr_info_cxt(utf8_to_server_proc, finfo,
334 : TopMemoryContext);
335 : /* Set Utf8ToServerConvProc only after data is fully valid */
336 198 : Utf8ToServerConvProc = finfo;
337 : }
338 : }
339 34962 : }
340 :
341 : /*
342 : * returns the current client encoding
343 : */
344 : int
345 11042 : pg_get_client_encoding(void)
346 : {
347 11042 : return ClientEncoding->encoding;
348 : }
349 :
350 : /*
351 : * returns the current client encoding name
352 : */
353 : const char *
354 0 : pg_get_client_encoding_name(void)
355 : {
356 0 : return ClientEncoding->name;
357 : }
358 :
359 : /*
360 : * Convert src string to another encoding (general case).
361 : *
362 : * See the notes about string conversion functions at the top of this file.
363 : */
364 : unsigned char *
365 3050 : pg_do_encoding_conversion(unsigned char *src, int len,
366 : int src_encoding, int dest_encoding)
367 : {
368 : unsigned char *result;
369 : Oid proc;
370 :
371 3050 : if (len <= 0)
372 36 : return src; /* empty string is always valid */
373 :
374 3014 : if (src_encoding == dest_encoding)
375 2200 : return src; /* no conversion required, assume valid */
376 :
377 814 : if (dest_encoding == PG_SQL_ASCII)
378 0 : return src; /* any string is valid in SQL_ASCII */
379 :
380 814 : if (src_encoding == PG_SQL_ASCII)
381 : {
382 : /* No conversion is possible, but we must validate the result */
383 16 : (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
384 16 : return src;
385 : }
386 :
387 798 : if (!IsTransactionState()) /* shouldn't happen */
388 0 : elog(ERROR, "cannot perform encoding conversion outside a transaction");
389 :
390 798 : proc = FindDefaultConversionProc(src_encoding, dest_encoding);
391 798 : if (!OidIsValid(proc))
392 0 : ereport(ERROR,
393 : (errcode(ERRCODE_UNDEFINED_FUNCTION),
394 : errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
395 : pg_encoding_to_char(src_encoding),
396 : pg_encoding_to_char(dest_encoding))));
397 :
398 : /*
399 : * Allocate space for conversion result, being wary of integer overflow.
400 : *
401 : * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
402 : * required space, so it might exceed MaxAllocSize even though the result
403 : * would actually fit. We do not want to hand back a result string that
404 : * exceeds MaxAllocSize, because callers might not cope gracefully --- but
405 : * if we just allocate more than that, and don't use it, that's fine.
406 : */
407 798 : if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
408 0 : ereport(ERROR,
409 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
410 : errmsg("out of memory"),
411 : errdetail("String of %d bytes is too long for encoding conversion.",
412 : len)));
413 :
414 : result = (unsigned char *)
415 798 : MemoryContextAllocHuge(CurrentMemoryContext,
416 798 : (Size) len * MAX_CONVERSION_GROWTH + 1);
417 :
418 798 : (void) OidFunctionCall6(proc,
419 : Int32GetDatum(src_encoding),
420 : Int32GetDatum(dest_encoding),
421 : CStringGetDatum((char *) src),
422 : CStringGetDatum((char *) result),
423 : Int32GetDatum(len),
424 : BoolGetDatum(false));
425 :
426 : /*
427 : * If the result is large, it's worth repalloc'ing to release any extra
428 : * space we asked for. The cutoff here is somewhat arbitrary, but we
429 : * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
430 : */
431 798 : if (len > 1000000)
432 : {
433 0 : Size resultlen = strlen((char *) result);
434 :
435 0 : if (resultlen >= MaxAllocSize)
436 0 : ereport(ERROR,
437 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
438 : errmsg("out of memory"),
439 : errdetail("String of %d bytes is too long for encoding conversion.",
440 : len)));
441 :
442 0 : result = (unsigned char *) repalloc(result, resultlen + 1);
443 : }
444 :
445 798 : return result;
446 : }
447 :
448 : /*
449 : * Convert src string to another encoding.
450 : *
451 : * This function has a different API than the other conversion functions.
452 : * The caller should've looked up the conversion function using
453 : * FindDefaultConversionProc(). Unlike the other functions, the converted
454 : * result is not palloc'd. It is written to the caller-supplied buffer
455 : * instead.
456 : *
457 : * src_encoding - encoding to convert from
458 : * dest_encoding - encoding to convert to
459 : * src, srclen - input buffer and its length in bytes
460 : * dest, destlen - destination buffer and its size in bytes
461 : *
462 : * The output is null-terminated.
463 : *
464 : * If destlen < srclen * MAX_CONVERSION_INPUT_LENGTH + 1, the converted output
465 : * wouldn't necessarily fit in the output buffer, and the function will not
466 : * convert the whole input.
467 : *
468 : * TODO: The conversion function interface is not great. Firstly, it
469 : * would be nice to pass through the destination buffer size to the
470 : * conversion function, so that if you pass a shorter destination buffer, it
471 : * could still continue to fill up the whole buffer. Currently, we have to
472 : * assume worst case expansion and stop the conversion short, even if there
473 : * is in fact space left in the destination buffer. Secondly, it would be
474 : * nice to return the number of bytes written to the caller, to avoid a call
475 : * to strlen().
476 : */
477 : int
478 5820 : pg_do_encoding_conversion_buf(Oid proc,
479 : int src_encoding,
480 : int dest_encoding,
481 : unsigned char *src, int srclen,
482 : unsigned char *dest, int destlen,
483 : bool noError)
484 : {
485 : Datum result;
486 :
487 : /*
488 : * If the destination buffer is not large enough to hold the result in the
489 : * worst case, limit the input size passed to the conversion function.
490 : */
491 5820 : if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
492 5760 : srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
493 :
494 5820 : result = OidFunctionCall6(proc,
495 : Int32GetDatum(src_encoding),
496 : Int32GetDatum(dest_encoding),
497 : CStringGetDatum((char *) src),
498 : CStringGetDatum((char *) dest),
499 : Int32GetDatum(srclen),
500 : BoolGetDatum(noError));
501 3450 : return DatumGetInt32(result);
502 : }
503 :
504 : /*
505 : * Convert string to encoding encoding_name. The source
506 : * encoding is the DB encoding.
507 : *
508 : * BYTEA convert_to(TEXT string, NAME encoding_name)
509 : */
510 : Datum
511 408 : pg_convert_to(PG_FUNCTION_ARGS)
512 : {
513 408 : Datum string = PG_GETARG_DATUM(0);
514 408 : Datum dest_encoding_name = PG_GETARG_DATUM(1);
515 408 : Datum src_encoding_name = DirectFunctionCall1(namein,
516 : CStringGetDatum(DatabaseEncoding->name));
517 : Datum result;
518 :
519 : /*
520 : * pg_convert expects a bytea as its first argument. We're passing it a
521 : * text argument here, relying on the fact that they are both in fact
522 : * varlena types, and thus structurally identical.
523 : */
524 408 : result = DirectFunctionCall3(pg_convert, string,
525 : src_encoding_name, dest_encoding_name);
526 :
527 402 : PG_RETURN_DATUM(result);
528 : }
529 :
530 : /*
531 : * Convert string from encoding encoding_name. The destination
532 : * encoding is the DB encoding.
533 : *
534 : * TEXT convert_from(BYTEA string, NAME encoding_name)
535 : */
536 : Datum
537 592 : pg_convert_from(PG_FUNCTION_ARGS)
538 : {
539 592 : Datum string = PG_GETARG_DATUM(0);
540 592 : Datum src_encoding_name = PG_GETARG_DATUM(1);
541 592 : Datum dest_encoding_name = DirectFunctionCall1(namein,
542 : CStringGetDatum(DatabaseEncoding->name));
543 : Datum result;
544 :
545 592 : result = DirectFunctionCall3(pg_convert, string,
546 : src_encoding_name, dest_encoding_name);
547 :
548 : /*
549 : * pg_convert returns a bytea, which we in turn return as text, relying on
550 : * the fact that they are both in fact varlena types, and thus
551 : * structurally identical. Although not all bytea values are valid text,
552 : * in this case it will be because we've told pg_convert to return one
553 : * that is valid as text in the current database encoding.
554 : */
555 586 : PG_RETURN_DATUM(result);
556 : }
557 :
558 : /*
559 : * Convert string between two arbitrary encodings.
560 : *
561 : * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
562 : */
563 : Datum
564 1768 : pg_convert(PG_FUNCTION_ARGS)
565 : {
566 1768 : bytea *string = PG_GETARG_BYTEA_PP(0);
567 1768 : char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
568 1768 : int src_encoding = pg_char_to_encoding(src_encoding_name);
569 1768 : char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
570 1768 : int dest_encoding = pg_char_to_encoding(dest_encoding_name);
571 : const char *src_str;
572 : char *dest_str;
573 : bytea *retval;
574 : int len;
575 :
576 1768 : if (src_encoding < 0)
577 0 : ereport(ERROR,
578 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
579 : errmsg("invalid source encoding name \"%s\"",
580 : src_encoding_name)));
581 1768 : if (dest_encoding < 0)
582 0 : ereport(ERROR,
583 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
584 : errmsg("invalid destination encoding name \"%s\"",
585 : dest_encoding_name)));
586 :
587 : /* make sure that source string is valid */
588 1768 : len = VARSIZE_ANY_EXHDR(string);
589 1768 : src_str = VARDATA_ANY(string);
590 1768 : (void) pg_verify_mbstr(src_encoding, src_str, len, false);
591 :
592 : /* perform conversion */
593 1756 : dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
594 : len,
595 : src_encoding,
596 : dest_encoding);
597 :
598 :
599 : /* return source string if no conversion happened */
600 1756 : if (dest_str == src_str)
601 976 : PG_RETURN_BYTEA_P(string);
602 :
603 : /*
604 : * build bytea data type structure.
605 : */
606 780 : len = strlen(dest_str);
607 780 : retval = (bytea *) palloc(len + VARHDRSZ);
608 780 : SET_VARSIZE(retval, len + VARHDRSZ);
609 780 : memcpy(VARDATA(retval), dest_str, len);
610 780 : pfree(dest_str);
611 :
612 : /* free memory if allocated by the toaster */
613 780 : PG_FREE_IF_COPY(string, 0);
614 :
615 780 : PG_RETURN_BYTEA_P(retval);
616 : }
617 :
618 : /*
619 : * get the length of the string considered as text in the specified
620 : * encoding. Raises an error if the data is not valid in that
621 : * encoding.
622 : *
623 : * INT4 length (BYTEA string, NAME src_encoding_name)
624 : */
625 : Datum
626 0 : length_in_encoding(PG_FUNCTION_ARGS)
627 : {
628 0 : bytea *string = PG_GETARG_BYTEA_PP(0);
629 0 : char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
630 0 : int src_encoding = pg_char_to_encoding(src_encoding_name);
631 : const char *src_str;
632 : int len;
633 : int retval;
634 :
635 0 : if (src_encoding < 0)
636 0 : ereport(ERROR,
637 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
638 : errmsg("invalid encoding name \"%s\"",
639 : src_encoding_name)));
640 :
641 0 : len = VARSIZE_ANY_EXHDR(string);
642 0 : src_str = VARDATA_ANY(string);
643 :
644 0 : retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
645 :
646 0 : PG_RETURN_INT32(retval);
647 : }
648 :
649 : /*
650 : * Get maximum multibyte character length in the specified encoding.
651 : *
652 : * Note encoding is specified numerically, not by name as above.
653 : */
654 : Datum
655 0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
656 : {
657 0 : int encoding = PG_GETARG_INT32(0);
658 :
659 0 : if (PG_VALID_ENCODING(encoding))
660 0 : PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
661 : else
662 0 : PG_RETURN_NULL();
663 : }
664 :
665 : /*
666 : * Convert client encoding to server encoding.
667 : *
668 : * See the notes about string conversion functions at the top of this file.
669 : */
670 : char *
671 848680 : pg_client_to_server(const char *s, int len)
672 : {
673 848680 : return pg_any_to_server(s, len, ClientEncoding->encoding);
674 : }
675 :
676 : /*
677 : * Convert any encoding to server encoding.
678 : *
679 : * See the notes about string conversion functions at the top of this file.
680 : *
681 : * Unlike the other string conversion functions, this will apply validation
682 : * even if encoding == DatabaseEncoding->encoding. This is because this is
683 : * used to process data coming in from outside the database, and we never
684 : * want to just assume validity.
685 : */
686 : char *
687 936566 : pg_any_to_server(const char *s, int len, int encoding)
688 : {
689 936566 : if (len <= 0)
690 80508 : return unconstify(char *, s); /* empty string is always valid */
691 :
692 856058 : if (encoding == DatabaseEncoding->encoding ||
693 : encoding == PG_SQL_ASCII)
694 : {
695 : /*
696 : * No conversion is needed, but we must still validate the data.
697 : */
698 855690 : (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
699 855688 : return unconstify(char *, s);
700 : }
701 :
702 368 : if (DatabaseEncoding->encoding == PG_SQL_ASCII)
703 : {
704 : /*
705 : * No conversion is possible, but we must still validate the data,
706 : * because the client-side code might have done string escaping using
707 : * the selected client_encoding. If the client encoding is ASCII-safe
708 : * then we just do a straight validation under that encoding. For an
709 : * ASCII-unsafe encoding we have a problem: we dare not pass such data
710 : * to the parser but we have no way to convert it. We compromise by
711 : * rejecting the data if it contains any non-ASCII characters.
712 : */
713 308 : if (PG_VALID_BE_ENCODING(encoding))
714 248 : (void) pg_verify_mbstr(encoding, s, len, false);
715 : else
716 : {
717 : int i;
718 :
719 1908 : for (i = 0; i < len; i++)
720 : {
721 1848 : if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
722 0 : ereport(ERROR,
723 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
724 : errmsg("invalid byte value for encoding \"%s\": 0x%02x",
725 : pg_enc2name_tbl[PG_SQL_ASCII].name,
726 : (unsigned char) s[i])));
727 : }
728 : }
729 308 : return unconstify(char *, s);
730 : }
731 :
732 : /* Fast path if we can use cached conversion function */
733 60 : if (encoding == ClientEncoding->encoding)
734 60 : return perform_default_encoding_conversion(s, len, true);
735 :
736 : /* General case ... will not work outside transactions */
737 0 : return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
738 : len,
739 : encoding,
740 0 : DatabaseEncoding->encoding);
741 : }
742 :
743 : /*
744 : * Convert server encoding to client encoding.
745 : *
746 : * See the notes about string conversion functions at the top of this file.
747 : */
748 : char *
749 37101632 : pg_server_to_client(const char *s, int len)
750 : {
751 37101632 : return pg_server_to_any(s, len, ClientEncoding->encoding);
752 : }
753 :
754 : /*
755 : * Convert server encoding to any encoding.
756 : *
757 : * See the notes about string conversion functions at the top of this file.
758 : */
759 : char *
760 37140446 : pg_server_to_any(const char *s, int len, int encoding)
761 : {
762 37140446 : if (len <= 0)
763 265534 : return unconstify(char *, s); /* empty string is always valid */
764 :
765 36874912 : if (encoding == DatabaseEncoding->encoding ||
766 : encoding == PG_SQL_ASCII)
767 36874338 : return unconstify(char *, s); /* assume data is valid */
768 :
769 574 : if (DatabaseEncoding->encoding == PG_SQL_ASCII)
770 : {
771 : /* No conversion is possible, but we must validate the result */
772 168 : (void) pg_verify_mbstr(encoding, s, len, false);
773 168 : return unconstify(char *, s);
774 : }
775 :
776 : /* Fast path if we can use cached conversion function */
777 406 : if (encoding == ClientEncoding->encoding)
778 388 : return perform_default_encoding_conversion(s, len, false);
779 :
780 : /* General case ... will not work outside transactions */
781 18 : return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
782 : len,
783 18 : DatabaseEncoding->encoding,
784 : encoding);
785 : }
786 :
787 : /*
788 : * Perform default encoding conversion using cached FmgrInfo. Since
789 : * this function does not access database at all, it is safe to call
790 : * outside transactions. If the conversion has not been set up by
791 : * SetClientEncoding(), no conversion is performed.
792 : */
793 : static char *
794 448 : perform_default_encoding_conversion(const char *src, int len,
795 : bool is_client_to_server)
796 : {
797 : char *result;
798 : int src_encoding,
799 : dest_encoding;
800 : FmgrInfo *flinfo;
801 :
802 448 : if (is_client_to_server)
803 : {
804 60 : src_encoding = ClientEncoding->encoding;
805 60 : dest_encoding = DatabaseEncoding->encoding;
806 60 : flinfo = ToServerConvProc;
807 : }
808 : else
809 : {
810 388 : src_encoding = DatabaseEncoding->encoding;
811 388 : dest_encoding = ClientEncoding->encoding;
812 388 : flinfo = ToClientConvProc;
813 : }
814 :
815 448 : if (flinfo == NULL)
816 0 : return unconstify(char *, src);
817 :
818 : /*
819 : * Allocate space for conversion result, being wary of integer overflow.
820 : * See comments in pg_do_encoding_conversion.
821 : */
822 448 : if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
823 0 : ereport(ERROR,
824 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
825 : errmsg("out of memory"),
826 : errdetail("String of %d bytes is too long for encoding conversion.",
827 : len)));
828 :
829 : result = (char *)
830 448 : MemoryContextAllocHuge(CurrentMemoryContext,
831 448 : (Size) len * MAX_CONVERSION_GROWTH + 1);
832 :
833 448 : FunctionCall6(flinfo,
834 : Int32GetDatum(src_encoding),
835 : Int32GetDatum(dest_encoding),
836 : CStringGetDatum(src),
837 : CStringGetDatum(result),
838 : Int32GetDatum(len),
839 : BoolGetDatum(false));
840 :
841 : /*
842 : * Release extra space if there might be a lot --- see comments in
843 : * pg_do_encoding_conversion.
844 : */
845 448 : if (len > 1000000)
846 : {
847 0 : Size resultlen = strlen(result);
848 :
849 0 : if (resultlen >= MaxAllocSize)
850 0 : ereport(ERROR,
851 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
852 : errmsg("out of memory"),
853 : errdetail("String of %d bytes is too long for encoding conversion.",
854 : len)));
855 :
856 0 : result = (char *) repalloc(result, resultlen + 1);
857 : }
858 :
859 448 : return result;
860 : }
861 :
862 : /*
863 : * Convert a single Unicode code point into a string in the server encoding.
864 : *
865 : * The code point given by "c" is converted and stored at *s, which must
866 : * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
867 : * The output will have a trailing '\0'. Throws error if the conversion
868 : * cannot be performed.
869 : *
870 : * Note that this relies on having previously looked up any required
871 : * conversion function. That's partly for speed but mostly because the parser
872 : * may call this outside any transaction, or in an aborted transaction.
873 : */
874 : void
875 1046 : pg_unicode_to_server(char32_t c, unsigned char *s)
876 : {
877 : unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
878 : int c_as_utf8_len;
879 : int server_encoding;
880 :
881 : /*
882 : * Complain if invalid Unicode code point. The choice of errcode here is
883 : * debatable, but really our caller should have checked this anyway.
884 : */
885 1046 : if (!is_valid_unicode_codepoint(c))
886 0 : ereport(ERROR,
887 : (errcode(ERRCODE_SYNTAX_ERROR),
888 : errmsg("invalid Unicode code point")));
889 :
890 : /* Otherwise, if it's in ASCII range, conversion is trivial */
891 1046 : if (c <= 0x7F)
892 : {
893 352 : s[0] = (unsigned char) c;
894 352 : s[1] = '\0';
895 1046 : return;
896 : }
897 :
898 : /* If the server encoding is UTF-8, we just need to reformat the code */
899 694 : server_encoding = GetDatabaseEncoding();
900 694 : if (server_encoding == PG_UTF8)
901 : {
902 694 : unicode_to_utf8(c, s);
903 694 : s[pg_utf_mblen(s)] = '\0';
904 694 : return;
905 : }
906 :
907 : /* For all other cases, we must have a conversion function available */
908 0 : if (Utf8ToServerConvProc == NULL)
909 0 : ereport(ERROR,
910 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
911 : errmsg("conversion between %s and %s is not supported",
912 : pg_enc2name_tbl[PG_UTF8].name,
913 : GetDatabaseEncodingName())));
914 :
915 : /* Construct UTF-8 source string */
916 0 : unicode_to_utf8(c, c_as_utf8);
917 0 : c_as_utf8_len = pg_utf_mblen(c_as_utf8);
918 0 : c_as_utf8[c_as_utf8_len] = '\0';
919 :
920 : /* Convert, or throw error if we can't */
921 0 : FunctionCall6(Utf8ToServerConvProc,
922 : Int32GetDatum(PG_UTF8),
923 : Int32GetDatum(server_encoding),
924 : CStringGetDatum((char *) c_as_utf8),
925 : CStringGetDatum((char *) s),
926 : Int32GetDatum(c_as_utf8_len),
927 : BoolGetDatum(false));
928 : }
929 :
930 : /*
931 : * Convert a single Unicode code point into a string in the server encoding.
932 : *
933 : * Same as pg_unicode_to_server(), except that we don't throw errors,
934 : * but simply return false on conversion failure.
935 : */
936 : bool
937 84 : pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
938 : {
939 : unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
940 : int c_as_utf8_len;
941 : int converted_len;
942 : int server_encoding;
943 :
944 : /* Fail if invalid Unicode code point */
945 84 : if (!is_valid_unicode_codepoint(c))
946 0 : return false;
947 :
948 : /* Otherwise, if it's in ASCII range, conversion is trivial */
949 84 : if (c <= 0x7F)
950 : {
951 24 : s[0] = (unsigned char) c;
952 24 : s[1] = '\0';
953 24 : return true;
954 : }
955 :
956 : /* If the server encoding is UTF-8, we just need to reformat the code */
957 60 : server_encoding = GetDatabaseEncoding();
958 60 : if (server_encoding == PG_UTF8)
959 : {
960 60 : unicode_to_utf8(c, s);
961 60 : s[pg_utf_mblen(s)] = '\0';
962 60 : return true;
963 : }
964 :
965 : /* For all other cases, we must have a conversion function available */
966 0 : if (Utf8ToServerConvProc == NULL)
967 0 : return false;
968 :
969 : /* Construct UTF-8 source string */
970 0 : unicode_to_utf8(c, c_as_utf8);
971 0 : c_as_utf8_len = pg_utf_mblen(c_as_utf8);
972 0 : c_as_utf8[c_as_utf8_len] = '\0';
973 :
974 : /* Convert, but without throwing error if we can't */
975 0 : converted_len = DatumGetInt32(FunctionCall6(Utf8ToServerConvProc,
976 : Int32GetDatum(PG_UTF8),
977 : Int32GetDatum(server_encoding),
978 : CStringGetDatum((char *) c_as_utf8),
979 : CStringGetDatum((char *) s),
980 : Int32GetDatum(c_as_utf8_len),
981 : BoolGetDatum(true)));
982 :
983 : /* Conversion was successful iff it consumed the whole input */
984 0 : return (converted_len == c_as_utf8_len);
985 : }
986 :
987 :
988 : /* convert a multibyte string to a wchar */
989 : int
990 0 : pg_mb2wchar(const char *from, pg_wchar *to)
991 : {
992 0 : return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
993 : }
994 :
995 : /* convert a multibyte string to a wchar with a limited length */
996 : int
997 10222728 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
998 : {
999 10222728 : return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
1000 : }
1001 :
1002 : /* same, with any encoding */
1003 : int
1004 18616 : pg_encoding_mb2wchar_with_len(int encoding,
1005 : const char *from, pg_wchar *to, int len)
1006 : {
1007 18616 : return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
1008 : }
1009 :
1010 : /* convert a wchar string to a multibyte */
1011 : int
1012 0 : pg_wchar2mb(const pg_wchar *from, char *to)
1013 : {
1014 0 : return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
1015 : }
1016 :
1017 : /* convert a wchar string to a multibyte with a limited length */
1018 : int
1019 1116212 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
1020 : {
1021 1116212 : return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1022 : }
1023 :
1024 : /* same, with any encoding */
1025 : int
1026 192 : pg_encoding_wchar2mb_with_len(int encoding,
1027 : const pg_wchar *from, char *to, int len)
1028 : {
1029 192 : return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1030 : }
1031 :
1032 : /*
1033 : * Returns the byte length of a multibyte character sequence in a
1034 : * null-terminated string. Raises an illegal byte sequence error if the
1035 : * sequence would hit a null terminator.
1036 : *
1037 : * The caller is expected to have checked for a terminator at *mbstr == 0
1038 : * before calling, but some callers want 1 in that case, so this function
1039 : * continues that tradition.
1040 : *
1041 : * This must only be used for strings that have a null-terminator to enable
1042 : * bounds detection.
1043 : */
1044 : int
1045 4203206 : pg_mblen_cstr(const char *mbstr)
1046 : {
1047 4203206 : int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1048 :
1049 : /*
1050 : * The .mblen functions return 1 when given a pointer to a terminator.
1051 : * Some callers depend on that, so we tolerate it for now. Well-behaved
1052 : * callers check the leading byte for a terminator *before* calling.
1053 : */
1054 4228046 : for (int i = 1; i < length; ++i)
1055 24846 : if (unlikely(mbstr[i] == 0))
1056 6 : report_invalid_encoding_db(mbstr, length, i);
1057 :
1058 : /*
1059 : * String should be NUL-terminated, but checking that would make typical
1060 : * callers O(N^2), tripling Valgrind check-world time. Unless
1061 : * VALGRIND_EXPENSIVE, check 1 byte after each actual character. (If we
1062 : * found a character, not a terminator, the next byte must be a terminator
1063 : * or the start of the next character.) If the caller iterates the whole
1064 : * string, the last call will diagnose a missing terminator.
1065 : */
1066 4203200 : if (mbstr[0] != '\0')
1067 : {
1068 : #ifdef VALGRIND_EXPENSIVE
1069 : VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr));
1070 : #else
1071 : VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1);
1072 : #endif
1073 : }
1074 :
1075 4203200 : return length;
1076 : }
1077 :
1078 : /*
1079 : * Returns the byte length of a multibyte character sequence bounded by a range
1080 : * [mbstr, end) of at least one byte in size. Raises an illegal byte sequence
1081 : * error if the sequence would exceed the range.
1082 : */
1083 : int
1084 5524330 : pg_mblen_range(const char *mbstr, const char *end)
1085 : {
1086 5524330 : int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1087 :
1088 : Assert(end > mbstr);
1089 : #ifdef VALGRIND_EXPENSIVE
1090 : VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr);
1091 : #else
1092 : VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
1093 : #endif
1094 :
1095 5524330 : if (unlikely(mbstr + length > end))
1096 12 : report_invalid_encoding_db(mbstr, length, end - mbstr);
1097 :
1098 5524318 : return length;
1099 : }
1100 :
1101 : /*
1102 : * Returns the byte length of a multibyte character sequence bounded by a range
1103 : * extending for 'limit' bytes, which must be at least one. Raises an illegal
1104 : * byte sequence error if the sequence would exceed the range.
1105 : */
1106 : int
1107 220471946 : pg_mblen_with_len(const char *mbstr, int limit)
1108 : {
1109 220471946 : int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1110 :
1111 : Assert(limit >= 1);
1112 : #ifdef VALGRIND_EXPENSIVE
1113 : VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit);
1114 : #else
1115 : VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
1116 : #endif
1117 :
1118 220471946 : if (unlikely(length > limit))
1119 18 : report_invalid_encoding_db(mbstr, length, limit);
1120 :
1121 220471928 : return length;
1122 : }
1123 :
1124 :
1125 : /*
1126 : * Returns the length of a multibyte character sequence, without any
1127 : * validation of bounds.
1128 : *
1129 : * PLEASE NOTE: This function can only be used safely if the caller has
1130 : * already verified the input string, since otherwise there is a risk of
1131 : * overrunning the buffer if the string is invalid. A prior call to a
1132 : * pg_mbstrlen* function suffices.
1133 : */
1134 : int
1135 21399420 : pg_mblen_unbounded(const char *mbstr)
1136 : {
1137 21399420 : int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1138 :
1139 : VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
1140 :
1141 21399420 : return length;
1142 : }
1143 :
/*
 * Deprecated alias for pg_mblen_unbounded(), kept under its historical
 * name.  Do not use in new code; it is slated for removal in a later
 * version.
 */
int
pg_mblen(const char *mbstr)
{
	int			length = pg_mblen_unbounded(mbstr);

	return length;
}
1153 :
1154 : /* returns the display length of a multibyte character */
1155 : int
1156 8724 : pg_dsplen(const char *mbstr)
1157 : {
1158 8724 : return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
1159 : }
1160 :
/*
 * Number of characters (not bytes) in a NUL-terminated multibyte string in
 * the database encoding.
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			nchars;

	/* With a single-byte encoding, characters and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* Otherwise step through the string one character at a time */
	for (nchars = 0; *mbstr != '\0'; nchars++)
		mbstr += pg_mblen_cstr(mbstr);

	return nchars;
}
1178 :
/*
 * Number of characters (not bytes) in a multibyte string in the database
 * encoding, examining at most "limit" bytes.
 *
 * For multibyte encodings the scan stops at "limit" bytes or at a NUL byte,
 * whichever comes first.  For single-byte encodings "limit" is returned
 * directly, without looking at the string contents.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	const char *cursor = mbstr;
	int			remaining = limit;
	int			nchars = 0;

	/* Single-byte shortcut: one character per byte */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	while (remaining > 0 && *cursor != '\0')
	{
		/* Errors out if the character does not fit in what is left */
		int			chlen = pg_mblen_with_len(cursor, remaining);

		cursor += chlen;
		remaining -= chlen;
		nchars++;
	}

	return nchars;
}
1201 :
1202 : /*
1203 : * returns the byte length of a multibyte string
1204 : * (not necessarily NULL terminated)
1205 : * that is no longer than limit.
1206 : * this function does not break multibyte character boundary.
1207 : */
1208 : int
1209 330534 : pg_mbcliplen(const char *mbstr, int len, int limit)
1210 : {
1211 330534 : return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
1212 : len, limit);
1213 : }
1214 :
1215 : /*
1216 : * pg_mbcliplen with specified encoding; string must be valid in encoding
1217 : */
1218 : int
1219 330534 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
1220 : int len, int limit)
1221 : {
1222 : mblen_converter mblen_fn;
1223 330534 : int clen = 0;
1224 : int l;
1225 :
1226 : /* optimization for single byte encoding */
1227 330534 : if (pg_encoding_max_length(encoding) == 1)
1228 38490 : return cliplen(mbstr, len, limit);
1229 :
1230 292044 : mblen_fn = pg_wchar_table[encoding].mblen;
1231 :
1232 3219170 : while (len > 0 && *mbstr)
1233 : {
1234 3067838 : l = (*mblen_fn) ((const unsigned char *) mbstr);
1235 3067838 : if ((clen + l) > limit)
1236 94 : break;
1237 3067744 : clen += l;
1238 3067744 : if (clen == limit)
1239 140618 : break;
1240 2927126 : len -= l;
1241 2927126 : mbstr += l;
1242 : }
1243 292044 : return clen;
1244 : }
1245 :
/*
 * Like pg_mbcliplen(), except that "limit" counts characters rather than
 * bytes.  The result is still a byte length.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			kept_bytes = 0;
	int			nchars;

	/* Single-byte shortcut: characters and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	/* nchars is the 1-based ordinal of the character being examined */
	for (nchars = 1; len > 0 && *mbstr; nchars++)
	{
		/* Measured before the limit test, so it may raise an error first */
		int			chlen = pg_mblen_with_len(mbstr, len);

		if (nchars > limit)
			break;

		kept_bytes += chlen;
		len -= chlen;
		mbstr += chlen;
	}

	return kept_bytes;
}
1273 :
/*
 * Clip-length computation for single-byte encodings: the number of leading
 * non-NUL bytes of "str", capped at the smaller of "len" and "limit".
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			bound = (len < limit) ? len : limit;
	int			n;

	for (n = 0; n < bound; n++)
	{
		if (str[n] == '\0')
			break;
	}

	return n;
}
1285 :
1286 : void
1287 33912 : SetDatabaseEncoding(int encoding)
1288 : {
1289 33912 : if (!PG_VALID_BE_ENCODING(encoding))
1290 0 : elog(ERROR, "invalid database encoding: %d", encoding);
1291 :
1292 33912 : DatabaseEncoding = &pg_enc2name_tbl[encoding];
1293 : Assert(DatabaseEncoding->encoding == encoding);
1294 33912 : }
1295 :
1296 : void
1297 37890 : SetMessageEncoding(int encoding)
1298 : {
1299 : /* Some calls happen before we can elog()! */
1300 : Assert(PG_VALID_ENCODING(encoding));
1301 :
1302 37890 : MessageEncoding = &pg_enc2name_tbl[encoding];
1303 : Assert(MessageEncoding->encoding == encoding);
1304 37890 : }
1305 :
1306 : #ifdef ENABLE_NLS
1307 : /*
1308 : * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
1309 : * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1310 : * fail for gettext-internal causes like out-of-memory.
1311 : */
1312 : static bool
1313 3268 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1314 : {
1315 3268 : bool elog_ok = (CurrentMemoryContext != NULL);
1316 :
1317 3268 : if (!PG_VALID_ENCODING(encoding) || pg_enc2gettext_tbl[encoding] == NULL)
1318 0 : return false;
1319 :
1320 3268 : if (bind_textdomain_codeset(domainname,
1321 : pg_enc2gettext_tbl[encoding]) != NULL)
1322 3268 : return true;
1323 :
1324 0 : if (elog_ok)
1325 0 : elog(LOG, "bind_textdomain_codeset failed");
1326 : else
1327 0 : write_stderr("bind_textdomain_codeset failed");
1328 :
1329 0 : return false;
1330 : }
1331 :
1332 : /*
1333 : * Bind a gettext message domain to the codeset corresponding to the database
1334 : * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1335 : * Return the MessageEncoding implied by the new settings.
1336 : *
1337 : * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1338 : * When that matches the database encoding, we don't need to do anything. In
1339 : * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1340 : * database encoding, except for the C locale. (On Windows, we also permit a
1341 : * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
1342 : * gettext to the right codeset.
1343 : *
1344 : * On Windows, gettext defaults to the Windows ANSI code page. This is a
1345 : * convenient departure for software that passes the strings to Windows ANSI
1346 : * APIs, but we don't do that. Compel gettext to use database encoding or,
1347 : * failing that, the LC_CTYPE encoding as it would on other platforms.
1348 : *
1349 : * This function is called before elog() and palloc() are usable.
1350 : */
1351 : int
1352 41668 : pg_bind_textdomain_codeset(const char *domainname)
1353 : {
1354 41668 : bool elog_ok = (CurrentMemoryContext != NULL);
1355 41668 : int encoding = GetDatabaseEncoding();
1356 : int new_msgenc;
1357 :
1358 : #ifndef WIN32
1359 41668 : const char *ctype = setlocale(LC_CTYPE, NULL);
1360 :
1361 41668 : if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1362 : #endif
1363 7326 : if (encoding != PG_SQL_ASCII &&
1364 3268 : raw_pg_bind_textdomain_codeset(domainname, encoding))
1365 3268 : return encoding;
1366 :
1367 38400 : new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
1368 38400 : if (new_msgenc < 0)
1369 0 : new_msgenc = PG_SQL_ASCII;
1370 :
1371 : #ifdef WIN32
1372 : if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1373 : /* On failure, the old message encoding remains valid. */
1374 : return GetMessageEncoding();
1375 : #endif
1376 :
1377 38400 : return new_msgenc;
1378 : }
1379 : #endif
1380 :
1381 : /*
1382 : * The database encoding, also called the server encoding, represents the
1383 : * encoding of data stored in text-like data types. Affected types include
1384 : * cstring, text, varchar, name, xml, and json.
1385 : */
1386 : int
1387 8489002 : GetDatabaseEncoding(void)
1388 : {
1389 8489002 : return DatabaseEncoding->encoding;
1390 : }
1391 :
1392 : const char *
1393 71044 : GetDatabaseEncodingName(void)
1394 : {
1395 71044 : return DatabaseEncoding->name;
1396 : }
1397 :
1398 : Datum
1399 102 : getdatabaseencoding(PG_FUNCTION_ARGS)
1400 : {
1401 102 : return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1402 : }
1403 :
1404 : Datum
1405 0 : pg_client_encoding(PG_FUNCTION_ARGS)
1406 : {
1407 0 : return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1408 : }
1409 :
1410 : Datum
1411 36 : PG_char_to_encoding(PG_FUNCTION_ARGS)
1412 : {
1413 36 : Name s = PG_GETARG_NAME(0);
1414 :
1415 36 : PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
1416 : }
1417 :
1418 : Datum
1419 4910 : PG_encoding_to_char(PG_FUNCTION_ARGS)
1420 : {
1421 4910 : int32 encoding = PG_GETARG_INT32(0);
1422 4910 : const char *encoding_name = pg_encoding_to_char(encoding);
1423 :
1424 4910 : return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
1425 : }
1426 :
1427 : /*
1428 : * gettext() returns messages in this encoding. This often matches the
1429 : * database encoding, but it differs for SQL_ASCII databases, for processes
1430 : * not attached to a database, and under a database encoding lacking iconv
1431 : * support (MULE_INTERNAL).
1432 : */
1433 : int
1434 0 : GetMessageEncoding(void)
1435 : {
1436 0 : return MessageEncoding->encoding;
1437 : }
1438 :
1439 :
1440 : /*
1441 : * Generic character incrementer function.
1442 : *
1443 : * Not knowing anything about the properties of the encoding in use, we just
1444 : * keep incrementing the last byte until we get a validly-encoded result,
1445 : * or we run out of values to try. We don't bother to try incrementing
1446 : * higher-order bytes, so there's no growth in runtime for wider characters.
1447 : * (If we did try to do that, we'd need to consider the likelihood that 255
1448 : * is not a valid final byte in the encoding.)
1449 : */
1450 : static bool
1451 104 : pg_generic_charinc(unsigned char *charptr, int len)
1452 : {
1453 104 : unsigned char *lastbyte = charptr + len - 1;
1454 : mbchar_verifier mbverify;
1455 :
1456 : /* We can just invoke the character verifier directly. */
1457 104 : mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
1458 :
1459 104 : while (*lastbyte < (unsigned char) 255)
1460 : {
1461 104 : (*lastbyte)++;
1462 104 : if ((*mbverify) (charptr, len) == len)
1463 104 : return true;
1464 : }
1465 :
1466 0 : return false;
1467 : }
1468 :
/*
 * UTF-8 character incrementer.
 *
 * Trailing bytes are examined from the last one backwards, and the first
 * one found below its maximum is incremented.  The maximum is 0xBF, except
 * for the second byte, which is capped at 0x9F after a lead byte of 0xED
 * (to stay clear of the surrogate range, which is not valid UTF-8) and at
 * 0x8F after a lead byte of 0xF4.  If every trailing byte is at its
 * maximum, the lead byte is incremented unless it is 0x7F, 0xDF, 0xEF or
 * 0xF4, in which case we fail.
 *
 * Bytes to the right of the incremented one are not reset to their
 * minimums, because an exhaustive search is unaffordable (see
 * make_greater_string).
 *
 * Returns true if the character was modified in place, false otherwise.
 * Lengths outside 1..4 (including the 5- and 6-byte forms) are rejected.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	int			pos;

	if (length < 1 || length > 4)
		return false;

	/* Try the trailing bytes, last first */
	for (pos = length - 1; pos >= 1; pos--)
	{
		unsigned char ceiling = 0xBF;

		/* The second byte has tighter caps after certain lead bytes */
		if (pos == 1)
		{
			if (charptr[0] == 0xED)
				ceiling = 0x9F;
			else if (charptr[0] == 0xF4)
				ceiling = 0x8F;
		}

		if (charptr[pos] < ceiling)
		{
			charptr[pos]++;
			return true;
		}
	}

	/* All trailing bytes are maxed out (or there are none): try the lead */
	switch (charptr[0])
	{
		case 0x7F:
		case 0xDF:
		case 0xEF:
		case 0xF4:
			return false;
		default:
			break;
	}

	charptr[0]++;
	return true;
}
1541 :
1542 : /*
1543 : * EUC-JP character incrementer function.
1544 : *
1545 : * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1546 : * representing JIS X 0201 characters with the second byte ranging between
1547 : * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1548 : * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1549 : *
1550 : * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1551 : * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1552 : * is incremented if possible, otherwise the second-to-last byte.
1553 : *
1554 : * If the sequence starts with a value other than the above and its MSB
1555 : * is set, it must be a two-byte sequence representing JIS X 0208 characters
1556 : * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1557 : * incremented if possible, otherwise the second-to-last byte.
1558 : *
1559 : * Otherwise, the sequence is a single-byte ASCII character. It is
1560 : * incremented up to 0x7f.
1561 : */
1562 : static bool
1563 0 : pg_eucjp_increment(unsigned char *charptr, int length)
1564 : {
1565 : unsigned char c1,
1566 : c2;
1567 : int i;
1568 :
1569 0 : c1 = *charptr;
1570 :
1571 0 : switch (c1)
1572 : {
1573 0 : case SS2: /* JIS X 0201 */
1574 0 : if (length != 2)
1575 0 : return false;
1576 :
1577 0 : c2 = charptr[1];
1578 :
1579 0 : if (c2 >= 0xdf)
1580 0 : charptr[0] = charptr[1] = 0xa1;
1581 0 : else if (c2 < 0xa1)
1582 0 : charptr[1] = 0xa1;
1583 : else
1584 0 : charptr[1]++;
1585 0 : break;
1586 :
1587 0 : case SS3: /* JIS X 0212 */
1588 0 : if (length != 3)
1589 0 : return false;
1590 :
1591 0 : for (i = 2; i > 0; i--)
1592 : {
1593 0 : c2 = charptr[i];
1594 0 : if (c2 < 0xa1)
1595 : {
1596 0 : charptr[i] = 0xa1;
1597 0 : return true;
1598 : }
1599 0 : else if (c2 < 0xfe)
1600 : {
1601 0 : charptr[i]++;
1602 0 : return true;
1603 : }
1604 : }
1605 :
1606 : /* Out of 3-byte code region */
1607 0 : return false;
1608 :
1609 0 : default:
1610 0 : if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1611 : {
1612 0 : if (length != 2)
1613 0 : return false;
1614 :
1615 0 : for (i = 1; i >= 0; i--)
1616 : {
1617 0 : c2 = charptr[i];
1618 0 : if (c2 < 0xa1)
1619 : {
1620 0 : charptr[i] = 0xa1;
1621 0 : return true;
1622 : }
1623 0 : else if (c2 < 0xfe)
1624 : {
1625 0 : charptr[i]++;
1626 0 : return true;
1627 : }
1628 : }
1629 :
1630 : /* Out of 2 byte code region */
1631 0 : return false;
1632 : }
1633 : else
1634 : { /* ASCII, single byte */
1635 0 : if (c1 > 0x7e)
1636 0 : return false;
1637 0 : (*charptr)++;
1638 : }
1639 0 : break;
1640 : }
1641 :
1642 0 : return true;
1643 : }
1644 :
1645 : /*
1646 : * get the character incrementer for the encoding for the current database
1647 : */
1648 : mbcharacter_incrementer
1649 3586 : pg_database_encoding_character_incrementer(void)
1650 : {
1651 : /*
1652 : * Eventually it might be best to add a field to pg_wchar_table[], but for
1653 : * now we just use a switch.
1654 : */
1655 3586 : switch (GetDatabaseEncoding())
1656 : {
1657 3482 : case PG_UTF8:
1658 3482 : return pg_utf8_increment;
1659 :
1660 0 : case PG_EUC_JP:
1661 0 : return pg_eucjp_increment;
1662 :
1663 104 : default:
1664 104 : return pg_generic_charinc;
1665 : }
1666 : }
1667 :
1668 : /*
1669 : * fetch maximum length of the encoding for the current database
1670 : */
1671 : int
1672 5998312 : pg_database_encoding_max_length(void)
1673 : {
1674 5998312 : return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1675 : }
1676 :
/*
 * Check that "mbstr" ("len" bytes) is validly encoded in the current
 * database encoding.  Behaves exactly like pg_verify_mbstr() otherwise.
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			db_encoding = GetDatabaseEncoding();

	return pg_verify_mbstr(db_encoding, mbstr, len, noError);
}
1686 :
1687 : /*
1688 : * Verify mbstr to make sure that it is validly encoded in the specified
1689 : * encoding.
1690 : */
1691 : bool
1692 1193168 : pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
1693 : {
1694 : int oklen;
1695 :
1696 : Assert(PG_VALID_ENCODING(encoding));
1697 :
1698 1193168 : oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1699 1193168 : if (oklen != len)
1700 : {
1701 16 : if (noError)
1702 0 : return false;
1703 16 : report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
1704 : }
1705 1193152 : return true;
1706 : }
1707 :
1708 : /*
1709 : * Verify mbstr to make sure that it is validly encoded in the specified
1710 : * encoding.
1711 : *
1712 : * mbstr is not necessarily zero terminated; length of mbstr is
1713 : * specified by len.
1714 : *
1715 : * If OK, return length of string in the encoding.
1716 : * If a problem is found, return -1 when noError is
1717 : * true; when noError is false, ereport() a descriptive message.
1718 : *
1719 : * Note: We cannot use the faster encoding-specific mbverifystr() function
1720 : * here, because we need to count the number of characters in the string.
1721 : */
1722 : int
1723 0 : pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1724 : {
1725 : mbchar_verifier mbverifychar;
1726 : int mb_len;
1727 :
1728 : Assert(PG_VALID_ENCODING(encoding));
1729 :
1730 : /*
1731 : * In single-byte encodings, we need only reject nulls (\0).
1732 : */
1733 0 : if (pg_encoding_max_length(encoding) <= 1)
1734 : {
1735 0 : const char *nullpos = memchr(mbstr, 0, len);
1736 :
1737 0 : if (nullpos == NULL)
1738 0 : return len;
1739 0 : if (noError)
1740 0 : return -1;
1741 0 : report_invalid_encoding(encoding, nullpos, 1);
1742 : }
1743 :
1744 : /* fetch function pointer just once */
1745 0 : mbverifychar = pg_wchar_table[encoding].mbverifychar;
1746 :
1747 0 : mb_len = 0;
1748 :
1749 0 : while (len > 0)
1750 : {
1751 : int l;
1752 :
1753 : /* fast path for ASCII-subset characters */
1754 0 : if (!IS_HIGHBIT_SET(*mbstr))
1755 : {
1756 0 : if (*mbstr != '\0')
1757 : {
1758 0 : mb_len++;
1759 0 : mbstr++;
1760 0 : len--;
1761 0 : continue;
1762 : }
1763 0 : if (noError)
1764 0 : return -1;
1765 0 : report_invalid_encoding(encoding, mbstr, len);
1766 : }
1767 :
1768 0 : l = (*mbverifychar) ((const unsigned char *) mbstr, len);
1769 :
1770 0 : if (l < 0)
1771 : {
1772 0 : if (noError)
1773 0 : return -1;
1774 0 : report_invalid_encoding(encoding, mbstr, len);
1775 : }
1776 :
1777 0 : mbstr += l;
1778 0 : len -= l;
1779 0 : mb_len++;
1780 : }
1781 0 : return mb_len;
1782 : }
1783 :
1784 : /*
1785 : * check_encoding_conversion_args: check arguments of a conversion function
1786 : *
1787 : * "expected" arguments can be either an encoding ID or -1 to indicate that
1788 : * the caller will check whether it accepts the ID.
1789 : *
1790 : * Note: the errors here are not really user-facing, so elog instead of
1791 : * ereport seems sufficient. Also, we trust that the "expected" encoding
1792 : * arguments are valid encoding IDs, but we don't trust the actuals.
1793 : */
1794 : void
1795 7130 : check_encoding_conversion_args(int src_encoding,
1796 : int dest_encoding,
1797 : int len,
1798 : int expected_src_encoding,
1799 : int expected_dest_encoding)
1800 : {
1801 7130 : if (!PG_VALID_ENCODING(src_encoding))
1802 0 : elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1803 7130 : if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1804 0 : elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1805 : pg_enc2name_tbl[expected_src_encoding].name,
1806 : pg_enc2name_tbl[src_encoding].name);
1807 7130 : if (!PG_VALID_ENCODING(dest_encoding))
1808 0 : elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1809 7130 : if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1810 0 : elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1811 : pg_enc2name_tbl[expected_dest_encoding].name,
1812 : pg_enc2name_tbl[dest_encoding].name);
1813 7130 : if (len < 0)
1814 0 : elog(ERROR, "encoding conversion length must not be negative");
1815 7130 : }
1816 :
/*
 * report_invalid_encoding: raise an error about an invalid multibyte
 * character
 *
 * note: "len" is the number of bytes remaining in the string, not the
 * length of the offending character, and it must be greater than zero.  The
 * character length is obtained from pg_encoding_mblen_or_incomplete().
 */
void
report_invalid_encoding(int encoding, const char *mbstr, int len)
{
	int			char_len = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);

	report_invalid_encoding_int(encoding, mbstr, char_len, len);
}
1830 :
1831 : static void
1832 3034 : report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
1833 : {
1834 : char buf[8 * 5 + 1];
1835 3034 : char *p = buf;
1836 : int j,
1837 : jlimit;
1838 :
1839 3034 : jlimit = Min(mblen, len);
1840 3034 : jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1841 :
1842 9320 : for (j = 0; j < jlimit; j++)
1843 : {
1844 6286 : p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1845 6286 : if (j < jlimit - 1)
1846 3252 : p += sprintf(p, " ");
1847 : }
1848 :
1849 3034 : ereport(ERROR,
1850 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1851 : errmsg("invalid byte sequence for encoding \"%s\": %s",
1852 : pg_enc2name_tbl[encoding].name,
1853 : buf)));
1854 : }
1855 :
/*
 * Report an invalid byte sequence in the current database encoding.
 * "mblen" and "len" are passed through to report_invalid_encoding_int().
 */
static void
report_invalid_encoding_db(const char *mbstr, int mblen, int len)
{
	int			db_encoding = GetDatabaseEncoding();

	report_invalid_encoding_int(db_encoding, mbstr, mblen, len);
}
1861 :
1862 : /*
1863 : * report_untranslatable_char: complain about untranslatable character
1864 : *
1865 : * note: len is remaining length of string, not length of character;
1866 : * len must be greater than zero (or we'd neglect initializing "buf").
1867 : */
1868 : void
1869 936 : report_untranslatable_char(int src_encoding, int dest_encoding,
1870 : const char *mbstr, int len)
1871 : {
1872 : int l;
1873 : char buf[8 * 5 + 1];
1874 936 : char *p = buf;
1875 : int j,
1876 : jlimit;
1877 :
1878 : /*
1879 : * We probably could use plain pg_encoding_mblen(), because
1880 : * gb18030_to_utf8() verifies before it converts. All conversions should.
1881 : * For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs. Even
1882 : * so, be defensive, since a buggy conversion might pass invalid data.
1883 : * This is not a performance-critical path.
1884 : */
1885 936 : l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len);
1886 936 : jlimit = Min(l, len);
1887 936 : jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1888 :
1889 3528 : for (j = 0; j < jlimit; j++)
1890 : {
1891 2592 : p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1892 2592 : if (j < jlimit - 1)
1893 1656 : p += sprintf(p, " ");
1894 : }
1895 :
1896 936 : ereport(ERROR,
1897 : (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
1898 : errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1899 : buf,
1900 : pg_enc2name_tbl[src_encoding].name,
1901 : pg_enc2name_tbl[dest_encoding].name)));
1902 : }
1903 :
1904 :
1905 : #ifdef WIN32
1906 : /*
1907 : * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1908 : * string. The character length is also passed to utf16len if not
1909 : * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1910 : * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1911 : */
1912 : WCHAR *
1913 : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1914 : {
1915 : int msgenc = GetMessageEncoding();
1916 : WCHAR *utf16;
1917 : int dstlen;
1918 : UINT codepage;
1919 :
1920 : if (msgenc == PG_SQL_ASCII)
1921 : /* No conversion is possible, and SQL_ASCII is never utf16. */
1922 : return NULL;
1923 :
1924 : codepage = pg_enc2name_tbl[msgenc].codepage;
1925 :
1926 : /*
1927 : * Use MultiByteToWideChar directly if there is a corresponding codepage,
1928 : * or double conversion through UTF8 if not. Double conversion is needed,
1929 : * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1930 : */
1931 : if (codepage != 0)
1932 : {
1933 : utf16 = palloc_array(WCHAR, len + 1);
1934 : dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1935 : utf16[dstlen] = (WCHAR) 0;
1936 : }
1937 : else
1938 : {
1939 : char *utf8;
1940 :
1941 : /*
1942 : * XXX pg_do_encoding_conversion() requires a transaction. In the
1943 : * absence of one, hope for the input to be valid UTF8.
1944 : */
1945 : if (IsTransactionState())
1946 : {
1947 : utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1948 : len,
1949 : msgenc,
1950 : PG_UTF8);
1951 : if (utf8 != str)
1952 : len = strlen(utf8);
1953 : }
1954 : else
1955 : utf8 = (char *) str;
1956 :
1957 : utf16 = palloc_array(WCHAR, len + 1);
1958 : dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1959 : utf16[dstlen] = (WCHAR) 0;
1960 :
1961 : if (utf8 != str)
1962 : pfree(utf8);
1963 : }
1964 :
1965 : if (dstlen == 0 && len > 0)
1966 : {
1967 : pfree(utf16);
1968 : return NULL; /* error */
1969 : }
1970 :
1971 : if (utf16len)
1972 : *utf16len = dstlen;
1973 : return utf16;
1974 : }
1975 :
1976 : #endif /* WIN32 */
|