Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * ISO8859_1 <--> UTF8 4 : * 5 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group 6 : * Portions Copyright (c) 1994, Regents of the University of California 7 : * 8 : * IDENTIFICATION 9 : * src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c 10 : * 11 : *------------------------------------------------------------------------- 12 : */ 13 : 14 : #include "postgres.h" 15 : #include "fmgr.h" 16 : #include "mb/pg_wchar.h" 17 : 18 248 : PG_MODULE_MAGIC_EXT( 19 : .name = "utf8_and_iso8859_1", 20 : .version = PG_VERSION 21 : ); 22 : 23 48 : PG_FUNCTION_INFO_V1(iso8859_1_to_utf8); 24 226 : PG_FUNCTION_INFO_V1(utf8_to_iso8859_1); 25 : 26 : /* ---------- 27 : * conv_proc( 28 : * INTEGER, -- source encoding id 29 : * INTEGER, -- destination encoding id 30 : * CSTRING, -- source string (null terminated C string) 31 : * CSTRING, -- destination string (null terminated C string) 32 : * INTEGER, -- source string length 33 : * BOOL -- if true, don't throw an error if conversion fails 34 : * ) returns INTEGER; 35 : * 36 : * Returns the number of bytes successfully converted. 37 : * ---------- 38 : */ 39 : 40 : Datum 41 130 : iso8859_1_to_utf8(PG_FUNCTION_ARGS) 42 : { 43 130 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); 44 130 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); 45 130 : int len = PG_GETARG_INT32(4); 46 130 : bool noError = PG_GETARG_BOOL(5); 47 130 : unsigned char *start = src; 48 : unsigned short c; 49 : 50 130 : CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN1, PG_UTF8); 51 : 52 5474 : while (len > 0) 53 : { 54 5344 : c = *src; 55 5344 : if (c == 0) 56 : { 57 0 : if (noError) 58 0 : break; 59 0 : report_invalid_encoding(PG_LATIN1, (const char *) src, len); 60 : } 61 5344 : if (!IS_HIGHBIT_SET(c)) 62 5308 : *dest++ = c; 63 : else 64 : { 65 36 : *dest++ = (c >> 6) | 0xc0; 66 36 : *dest++ = (c & 0x003f) | HIGHBIT; 67 : } 68 5344 : src++; 69 5344 : len--; 70 : } 71 130 : *dest = '\0'; 72 : 73 130 : PG_RETURN_INT32(src - start); 74 : } 75 : 76 : Datum 77 762 : utf8_to_iso8859_1(PG_FUNCTION_ARGS) 78 : { 79 762 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); 80 762 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); 81 762 : int len = PG_GETARG_INT32(4); 82 762 : bool noError = PG_GETARG_BOOL(5); 83 762 : unsigned char *start = src; 84 : unsigned short c, 85 : c1; 86 : 87 762 : CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_LATIN1); 88 : 89 4478 : while (len > 0) 90 : { 91 4112 : c = *src; 92 4112 : if (c == 0) 93 : { 94 36 : if (noError) 95 18 : break; 96 18 : report_invalid_encoding(PG_UTF8, (const char *) src, len); 97 : } 98 : /* fast path for ASCII-subset characters */ 99 4076 : if (!IS_HIGHBIT_SET(c)) 100 : { 101 3678 : *dest++ = c; 102 3678 : src++; 103 3678 : len--; 104 : } 105 : else 106 : { 107 398 : int l = pg_utf_mblen(src); 108 : 109 398 : if (l > len || !pg_utf8_islegal(src, l)) 110 : { 111 108 : if (noError) 112 54 : break; 113 54 : report_invalid_encoding(PG_UTF8, (const char *) src, len); 114 : } 115 290 : if (l != 2) 116 : { 117 216 : if (noError) 118 108 : break; 119 108 : report_untranslatable_char(PG_UTF8, PG_LATIN1, 120 : (const char *) src, len); 121 : } 122 74 : c1 = src[1] & 0x3f; 123 74 : c = ((c & 0x1f) << 6) | c1; 124 74 : if (c >= 0x80 && c <= 0xff) 125 : { 126 38 : *dest++ = (unsigned char) c; 127 38 : src += 2; 128 38 : len -= 2; 129 : } 130 : else 131 : { 132 36 : if (noError) 133 18 : break; 134 18 : report_untranslatable_char(PG_UTF8, PG_LATIN1, 135 : (const char *) src, len); 136 : } 137 : } 138 : } 139 564 : *dest = '\0'; 140 : 141 564 : PG_RETURN_INT32(src - start); 142 : }