Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * ISO8859_1 <--> UTF8 4 : * 5 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group 6 : * Portions Copyright (c) 1994, Regents of the University of California 7 : * 8 : * IDENTIFICATION 9 : * src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c 10 : * 11 : *------------------------------------------------------------------------- 12 : */ 13 : 14 : #include "postgres.h" 15 : #include "fmgr.h" 16 : #include "mb/pg_wchar.h" 17 : 18 240 : PG_MODULE_MAGIC; 19 : 20 42 : PG_FUNCTION_INFO_V1(iso8859_1_to_utf8); 21 218 : PG_FUNCTION_INFO_V1(utf8_to_iso8859_1); 22 : 23 : /* ---------- 24 : * conv_proc( 25 : * INTEGER, -- source encoding id 26 : * INTEGER, -- destination encoding id 27 : * CSTRING, -- source string (null terminated C string) 28 : * CSTRING, -- destination string (null terminated C string) 29 : * INTEGER, -- source string length 30 : * BOOL -- if true, don't throw an error if conversion fails 31 : * ) returns INTEGER; 32 : * 33 : * Returns the number of bytes successfully converted. 34 : * ---------- 35 : */ 36 : 37 : Datum 38 106 : iso8859_1_to_utf8(PG_FUNCTION_ARGS) 39 : { 40 106 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); 41 106 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); 42 106 : int len = PG_GETARG_INT32(4); 43 106 : bool noError = PG_GETARG_BOOL(5); 44 106 : unsigned char *start = src; 45 : unsigned short c; 46 : 47 106 : CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN1, PG_UTF8); 48 : 49 4492 : while (len > 0) 50 : { 51 4386 : c = *src; 52 4386 : if (c == 0) 53 : { 54 0 : if (noError) 55 0 : break; 56 0 : report_invalid_encoding(PG_LATIN1, (const char *) src, len); 57 : } 58 4386 : if (!IS_HIGHBIT_SET(c)) 59 4386 : *dest++ = c; 60 : else 61 : { 62 0 : *dest++ = (c >> 6) | 0xc0; 63 0 : *dest++ = (c & 0x003f) | HIGHBIT; 64 : } 65 4386 : src++; 66 4386 : len--; 67 : } 68 106 : *dest = '\0'; 69 : 70 106 : PG_RETURN_INT32(src - start); 71 : } 72 : 73 : Datum 74 750 : utf8_to_iso8859_1(PG_FUNCTION_ARGS) 75 : { 76 750 : unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); 77 750 : unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); 78 750 : int len = PG_GETARG_INT32(4); 79 750 : bool noError = PG_GETARG_BOOL(5); 80 750 : unsigned char *start = src; 81 : unsigned short c, 82 : c1; 83 : 84 750 : CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_LATIN1); 85 : 86 4340 : while (len > 0) 87 : { 88 3986 : c = *src; 89 3986 : if (c == 0) 90 : { 91 36 : if (noError) 92 18 : break; 93 18 : report_invalid_encoding(PG_UTF8, (const char *) src, len); 94 : } 95 : /* fast path for ASCII-subset characters */ 96 3950 : if (!IS_HIGHBIT_SET(c)) 97 : { 98 3552 : *dest++ = c; 99 3552 : src++; 100 3552 : len--; 101 : } 102 : else 103 : { 104 398 : int l = pg_utf_mblen(src); 105 : 106 398 : if (l > len || !pg_utf8_islegal(src, l)) 107 : { 108 108 : if (noError) 109 54 : break; 110 54 : report_invalid_encoding(PG_UTF8, (const char *) src, len); 111 : } 112 290 : if (l != 2) 113 : { 114 216 : if (noError) 115 108 : break; 116 108 : report_untranslatable_char(PG_UTF8, PG_LATIN1, 117 : (const char *) src, len); 118 : } 119 74 : c1 = src[1] & 0x3f; 120 74 : c = ((c & 0x1f) << 6) | c1; 121 74 : if (c >= 0x80 && c <= 0xff) 122 : { 123 38 : *dest++ = (unsigned char) c; 124 38 : src += 2; 125 38 : len -= 2; 126 : } 127 : else 128 : { 129 36 : if (noError) 130 18 : break; 131 18 : report_untranslatable_char(PG_UTF8, PG_LATIN1, 132 : (const char *) src, len); 133 : } 134 : } 135 : } 136 552 : *dest = '\0'; 137 : 138 552 : PG_RETURN_INT32(src - start); 139 : }