Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * encnames.c
4 : * Encoding names and routines for working with them.
5 : *
6 : * Portions Copyright (c) 2001-2026, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/common/encnames.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #include <ctype.h>
16 : #include <unistd.h>
17 :
18 : #include "mb/pg_wchar.h"
19 :
20 :
21 : /* ----------
22 : * All encoding names, sorted: *** A L P H A B E T I C ***
23 : *
24 : * All names must be without irrelevant chars, search routines use
25 : * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
26 : * are always converted to 'iso88591'. All must be lower case.
27 : *
28 : * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
29 : *
30 : * Karel Zak, Aug 2001
31 : * ----------
32 : */
33 : typedef struct pg_encname
34 : {
35 : const char *name;
36 : pg_enc encoding;
37 : } pg_encname;
38 :
39 : static const pg_encname pg_encname_tbl[] =
40 : {
41 : {
42 : "abc", PG_WIN1258
43 : }, /* alias for WIN1258 */
44 : {
45 : "alt", PG_WIN866
46 : }, /* IBM866 */
47 : {
48 : "big5", PG_BIG5
49 : }, /* Big5; Chinese for Taiwan multibyte set */
50 : {
51 : "euccn", PG_EUC_CN
52 : }, /* EUC-CN; Extended Unix Code for simplified
53 : * Chinese */
54 : {
55 : "eucjis2004", PG_EUC_JIS_2004
56 : }, /* EUC-JIS-2004; Extended UNIX Code fixed
57 : * Width for Japanese, standard JIS X 0213 */
58 : {
59 : "eucjp", PG_EUC_JP
60 : }, /* EUC-JP; Extended UNIX Code fixed Width for
61 : * Japanese, standard OSF */
62 : {
63 : "euckr", PG_EUC_KR
64 : }, /* EUC-KR; Extended Unix Code for Korean
65 : * precomposed (Wansung) encoding, standard KS
66 : * X 1001 */
67 : {
68 : "euctw", PG_EUC_TW
69 : }, /* EUC-TW; Extended Unix Code for
70 : *
71 : * traditional Chinese */
72 : {
73 : "gb18030", PG_GB18030
74 : }, /* GB18030;GB18030 */
75 : {
76 : "gbk", PG_GBK
77 : }, /* GBK; Chinese Windows CodePage 936
78 : * simplified Chinese */
79 : {
80 : "iso88591", PG_LATIN1
81 : }, /* ISO-8859-1; RFC1345,KXS2 */
82 : {
83 : "iso885910", PG_LATIN6
84 : }, /* ISO-8859-10; RFC1345,KXS2 */
85 : {
86 : "iso885913", PG_LATIN7
87 : }, /* ISO-8859-13; RFC1345,KXS2 */
88 : {
89 : "iso885914", PG_LATIN8
90 : }, /* ISO-8859-14; RFC1345,KXS2 */
91 : {
92 : "iso885915", PG_LATIN9
93 : }, /* ISO-8859-15; RFC1345,KXS2 */
94 : {
95 : "iso885916", PG_LATIN10
96 : }, /* ISO-8859-16; RFC1345,KXS2 */
97 : {
98 : "iso88592", PG_LATIN2
99 : }, /* ISO-8859-2; RFC1345,KXS2 */
100 : {
101 : "iso88593", PG_LATIN3
102 : }, /* ISO-8859-3; RFC1345,KXS2 */
103 : {
104 : "iso88594", PG_LATIN4
105 : }, /* ISO-8859-4; RFC1345,KXS2 */
106 : {
107 : "iso88595", PG_ISO_8859_5
108 : }, /* ISO-8859-5; RFC1345,KXS2 */
109 : {
110 : "iso88596", PG_ISO_8859_6
111 : }, /* ISO-8859-6; RFC1345,KXS2 */
112 : {
113 : "iso88597", PG_ISO_8859_7
114 : }, /* ISO-8859-7; RFC1345,KXS2 */
115 : {
116 : "iso88598", PG_ISO_8859_8
117 : }, /* ISO-8859-8; RFC1345,KXS2 */
118 : {
119 : "iso88599", PG_LATIN5
120 : }, /* ISO-8859-9; RFC1345,KXS2 */
121 : {
122 : "johab", PG_JOHAB
123 : }, /* JOHAB; Korean combining (Johab) encoding,
124 : * standard KS X 1001 annex 3 */
125 : {
126 : "koi8", PG_KOI8R
127 : }, /* _dirty_ alias for KOI8-R (backward
128 : * compatibility) */
129 : {
130 : "koi8r", PG_KOI8R
131 : }, /* KOI8-R; RFC1489 */
132 : {
133 : "koi8u", PG_KOI8U
134 : }, /* KOI8-U; RFC2319 */
135 : {
136 : "latin1", PG_LATIN1
137 : }, /* alias for ISO-8859-1 */
138 : {
139 : "latin10", PG_LATIN10
140 : }, /* alias for ISO-8859-16 */
141 : {
142 : "latin2", PG_LATIN2
143 : }, /* alias for ISO-8859-2 */
144 : {
145 : "latin3", PG_LATIN3
146 : }, /* alias for ISO-8859-3 */
147 : {
148 : "latin4", PG_LATIN4
149 : }, /* alias for ISO-8859-4 */
150 : {
151 : "latin5", PG_LATIN5
152 : }, /* alias for ISO-8859-9 */
153 : {
154 : "latin6", PG_LATIN6
155 : }, /* alias for ISO-8859-10 */
156 : {
157 : "latin7", PG_LATIN7
158 : }, /* alias for ISO-8859-13 */
159 : {
160 : "latin8", PG_LATIN8
161 : }, /* alias for ISO-8859-14 */
162 : {
163 : "latin9", PG_LATIN9
164 : }, /* alias for ISO-8859-15 */
165 : {
166 : "mskanji", PG_SJIS
167 : }, /* alias for Shift_JIS */
168 : {
169 : "shiftjis", PG_SJIS
170 : }, /* Shift_JIS; JIS X 0202-1991 */
171 :
172 : {
173 : "shiftjis2004", PG_SHIFT_JIS_2004
174 : }, /* SHIFT-JIS-2004; Shift JIS for Japanese,
175 : * standard JIS X 0213 */
176 : {
177 : "sjis", PG_SJIS
178 : }, /* alias for Shift_JIS */
179 : {
180 : "sqlascii", PG_SQL_ASCII
181 : },
182 : {
183 : "tcvn", PG_WIN1258
184 : }, /* alias for WIN1258 */
185 : {
186 : "tcvn5712", PG_WIN1258
187 : }, /* alias for WIN1258 */
188 : {
189 : "uhc", PG_UHC
190 : }, /* UHC; Unified Hangul Code, Microsoft Windows
191 : * CodePage 949; superset of EUC-KR covering
192 : * all 11,172 precomposed Hangul syllables */
193 : {
194 : "unicode", PG_UTF8
195 : }, /* alias for UTF8 */
196 : {
197 : "utf8", PG_UTF8
198 : }, /* alias for UTF8 */
199 : {
200 : "vscii", PG_WIN1258
201 : }, /* alias for WIN1258 */
202 : {
203 : "win", PG_WIN1251
204 : }, /* _dirty_ alias for windows-1251 (backward
205 : * compatibility) */
206 : {
207 : "win1250", PG_WIN1250
208 : }, /* alias for Windows-1250 */
209 : {
210 : "win1251", PG_WIN1251
211 : }, /* alias for Windows-1251 */
212 : {
213 : "win1252", PG_WIN1252
214 : }, /* alias for Windows-1252 */
215 : {
216 : "win1253", PG_WIN1253
217 : }, /* alias for Windows-1253 */
218 : {
219 : "win1254", PG_WIN1254
220 : }, /* alias for Windows-1254 */
221 : {
222 : "win1255", PG_WIN1255
223 : }, /* alias for Windows-1255 */
224 : {
225 : "win1256", PG_WIN1256
226 : }, /* alias for Windows-1256 */
227 : {
228 : "win1257", PG_WIN1257
229 : }, /* alias for Windows-1257 */
230 : {
231 : "win1258", PG_WIN1258
232 : }, /* alias for Windows-1258 */
233 : {
234 : "win866", PG_WIN866
235 : }, /* IBM866 */
236 : {
237 : "win874", PG_WIN874
238 : }, /* alias for Windows-874 */
239 : {
240 : "win932", PG_SJIS
241 : }, /* alias for Shift_JIS */
242 : {
243 : "win936", PG_GBK
244 : }, /* alias for GBK */
245 : {
246 : "win949", PG_UHC
247 : }, /* alias for UHC */
248 : {
249 : "win950", PG_BIG5
250 : }, /* alias for BIG5 */
251 : {
252 : "windows1250", PG_WIN1250
253 : }, /* Windows-1251; Microsoft */
254 : {
255 : "windows1251", PG_WIN1251
256 : }, /* Windows-1251; Microsoft */
257 : {
258 : "windows1252", PG_WIN1252
259 : }, /* Windows-1252; Microsoft */
260 : {
261 : "windows1253", PG_WIN1253
262 : }, /* Windows-1253; Microsoft */
263 : {
264 : "windows1254", PG_WIN1254
265 : }, /* Windows-1254; Microsoft */
266 : {
267 : "windows1255", PG_WIN1255
268 : }, /* Windows-1255; Microsoft */
269 : {
270 : "windows1256", PG_WIN1256
271 : }, /* Windows-1256; Microsoft */
272 : {
273 : "windows1257", PG_WIN1257
274 : }, /* Windows-1257; Microsoft */
275 : {
276 : "windows1258", PG_WIN1258
277 : }, /* Windows-1258; Microsoft */
278 : {
279 : "windows866", PG_WIN866
280 : }, /* IBM866 */
281 : {
282 : "windows874", PG_WIN874
283 : }, /* Windows-874; Microsoft */
284 : {
285 : "windows932", PG_SJIS
286 : }, /* alias for Shift_JIS */
287 : {
288 : "windows936", PG_GBK
289 : }, /* alias for GBK */
290 : {
291 : "windows949", PG_UHC
292 : }, /* alias for UHC */
293 : {
294 : "windows950", PG_BIG5
295 : } /* alias for BIG5 */
296 : };
297 :
298 : /* ----------
299 : * These are "official" encoding names.
300 : * ----------
301 : */
302 : #ifndef WIN32
303 : #define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
304 : #else
305 : #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
306 : #endif
307 :
308 : const pg_enc2name pg_enc2name_tbl[] =
309 : {
310 : [PG_SQL_ASCII] = DEF_ENC2NAME(SQL_ASCII, 0),
311 : [PG_EUC_JP] = DEF_ENC2NAME(EUC_JP, 20932),
312 : [PG_EUC_CN] = DEF_ENC2NAME(EUC_CN, 20936),
313 : [PG_EUC_KR] = DEF_ENC2NAME(EUC_KR, 51949),
314 : [PG_EUC_TW] = DEF_ENC2NAME(EUC_TW, 0),
315 : [PG_EUC_JIS_2004] = DEF_ENC2NAME(EUC_JIS_2004, 20932),
316 : [PG_UTF8] = DEF_ENC2NAME(UTF8, 65001),
317 : [PG_LATIN1] = DEF_ENC2NAME(LATIN1, 28591),
318 : [PG_LATIN2] = DEF_ENC2NAME(LATIN2, 28592),
319 : [PG_LATIN3] = DEF_ENC2NAME(LATIN3, 28593),
320 : [PG_LATIN4] = DEF_ENC2NAME(LATIN4, 28594),
321 : [PG_LATIN5] = DEF_ENC2NAME(LATIN5, 28599),
322 : [PG_LATIN6] = DEF_ENC2NAME(LATIN6, 0),
323 : [PG_LATIN7] = DEF_ENC2NAME(LATIN7, 0),
324 : [PG_LATIN8] = DEF_ENC2NAME(LATIN8, 0),
325 : [PG_LATIN9] = DEF_ENC2NAME(LATIN9, 28605),
326 : [PG_LATIN10] = DEF_ENC2NAME(LATIN10, 0),
327 : [PG_WIN1256] = DEF_ENC2NAME(WIN1256, 1256),
328 : [PG_WIN1258] = DEF_ENC2NAME(WIN1258, 1258),
329 : [PG_WIN866] = DEF_ENC2NAME(WIN866, 866),
330 : [PG_WIN874] = DEF_ENC2NAME(WIN874, 874),
331 : [PG_KOI8R] = DEF_ENC2NAME(KOI8R, 20866),
332 : [PG_WIN1251] = DEF_ENC2NAME(WIN1251, 1251),
333 : [PG_WIN1252] = DEF_ENC2NAME(WIN1252, 1252),
334 : [PG_ISO_8859_5] = DEF_ENC2NAME(ISO_8859_5, 28595),
335 : [PG_ISO_8859_6] = DEF_ENC2NAME(ISO_8859_6, 28596),
336 : [PG_ISO_8859_7] = DEF_ENC2NAME(ISO_8859_7, 28597),
337 : [PG_ISO_8859_8] = DEF_ENC2NAME(ISO_8859_8, 28598),
338 : [PG_WIN1250] = DEF_ENC2NAME(WIN1250, 1250),
339 : [PG_WIN1253] = DEF_ENC2NAME(WIN1253, 1253),
340 : [PG_WIN1254] = DEF_ENC2NAME(WIN1254, 1254),
341 : [PG_WIN1255] = DEF_ENC2NAME(WIN1255, 1255),
342 : [PG_WIN1257] = DEF_ENC2NAME(WIN1257, 1257),
343 : [PG_KOI8U] = DEF_ENC2NAME(KOI8U, 21866),
344 : [PG_SJIS] = DEF_ENC2NAME(SJIS, 932),
345 : [PG_BIG5] = DEF_ENC2NAME(BIG5, 950),
346 : [PG_GBK] = DEF_ENC2NAME(GBK, 936),
347 : [PG_UHC] = DEF_ENC2NAME(UHC, 949),
348 : [PG_GB18030] = DEF_ENC2NAME(GB18030, 54936),
349 : [PG_JOHAB] = DEF_ENC2NAME(JOHAB, 0),
350 : [PG_SHIFT_JIS_2004] = DEF_ENC2NAME(SHIFT_JIS_2004, 932),
351 : };
352 :
353 : /* ----------
354 : * These are encoding names for gettext.
355 : * ----------
356 : */
357 : const char *pg_enc2gettext_tbl[] =
358 : {
359 : [PG_SQL_ASCII] = "US-ASCII",
360 : [PG_UTF8] = "UTF-8",
361 : [PG_LATIN1] = "LATIN1",
362 : [PG_LATIN2] = "LATIN2",
363 : [PG_LATIN3] = "LATIN3",
364 : [PG_LATIN4] = "LATIN4",
365 : [PG_ISO_8859_5] = "ISO-8859-5",
366 : [PG_ISO_8859_6] = "ISO_8859-6",
367 : [PG_ISO_8859_7] = "ISO-8859-7",
368 : [PG_ISO_8859_8] = "ISO-8859-8",
369 : [PG_LATIN5] = "LATIN5",
370 : [PG_LATIN6] = "LATIN6",
371 : [PG_LATIN7] = "LATIN7",
372 : [PG_LATIN8] = "LATIN8",
373 : [PG_LATIN9] = "LATIN-9",
374 : [PG_LATIN10] = "LATIN10",
375 : [PG_KOI8R] = "KOI8-R",
376 : [PG_KOI8U] = "KOI8-U",
377 : [PG_WIN1250] = "CP1250",
378 : [PG_WIN1251] = "CP1251",
379 : [PG_WIN1252] = "CP1252",
380 : [PG_WIN1253] = "CP1253",
381 : [PG_WIN1254] = "CP1254",
382 : [PG_WIN1255] = "CP1255",
383 : [PG_WIN1256] = "CP1256",
384 : [PG_WIN1257] = "CP1257",
385 : [PG_WIN1258] = "CP1258",
386 : [PG_WIN866] = "CP866",
387 : [PG_WIN874] = "CP874",
388 : [PG_EUC_CN] = "EUC-CN",
389 : [PG_EUC_JP] = "EUC-JP",
390 : [PG_EUC_KR] = "EUC-KR",
391 : [PG_EUC_TW] = "EUC-TW",
392 : [PG_EUC_JIS_2004] = "EUC-JP",
393 : [PG_SJIS] = "SHIFT-JIS",
394 : [PG_BIG5] = "BIG5",
395 : [PG_GBK] = "GBK",
396 : [PG_UHC] = "UHC",
397 : [PG_GB18030] = "GB18030",
398 : [PG_JOHAB] = "JOHAB",
399 : [PG_SHIFT_JIS_2004] = "SHIFT_JISX0213",
400 : };
401 :
402 :
403 : /*
404 : * Table of encoding names for ICU (currently covers backend encodings only)
405 : *
406 : * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
407 : *
408 : * NULL entries are not supported by ICU, or their mapping is unclear.
409 : */
410 : static const char *const pg_enc2icu_tbl[] =
411 : {
412 : [PG_SQL_ASCII] = NULL,
413 : [PG_EUC_JP] = "EUC-JP",
414 : [PG_EUC_CN] = "EUC-CN",
415 : [PG_EUC_KR] = "EUC-KR",
416 : [PG_EUC_TW] = "EUC-TW",
417 : [PG_EUC_JIS_2004] = NULL,
418 : [PG_UTF8] = "UTF-8",
419 : [PG_LATIN1] = "ISO-8859-1",
420 : [PG_LATIN2] = "ISO-8859-2",
421 : [PG_LATIN3] = "ISO-8859-3",
422 : [PG_LATIN4] = "ISO-8859-4",
423 : [PG_LATIN5] = "ISO-8859-9",
424 : [PG_LATIN6] = "ISO-8859-10",
425 : [PG_LATIN7] = "ISO-8859-13",
426 : [PG_LATIN8] = "ISO-8859-14",
427 : [PG_LATIN9] = "ISO-8859-15",
428 : [PG_LATIN10] = NULL,
429 : [PG_WIN1256] = "CP1256",
430 : [PG_WIN1258] = "CP1258",
431 : [PG_WIN866] = "CP866",
432 : [PG_WIN874] = NULL,
433 : [PG_KOI8R] = "KOI8-R",
434 : [PG_WIN1251] = "CP1251",
435 : [PG_WIN1252] = "CP1252",
436 : [PG_ISO_8859_5] = "ISO-8859-5",
437 : [PG_ISO_8859_6] = "ISO-8859-6",
438 : [PG_ISO_8859_7] = "ISO-8859-7",
439 : [PG_ISO_8859_8] = "ISO-8859-8",
440 : [PG_WIN1250] = "CP1250",
441 : [PG_WIN1253] = "CP1253",
442 : [PG_WIN1254] = "CP1254",
443 : [PG_WIN1255] = "CP1255",
444 : [PG_WIN1257] = "CP1257",
445 : [PG_KOI8U] = "KOI8-U",
446 : };
447 :
448 : StaticAssertDecl(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
449 : "pg_enc2icu_tbl incomplete");
450 :
451 :
452 : /*
453 : * Is this encoding supported by ICU?
454 : */
455 : bool
456 1197 : is_encoding_supported_by_icu(int encoding)
457 : {
458 1197 : if (!PG_VALID_BE_ENCODING(encoding))
459 0 : return false;
460 1197 : return (pg_enc2icu_tbl[encoding] != NULL);
461 : }
462 :
463 : /*
464 : * Returns ICU's name for encoding, or NULL if not supported
465 : */
466 : const char *
467 4 : get_encoding_name_for_icu(int encoding)
468 : {
469 4 : if (!PG_VALID_BE_ENCODING(encoding))
470 0 : return NULL;
471 4 : return pg_enc2icu_tbl[encoding];
472 : }
473 :
474 :
/* ----------
 * Encoding checks, for error returns -1 else encoding id
 * ----------
 */

/*
 * Resolve an encoding name and check it with PG_VALID_FE_ENCODING().
 *
 * Returns the encoding ID, or -1 if the name is not recognized by
 * pg_char_to_encoding() or the resulting ID fails the check.
 */
int
pg_valid_client_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc >= 0 && PG_VALID_FE_ENCODING(enc))
		return enc;

	return -1;
}
492 :
/*
 * Resolve an encoding name and check it with PG_VALID_BE_ENCODING().
 *
 * Returns the encoding ID, or -1 if the name is not recognized by
 * pg_char_to_encoding() or the resulting ID fails the check.
 */
int
pg_valid_server_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc >= 0 && PG_VALID_BE_ENCODING(enc))
		return enc;

	return -1;
}
506 :
/*
 * Check an encoding ID (rather than a name) with PG_VALID_BE_ENCODING().
 *
 * Returns the macro's result unchanged: nonzero if the ID passes, else zero.
 */
int
pg_valid_server_encoding_id(int encoding)
{
	int			is_valid = PG_VALID_BE_ENCODING(encoding);

	return is_valid;
}
512 :
/*
 * Normalize an encoding name into *newkey and return newkey.
 *
 * Every character for which isalnum() is false is dropped, and ASCII upper
 * case letters 'A'..'Z' are folded to lower case by plain arithmetic (no
 * locale-dependent tolower()).  The result is NUL-terminated and is never
 * longer than the input.
 *
 * (Caller's responsibility to provide a large enough buffer; strlen(key) + 1
 * bytes always suffices.)
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	const char *src = key;
	char	   *dst = newkey;

	while (*src != '\0')
	{
		char		ch = *src++;

		/* skip punctuation, whitespace and anything else non-alphanumeric */
		if (!isalnum((unsigned char) ch))
			continue;

		if (ch >= 'A' && ch <= 'Z')
			ch += 'a' - 'A';

		*dst++ = ch;
	}

	*dst = '\0';
	return newkey;
}
537 :
538 : /*
539 : * Search encoding by encoding name
540 : *
541 : * Returns encoding ID, or -1 if not recognized
542 : */
543 : int
544 64321 : pg_char_to_encoding(const char *name)
545 : {
546 64321 : unsigned int nel = lengthof(pg_encname_tbl);
547 64321 : const pg_encname *base = pg_encname_tbl,
548 64321 : *last = base + nel - 1,
549 : *position;
550 : int result;
551 : char buff[NAMEDATALEN],
552 : *key;
553 :
554 64321 : if (name == NULL || *name == '\0')
555 0 : return -1;
556 :
557 64321 : if (strlen(name) >= NAMEDATALEN)
558 0 : return -1; /* it's certainly not in the table */
559 :
560 64321 : key = clean_encoding_name(name, buff);
561 :
562 383677 : while (last >= base)
563 : {
564 383676 : position = base + ((last - base) >> 1);
565 383676 : result = key[0] - position->name[0];
566 :
567 383676 : if (result == 0)
568 : {
569 135067 : result = strcmp(key, position->name);
570 135067 : if (result == 0)
571 64320 : return position->encoding;
572 : }
573 319356 : if (result < 0)
574 138428 : last = position - 1;
575 : else
576 180928 : base = position + 1;
577 : }
578 1 : return -1;
579 : }
580 :
581 : const char *
582 39277 : pg_encoding_to_char(int encoding)
583 : {
584 39277 : if (PG_VALID_ENCODING(encoding))
585 : {
586 39277 : const pg_enc2name *p = &pg_enc2name_tbl[encoding];
587 :
588 : Assert(encoding == p->encoding);
589 39277 : return p->name;
590 : }
591 0 : return "";
592 : }
|