Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * encnames.c
4 : * Encoding names and routines for working with them.
5 : *
6 : * Portions Copyright (c) 2001-2023, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/common/encnames.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #include <ctype.h>
16 : #include <unistd.h>
17 :
18 : #include "mb/pg_wchar.h"
19 :
20 :
21 : /* ----------
22 : * All encoding names, sorted: *** A L P H A B E T I C ***
23 : *
24 : * All names must be without irrelevant chars, search routines use
25 : * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
26 : * are always converted to 'iso88591'. All must be lower case.
27 : *
28 : * The table doesn't contain 'cs' aliases (like csISOLatin1). It's needed?
29 : *
30 : * Karel Zak, Aug 2001
31 : * ----------
32 : */
33 : typedef struct pg_encname
34 : {
35 : const char *name;
36 : pg_enc encoding;
37 : } pg_encname;
38 :
39 : static const pg_encname pg_encname_tbl[] =
40 : {
41 : {
42 : "abc", PG_WIN1258
43 : }, /* alias for WIN1258 */
44 : {
45 : "alt", PG_WIN866
46 : }, /* IBM866 */
47 : {
48 : "big5", PG_BIG5
49 : }, /* Big5; Chinese for Taiwan multibyte set */
50 : {
51 : "euccn", PG_EUC_CN
52 : }, /* EUC-CN; Extended Unix Code for simplified
53 : * Chinese */
54 : {
55 : "eucjis2004", PG_EUC_JIS_2004
56 : }, /* EUC-JIS-2004; Extended UNIX Code fixed
57 : * Width for Japanese, standard JIS X 0213 */
58 : {
59 : "eucjp", PG_EUC_JP
60 : }, /* EUC-JP; Extended UNIX Code fixed Width for
61 : * Japanese, standard OSF */
62 : {
63 : "euckr", PG_EUC_KR
64 : }, /* EUC-KR; Extended Unix Code for Korean , KS
65 : * X 1001 standard */
66 : {
67 : "euctw", PG_EUC_TW
68 : }, /* EUC-TW; Extended Unix Code for
69 : *
70 : * traditional Chinese */
71 : {
72 : "gb18030", PG_GB18030
73 : }, /* GB18030;GB18030 */
74 : {
75 : "gbk", PG_GBK
76 : }, /* GBK; Chinese Windows CodePage 936
77 : * simplified Chinese */
78 : {
79 : "iso88591", PG_LATIN1
80 : }, /* ISO-8859-1; RFC1345,KXS2 */
81 : {
82 : "iso885910", PG_LATIN6
83 : }, /* ISO-8859-10; RFC1345,KXS2 */
84 : {
85 : "iso885913", PG_LATIN7
86 : }, /* ISO-8859-13; RFC1345,KXS2 */
87 : {
88 : "iso885914", PG_LATIN8
89 : }, /* ISO-8859-14; RFC1345,KXS2 */
90 : {
91 : "iso885915", PG_LATIN9
92 : }, /* ISO-8859-15; RFC1345,KXS2 */
93 : {
94 : "iso885916", PG_LATIN10
95 : }, /* ISO-8859-16; RFC1345,KXS2 */
96 : {
97 : "iso88592", PG_LATIN2
98 : }, /* ISO-8859-2; RFC1345,KXS2 */
99 : {
100 : "iso88593", PG_LATIN3
101 : }, /* ISO-8859-3; RFC1345,KXS2 */
102 : {
103 : "iso88594", PG_LATIN4
104 : }, /* ISO-8859-4; RFC1345,KXS2 */
105 : {
106 : "iso88595", PG_ISO_8859_5
107 : }, /* ISO-8859-5; RFC1345,KXS2 */
108 : {
109 : "iso88596", PG_ISO_8859_6
110 : }, /* ISO-8859-6; RFC1345,KXS2 */
111 : {
112 : "iso88597", PG_ISO_8859_7
113 : }, /* ISO-8859-7; RFC1345,KXS2 */
114 : {
115 : "iso88598", PG_ISO_8859_8
116 : }, /* ISO-8859-8; RFC1345,KXS2 */
117 : {
118 : "iso88599", PG_LATIN5
119 : }, /* ISO-8859-9; RFC1345,KXS2 */
120 : {
121 : "johab", PG_JOHAB
122 : }, /* JOHAB; Extended Unix Code for simplified
123 : * Chinese */
124 : {
125 : "koi8", PG_KOI8R
126 : }, /* _dirty_ alias for KOI8-R (backward
127 : * compatibility) */
128 : {
129 : "koi8r", PG_KOI8R
130 : }, /* KOI8-R; RFC1489 */
131 : {
132 : "koi8u", PG_KOI8U
133 : }, /* KOI8-U; RFC2319 */
134 : {
135 : "latin1", PG_LATIN1
136 : }, /* alias for ISO-8859-1 */
137 : {
138 : "latin10", PG_LATIN10
139 : }, /* alias for ISO-8859-16 */
140 : {
141 : "latin2", PG_LATIN2
142 : }, /* alias for ISO-8859-2 */
143 : {
144 : "latin3", PG_LATIN3
145 : }, /* alias for ISO-8859-3 */
146 : {
147 : "latin4", PG_LATIN4
148 : }, /* alias for ISO-8859-4 */
149 : {
150 : "latin5", PG_LATIN5
151 : }, /* alias for ISO-8859-9 */
152 : {
153 : "latin6", PG_LATIN6
154 : }, /* alias for ISO-8859-10 */
155 : {
156 : "latin7", PG_LATIN7
157 : }, /* alias for ISO-8859-13 */
158 : {
159 : "latin8", PG_LATIN8
160 : }, /* alias for ISO-8859-14 */
161 : {
162 : "latin9", PG_LATIN9
163 : }, /* alias for ISO-8859-15 */
164 : {
165 : "mskanji", PG_SJIS
166 : }, /* alias for Shift_JIS */
167 : {
168 : "muleinternal", PG_MULE_INTERNAL
169 : },
170 : {
171 : "shiftjis", PG_SJIS
172 : }, /* Shift_JIS; JIS X 0202-1991 */
173 :
174 : {
175 : "shiftjis2004", PG_SHIFT_JIS_2004
176 : }, /* SHIFT-JIS-2004; Shift JIS for Japanese,
177 : * standard JIS X 0213 */
178 : {
179 : "sjis", PG_SJIS
180 : }, /* alias for Shift_JIS */
181 : {
182 : "sqlascii", PG_SQL_ASCII
183 : },
184 : {
185 : "tcvn", PG_WIN1258
186 : }, /* alias for WIN1258 */
187 : {
188 : "tcvn5712", PG_WIN1258
189 : }, /* alias for WIN1258 */
190 : {
191 : "uhc", PG_UHC
192 : }, /* UHC; Korean Windows CodePage 949 */
193 : {
194 : "unicode", PG_UTF8
195 : }, /* alias for UTF8 */
196 : {
197 : "utf8", PG_UTF8
198 : }, /* alias for UTF8 */
199 : {
200 : "vscii", PG_WIN1258
201 : }, /* alias for WIN1258 */
202 : {
203 : "win", PG_WIN1251
204 : }, /* _dirty_ alias for windows-1251 (backward
205 : * compatibility) */
206 : {
207 : "win1250", PG_WIN1250
208 : }, /* alias for Windows-1250 */
209 : {
210 : "win1251", PG_WIN1251
211 : }, /* alias for Windows-1251 */
212 : {
213 : "win1252", PG_WIN1252
214 : }, /* alias for Windows-1252 */
215 : {
216 : "win1253", PG_WIN1253
217 : }, /* alias for Windows-1253 */
218 : {
219 : "win1254", PG_WIN1254
220 : }, /* alias for Windows-1254 */
221 : {
222 : "win1255", PG_WIN1255
223 : }, /* alias for Windows-1255 */
224 : {
225 : "win1256", PG_WIN1256
226 : }, /* alias for Windows-1256 */
227 : {
228 : "win1257", PG_WIN1257
229 : }, /* alias for Windows-1257 */
230 : {
231 : "win1258", PG_WIN1258
232 : }, /* alias for Windows-1258 */
233 : {
234 : "win866", PG_WIN866
235 : }, /* IBM866 */
236 : {
237 : "win874", PG_WIN874
238 : }, /* alias for Windows-874 */
239 : {
240 : "win932", PG_SJIS
241 : }, /* alias for Shift_JIS */
242 : {
243 : "win936", PG_GBK
244 : }, /* alias for GBK */
245 : {
246 : "win949", PG_UHC
247 : }, /* alias for UHC */
248 : {
249 : "win950", PG_BIG5
250 : }, /* alias for BIG5 */
251 : {
252 : "windows1250", PG_WIN1250
253 : }, /* Windows-1251; Microsoft */
254 : {
255 : "windows1251", PG_WIN1251
256 : }, /* Windows-1251; Microsoft */
257 : {
258 : "windows1252", PG_WIN1252
259 : }, /* Windows-1252; Microsoft */
260 : {
261 : "windows1253", PG_WIN1253
262 : }, /* Windows-1253; Microsoft */
263 : {
264 : "windows1254", PG_WIN1254
265 : }, /* Windows-1254; Microsoft */
266 : {
267 : "windows1255", PG_WIN1255
268 : }, /* Windows-1255; Microsoft */
269 : {
270 : "windows1256", PG_WIN1256
271 : }, /* Windows-1256; Microsoft */
272 : {
273 : "windows1257", PG_WIN1257
274 : }, /* Windows-1257; Microsoft */
275 : {
276 : "windows1258", PG_WIN1258
277 : }, /* Windows-1258; Microsoft */
278 : {
279 : "windows866", PG_WIN866
280 : }, /* IBM866 */
281 : {
282 : "windows874", PG_WIN874
283 : }, /* Windows-874; Microsoft */
284 : {
285 : "windows932", PG_SJIS
286 : }, /* alias for Shift_JIS */
287 : {
288 : "windows936", PG_GBK
289 : }, /* alias for GBK */
290 : {
291 : "windows949", PG_UHC
292 : }, /* alias for UHC */
293 : {
294 : "windows950", PG_BIG5
295 : } /* alias for BIG5 */
296 : };
297 :
298 : /* ----------
299 : * These are "official" encoding names.
300 : * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
301 : * ----------
302 : */
303 : #ifndef WIN32
304 : #define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
305 : #else
306 : #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
307 : #endif
308 :
309 : const pg_enc2name pg_enc2name_tbl[] =
310 : {
311 : DEF_ENC2NAME(SQL_ASCII, 0),
312 : DEF_ENC2NAME(EUC_JP, 20932),
313 : DEF_ENC2NAME(EUC_CN, 20936),
314 : DEF_ENC2NAME(EUC_KR, 51949),
315 : DEF_ENC2NAME(EUC_TW, 0),
316 : DEF_ENC2NAME(EUC_JIS_2004, 20932),
317 : DEF_ENC2NAME(UTF8, 65001),
318 : DEF_ENC2NAME(MULE_INTERNAL, 0),
319 : DEF_ENC2NAME(LATIN1, 28591),
320 : DEF_ENC2NAME(LATIN2, 28592),
321 : DEF_ENC2NAME(LATIN3, 28593),
322 : DEF_ENC2NAME(LATIN4, 28594),
323 : DEF_ENC2NAME(LATIN5, 28599),
324 : DEF_ENC2NAME(LATIN6, 0),
325 : DEF_ENC2NAME(LATIN7, 0),
326 : DEF_ENC2NAME(LATIN8, 0),
327 : DEF_ENC2NAME(LATIN9, 28605),
328 : DEF_ENC2NAME(LATIN10, 0),
329 : DEF_ENC2NAME(WIN1256, 1256),
330 : DEF_ENC2NAME(WIN1258, 1258),
331 : DEF_ENC2NAME(WIN866, 866),
332 : DEF_ENC2NAME(WIN874, 874),
333 : DEF_ENC2NAME(KOI8R, 20866),
334 : DEF_ENC2NAME(WIN1251, 1251),
335 : DEF_ENC2NAME(WIN1252, 1252),
336 : DEF_ENC2NAME(ISO_8859_5, 28595),
337 : DEF_ENC2NAME(ISO_8859_6, 28596),
338 : DEF_ENC2NAME(ISO_8859_7, 28597),
339 : DEF_ENC2NAME(ISO_8859_8, 28598),
340 : DEF_ENC2NAME(WIN1250, 1250),
341 : DEF_ENC2NAME(WIN1253, 1253),
342 : DEF_ENC2NAME(WIN1254, 1254),
343 : DEF_ENC2NAME(WIN1255, 1255),
344 : DEF_ENC2NAME(WIN1257, 1257),
345 : DEF_ENC2NAME(KOI8U, 21866),
346 : DEF_ENC2NAME(SJIS, 932),
347 : DEF_ENC2NAME(BIG5, 950),
348 : DEF_ENC2NAME(GBK, 936),
349 : DEF_ENC2NAME(UHC, 949),
350 : DEF_ENC2NAME(GB18030, 54936),
351 : DEF_ENC2NAME(JOHAB, 0),
352 : DEF_ENC2NAME(SHIFT_JIS_2004, 932)
353 : };
354 :
355 : /* ----------
356 : * These are encoding names for gettext.
357 : *
358 : * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
359 : * ----------
360 : */
361 : const pg_enc2gettext pg_enc2gettext_tbl[] =
362 : {
363 : {PG_SQL_ASCII, "US-ASCII"},
364 : {PG_UTF8, "UTF-8"},
365 : {PG_LATIN1, "LATIN1"},
366 : {PG_LATIN2, "LATIN2"},
367 : {PG_LATIN3, "LATIN3"},
368 : {PG_LATIN4, "LATIN4"},
369 : {PG_ISO_8859_5, "ISO-8859-5"},
370 : {PG_ISO_8859_6, "ISO_8859-6"},
371 : {PG_ISO_8859_7, "ISO-8859-7"},
372 : {PG_ISO_8859_8, "ISO-8859-8"},
373 : {PG_LATIN5, "LATIN5"},
374 : {PG_LATIN6, "LATIN6"},
375 : {PG_LATIN7, "LATIN7"},
376 : {PG_LATIN8, "LATIN8"},
377 : {PG_LATIN9, "LATIN-9"},
378 : {PG_LATIN10, "LATIN10"},
379 : {PG_KOI8R, "KOI8-R"},
380 : {PG_KOI8U, "KOI8-U"},
381 : {PG_WIN1250, "CP1250"},
382 : {PG_WIN1251, "CP1251"},
383 : {PG_WIN1252, "CP1252"},
384 : {PG_WIN1253, "CP1253"},
385 : {PG_WIN1254, "CP1254"},
386 : {PG_WIN1255, "CP1255"},
387 : {PG_WIN1256, "CP1256"},
388 : {PG_WIN1257, "CP1257"},
389 : {PG_WIN1258, "CP1258"},
390 : {PG_WIN866, "CP866"},
391 : {PG_WIN874, "CP874"},
392 : {PG_EUC_CN, "EUC-CN"},
393 : {PG_EUC_JP, "EUC-JP"},
394 : {PG_EUC_KR, "EUC-KR"},
395 : {PG_EUC_TW, "EUC-TW"},
396 : {PG_EUC_JIS_2004, "EUC-JP"},
397 : {PG_SJIS, "SHIFT-JIS"},
398 : {PG_BIG5, "BIG5"},
399 : {PG_GBK, "GBK"},
400 : {PG_UHC, "UHC"},
401 : {PG_GB18030, "GB18030"},
402 : {PG_JOHAB, "JOHAB"},
403 : {PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
404 : {0, NULL}
405 : };
406 :
407 :
408 : /*
409 : * Table of encoding names for ICU (currently covers backend encodings only)
410 : *
411 : * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
412 : *
413 : * NULL entries are not supported by ICU, or their mapping is unclear.
414 : */
415 : static const char *const pg_enc2icu_tbl[] =
416 : {
417 : NULL, /* PG_SQL_ASCII */
418 : "EUC-JP", /* PG_EUC_JP */
419 : "EUC-CN", /* PG_EUC_CN */
420 : "EUC-KR", /* PG_EUC_KR */
421 : "EUC-TW", /* PG_EUC_TW */
422 : NULL, /* PG_EUC_JIS_2004 */
423 : "UTF-8", /* PG_UTF8 */
424 : NULL, /* PG_MULE_INTERNAL */
425 : "ISO-8859-1", /* PG_LATIN1 */
426 : "ISO-8859-2", /* PG_LATIN2 */
427 : "ISO-8859-3", /* PG_LATIN3 */
428 : "ISO-8859-4", /* PG_LATIN4 */
429 : "ISO-8859-9", /* PG_LATIN5 */
430 : "ISO-8859-10", /* PG_LATIN6 */
431 : "ISO-8859-13", /* PG_LATIN7 */
432 : "ISO-8859-14", /* PG_LATIN8 */
433 : "ISO-8859-15", /* PG_LATIN9 */
434 : NULL, /* PG_LATIN10 */
435 : "CP1256", /* PG_WIN1256 */
436 : "CP1258", /* PG_WIN1258 */
437 : "CP866", /* PG_WIN866 */
438 : NULL, /* PG_WIN874 */
439 : "KOI8-R", /* PG_KOI8R */
440 : "CP1251", /* PG_WIN1251 */
441 : "CP1252", /* PG_WIN1252 */
442 : "ISO-8859-5", /* PG_ISO_8859_5 */
443 : "ISO-8859-6", /* PG_ISO_8859_6 */
444 : "ISO-8859-7", /* PG_ISO_8859_7 */
445 : "ISO-8859-8", /* PG_ISO_8859_8 */
446 : "CP1250", /* PG_WIN1250 */
447 : "CP1253", /* PG_WIN1253 */
448 : "CP1254", /* PG_WIN1254 */
449 : "CP1255", /* PG_WIN1255 */
450 : "CP1257", /* PG_WIN1257 */
451 : "KOI8-U", /* PG_KOI8U */
452 : };
453 :
454 : StaticAssertDecl(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
455 : "pg_enc2icu_tbl incomplete");
456 :
457 :
458 : /*
459 : * Is this encoding supported by ICU?
460 : */
461 : bool
462 426 : is_encoding_supported_by_icu(int encoding)
463 : {
464 426 : if (!PG_VALID_BE_ENCODING(encoding))
465 0 : return false;
466 426 : return (pg_enc2icu_tbl[encoding] != NULL);
467 : }
468 :
469 : /*
470 : * Returns ICU's name for encoding, or NULL if not supported
471 : */
472 : const char *
473 28 : get_encoding_name_for_icu(int encoding)
474 : {
475 28 : if (!PG_VALID_BE_ENCODING(encoding))
476 0 : return NULL;
477 28 : return pg_enc2icu_tbl[encoding];
478 : }
479 :
480 :
481 : /* ----------
482 : * Encoding checks, for error returns -1 else encoding id
483 : * ----------
484 : */
/*
 * Resolve an encoding name and check that it passes PG_VALID_FE_ENCODING.
 *
 * Returns the encoding ID, or -1 if the name is unknown or the encoding
 * fails that check.
 */
int
pg_valid_client_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc >= 0 && PG_VALID_FE_ENCODING(enc))
		return enc;

	return -1;
}
498 :
/*
 * Resolve an encoding name and check that it passes PG_VALID_BE_ENCODING.
 *
 * Returns the encoding ID, or -1 if the name is unknown or the encoding
 * fails that check.
 */
int
pg_valid_server_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc >= 0 && PG_VALID_BE_ENCODING(enc))
		return enc;

	return -1;
}
512 :
/*
 * Check an encoding ID (rather than a name) with PG_VALID_BE_ENCODING.
 *
 * Returns the value of that test: nonzero if it passes, zero otherwise.
 */
int
pg_valid_server_encoding_id(int encoding)
{
	int			is_valid = PG_VALID_BE_ENCODING(encoding);

	return is_valid;
}
518 :
/*
 * Normalize an encoding name into *newkey and return newkey.
 *
 * Only characters accepted by isalnum() are kept; ASCII upper-case letters
 * are folded to lower case by plain arithmetic (not tolower()), and all
 * other kept characters are copied unchanged.  The result is
 * NUL-terminated.
 *
 * The caller must supply a buffer of at least strlen(key) + 1 bytes.
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	char	   *out = newkey;

	while (*key != '\0')
	{
		char		ch = *key++;

		/* punctuation, whitespace etc. are dropped from the name */
		if (!isalnum((unsigned char) ch))
			continue;

		if (ch >= 'A' && ch <= 'Z')
			ch = ch + 'a' - 'A';

		*out++ = ch;
	}
	*out = '\0';

	return newkey;
}
543 :
544 : /*
545 : * Search encoding by encoding name
546 : *
547 : * Returns encoding ID, or -1 if not recognized
548 : */
549 : int
550 82458 : pg_char_to_encoding(const char *name)
551 : {
552 82458 : unsigned int nel = lengthof(pg_encname_tbl);
553 82458 : const pg_encname *base = pg_encname_tbl,
554 82458 : *last = base + nel - 1,
555 : *position;
556 : int result;
557 : char buff[NAMEDATALEN],
558 : *key;
559 :
560 82458 : if (name == NULL || *name == '\0')
561 0 : return -1;
562 :
563 82458 : if (strlen(name) >= NAMEDATALEN)
564 0 : return -1; /* it's certainly not in the table */
565 :
566 82458 : key = clean_encoding_name(name, buff);
567 :
568 491216 : while (last >= base)
569 : {
570 491214 : position = base + ((last - base) >> 1);
571 491214 : result = key[0] - position->name[0];
572 :
573 491214 : if (result == 0)
574 : {
575 210792 : result = strcmp(key, position->name);
576 210792 : if (result == 0)
577 82456 : return position->encoding;
578 : }
579 408758 : if (result < 0)
580 214772 : last = position - 1;
581 : else
582 193986 : base = position + 1;
583 : }
584 2 : return -1;
585 : }
586 :
587 : const char *
588 49336 : pg_encoding_to_char(int encoding)
589 : {
590 49336 : if (PG_VALID_ENCODING(encoding))
591 : {
592 49336 : const pg_enc2name *p = &pg_enc2name_tbl[encoding];
593 :
594 : Assert(encoding == p->encoding);
595 49336 : return p->name;
596 : }
597 0 : return "";
598 : }
|