Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ts_locale.c
4 : * locale compatibility layer for tsearch
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/ts_locale.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "common/string.h"
17 : #include "storage/fd.h"
18 : #include "tsearch/ts_locale.h"
19 :
20 : static void tsearch_readline_callback(void *arg);
21 :
22 :
23 : /*
24 : * The reason these functions use a 3-wchar_t output buffer, not 2 as you
25 : * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
26 : * getting from char2wchar() is UTF16 not UTF32. A single input character
27 : * may therefore produce a surrogate pair rather than just one wchar_t;
28 : * we also need room for a trailing null. When we do get a surrogate pair,
29 : * we pass just the first code to iswdigit() etc, so that these functions will
30 : * always return false for characters outside the Basic Multilingual Plane.
31 : */
32 : #define WC_BUF_LEN 3
33 :
34 : int
35 20604 : t_isdigit(const char *ptr)
36 : {
37 20604 : int clen = pg_mblen(ptr);
38 : wchar_t character[WC_BUF_LEN];
39 20604 : pg_locale_t mylocale = 0; /* TODO */
40 :
41 20604 : if (clen == 1 || database_ctype_is_c)
42 20604 : return isdigit(TOUCHAR(ptr));
43 :
44 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
45 :
46 0 : return iswdigit((wint_t) character[0]);
47 : }
48 :
49 : int
50 925428 : t_isspace(const char *ptr)
51 : {
52 925428 : int clen = pg_mblen(ptr);
53 : wchar_t character[WC_BUF_LEN];
54 925428 : pg_locale_t mylocale = 0; /* TODO */
55 :
56 925428 : if (clen == 1 || database_ctype_is_c)
57 912724 : return isspace(TOUCHAR(ptr));
58 :
59 12704 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
60 :
61 12704 : return iswspace((wint_t) character[0]);
62 : }
63 :
64 : int
65 10284 : t_isalpha(const char *ptr)
66 : {
67 10284 : int clen = pg_mblen(ptr);
68 : wchar_t character[WC_BUF_LEN];
69 10284 : pg_locale_t mylocale = 0; /* TODO */
70 :
71 10284 : if (clen == 1 || database_ctype_is_c)
72 10284 : return isalpha(TOUCHAR(ptr));
73 :
74 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
75 :
76 0 : return iswalpha((wint_t) character[0]);
77 : }
78 :
79 : int
80 2767776 : t_isalnum(const char *ptr)
81 : {
82 2767776 : int clen = pg_mblen(ptr);
83 : wchar_t character[WC_BUF_LEN];
84 2767776 : pg_locale_t mylocale = 0; /* TODO */
85 :
86 2767776 : if (clen == 1 || database_ctype_is_c)
87 2767776 : return isalnum(TOUCHAR(ptr));
88 :
89 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
90 :
91 0 : return iswalnum((wint_t) character[0]);
92 : }
93 :
94 : int
95 4150 : t_isprint(const char *ptr)
96 : {
97 4150 : int clen = pg_mblen(ptr);
98 : wchar_t character[WC_BUF_LEN];
99 4150 : pg_locale_t mylocale = 0; /* TODO */
100 :
101 4150 : if (clen == 1 || database_ctype_is_c)
102 4150 : return isprint(TOUCHAR(ptr));
103 :
104 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
105 :
106 0 : return iswprint((wint_t) character[0]);
107 : }
108 :
109 :
110 : /*
111 : * Set up to read a file using tsearch_readline(). This facility is
112 : * better than just reading the file directly because it provides error
113 : * context pointing to the specific line where a problem is detected.
114 : *
115 : * Expected usage is:
116 : *
117 : * tsearch_readline_state trst;
118 : *
119 : * if (!tsearch_readline_begin(&trst, filename))
120 : * ereport(ERROR,
121 : * (errcode(ERRCODE_CONFIG_FILE_ERROR),
122 : * errmsg("could not open stop-word file \"%s\": %m",
123 : * filename)));
124 : * while ((line = tsearch_readline(&trst)) != NULL)
125 : * process line;
126 : * tsearch_readline_end(&trst);
127 : *
128 : * Note that the caller supplies the ereport() for file open failure;
129 : * this is so that a custom message can be provided. The filename string
130 : * passed to tsearch_readline_begin() must remain valid through
131 : * tsearch_readline_end().
132 : */
133 : bool
134 556 : tsearch_readline_begin(tsearch_readline_state *stp,
135 : const char *filename)
136 : {
137 556 : if ((stp->fp = AllocateFile(filename, "r")) == NULL)
138 0 : return false;
139 556 : stp->filename = filename;
140 556 : stp->lineno = 0;
141 556 : initStringInfo(&stp->buf);
142 556 : stp->curline = NULL;
143 : /* Setup error traceback support for ereport() */
144 556 : stp->cb.callback = tsearch_readline_callback;
145 556 : stp->cb.arg = stp;
146 556 : stp->cb.previous = error_context_stack;
147 556 : error_context_stack = &stp->cb;
148 556 : return true;
149 : }
150 :
151 : /*
152 : * Read the next line from a tsearch data file (expected to be in UTF-8), and
153 : * convert it to database encoding if needed. The returned string is palloc'd.
154 : * NULL return means EOF.
155 : */
156 : char *
157 25466 : tsearch_readline(tsearch_readline_state *stp)
158 : {
159 : char *recoded;
160 :
161 : /* Advance line number to use in error reports */
162 25466 : stp->lineno++;
163 :
164 : /* Clear curline, it's no longer relevant */
165 25466 : if (stp->curline)
166 : {
167 24910 : if (stp->curline != stp->buf.data)
168 0 : pfree(stp->curline);
169 24910 : stp->curline = NULL;
170 : }
171 :
172 : /* Collect next line, if there is one */
173 25466 : if (!pg_get_line_buf(stp->fp, &stp->buf))
174 470 : return NULL;
175 :
176 : /* Validate the input as UTF-8, then convert to DB encoding if needed */
177 24996 : recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
178 :
179 : /* Save the correctly-encoded string for possible error reports */
180 24996 : stp->curline = recoded; /* might be equal to buf.data */
181 :
182 : /*
183 : * We always return a freshly pstrdup'd string. This is clearly necessary
184 : * if pg_any_to_server() returned buf.data, and we need a second copy even
185 : * if encoding conversion did occur. The caller is entitled to pfree the
186 : * returned string at any time, which would leave curline pointing to
187 : * recycled storage, causing problems if an error occurs after that point.
188 : * (It's preferable to return the result of pstrdup instead of the output
189 : * of pg_any_to_server, because the conversion result tends to be
190 : * over-allocated. Since callers might save the result string directly
191 : * into a long-lived dictionary structure, we don't want it to be a larger
192 : * palloc chunk than necessary. We'll reclaim the conversion result on
193 : * the next call.)
194 : */
195 24996 : return pstrdup(recoded);
196 : }
197 :
198 : /*
199 : * Close down after reading a file with tsearch_readline()
200 : */
201 : void
202 556 : tsearch_readline_end(tsearch_readline_state *stp)
203 : {
204 : /* Suppress use of curline in any error reported below */
205 556 : if (stp->curline)
206 : {
207 86 : if (stp->curline != stp->buf.data)
208 0 : pfree(stp->curline);
209 86 : stp->curline = NULL;
210 : }
211 :
212 : /* Release other resources */
213 556 : pfree(stp->buf.data);
214 556 : FreeFile(stp->fp);
215 :
216 : /* Pop the error context stack */
217 556 : error_context_stack = stp->cb.previous;
218 556 : }
219 :
220 : /*
221 : * Error context callback for errors occurring while reading a tsearch
222 : * configuration file.
223 : */
224 : static void
225 0 : tsearch_readline_callback(void *arg)
226 : {
227 0 : tsearch_readline_state *stp = (tsearch_readline_state *) arg;
228 :
229 : /*
230 : * We can't include the text of the config line for errors that occur
231 : * during tsearch_readline() itself. The major cause of such errors is
232 : * encoding violations, and we daren't try to print error messages
233 : * containing badly-encoded data.
234 : */
235 0 : if (stp->curline)
236 0 : errcontext("line %d of configuration file \"%s\": \"%s\"",
237 : stp->lineno,
238 : stp->filename,
239 : stp->curline);
240 : else
241 0 : errcontext("line %d of configuration file \"%s\"",
242 : stp->lineno,
243 : stp->filename);
244 0 : }
245 :
246 :
/*
 * lowerstr --- convert a null-terminated string to lower case
 *
 * This measures the string with strlen() and hands the work to
 * lowerstr_with_len().  The result is a palloc'd string.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
257 :
258 : /*
259 : * lowerstr_with_len --- fold string to lower case
260 : *
261 : * Input string need not be null-terminated.
262 : *
263 : * Returned string is palloc'd
264 : */
265 : char *
266 283710 : lowerstr_with_len(const char *str, int len)
267 : {
268 : char *out;
269 283710 : pg_locale_t mylocale = 0; /* TODO */
270 :
271 283710 : if (len == 0)
272 0 : return pstrdup("");
273 :
274 : /*
275 : * Use wide char code only when max encoding length > 1 and ctype != C.
276 : * Some operating systems fail with multi-byte encodings and a C locale.
277 : * Also, for a C locale there is no need to process as multibyte. From
278 : * backend/utils/adt/oracle_compat.c Teodor
279 : */
280 283710 : if (pg_database_encoding_max_length() > 1 && !database_ctype_is_c)
281 273996 : {
282 : wchar_t *wstr,
283 : *wptr;
284 : int wlen;
285 :
286 : /*
287 : * alloc number of wchar_t for worst case, len contains number of
288 : * bytes >= number of characters and alloc 1 wchar_t for 0, because
289 : * wchar2char wants zero-terminated string
290 : */
291 273996 : wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
292 :
293 273996 : wlen = char2wchar(wstr, len + 1, str, len, mylocale);
294 : Assert(wlen <= len);
295 :
296 2305520 : while (*wptr)
297 : {
298 2031524 : *wptr = towlower((wint_t) *wptr);
299 2031524 : wptr++;
300 : }
301 :
302 : /*
303 : * Alloc result string for worst case + '\0'
304 : */
305 273996 : len = pg_database_encoding_max_length() * wlen + 1;
306 273996 : out = (char *) palloc(len);
307 :
308 273996 : wlen = wchar2char(out, wstr, len, mylocale);
309 :
310 273996 : pfree(wstr);
311 :
312 273996 : if (wlen < 0)
313 0 : ereport(ERROR,
314 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
315 : errmsg("conversion from wchar_t to server encoding failed: %m")));
316 : Assert(wlen < len);
317 : }
318 : else
319 : {
320 9714 : const char *ptr = str;
321 : char *outptr;
322 :
323 9714 : outptr = out = (char *) palloc(sizeof(char) * (len + 1));
324 56420 : while ((ptr - str) < len && *ptr)
325 : {
326 46706 : *outptr++ = tolower(TOUCHAR(ptr));
327 46706 : ptr++;
328 : }
329 9714 : *outptr = '\0';
330 : }
331 :
332 283710 : return out;
333 : }
|