Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ts_locale.c
4 : * locale compatibility layer for tsearch
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/ts_locale.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "catalog/pg_collation.h"
17 : #include "common/string.h"
18 : #include "storage/fd.h"
19 : #include "tsearch/ts_locale.h"
20 : #include "tsearch/ts_public.h"
21 :
22 : static void tsearch_readline_callback(void *arg);
23 :
24 :
25 : /*
26 : * The reason these functions use a 3-wchar_t output buffer, not 2 as you
27 : * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
28 : * getting from char2wchar() is UTF16 not UTF32. A single input character
29 : * may therefore produce a surrogate pair rather than just one wchar_t;
30 : * we also need room for a trailing null. When we do get a surrogate pair,
31 : * we pass just the first code to iswdigit() etc, so that these functions will
32 : * always return false for characters outside the Basic Multilingual Plane.
33 : */
34 : #define WC_BUF_LEN 3
35 :
36 : int
37 20604 : t_isdigit(const char *ptr)
38 : {
39 20604 : int clen = pg_mblen(ptr);
40 : wchar_t character[WC_BUF_LEN];
41 20604 : pg_locale_t mylocale = 0; /* TODO */
42 :
43 20604 : if (clen == 1 || database_ctype_is_c)
44 20604 : return isdigit(TOUCHAR(ptr));
45 :
46 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
47 :
48 0 : return iswdigit((wint_t) character[0]);
49 : }
50 :
51 : int
52 881132 : t_isspace(const char *ptr)
53 : {
54 881132 : int clen = pg_mblen(ptr);
55 : wchar_t character[WC_BUF_LEN];
56 881132 : pg_locale_t mylocale = 0; /* TODO */
57 :
58 881132 : if (clen == 1 || database_ctype_is_c)
59 881132 : return isspace(TOUCHAR(ptr));
60 :
61 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
62 :
63 0 : return iswspace((wint_t) character[0]);
64 : }
65 :
66 : int
67 10284 : t_isalpha(const char *ptr)
68 : {
69 10284 : int clen = pg_mblen(ptr);
70 : wchar_t character[WC_BUF_LEN];
71 10284 : pg_locale_t mylocale = 0; /* TODO */
72 :
73 10284 : if (clen == 1 || database_ctype_is_c)
74 10284 : return isalpha(TOUCHAR(ptr));
75 :
76 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
77 :
78 0 : return iswalpha((wint_t) character[0]);
79 : }
80 :
81 : int
82 2743730 : t_isalnum(const char *ptr)
83 : {
84 2743730 : int clen = pg_mblen(ptr);
85 : wchar_t character[WC_BUF_LEN];
86 2743730 : pg_locale_t mylocale = 0; /* TODO */
87 :
88 2743730 : if (clen == 1 || database_ctype_is_c)
89 2743730 : return isalnum(TOUCHAR(ptr));
90 :
91 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
92 :
93 0 : return iswalnum((wint_t) character[0]);
94 : }
95 :
96 : int
97 4150 : t_isprint(const char *ptr)
98 : {
99 4150 : int clen = pg_mblen(ptr);
100 : wchar_t character[WC_BUF_LEN];
101 4150 : pg_locale_t mylocale = 0; /* TODO */
102 :
103 4150 : if (clen == 1 || database_ctype_is_c)
104 4150 : return isprint(TOUCHAR(ptr));
105 :
106 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
107 :
108 0 : return iswprint((wint_t) character[0]);
109 : }
110 :
111 :
112 : /*
113 : * Set up to read a file using tsearch_readline(). This facility is
114 : * better than just reading the file directly because it provides error
115 : * context pointing to the specific line where a problem is detected.
116 : *
117 : * Expected usage is:
118 : *
119 : * tsearch_readline_state trst;
120 : *
121 : * if (!tsearch_readline_begin(&trst, filename))
122 : * ereport(ERROR,
123 : * (errcode(ERRCODE_CONFIG_FILE_ERROR),
124 : * errmsg("could not open stop-word file \"%s\": %m",
125 : * filename)));
126 : * while ((line = tsearch_readline(&trst)) != NULL)
127 : * process line;
128 : * tsearch_readline_end(&trst);
129 : *
130 : * Note that the caller supplies the ereport() for file open failure;
131 : * this is so that a custom message can be provided. The filename string
132 : * passed to tsearch_readline_begin() must remain valid through
133 : * tsearch_readline_end().
134 : */
135 : bool
136 552 : tsearch_readline_begin(tsearch_readline_state *stp,
137 : const char *filename)
138 : {
139 552 : if ((stp->fp = AllocateFile(filename, "r")) == NULL)
140 0 : return false;
141 552 : stp->filename = filename;
142 552 : stp->lineno = 0;
143 552 : initStringInfo(&stp->buf);
144 552 : stp->curline = NULL;
145 : /* Setup error traceback support for ereport() */
146 552 : stp->cb.callback = tsearch_readline_callback;
147 552 : stp->cb.arg = (void *) stp;
148 552 : stp->cb.previous = error_context_stack;
149 552 : error_context_stack = &stp->cb;
150 552 : return true;
151 : }
152 :
153 : /*
154 : * Read the next line from a tsearch data file (expected to be in UTF-8), and
155 : * convert it to database encoding if needed. The returned string is palloc'd.
156 : * NULL return means EOF.
157 : */
158 : char *
159 14818 : tsearch_readline(tsearch_readline_state *stp)
160 : {
161 : char *recoded;
162 :
163 : /* Advance line number to use in error reports */
164 14818 : stp->lineno++;
165 :
166 : /* Clear curline, it's no longer relevant */
167 14818 : if (stp->curline)
168 : {
169 14266 : if (stp->curline != stp->buf.data)
170 0 : pfree(stp->curline);
171 14266 : stp->curline = NULL;
172 : }
173 :
174 : /* Collect next line, if there is one */
175 14818 : if (!pg_get_line_buf(stp->fp, &stp->buf))
176 466 : return NULL;
177 :
178 : /* Validate the input as UTF-8, then convert to DB encoding if needed */
179 14352 : recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
180 :
181 : /* Save the correctly-encoded string for possible error reports */
182 14352 : stp->curline = recoded; /* might be equal to buf.data */
183 :
184 : /*
185 : * We always return a freshly pstrdup'd string. This is clearly necessary
186 : * if pg_any_to_server() returned buf.data, and we need a second copy even
187 : * if encoding conversion did occur. The caller is entitled to pfree the
188 : * returned string at any time, which would leave curline pointing to
189 : * recycled storage, causing problems if an error occurs after that point.
190 : * (It's preferable to return the result of pstrdup instead of the output
191 : * of pg_any_to_server, because the conversion result tends to be
192 : * over-allocated. Since callers might save the result string directly
193 : * into a long-lived dictionary structure, we don't want it to be a larger
194 : * palloc chunk than necessary. We'll reclaim the conversion result on
195 : * the next call.)
196 : */
197 14352 : return pstrdup(recoded);
198 : }
199 :
200 : /*
201 : * Close down after reading a file with tsearch_readline()
202 : */
203 : void
204 552 : tsearch_readline_end(tsearch_readline_state *stp)
205 : {
206 : /* Suppress use of curline in any error reported below */
207 552 : if (stp->curline)
208 : {
209 86 : if (stp->curline != stp->buf.data)
210 0 : pfree(stp->curline);
211 86 : stp->curline = NULL;
212 : }
213 :
214 : /* Release other resources */
215 552 : pfree(stp->buf.data);
216 552 : FreeFile(stp->fp);
217 :
218 : /* Pop the error context stack */
219 552 : error_context_stack = stp->cb.previous;
220 552 : }
221 :
222 : /*
223 : * Error context callback for errors occurring while reading a tsearch
224 : * configuration file.
225 : */
226 : static void
227 0 : tsearch_readline_callback(void *arg)
228 : {
229 0 : tsearch_readline_state *stp = (tsearch_readline_state *) arg;
230 :
231 : /*
232 : * We can't include the text of the config line for errors that occur
233 : * during tsearch_readline() itself. The major cause of such errors is
234 : * encoding violations, and we daren't try to print error messages
235 : * containing badly-encoded data.
236 : */
237 0 : if (stp->curline)
238 0 : errcontext("line %d of configuration file \"%s\": \"%s\"",
239 : stp->lineno,
240 : stp->filename,
241 : stp->curline);
242 : else
243 0 : errcontext("line %d of configuration file \"%s\"",
244 : stp->lineno,
245 : stp->filename);
246 0 : }
247 :
248 :
/*
 * lowerstr --- lower-case a null-terminated string
 *
 * Convenience wrapper that measures the string with strlen() and delegates
 * to lowerstr_with_len().  The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
259 :
260 : /*
261 : * lowerstr_with_len --- fold string to lower case
262 : *
263 : * Input string need not be null-terminated.
264 : *
265 : * Returned string is palloc'd
266 : */
267 : char *
268 281684 : lowerstr_with_len(const char *str, int len)
269 : {
270 : char *out;
271 281684 : pg_locale_t mylocale = 0; /* TODO */
272 :
273 281684 : if (len == 0)
274 0 : return pstrdup("");
275 :
276 : /*
277 : * Use wide char code only when max encoding length > 1 and ctype != C.
278 : * Some operating systems fail with multi-byte encodings and a C locale.
279 : * Also, for a C locale there is no need to process as multibyte. From
280 : * backend/utils/adt/oracle_compat.c Teodor
281 : */
282 281684 : if (pg_database_encoding_max_length() > 1 && !database_ctype_is_c)
283 0 : {
284 : wchar_t *wstr,
285 : *wptr;
286 : int wlen;
287 :
288 : /*
289 : * alloc number of wchar_t for worst case, len contains number of
290 : * bytes >= number of characters and alloc 1 wchar_t for 0, because
291 : * wchar2char wants zero-terminated string
292 : */
293 0 : wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
294 :
295 0 : wlen = char2wchar(wstr, len + 1, str, len, mylocale);
296 : Assert(wlen <= len);
297 :
298 0 : while (*wptr)
299 : {
300 0 : *wptr = towlower((wint_t) *wptr);
301 0 : wptr++;
302 : }
303 :
304 : /*
305 : * Alloc result string for worst case + '\0'
306 : */
307 0 : len = pg_database_encoding_max_length() * wlen + 1;
308 0 : out = (char *) palloc(len);
309 :
310 0 : wlen = wchar2char(out, wstr, len, mylocale);
311 :
312 0 : pfree(wstr);
313 :
314 0 : if (wlen < 0)
315 0 : ereport(ERROR,
316 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
317 : errmsg("conversion from wchar_t to server encoding failed: %m")));
318 : Assert(wlen < len);
319 : }
320 : else
321 : {
322 281684 : const char *ptr = str;
323 : char *outptr;
324 :
325 281684 : outptr = out = (char *) palloc(sizeof(char) * (len + 1));
326 2337842 : while ((ptr - str) < len && *ptr)
327 : {
328 2056158 : *outptr++ = tolower(TOUCHAR(ptr));
329 2056158 : ptr++;
330 : }
331 281684 : *outptr = '\0';
332 : }
333 :
334 281684 : return out;
335 : }
|