Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * ts_locale.c 4 : * locale compatibility layer for tsearch 5 : * 6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group 7 : * 8 : * 9 : * IDENTIFICATION 10 : * src/backend/tsearch/ts_locale.c 11 : * 12 : *------------------------------------------------------------------------- 13 : */ 14 : #include "postgres.h" 15 : 16 : #include "common/string.h" 17 : #include "storage/fd.h" 18 : #include "tsearch/ts_locale.h" 19 : 20 : static void tsearch_readline_callback(void *arg); 21 : 22 : 23 : /* 24 : * The reason these functions use a 3-wchar_t output buffer, not 2 as you 25 : * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be 26 : * getting from char2wchar() is UTF16 not UTF32. A single input character 27 : * may therefore produce a surrogate pair rather than just one wchar_t; 28 : * we also need room for a trailing null. When we do get a surrogate pair, 29 : * we pass just the first code to iswdigit() etc, so that these functions will 30 : * always return false for characters outside the Basic Multilingual Plane. 31 : */ 32 : #define WC_BUF_LEN 3 33 : 34 : int 35 10284 : t_isalpha(const char *ptr) 36 : { 37 10284 : int clen = pg_mblen(ptr); 38 : wchar_t character[WC_BUF_LEN]; 39 10284 : pg_locale_t mylocale = 0; /* TODO */ 40 : 41 10284 : if (clen == 1 || database_ctype_is_c) 42 10284 : return isalpha(TOUCHAR(ptr)); 43 : 44 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); 45 : 46 0 : return iswalpha((wint_t) character[0]); 47 : } 48 : 49 : int 50 2767806 : t_isalnum(const char *ptr) 51 : { 52 2767806 : int clen = pg_mblen(ptr); 53 : wchar_t character[WC_BUF_LEN]; 54 2767806 : pg_locale_t mylocale = 0; /* TODO */ 55 : 56 2767806 : if (clen == 1 || database_ctype_is_c) 57 2767806 : return isalnum(TOUCHAR(ptr)); 58 : 59 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); 60 : 61 0 : return iswalnum((wint_t) character[0]); 62 : } 63 : 64 : 65 : /* 66 : * Set up to read a file using tsearch_readline(). This facility is 67 : * better than just reading the file directly because it provides error 68 : * context pointing to the specific line where a problem is detected. 69 : * 70 : * Expected usage is: 71 : * 72 : * tsearch_readline_state trst; 73 : * 74 : * if (!tsearch_readline_begin(&trst, filename)) 75 : * ereport(ERROR, 76 : * (errcode(ERRCODE_CONFIG_FILE_ERROR), 77 : * errmsg("could not open stop-word file \"%s\": %m", 78 : * filename))); 79 : * while ((line = tsearch_readline(&trst)) != NULL) 80 : * process line; 81 : * tsearch_readline_end(&trst); 82 : * 83 : * Note that the caller supplies the ereport() for file open failure; 84 : * this is so that a custom message can be provided. The filename string 85 : * passed to tsearch_readline_begin() must remain valid through 86 : * tsearch_readline_end(). 87 : */ 88 : bool 89 556 : tsearch_readline_begin(tsearch_readline_state *stp, 90 : const char *filename) 91 : { 92 556 : if ((stp->fp = AllocateFile(filename, "r")) == NULL) 93 0 : return false; 94 556 : stp->filename = filename; 95 556 : stp->lineno = 0; 96 556 : initStringInfo(&stp->buf); 97 556 : stp->curline = NULL; 98 : /* Setup error traceback support for ereport() */ 99 556 : stp->cb.callback = tsearch_readline_callback; 100 556 : stp->cb.arg = stp; 101 556 : stp->cb.previous = error_context_stack; 102 556 : error_context_stack = &stp->cb; 103 556 : return true; 104 : } 105 : 106 : /* 107 : * Read the next line from a tsearch data file (expected to be in UTF-8), and 108 : * convert it to database encoding if needed. The returned string is palloc'd. 109 : * NULL return means EOF. 110 : */ 111 : char * 112 25466 : tsearch_readline(tsearch_readline_state *stp) 113 : { 114 : char *recoded; 115 : 116 : /* Advance line number to use in error reports */ 117 25466 : stp->lineno++; 118 : 119 : /* Clear curline, it's no longer relevant */ 120 25466 : if (stp->curline) 121 : { 122 24910 : if (stp->curline != stp->buf.data) 123 0 : pfree(stp->curline); 124 24910 : stp->curline = NULL; 125 : } 126 : 127 : /* Collect next line, if there is one */ 128 25466 : if (!pg_get_line_buf(stp->fp, &stp->buf)) 129 470 : return NULL; 130 : 131 : /* Validate the input as UTF-8, then convert to DB encoding if needed */ 132 24996 : recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8); 133 : 134 : /* Save the correctly-encoded string for possible error reports */ 135 24996 : stp->curline = recoded; /* might be equal to buf.data */ 136 : 137 : /* 138 : * We always return a freshly pstrdup'd string. This is clearly necessary 139 : * if pg_any_to_server() returned buf.data, and we need a second copy even 140 : * if encoding conversion did occur. The caller is entitled to pfree the 141 : * returned string at any time, which would leave curline pointing to 142 : * recycled storage, causing problems if an error occurs after that point. 143 : * (It's preferable to return the result of pstrdup instead of the output 144 : * of pg_any_to_server, because the conversion result tends to be 145 : * over-allocated. Since callers might save the result string directly 146 : * into a long-lived dictionary structure, we don't want it to be a larger 147 : * palloc chunk than necessary. We'll reclaim the conversion result on 148 : * the next call.) 149 : */ 150 24996 : return pstrdup(recoded); 151 : } 152 : 153 : /* 154 : * Close down after reading a file with tsearch_readline() 155 : */ 156 : void 157 556 : tsearch_readline_end(tsearch_readline_state *stp) 158 : { 159 : /* Suppress use of curline in any error reported below */ 160 556 : if (stp->curline) 161 : { 162 86 : if (stp->curline != stp->buf.data) 163 0 : pfree(stp->curline); 164 86 : stp->curline = NULL; 165 : } 166 : 167 : /* Release other resources */ 168 556 : pfree(stp->buf.data); 169 556 : FreeFile(stp->fp); 170 : 171 : /* Pop the error context stack */ 172 556 : error_context_stack = stp->cb.previous; 173 556 : } 174 : 175 : /* 176 : * Error context callback for errors occurring while reading a tsearch 177 : * configuration file. 178 : */ 179 : static void 180 0 : tsearch_readline_callback(void *arg) 181 : { 182 0 : tsearch_readline_state *stp = (tsearch_readline_state *) arg; 183 : 184 : /* 185 : * We can't include the text of the config line for errors that occur 186 : * during tsearch_readline() itself. The major cause of such errors is 187 : * encoding violations, and we daren't try to print error messages 188 : * containing badly-encoded data. 189 : */ 190 0 : if (stp->curline) 191 0 : errcontext("line %d of configuration file \"%s\": \"%s\"", 192 : stp->lineno, 193 : stp->filename, 194 : stp->curline); 195 : else 196 0 : errcontext("line %d of configuration file \"%s\"", 197 : stp->lineno, 198 : stp->filename); 199 0 : }