Line data Source code
1 : /*
2 : * psql - the PostgreSQL interactive terminal
3 : *
4 : * Copyright (c) 2000-2024, PostgreSQL Global Development Group
5 : *
6 : * src/bin/psql/stringutils.c
7 : */
8 : #include "postgres_fe.h"
9 :
10 : #include <ctype.h>
11 :
12 : #include "common.h"
13 : #include "stringutils.h"
14 :
15 :
/*
 * strtokx_terminate
 *
 * Helper for strtokx(): null-terminate the token that ends just before
 * 'end' inside strtokx's private buffer, and return the position at which
 * the next strtokx() call should resume scanning.
 *
 * If 'end' is already at the end of the string, no work is needed.  If the
 * character at 'end' is in the whitespace set, it can simply be overwritten
 * with the terminator.  Otherwise that character must be preserved, so the
 * remainder of the string (including its terminator) is moved over by one
 * byte to make room.  strtokx() allocates extra space for exactly this.
 */
static char *
strtokx_terminate(char *end, const char *whitespace)
{
	/* at end of string, so no extra work */
	if (*end == '\0')
		return end;

	if (!strchr(whitespace, *end))
		memmove(end + 1, end, strlen(end) + 1);
	*end = '\0';

	return end + 1;
}

/*
 * strtokx
 *
 * Replacement for strtok() (a.k.a. poor man's flex)
 *
 * Splits a string into tokens, returning one token per call, then NULL
 * when no more tokens exist in the given string.
 *
 * The calling convention is similar to that of strtok, but with more
 * options:
 *
 * s -			string to parse, if NULL continue parsing the last string
 * whitespace - set of whitespace characters that separate tokens
 * delim -		set of non-whitespace separator characters (or NULL)
 * quote -		set of characters that can quote a token (NULL if none)
 * escape -		character that can quote quotes (0 if none)
 * e_strings -	if true, treat E'...' syntax as a valid token
 * del_quotes - if true, strip quotes from the returned token, else return
 *				it exactly as found in the string
 * encoding -	the active character-set encoding
 *
 * Characters in 'delim', if any, will be returned as single-character
 * tokens unless part of a quoted token.
 *
 * Double occurrences of the quoting character are always taken to represent
 * a single quote character in the data.  If escape isn't 0, then escape
 * followed by anything (except \0) is a data character too.
 *
 * The combination of e_strings and del_quotes both true is not currently
 * handled.  This could be fixed but it's not needed anywhere at the moment.
 *
 * Note that the string s is _not_ overwritten in this implementation: the
 * function works on a private copy.  The returned token points into that
 * copy, which is freed when a new string is passed in or when the end of
 * the current string is reported by returning NULL; callers must not free
 * the token and must copy it if they need it beyond that point.  The
 * parsing state lives in static variables, so only one string can be
 * tokenized at a time.
 *
 * NB: it's okay to vary delim, quote, and escape from one call to the
 * next on a single source string, but changing whitespace is a bad idea
 * since you might lose data.
 */
char *
strtokx(const char *s,
		const char *whitespace,
		const char *delim,
		const char *quote,
		char escape,
		bool e_strings,
		bool del_quotes,
		int encoding)
{
	static char *storage = NULL;	/* store the local copy of the user's
									 * string here */
	static char *string = NULL; /* pointer into storage where to continue on
								 * next call */

	char	   *start;			/* first character of the current token */
	char	   *p;				/* scan position / end of current token */
	size_t		toklen;			/* length of an unquoted token */

	if (s)
	{
		free(storage);

		/*
		 * We may need extra space to insert delimiter nulls for adjacent
		 * tokens.  2X the space is a gross overestimate, but it's unlikely
		 * that this code will be used on huge strings anyway.
		 */
		storage = pg_malloc(2 * strlen(s) + 1);
		strcpy(storage, s);
		string = storage;
	}

	if (!storage)
		return NULL;

	/* skip leading whitespace */
	start = string + strspn(string, whitespace);

	/* end of string reached? */
	if (*start == '\0')
	{
		/* technically we don't need to free here, but we're nice */
		free(storage);
		storage = NULL;
		string = NULL;
		return NULL;
	}

	/* a delimiter character is returned as a one-character token */
	if (delim && strchr(delim, *start))
	{
		string = strtokx_terminate(start + 1, whitespace);
		return start;
	}

	/* check for E string */
	p = start;
	if (e_strings &&
		(*p == 'E' || *p == 'e') &&
		p[1] == '\'')
	{
		quote = "'";
		escape = '\\';			/* if std strings before, not any more */
		p++;
	}

	/* test if quoting character */
	if (quote && strchr(quote, *p))
	{
		/* okay, we have a quoted token, now scan for the closer */
		char		thisquote = *p++;

		for (; *p; p += PQmblenBounded(p, encoding))
		{
			if (*p == escape && p[1] != '\0')
				p++;			/* process escaped anything */
			else if (*p == thisquote && p[1] == thisquote)
				p++;			/* process doubled quote */
			else if (*p == thisquote)
			{
				p++;			/* skip trailing quote */
				break;
			}
		}

		/* an unterminated quoted token simply runs to end of string */
		string = strtokx_terminate(p, whitespace);

		/* Clean up the token if caller wants that */
		if (del_quotes)
			strip_quotes(start, thisquote, escape, encoding);

		return start;
	}

	/*
	 * Otherwise no quoting character.  Scan till next whitespace, delimiter
	 * or quote.  NB: at this point, *start is known not to be '\0',
	 * whitespace, delim, or quote, so we will consume at least one character.
	 *
	 * The span lengths are kept as size_t, the type strcspn() returns, so
	 * that they cannot be truncated on platforms where size_t is wider than
	 * unsigned int.
	 */
	toklen = strcspn(start, whitespace);

	if (delim)
	{
		size_t		delimlen = strcspn(start, delim);

		if (delimlen < toklen)
			toklen = delimlen;
	}

	if (quote)
	{
		size_t		quotelen = strcspn(start, quote);

		if (quotelen < toklen)
			toklen = quotelen;
	}

	string = strtokx_terminate(start + toklen, whitespace);

	return start;
}
227 :
228 :
/*
 * strip_quotes
 *
 * Undo quoting in the string at *source, rewriting it in place.
 *
 * One leading 'quote' character, if present, is dropped, and so is a
 * 'quote' that is the last character of the string.  Inside the string, a
 * doubled 'quote' collapses to a single one.  If 'escape' is not 0, an
 * 'escape' character is dropped and the character after it is copied
 * without any special interpretation.
 *
 * Characters are copied in units of PQmblenBounded(..., encoding) bytes,
 * so only the first byte of each character is examined for quote/escape.
 */
void
strip_quotes(char *source, char quote, char escape, int encoding)
{
	char	   *in;				/* next byte to read */
	char	   *out;			/* next byte to write; never ahead of 'in' */

	Assert(source != NULL);
	Assert(quote != '\0');

	in = source;
	out = source;

	/* skip leading quote */
	if (*in != '\0' && *in == quote)
		in++;

	while (*in != '\0')
	{
		int			len;
		int			k;

		/* a quote that ends the string is the trailing quote: drop it */
		if (*in == quote && in[1] == '\0')
			break;

		/*
		 * For a doubled quote, or an escape followed by something, step over
		 * the first byte so that only the following character is copied.
		 */
		if ((*in == quote && in[1] == quote) ||
			(*in == escape && in[1] != '\0'))
			in++;

		/* copy one whole (possibly multibyte) character */
		len = PQmblenBounded(in, encoding);
		for (k = 0; k < len; k++)
			out[k] = in[k];
		in += len;
		out += len;
	}

	*out = '\0';
}
272 :
273 :
/*
 * quote_if_needed
 *
 * Inverse of strip_quotes().  When "source" can stand for itself with no
 * quoting or escaping, the result is NULL.  Otherwise the result is a
 * malloc'd copy of "source" with quoting and escaping applied, which the
 * caller is responsible for freeing:
 *
 * source -			string to parse
 * entails_quote -	if any of these characters occur, outer quotes are needed
 * quote -			doubled within string, affixed to both ends
 * escape -			doubled within string
 * force_quote -	if true, quote the output even if it doesn't "need" it
 * encoding -		the active character-set encoding
 *
 * Do not use this as a substitute for PQescapeStringConn().  Use it for
 * strings to be parsed by strtokx() or psql_scan_slash_option().
 */
char *
quote_if_needed(const char *source, const char *entails_quote,
				char quote, char escape, bool force_quote,
				int encoding)
{
	const char *in;
	char	   *result;
	char	   *out;
	bool		quoting_required;

	Assert(source != NULL);
	Assert(quote != '\0');

	quoting_required = force_quote;
	in = source;

	/*
	 * Worst case every input byte is doubled; add room for the two outer
	 * quotes and the terminator.  (Usually an overestimate.)
	 */
	result = pg_malloc(2 * strlen(in) + 3);
	out = result;

	*out++ = quote;				/* opening quote */

	while (*in != '\0')
	{
		char		ch = *in;
		int			len;

		if (ch == quote || ch == escape)
		{
			/* emit an extra copy so the character ends up doubled */
			quoting_required = true;
			*out++ = ch;
		}
		else if (strchr(entails_quote, ch) != NULL)
			quoting_required = true;

		/* copy one whole (possibly multibyte) character */
		len = PQmblenBounded(in, encoding);
		memcpy(out, in, len);
		in += len;
		out += len;
	}

	*out++ = quote;				/* closing quote */
	*out = '\0';

	if (quoting_required)
		return result;

	/* the plain string is fine as-is; discard the quoted copy */
	free(result);
	return NULL;
}
|