Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * copyfromparse.c
4 : * Parse CSV/text/binary format for COPY FROM.
5 : *
6 : * This file contains routines to parse the text, CSV and binary input
7 : * formats. The main entry point is NextCopyFrom(), which parses the
8 : * next input line and returns it as Datums.
9 : *
10 : * In text/CSV mode, the parsing happens in multiple stages:
11 : *
12 : * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 : *                1.          2.            3.           4.
14 : *
15 : * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 : * places it into 'raw_buf'.
17 : *
18 : * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 : * the data in 'raw_buf' from client to server encoding, placing the
20 : * converted result in 'input_buf'.
21 : *
22 : * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 : * It is responsible for finding the next newline marker, taking quote and
24 : * escape characters into account according to the COPY options. The line
25 : * is copied into 'line_buf', with quotes and escape characters still
26 : * intact.
27 : *
28 : * 4. CopyReadAttributesText/CSV() function takes the input line from
29 : * 'line_buf', and splits it into fields, unescaping the data as required.
30 : * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 : * pointers to each field.
32 : *
33 : * If encoding conversion is not required, a shortcut is taken in step 2 to
34 : * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 : * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 : * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 : * the data is valid in the current encoding.
38 : *
39 : * In binary mode, the pipeline is much simpler. Input is loaded into
40 : * 'raw_buf', and encoding conversion is done in the datatype-specific
41 : * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 : * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
43 : * data when it's passed the receive function.
44 : *
45 : * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 : * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 : * and 'attribute_buf' are expanded on demand, to hold the longest line
48 : * encountered so far.
49 : *
50 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
51 : * Portions Copyright (c) 1994, Regents of the University of California
52 : *
53 : *
54 : * IDENTIFICATION
55 : * src/backend/commands/copyfromparse.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <ctype.h>
62 : #include <unistd.h>
63 : #include <sys/stat.h>
64 :
65 : #include "commands/copyapi.h"
66 : #include "commands/copyfrom_internal.h"
67 : #include "commands/progress.h"
68 : #include "executor/executor.h"
69 : #include "libpq/libpq.h"
70 : #include "libpq/pqformat.h"
71 : #include "mb/pg_wchar.h"
72 : #include "miscadmin.h"
73 : #include "pgstat.h"
74 : #include "port/pg_bitutils.h"
75 : #include "port/pg_bswap.h"
76 : #include "port/simd.h"
77 : #include "utils/builtins.h"
78 : #include "utils/rel.h"
79 : #include "utils/wait_event.h"
80 :
/* Nonzero when 'c' is one of the octal digit characters '0' .. '7'. */
#define ISOCTAL(c) (!(((c) < '0') || ((c) > '7')))
/* Numeric value of the digit character 'c' (its distance from '0'). */
#define OCTVALUE(c) ((c) - '0')
83 :
84 : /*
85 : * These macros centralize code used to process line_buf and input_buf buffers.
86 : * They are macros because they often do continue/break control and to avoid
87 : * function call overhead in tight COPY loops.
88 : *
89 : * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
90 : * prevent the continue/break processing from working. We end the "if (1)"
91 : * with "else ((void) 0)" to ensure the "if" does not unintentionally match
92 : * any "else" in the calling code, and to avoid any compiler warnings about
93 : * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
94 : */
95 :
/*
 * If fewer than (extralen + 1) bytes remain before copy_buf_len and EOF has
 * not been hit, rewind input_buf_ptr to prev_raw_ptr (so the character
 * fetched at the top of the loop stays in the buffer, even when more than
 * one byte of read-ahead was requested), flag need_data, and restart the
 * enclosing loop.
 *
 * Expects input_buf_ptr, copy_buf_len, hit_eof, prev_raw_ptr and need_data
 * to be in scope at the point of use.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
{ \
	input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
	need_data = true; \
	continue; \
} else ((void) 0)
110 :
/*
 * If fewer than (extralen + 1) bytes remain before copy_buf_len and EOF has
 * already been hit, set result to true and leave the enclosing loop.  When
 * extralen is nonzero, input_buf_ptr is first advanced to copy_buf_len so
 * that the rest of the buffer is consumed.
 *
 * Expects input_buf_ptr, copy_buf_len, hit_eof and result to be in scope.
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
{ \
	if (extralen) \
		input_buf_ptr = copy_buf_len; /* consume the partial character */ \
	/* backslash just before EOF, treat as data char */ \
	result = true; \
	break; \
} else ((void) 0)
124 :
/*
 * Append the already-scanned bytes of input_buf, i.e. those between
 * cstate->input_buf_index and input_buf_ptr, to line_buf and advance
 * input_buf_index past them.  This must be done to be sure there is some
 * room in input_buf.  Nothing happens if no bytes are pending.
 */
#define REFILL_LINEBUF \
if (input_buf_ptr > cstate->input_buf_index) \
{ \
	appendBinaryStringInfo(&cstate->line_buf, \
						   cstate->input_buf + cstate->input_buf_index, \
						   input_buf_ptr - cstate->input_buf_index); \
	cstate->input_buf_index = input_buf_ptr; \
} else ((void) 0)
140 :
/*
 * The 11-byte signature checked at the start of binary COPY data.
 *
 * NOTE: there's a copy of this in copyto.c
 */
static const char BinarySignature[11] = {
	'P', 'G', 'C', 'O', 'P', 'Y', '\n', '\377', '\r', '\n', '\0'
};
143 :
144 :
145 : /* non-export function prototypes */
146 : static bool CopyReadLine(CopyFromState cstate, bool is_csv);
147 : static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate,
148 : bool is_csv);
149 : static int CopyReadAttributesText(CopyFromState cstate);
150 : static int CopyReadAttributesCSV(CopyFromState cstate);
151 : static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
152 : Oid typioparam, int32 typmod,
153 : bool *isnull);
154 : static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate,
155 : ExprContext *econtext,
156 : Datum *values,
157 : bool *nulls,
158 : bool is_csv);
159 : static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate,
160 : char ***fields,
161 : int *nfields,
162 : bool is_csv);
163 :
164 :
165 : /* Low-level communications functions */
166 : static int CopyGetData(CopyFromState cstate, void *databuf,
167 : int minread, int maxread);
168 : static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
169 : static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
170 : static void CopyLoadInputBuf(CopyFromState cstate);
171 : static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
172 :
173 : void
174 681 : ReceiveCopyBegin(CopyFromState cstate)
175 : {
176 : StringInfoData buf;
177 681 : int natts = list_length(cstate->attnumlist);
178 681 : int16 format = (cstate->opts.format == COPY_FORMAT_BINARY ? 1 : 0);
179 : int i;
180 :
181 681 : pq_beginmessage(&buf, PqMsg_CopyInResponse);
182 681 : pq_sendbyte(&buf, format); /* overall format */
183 681 : pq_sendint16(&buf, natts);
184 2445 : for (i = 0; i < natts; i++)
185 1764 : pq_sendint16(&buf, format); /* per-column formats */
186 681 : pq_endmessage(&buf);
187 681 : cstate->copy_src = COPY_FRONTEND;
188 681 : cstate->fe_msgbuf = makeStringInfo();
189 : /* We *must* flush here to ensure FE knows it can send. */
190 681 : pq_flush();
191 681 : }
192 :
193 : void
194 8 : ReceiveCopyBinaryHeader(CopyFromState cstate)
195 : {
196 : char readSig[11];
197 : int32 tmp;
198 :
199 : /* Signature */
200 8 : if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
201 8 : memcmp(readSig, BinarySignature, 11) != 0)
202 0 : ereport(ERROR,
203 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
204 : errmsg("COPY file signature not recognized")));
205 : /* Flags field */
206 8 : if (!CopyGetInt32(cstate, &tmp))
207 0 : ereport(ERROR,
208 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
209 : errmsg("invalid COPY file header (missing flags)")));
210 8 : if ((tmp & (1 << 16)) != 0)
211 0 : ereport(ERROR,
212 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
213 : errmsg("invalid COPY file header (WITH OIDS)")));
214 8 : tmp &= ~(1 << 16);
215 8 : if ((tmp >> 16) != 0)
216 0 : ereport(ERROR,
217 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
218 : errmsg("unrecognized critical flags in COPY file header")));
219 : /* Header extension length */
220 8 : if (!CopyGetInt32(cstate, &tmp) ||
221 8 : tmp < 0)
222 0 : ereport(ERROR,
223 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
224 : errmsg("invalid COPY file header (missing length)")));
225 : /* Skip extension header, if present */
226 8 : while (tmp-- > 0)
227 : {
228 0 : if (CopyReadBinaryData(cstate, readSig, 1) != 1)
229 0 : ereport(ERROR,
230 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
231 : errmsg("invalid COPY file header (wrong length)")));
232 : }
233 8 : }
234 :
235 : /*
236 : * CopyGetData reads data from the source (file or frontend)
237 : *
238 : * We attempt to read at least minread, and at most maxread, bytes from
239 : * the source. The actual number of bytes read is returned; if this is
240 : * less than minread, EOF was detected.
241 : *
242 : * Note: when copying from the frontend, we expect a proper EOF mark per
243 : * protocol; if the frontend simply drops the connection, we raise error.
244 : * It seems unwise to allow the COPY IN to complete normally in that case.
245 : *
246 : * NB: no data conversion is applied here.
247 : */
248 : static int
249 217548 : CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
250 : {
251 217548 : int bytesread = 0;
252 :
253 217548 : switch (cstate->copy_src)
254 : {
255 705 : case COPY_FILE:
256 705 : pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ);
257 705 : bytesread = fread(databuf, 1, maxread, cstate->copy_file);
258 705 : pgstat_report_wait_end();
259 705 : if (ferror(cstate->copy_file))
260 0 : ereport(ERROR,
261 : (errcode_for_file_access(),
262 : errmsg("could not read from COPY file: %m")));
263 705 : if (bytesread == 0)
264 277 : cstate->raw_reached_eof = true;
265 705 : break;
266 201766 : case COPY_FRONTEND:
267 402515 : while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
268 : {
269 : int avail;
270 :
271 402035 : while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
272 : {
273 : /* Try to receive another message */
274 : int mtype;
275 : int maxmsglen;
276 :
277 201286 : readmessage:
278 201286 : HOLD_CANCEL_INTERRUPTS();
279 201286 : pq_startmsgread();
280 201286 : mtype = pq_getbyte();
281 201286 : if (mtype == EOF)
282 0 : ereport(ERROR,
283 : (errcode(ERRCODE_CONNECTION_FAILURE),
284 : errmsg("unexpected EOF on client connection with an open transaction")));
285 : /* Validate message type and set packet size limit */
286 201286 : switch (mtype)
287 : {
288 200749 : case PqMsg_CopyData:
289 200749 : maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
290 200749 : break;
291 535 : case PqMsg_CopyDone:
292 : case PqMsg_CopyFail:
293 : case PqMsg_Flush:
294 : case PqMsg_Sync:
295 535 : maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
296 535 : break;
297 2 : default:
298 2 : ereport(ERROR,
299 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
300 : errmsg("unexpected message type 0x%02X during COPY from stdin",
301 : mtype)));
302 : maxmsglen = 0; /* keep compiler quiet */
303 : break;
304 : }
305 : /* Now collect the message body */
306 201284 : if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
307 0 : ereport(ERROR,
308 : (errcode(ERRCODE_CONNECTION_FAILURE),
309 : errmsg("unexpected EOF on client connection with an open transaction")));
310 201284 : RESUME_CANCEL_INTERRUPTS();
311 : /* ... and process it */
312 201284 : switch (mtype)
313 : {
314 200749 : case PqMsg_CopyData:
315 200749 : break;
316 535 : case PqMsg_CopyDone:
317 : /* COPY IN correctly terminated by frontend */
318 535 : cstate->raw_reached_eof = true;
319 535 : return bytesread;
320 0 : case PqMsg_CopyFail:
321 0 : ereport(ERROR,
322 : (errcode(ERRCODE_QUERY_CANCELED),
323 : errmsg("COPY from stdin failed: %s",
324 : pq_getmsgstring(cstate->fe_msgbuf))));
325 : break;
326 0 : case PqMsg_Flush:
327 : case PqMsg_Sync:
328 :
329 : /*
330 : * Ignore Flush/Sync for the convenience of client
331 : * libraries (such as libpq) that may send those
332 : * without noticing that the command they just
333 : * sent was COPY.
334 : */
335 0 : goto readmessage;
336 200749 : default:
337 : Assert(false); /* NOT REACHED */
338 : }
339 : }
340 200749 : avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
341 200749 : if (avail > maxread)
342 0 : avail = maxread;
343 200749 : pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
344 200749 : databuf = (char *) databuf + avail;
345 200749 : maxread -= avail;
346 200749 : bytesread += avail;
347 : }
348 201229 : break;
349 15077 : case COPY_CALLBACK:
350 15077 : bytesread = cstate->data_source_cb(databuf, minread, maxread);
351 15077 : break;
352 : }
353 :
354 217011 : return bytesread;
355 : }
356 :
357 :
358 : /*
359 : * These functions do apply some data conversion
360 : */
361 :
362 : /*
363 : * CopyGetInt32 reads an int32 that appears in network byte order
364 : *
365 : * Returns true if OK, false if EOF
366 : */
367 : static inline bool
368 116 : CopyGetInt32(CopyFromState cstate, int32 *val)
369 : {
370 : uint32 buf;
371 :
372 116 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
373 : {
374 0 : *val = 0; /* suppress compiler warning */
375 0 : return false;
376 : }
377 116 : *val = (int32) pg_ntoh32(buf);
378 116 : return true;
379 : }
380 :
381 : /*
382 : * CopyGetInt16 reads an int16 that appears in network byte order
383 : */
384 : static inline bool
385 25 : CopyGetInt16(CopyFromState cstate, int16 *val)
386 : {
387 : uint16 buf;
388 :
389 25 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
390 : {
391 0 : *val = 0; /* suppress compiler warning */
392 0 : return false;
393 : }
394 25 : *val = (int16) pg_ntoh16(buf);
395 25 : return true;
396 : }
397 :
398 :
399 : /*
400 : * Perform encoding conversion on data in 'raw_buf', writing the converted
401 : * data into 'input_buf'.
402 : *
403 : * On entry, there must be some data to convert in 'raw_buf'.
404 : */
405 : static void
406 434312 : CopyConvertBuf(CopyFromState cstate)
407 : {
408 : /*
409 : * If the file and server encoding are the same, no encoding conversion is
410 : * required. However, we still need to verify that the input is valid for
411 : * the encoding.
412 : */
413 434312 : if (!cstate->need_transcoding)
414 : {
415 : /*
416 : * When conversion is not required, input_buf and raw_buf are the
417 : * same. raw_buf_len is the total number of bytes in the buffer, and
418 : * input_buf_len tracks how many of those bytes have already been
419 : * verified.
420 : */
421 434224 : int preverifiedlen = cstate->input_buf_len;
422 434224 : int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
423 : int nverified;
424 :
425 434224 : if (unverifiedlen == 0)
426 : {
427 : /*
428 : * If no more raw data is coming, report the EOF to the caller.
429 : */
430 218216 : if (cstate->raw_reached_eof)
431 1259 : cstate->input_reached_eof = true;
432 218216 : return;
433 : }
434 :
435 : /*
436 : * Verify the new data, including any residual unverified bytes from
437 : * previous round.
438 : */
439 216008 : nverified = pg_encoding_verifymbstr(cstate->file_encoding,
440 216008 : cstate->raw_buf + preverifiedlen,
441 : unverifiedlen);
442 216008 : if (nverified == 0)
443 : {
444 : /*
445 : * Could not verify anything.
446 : *
447 : * If there is no more raw input data coming, it means that there
448 : * was an incomplete multi-byte sequence at the end. Also, if
449 : * there's "enough" input left, we should be able to verify at
450 : * least one character, and a failure to do so means that we've
451 : * hit an invalid byte sequence.
452 : */
453 0 : if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
454 0 : cstate->input_reached_error = true;
455 0 : return;
456 : }
457 216008 : cstate->input_buf_len += nverified;
458 : }
459 : else
460 : {
461 : /*
462 : * Encoding conversion is needed.
463 : */
464 : int nbytes;
465 : unsigned char *src;
466 : int srclen;
467 : unsigned char *dst;
468 : int dstlen;
469 : int convertedlen;
470 :
471 88 : if (RAW_BUF_BYTES(cstate) == 0)
472 : {
473 : /*
474 : * If no more raw data is coming, report the EOF to the caller.
475 : */
476 56 : if (cstate->raw_reached_eof)
477 16 : cstate->input_reached_eof = true;
478 56 : return;
479 : }
480 :
481 : /*
482 : * First, copy down any unprocessed data.
483 : */
484 32 : nbytes = INPUT_BUF_BYTES(cstate);
485 32 : if (nbytes > 0 && cstate->input_buf_index > 0)
486 0 : memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
487 : nbytes);
488 32 : cstate->input_buf_index = 0;
489 32 : cstate->input_buf_len = nbytes;
490 32 : cstate->input_buf[nbytes] = '\0';
491 :
492 32 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
493 32 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
494 32 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
495 32 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
496 :
497 : /*
498 : * Do the conversion. This might stop short, if there is an invalid
499 : * byte sequence in the input. We'll convert as much as we can in
500 : * that case.
501 : *
502 : * Note: Even if we hit an invalid byte sequence, we don't report the
503 : * error until all the valid bytes have been consumed. The input
504 : * might contain an end-of-input marker (\.), and we don't want to
505 : * report an error if the invalid byte sequence is after the
506 : * end-of-input marker. We might unnecessarily convert some data
507 : * after the end-of-input marker as long as it's valid for the
508 : * encoding, but that's harmless.
509 : */
510 32 : convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
511 : cstate->file_encoding,
512 : GetDatabaseEncoding(),
513 : src, srclen,
514 : dst, dstlen,
515 : true);
516 32 : if (convertedlen == 0)
517 : {
518 : /*
519 : * Could not convert anything. If there is no more raw input data
520 : * coming, it means that there was an incomplete multi-byte
521 : * sequence at the end. Also, if there is plenty of input left,
522 : * we should be able to convert at least one character, so a
523 : * failure to do so must mean that we've hit a byte sequence
524 : * that's invalid.
525 : */
526 16 : if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
527 8 : cstate->input_reached_error = true;
528 16 : return;
529 : }
530 16 : cstate->raw_buf_index += convertedlen;
531 16 : cstate->input_buf_len += strlen((char *) dst);
532 : }
533 : }
534 :
535 : /*
536 : * Report an encoding or conversion error.
537 : */
538 : static void
539 8 : CopyConversionError(CopyFromState cstate)
540 : {
541 : Assert(cstate->raw_buf_len > 0);
542 : Assert(cstate->input_reached_error);
543 :
544 8 : if (!cstate->need_transcoding)
545 : {
546 : /*
547 : * Everything up to input_buf_len was successfully verified, and
548 : * input_buf_len points to the invalid or incomplete character.
549 : */
550 0 : report_invalid_encoding(cstate->file_encoding,
551 0 : cstate->raw_buf + cstate->input_buf_len,
552 0 : cstate->raw_buf_len - cstate->input_buf_len);
553 : }
554 : else
555 : {
556 : /*
557 : * raw_buf_index points to the invalid or untranslatable character. We
558 : * let the conversion routine report the error, because it can provide
559 : * a more specific error message than we could here. An earlier call
560 : * to the conversion routine in CopyConvertBuf() detected that there
561 : * is an error, now we call the conversion routine again with
562 : * noError=false, to have it throw the error.
563 : */
564 : unsigned char *src;
565 : int srclen;
566 : unsigned char *dst;
567 : int dstlen;
568 :
569 8 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
570 8 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
571 8 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
572 8 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
573 :
574 8 : (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
575 : cstate->file_encoding,
576 : GetDatabaseEncoding(),
577 : src, srclen,
578 : dst, dstlen,
579 : false);
580 :
581 : /*
582 : * The conversion routine should have reported an error, so this
583 : * should not be reached.
584 : */
585 0 : elog(ERROR, "encoding conversion failed without error");
586 : }
587 : }
588 :
589 : /*
590 : * Load more data from data source to raw_buf.
591 : *
592 : * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
593 : * beginning of the buffer, and we load new data after that.
594 : */
595 : static void
596 217025 : CopyLoadRawBuf(CopyFromState cstate)
597 : {
598 : int nbytes;
599 : int inbytes;
600 :
601 : /*
602 : * In text mode, if encoding conversion is not required, raw_buf and
603 : * input_buf point to the same buffer. Their len/index better agree, too.
604 : */
605 217025 : if (cstate->raw_buf == cstate->input_buf)
606 : {
607 : Assert(!cstate->need_transcoding);
608 : Assert(cstate->raw_buf_index == cstate->input_buf_index);
609 : Assert(cstate->input_buf_len <= cstate->raw_buf_len);
610 : }
611 :
612 : /*
613 : * Copy down the unprocessed data if any.
614 : */
615 217025 : nbytes = RAW_BUF_BYTES(cstate);
616 217025 : if (nbytes > 0 && cstate->raw_buf_index > 0)
617 612 : memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
618 : nbytes);
619 217025 : cstate->raw_buf_len -= cstate->raw_buf_index;
620 217025 : cstate->raw_buf_index = 0;
621 :
622 : /*
623 : * If raw_buf and input_buf are in fact the same buffer, adjust the
624 : * input_buf variables, too.
625 : */
626 217025 : if (cstate->raw_buf == cstate->input_buf)
627 : {
628 216957 : cstate->input_buf_len -= cstate->input_buf_index;
629 216957 : cstate->input_buf_index = 0;
630 : }
631 :
632 : /* Load more data */
633 217025 : inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
634 217025 : 1, RAW_BUF_SIZE - cstate->raw_buf_len);
635 217023 : nbytes += inbytes;
636 217023 : cstate->raw_buf[nbytes] = '\0';
637 217023 : cstate->raw_buf_len = nbytes;
638 :
639 217023 : cstate->bytes_processed += inbytes;
640 217023 : pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
641 :
642 217023 : if (inbytes == 0)
643 978 : cstate->raw_reached_eof = true;
644 217023 : }
645 :
646 : /*
647 : * CopyLoadInputBuf loads some more data into input_buf
648 : *
649 : * On return, at least one more input character is loaded into
650 : * input_buf, or input_reached_eof is set.
651 : *
652 : * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
653 : * of the buffer and then we load more data after that.
654 : */
655 : static void
656 217309 : CopyLoadInputBuf(CopyFromState cstate)
657 : {
658 217309 : int nbytes = INPUT_BUF_BYTES(cstate);
659 :
660 : /*
661 : * The caller has updated input_buf_index to indicate how much of the
662 : * input has been consumed and isn't needed anymore. If input_buf is the
663 : * same physical area as raw_buf, update raw_buf_index accordingly.
664 : */
665 217309 : if (cstate->raw_buf == cstate->input_buf)
666 : {
667 : Assert(!cstate->need_transcoding);
668 : Assert(cstate->input_buf_index >= cstate->raw_buf_index);
669 217269 : cstate->raw_buf_index = cstate->input_buf_index;
670 : }
671 :
672 : for (;;)
673 : {
674 : /* If we now have some unconverted data, try to convert it */
675 434312 : CopyConvertBuf(cstate);
676 :
677 : /* If we now have some more input bytes ready, return them */
678 434312 : if (INPUT_BUF_BYTES(cstate) > nbytes)
679 216024 : return;
680 :
681 : /*
682 : * If we reached an invalid byte sequence, or we're at an incomplete
683 : * multi-byte character but there is no more raw input data, report
684 : * conversion error.
685 : */
686 218288 : if (cstate->input_reached_error)
687 8 : CopyConversionError(cstate);
688 :
689 : /* no more input, and everything has been converted */
690 218280 : if (cstate->input_reached_eof)
691 1275 : break;
692 :
693 : /* Try to load more raw data */
694 : Assert(!cstate->raw_reached_eof);
695 217005 : CopyLoadRawBuf(cstate);
696 : }
697 : }
698 :
699 : /*
700 : * CopyReadBinaryData
701 : *
702 : * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
703 : * and writes them to 'dest'. Returns the number of bytes read (which
704 : * would be less than 'nbytes' only if we reach EOF).
705 : */
706 : static int
707 236 : CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
708 : {
709 236 : int copied_bytes = 0;
710 :
711 236 : if (RAW_BUF_BYTES(cstate) >= nbytes)
712 : {
713 : /* Enough bytes are present in the buffer. */
714 216 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
715 216 : cstate->raw_buf_index += nbytes;
716 216 : copied_bytes = nbytes;
717 : }
718 : else
719 : {
720 : /*
721 : * Not enough bytes in the buffer, so must read from the file. Need
722 : * to loop since 'nbytes' could be larger than the buffer size.
723 : */
724 : do
725 : {
726 : int copy_bytes;
727 :
728 : /* Load more data if buffer is empty. */
729 20 : if (RAW_BUF_BYTES(cstate) == 0)
730 : {
731 20 : CopyLoadRawBuf(cstate);
732 20 : if (cstate->raw_reached_eof)
733 7 : break; /* EOF */
734 : }
735 :
736 : /* Transfer some bytes. */
737 13 : copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
738 13 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
739 13 : cstate->raw_buf_index += copy_bytes;
740 13 : dest += copy_bytes;
741 13 : copied_bytes += copy_bytes;
742 13 : } while (copied_bytes < nbytes);
743 : }
744 :
745 236 : return copied_bytes;
746 : }
747 :
748 : /*
749 : * This function is exposed for use by extensions that read raw fields in the
750 : * next line. See NextCopyFromRawFieldsInternal() for details.
751 : */
752 : bool
753 0 : NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
754 : {
755 0 : return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
756 0 : cstate->opts.format == COPY_FORMAT_CSV);
757 : }
758 :
759 : /*
760 : * Workhorse for NextCopyFromRawFields().
761 : *
762 : * Read raw fields in the next line for COPY FROM in text or csv mode. Return
763 : * false if no more lines.
764 : *
765 : * An internal temporary buffer is returned via 'fields'. It is valid until
766 : * the next call of the function. Since the function returns all raw fields
767 : * in the input file, 'nfields' could be different from the number of columns
768 : * in the relation.
769 : *
770 : * NOTE: force_not_null option are not applied to the returned fields.
771 : *
772 : * We use pg_attribute_always_inline to reduce function call overhead
773 : * and to help compilers to optimize away the 'is_csv' condition when called
774 : * by internal functions such as CopyFromTextLikeOneRow().
775 : */
776 : static pg_attribute_always_inline bool
777 754621 : NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
778 : {
779 : int fldct;
780 754621 : bool done = false;
781 :
782 : /* only available for text or csv input */
783 : Assert(cstate->opts.format == COPY_FORMAT_TEXT ||
784 : cstate->opts.format == COPY_FORMAT_CSV);
785 :
786 : /* on input check that the header line is correct if needed */
787 754621 : if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE)
788 : {
789 : ListCell *cur;
790 : TupleDesc tupDesc;
791 93 : int lines_to_skip = cstate->opts.header_line;
792 :
793 : /* If set to "match", one header line is skipped */
794 93 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
795 50 : lines_to_skip = 1;
796 :
797 93 : tupDesc = RelationGetDescr(cstate->rel);
798 :
799 218 : for (int i = 0; i < lines_to_skip; i++)
800 : {
801 130 : cstate->cur_lineno++;
802 130 : if ((done = CopyReadLine(cstate, is_csv)))
803 5 : break;
804 : }
805 :
806 93 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
807 : {
808 : int fldnum;
809 :
810 50 : if (is_csv)
811 6 : fldct = CopyReadAttributesCSV(cstate);
812 : else
813 44 : fldct = CopyReadAttributesText(cstate);
814 :
815 50 : if (fldct != list_length(cstate->attnumlist))
816 16 : ereport(ERROR,
817 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
818 : errmsg("wrong number of fields in header line: got %d, expected %d",
819 : fldct, list_length(cstate->attnumlist))));
820 :
821 34 : fldnum = 0;
822 104 : foreach(cur, cstate->attnumlist)
823 : {
824 83 : int attnum = lfirst_int(cur);
825 : char *colName;
826 83 : Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
827 :
828 : Assert(fldnum < cstate->max_fields);
829 :
830 83 : colName = cstate->raw_fields[fldnum++];
831 83 : if (colName == NULL)
832 4 : ereport(ERROR,
833 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
834 : errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
835 : fldnum, cstate->opts.null_print, NameStr(attr->attname))));
836 :
837 79 : if (namestrcmp(&attr->attname, colName) != 0)
838 : {
839 9 : ereport(ERROR,
840 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
841 : errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
842 : fldnum, colName, NameStr(attr->attname))));
843 : }
844 : }
845 : }
846 :
847 64 : if (done)
848 5 : return false;
849 : }
850 :
851 754587 : cstate->cur_lineno++;
852 :
853 : /* Actually read the line into memory here */
854 754587 : done = CopyReadLine(cstate, is_csv);
855 :
856 : /*
857 : * EOF at start of line means we're done. If we see EOF after some
858 : * characters, we act as though it was newline followed by EOF, ie,
859 : * process the line and then exit loop on next iteration.
860 : */
861 754569 : if (done && cstate->line_buf.len == 0)
862 984 : return false;
863 :
864 : /* Parse the line into de-escaped field values */
865 753585 : if (is_csv)
866 312 : fldct = CopyReadAttributesCSV(cstate);
867 : else
868 753273 : fldct = CopyReadAttributesText(cstate);
869 :
870 753577 : *fields = cstate->raw_fields;
871 753577 : *nfields = fldct;
872 753577 : return true;
873 : }
874 :
875 : /*
876 : * Read next tuple from file for COPY FROM. Return false if no more tuples.
877 : *
878 : * 'econtext' is used to evaluate default expression for each column that is
879 : * either not read from the file or is using the DEFAULT option of COPY FROM.
880 : * It can be NULL when no default values are used, i.e. when all columns are
881 : * read from the file, and DEFAULT option is unset.
882 : *
883 : * 'values' and 'nulls' arrays must be the same length as columns of the
884 : * relation passed to BeginCopyFrom. This function fills the arrays.
885 : */
886 : bool
887 754646 : NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
888 : Datum *values, bool *nulls)
889 : {
890 : TupleDesc tupDesc;
891 : AttrNumber num_phys_attrs,
892 754646 : num_defaults = cstate->num_defaults;
893 : int i;
894 754646 : int *defmap = cstate->defmap;
895 754646 : ExprState **defexprs = cstate->defexprs;
896 :
897 754646 : tupDesc = RelationGetDescr(cstate->rel);
898 754646 : num_phys_attrs = tupDesc->natts;
899 :
900 : /* Initialize all values for row to NULL */
901 3532192 : MemSet(values, 0, num_phys_attrs * sizeof(Datum));
902 754646 : MemSet(nulls, true, num_phys_attrs * sizeof(bool));
903 850718 : MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
904 :
905 : /* Get one row from source */
906 754646 : if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
907 996 : return false;
908 :
909 : /*
910 : * Now compute and insert any defaults available for the columns not
911 : * provided by the input data. Anything not processed here or above will
912 : * remain NULL.
913 : */
914 793870 : for (i = 0; i < num_defaults; i++)
915 : {
916 : /*
917 : * The caller must supply econtext and have switched into the
918 : * per-tuple memory context in it.
919 : */
920 : Assert(econtext != NULL);
921 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
922 :
923 40345 : values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
924 40345 : &nulls[defmap[i]]);
925 : }
926 :
927 753525 : return true;
928 : }
929 :
930 : /* Implementation of the per-row callback for text format */
931 : bool
932 754151 : CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
933 : bool *nulls)
934 : {
935 754151 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
936 : }
937 :
938 : /* Implementation of the per-row callback for CSV format */
939 : bool
940 470 : CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
941 : bool *nulls)
942 : {
943 470 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
944 : }
945 :
946 : /*
947 : * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
948 : *
949 : * We use pg_attribute_always_inline to reduce function call overhead
950 : * and to help compilers to optimize away the 'is_csv' condition.
951 : */
952 : static pg_attribute_always_inline bool
953 754621 : CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext,
954 : Datum *values, bool *nulls, bool is_csv)
955 : {
956 : TupleDesc tupDesc;
957 : AttrNumber attr_count;
958 754621 : FmgrInfo *in_functions = cstate->in_functions;
959 754621 : Oid *typioparams = cstate->typioparams;
960 754621 : ExprState **defexprs = cstate->defexprs;
961 : char **field_strings;
962 : ListCell *cur;
963 : int fldct;
964 : int fieldno;
965 : char *string;
966 754621 : bool current_row_erroneous = false;
967 :
968 754621 : tupDesc = RelationGetDescr(cstate->rel);
969 754621 : attr_count = list_length(cstate->attnumlist);
970 :
971 : /* read raw fields in the next line */
972 754621 : if (!NextCopyFromRawFieldsInternal(cstate, &field_strings, &fldct, is_csv))
973 989 : return false;
974 :
975 : /* check for overflowing fields */
976 753577 : if (attr_count > 0 && fldct > attr_count)
977 16 : ereport(ERROR,
978 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
979 : errmsg("extra data after last expected column")));
980 :
981 753561 : fieldno = 0;
982 :
983 : /* Loop to read the user attributes on the line. */
984 3444916 : foreach(cur, cstate->attnumlist)
985 : {
986 2691490 : int attnum = lfirst_int(cur);
987 2691490 : int m = attnum - 1;
988 2691490 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
989 :
990 2691490 : if (fieldno >= fldct)
991 16 : ereport(ERROR,
992 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
993 : errmsg("missing data for column \"%s\"",
994 : NameStr(att->attname))));
995 2691474 : string = field_strings[fieldno++];
996 :
997 2691474 : if (cstate->convert_select_flags &&
998 10 : !cstate->convert_select_flags[m])
999 : {
1000 : /* ignore input field, leaving column as NULL */
1001 5 : continue;
1002 : }
1003 :
1004 2691469 : if (is_csv)
1005 : {
1006 619 : if (string == NULL &&
1007 27 : cstate->opts.force_notnull_flags[m])
1008 : {
1009 : /*
1010 : * FORCE_NOT_NULL option is set and column is NULL - convert
1011 : * it to the NULL string.
1012 : */
1013 18 : string = cstate->opts.null_print;
1014 : }
1015 601 : else if (string != NULL && cstate->opts.force_null_flags[m]
1016 32 : && strcmp(string, cstate->opts.null_print) == 0)
1017 : {
1018 : /*
1019 : * FORCE_NULL option is set and column matches the NULL
1020 : * string. It must have been quoted, or otherwise the string
1021 : * would already have been set to NULL. Convert it to NULL as
1022 : * specified.
1023 : */
1024 17 : string = NULL;
1025 : }
1026 : }
1027 :
1028 2691469 : cstate->cur_attname = NameStr(att->attname);
1029 2691469 : cstate->cur_attval = string;
1030 :
1031 2691469 : if (string != NULL)
1032 2688717 : nulls[m] = false;
1033 :
1034 2691469 : if (cstate->defaults[m])
1035 : {
1036 : /* We must have switched into the per-tuple memory context */
1037 : Assert(econtext != NULL);
1038 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
1039 :
1040 38 : values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
1041 : }
1042 :
1043 : /*
1044 : * If ON_ERROR is specified, handle the different options
1045 : */
1046 2691406 : else if (!InputFunctionCallSafe(&in_functions[m],
1047 : string,
1048 2691431 : typioparams[m],
1049 : att->atttypmod,
1050 2691431 : (Node *) cstate->escontext,
1051 2691431 : &values[m]))
1052 : {
1053 : Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP);
1054 :
1055 112 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1056 82 : cstate->num_errors++;
1057 30 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1058 : {
1059 : /*
1060 : * Reset error state so the subsequent InputFunctionCallSafe
1061 : * call (for domain constraint check) can properly report
1062 : * whether it succeeded or failed.
1063 : */
1064 30 : cstate->escontext->error_occurred = false;
1065 :
1066 : Assert(cstate->domain_with_constraint != NULL);
1067 :
1068 : /*
1069 : * For constrained domains, we need an additional
1070 : * InputFunctionCallSafe() to ensure that an error is thrown
1071 : * if the domain constraint rejects null values.
1072 : */
1073 50 : if (!cstate->domain_with_constraint[m] ||
1074 20 : InputFunctionCallSafe(&in_functions[m],
1075 : NULL,
1076 20 : typioparams[m],
1077 : att->atttypmod,
1078 20 : (Node *) cstate->escontext,
1079 20 : &values[m]))
1080 : {
1081 18 : nulls[m] = true;
1082 18 : values[m] = (Datum) 0;
1083 : }
1084 : else
1085 12 : ereport(ERROR,
1086 : errcode(ERRCODE_NOT_NULL_VIOLATION),
1087 : errmsg("domain %s does not allow null values",
1088 : format_type_be(typioparams[m])),
1089 : errdetail("ON_ERROR SET_NULL cannot be applied because column \"%s\" (domain %s) does not accept null values.",
1090 : cstate->cur_attname,
1091 : format_type_be(typioparams[m])),
1092 : errdatatype(typioparams[m]));
1093 :
1094 : /*
1095 : * We count only the number of rows (not fields) where
1096 : * ON_ERROR SET_NULL was applied.
1097 : */
1098 18 : if (!current_row_erroneous)
1099 : {
1100 14 : current_row_erroneous = true;
1101 14 : cstate->num_errors++;
1102 : }
1103 : }
1104 :
1105 100 : if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE)
1106 : {
1107 : /*
1108 : * Since we emit line number and column info in the below
1109 : * notice message, we suppress error context information other
1110 : * than the relation name.
1111 : */
1112 : Assert(!cstate->relname_only);
1113 44 : cstate->relname_only = true;
1114 :
1115 44 : if (cstate->cur_attval)
1116 : {
1117 : char *attval;
1118 :
1119 40 : attval = CopyLimitPrintoutLength(cstate->cur_attval);
1120 :
1121 40 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1122 24 : ereport(NOTICE,
1123 : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1124 : cstate->cur_lineno,
1125 : cstate->cur_attname,
1126 : attval));
1127 16 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1128 16 : ereport(NOTICE,
1129 : errmsg("setting to null due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1130 : cstate->cur_lineno,
1131 : cstate->cur_attname,
1132 : attval));
1133 40 : pfree(attval);
1134 : }
1135 : else
1136 : {
1137 4 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1138 4 : ereport(NOTICE,
1139 : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
1140 : cstate->cur_lineno,
1141 : cstate->cur_attname));
1142 : }
1143 : /* reset relname_only */
1144 44 : cstate->relname_only = false;
1145 : }
1146 :
1147 100 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1148 82 : return true;
1149 18 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1150 18 : continue;
1151 : }
1152 :
1153 2691332 : cstate->cur_attname = NULL;
1154 2691332 : cstate->cur_attval = NULL;
1155 : }
1156 :
1157 : Assert(fieldno == attr_count);
1158 :
1159 753426 : return true;
1160 : }
1161 :
1162 : /* Implementation of the per-row callback for binary format */
1163 : bool
1164 25 : CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
1165 : bool *nulls)
1166 : {
1167 : TupleDesc tupDesc;
1168 : AttrNumber attr_count;
1169 25 : FmgrInfo *in_functions = cstate->in_functions;
1170 25 : Oid *typioparams = cstate->typioparams;
1171 : int16 fld_count;
1172 : ListCell *cur;
1173 :
1174 25 : tupDesc = RelationGetDescr(cstate->rel);
1175 25 : attr_count = list_length(cstate->attnumlist);
1176 :
1177 25 : cstate->cur_lineno++;
1178 :
1179 25 : if (!CopyGetInt16(cstate, &fld_count))
1180 : {
1181 : /* EOF detected (end of file, or protocol-level EOF) */
1182 0 : return false;
1183 : }
1184 :
1185 25 : if (fld_count == -1)
1186 : {
1187 : /*
1188 : * Received EOF marker. Wait for the protocol-level EOF, and complain
1189 : * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1190 : * that we correctly handle CopyFail, if client chooses to send that
1191 : * now. When copying from file, we could ignore the rest of the file
1192 : * like in text mode, but we choose to be consistent with the COPY
1193 : * FROM STDIN case.
1194 : */
1195 : char dummy;
1196 :
1197 7 : if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
1198 0 : ereport(ERROR,
1199 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1200 : errmsg("received copy data after EOF marker")));
1201 7 : return false;
1202 : }
1203 :
1204 18 : if (fld_count != attr_count)
1205 0 : ereport(ERROR,
1206 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1207 : errmsg("row field count is %d, expected %d",
1208 : fld_count, attr_count)));
1209 :
1210 117 : foreach(cur, cstate->attnumlist)
1211 : {
1212 100 : int attnum = lfirst_int(cur);
1213 100 : int m = attnum - 1;
1214 100 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1215 :
1216 100 : cstate->cur_attname = NameStr(att->attname);
1217 199 : values[m] = CopyReadBinaryAttribute(cstate,
1218 100 : &in_functions[m],
1219 100 : typioparams[m],
1220 : att->atttypmod,
1221 : &nulls[m]);
1222 99 : cstate->cur_attname = NULL;
1223 : }
1224 :
1225 17 : return true;
1226 : }
1227 :
1228 : /*
1229 : * Read the next input line and stash it in line_buf.
1230 : *
1231 : * Result is true if read was terminated by EOF, false if terminated
1232 : * by newline. The terminating newline or EOF marker is not included
1233 : * in the final value of line_buf.
1234 : */
1235 : static bool
1236 754717 : CopyReadLine(CopyFromState cstate, bool is_csv)
1237 : {
1238 : bool result;
1239 :
1240 754717 : resetStringInfo(&cstate->line_buf);
1241 754717 : cstate->line_buf_valid = false;
1242 :
1243 : /*
1244 : * Parse data and transfer into line_buf.
1245 : *
1246 : * Because this is performance critical, we inline CopyReadLineText() and
1247 : * pass the boolean parameters as constants to allow the compiler to emit
1248 : * specialized code with fewer branches.
1249 : */
1250 754717 : if (is_csv)
1251 546 : result = CopyReadLineText(cstate, true);
1252 : else
1253 754171 : result = CopyReadLineText(cstate, false);
1254 :
1255 754699 : if (result)
1256 : {
1257 : /*
1258 : * Reached EOF. In protocol version 3, we should ignore anything
1259 : * after \. up to the protocol end of copy data. (XXX maybe better
1260 : * not to treat \. as special?)
1261 : */
1262 993 : if (cstate->copy_src == COPY_FRONTEND)
1263 : {
1264 : int inbytes;
1265 :
1266 : do
1267 : {
1268 523 : inbytes = CopyGetData(cstate, cstate->input_buf,
1269 : 1, INPUT_BUF_SIZE);
1270 523 : } while (inbytes > 0);
1271 523 : cstate->input_buf_index = 0;
1272 523 : cstate->input_buf_len = 0;
1273 523 : cstate->raw_buf_index = 0;
1274 523 : cstate->raw_buf_len = 0;
1275 : }
1276 : }
1277 : else
1278 : {
1279 : /*
1280 : * If we didn't hit EOF, then we must have transferred the EOL marker
1281 : * to line_buf along with the data. Get rid of it.
1282 : */
1283 753706 : switch (cstate->eol_type)
1284 : {
1285 753706 : case EOL_NL:
1286 : Assert(cstate->line_buf.len >= 1);
1287 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1288 753706 : cstate->line_buf.len--;
1289 753706 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1290 753706 : break;
1291 0 : case EOL_CR:
1292 : Assert(cstate->line_buf.len >= 1);
1293 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1294 0 : cstate->line_buf.len--;
1295 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1296 0 : break;
1297 0 : case EOL_CRNL:
1298 : Assert(cstate->line_buf.len >= 2);
1299 : Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1300 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1301 0 : cstate->line_buf.len -= 2;
1302 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1303 0 : break;
1304 0 : case EOL_UNKNOWN:
1305 : /* shouldn't get here */
1306 : Assert(false);
1307 0 : break;
1308 : }
1309 : }
1310 :
1311 : /* Now it's safe to use the buffer in error messages */
1312 754699 : cstate->line_buf_valid = true;
1313 :
1314 754699 : return result;
1315 : }
1316 :
1317 : #ifndef USE_NO_SIMD
1318 : /*
1319 : * Helper function for CopyReadLineText() that uses SIMD instructions to scan
1320 : * the input buffer for special characters. This can be much faster.
1321 : *
1322 : * Note that we disable SIMD for the remainder of the COPY FROM command upon
1323 : * encountering a special character (except for end-of-line characters) or a
1324 : * short line. This is perhaps too conservative, but it should help avoid
1325 : * regressions. It could probably be made more lenient in the future via
1326 : * fine-tuned heuristics.
1327 : */
1328 : static bool
1329 332164 : CopyReadLineTextSIMDHelper(CopyFromState cstate, bool is_csv,
1330 : bool *hit_eof_p, int *input_buf_ptr_p)
1331 : {
1332 : char *copy_input_buf;
1333 : int input_buf_ptr;
1334 : int copy_buf_len;
1335 : bool unique_esc_char; /* for csv, do quote/esc chars differ? */
1336 332164 : bool first = true;
1337 332164 : bool result = false;
1338 332164 : const Vector8 nl_vec = vector8_broadcast('\n');
1339 332164 : const Vector8 cr_vec = vector8_broadcast('\r');
1340 : Vector8 bs_or_quote_vec; /* '\' for text, quote for csv */
1341 : Vector8 esc_vec; /* only for csv */
1342 :
1343 332164 : if (is_csv)
1344 : {
1345 392 : char quote = cstate->opts.quote[0];
1346 392 : char esc = cstate->opts.escape[0];
1347 :
1348 392 : bs_or_quote_vec = vector8_broadcast(quote);
1349 392 : esc_vec = vector8_broadcast(esc);
1350 392 : unique_esc_char = (quote != esc);
1351 : }
1352 : else
1353 : {
1354 331772 : bs_or_quote_vec = vector8_broadcast('\\');
1355 331772 : unique_esc_char = false;
1356 : }
1357 :
1358 : /*
1359 : * For a little extra speed within the loop, we copy some state members
1360 : * into local variables. Note that we need to use a separate local
1361 : * variable for input_buf_ptr so that the REFILL_LINEBUF macro works. We
1362 : * copy its value into the input_buf_ptr_p argument before returning.
1363 : */
1364 332164 : copy_input_buf = cstate->input_buf;
1365 332164 : input_buf_ptr = cstate->input_buf_index;
1366 332164 : copy_buf_len = cstate->input_buf_len;
1367 :
1368 : /*
1369 : * See the corresponding loop in CopyReadLineText() for more information
1370 : * about the purpose of this loop. This one does the same thing using
1371 : * SIMD instructions, although we are quick to bail out to the scalar path
1372 : * if we encounter a special character.
1373 : */
1374 : for (;;)
1375 390136 : {
1376 : Vector8 chunk;
1377 : Vector8 match;
1378 :
1379 : /* Load more data if needed. */
1380 722300 : if (copy_buf_len - input_buf_ptr < sizeof(Vector8))
1381 : {
1382 216815 : REFILL_LINEBUF;
1383 :
1384 216815 : CopyLoadInputBuf(cstate);
1385 : /* update our local variables */
1386 216805 : *hit_eof_p = cstate->input_reached_eof;
1387 216805 : input_buf_ptr = cstate->input_buf_index;
1388 216805 : copy_buf_len = cstate->input_buf_len;
1389 :
1390 : /*
1391 : * If we are completely out of data, break out of the loop,
1392 : * reporting EOF.
1393 : */
1394 216805 : if (INPUT_BUF_BYTES(cstate) <= 0)
1395 : {
1396 586 : result = true;
1397 586 : break;
1398 : }
1399 : }
1400 :
1401 : /*
1402 : * If we still don't have enough data for the SIMD path, fall back to
1403 : * the scalar code. Note that this doesn't necessarily mean we
1404 : * encountered a short line, so we leave cstate->simd_enabled set to
1405 : * true.
1406 : */
1407 721704 : if (copy_buf_len - input_buf_ptr < sizeof(Vector8))
1408 215334 : break;
1409 :
1410 : /*
1411 : * If we made it here, we have at least enough data to fit in a
1412 : * Vector8, so we can use SIMD instructions to scan for special
1413 : * characters.
1414 : */
1415 506370 : vector8_load(&chunk, (const uint8 *) ©_input_buf[input_buf_ptr]);
1416 :
1417 : /*
1418 : * Check for \n, \r, \\ (for text), quotes (for csv), and escapes (for
1419 : * csv, if different from quotes).
1420 : */
1421 506370 : match = vector8_eq(chunk, nl_vec);
1422 506370 : match = vector8_or(match, vector8_eq(chunk, cr_vec));
1423 506370 : match = vector8_or(match, vector8_eq(chunk, bs_or_quote_vec));
1424 506370 : if (unique_esc_char)
1425 21 : match = vector8_or(match, vector8_eq(chunk, esc_vec));
1426 :
1427 : /*
1428 : * If we found a special character, advance to it and hand off to the
1429 : * scalar path. Except for end-of-line characters, we also disable
1430 : * SIMD processing for the remainder of the COPY FROM command.
1431 : */
1432 506370 : if (vector8_is_highbit_set(match))
1433 : {
1434 : uint32 mask;
1435 : char c;
1436 :
1437 116234 : mask = vector8_highbit_mask(match);
1438 116234 : input_buf_ptr += pg_rightmost_one_pos32(mask);
1439 :
1440 : /*
1441 : * Don't disable SIMD if we found \n or \r, else we'd stop using
1442 : * SIMD instructions after the first line. As an exception, we do
1443 : * disable it if this is the first vector we processed, as that
1444 : * means the line is too short for SIMD.
1445 : */
1446 116234 : c = copy_input_buf[input_buf_ptr];
1447 116234 : if (first || (c != '\n' && c != '\r'))
1448 393 : cstate->simd_enabled = false;
1449 :
1450 116234 : break;
1451 : }
1452 :
1453 : /* That chunk was clear of special characters, so we can skip it. */
1454 390136 : input_buf_ptr += sizeof(Vector8);
1455 390136 : first = false;
1456 : }
1457 :
1458 332154 : *input_buf_ptr_p = input_buf_ptr;
1459 332154 : return result;
1460 : }
1461 : #endif /* ! USE_NO_SIMD */
1462 :
/*
 * CopyReadLineText - inner loop of CopyReadLine for text mode
 *
 * Despite the name, this serves both text and CSV format; 'is_csv' selects
 * the CSV-specific quote/escape tracking.  CopyReadLine() passes it as a
 * literal constant so that the inlined copy can be specialized.
 *
 * Scans cstate->input_buf from input_buf_index for the end of the current
 * line, loading more input as needed, and transfers the scanned bytes to
 * line_buf via REFILL_LINEBUF.  When a line terminator is found it has been
 * consumed along with the data (CopyReadLine() strips it from line_buf) and
 * cstate->eol_type is set if it was not known yet.
 *
 * Returns true when the input ended, either because no more data could be
 * loaded or because the end-of-copy marker (\.) was found in text mode, and
 * false when a line terminator was found.
 *
 * NOTE(review): REFILL_LINEBUF, IF_NEED_REFILL_AND_NOT_EOF_CONTINUE and
 * IF_NEED_REFILL_AND_EOF_BREAK are macros defined outside this function.
 * They refer to the local variables declared here by name, so the locals
 * must not be renamed; confirm against the macro definitions before
 * restructuring.
 */
static pg_attribute_always_inline bool
CopyReadLineText(CopyFromState cstate, bool is_csv)
{
	char	   *copy_input_buf;
	int			input_buf_ptr;
	int			copy_buf_len;
	bool		need_data = false;
	bool		hit_eof = false;
	bool		result = false;

	/* CSV variables */
	bool		in_quote = false,
				last_was_esc = false;
	char		quotec = '\0';
	char		escapec = '\0';

	if (is_csv)
	{
		quotec = cstate->opts.quote[0];
		escapec = cstate->opts.escape[0];
		/* ignore special escape processing if it's the same as quotec */
		if (quotec == escapec)
			escapec = '\0';
	}

	/*
	 * The objective of this loop is to transfer the entire next input line
	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
	 * \n) and the end-of-copy marker (\.).
	 *
	 * In CSV mode, \r and \n inside a quoted field are just part of the data
	 * value and are put in line_buf.  We keep just enough state to know if we
	 * are currently in a quoted field or not.
	 *
	 * The input has already been converted to the database encoding.  All
	 * supported server encodings have the property that all bytes in a
	 * multi-byte sequence have the high bit set, so a multibyte character
	 * cannot contain any newline or escape characters embedded in the
	 * multibyte sequence.  Therefore, we can process the input byte-by-byte,
	 * regardless of the encoding.
	 *
	 * For speed, we try to move data from input_buf to line_buf in chunks
	 * rather than one character at a time.  input_buf_ptr points to the next
	 * character to examine; any characters from input_buf_index to
	 * input_buf_ptr have been determined to be part of the line, but not yet
	 * transferred to line_buf.
	 *
	 * For a little extra speed within the loop, we copy some state
	 * information into local variables.  input_buf_ptr could be changed in
	 * the SIMD path, so we must set that one before it.  The others are set
	 * afterwards.
	 */
	input_buf_ptr = cstate->input_buf_index;

	/*
	 * We first try to use SIMD for the task described above, falling back to
	 * the scalar path (i.e., the loop below) if needed.
	 */
#ifndef USE_NO_SIMD
	if (cstate->simd_enabled)
	{
		/*
		 * Using temporary variables seems to encourage the compiler to keep
		 * them in a register, which is beneficial for performance.
		 */
		bool		tmp_hit_eof = false;
		int			tmp_input_buf_ptr = 0;	/* silence compiler warning */

		result = CopyReadLineTextSIMDHelper(cstate, is_csv, &tmp_hit_eof,
											&tmp_input_buf_ptr);
		hit_eof = tmp_hit_eof;
		input_buf_ptr = tmp_input_buf_ptr;

		/* The helper returns true only when it ran out of input entirely */
		if (result)
		{
			/* Transfer any still-uncopied data to line_buf. */
			REFILL_LINEBUF;

			return result;
		}
	}
#endif							/* ! USE_NO_SIMD */

	/*
	 * Set up the remaining locals only now, since the SIMD helper may have
	 * loaded more input and thereby changed input_buf_len.
	 */
	copy_input_buf = cstate->input_buf;
	copy_buf_len = cstate->input_buf_len;

	for (;;)
	{
		int			prev_raw_ptr;
		char		c;

		/*
		 * Load more data if needed.
		 *
		 * TODO: We could just force four bytes of read-ahead and avoid the
		 * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE().  That was
		 * unsafe with the old v2 COPY protocol, but we don't support that
		 * anymore.
		 */
		if (input_buf_ptr >= copy_buf_len || need_data)
		{
			REFILL_LINEBUF;

			CopyLoadInputBuf(cstate);
			/* update our local variables */
			hit_eof = cstate->input_reached_eof;
			input_buf_ptr = cstate->input_buf_index;
			copy_buf_len = cstate->input_buf_len;

			/*
			 * If we are completely out of data, break out of the loop,
			 * reporting EOF.
			 */
			if (INPUT_BUF_BYTES(cstate) <= 0)
			{
				result = true;
				break;
			}
			need_data = false;
		}

		/*
		 * OK to fetch a character.  prev_raw_ptr remembers where it came
		 * from; the end-of-copy check below uses it to tell whether the
		 * marker was preceded by other data on the line.
		 */
		prev_raw_ptr = input_buf_ptr;
		c = copy_input_buf[input_buf_ptr++];

		if (is_csv)
		{
			/*
			 * If character is '\r', we may need to look ahead below.  Force
			 * fetch of the next character if we don't already have it.  We
			 * need to do this before changing CSV state, in case '\r' is also
			 * the quote or escape character.
			 */
			if (c == '\r')
			{
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			}

			/*
			 * Dealing with quotes and escapes here is mildly tricky.  If the
			 * quote char is also the escape char, there's no problem - we
			 * just use the char as a toggle.  If they are different, we need
			 * to ensure that we only take account of an escape inside a
			 * quoted field and immediately preceding a quote char, and not
			 * the second in an escape-escape sequence.
			 */
			if (in_quote && c == escapec)
				last_was_esc = !last_was_esc;
			if (c == quotec && !last_was_esc)
				in_quote = !in_quote;
			if (c != escapec)
				last_was_esc = false;

			/*
			 * Updating the line count for embedded CR and/or LF chars is
			 * necessarily a little fragile - this test is probably about the
			 * best we can do.  (XXX it's arguable whether we should do this
			 * at all --- is cur_lineno a physical or logical count?)
			 */
			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
				cstate->cur_lineno++;
		}

		/* Process \r (in CSV mode, only when outside a quoted field) */
		if (c == '\r' && (!is_csv || !in_quote))
		{
			/* Check for \r\n on first line, _and_ handle \r\n. */
			if (cstate->eol_type == EOL_UNKNOWN ||
				cstate->eol_type == EOL_CRNL)
			{
				/*
				 * If need more data, go back to loop top to load it.
				 *
				 * Note that if we are at EOF, c will wind up as '\0' because
				 * of the guaranteed pad of input_buf.
				 */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);

				/* get next char */
				c = copy_input_buf[input_buf_ptr];

				if (c == '\n')
				{
					input_buf_ptr++;	/* eat newline */
					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
				}
				else
				{
					/* found \r, but no \n */
					if (cstate->eol_type == EOL_CRNL)
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 !is_csv ?
								 errmsg("literal carriage return found in data") :
								 errmsg("unquoted carriage return found in data"),
								 !is_csv ?
								 errhint("Use \"\\r\" to represent carriage return.") :
								 errhint("Use quoted CSV field to represent carriage return.")));

					/*
					 * if we got here, it is the first line and we didn't find
					 * \n, so don't consume the peeked character
					 */
					cstate->eol_type = EOL_CR;
				}
			}
			else if (cstate->eol_type == EOL_NL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !is_csv ?
						 errmsg("literal carriage return found in data") :
						 errmsg("unquoted carriage return found in data"),
						 !is_csv ?
						 errhint("Use \"\\r\" to represent carriage return.") :
						 errhint("Use quoted CSV field to represent carriage return.")));
			/* If we reach here, we have found the line terminator */
			break;
		}

		/* Process \n (in CSV mode, only when outside a quoted field) */
		if (c == '\n' && (!is_csv || !in_quote))
		{
			/* A bare \n conflicts with an established \r or \r\n style */
			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !is_csv ?
						 errmsg("literal newline found in data") :
						 errmsg("unquoted newline found in data"),
						 !is_csv ?
						 errhint("Use \"\\n\" to represent newline.") :
						 errhint("Use quoted CSV field to represent newline.")));
			cstate->eol_type = EOL_NL;	/* in case not set yet */
			/* If we reach here, we have found the line terminator */
			break;
		}

		/*
		 * Process backslash, except in CSV mode where backslash is a normal
		 * character.
		 */
		if (c == '\\' && !is_csv)
		{
			char		c2;

			/*
			 * We need the character after the backslash: go back to the loop
			 * top if it still has to be loaded, or leave the loop if the
			 * input ended right after the backslash (as the macro names
			 * indicate; the macros are defined outside this function).
			 */
			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			IF_NEED_REFILL_AND_EOF_BREAK(0);

			/* -----
			 * get next character
			 * Note: we do not change c so if it isn't \., we can fall
			 * through and continue processing.
			 * -----
			 */
			c2 = copy_input_buf[input_buf_ptr];

			if (c2 == '.')
			{
				input_buf_ptr++;	/* consume the '.' */
				if (cstate->eol_type == EOL_CRNL)
				{
					/* Get the next character */
					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
					/* if hit_eof, c2 will become '\0' */
					c2 = copy_input_buf[input_buf_ptr++];

					/*
					 * With \r\n line endings, the marker must be followed by
					 * \r here; the \n is checked further down.
					 */
					if (c2 == '\n')
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker does not match previous newline style")));
					else if (c2 != '\r')
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker is not alone on its line")));
				}

				/* Get the next character */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
				/* if hit_eof, c2 will become '\0' */
				c2 = copy_input_buf[input_buf_ptr++];

				if (c2 != '\r' && c2 != '\n')
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker is not alone on its line")));

				/* The terminator must agree with the established EOL style */
				if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
					(cstate->eol_type == EOL_CRNL && c2 != '\n') ||
					(cstate->eol_type == EOL_CR && c2 != '\r'))
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker does not match previous newline style")));

				/*
				 * If there is any data on this line before the \., complain.
				 * Such data is either already in line_buf or still pending
				 * in input_buf ahead of the backslash.
				 */
				if (cstate->line_buf.len > 0 ||
					prev_raw_ptr > cstate->input_buf_index)
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker is not alone on its line")));

				/*
				 * Discard the \. and newline, then report EOF.
				 */
				cstate->input_buf_index = input_buf_ptr;
				result = true;	/* report EOF */
				break;
			}
			else
			{
				/*
				 * If we are here, it means we found a backslash followed by
				 * something other than a period.  In non-CSV mode, anything
				 * after a backslash is special, so we skip over that second
				 * character too.  If we didn't do that, \\. would be
				 * considered an end-of-copy marker, while in non-CSV mode it
				 * is a literal backslash followed by a period.
				 */
				input_buf_ptr++;
			}
		}
	}							/* end of outer loop */

	/*
	 * Transfer any still-uncopied data to line_buf.
	 */
	REFILL_LINEBUF;

	return result;
}
1796 :
/*
 * Return the numeric value (0..15) of a hexadecimal digit character.
 *
 * No validation is done: anything that is not a decimal digit is treated as
 * a letter and mapped relative to 'a' after ASCII lower-casing.
 */
static int
GetDecimalFromHex(char hex)
{
	const unsigned char digit = (unsigned char) hex;

	return isdigit(digit) ? hex - '0' : pg_ascii_tolower(digit) - 'a' + 10;
}
1808 :
1809 : /*
1810 : * Parse the current line into separate attributes (fields),
1811 : * performing de-escaping as needed.
1812 : *
1813 : * The input is in line_buf. We use attribute_buf to hold the result
1814 : * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1815 : * string, or NULL when the input matches the null marker string.
1816 : * This array is expanded as necessary.
1817 : *
1818 : * (Note that the caller cannot check for nulls since the returned
1819 : * string would be the post-de-escaping equivalent, which may look
1820 : * the same as some valid data string.)
1821 : *
1822 : * delim is the column delimiter string (must be just one byte for now).
1823 : * null_print is the null marker string. Note that this is compared to
1824 : * the pre-de-escaped input string.
1825 : *
1826 : * The return value is the number of fields actually read.
1827 : */
1828 : static int
1829 753317 : CopyReadAttributesText(CopyFromState cstate)
1830 : {
1831 753317 : char delimc = cstate->opts.delim[0];
1832 : int fieldno;
1833 : char *output_ptr;
1834 : char *cur_ptr;
1835 : char *line_end_ptr;
1836 :
1837 : /*
1838 : * We need a special case for zero-column tables: check that the input
1839 : * line is empty, and return.
1840 : */
1841 753317 : if (cstate->max_fields <= 0)
1842 : {
1843 4 : if (cstate->line_buf.len != 0)
1844 0 : ereport(ERROR,
1845 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1846 : errmsg("extra data after last expected column")));
1847 4 : return 0;
1848 : }
1849 :
1850 753313 : resetStringInfo(&cstate->attribute_buf);
1851 :
1852 : /*
1853 : * The de-escaped attributes will certainly not be longer than the input
1854 : * data line, so we can just force attribute_buf to be large enough and
1855 : * then transfer data without any checks for enough space. We need to do
1856 : * it this way because enlarging attribute_buf mid-stream would invalidate
1857 : * pointers already stored into cstate->raw_fields[].
1858 : */
1859 753313 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1860 4 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1861 753313 : output_ptr = cstate->attribute_buf.data;
1862 :
1863 : /* set pointer variables for loop */
1864 753313 : cur_ptr = cstate->line_buf.data;
1865 753313 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1866 :
1867 : /* Outer loop iterates over fields */
1868 753313 : fieldno = 0;
1869 : for (;;)
1870 1937829 : {
1871 2691142 : bool found_delim = false;
1872 : char *start_ptr;
1873 : char *end_ptr;
1874 : int input_len;
1875 2691142 : bool saw_non_ascii = false;
1876 :
1877 : /* Make sure there is enough space for the next value */
1878 2691142 : if (fieldno >= cstate->max_fields)
1879 : {
1880 28 : cstate->max_fields *= 2;
1881 28 : cstate->raw_fields =
1882 28 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1883 : }
1884 :
1885 : /* Remember start of field on both input and output sides */
1886 2691142 : start_ptr = cur_ptr;
1887 2691142 : cstate->raw_fields[fieldno] = output_ptr;
1888 :
1889 : /*
1890 : * Scan data for field.
1891 : *
1892 : * Note that in this loop, we are scanning to locate the end of field
1893 : * and also speculatively performing de-escaping. Once we find the
1894 : * end-of-field, we can match the raw field contents against the null
1895 : * marker string. Only after that comparison fails do we know that
1896 : * de-escaping is actually the right thing to do; therefore we *must
1897 : * not* throw any syntax errors before we've done the null-marker
1898 : * check.
1899 : */
1900 : for (;;)
1901 13807040 : {
1902 : char c;
1903 :
1904 16498182 : end_ptr = cur_ptr;
1905 16498182 : if (cur_ptr >= line_end_ptr)
1906 753309 : break;
1907 15744873 : c = *cur_ptr++;
1908 15744873 : if (c == delimc)
1909 : {
1910 1937833 : found_delim = true;
1911 1937833 : break;
1912 : }
1913 13807040 : if (c == '\\')
1914 : {
1915 4849 : if (cur_ptr >= line_end_ptr)
1916 0 : break;
1917 4849 : c = *cur_ptr++;
1918 4849 : switch (c)
1919 : {
1920 8 : case '0':
1921 : case '1':
1922 : case '2':
1923 : case '3':
1924 : case '4':
1925 : case '5':
1926 : case '6':
1927 : case '7':
1928 : {
1929 : /* handle \013 */
1930 : int val;
1931 :
1932 8 : val = OCTVALUE(c);
1933 8 : if (cur_ptr < line_end_ptr)
1934 : {
1935 4 : c = *cur_ptr;
1936 4 : if (ISOCTAL(c))
1937 : {
1938 0 : cur_ptr++;
1939 0 : val = (val << 3) + OCTVALUE(c);
1940 0 : if (cur_ptr < line_end_ptr)
1941 : {
1942 0 : c = *cur_ptr;
1943 0 : if (ISOCTAL(c))
1944 : {
1945 0 : cur_ptr++;
1946 0 : val = (val << 3) + OCTVALUE(c);
1947 : }
1948 : }
1949 : }
1950 : }
1951 8 : c = val & 0377;
1952 8 : if (c == '\0' || IS_HIGHBIT_SET(c))
1953 8 : saw_non_ascii = true;
1954 : }
1955 8 : break;
1956 8 : case 'x':
1957 : /* Handle \x3F */
1958 8 : if (cur_ptr < line_end_ptr)
1959 : {
1960 4 : char hexchar = *cur_ptr;
1961 :
1962 4 : if (isxdigit((unsigned char) hexchar))
1963 : {
1964 0 : int val = GetDecimalFromHex(hexchar);
1965 :
1966 0 : cur_ptr++;
1967 0 : if (cur_ptr < line_end_ptr)
1968 : {
1969 0 : hexchar = *cur_ptr;
1970 0 : if (isxdigit((unsigned char) hexchar))
1971 : {
1972 0 : cur_ptr++;
1973 0 : val = (val << 4) + GetDecimalFromHex(hexchar);
1974 : }
1975 : }
1976 0 : c = val & 0xff;
1977 0 : if (c == '\0' || IS_HIGHBIT_SET(c))
1978 0 : saw_non_ascii = true;
1979 : }
1980 : }
1981 8 : break;
1982 0 : case 'b':
1983 0 : c = '\b';
1984 0 : break;
1985 0 : case 'f':
1986 0 : c = '\f';
1987 0 : break;
1988 2033 : case 'n':
1989 2033 : c = '\n';
1990 2033 : break;
1991 0 : case 'r':
1992 0 : c = '\r';
1993 0 : break;
1994 0 : case 't':
1995 0 : c = '\t';
1996 0 : break;
1997 0 : case 'v':
1998 0 : c = '\v';
1999 0 : break;
2000 :
2001 : /*
2002 : * in all other cases, take the char after '\'
2003 : * literally
2004 : */
2005 : }
2006 : }
2007 :
2008 : /* Add c to output string */
2009 13807040 : *output_ptr++ = c;
2010 : }
2011 :
2012 : /* Check whether raw input matched null marker */
2013 2691142 : input_len = end_ptr - start_ptr;
2014 2691142 : if (input_len == cstate->opts.null_print_len &&
2015 163055 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
2016 2730 : cstate->raw_fields[fieldno] = NULL;
2017 : /* Check whether raw input matched default marker */
2018 2688412 : else if (fieldno < list_length(cstate->attnumlist) &&
2019 2688380 : cstate->opts.default_print &&
2020 76 : input_len == cstate->opts.default_print_len &&
2021 20 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
2022 16 : {
2023 : /* fieldno is 0-indexed and attnum is 1-indexed */
2024 20 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
2025 :
2026 20 : if (cstate->defexprs[m] != NULL)
2027 : {
2028 : /* defaults contain entries for all physical attributes */
2029 16 : cstate->defaults[m] = true;
2030 : }
2031 : else
2032 : {
2033 4 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
2034 4 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
2035 :
2036 4 : ereport(ERROR,
2037 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2038 : errmsg("unexpected default marker in COPY data"),
2039 : errdetail("Column \"%s\" has no default value.",
2040 : NameStr(att->attname))));
2041 : }
2042 : }
2043 : else
2044 : {
2045 : /*
2046 : * At this point we know the field is supposed to contain data.
2047 : *
2048 : * If we de-escaped any non-7-bit-ASCII chars, make sure the
2049 : * resulting string is valid data for the db encoding.
2050 : */
2051 2688392 : if (saw_non_ascii)
2052 : {
2053 0 : char *fld = cstate->raw_fields[fieldno];
2054 :
2055 0 : pg_verifymbstr(fld, output_ptr - fld, false);
2056 : }
2057 : }
2058 :
2059 : /* Terminate attribute value in output area */
2060 2691138 : *output_ptr++ = '\0';
2061 :
2062 2691138 : fieldno++;
2063 : /* Done if we hit EOL instead of a delim */
2064 2691138 : if (!found_delim)
2065 753309 : break;
2066 : }
2067 :
2068 : /* Clean up state of attribute_buf */
2069 753309 : output_ptr--;
2070 : Assert(*output_ptr == '\0');
2071 753309 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2072 :
2073 753309 : return fieldno;
2074 : }
2075 :
2076 : /*
2077 : * Parse the current line into separate attributes (fields),
2078 : * performing de-escaping as needed. This has exactly the same API as
2079 : * CopyReadAttributesText, except we parse the fields according to
2080 : * "standard" (i.e. common) CSV usage.
2081 : */
2082 : static int
2083 318 : CopyReadAttributesCSV(CopyFromState cstate)
2084 : {
2085 318 : char delimc = cstate->opts.delim[0];
2086 318 : char quotec = cstate->opts.quote[0];
2087 318 : char escapec = cstate->opts.escape[0];
2088 : int fieldno;
2089 : char *output_ptr;
2090 : char *cur_ptr;
2091 : char *line_end_ptr;
2092 :
2093 : /*
2094 : * We need a special case for zero-column tables: check that the input
2095 : * line is empty, and return.
2096 : */
2097 318 : if (cstate->max_fields <= 0)
2098 : {
2099 0 : if (cstate->line_buf.len != 0)
2100 0 : ereport(ERROR,
2101 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2102 : errmsg("extra data after last expected column")));
2103 0 : return 0;
2104 : }
2105 :
2106 318 : resetStringInfo(&cstate->attribute_buf);
2107 :
2108 : /*
2109 : * The de-escaped attributes will certainly not be longer than the input
2110 : * data line, so we can just force attribute_buf to be large enough and
2111 : * then transfer data without any checks for enough space. We need to do
2112 : * it this way because enlarging attribute_buf mid-stream would invalidate
2113 : * pointers already stored into cstate->raw_fields[].
2114 : */
2115 318 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
2116 0 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
2117 318 : output_ptr = cstate->attribute_buf.data;
2118 :
2119 : /* set pointer variables for loop */
2120 318 : cur_ptr = cstate->line_buf.data;
2121 318 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
2122 :
2123 : /* Outer loop iterates over fields */
2124 318 : fieldno = 0;
2125 : for (;;)
2126 326 : {
2127 644 : bool found_delim = false;
2128 644 : bool saw_quote = false;
2129 : char *start_ptr;
2130 : char *end_ptr;
2131 : int input_len;
2132 :
2133 : /* Make sure there is enough space for the next value */
2134 644 : if (fieldno >= cstate->max_fields)
2135 : {
2136 0 : cstate->max_fields *= 2;
2137 0 : cstate->raw_fields =
2138 0 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
2139 : }
2140 :
2141 : /* Remember start of field on both input and output sides */
2142 644 : start_ptr = cur_ptr;
2143 644 : cstate->raw_fields[fieldno] = output_ptr;
2144 :
2145 : /*
2146 : * Scan data for field,
2147 : *
2148 : * The loop starts in "not quote" mode and then toggles between that
2149 : * and "in quote" mode. The loop exits normally if it is in "not
2150 : * quote" mode and a delimiter or line end is seen.
2151 : */
2152 : for (;;)
2153 137 : {
2154 : char c;
2155 :
2156 : /* Not in quote */
2157 : for (;;)
2158 : {
2159 2045 : end_ptr = cur_ptr;
2160 2045 : if (cur_ptr >= line_end_ptr)
2161 314 : goto endfield;
2162 1731 : c = *cur_ptr++;
2163 : /* unquoted field delimiter */
2164 1731 : if (c == delimc)
2165 : {
2166 330 : found_delim = true;
2167 330 : goto endfield;
2168 : }
2169 : /* start of quoted field (or part of field) */
2170 1401 : if (c == quotec)
2171 : {
2172 137 : saw_quote = true;
2173 137 : break;
2174 : }
2175 : /* Add c to output string */
2176 1264 : *output_ptr++ = c;
2177 : }
2178 :
2179 : /* In quote */
2180 : for (;;)
2181 : {
2182 852 : end_ptr = cur_ptr;
2183 852 : if (cur_ptr >= line_end_ptr)
2184 0 : ereport(ERROR,
2185 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2186 : errmsg("unterminated CSV quoted field")));
2187 :
2188 852 : c = *cur_ptr++;
2189 :
2190 : /* escape within a quoted field */
2191 852 : if (c == escapec)
2192 : {
2193 : /*
2194 : * peek at the next char if available, and escape it if it
2195 : * is an escape char or a quote char
2196 : */
2197 81 : if (cur_ptr < line_end_ptr)
2198 : {
2199 47 : char nextc = *cur_ptr;
2200 :
2201 47 : if (nextc == escapec || nextc == quotec)
2202 : {
2203 16 : *output_ptr++ = nextc;
2204 16 : cur_ptr++;
2205 16 : continue;
2206 : }
2207 : }
2208 : }
2209 :
2210 : /*
2211 : * end of quoted field. Must do this test after testing for
2212 : * escape in case quote char and escape char are the same
2213 : * (which is the common case).
2214 : */
2215 836 : if (c == quotec)
2216 137 : break;
2217 :
2218 : /* Add c to output string */
2219 699 : *output_ptr++ = c;
2220 : }
2221 : }
2222 644 : endfield:
2223 :
2224 : /* Terminate attribute value in output area */
2225 644 : *output_ptr++ = '\0';
2226 :
2227 : /* Check whether raw input matched null marker */
2228 644 : input_len = end_ptr - start_ptr;
2229 644 : if (!saw_quote && input_len == cstate->opts.null_print_len &&
2230 27 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
2231 27 : cstate->raw_fields[fieldno] = NULL;
2232 : /* Check whether raw input matched default marker */
2233 617 : else if (fieldno < list_length(cstate->attnumlist) &&
2234 617 : cstate->opts.default_print &&
2235 94 : input_len == cstate->opts.default_print_len &&
2236 26 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
2237 : {
2238 : /* fieldno is 0-index and attnum is 1-index */
2239 26 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
2240 :
2241 26 : if (cstate->defexprs[m] != NULL)
2242 : {
2243 : /* defaults contain entries for all physical attributes */
2244 22 : cstate->defaults[m] = true;
2245 : }
2246 : else
2247 : {
2248 4 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
2249 4 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
2250 :
2251 4 : ereport(ERROR,
2252 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2253 : errmsg("unexpected default marker in COPY data"),
2254 : errdetail("Column \"%s\" has no default value.",
2255 : NameStr(att->attname))));
2256 : }
2257 : }
2258 :
2259 640 : fieldno++;
2260 : /* Done if we hit EOL instead of a delim */
2261 640 : if (!found_delim)
2262 314 : break;
2263 : }
2264 :
2265 : /* Clean up state of attribute_buf */
2266 314 : output_ptr--;
2267 : Assert(*output_ptr == '\0');
2268 314 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2269 :
2270 314 : return fieldno;
2271 : }
2272 :
2273 :
2274 : /*
2275 : * Read a binary attribute
2276 : */
2277 : static Datum
2278 100 : CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
2279 : Oid typioparam, int32 typmod,
2280 : bool *isnull)
2281 : {
2282 : int32 fld_size;
2283 : Datum result;
2284 :
2285 100 : if (!CopyGetInt32(cstate, &fld_size))
2286 0 : ereport(ERROR,
2287 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2288 : errmsg("unexpected EOF in COPY data")));
2289 100 : if (fld_size == -1)
2290 : {
2291 20 : *isnull = true;
2292 20 : return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2293 : }
2294 80 : if (fld_size < 0)
2295 0 : ereport(ERROR,
2296 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2297 : errmsg("invalid field size")));
2298 :
2299 : /* reset attribute_buf to empty, and load raw data in it */
2300 80 : resetStringInfo(&cstate->attribute_buf);
2301 :
2302 80 : enlargeStringInfo(&cstate->attribute_buf, fld_size);
2303 80 : if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2304 80 : fld_size) != fld_size)
2305 0 : ereport(ERROR,
2306 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2307 : errmsg("unexpected EOF in COPY data")));
2308 :
2309 80 : cstate->attribute_buf.len = fld_size;
2310 80 : cstate->attribute_buf.data[fld_size] = '\0';
2311 :
2312 : /* Call the column type's binary input converter */
2313 80 : result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2314 : typioparam, typmod);
2315 :
2316 : /* Trouble if it didn't eat the whole buffer */
2317 80 : if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2318 1 : ereport(ERROR,
2319 : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
2320 : errmsg("incorrect binary data format")));
2321 :
2322 79 : *isnull = false;
2323 79 : return result;
2324 : }
|