Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * copyfromparse.c
4 : * Parse CSV/text/binary format for COPY FROM.
5 : *
6 : * This file contains routines to parse the text, CSV and binary input
7 : * formats. The main entry point is NextCopyFrom(), which parses the
8 : * next input line and returns it as Datums.
9 : *
10 : * In text/CSV mode, the parsing happens in multiple stages:
11 : *
12 : * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 : * 1. 2. 3. 4.
14 : *
15 : * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 : * places it into 'raw_buf'.
17 : *
18 : * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 : * the data in 'raw_buf' from client to server encoding, placing the
20 : * converted result in 'input_buf'.
21 : *
22 : * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 : * It is responsible for finding the next newline marker, taking quote and
24 : * escape characters into account according to the COPY options. The line
25 : * is copied into 'line_buf', with quotes and escape characters still
26 : * intact.
27 : *
28 : * 4. CopyReadAttributesText/CSV() function takes the input line from
29 : * 'line_buf', and splits it into fields, unescaping the data as required.
30 : * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 : * pointers to each field.
32 : *
33 : * If encoding conversion is not required, a shortcut is taken in step 2 to
34 : * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 : * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 : * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 : * the data is valid in the current encoding.
38 : *
39 : * In binary mode, the pipeline is much simpler. Input is loaded into
40 : * 'raw_buf', and encoding conversion is done in the datatype-specific
41 : * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 : * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
43 : * data when it's passed to the receive function.
44 : *
45 : * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 : * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 : * and 'attribute_buf' are expanded on demand, to hold the longest line
48 : * encountered so far.
49 : *
50 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
51 : * Portions Copyright (c) 1994, Regents of the University of California
52 : *
53 : *
54 : * IDENTIFICATION
55 : * src/backend/commands/copyfromparse.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <ctype.h>
62 : #include <unistd.h>
63 : #include <sys/stat.h>
64 :
65 : #include "commands/copyapi.h"
66 : #include "commands/copyfrom_internal.h"
67 : #include "commands/progress.h"
68 : #include "executor/executor.h"
69 : #include "libpq/libpq.h"
70 : #include "libpq/pqformat.h"
71 : #include "mb/pg_wchar.h"
72 : #include "miscadmin.h"
73 : #include "pgstat.h"
74 : #include "port/pg_bswap.h"
75 : #include "utils/builtins.h"
76 : #include "utils/rel.h"
77 : #include "utils/wait_event.h"
78 :
/* True when 'c' is one of the ASCII octal digits '0'..'7'. */
#define ISOCTAL(c)	((c) >= '0' && (c) <= '7')
/* Numeric value of an ASCII digit character. */
#define OCTVALUE(c)	((c) - '0')
81 :
82 : /*
83 : * These macros centralize code used to process line_buf and input_buf buffers.
84 : * They are macros because they often do continue/break control and to avoid
85 : * function call overhead in tight COPY loops.
86 : *
87 : * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
88 : * prevent the continue/break processing from working. We end the "if (1)"
89 : * with "else ((void) 0)" to ensure the "if" does not unintentionally match
90 : * any "else" in the calling code, and to avoid any compiler warnings about
91 : * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
92 : */
93 :
94 : /*
95 : * This keeps the character read at the top of the loop in the buffer
96 : * even if there is more than one read-ahead.
97 : */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (1) \
{ \
	if (!hit_eof && input_buf_ptr + (extralen) >= copy_buf_len) \
	{ \
		/* rewind to the character fetched at the top of the loop */ \
		input_buf_ptr = prev_raw_ptr; \
		need_data = true; \
		continue; \
	} \
} else ((void) 0)
108 :
109 : /* This consumes the remainder of the buffer and breaks */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (1) \
{ \
	if (hit_eof && input_buf_ptr + (extralen) >= copy_buf_len) \
	{ \
		/* with read-ahead requested, swallow the partial character */ \
		if (extralen) \
			input_buf_ptr = copy_buf_len; \
		/* a backslash right before EOF is treated as a data character */ \
		result = true; \
		break; \
	} \
} else ((void) 0)
122 :
123 : /*
124 : * Transfer any approved data to line_buf; must do this to be sure
125 : * there is some room in input_buf.
126 : */
#define REFILL_LINEBUF \
if (1) \
{ \
	if (cstate->input_buf_index < input_buf_ptr) \
	{ \
		/* append the bytes scanned so far, then mark them consumed */ \
		appendBinaryStringInfo(&cstate->line_buf, \
							   cstate->input_buf + cstate->input_buf_index, \
							   input_buf_ptr - cstate->input_buf_index); \
		cstate->input_buf_index = input_buf_ptr; \
	} \
} else ((void) 0)
138 :
139 : /* NOTE: there's a copy of this in copyto.c */
/* 11-byte signature compared against the start of binary COPY input. */
static const char BinarySignature[11] = {
	'P', 'G', 'C', 'O', 'P', 'Y', '\n', '\377', '\r', '\n', '\0'
};
141 :
142 :
143 : /* non-export function prototypes */
144 : static bool CopyReadLine(CopyFromState cstate, bool is_csv);
145 : static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate,
146 : bool is_csv);
147 : static int CopyReadAttributesText(CopyFromState cstate);
148 : static int CopyReadAttributesCSV(CopyFromState cstate);
149 : static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
150 : Oid typioparam, int32 typmod,
151 : bool *isnull);
152 : static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate,
153 : ExprContext *econtext,
154 : Datum *values,
155 : bool *nulls,
156 : bool is_csv);
157 : static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate,
158 : char ***fields,
159 : int *nfields,
160 : bool is_csv);
161 :
162 :
163 : /* Low-level communications functions */
164 : static int CopyGetData(CopyFromState cstate, void *databuf,
165 : int minread, int maxread);
166 : static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
167 : static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
168 : static void CopyLoadInputBuf(CopyFromState cstate);
169 : static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
170 :
171 : void
172 541 : ReceiveCopyBegin(CopyFromState cstate)
173 : {
174 : StringInfoData buf;
175 541 : int natts = list_length(cstate->attnumlist);
176 541 : int16 format = (cstate->opts.binary ? 1 : 0);
177 : int i;
178 :
179 541 : pq_beginmessage(&buf, PqMsg_CopyInResponse);
180 541 : pq_sendbyte(&buf, format); /* overall format */
181 541 : pq_sendint16(&buf, natts);
182 1944 : for (i = 0; i < natts; i++)
183 1403 : pq_sendint16(&buf, format); /* per-column formats */
184 541 : pq_endmessage(&buf);
185 541 : cstate->copy_src = COPY_FRONTEND;
186 541 : cstate->fe_msgbuf = makeStringInfo();
187 : /* We *must* flush here to ensure FE knows it can send. */
188 541 : pq_flush();
189 541 : }
190 :
191 : void
192 7 : ReceiveCopyBinaryHeader(CopyFromState cstate)
193 : {
194 : char readSig[11];
195 : int32 tmp;
196 :
197 : /* Signature */
198 7 : if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
199 7 : memcmp(readSig, BinarySignature, 11) != 0)
200 0 : ereport(ERROR,
201 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
202 : errmsg("COPY file signature not recognized")));
203 : /* Flags field */
204 7 : if (!CopyGetInt32(cstate, &tmp))
205 0 : ereport(ERROR,
206 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
207 : errmsg("invalid COPY file header (missing flags)")));
208 7 : if ((tmp & (1 << 16)) != 0)
209 0 : ereport(ERROR,
210 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
211 : errmsg("invalid COPY file header (WITH OIDS)")));
212 7 : tmp &= ~(1 << 16);
213 7 : if ((tmp >> 16) != 0)
214 0 : ereport(ERROR,
215 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
216 : errmsg("unrecognized critical flags in COPY file header")));
217 : /* Header extension length */
218 7 : if (!CopyGetInt32(cstate, &tmp) ||
219 7 : tmp < 0)
220 0 : ereport(ERROR,
221 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
222 : errmsg("invalid COPY file header (missing length)")));
223 : /* Skip extension header, if present */
224 7 : while (tmp-- > 0)
225 : {
226 0 : if (CopyReadBinaryData(cstate, readSig, 1) != 1)
227 0 : ereport(ERROR,
228 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
229 : errmsg("invalid COPY file header (wrong length)")));
230 : }
231 7 : }
232 :
233 : /*
234 : * CopyGetData reads data from the source (file or frontend)
235 : *
236 : * We attempt to read at least minread, and at most maxread, bytes from
237 : * the source. The actual number of bytes read is returned; if this is
238 : * less than minread, EOF was detected.
239 : *
240 : * Note: when copying from the frontend, we expect a proper EOF mark per
241 : * protocol; if the frontend simply drops the connection, we raise error.
242 : * It seems unwise to allow the COPY IN to complete normally in that case.
243 : *
244 : * NB: no data conversion is applied here.
245 : */
246 : static int
247 217009 : CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
248 : {
249 217009 : int bytesread = 0;
250 :
251 217009 : switch (cstate->copy_src)
252 : {
253 564 : case COPY_FILE:
254 564 : pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ);
255 564 : bytesread = fread(databuf, 1, maxread, cstate->copy_file);
256 564 : pgstat_report_wait_end();
257 564 : if (ferror(cstate->copy_file))
258 0 : ereport(ERROR,
259 : (errcode_for_file_access(),
260 : errmsg("could not read from COPY file: %m")));
261 564 : if (bytesread == 0)
262 223 : cstate->raw_reached_eof = true;
263 564 : break;
264 201414 : case COPY_FRONTEND:
265 402029 : while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
266 : {
267 : int avail;
268 :
269 401653 : while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
270 : {
271 : /* Try to receive another message */
272 : int mtype;
273 : int maxmsglen;
274 :
275 201038 : readmessage:
276 201038 : HOLD_CANCEL_INTERRUPTS();
277 201038 : pq_startmsgread();
278 201038 : mtype = pq_getbyte();
279 201038 : if (mtype == EOF)
280 0 : ereport(ERROR,
281 : (errcode(ERRCODE_CONNECTION_FAILURE),
282 : errmsg("unexpected EOF on client connection with an open transaction")));
283 : /* Validate message type and set packet size limit */
284 201038 : switch (mtype)
285 : {
286 200615 : case PqMsg_CopyData:
287 200615 : maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
288 200615 : break;
289 421 : case PqMsg_CopyDone:
290 : case PqMsg_CopyFail:
291 : case PqMsg_Flush:
292 : case PqMsg_Sync:
293 421 : maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
294 421 : break;
295 2 : default:
296 2 : ereport(ERROR,
297 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
298 : errmsg("unexpected message type 0x%02X during COPY from stdin",
299 : mtype)));
300 : maxmsglen = 0; /* keep compiler quiet */
301 : break;
302 : }
303 : /* Now collect the message body */
304 201036 : if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
305 0 : ereport(ERROR,
306 : (errcode(ERRCODE_CONNECTION_FAILURE),
307 : errmsg("unexpected EOF on client connection with an open transaction")));
308 201036 : RESUME_CANCEL_INTERRUPTS();
309 : /* ... and process it */
310 201036 : switch (mtype)
311 : {
312 200615 : case PqMsg_CopyData:
313 200615 : break;
314 421 : case PqMsg_CopyDone:
315 : /* COPY IN correctly terminated by frontend */
316 421 : cstate->raw_reached_eof = true;
317 421 : return bytesread;
318 0 : case PqMsg_CopyFail:
319 0 : ereport(ERROR,
320 : (errcode(ERRCODE_QUERY_CANCELED),
321 : errmsg("COPY from stdin failed: %s",
322 : pq_getmsgstring(cstate->fe_msgbuf))));
323 : break;
324 0 : case PqMsg_Flush:
325 : case PqMsg_Sync:
326 :
327 : /*
328 : * Ignore Flush/Sync for the convenience of client
329 : * libraries (such as libpq) that may send those
330 : * without noticing that the command they just
331 : * sent was COPY.
332 : */
333 0 : goto readmessage;
334 200615 : default:
335 : Assert(false); /* NOT REACHED */
336 : }
337 : }
338 200615 : avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
339 200615 : if (avail > maxread)
340 0 : avail = maxread;
341 200615 : pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
342 200615 : databuf = (char *) databuf + avail;
343 200615 : maxread -= avail;
344 200615 : bytesread += avail;
345 : }
346 200991 : break;
347 15031 : case COPY_CALLBACK:
348 15031 : bytesread = cstate->data_source_cb(databuf, minread, maxread);
349 15031 : break;
350 : }
351 :
352 216586 : return bytesread;
353 : }
354 :
355 :
356 : /*
357 : * These functions do apply some data conversion
358 : */
359 :
360 : /*
361 : * CopyGetInt32 reads an int32 that appears in network byte order
362 : *
363 : * Returns true if OK, false if EOF
364 : */
365 : static inline bool
366 93 : CopyGetInt32(CopyFromState cstate, int32 *val)
367 : {
368 : uint32 buf;
369 :
370 93 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
371 : {
372 0 : *val = 0; /* suppress compiler warning */
373 0 : return false;
374 : }
375 93 : *val = (int32) pg_ntoh32(buf);
376 93 : return true;
377 : }
378 :
379 : /*
380 : * CopyGetInt16 reads an int16 that appears in network byte order
381 : */
382 : static inline bool
383 21 : CopyGetInt16(CopyFromState cstate, int16 *val)
384 : {
385 : uint16 buf;
386 :
387 21 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
388 : {
389 0 : *val = 0; /* suppress compiler warning */
390 0 : return false;
391 : }
392 21 : *val = (int16) pg_ntoh16(buf);
393 21 : return true;
394 : }
395 :
396 :
397 : /*
398 : * Perform encoding conversion on data in 'raw_buf', writing the converted
399 : * data into 'input_buf'.
400 : *
401 : * On entry, there must be some data to convert in 'raw_buf'.
402 : */
403 : static void
404 433133 : CopyConvertBuf(CopyFromState cstate)
405 : {
406 : /*
407 : * If the file and server encoding are the same, no encoding conversion is
408 : * required. However, we still need to verify that the input is valid for
409 : * the encoding.
410 : */
411 433133 : if (!cstate->need_transcoding)
412 : {
413 : /*
414 : * When conversion is not required, input_buf and raw_buf are the
415 : * same. raw_buf_len is the total number of bytes in the buffer, and
416 : * input_buf_len tracks how many of those bytes have already been
417 : * verified.
418 : */
419 433067 : int preverifiedlen = cstate->input_buf_len;
420 433067 : int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
421 : int nverified;
422 :
423 433067 : if (unverifiedlen == 0)
424 : {
425 : /*
426 : * If no more raw data is coming, report the EOF to the caller.
427 : */
428 217313 : if (cstate->raw_reached_eof)
429 779 : cstate->input_reached_eof = true;
430 217313 : return;
431 : }
432 :
433 : /*
434 : * Verify the new data, including any residual unverified bytes from
435 : * previous round.
436 : */
437 215754 : nverified = pg_encoding_verifymbstr(cstate->file_encoding,
438 215754 : cstate->raw_buf + preverifiedlen,
439 : unverifiedlen);
440 215754 : if (nverified == 0)
441 : {
442 : /*
443 : * Could not verify anything.
444 : *
445 : * If there is no more raw input data coming, it means that there
446 : * was an incomplete multi-byte sequence at the end. Also, if
447 : * there's "enough" input left, we should be able to verify at
448 : * least one character, and a failure to do so means that we've
449 : * hit an invalid byte sequence.
450 : */
451 0 : if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
452 0 : cstate->input_reached_error = true;
453 0 : return;
454 : }
455 215754 : cstate->input_buf_len += nverified;
456 : }
457 : else
458 : {
459 : /*
460 : * Encoding conversion is needed.
461 : */
462 : int nbytes;
463 : unsigned char *src;
464 : int srclen;
465 : unsigned char *dst;
466 : int dstlen;
467 : int convertedlen;
468 :
469 66 : if (RAW_BUF_BYTES(cstate) == 0)
470 : {
471 : /*
472 : * If no more raw data is coming, report the EOF to the caller.
473 : */
474 42 : if (cstate->raw_reached_eof)
475 12 : cstate->input_reached_eof = true;
476 42 : return;
477 : }
478 :
479 : /*
480 : * First, copy down any unprocessed data.
481 : */
482 24 : nbytes = INPUT_BUF_BYTES(cstate);
483 24 : if (nbytes > 0 && cstate->input_buf_index > 0)
484 0 : memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
485 : nbytes);
486 24 : cstate->input_buf_index = 0;
487 24 : cstate->input_buf_len = nbytes;
488 24 : cstate->input_buf[nbytes] = '\0';
489 :
490 24 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
491 24 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
492 24 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
493 24 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
494 :
495 : /*
496 : * Do the conversion. This might stop short, if there is an invalid
497 : * byte sequence in the input. We'll convert as much as we can in
498 : * that case.
499 : *
500 : * Note: Even if we hit an invalid byte sequence, we don't report the
501 : * error until all the valid bytes have been consumed. The input
502 : * might contain an end-of-input marker (\.), and we don't want to
503 : * report an error if the invalid byte sequence is after the
504 : * end-of-input marker. We might unnecessarily convert some data
505 : * after the end-of-input marker as long as it's valid for the
506 : * encoding, but that's harmless.
507 : */
508 24 : convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
509 : cstate->file_encoding,
510 : GetDatabaseEncoding(),
511 : src, srclen,
512 : dst, dstlen,
513 : true);
514 24 : if (convertedlen == 0)
515 : {
516 : /*
517 : * Could not convert anything. If there is no more raw input data
518 : * coming, it means that there was an incomplete multi-byte
519 : * sequence at the end. Also, if there is plenty of input left,
520 : * we should be able to convert at least one character, so a
521 : * failure to do so must mean that we've hit a byte sequence
522 : * that's invalid.
523 : */
524 12 : if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
525 6 : cstate->input_reached_error = true;
526 12 : return;
527 : }
528 12 : cstate->raw_buf_index += convertedlen;
529 12 : cstate->input_buf_len += strlen((char *) dst);
530 : }
531 : }
532 :
533 : /*
534 : * Report an encoding or conversion error.
535 : */
536 : static void
537 6 : CopyConversionError(CopyFromState cstate)
538 : {
539 : Assert(cstate->raw_buf_len > 0);
540 : Assert(cstate->input_reached_error);
541 :
542 6 : if (!cstate->need_transcoding)
543 : {
544 : /*
545 : * Everything up to input_buf_len was successfully verified, and
546 : * input_buf_len points to the invalid or incomplete character.
547 : */
548 0 : report_invalid_encoding(cstate->file_encoding,
549 0 : cstate->raw_buf + cstate->input_buf_len,
550 0 : cstate->raw_buf_len - cstate->input_buf_len);
551 : }
552 : else
553 : {
554 : /*
555 : * raw_buf_index points to the invalid or untranslatable character. We
556 : * let the conversion routine report the error, because it can provide
557 : * a more specific error message than we could here. An earlier call
558 : * to the conversion routine in CopyConvertBuf() detected that there
559 : * is an error, now we call the conversion routine again with
560 : * noError=false, to have it throw the error.
561 : */
562 : unsigned char *src;
563 : int srclen;
564 : unsigned char *dst;
565 : int dstlen;
566 :
567 6 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
568 6 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
569 6 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
570 6 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
571 :
572 6 : (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
573 : cstate->file_encoding,
574 : GetDatabaseEncoding(),
575 : src, srclen,
576 : dst, dstlen,
577 : false);
578 :
579 : /*
580 : * The conversion routine should have reported an error, so this
581 : * should not be reached.
582 : */
583 0 : elog(ERROR, "encoding conversion failed without error");
584 : }
585 : }
586 :
587 : /*
588 : * Load more data from data source to raw_buf.
589 : *
590 : * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
591 : * beginning of the buffer, and we load new data after that.
592 : */
593 : static void
594 216588 : CopyLoadRawBuf(CopyFromState cstate)
595 : {
596 : int nbytes;
597 : int inbytes;
598 :
599 : /*
600 : * In text mode, if encoding conversion is not required, raw_buf and
601 : * input_buf point to the same buffer. Their len/index better agree, too.
602 : */
603 216588 : if (cstate->raw_buf == cstate->input_buf)
604 : {
605 : Assert(!cstate->need_transcoding);
606 : Assert(cstate->raw_buf_index == cstate->input_buf_index);
607 : Assert(cstate->input_buf_len <= cstate->raw_buf_len);
608 : }
609 :
610 : /*
611 : * Copy down the unprocessed data if any.
612 : */
613 216588 : nbytes = RAW_BUF_BYTES(cstate);
614 216588 : if (nbytes > 0 && cstate->raw_buf_index > 0)
615 0 : memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
616 : nbytes);
617 216588 : cstate->raw_buf_len -= cstate->raw_buf_index;
618 216588 : cstate->raw_buf_index = 0;
619 :
620 : /*
621 : * If raw_buf and input_buf are in fact the same buffer, adjust the
622 : * input_buf variables, too.
623 : */
624 216588 : if (cstate->raw_buf == cstate->input_buf)
625 : {
626 216534 : cstate->input_buf_len -= cstate->input_buf_index;
627 216534 : cstate->input_buf_index = 0;
628 : }
629 :
630 : /* Load more data */
631 216588 : inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
632 216588 : 1, RAW_BUF_SIZE - cstate->raw_buf_len);
633 216586 : nbytes += inbytes;
634 216586 : cstate->raw_buf[nbytes] = '\0';
635 216586 : cstate->raw_buf_len = nbytes;
636 :
637 216586 : cstate->bytes_processed += inbytes;
638 216586 : pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
639 :
640 216586 : if (inbytes == 0)
641 802 : cstate->raw_reached_eof = true;
642 216586 : }
643 :
644 : /*
645 : * CopyLoadInputBuf loads some more data into input_buf
646 : *
647 : * On return, at least one more input character is loaded into
648 : * input_buf, or input_reached_eof is set.
649 : *
650 : * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
651 : * of the buffer and then we load more data after that.
652 : */
653 : static void
654 216565 : CopyLoadInputBuf(CopyFromState cstate)
655 : {
656 216565 : int nbytes = INPUT_BUF_BYTES(cstate);
657 :
658 : /*
659 : * The caller has updated input_buf_index to indicate how much of the
660 : * input has been consumed and isn't needed anymore. If input_buf is the
661 : * same physical area as raw_buf, update raw_buf_index accordingly.
662 : */
663 216565 : if (cstate->raw_buf == cstate->input_buf)
664 : {
665 : Assert(!cstate->need_transcoding);
666 : Assert(cstate->input_buf_index >= cstate->raw_buf_index);
667 216535 : cstate->raw_buf_index = cstate->input_buf_index;
668 : }
669 :
670 : for (;;)
671 : {
672 : /* If we now have some unconverted data, try to convert it */
673 433133 : CopyConvertBuf(cstate);
674 :
675 : /* If we now have some more input bytes ready, return them */
676 433133 : if (INPUT_BUF_BYTES(cstate) > nbytes)
677 215766 : return;
678 :
679 : /*
680 : * If we reached an invalid byte sequence, or we're at an incomplete
681 : * multi-byte character but there is no more raw input data, report
682 : * conversion error.
683 : */
684 217367 : if (cstate->input_reached_error)
685 6 : CopyConversionError(cstate);
686 :
687 : /* no more input, and everything has been converted */
688 217361 : if (cstate->input_reached_eof)
689 791 : break;
690 :
691 : /* Try to load more raw data */
692 : Assert(!cstate->raw_reached_eof);
693 216570 : CopyLoadRawBuf(cstate);
694 : }
695 : }
696 :
697 : /*
698 : * CopyReadBinaryData
699 : *
700 : * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
701 : * and writes them to 'dest'. Returns the number of bytes read (which
702 : * would be less than 'nbytes' only if we reach EOF).
703 : */
704 : static int
705 191 : CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
706 : {
707 191 : int copied_bytes = 0;
708 :
709 191 : if (RAW_BUF_BYTES(cstate) >= nbytes)
710 : {
711 : /* Enough bytes are present in the buffer. */
712 173 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
713 173 : cstate->raw_buf_index += nbytes;
714 173 : copied_bytes = nbytes;
715 : }
716 : else
717 : {
718 : /*
719 : * Not enough bytes in the buffer, so must read from the file. Need
720 : * to loop since 'nbytes' could be larger than the buffer size.
721 : */
722 : do
723 : {
724 : int copy_bytes;
725 :
726 : /* Load more data if buffer is empty. */
727 18 : if (RAW_BUF_BYTES(cstate) == 0)
728 : {
729 18 : CopyLoadRawBuf(cstate);
730 18 : if (cstate->raw_reached_eof)
731 6 : break; /* EOF */
732 : }
733 :
734 : /* Transfer some bytes. */
735 12 : copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
736 12 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
737 12 : cstate->raw_buf_index += copy_bytes;
738 12 : dest += copy_bytes;
739 12 : copied_bytes += copy_bytes;
740 12 : } while (copied_bytes < nbytes);
741 : }
742 :
743 191 : return copied_bytes;
744 : }
745 :
746 : /*
747 : * This function is exposed for use by extensions that read raw fields in the
748 : * next line. See NextCopyFromRawFieldsInternal() for details.
749 : */
750 : bool
751 0 : NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
752 : {
753 0 : return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
754 0 : cstate->opts.csv_mode);
755 : }
756 :
757 : /*
758 : * Workhorse for NextCopyFromRawFields().
759 : *
760 : * Read raw fields in the next line for COPY FROM in text or csv mode. Return
761 : * false if no more lines.
762 : *
763 : * An internal temporary buffer is returned via 'fields'. It is valid until
764 : * the next call of the function. Since the function returns all raw fields
765 : * in the input file, 'nfields' could be different from the number of columns
766 : * in the relation.
767 : *
768 : * NOTE: the force_not_null option is not applied to the returned fields.
769 : *
770 : * We use pg_attribute_always_inline to reduce function call overhead
771 : * and to help compilers to optimize away the 'is_csv' condition when called
772 : * by internal functions such as CopyFromTextLikeOneRow().
773 : */
774 : static pg_attribute_always_inline bool
775 634773 : NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
776 : {
777 : int fldct;
778 634773 : bool done = false;
779 :
780 : /* only available for text or csv input */
781 : Assert(!cstate->opts.binary);
782 :
783 : /* on input check that the header line is correct if needed */
784 634773 : if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE)
785 : {
786 : ListCell *cur;
787 : TupleDesc tupDesc;
788 74 : int lines_to_skip = cstate->opts.header_line;
789 :
790 : /* If set to "match", one header line is skipped */
791 74 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
792 38 : lines_to_skip = 1;
793 :
794 74 : tupDesc = RelationGetDescr(cstate->rel);
795 :
796 173 : for (int i = 0; i < lines_to_skip; i++)
797 : {
798 103 : cstate->cur_lineno++;
799 103 : if ((done = CopyReadLine(cstate, is_csv)))
800 4 : break;
801 : }
802 :
803 74 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
804 : {
805 : int fldnum;
806 :
807 38 : if (is_csv)
808 5 : fldct = CopyReadAttributesCSV(cstate);
809 : else
810 33 : fldct = CopyReadAttributesText(cstate);
811 :
812 38 : if (fldct != list_length(cstate->attnumlist))
813 12 : ereport(ERROR,
814 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
815 : errmsg("wrong number of fields in header line: got %d, expected %d",
816 : fldct, list_length(cstate->attnumlist))));
817 :
818 26 : fldnum = 0;
819 79 : foreach(cur, cstate->attnumlist)
820 : {
821 63 : int attnum = lfirst_int(cur);
822 : char *colName;
823 63 : Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
824 :
825 : Assert(fldnum < cstate->max_fields);
826 :
827 63 : colName = cstate->raw_fields[fldnum++];
828 63 : if (colName == NULL)
829 3 : ereport(ERROR,
830 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
831 : errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
832 : fldnum, cstate->opts.null_print, NameStr(attr->attname))));
833 :
834 60 : if (namestrcmp(&attr->attname, colName) != 0)
835 : {
836 7 : ereport(ERROR,
837 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
838 : errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
839 : fldnum, colName, NameStr(attr->attname))));
840 : }
841 : }
842 : }
843 :
844 52 : if (done)
845 4 : return false;
846 : }
847 :
848 634747 : cstate->cur_lineno++;
849 :
850 : /* Actually read the line into memory here */
851 634747 : done = CopyReadLine(cstate, is_csv);
852 :
853 : /*
854 : * EOF at start of line means we're done. If we see EOF after some
855 : * characters, we act as though it was newline followed by EOF, ie,
856 : * process the line and then exit loop on next iteration.
857 : */
858 634733 : if (done && cstate->line_buf.len == 0)
859 831 : return false;
860 :
861 : /* Parse the line into de-escaped field values */
862 633902 : if (is_csv)
863 252 : fldct = CopyReadAttributesCSV(cstate);
864 : else
865 633650 : fldct = CopyReadAttributesText(cstate);
866 :
867 633896 : *fields = cstate->raw_fields;
868 633896 : *nfields = fldct;
869 633896 : return true;
870 : }
871 :
872 : /*
873 : * Read next tuple from file for COPY FROM. Return false if no more tuples.
874 : *
875 : * 'econtext' is used to evaluate default expression for each column that is
876 : * either not read from the file or is using the DEFAULT option of COPY FROM.
877 : * It can be NULL when no default values are used, i.e. when all columns are
878 : * read from the file, and DEFAULT option is unset.
879 : *
880 : * 'values' and 'nulls' arrays must be the same length as columns of the
881 : * relation passed to BeginCopyFrom. This function fills the arrays.
882 : */
883 : bool
884 634794 : NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
885 : Datum *values, bool *nulls)
886 : {
887 : TupleDesc tupDesc;
888 : AttrNumber num_phys_attrs,
889 634794 : num_defaults = cstate->num_defaults;
890 : int i;
891 634794 : int *defmap = cstate->defmap;
892 634794 : ExprState **defexprs = cstate->defexprs;
893 :
894 634794 : tupDesc = RelationGetDescr(cstate->rel);
895 634794 : num_phys_attrs = tupDesc->natts;
896 :
897 : /* Initialize all values for row to NULL */
898 2971698 : MemSet(values, 0, num_phys_attrs * sizeof(Datum));
899 634794 : MemSet(nulls, true, num_phys_attrs * sizeof(bool));
900 706848 : MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
901 :
902 : /* Get one row from source */
903 634794 : if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
904 841 : return false;
905 :
906 : /*
907 : * Now compute and insert any defaults available for the columns not
908 : * provided by the input data. Anything not processed here or above will
909 : * remain NULL.
910 : */
911 664123 : for (i = 0; i < num_defaults; i++)
912 : {
913 : /*
914 : * The caller must supply econtext and have switched into the
915 : * per-tuple memory context in it.
916 : */
917 : Assert(econtext != NULL);
918 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
919 :
920 30265 : values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
921 30265 : &nulls[defmap[i]]);
922 : }
923 :
924 633858 : return true;
925 : }
926 :
927 : /* Implementation of the per-row callback for text format */
928 : bool
929 634396 : CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
930 : bool *nulls)
931 : {
932 634396 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
933 : }
934 :
935 : /* Implementation of the per-row callback for CSV format */
936 : bool
937 377 : CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
938 : bool *nulls)
939 : {
940 377 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
941 : }
942 :
/*
 * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
 *
 * Reads the next line's raw fields and converts each one into a Datum for
 * the corresponding column in cstate->attnumlist, storing the results in
 * 'values' and 'nulls' (indexed by attribute number minus one).
 *
 * Returns false when NextCopyFromRawFieldsInternal() reports no more input.
 * Returns true otherwise, including when a conversion failure is handled by
 * ON_ERROR IGNORE; in that case the function returns as soon as the failure
 * is seen, leaving the remaining columns unprocessed.
 *
 * Raises an error if the line has more fields than expected columns, if a
 * column has no corresponding field, or if ON_ERROR SET_NULL cannot store a
 * null because the column's domain rejects it.
 *
 * 'econtext' is only needed when a field matched the default marker; the
 * caller must then be running in its per-tuple memory context.
 *
 * We use pg_attribute_always_inline to reduce function call overhead
 * and to help compilers to optimize away the 'is_csv' condition.
 */
static pg_attribute_always_inline bool
CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext,
					   Datum *values, bool *nulls, bool is_csv)
{
	TupleDesc	tupDesc;
	AttrNumber	attr_count;
	FmgrInfo   *in_functions = cstate->in_functions;
	Oid		   *typioparams = cstate->typioparams;
	ExprState **defexprs = cstate->defexprs;
	char	  **field_strings;
	ListCell   *cur;
	int			fldct;
	int			fieldno;
	char	   *string;

	/* true once this row has been counted in num_errors for SET_NULL */
	bool		current_row_erroneous = false;

	tupDesc = RelationGetDescr(cstate->rel);
	attr_count = list_length(cstate->attnumlist);

	/* read raw fields in the next line */
	if (!NextCopyFromRawFieldsInternal(cstate, &field_strings, &fldct, is_csv))
		return false;

	/* check for overflowing fields */
	if (attr_count > 0 && fldct > attr_count)
		ereport(ERROR,
				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
				 errmsg("extra data after last expected column")));

	fieldno = 0;

	/* Loop to read the user attributes on the line. */
	foreach(cur, cstate->attnumlist)
	{
		int			attnum = lfirst_int(cur);
		int			m = attnum - 1; /* 0-based index into per-column arrays */
		Form_pg_attribute att = TupleDescAttr(tupDesc, m);

		/* Ran out of input fields before running out of columns */
		if (fieldno >= fldct)
			ereport(ERROR,
					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
					 errmsg("missing data for column \"%s\"",
							NameStr(att->attname))));
		string = field_strings[fieldno++];

		if (cstate->convert_select_flags &&
			!cstate->convert_select_flags[m])
		{
			/* ignore input field, leaving column as NULL */
			continue;
		}

		if (is_csv)
		{
			if (string == NULL &&
				cstate->opts.force_notnull_flags[m])
			{
				/*
				 * FORCE_NOT_NULL option is set and column is NULL - convert
				 * it to the NULL string.
				 */
				string = cstate->opts.null_print;
			}
			else if (string != NULL && cstate->opts.force_null_flags[m]
					 && strcmp(string, cstate->opts.null_print) == 0)
			{
				/*
				 * FORCE_NULL option is set and column matches the NULL
				 * string.  It must have been quoted, or otherwise the string
				 * would already have been set to NULL.  Convert it to NULL as
				 * specified.
				 */
				string = NULL;
			}
		}

		/*
		 * Record which column and raw value are being converted; these are
		 * used in the notice messages below and cleared again once the
		 * column has been processed successfully.
		 */
		cstate->cur_attname = NameStr(att->attname);
		cstate->cur_attval = string;

		if (string != NULL)
			nulls[m] = false;

		if (cstate->defaults[m])
		{
			/* We must have switched into the per-tuple memory context */
			Assert(econtext != NULL);
			Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);

			/* Field was flagged as the default marker: evaluate the default */
			values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
		}

		/*
		 * If ON_ERROR is specified, handle the different options
		 */
		else if (!InputFunctionCallSafe(&in_functions[m],
										string,
										typioparams[m],
										att->atttypmod,
										(Node *) cstate->escontext,
										&values[m]))
		{
			Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP);

			if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
				cstate->num_errors++;
			else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
			{
				/*
				 * Reset error state so the subsequent InputFunctionCallSafe
				 * call (for domain constraint check) can properly report
				 * whether it succeeded or failed.
				 */
				cstate->escontext->error_occurred = false;

				Assert(cstate->domain_with_constraint != NULL);

				/*
				 * For constrained domains, we need an additional
				 * InputFunctionCallSafe() to ensure that an error is thrown
				 * if the domain constraint rejects null values.
				 */
				if (!cstate->domain_with_constraint[m] ||
					InputFunctionCallSafe(&in_functions[m],
										  NULL,
										  typioparams[m],
										  att->atttypmod,
										  (Node *) cstate->escontext,
										  &values[m]))
				{
					nulls[m] = true;
					values[m] = (Datum) 0;
				}
				else
					ereport(ERROR,
							errcode(ERRCODE_NOT_NULL_VIOLATION),
							errmsg("domain %s does not allow null values",
								   format_type_be(typioparams[m])),
							errdetail("ON_ERROR SET_NULL cannot be applied because column \"%s\" (domain %s) does not accept null values.",
									  cstate->cur_attname,
									  format_type_be(typioparams[m])),
							errdatatype(typioparams[m]));

				/*
				 * We count only the number of rows (not fields) where
				 * ON_ERROR SET_NULL was applied.
				 */
				if (!current_row_erroneous)
				{
					current_row_erroneous = true;
					cstate->num_errors++;
				}
			}

			if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE)
			{
				/*
				 * Since we emit line number and column info in the below
				 * notice message, we suppress error context information other
				 * than the relation name.
				 */
				Assert(!cstate->relname_only);
				cstate->relname_only = true;

				if (cstate->cur_attval)
				{
					char	   *attval;

					/* shortened copy of the offending value for the message */
					attval = CopyLimitPrintoutLength(cstate->cur_attval);

					if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
						ereport(NOTICE,
								errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
									   cstate->cur_lineno,
									   cstate->cur_attname,
									   attval));
					else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
						ereport(NOTICE,
								errmsg("setting to null due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
									   cstate->cur_lineno,
									   cstate->cur_attname,
									   attval));
					pfree(attval);
				}
				else
				{
					/*
					 * Null input: only the ON_ERROR IGNORE case emits a
					 * notice here.
					 */
					if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
						ereport(NOTICE,
								errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
									   cstate->cur_lineno,
									   cstate->cur_attname));
				}
				/* reset relname_only */
				cstate->relname_only = false;
			}

			/*
			 * IGNORE abandons the rest of the row right away; SET_NULL moves
			 * on to the next column.
			 *
			 * NOTE(review): both exits bypass the reset of cur_attname and
			 * cur_attval at the bottom of the loop, so they keep this
			 * column's values until a later iteration or call overwrites
			 * them -- confirm that callers expect this.
			 */
			if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
				return true;
			else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
				continue;
		}

		/* Column processed without a conversion failure */
		cstate->cur_attname = NULL;
		cstate->cur_attval = NULL;
	}

	/* Every expected column consumed exactly one input field */
	Assert(fieldno == attr_count);

	return true;
}
1158 :
1159 : /* Implementation of the per-row callback for binary format */
1160 : bool
1161 21 : CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
1162 : bool *nulls)
1163 : {
1164 : TupleDesc tupDesc;
1165 : AttrNumber attr_count;
1166 21 : FmgrInfo *in_functions = cstate->in_functions;
1167 21 : Oid *typioparams = cstate->typioparams;
1168 : int16 fld_count;
1169 : ListCell *cur;
1170 :
1171 21 : tupDesc = RelationGetDescr(cstate->rel);
1172 21 : attr_count = list_length(cstate->attnumlist);
1173 :
1174 21 : cstate->cur_lineno++;
1175 :
1176 21 : if (!CopyGetInt16(cstate, &fld_count))
1177 : {
1178 : /* EOF detected (end of file, or protocol-level EOF) */
1179 0 : return false;
1180 : }
1181 :
1182 21 : if (fld_count == -1)
1183 : {
1184 : /*
1185 : * Received EOF marker. Wait for the protocol-level EOF, and complain
1186 : * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1187 : * that we correctly handle CopyFail, if client chooses to send that
1188 : * now. When copying from file, we could ignore the rest of the file
1189 : * like in text mode, but we choose to be consistent with the COPY
1190 : * FROM STDIN case.
1191 : */
1192 : char dummy;
1193 :
1194 6 : if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
1195 0 : ereport(ERROR,
1196 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1197 : errmsg("received copy data after EOF marker")));
1198 6 : return false;
1199 : }
1200 :
1201 15 : if (fld_count != attr_count)
1202 0 : ereport(ERROR,
1203 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1204 : errmsg("row field count is %d, expected %d",
1205 : fld_count, attr_count)));
1206 :
1207 93 : foreach(cur, cstate->attnumlist)
1208 : {
1209 79 : int attnum = lfirst_int(cur);
1210 79 : int m = attnum - 1;
1211 79 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1212 :
1213 79 : cstate->cur_attname = NameStr(att->attname);
1214 157 : values[m] = CopyReadBinaryAttribute(cstate,
1215 79 : &in_functions[m],
1216 79 : typioparams[m],
1217 : att->atttypmod,
1218 : &nulls[m]);
1219 78 : cstate->cur_attname = NULL;
1220 : }
1221 :
1222 14 : return true;
1223 : }
1224 :
1225 : /*
1226 : * Read the next input line and stash it in line_buf.
1227 : *
1228 : * Result is true if read was terminated by EOF, false if terminated
1229 : * by newline. The terminating newline or EOF marker is not included
1230 : * in the final value of line_buf.
1231 : */
1232 : static bool
1233 634850 : CopyReadLine(CopyFromState cstate, bool is_csv)
1234 : {
1235 : bool result;
1236 :
1237 634850 : resetStringInfo(&cstate->line_buf);
1238 634850 : cstate->line_buf_valid = false;
1239 :
1240 : /*
1241 : * Parse data and transfer into line_buf.
1242 : *
1243 : * Because this is performance critical, we inline CopyReadLineText() and
1244 : * pass the boolean parameters as constants to allow the compiler to emit
1245 : * specialized code with fewer branches.
1246 : */
1247 634850 : if (is_csv)
1248 439 : result = CopyReadLineText(cstate, true);
1249 : else
1250 634411 : result = CopyReadLineText(cstate, false);
1251 :
1252 634836 : if (result)
1253 : {
1254 : /*
1255 : * Reached EOF. In protocol version 3, we should ignore anything
1256 : * after \. up to the protocol end of copy data. (XXX maybe better
1257 : * not to treat \. as special?)
1258 : */
1259 836 : if (cstate->copy_src == COPY_FRONTEND)
1260 : {
1261 : int inbytes;
1262 :
1263 : do
1264 : {
1265 421 : inbytes = CopyGetData(cstate, cstate->input_buf,
1266 : 1, INPUT_BUF_SIZE);
1267 421 : } while (inbytes > 0);
1268 421 : cstate->input_buf_index = 0;
1269 421 : cstate->input_buf_len = 0;
1270 421 : cstate->raw_buf_index = 0;
1271 421 : cstate->raw_buf_len = 0;
1272 : }
1273 : }
1274 : else
1275 : {
1276 : /*
1277 : * If we didn't hit EOF, then we must have transferred the EOL marker
1278 : * to line_buf along with the data. Get rid of it.
1279 : */
1280 634000 : switch (cstate->eol_type)
1281 : {
1282 634000 : case EOL_NL:
1283 : Assert(cstate->line_buf.len >= 1);
1284 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1285 634000 : cstate->line_buf.len--;
1286 634000 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1287 634000 : break;
1288 0 : case EOL_CR:
1289 : Assert(cstate->line_buf.len >= 1);
1290 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1291 0 : cstate->line_buf.len--;
1292 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1293 0 : break;
1294 0 : case EOL_CRNL:
1295 : Assert(cstate->line_buf.len >= 2);
1296 : Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1297 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1298 0 : cstate->line_buf.len -= 2;
1299 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1300 0 : break;
1301 0 : case EOL_UNKNOWN:
1302 : /* shouldn't get here */
1303 : Assert(false);
1304 0 : break;
1305 : }
1306 : }
1307 :
1308 : /* Now it's safe to use the buffer in error messages */
1309 634836 : cstate->line_buf_valid = true;
1310 :
1311 634836 : return result;
1312 : }
1313 :
/*
 * CopyReadLineText - inner loop of CopyReadLine for text mode
 *
 * Scans input_buf for the end of the next line and transfers the line's
 * bytes into line_buf (via REFILL_LINEBUF).  'is_csv' selects CSV rules:
 * line terminators inside a quoted field are data, and backslash is an
 * ordinary character.
 *
 * Returns true if the input ended, either because no more data is available
 * or because the end-of-copy marker (\.) was found in non-CSV mode.  Returns
 * false if a line terminator was found.
 *
 * Raises an error for a line terminator that does not match the established
 * newline style and for a malformed end-of-copy marker.
 *
 * NOTE(review): REFILL_LINEBUF, IF_NEED_REFILL_AND_NOT_EOF_CONTINUE and
 * IF_NEED_REFILL_AND_EOF_BREAK are macros defined outside this block; judging
 * by their names they can `continue` or `break` the loop below and presumably
 * use the locals need_data, hit_eof, input_buf_ptr and copy_buf_len --
 * confirm against their definitions before changing any of those variables.
 */
static pg_attribute_always_inline bool
CopyReadLineText(CopyFromState cstate, bool is_csv)
{
	char	   *copy_input_buf;
	int			input_buf_ptr;
	int			copy_buf_len;
	bool		need_data = false;
	bool		hit_eof = false;
	bool		result = false;

	/* CSV variables */
	bool		in_quote = false,
				last_was_esc = false;
	char		quotec = '\0';
	char		escapec = '\0';

	if (is_csv)
	{
		quotec = cstate->opts.quote[0];
		escapec = cstate->opts.escape[0];
		/* ignore special escape processing if it's the same as quotec */
		if (quotec == escapec)
			escapec = '\0';
	}

	/*
	 * The objective of this loop is to transfer the entire next input line
	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
	 * \n) and the end-of-copy marker (\.).
	 *
	 * In CSV mode, \r and \n inside a quoted field are just part of the data
	 * value and are put in line_buf.  We keep just enough state to know if we
	 * are currently in a quoted field or not.
	 *
	 * The input has already been converted to the database encoding.  All
	 * supported server encodings have the property that all bytes in a
	 * multi-byte sequence have the high bit set, so a multibyte character
	 * cannot contain any newline or escape characters embedded in the
	 * multibyte sequence.  Therefore, we can process the input byte-by-byte,
	 * regardless of the encoding.
	 *
	 * For speed, we try to move data from input_buf to line_buf in chunks
	 * rather than one character at a time.  input_buf_ptr points to the next
	 * character to examine; any characters from input_buf_index to
	 * input_buf_ptr have been determined to be part of the line, but not yet
	 * transferred to line_buf.
	 *
	 * For a little extra speed within the loop, we copy input_buf and
	 * input_buf_len into local variables.
	 */
	copy_input_buf = cstate->input_buf;
	input_buf_ptr = cstate->input_buf_index;
	copy_buf_len = cstate->input_buf_len;

	for (;;)
	{
		int			prev_raw_ptr;
		char		c;

		/*
		 * Load more data if needed.
		 *
		 * TODO: We could just force four bytes of read-ahead and avoid the
		 * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE().  That was
		 * unsafe with the old v2 COPY protocol, but we don't support that
		 * anymore.
		 */
		if (input_buf_ptr >= copy_buf_len || need_data)
		{
			REFILL_LINEBUF;

			CopyLoadInputBuf(cstate);
			/* update our local variables */
			hit_eof = cstate->input_reached_eof;
			input_buf_ptr = cstate->input_buf_index;
			copy_buf_len = cstate->input_buf_len;

			/*
			 * If we are completely out of data, break out of the loop,
			 * reporting EOF.
			 */
			if (INPUT_BUF_BYTES(cstate) <= 0)
			{
				result = true;
				break;
			}
			need_data = false;
		}

		/* OK to fetch a character */
		prev_raw_ptr = input_buf_ptr;	/* position of c, used by the \. check */
		c = copy_input_buf[input_buf_ptr++];

		if (is_csv)
		{
			/*
			 * If character is '\r', we may need to look ahead below.  Force
			 * fetch of the next character if we don't already have it.  We
			 * need to do this before changing CSV state, in case '\r' is also
			 * the quote or escape character.
			 */
			if (c == '\r')
			{
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			}

			/*
			 * Dealing with quotes and escapes here is mildly tricky. If the
			 * quote char is also the escape char, there's no problem - we
			 * just use the char as a toggle. If they are different, we need
			 * to ensure that we only take account of an escape inside a
			 * quoted field and immediately preceding a quote char, and not
			 * the second in an escape-escape sequence.
			 */
			if (in_quote && c == escapec)
				last_was_esc = !last_was_esc;
			if (c == quotec && !last_was_esc)
				in_quote = !in_quote;
			if (c != escapec)
				last_was_esc = false;

			/*
			 * Updating the line count for embedded CR and/or LF chars is
			 * necessarily a little fragile - this test is probably about the
			 * best we can do.  (XXX it's arguable whether we should do this
			 * at all --- is cur_lineno a physical or logical count?)
			 */
			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
				cstate->cur_lineno++;
		}

		/* Process \r */
		if (c == '\r' && (!is_csv || !in_quote))
		{
			/* Check for \r\n on first line, _and_ handle \r\n. */
			if (cstate->eol_type == EOL_UNKNOWN ||
				cstate->eol_type == EOL_CRNL)
			{
				/*
				 * If need more data, go back to loop top to load it.
				 *
				 * Note that if we are at EOF, c will wind up as '\0' because
				 * of the guaranteed pad of input_buf.
				 */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);

				/* get next char */
				c = copy_input_buf[input_buf_ptr];

				if (c == '\n')
				{
					input_buf_ptr++;	/* eat newline */
					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
				}
				else
				{
					/* found \r, but no \n */
					if (cstate->eol_type == EOL_CRNL)
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 !is_csv ?
								 errmsg("literal carriage return found in data") :
								 errmsg("unquoted carriage return found in data"),
								 !is_csv ?
								 errhint("Use \"\\r\" to represent carriage return.") :
								 errhint("Use quoted CSV field to represent carriage return.")));

					/*
					 * if we got here, it is the first line and we didn't find
					 * \n, so don't consume the peeked character
					 */
					cstate->eol_type = EOL_CR;
				}
			}
			else if (cstate->eol_type == EOL_NL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !is_csv ?
						 errmsg("literal carriage return found in data") :
						 errmsg("unquoted carriage return found in data"),
						 !is_csv ?
						 errhint("Use \"\\r\" to represent carriage return.") :
						 errhint("Use quoted CSV field to represent carriage return.")));
			/* If reach here, we have found the line terminator */
			break;
		}

		/* Process \n */
		if (c == '\n' && (!is_csv || !in_quote))
		{
			/* A bare \n is invalid once \r or \r\n is the established style */
			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !is_csv ?
						 errmsg("literal newline found in data") :
						 errmsg("unquoted newline found in data"),
						 !is_csv ?
						 errhint("Use \"\\n\" to represent newline.") :
						 errhint("Use quoted CSV field to represent newline.")));
			cstate->eol_type = EOL_NL;	/* in case not set yet */
			/* If reach here, we have found the line terminator */
			break;
		}

		/*
		 * Process backslash, except in CSV mode where backslash is a normal
		 * character.
		 */
		if (c == '\\' && !is_csv)
		{
			char		c2;

			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			IF_NEED_REFILL_AND_EOF_BREAK(0);

			/* -----
			 * get next character
			 * Note: we do not change c so if it isn't \., we can fall
			 * through and continue processing.
			 * -----
			 */
			c2 = copy_input_buf[input_buf_ptr];

			if (c2 == '.')
			{
				input_buf_ptr++;	/* consume the '.' */
				if (cstate->eol_type == EOL_CRNL)
				{
					/* Get the next character */
					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
					/* if hit_eof, c2 will become '\0' */
					c2 = copy_input_buf[input_buf_ptr++];

					/* With \r\n line endings, the marker must be followed by \r */
					if (c2 == '\n')
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker does not match previous newline style")));
					else if (c2 != '\r')
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker is not alone on its line")));
				}

				/* Get the next character */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
				/* if hit_eof, c2 will become '\0' */
				c2 = copy_input_buf[input_buf_ptr++];

				if (c2 != '\r' && c2 != '\n')
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker is not alone on its line")));

				/* The terminator after \. must agree with the known style */
				if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
					(cstate->eol_type == EOL_CRNL && c2 != '\n') ||
					(cstate->eol_type == EOL_CR && c2 != '\r'))
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker does not match previous newline style")));

				/*
				 * If there is any data on this line before the \., complain.
				 */
				if (cstate->line_buf.len > 0 ||
					prev_raw_ptr > cstate->input_buf_index)
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker is not alone on its line")));

				/*
				 * Discard the \. and newline, then report EOF.
				 */
				cstate->input_buf_index = input_buf_ptr;
				result = true;	/* report EOF */
				break;
			}
			else
			{
				/*
				 * If we are here, it means we found a backslash followed by
				 * something other than a period.  In non-CSV mode, anything
				 * after a backslash is special, so we skip over that second
				 * character too.  If we didn't do that \\. would be
				 * considered an eof-of copy, while in non-CSV mode it is a
				 * literal backslash followed by a period.
				 */
				input_buf_ptr++;
			}
		}
	}							/* end of outer loop */

	/*
	 * Transfer any still-uncopied data to line_buf.
	 */
	REFILL_LINEBUF;

	return result;
}
1615 :
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Decimal digits map to 0..9; any other character is lower-cased and taken
 * as an offset from 'a', so 'a'/'A' give 10.  No validation is done here.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char digit = (unsigned char) hex;

	return isdigit(digit) ?
		hex - '0' :
		pg_ascii_tolower(digit) - 'a' + 10;
}
1627 :
/*
 * Parse the current line into separate attributes (fields),
 * performing de-escaping as needed.
 *
 * The input is in line_buf.  We use attribute_buf to hold the result
 * strings.  cstate->raw_fields[k] is set to point to the k'th attribute
 * string, or NULL when the input matches the null marker string.
 * This array is expanded as necessary.
 *
 * (Note that the caller cannot check for nulls since the returned
 * string would be the post-de-escaping equivalent, which may look
 * the same as some valid data string.)
 *
 * delim is the column delimiter string (must be just one byte for now).
 * null_print is the null marker string.  Note that this is compared to
 * the pre-de-escaped input string.
 *
 * When a default marker is configured (opts.default_print) and a field's
 * raw text matches it, cstate->defaults[] is flagged for that column; an
 * error is raised if the column has no default expression.
 *
 * The return value is the number of fields actually read.
 */
static int
CopyReadAttributesText(CopyFromState cstate)
{
	char		delimc = cstate->opts.delim[0];
	int			fieldno;
	char	   *output_ptr;		/* next write position in attribute_buf */
	char	   *cur_ptr;		/* next read position in line_buf */
	char	   *line_end_ptr;	/* one past the last byte of the line */

	/*
	 * We need a special case for zero-column tables: check that the input
	 * line is empty, and return.
	 */
	if (cstate->max_fields <= 0)
	{
		if (cstate->line_buf.len != 0)
			ereport(ERROR,
					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
					 errmsg("extra data after last expected column")));
		return 0;
	}

	resetStringInfo(&cstate->attribute_buf);

	/*
	 * The de-escaped attributes will certainly not be longer than the input
	 * data line, so we can just force attribute_buf to be large enough and
	 * then transfer data without any checks for enough space.  We need to do
	 * it this way because enlarging attribute_buf mid-stream would invalidate
	 * pointers already stored into cstate->raw_fields[].
	 */
	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
	output_ptr = cstate->attribute_buf.data;

	/* set pointer variables for loop */
	cur_ptr = cstate->line_buf.data;
	line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;

	/* Outer loop iterates over fields */
	fieldno = 0;
	for (;;)
	{
		bool		found_delim = false;
		char	   *start_ptr;	/* start of this field's raw input */
		char	   *end_ptr;	/* end of this field's raw input */
		int			input_len;
		bool		saw_non_ascii = false;

		/* Make sure there is enough space for the next value */
		if (fieldno >= cstate->max_fields)
		{
			cstate->max_fields *= 2;
			cstate->raw_fields =
				repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
		}

		/* Remember start of field on both input and output sides */
		start_ptr = cur_ptr;
		cstate->raw_fields[fieldno] = output_ptr;

		/*
		 * Scan data for field.
		 *
		 * Note that in this loop, we are scanning to locate the end of field
		 * and also speculatively performing de-escaping.  Once we find the
		 * end-of-field, we can match the raw field contents against the null
		 * marker string.  Only after that comparison fails do we know that
		 * de-escaping is actually the right thing to do; therefore we *must
		 * not* throw any syntax errors before we've done the null-marker
		 * check.
		 */
		for (;;)
		{
			char		c;

			/*
			 * Track the raw end of field before consuming anything, so that
			 * on exit end_ptr excludes the delimiter (or a dangling
			 * backslash, see below).
			 */
			end_ptr = cur_ptr;
			if (cur_ptr >= line_end_ptr)
				break;
			c = *cur_ptr++;
			if (c == delimc)
			{
				found_delim = true;
				break;
			}
			if (c == '\\')
			{
				/*
				 * Backslash is the last byte of the line: end the field
				 * here.  The backslash is not copied to the output, and
				 * end_ptr still points at it, so it is not counted in the
				 * raw length either.
				 */
				if (cur_ptr >= line_end_ptr)
					break;
				c = *cur_ptr++;
				switch (c)
				{
					case '0':
					case '1':
					case '2':
					case '3':
					case '4':
					case '5':
					case '6':
					case '7':
						{
							/* handle \013 */
							int			val;

							/* accept up to three octal digits in total */
							val = OCTVALUE(c);
							if (cur_ptr < line_end_ptr)
							{
								c = *cur_ptr;
								if (ISOCTAL(c))
								{
									cur_ptr++;
									val = (val << 3) + OCTVALUE(c);
									if (cur_ptr < line_end_ptr)
									{
										c = *cur_ptr;
										if (ISOCTAL(c))
										{
											cur_ptr++;
											val = (val << 3) + OCTVALUE(c);
										}
									}
								}
							}
							/* keep only the low 8 bits of the octal value */
							c = val & 0377;
							if (c == '\0' || IS_HIGHBIT_SET(c))
								saw_non_ascii = true;
						}
						break;
					case 'x':
						/* Handle \x3F */

						/*
						 * Accept one or two hex digits.  If none follows, c
						 * is left as 'x' and is emitted literally.
						 */
						if (cur_ptr < line_end_ptr)
						{
							char		hexchar = *cur_ptr;

							if (isxdigit((unsigned char) hexchar))
							{
								int			val = GetDecimalFromHex(hexchar);

								cur_ptr++;
								if (cur_ptr < line_end_ptr)
								{
									hexchar = *cur_ptr;
									if (isxdigit((unsigned char) hexchar))
									{
										cur_ptr++;
										val = (val << 4) + GetDecimalFromHex(hexchar);
									}
								}
								c = val & 0xff;
								if (c == '\0' || IS_HIGHBIT_SET(c))
									saw_non_ascii = true;
							}
						}
						break;
					case 'b':
						c = '\b';
						break;
					case 'f':
						c = '\f';
						break;
					case 'n':
						c = '\n';
						break;
					case 'r':
						c = '\r';
						break;
					case 't':
						c = '\t';
						break;
					case 'v':
						c = '\v';
						break;

						/*
						 * in all other cases, take the char after '\'
						 * literally
						 */
				}
			}

			/* Add c to output string */
			*output_ptr++ = c;
		}

		/* Check whether raw input matched null marker */
		input_len = end_ptr - start_ptr;
		if (input_len == cstate->opts.null_print_len &&
			strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
			cstate->raw_fields[fieldno] = NULL;
		/* Check whether raw input matched default marker */
		else if (fieldno < list_length(cstate->attnumlist) &&
				 cstate->opts.default_print &&
				 input_len == cstate->opts.default_print_len &&
				 strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
		{
			/* fieldno is 0-indexed and attnum is 1-indexed */
			int			m = list_nth_int(cstate->attnumlist, fieldno) - 1;

			if (cstate->defexprs[m] != NULL)
			{
				/* defaults contain entries for all physical attributes */
				cstate->defaults[m] = true;
			}
			else
			{
				TupleDesc	tupDesc = RelationGetDescr(cstate->rel);
				Form_pg_attribute att = TupleDescAttr(tupDesc, m);

				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 errmsg("unexpected default marker in COPY data"),
						 errdetail("Column \"%s\" has no default value.",
								   NameStr(att->attname))));
			}
		}
		else
		{
			/*
			 * At this point we know the field is supposed to contain data.
			 *
			 * If we de-escaped any non-7-bit-ASCII chars, make sure the
			 * resulting string is valid data for the db encoding.
			 */
			if (saw_non_ascii)
			{
				char	   *fld = cstate->raw_fields[fieldno];

				pg_verifymbstr(fld, output_ptr - fld, false);
			}
		}

		/* Terminate attribute value in output area */
		*output_ptr++ = '\0';

		fieldno++;
		/* Done if we hit EOL instead of a delim */
		if (!found_delim)
			break;
	}

	/*
	 * Clean up state of attribute_buf: step back over the last field's
	 * terminator so that it is not counted in the buffer length.
	 */
	output_ptr--;
	Assert(*output_ptr == '\0');
	cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);

	return fieldno;
}
1894 :
1895 : /*
1896 : * Parse the current line into separate attributes (fields),
1897 : * performing de-escaping as needed. This has exactly the same API as
1898 : * CopyReadAttributesText, except we parse the fields according to
1899 : * "standard" (i.e. common) CSV usage.
1900 : */
1901 : static int
1902 257 : CopyReadAttributesCSV(CopyFromState cstate)
1903 : {
1904 257 : char delimc = cstate->opts.delim[0];
1905 257 : char quotec = cstate->opts.quote[0];
1906 257 : char escapec = cstate->opts.escape[0];
1907 : int fieldno;
1908 : char *output_ptr;
1909 : char *cur_ptr;
1910 : char *line_end_ptr;
1911 :
1912 : /*
1913 : * We need a special case for zero-column tables: check that the input
1914 : * line is empty, and return.
1915 : */
1916 257 : if (cstate->max_fields <= 0)
1917 : {
1918 0 : if (cstate->line_buf.len != 0)
1919 0 : ereport(ERROR,
1920 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1921 : errmsg("extra data after last expected column")));
1922 0 : return 0;
1923 : }
1924 :
1925 257 : resetStringInfo(&cstate->attribute_buf);
1926 :
1927 : /*
1928 : * The de-escaped attributes will certainly not be longer than the input
1929 : * data line, so we can just force attribute_buf to be large enough and
1930 : * then transfer data without any checks for enough space. We need to do
1931 : * it this way because enlarging attribute_buf mid-stream would invalidate
1932 : * pointers already stored into cstate->raw_fields[].
1933 : */
1934 257 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1935 0 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1936 257 : output_ptr = cstate->attribute_buf.data;
1937 :
1938 : /* set pointer variables for loop */
1939 257 : cur_ptr = cstate->line_buf.data;
1940 257 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1941 :
1942 : /* Outer loop iterates over fields */
1943 257 : fieldno = 0;
1944 : for (;;)
1945 267 : {
1946 524 : bool found_delim = false;
1947 524 : bool saw_quote = false;
1948 : char *start_ptr;
1949 : char *end_ptr;
1950 : int input_len;
1951 :
1952 : /* Make sure there is enough space for the next value */
1953 524 : if (fieldno >= cstate->max_fields)
1954 : {
1955 0 : cstate->max_fields *= 2;
1956 0 : cstate->raw_fields =
1957 0 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1958 : }
1959 :
1960 : /* Remember start of field on both input and output sides */
1961 524 : start_ptr = cur_ptr;
1962 524 : cstate->raw_fields[fieldno] = output_ptr;
1963 :
1964 : /*
1965 : * Scan data for field,
1966 : *
1967 : * The loop starts in "not quote" mode and then toggles between that
1968 : * and "in quote" mode. The loop exits normally if it is in "not
1969 : * quote" mode and a delimiter or line end is seen.
1970 : */
1971 : for (;;)
1972 114 : {
1973 : char c;
1974 :
1975 : /* Not in quote */
1976 : for (;;)
1977 : {
1978 1666 : end_ptr = cur_ptr;
1979 1666 : if (cur_ptr >= line_end_ptr)
1980 254 : goto endfield;
1981 1412 : c = *cur_ptr++;
1982 : /* unquoted field delimiter */
1983 1412 : if (c == delimc)
1984 : {
1985 270 : found_delim = true;
1986 270 : goto endfield;
1987 : }
1988 : /* start of quoted field (or part of field) */
1989 1142 : if (c == quotec)
1990 : {
1991 114 : saw_quote = true;
1992 114 : break;
1993 : }
1994 : /* Add c to output string */
1995 1028 : *output_ptr++ = c;
1996 : }
1997 :
1998 : /* In quote */
1999 : for (;;)
2000 : {
2001 710 : end_ptr = cur_ptr;
2002 710 : if (cur_ptr >= line_end_ptr)
2003 0 : ereport(ERROR,
2004 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2005 : errmsg("unterminated CSV quoted field")));
2006 :
2007 710 : c = *cur_ptr++;
2008 :
2009 : /* escape within a quoted field */
2010 710 : if (c == escapec)
2011 : {
2012 : /*
2013 : * peek at the next char if available, and escape it if it
2014 : * is an escape char or a quote char
2015 : */
2016 62 : if (cur_ptr < line_end_ptr)
2017 : {
2018 36 : char nextc = *cur_ptr;
2019 :
2020 36 : if (nextc == escapec || nextc == quotec)
2021 : {
2022 12 : *output_ptr++ = nextc;
2023 12 : cur_ptr++;
2024 12 : continue;
2025 : }
2026 : }
2027 : }
2028 :
2029 : /*
2030 : * end of quoted field. Must do this test after testing for
2031 : * escape in case quote char and escape char are the same
2032 : * (which is the common case).
2033 : */
2034 698 : if (c == quotec)
2035 114 : break;
2036 :
2037 : /* Add c to output string */
2038 584 : *output_ptr++ = c;
2039 : }
2040 : }
2041 524 : endfield:
2042 :
2043 : /* Terminate attribute value in output area */
2044 524 : *output_ptr++ = '\0';
2045 :
2046 : /* Check whether raw input matched null marker */
2047 524 : input_len = end_ptr - start_ptr;
2048 524 : if (!saw_quote && input_len == cstate->opts.null_print_len &&
2049 22 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
2050 22 : cstate->raw_fields[fieldno] = NULL;
2051 : /* Check whether raw input matched default marker */
2052 502 : else if (fieldno < list_length(cstate->attnumlist) &&
2053 502 : cstate->opts.default_print &&
2054 75 : input_len == cstate->opts.default_print_len &&
2055 21 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
2056 : {
2057 : /* fieldno is 0-index and attnum is 1-index */
2058 21 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
2059 :
2060 21 : if (cstate->defexprs[m] != NULL)
2061 : {
2062 : /* defaults contain entries for all physical attributes */
2063 18 : cstate->defaults[m] = true;
2064 : }
2065 : else
2066 : {
2067 3 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
2068 3 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
2069 :
2070 3 : ereport(ERROR,
2071 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2072 : errmsg("unexpected default marker in COPY data"),
2073 : errdetail("Column \"%s\" has no default value.",
2074 : NameStr(att->attname))));
2075 : }
2076 : }
2077 :
2078 521 : fieldno++;
2079 : /* Done if we hit EOL instead of a delim */
2080 521 : if (!found_delim)
2081 254 : break;
2082 : }
2083 :
2084 : /* Clean up state of attribute_buf */
2085 254 : output_ptr--;
2086 : Assert(*output_ptr == '\0');
2087 254 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2088 :
2089 254 : return fieldno;
2090 : }
2091 :
2092 :
2093 : /*
2094 : * Read a binary attribute
2095 : */
2096 : static Datum
2097 79 : CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
2098 : Oid typioparam, int32 typmod,
2099 : bool *isnull)
2100 : {
2101 : int32 fld_size;
2102 : Datum result;
2103 :
2104 79 : if (!CopyGetInt32(cstate, &fld_size))
2105 0 : ereport(ERROR,
2106 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2107 : errmsg("unexpected EOF in COPY data")));
2108 79 : if (fld_size == -1)
2109 : {
2110 15 : *isnull = true;
2111 15 : return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2112 : }
2113 64 : if (fld_size < 0)
2114 0 : ereport(ERROR,
2115 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2116 : errmsg("invalid field size")));
2117 :
2118 : /* reset attribute_buf to empty, and load raw data in it */
2119 64 : resetStringInfo(&cstate->attribute_buf);
2120 :
2121 64 : enlargeStringInfo(&cstate->attribute_buf, fld_size);
2122 64 : if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2123 64 : fld_size) != fld_size)
2124 0 : ereport(ERROR,
2125 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2126 : errmsg("unexpected EOF in COPY data")));
2127 :
2128 64 : cstate->attribute_buf.len = fld_size;
2129 64 : cstate->attribute_buf.data[fld_size] = '\0';
2130 :
2131 : /* Call the column type's binary input converter */
2132 64 : result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2133 : typioparam, typmod);
2134 :
2135 : /* Trouble if it didn't eat the whole buffer */
2136 64 : if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2137 1 : ereport(ERROR,
2138 : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
2139 : errmsg("incorrect binary data format")));
2140 :
2141 63 : *isnull = false;
2142 63 : return result;
2143 : }
|