Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * copyfromparse.c
4 : * Parse CSV/text/binary format for COPY FROM.
5 : *
6 : * This file contains routines to parse the text, CSV and binary input
7 : * formats. The main entry point is NextCopyFrom(), which parses the
8 : * next input line and returns it as Datums.
9 : *
10 : * In text/CSV mode, the parsing happens in multiple stages:
11 : *
12 : * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 : * 1. 2. 3. 4.
14 : *
15 : * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 : * places it into 'raw_buf'.
17 : *
18 : * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 : * the data in 'raw_buf' from client to server encoding, placing the
20 : * converted result in 'input_buf'.
21 : *
22 : * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 : * It is responsible for finding the next newline marker, taking quote and
24 : * escape characters into account according to the COPY options. The line
25 : * is copied into 'line_buf', with quotes and escape characters still
26 : * intact.
27 : *
28 : * 4. CopyReadAttributesText/CSV() function takes the input line from
29 : * 'line_buf', and splits it into fields, unescaping the data as required.
30 : * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 : * pointers to each field.
32 : *
33 : * If encoding conversion is not required, a shortcut is taken in step 2 to
34 : * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 : * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 : * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 : * the data is valid in the current encoding.
38 : *
39 : * In binary mode, the pipeline is much simpler. Input is loaded into
40 : * 'raw_buf', and encoding conversion is done in the datatype-specific
41 : * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 : * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
 * data when it's passed to the receive function.
44 : *
45 : * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 : * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 : * and 'attribute_buf' are expanded on demand, to hold the longest line
48 : * encountered so far.
49 : *
50 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
51 : * Portions Copyright (c) 1994, Regents of the University of California
52 : *
53 : *
54 : * IDENTIFICATION
55 : * src/backend/commands/copyfromparse.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <ctype.h>
62 : #include <unistd.h>
63 : #include <sys/stat.h>
64 :
65 : #include "commands/copyapi.h"
66 : #include "commands/copyfrom_internal.h"
67 : #include "commands/progress.h"
68 : #include "executor/executor.h"
69 : #include "libpq/libpq.h"
70 : #include "libpq/pqformat.h"
71 : #include "mb/pg_wchar.h"
72 : #include "miscadmin.h"
73 : #include "pgstat.h"
74 : #include "port/pg_bswap.h"
75 : #include "utils/builtins.h"
76 : #include "utils/rel.h"
77 :
/* Nonzero when 'c' is one of the octal digit characters '0' through '7'. */
#define ISOCTAL(c) (('0' <= (c)) && ((c) <= '7'))
/* Numeric value of the digit character 'c' (its offset from '0'). */
#define OCTVALUE(c) ((c) - '0')
80 :
81 : /*
82 : * These macros centralize code used to process line_buf and input_buf buffers.
83 : * They are macros because they often do continue/break control and to avoid
84 : * function call overhead in tight COPY loops.
85 : *
86 : * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
87 : * prevent the continue/break processing from working. We end the "if (1)"
88 : * with "else ((void) 0)" to ensure the "if" does not unintentionally match
89 : * any "else" in the calling code, and to avoid any compiler warnings about
90 : * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
91 : */
92 :
/*
 * This keeps the character read at the top of the loop in the buffer
 * even if there is more than one read-ahead.
 *
 * The macro expands in place, so it relies on the caller having local
 * variables named input_buf_ptr, copy_buf_len, hit_eof, prev_raw_ptr and
 * need_data, and it must appear inside a loop because it executes "continue".
 *
 * When input_buf_ptr + extralen is at or past copy_buf_len and hit_eof is
 * not set, input_buf_ptr is rewound to prev_raw_ptr, need_data is set to
 * true, and the enclosing loop is continued.  Otherwise the macro does
 * nothing.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (1) \
{ \
    if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
    { \
        input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
        need_data = true; \
        continue; \
    } \
} else ((void) 0)
107 :
/*
 * This consumes the remainder of the buffer and breaks.
 *
 * The macro expands in place, so it relies on the caller having local
 * variables named input_buf_ptr, copy_buf_len, hit_eof and result, and it
 * must appear where "break" is meaningful because it executes one.
 *
 * When input_buf_ptr + extralen is at or past copy_buf_len and hit_eof is
 * set, result is set to true and the enclosing construct is left via
 * "break"; if extralen is nonzero, input_buf_ptr is first advanced to
 * copy_buf_len.  Otherwise the macro does nothing.
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (1) \
{ \
    if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
    { \
        if (extralen) \
            input_buf_ptr = copy_buf_len; /* consume the partial character */ \
        /* backslash just before EOF, treat as data char */ \
        result = true; \
        break; \
    } \
} else ((void) 0)
121 :
/*
 * Transfer any approved data to line_buf; must do this to be sure
 * there is some room in input_buf.
 *
 * The macro expands in place, so it relies on the caller having 'cstate'
 * and a local variable named input_buf_ptr in scope.
 *
 * When input_buf_ptr is beyond cstate->input_buf_index, the bytes of
 * cstate->input_buf between those two positions are appended to
 * cstate->line_buf and cstate->input_buf_index is advanced to
 * input_buf_ptr.  Otherwise the macro does nothing.
 */
#define REFILL_LINEBUF \
if (1) \
{ \
    if (input_buf_ptr > cstate->input_buf_index) \
    { \
        appendBinaryStringInfo(&cstate->line_buf, \
                               cstate->input_buf + cstate->input_buf_index, \
                               input_buf_ptr - cstate->input_buf_index); \
        cstate->input_buf_index = input_buf_ptr; \
    } \
} else ((void) 0)
137 :
/*
 * Signature that must appear at the very start of binary-format COPY input.
 * ReceiveCopyBinaryHeader() compares the first 11 input bytes against this
 * array with memcmp().  The array holds exactly the 11 characters spelled
 * out in the literal, including the explicit trailing \0.
 *
 * NOTE: there's a copy of this in copyto.c
 */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
140 :
141 :
142 : /* non-export function prototypes */
143 : static bool CopyReadLine(CopyFromState cstate, bool is_csv);
144 : static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate,
145 : bool is_csv);
146 : static int CopyReadAttributesText(CopyFromState cstate);
147 : static int CopyReadAttributesCSV(CopyFromState cstate);
148 : static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
149 : Oid typioparam, int32 typmod,
150 : bool *isnull);
151 : static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate,
152 : ExprContext *econtext,
153 : Datum *values,
154 : bool *nulls,
155 : bool is_csv);
156 : static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate,
157 : char ***fields,
158 : int *nfields,
159 : bool is_csv);
160 :
161 :
162 : /* Low-level communications functions */
163 : static int CopyGetData(CopyFromState cstate, void *databuf,
164 : int minread, int maxread);
165 : static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
166 : static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
167 : static void CopyLoadInputBuf(CopyFromState cstate);
168 : static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
169 :
170 : void
171 541 : ReceiveCopyBegin(CopyFromState cstate)
172 : {
173 : StringInfoData buf;
174 541 : int natts = list_length(cstate->attnumlist);
175 541 : int16 format = (cstate->opts.binary ? 1 : 0);
176 : int i;
177 :
178 541 : pq_beginmessage(&buf, PqMsg_CopyInResponse);
179 541 : pq_sendbyte(&buf, format); /* overall format */
180 541 : pq_sendint16(&buf, natts);
181 1944 : for (i = 0; i < natts; i++)
182 1403 : pq_sendint16(&buf, format); /* per-column formats */
183 541 : pq_endmessage(&buf);
184 541 : cstate->copy_src = COPY_FRONTEND;
185 541 : cstate->fe_msgbuf = makeStringInfo();
186 : /* We *must* flush here to ensure FE knows it can send. */
187 541 : pq_flush();
188 541 : }
189 :
190 : void
191 7 : ReceiveCopyBinaryHeader(CopyFromState cstate)
192 : {
193 : char readSig[11];
194 : int32 tmp;
195 :
196 : /* Signature */
197 7 : if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
198 7 : memcmp(readSig, BinarySignature, 11) != 0)
199 0 : ereport(ERROR,
200 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
201 : errmsg("COPY file signature not recognized")));
202 : /* Flags field */
203 7 : if (!CopyGetInt32(cstate, &tmp))
204 0 : ereport(ERROR,
205 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
206 : errmsg("invalid COPY file header (missing flags)")));
207 7 : if ((tmp & (1 << 16)) != 0)
208 0 : ereport(ERROR,
209 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
210 : errmsg("invalid COPY file header (WITH OIDS)")));
211 7 : tmp &= ~(1 << 16);
212 7 : if ((tmp >> 16) != 0)
213 0 : ereport(ERROR,
214 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
215 : errmsg("unrecognized critical flags in COPY file header")));
216 : /* Header extension length */
217 7 : if (!CopyGetInt32(cstate, &tmp) ||
218 7 : tmp < 0)
219 0 : ereport(ERROR,
220 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
221 : errmsg("invalid COPY file header (missing length)")));
222 : /* Skip extension header, if present */
223 7 : while (tmp-- > 0)
224 : {
225 0 : if (CopyReadBinaryData(cstate, readSig, 1) != 1)
226 0 : ereport(ERROR,
227 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
228 : errmsg("invalid COPY file header (wrong length)")));
229 : }
230 7 : }
231 :
/*
 * CopyGetData reads data from the source (file, frontend, or callback)
 *
 * We attempt to read at least minread, and at most maxread, bytes from
 * the source.  The actual number of bytes read is returned; if this is
 * less than minread, EOF was detected.
 *
 * 'databuf' receives the bytes; the caller must provide room for maxread
 * bytes.
 *
 * Note: when copying from the frontend, we expect a proper EOF mark per
 * protocol; if the frontend simply drops the connection, we raise error.
 * It seems unwise to allow the COPY IN to complete normally in that case.
 *
 * NB: no data conversion is applied here.
 */
static int
CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
{
    int         bytesread = 0;

    /*
     * Note there is no default case: if copy_src matches none of the cases
     * below, nothing is read and 0 is returned.
     */
    switch (cstate->copy_src)
    {
        case COPY_FILE:

            /*
             * A single fread() of up to maxread bytes; minread is not
             * consulted in this branch.
             */
            pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ);
            bytesread = fread(databuf, 1, maxread, cstate->copy_file);
            pgstat_report_wait_end();
            if (ferror(cstate->copy_file))
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not read from COPY file: %m")));
            /* EOF is flagged only when the read returned no bytes at all */
            if (bytesread == 0)
                cstate->raw_reached_eof = true;
            break;
        case COPY_FRONTEND:

            /*
             * Keep pulling bytes out of frontend messages until minread is
             * satisfied, the caller's space (maxread) is used up, or the
             * frontend has signalled the end of the data.
             */
            while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
            {
                int         avail;

                /*
                 * The current message is fully consumed once cursor reaches
                 * len.  This is a loop rather than an "if", so a message
                 * that arrives with an empty body simply causes another
                 * message to be fetched.
                 */
                while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
                {
                    /* Try to receive another message */
                    int         mtype;
                    int         maxmsglen;

                    /* Flush/Sync handling below jumps back here */
            readmessage:

                    /*
                     * NOTE(review): presumably cancel interrupts are held
                     * from here until RESUME_CANCEL_INTERRUPTS() below so
                     * that a message is never abandoned half-read; confirm
                     * against the macro definitions.
                     */
                    HOLD_CANCEL_INTERRUPTS();
                    pq_startmsgread();
                    mtype = pq_getbyte();
                    if (mtype == EOF)
                        ereport(ERROR,
                                (errcode(ERRCODE_CONNECTION_FAILURE),
                                 errmsg("unexpected EOF on client connection with an open transaction")));
                    /* Validate message type and set packet size limit */
                    switch (mtype)
                    {
                        case PqMsg_CopyData:
                            maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
                            break;
                        case PqMsg_CopyDone:
                        case PqMsg_CopyFail:
                        case PqMsg_Flush:
                        case PqMsg_Sync:
                            maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
                            break;
                        default:
                            ereport(ERROR,
                                    (errcode(ERRCODE_PROTOCOL_VIOLATION),
                                     errmsg("unexpected message type 0x%02X during COPY from stdin",
                                            mtype)));
                            maxmsglen = 0;  /* keep compiler quiet */
                            break;
                    }
                    /* Now collect the message body */
                    if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
                        ereport(ERROR,
                                (errcode(ERRCODE_CONNECTION_FAILURE),
                                 errmsg("unexpected EOF on client connection with an open transaction")));
                    RESUME_CANCEL_INTERRUPTS();
                    /* ... and process it */
                    switch (mtype)
                    {
                        case PqMsg_CopyData:
                            /* payload is copied out after the inner loop */
                            break;
                        case PqMsg_CopyDone:
                            /* COPY IN correctly terminated by frontend */
                            cstate->raw_reached_eof = true;
                            /* hand back whatever was gathered so far */
                            return bytesread;
                        case PqMsg_CopyFail:
                            ereport(ERROR,
                                    (errcode(ERRCODE_QUERY_CANCELED),
                                     errmsg("COPY from stdin failed: %s",
                                            pq_getmsgstring(cstate->fe_msgbuf))));
                            break;
                        case PqMsg_Flush:
                        case PqMsg_Sync:

                            /*
                             * Ignore Flush/Sync for the convenience of client
                             * libraries (such as libpq) that may send those
                             * without noticing that the command they just
                             * sent was COPY.
                             */
                            goto readmessage;
                        default:
                            Assert(false);  /* NOT REACHED */
                    }
                }

                /*
                 * Copy out as much of the current message as fits into the
                 * space the caller still has, then advance the destination
                 * pointer and the running counts.
                 */
                avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
                if (avail > maxread)
                    avail = maxread;
                pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
                databuf = (char *) databuf + avail;
                maxread -= avail;
                bytesread += avail;
            }
            break;
        case COPY_CALLBACK:
            /* Delegate entirely to the callback, passing the limits along */
            bytesread = cstate->data_source_cb(databuf, minread, maxread);
            break;
    }

    return bytesread;
}
353 :
354 :
355 : /*
356 : * These functions do apply some data conversion
357 : */
358 :
359 : /*
360 : * CopyGetInt32 reads an int32 that appears in network byte order
361 : *
362 : * Returns true if OK, false if EOF
363 : */
364 : static inline bool
365 93 : CopyGetInt32(CopyFromState cstate, int32 *val)
366 : {
367 : uint32 buf;
368 :
369 93 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
370 : {
371 0 : *val = 0; /* suppress compiler warning */
372 0 : return false;
373 : }
374 93 : *val = (int32) pg_ntoh32(buf);
375 93 : return true;
376 : }
377 :
378 : /*
379 : * CopyGetInt16 reads an int16 that appears in network byte order
380 : */
381 : static inline bool
382 21 : CopyGetInt16(CopyFromState cstate, int16 *val)
383 : {
384 : uint16 buf;
385 :
386 21 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
387 : {
388 0 : *val = 0; /* suppress compiler warning */
389 0 : return false;
390 : }
391 21 : *val = (int16) pg_ntoh16(buf);
392 21 : return true;
393 : }
394 :
395 :
396 : /*
397 : * Perform encoding conversion on data in 'raw_buf', writing the converted
398 : * data into 'input_buf'.
399 : *
400 : * On entry, there must be some data to convert in 'raw_buf'.
401 : */
402 : static void
403 425129 : CopyConvertBuf(CopyFromState cstate)
404 : {
405 : /*
406 : * If the file and server encoding are the same, no encoding conversion is
407 : * required. However, we still need to verify that the input is valid for
408 : * the encoding.
409 : */
410 425129 : if (!cstate->need_transcoding)
411 : {
412 : /*
413 : * When conversion is not required, input_buf and raw_buf are the
414 : * same. raw_buf_len is the total number of bytes in the buffer, and
415 : * input_buf_len tracks how many of those bytes have already been
416 : * verified.
417 : */
418 425063 : int preverifiedlen = cstate->input_buf_len;
419 425063 : int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
420 : int nverified;
421 :
422 425063 : if (unverifiedlen == 0)
423 : {
424 : /*
425 : * If no more raw data is coming, report the EOF to the caller.
426 : */
427 213308 : if (cstate->raw_reached_eof)
428 777 : cstate->input_reached_eof = true;
429 213308 : return;
430 : }
431 :
432 : /*
433 : * Verify the new data, including any residual unverified bytes from
434 : * previous round.
435 : */
436 211755 : nverified = pg_encoding_verifymbstr(cstate->file_encoding,
437 211755 : cstate->raw_buf + preverifiedlen,
438 : unverifiedlen);
439 211755 : if (nverified == 0)
440 : {
441 : /*
442 : * Could not verify anything.
443 : *
444 : * If there is no more raw input data coming, it means that there
445 : * was an incomplete multi-byte sequence at the end. Also, if
446 : * there's "enough" input left, we should be able to verify at
447 : * least one character, and a failure to do so means that we've
448 : * hit an invalid byte sequence.
449 : */
450 0 : if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
451 0 : cstate->input_reached_error = true;
452 0 : return;
453 : }
454 211755 : cstate->input_buf_len += nverified;
455 : }
456 : else
457 : {
458 : /*
459 : * Encoding conversion is needed.
460 : */
461 : int nbytes;
462 : unsigned char *src;
463 : int srclen;
464 : unsigned char *dst;
465 : int dstlen;
466 : int convertedlen;
467 :
468 66 : if (RAW_BUF_BYTES(cstate) == 0)
469 : {
470 : /*
471 : * If no more raw data is coming, report the EOF to the caller.
472 : */
473 42 : if (cstate->raw_reached_eof)
474 12 : cstate->input_reached_eof = true;
475 42 : return;
476 : }
477 :
478 : /*
479 : * First, copy down any unprocessed data.
480 : */
481 24 : nbytes = INPUT_BUF_BYTES(cstate);
482 24 : if (nbytes > 0 && cstate->input_buf_index > 0)
483 0 : memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
484 : nbytes);
485 24 : cstate->input_buf_index = 0;
486 24 : cstate->input_buf_len = nbytes;
487 24 : cstate->input_buf[nbytes] = '\0';
488 :
489 24 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
490 24 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
491 24 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
492 24 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
493 :
494 : /*
495 : * Do the conversion. This might stop short, if there is an invalid
496 : * byte sequence in the input. We'll convert as much as we can in
497 : * that case.
498 : *
499 : * Note: Even if we hit an invalid byte sequence, we don't report the
500 : * error until all the valid bytes have been consumed. The input
501 : * might contain an end-of-input marker (\.), and we don't want to
502 : * report an error if the invalid byte sequence is after the
503 : * end-of-input marker. We might unnecessarily convert some data
504 : * after the end-of-input marker as long as it's valid for the
505 : * encoding, but that's harmless.
506 : */
507 24 : convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
508 : cstate->file_encoding,
509 : GetDatabaseEncoding(),
510 : src, srclen,
511 : dst, dstlen,
512 : true);
513 24 : if (convertedlen == 0)
514 : {
515 : /*
516 : * Could not convert anything. If there is no more raw input data
517 : * coming, it means that there was an incomplete multi-byte
518 : * sequence at the end. Also, if there is plenty of input left,
519 : * we should be able to convert at least one character, so a
520 : * failure to do so must mean that we've hit a byte sequence
521 : * that's invalid.
522 : */
523 12 : if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
524 6 : cstate->input_reached_error = true;
525 12 : return;
526 : }
527 12 : cstate->raw_buf_index += convertedlen;
528 12 : cstate->input_buf_len += strlen((char *) dst);
529 : }
530 : }
531 :
532 : /*
533 : * Report an encoding or conversion error.
534 : */
535 : static void
536 6 : CopyConversionError(CopyFromState cstate)
537 : {
538 : Assert(cstate->raw_buf_len > 0);
539 : Assert(cstate->input_reached_error);
540 :
541 6 : if (!cstate->need_transcoding)
542 : {
543 : /*
544 : * Everything up to input_buf_len was successfully verified, and
545 : * input_buf_len points to the invalid or incomplete character.
546 : */
547 0 : report_invalid_encoding(cstate->file_encoding,
548 0 : cstate->raw_buf + cstate->input_buf_len,
549 0 : cstate->raw_buf_len - cstate->input_buf_len);
550 : }
551 : else
552 : {
553 : /*
554 : * raw_buf_index points to the invalid or untranslatable character. We
555 : * let the conversion routine report the error, because it can provide
556 : * a more specific error message than we could here. An earlier call
557 : * to the conversion routine in CopyConvertBuf() detected that there
558 : * is an error, now we call the conversion routine again with
559 : * noError=false, to have it throw the error.
560 : */
561 : unsigned char *src;
562 : int srclen;
563 : unsigned char *dst;
564 : int dstlen;
565 :
566 6 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
567 6 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
568 6 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
569 6 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
570 :
571 6 : (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
572 : cstate->file_encoding,
573 : GetDatabaseEncoding(),
574 : src, srclen,
575 : dst, dstlen,
576 : false);
577 :
578 : /*
579 : * The conversion routine should have reported an error, so this
580 : * should not be reached.
581 : */
582 0 : elog(ERROR, "encoding conversion failed without error");
583 : }
584 : }
585 :
586 : /*
587 : * Load more data from data source to raw_buf.
588 : *
589 : * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
590 : * beginning of the buffer, and we load new data after that.
591 : */
592 : static void
593 212585 : CopyLoadRawBuf(CopyFromState cstate)
594 : {
595 : int nbytes;
596 : int inbytes;
597 :
598 : /*
599 : * In text mode, if encoding conversion is not required, raw_buf and
600 : * input_buf point to the same buffer. Their len/index better agree, too.
601 : */
602 212585 : if (cstate->raw_buf == cstate->input_buf)
603 : {
604 : Assert(!cstate->need_transcoding);
605 : Assert(cstate->raw_buf_index == cstate->input_buf_index);
606 : Assert(cstate->input_buf_len <= cstate->raw_buf_len);
607 : }
608 :
609 : /*
610 : * Copy down the unprocessed data if any.
611 : */
612 212585 : nbytes = RAW_BUF_BYTES(cstate);
613 212585 : if (nbytes > 0 && cstate->raw_buf_index > 0)
614 0 : memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
615 : nbytes);
616 212585 : cstate->raw_buf_len -= cstate->raw_buf_index;
617 212585 : cstate->raw_buf_index = 0;
618 :
619 : /*
620 : * If raw_buf and input_buf are in fact the same buffer, adjust the
621 : * input_buf variables, too.
622 : */
623 212585 : if (cstate->raw_buf == cstate->input_buf)
624 : {
625 212531 : cstate->input_buf_len -= cstate->input_buf_index;
626 212531 : cstate->input_buf_index = 0;
627 : }
628 :
629 : /* Load more data */
630 212585 : inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
631 212585 : 1, RAW_BUF_SIZE - cstate->raw_buf_len);
632 212583 : nbytes += inbytes;
633 212583 : cstate->raw_buf[nbytes] = '\0';
634 212583 : cstate->raw_buf_len = nbytes;
635 :
636 212583 : cstate->bytes_processed += inbytes;
637 212583 : pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
638 :
639 212583 : if (inbytes == 0)
640 798 : cstate->raw_reached_eof = true;
641 212583 : }
642 :
643 : /*
644 : * CopyLoadInputBuf loads some more data into input_buf
645 : *
646 : * On return, at least one more input character is loaded into
647 : * input_buf, or input_reached_eof is set.
648 : *
649 : * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
650 : * of the buffer and then we load more data after that.
651 : */
652 : static void
653 212564 : CopyLoadInputBuf(CopyFromState cstate)
654 : {
655 212564 : int nbytes = INPUT_BUF_BYTES(cstate);
656 :
657 : /*
658 : * The caller has updated input_buf_index to indicate how much of the
659 : * input has been consumed and isn't needed anymore. If input_buf is the
660 : * same physical area as raw_buf, update raw_buf_index accordingly.
661 : */
662 212564 : if (cstate->raw_buf == cstate->input_buf)
663 : {
664 : Assert(!cstate->need_transcoding);
665 : Assert(cstate->input_buf_index >= cstate->raw_buf_index);
666 212534 : cstate->raw_buf_index = cstate->input_buf_index;
667 : }
668 :
669 : for (;;)
670 : {
671 : /* If we now have some unconverted data, try to convert it */
672 425129 : CopyConvertBuf(cstate);
673 :
674 : /* If we now have some more input bytes ready, return them */
675 425129 : if (INPUT_BUF_BYTES(cstate) > nbytes)
676 211767 : return;
677 :
678 : /*
679 : * If we reached an invalid byte sequence, or we're at an incomplete
680 : * multi-byte character but there is no more raw input data, report
681 : * conversion error.
682 : */
683 213362 : if (cstate->input_reached_error)
684 6 : CopyConversionError(cstate);
685 :
686 : /* no more input, and everything has been converted */
687 213356 : if (cstate->input_reached_eof)
688 789 : break;
689 :
690 : /* Try to load more raw data */
691 : Assert(!cstate->raw_reached_eof);
692 212567 : CopyLoadRawBuf(cstate);
693 : }
694 : }
695 :
696 : /*
697 : * CopyReadBinaryData
698 : *
699 : * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
700 : * and writes them to 'dest'. Returns the number of bytes read (which
701 : * would be less than 'nbytes' only if we reach EOF).
702 : */
703 : static int
704 191 : CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
705 : {
706 191 : int copied_bytes = 0;
707 :
708 191 : if (RAW_BUF_BYTES(cstate) >= nbytes)
709 : {
710 : /* Enough bytes are present in the buffer. */
711 173 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
712 173 : cstate->raw_buf_index += nbytes;
713 173 : copied_bytes = nbytes;
714 : }
715 : else
716 : {
717 : /*
718 : * Not enough bytes in the buffer, so must read from the file. Need
719 : * to loop since 'nbytes' could be larger than the buffer size.
720 : */
721 : do
722 : {
723 : int copy_bytes;
724 :
725 : /* Load more data if buffer is empty. */
726 18 : if (RAW_BUF_BYTES(cstate) == 0)
727 : {
728 18 : CopyLoadRawBuf(cstate);
729 18 : if (cstate->raw_reached_eof)
730 6 : break; /* EOF */
731 : }
732 :
733 : /* Transfer some bytes. */
734 12 : copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
735 12 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
736 12 : cstate->raw_buf_index += copy_bytes;
737 12 : dest += copy_bytes;
738 12 : copied_bytes += copy_bytes;
739 12 : } while (copied_bytes < nbytes);
740 : }
741 :
742 191 : return copied_bytes;
743 : }
744 :
745 : /*
746 : * This function is exposed for use by extensions that read raw fields in the
747 : * next line. See NextCopyFromRawFieldsInternal() for details.
748 : */
749 : bool
750 0 : NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
751 : {
752 0 : return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
753 0 : cstate->opts.csv_mode);
754 : }
755 :
756 : /*
757 : * Workhorse for NextCopyFromRawFields().
758 : *
759 : * Read raw fields in the next line for COPY FROM in text or csv mode. Return
760 : * false if no more lines.
761 : *
762 : * An internal temporary buffer is returned via 'fields'. It is valid until
763 : * the next call of the function. Since the function returns all raw fields
764 : * in the input file, 'nfields' could be different from the number of columns
765 : * in the relation.
766 : *
767 : * NOTE: force_not_null option are not applied to the returned fields.
768 : *
769 : * We use pg_attribute_always_inline to reduce function call overhead
770 : * and to help compilers to optimize away the 'is_csv' condition when called
771 : * by internal functions such as CopyFromTextLikeOneRow().
772 : */
773 : static pg_attribute_always_inline bool
774 630774 : NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
775 : {
776 : int fldct;
777 630774 : bool done = false;
778 :
779 : /* only available for text or csv input */
780 : Assert(!cstate->opts.binary);
781 :
782 : /* on input check that the header line is correct if needed */
783 630774 : if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE)
784 : {
785 : ListCell *cur;
786 : TupleDesc tupDesc;
787 74 : int lines_to_skip = cstate->opts.header_line;
788 :
789 : /* If set to "match", one header line is skipped */
790 74 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
791 38 : lines_to_skip = 1;
792 :
793 74 : tupDesc = RelationGetDescr(cstate->rel);
794 :
795 173 : for (int i = 0; i < lines_to_skip; i++)
796 : {
797 103 : cstate->cur_lineno++;
798 103 : if ((done = CopyReadLine(cstate, is_csv)))
799 4 : break;
800 : }
801 :
802 74 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
803 : {
804 : int fldnum;
805 :
806 38 : if (is_csv)
807 5 : fldct = CopyReadAttributesCSV(cstate);
808 : else
809 33 : fldct = CopyReadAttributesText(cstate);
810 :
811 38 : if (fldct != list_length(cstate->attnumlist))
812 12 : ereport(ERROR,
813 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
814 : errmsg("wrong number of fields in header line: got %d, expected %d",
815 : fldct, list_length(cstate->attnumlist))));
816 :
817 26 : fldnum = 0;
818 79 : foreach(cur, cstate->attnumlist)
819 : {
820 63 : int attnum = lfirst_int(cur);
821 : char *colName;
822 63 : Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
823 :
824 : Assert(fldnum < cstate->max_fields);
825 :
826 63 : colName = cstate->raw_fields[fldnum++];
827 63 : if (colName == NULL)
828 3 : ereport(ERROR,
829 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
830 : errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
831 : fldnum, cstate->opts.null_print, NameStr(attr->attname))));
832 :
833 60 : if (namestrcmp(&attr->attname, colName) != 0)
834 : {
835 7 : ereport(ERROR,
836 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
837 : errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
838 : fldnum, colName, NameStr(attr->attname))));
839 : }
840 : }
841 : }
842 :
843 52 : if (done)
844 4 : return false;
845 : }
846 :
847 630748 : cstate->cur_lineno++;
848 :
849 : /* Actually read the line into memory here */
850 630748 : done = CopyReadLine(cstate, is_csv);
851 :
852 : /*
853 : * EOF at start of line means we're done. If we see EOF after some
854 : * characters, we act as though it was newline followed by EOF, ie,
855 : * process the line and then exit loop on next iteration.
856 : */
857 630734 : if (done && cstate->line_buf.len == 0)
858 827 : return false;
859 :
860 : /* Parse the line into de-escaped field values */
861 629907 : if (is_csv)
862 252 : fldct = CopyReadAttributesCSV(cstate);
863 : else
864 629655 : fldct = CopyReadAttributesText(cstate);
865 :
866 629901 : *fields = cstate->raw_fields;
867 629901 : *nfields = fldct;
868 629901 : return true;
869 : }
870 :
871 : /*
872 : * Read next tuple from file for COPY FROM. Return false if no more tuples.
873 : *
874 : * 'econtext' is used to evaluate default expression for each column that is
875 : * either not read from the file or is using the DEFAULT option of COPY FROM.
876 : * It can be NULL when no default values are used, i.e. when all columns are
877 : * read from the file, and DEFAULT option is unset.
878 : *
879 : * 'values' and 'nulls' arrays must be the same length as columns of the
880 : * relation passed to BeginCopyFrom. This function fills the arrays.
881 : */
882 : bool
883 630795 : NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
884 : Datum *values, bool *nulls)
885 : {
886 : TupleDesc tupDesc;
887 : AttrNumber num_phys_attrs,
888 630795 : num_defaults = cstate->num_defaults;
889 : int i;
890 630795 : int *defmap = cstate->defmap;
891 630795 : ExprState **defexprs = cstate->defexprs;
892 :
893 630795 : tupDesc = RelationGetDescr(cstate->rel);
894 630795 : num_phys_attrs = tupDesc->natts;
895 :
896 : /* Initialize all values for row to NULL */
897 2962704 : MemSet(values, 0, num_phys_attrs * sizeof(Datum));
898 630795 : MemSet(nulls, true, num_phys_attrs * sizeof(bool));
899 702849 : MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
900 :
901 : /* Get one row from source */
902 630795 : if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
903 837 : return false;
904 :
905 : /*
906 : * Now compute and insert any defaults available for the columns not
907 : * provided by the input data. Anything not processed here or above will
908 : * remain NULL.
909 : */
910 660128 : for (i = 0; i < num_defaults; i++)
911 : {
912 : /*
913 : * The caller must supply econtext and have switched into the
914 : * per-tuple memory context in it.
915 : */
916 : Assert(econtext != NULL);
917 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
918 :
919 30265 : values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
920 30265 : &nulls[defmap[i]]);
921 : }
922 :
923 629863 : return true;
924 : }
925 :
926 : /* Implementation of the per-row callback for text format */
927 : bool
928 630397 : CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
929 : bool *nulls)
930 : {
931 630397 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
932 : }
933 :
934 : /* Implementation of the per-row callback for CSV format */
935 : bool
936 377 : CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
937 : bool *nulls)
938 : {
939 377 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
940 : }
941 :
942 : /*
943 : * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
944 : *
945 : * We use pg_attribute_always_inline to reduce function call overhead
946 : * and to help compilers to optimize away the 'is_csv' condition.
947 : */
948 : static pg_attribute_always_inline bool
949 630774 : CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext,
950 : Datum *values, bool *nulls, bool is_csv)
951 : {
952 : TupleDesc tupDesc;
953 : AttrNumber attr_count;
954 630774 : FmgrInfo *in_functions = cstate->in_functions;
955 630774 : Oid *typioparams = cstate->typioparams;
956 630774 : ExprState **defexprs = cstate->defexprs;
957 : char **field_strings;
958 : ListCell *cur;
959 : int fldct;
960 : int fieldno;
961 : char *string;
962 630774 : bool current_row_erroneous = false;
963 :
964 630774 : tupDesc = RelationGetDescr(cstate->rel);
965 630774 : attr_count = list_length(cstate->attnumlist);
966 :
967 : /* read raw fields in the next line */
968 630774 : if (!NextCopyFromRawFieldsInternal(cstate, &field_strings, &fldct, is_csv))
969 831 : return false;
970 :
971 : /* check for overflowing fields */
972 629901 : if (attr_count > 0 && fldct > attr_count)
973 12 : ereport(ERROR,
974 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
975 : errmsg("extra data after last expected column")));
976 :
977 629889 : fieldno = 0;
978 :
979 : /* Loop to read the user attributes on the line. */
980 2891280 : foreach(cur, cstate->attnumlist)
981 : {
982 2261495 : int attnum = lfirst_int(cur);
983 2261495 : int m = attnum - 1;
984 2261495 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
985 :
986 2261495 : if (fieldno >= fldct)
987 12 : ereport(ERROR,
988 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
989 : errmsg("missing data for column \"%s\"",
990 : NameStr(att->attname))));
991 2261483 : string = field_strings[fieldno++];
992 :
993 2261483 : if (cstate->convert_select_flags &&
994 10 : !cstate->convert_select_flags[m])
995 : {
996 : /* ignore input field, leaving column as NULL */
997 5 : continue;
998 : }
999 :
1000 2261478 : if (is_csv)
1001 : {
1002 503 : if (string == NULL &&
1003 22 : cstate->opts.force_notnull_flags[m])
1004 : {
1005 : /*
1006 : * FORCE_NOT_NULL option is set and column is NULL - convert
1007 : * it to the NULL string.
1008 : */
1009 14 : string = cstate->opts.null_print;
1010 : }
1011 489 : else if (string != NULL && cstate->opts.force_null_flags[m]
1012 25 : && strcmp(string, cstate->opts.null_print) == 0)
1013 : {
1014 : /*
1015 : * FORCE_NULL option is set and column matches the NULL
1016 : * string. It must have been quoted, or otherwise the string
1017 : * would already have been set to NULL. Convert it to NULL as
1018 : * specified.
1019 : */
1020 13 : string = NULL;
1021 : }
1022 : }
1023 :
1024 2261478 : cstate->cur_attname = NameStr(att->attname);
1025 2261478 : cstate->cur_attval = string;
1026 :
1027 2261478 : if (string != NULL)
1028 2259053 : nulls[m] = false;
1029 :
1030 2261478 : if (cstate->defaults[m])
1031 : {
1032 : /* We must have switched into the per-tuple memory context */
1033 : Assert(econtext != NULL);
1034 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
1035 :
1036 30 : values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
1037 : }
1038 :
1039 : /*
1040 : * If ON_ERROR is specified, handle the different options
1041 : */
1042 2261429 : else if (!InputFunctionCallSafe(&in_functions[m],
1043 : string,
1044 2261448 : typioparams[m],
1045 : att->atttypmod,
1046 2261448 : (Node *) cstate->escontext,
1047 2261448 : &values[m]))
1048 : {
1049 : Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP);
1050 :
1051 85 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1052 64 : cstate->num_errors++;
1053 21 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1054 : {
1055 : /*
1056 : * Reset error state so the subsequent InputFunctionCallSafe
1057 : * call (for domain constraint check) can properly report
1058 : * whether it succeeded or failed.
1059 : */
1060 21 : cstate->escontext->error_occurred = false;
1061 :
1062 : Assert(cstate->domain_with_constraint != NULL);
1063 :
1064 : /*
1065 : * For constrained domains, we need an additional
1066 : * InputFunctionCallSafe() to ensure that an error is thrown
1067 : * if the domain constraint rejects null values.
1068 : */
1069 36 : if (!cstate->domain_with_constraint[m] ||
1070 15 : InputFunctionCallSafe(&in_functions[m],
1071 : NULL,
1072 15 : typioparams[m],
1073 : att->atttypmod,
1074 15 : (Node *) cstate->escontext,
1075 15 : &values[m]))
1076 : {
1077 12 : nulls[m] = true;
1078 12 : values[m] = (Datum) 0;
1079 : }
1080 : else
1081 9 : ereport(ERROR,
1082 : errcode(ERRCODE_NOT_NULL_VIOLATION),
1083 : errmsg("domain %s does not allow null values",
1084 : format_type_be(typioparams[m])),
1085 : errdetail("ON_ERROR SET_NULL cannot be applied because column \"%s\" (domain %s) does not accept null values.",
1086 : cstate->cur_attname,
1087 : format_type_be(typioparams[m])),
1088 : errdatatype(typioparams[m]));
1089 :
1090 : /*
1091 : * We count only the number of rows (not fields) where
1092 : * ON_ERROR SET_NULL was applied.
1093 : */
1094 12 : if (!current_row_erroneous)
1095 : {
1096 9 : current_row_erroneous = true;
1097 9 : cstate->num_errors++;
1098 : }
1099 : }
1100 :
1101 76 : if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE)
1102 : {
1103 : /*
1104 : * Since we emit line number and column info in the below
1105 : * notice message, we suppress error context information other
1106 : * than the relation name.
1107 : */
1108 : Assert(!cstate->relname_only);
1109 33 : cstate->relname_only = true;
1110 :
1111 33 : if (cstate->cur_attval)
1112 : {
1113 : char *attval;
1114 :
1115 30 : attval = CopyLimitPrintoutLength(cstate->cur_attval);
1116 :
1117 30 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1118 18 : ereport(NOTICE,
1119 : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1120 : cstate->cur_lineno,
1121 : cstate->cur_attname,
1122 : attval));
1123 12 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1124 12 : ereport(NOTICE,
1125 : errmsg("setting to null due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1126 : cstate->cur_lineno,
1127 : cstate->cur_attname,
1128 : attval));
1129 30 : pfree(attval);
1130 : }
1131 : else
1132 : {
1133 3 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1134 3 : ereport(NOTICE,
1135 : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
1136 : cstate->cur_lineno,
1137 : cstate->cur_attname));
1138 : }
1139 : /* reset relname_only */
1140 33 : cstate->relname_only = false;
1141 : }
1142 :
1143 76 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1144 64 : return true;
1145 12 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1146 12 : continue;
1147 : }
1148 :
1149 2261374 : cstate->cur_attname = NULL;
1150 2261374 : cstate->cur_attval = NULL;
1151 : }
1152 :
1153 : Assert(fieldno == attr_count);
1154 :
1155 629785 : return true;
1156 : }
1157 :
1158 : /* Implementation of the per-row callback for binary format */
1159 : bool
1160 21 : CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
1161 : bool *nulls)
1162 : {
1163 : TupleDesc tupDesc;
1164 : AttrNumber attr_count;
1165 21 : FmgrInfo *in_functions = cstate->in_functions;
1166 21 : Oid *typioparams = cstate->typioparams;
1167 : int16 fld_count;
1168 : ListCell *cur;
1169 :
1170 21 : tupDesc = RelationGetDescr(cstate->rel);
1171 21 : attr_count = list_length(cstate->attnumlist);
1172 :
1173 21 : cstate->cur_lineno++;
1174 :
1175 21 : if (!CopyGetInt16(cstate, &fld_count))
1176 : {
1177 : /* EOF detected (end of file, or protocol-level EOF) */
1178 0 : return false;
1179 : }
1180 :
1181 21 : if (fld_count == -1)
1182 : {
1183 : /*
1184 : * Received EOF marker. Wait for the protocol-level EOF, and complain
1185 : * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1186 : * that we correctly handle CopyFail, if client chooses to send that
1187 : * now. When copying from file, we could ignore the rest of the file
1188 : * like in text mode, but we choose to be consistent with the COPY
1189 : * FROM STDIN case.
1190 : */
1191 : char dummy;
1192 :
1193 6 : if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
1194 0 : ereport(ERROR,
1195 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1196 : errmsg("received copy data after EOF marker")));
1197 6 : return false;
1198 : }
1199 :
1200 15 : if (fld_count != attr_count)
1201 0 : ereport(ERROR,
1202 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1203 : errmsg("row field count is %d, expected %d",
1204 : fld_count, attr_count)));
1205 :
1206 93 : foreach(cur, cstate->attnumlist)
1207 : {
1208 79 : int attnum = lfirst_int(cur);
1209 79 : int m = attnum - 1;
1210 79 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1211 :
1212 79 : cstate->cur_attname = NameStr(att->attname);
1213 157 : values[m] = CopyReadBinaryAttribute(cstate,
1214 79 : &in_functions[m],
1215 79 : typioparams[m],
1216 : att->atttypmod,
1217 : &nulls[m]);
1218 78 : cstate->cur_attname = NULL;
1219 : }
1220 :
1221 14 : return true;
1222 : }
1223 :
1224 : /*
1225 : * Read the next input line and stash it in line_buf.
1226 : *
1227 : * Result is true if read was terminated by EOF, false if terminated
1228 : * by newline. The terminating newline or EOF marker is not included
1229 : * in the final value of line_buf.
1230 : */
1231 : static bool
1232 630851 : CopyReadLine(CopyFromState cstate, bool is_csv)
1233 : {
1234 : bool result;
1235 :
1236 630851 : resetStringInfo(&cstate->line_buf);
1237 630851 : cstate->line_buf_valid = false;
1238 :
1239 : /*
1240 : * Parse data and transfer into line_buf.
1241 : *
1242 : * Because this is performance critical, we inline CopyReadLineText() and
1243 : * pass the boolean parameters as constants to allow the compiler to emit
1244 : * specialized code with fewer branches.
1245 : */
1246 630851 : if (is_csv)
1247 439 : result = CopyReadLineText(cstate, true);
1248 : else
1249 630412 : result = CopyReadLineText(cstate, false);
1250 :
1251 630837 : if (result)
1252 : {
1253 : /*
1254 : * Reached EOF. In protocol version 3, we should ignore anything
1255 : * after \. up to the protocol end of copy data. (XXX maybe better
1256 : * not to treat \. as special?)
1257 : */
1258 834 : if (cstate->copy_src == COPY_FRONTEND)
1259 : {
1260 : int inbytes;
1261 :
1262 : do
1263 : {
1264 421 : inbytes = CopyGetData(cstate, cstate->input_buf,
1265 : 1, INPUT_BUF_SIZE);
1266 421 : } while (inbytes > 0);
1267 421 : cstate->input_buf_index = 0;
1268 421 : cstate->input_buf_len = 0;
1269 421 : cstate->raw_buf_index = 0;
1270 421 : cstate->raw_buf_len = 0;
1271 : }
1272 : }
1273 : else
1274 : {
1275 : /*
1276 : * If we didn't hit EOF, then we must have transferred the EOL marker
1277 : * to line_buf along with the data. Get rid of it.
1278 : */
1279 630003 : switch (cstate->eol_type)
1280 : {
1281 630003 : case EOL_NL:
1282 : Assert(cstate->line_buf.len >= 1);
1283 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1284 630003 : cstate->line_buf.len--;
1285 630003 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1286 630003 : break;
1287 0 : case EOL_CR:
1288 : Assert(cstate->line_buf.len >= 1);
1289 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1290 0 : cstate->line_buf.len--;
1291 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1292 0 : break;
1293 0 : case EOL_CRNL:
1294 : Assert(cstate->line_buf.len >= 2);
1295 : Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1296 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1297 0 : cstate->line_buf.len -= 2;
1298 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1299 0 : break;
1300 0 : case EOL_UNKNOWN:
1301 : /* shouldn't get here */
1302 : Assert(false);
1303 0 : break;
1304 : }
1305 : }
1306 :
1307 : /* Now it's safe to use the buffer in error messages */
1308 630837 : cstate->line_buf_valid = true;
1309 :
1310 630837 : return result;
1311 : }
1312 :
/*
 * CopyReadLineText - inner loop of CopyReadLine for text mode
 *
 * Scans input_buf for the end of the next input line, loading more input
 * through CopyLoadInputBuf() as needed.  'is_csv' selects CSV rules: quote
 * state is tracked so that \r and \n inside a quoted field do not end the
 * line, and backslash is treated as an ordinary character.
 *
 * Returns true if the scan stopped because no more input was available or
 * because an end-of-copy marker (\.) was found, and false if it stopped at
 * a line terminator.  cstate->eol_type is set the first time a line
 * terminator is seen; later terminators that do not match it raise an error.
 *
 * NOTE(review): REFILL_LINEBUF and the IF_NEED_REFILL_* macros are defined
 * outside this function.  They appear to rely on the local variables
 * declared here and to contain the "continue"/"break" their names
 * advertise -- confirm before renaming any local or reordering statements.
 */
static pg_attribute_always_inline bool
CopyReadLineText(CopyFromState cstate, bool is_csv)
{
	char	   *copy_input_buf;
	int			input_buf_ptr;
	int			copy_buf_len;
	bool		need_data = false;
	bool		hit_eof = false;
	bool		result = false;

	/* CSV variables */
	bool		in_quote = false,
				last_was_esc = false;
	char		quotec = '\0';
	char		escapec = '\0';

	if (is_csv)
	{
		quotec = cstate->opts.quote[0];
		escapec = cstate->opts.escape[0];
		/* ignore special escape processing if it's the same as quotec */
		if (quotec == escapec)
			escapec = '\0';
	}

	/*
	 * The objective of this loop is to transfer the entire next input line
	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
	 * \n) and the end-of-copy marker (\.).
	 *
	 * In CSV mode, \r and \n inside a quoted field are just part of the data
	 * value and are put in line_buf.  We keep just enough state to know if we
	 * are currently in a quoted field or not.
	 *
	 * The input has already been converted to the database encoding.  All
	 * supported server encodings have the property that all bytes in a
	 * multi-byte sequence have the high bit set, so a multibyte character
	 * cannot contain any newline or escape characters embedded in the
	 * multibyte sequence.  Therefore, we can process the input byte-by-byte,
	 * regardless of the encoding.
	 *
	 * For speed, we try to move data from input_buf to line_buf in chunks
	 * rather than one character at a time.  input_buf_ptr points to the next
	 * character to examine; any characters from input_buf_index to
	 * input_buf_ptr have been determined to be part of the line, but not yet
	 * transferred to line_buf.
	 *
	 * For a little extra speed within the loop, we copy input_buf and
	 * input_buf_len into local variables.
	 */
	copy_input_buf = cstate->input_buf;
	input_buf_ptr = cstate->input_buf_index;
	copy_buf_len = cstate->input_buf_len;

	for (;;)
	{
		int			prev_raw_ptr;
		char		c;

		/*
		 * Load more data if needed.
		 *
		 * TODO: We could just force four bytes of read-ahead and avoid the
		 * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE().  That was
		 * unsafe with the old v2 COPY protocol, but we don't support that
		 * anymore.
		 */
		if (input_buf_ptr >= copy_buf_len || need_data)
		{
			REFILL_LINEBUF;

			CopyLoadInputBuf(cstate);
			/* update our local variables */
			hit_eof = cstate->input_reached_eof;
			input_buf_ptr = cstate->input_buf_index;
			copy_buf_len = cstate->input_buf_len;

			/*
			 * If we are completely out of data, break out of the loop,
			 * reporting EOF.
			 */
			if (INPUT_BUF_BYTES(cstate) <= 0)
			{
				result = true;
				break;
			}
			need_data = false;
		}

		/* OK to fetch a character */
		prev_raw_ptr = input_buf_ptr;
		c = copy_input_buf[input_buf_ptr++];

		if (is_csv)
		{
			/*
			 * If character is '\r', we may need to look ahead below.  Force
			 * fetch of the next character if we don't already have it.  We
			 * need to do this before changing CSV state, in case '\r' is also
			 * the quote or escape character.
			 */
			if (c == '\r')
			{
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			}

			/*
			 * Dealing with quotes and escapes here is mildly tricky. If the
			 * quote char is also the escape char, there's no problem - we
			 * just use the char as a toggle. If they are different, we need
			 * to ensure that we only take account of an escape inside a
			 * quoted field and immediately preceding a quote char, and not
			 * the second in an escape-escape sequence.
			 */
			if (in_quote && c == escapec)
				last_was_esc = !last_was_esc;
			if (c == quotec && !last_was_esc)
				in_quote = !in_quote;
			if (c != escapec)
				last_was_esc = false;

			/*
			 * Updating the line count for embedded CR and/or LF chars is
			 * necessarily a little fragile - this test is probably about the
			 * best we can do.  (XXX it's arguable whether we should do this
			 * at all --- is cur_lineno a physical or logical count?)
			 */
			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
				cstate->cur_lineno++;
		}

		/* Process \r */
		if (c == '\r' && (!is_csv || !in_quote))
		{
			/* Check for \r\n on first line, _and_ handle \r\n. */
			if (cstate->eol_type == EOL_UNKNOWN ||
				cstate->eol_type == EOL_CRNL)
			{
				/*
				 * If need more data, go back to loop top to load it.
				 *
				 * Note that if we are at EOF, c will wind up as '\0' because
				 * of the guaranteed pad of input_buf.
				 */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);

				/* get next char */
				c = copy_input_buf[input_buf_ptr];

				if (c == '\n')
				{
					input_buf_ptr++;	/* eat newline */
					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
				}
				else
				{
					/* found \r, but no \n */
					if (cstate->eol_type == EOL_CRNL)
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 !is_csv ?
								 errmsg("literal carriage return found in data") :
								 errmsg("unquoted carriage return found in data"),
								 !is_csv ?
								 errhint("Use \"\\r\" to represent carriage return.") :
								 errhint("Use quoted CSV field to represent carriage return.")));

					/*
					 * if we got here, it is the first line and we didn't find
					 * \n, so don't consume the peeked character
					 */
					cstate->eol_type = EOL_CR;
				}
			}
			else if (cstate->eol_type == EOL_NL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !is_csv ?
						 errmsg("literal carriage return found in data") :
						 errmsg("unquoted carriage return found in data"),
						 !is_csv ?
						 errhint("Use \"\\r\" to represent carriage return.") :
						 errhint("Use quoted CSV field to represent carriage return.")));
			/* If reach here, we have found the line terminator */
			break;
		}

		/* Process \n */
		if (c == '\n' && (!is_csv || !in_quote))
		{
			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !is_csv ?
						 errmsg("literal newline found in data") :
						 errmsg("unquoted newline found in data"),
						 !is_csv ?
						 errhint("Use \"\\n\" to represent newline.") :
						 errhint("Use quoted CSV field to represent newline.")));
			cstate->eol_type = EOL_NL;	/* in case not set yet */
			/* If reach here, we have found the line terminator */
			break;
		}

		/*
		 * Process backslash, except in CSV mode where backslash is a normal
		 * character.
		 */
		if (c == '\\' && !is_csv)
		{
			char		c2;

			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			IF_NEED_REFILL_AND_EOF_BREAK(0);

			/* -----
			 * get next character
			 * Note: we do not change c so if it isn't \., we can fall
			 * through and continue processing.
			 * -----
			 */
			c2 = copy_input_buf[input_buf_ptr];

			if (c2 == '.')
			{
				input_buf_ptr++;	/* consume the '.' */
				if (cstate->eol_type == EOL_CRNL)
				{
					/* Get the next character */
					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
					/* if hit_eof, c2 will become '\0' */
					c2 = copy_input_buf[input_buf_ptr++];

					/* with \r\n line endings, \. must be followed by \r */
					if (c2 == '\n')
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker does not match previous newline style")));
					else if (c2 != '\r')
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker is not alone on its line")));
				}

				/* Get the next character */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
				/* if hit_eof, c2 will become '\0' */
				c2 = copy_input_buf[input_buf_ptr++];

				if (c2 != '\r' && c2 != '\n')
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker is not alone on its line")));

				/* the marker's terminator must match the established style */
				if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
					(cstate->eol_type == EOL_CRNL && c2 != '\n') ||
					(cstate->eol_type == EOL_CR && c2 != '\r'))
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker does not match previous newline style")));

				/*
				 * If there is any data on this line before the \., complain.
				 */
				if (cstate->line_buf.len > 0 ||
					prev_raw_ptr > cstate->input_buf_index)
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker is not alone on its line")));

				/*
				 * Discard the \. and newline, then report EOF.
				 */
				cstate->input_buf_index = input_buf_ptr;
				result = true;	/* report EOF */
				break;
			}
			else
			{
				/*
				 * If we are here, it means we found a backslash followed by
				 * something other than a period.  In non-CSV mode, anything
				 * after a backslash is special, so we skip over that second
				 * character too.  If we didn't do that \\. would be
				 * considered an end-of-copy marker, while in non-CSV mode it
				 * is a literal backslash followed by a period.
				 */
				input_buf_ptr++;
			}
		}
	}							/* end of outer loop */

	/*
	 * Transfer any still-uncopied data to line_buf.
	 */
	REFILL_LINEBUF;

	return result;
}
1614 :
/*
 * Return the numeric value (0..15) of a hexadecimal digit.
 *
 * The argument is not validated here: anything that is not a decimal digit
 * is treated as a letter and mapped relative to 'a' after ASCII
 * lower-casing, so the result is only meaningful for hexadecimal digits.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char ch = (unsigned char) hex;

	return isdigit(ch) ? hex - '0' : pg_ascii_tolower(ch) - 'a' + 10;
}
1626 :
1627 : /*
1628 : * Parse the current line into separate attributes (fields),
1629 : * performing de-escaping as needed.
1630 : *
1631 : * The input is in line_buf. We use attribute_buf to hold the result
1632 : * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1633 : * string, or NULL when the input matches the null marker string.
1634 : * This array is expanded as necessary.
1635 : *
1636 : * (Note that the caller cannot check for nulls since the returned
1637 : * string would be the post-de-escaping equivalent, which may look
1638 : * the same as some valid data string.)
1639 : *
1640 : * delim is the column delimiter string (must be just one byte for now).
1641 : * null_print is the null marker string. Note that this is compared to
1642 : * the pre-de-escaped input string.
1643 : *
1644 : * The return value is the number of fields actually read.
1645 : */
1646 : static int
1647 629688 : CopyReadAttributesText(CopyFromState cstate)
1648 : {
1649 629688 : char delimc = cstate->opts.delim[0];
1650 : int fieldno;
1651 : char *output_ptr;
1652 : char *cur_ptr;
1653 : char *line_end_ptr;
1654 :
1655 : /*
1656 : * We need a special case for zero-column tables: check that the input
1657 : * line is empty, and return.
1658 : */
1659 629688 : if (cstate->max_fields <= 0)
1660 : {
1661 4 : if (cstate->line_buf.len != 0)
1662 0 : ereport(ERROR,
1663 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1664 : errmsg("extra data after last expected column")));
1665 4 : return 0;
1666 : }
1667 :
1668 629684 : resetStringInfo(&cstate->attribute_buf);
1669 :
1670 : /*
1671 : * The de-escaped attributes will certainly not be longer than the input
1672 : * data line, so we can just force attribute_buf to be large enough and
1673 : * then transfer data without any checks for enough space. We need to do
1674 : * it this way because enlarging attribute_buf mid-stream would invalidate
1675 : * pointers already stored into cstate->raw_fields[].
1676 : */
1677 629684 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1678 4 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1679 629684 : output_ptr = cstate->attribute_buf.data;
1680 :
1681 : /* set pointer variables for loop */
1682 629684 : cur_ptr = cstate->line_buf.data;
1683 629684 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1684 :
1685 : /* Outer loop iterates over fields */
1686 629684 : fieldno = 0;
1687 : for (;;)
1688 1631510 : {
1689 2261194 : bool found_delim = false;
1690 : char *start_ptr;
1691 : char *end_ptr;
1692 : int input_len;
1693 2261194 : bool saw_non_ascii = false;
1694 :
1695 : /* Make sure there is enough space for the next value */
1696 2261194 : if (fieldno >= cstate->max_fields)
1697 : {
1698 21 : cstate->max_fields *= 2;
1699 21 : cstate->raw_fields =
1700 21 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1701 : }
1702 :
1703 : /* Remember start of field on both input and output sides */
1704 2261194 : start_ptr = cur_ptr;
1705 2261194 : cstate->raw_fields[fieldno] = output_ptr;
1706 :
1707 : /*
1708 : * Scan data for field.
1709 : *
1710 : * Note that in this loop, we are scanning to locate the end of field
1711 : * and also speculatively performing de-escaping. Once we find the
1712 : * end-of-field, we can match the raw field contents against the null
1713 : * marker string. Only after that comparison fails do we know that
1714 : * de-escaping is actually the right thing to do; therefore we *must
1715 : * not* throw any syntax errors before we've done the null-marker
1716 : * check.
1717 : */
1718 : for (;;)
1719 11120092 : {
1720 : char c;
1721 :
1722 13381286 : end_ptr = cur_ptr;
1723 13381286 : if (cur_ptr >= line_end_ptr)
1724 629681 : break;
1725 12751605 : c = *cur_ptr++;
1726 12751605 : if (c == delimc)
1727 : {
1728 1631513 : found_delim = true;
1729 1631513 : break;
1730 : }
1731 11120092 : if (c == '\\')
1732 : {
1733 4000 : if (cur_ptr >= line_end_ptr)
1734 0 : break;
1735 4000 : c = *cur_ptr++;
1736 4000 : switch (c)
1737 : {
1738 6 : case '0':
1739 : case '1':
1740 : case '2':
1741 : case '3':
1742 : case '4':
1743 : case '5':
1744 : case '6':
1745 : case '7':
1746 : {
1747 : /* handle \013 */
1748 : int val;
1749 :
1750 6 : val = OCTVALUE(c);
1751 6 : if (cur_ptr < line_end_ptr)
1752 : {
1753 3 : c = *cur_ptr;
1754 3 : if (ISOCTAL(c))
1755 : {
1756 0 : cur_ptr++;
1757 0 : val = (val << 3) + OCTVALUE(c);
1758 0 : if (cur_ptr < line_end_ptr)
1759 : {
1760 0 : c = *cur_ptr;
1761 0 : if (ISOCTAL(c))
1762 : {
1763 0 : cur_ptr++;
1764 0 : val = (val << 3) + OCTVALUE(c);
1765 : }
1766 : }
1767 : }
1768 : }
1769 6 : c = val & 0377;
1770 6 : if (c == '\0' || IS_HIGHBIT_SET(c))
1771 6 : saw_non_ascii = true;
1772 : }
1773 6 : break;
1774 6 : case 'x':
1775 : /* Handle \x3F */
1776 6 : if (cur_ptr < line_end_ptr)
1777 : {
1778 3 : char hexchar = *cur_ptr;
1779 :
1780 3 : if (isxdigit((unsigned char) hexchar))
1781 : {
1782 0 : int val = GetDecimalFromHex(hexchar);
1783 :
1784 0 : cur_ptr++;
1785 0 : if (cur_ptr < line_end_ptr)
1786 : {
1787 0 : hexchar = *cur_ptr;
1788 0 : if (isxdigit((unsigned char) hexchar))
1789 : {
1790 0 : cur_ptr++;
1791 0 : val = (val << 4) + GetDecimalFromHex(hexchar);
1792 : }
1793 : }
1794 0 : c = val & 0xff;
1795 0 : if (c == '\0' || IS_HIGHBIT_SET(c))
1796 0 : saw_non_ascii = true;
1797 : }
1798 : }
1799 6 : break;
1800 0 : case 'b':
1801 0 : c = '\b';
1802 0 : break;
1803 0 : case 'f':
1804 0 : c = '\f';
1805 0 : break;
1806 1525 : case 'n':
1807 1525 : c = '\n';
1808 1525 : break;
1809 0 : case 'r':
1810 0 : c = '\r';
1811 0 : break;
1812 0 : case 't':
1813 0 : c = '\t';
1814 0 : break;
1815 0 : case 'v':
1816 0 : c = '\v';
1817 0 : break;
1818 :
1819 : /*
1820 : * in all other cases, take the char after '\'
1821 : * literally
1822 : */
1823 : }
1824 : }
1825 :
1826 : /* Add c to output string */
1827 11120092 : *output_ptr++ = c;
1828 : }
1829 :
1830 : /* Check whether raw input matched null marker */
1831 2261194 : input_len = end_ptr - start_ptr;
1832 2261194 : if (input_len == cstate->opts.null_print_len &&
1833 125387 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1834 2407 : cstate->raw_fields[fieldno] = NULL;
1835 : /* Check whether raw input matched default marker */
1836 2258787 : else if (fieldno < list_length(cstate->attnumlist) &&
1837 2258763 : cstate->opts.default_print &&
1838 57 : input_len == cstate->opts.default_print_len &&
1839 15 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1840 12 : {
1841 : /* fieldno is 0-indexed and attnum is 1-indexed */
1842 15 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1843 :
1844 15 : if (cstate->defexprs[m] != NULL)
1845 : {
1846 : /* defaults contain entries for all physical attributes */
1847 12 : cstate->defaults[m] = true;
1848 : }
1849 : else
1850 : {
1851 3 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1852 3 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1853 :
1854 3 : ereport(ERROR,
1855 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1856 : errmsg("unexpected default marker in COPY data"),
1857 : errdetail("Column \"%s\" has no default value.",
1858 : NameStr(att->attname))));
1859 : }
1860 : }
1861 : else
1862 : {
1863 : /*
1864 : * At this point we know the field is supposed to contain data.
1865 : *
1866 : * If we de-escaped any non-7-bit-ASCII chars, make sure the
1867 : * resulting string is valid data for the db encoding.
1868 : */
1869 2258772 : if (saw_non_ascii)
1870 : {
1871 0 : char *fld = cstate->raw_fields[fieldno];
1872 :
1873 0 : pg_verifymbstr(fld, output_ptr - fld, false);
1874 : }
1875 : }
1876 :
1877 : /* Terminate attribute value in output area */
1878 2261191 : *output_ptr++ = '\0';
1879 :
1880 2261191 : fieldno++;
1881 : /* Done if we hit EOL instead of a delim */
1882 2261191 : if (!found_delim)
1883 629681 : break;
1884 : }
1885 :
1886 : /* Clean up state of attribute_buf */
1887 629681 : output_ptr--;
1888 : Assert(*output_ptr == '\0');
1889 629681 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1890 :
1891 629681 : return fieldno;
1892 : }
1893 :
1894 : /*
1895 : * Parse the current line into separate attributes (fields),
1896 : * performing de-escaping as needed. This has exactly the same API as
1897 : * CopyReadAttributesText, except we parse the fields according to
1898 : * "standard" (i.e. common) CSV usage.
1899 : */
1900 : static int
1901 257 : CopyReadAttributesCSV(CopyFromState cstate)
1902 : {
1903 257 : char delimc = cstate->opts.delim[0];
1904 257 : char quotec = cstate->opts.quote[0];
1905 257 : char escapec = cstate->opts.escape[0];
1906 : int fieldno;
1907 : char *output_ptr;
1908 : char *cur_ptr;
1909 : char *line_end_ptr;
1910 :
1911 : /*
1912 : * We need a special case for zero-column tables: check that the input
1913 : * line is empty, and return.
1914 : */
1915 257 : if (cstate->max_fields <= 0)
1916 : {
1917 0 : if (cstate->line_buf.len != 0)
1918 0 : ereport(ERROR,
1919 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1920 : errmsg("extra data after last expected column")));
1921 0 : return 0;
1922 : }
1923 :
1924 257 : resetStringInfo(&cstate->attribute_buf);
1925 :
1926 : /*
1927 : * The de-escaped attributes will certainly not be longer than the input
1928 : * data line, so we can just force attribute_buf to be large enough and
1929 : * then transfer data without any checks for enough space. We need to do
1930 : * it this way because enlarging attribute_buf mid-stream would invalidate
1931 : * pointers already stored into cstate->raw_fields[].
1932 : */
1933 257 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1934 0 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1935 257 : output_ptr = cstate->attribute_buf.data;
1936 :
1937 : /* set pointer variables for loop */
1938 257 : cur_ptr = cstate->line_buf.data;
1939 257 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1940 :
1941 : /* Outer loop iterates over fields */
1942 257 : fieldno = 0;
1943 : for (;;)
1944 267 : {
1945 524 : bool found_delim = false;
1946 524 : bool saw_quote = false;
1947 : char *start_ptr;
1948 : char *end_ptr;
1949 : int input_len;
1950 :
1951 : /* Make sure there is enough space for the next value */
1952 524 : if (fieldno >= cstate->max_fields)
1953 : {
1954 0 : cstate->max_fields *= 2;
1955 0 : cstate->raw_fields =
1956 0 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1957 : }
1958 :
1959 : /* Remember start of field on both input and output sides */
1960 524 : start_ptr = cur_ptr;
1961 524 : cstate->raw_fields[fieldno] = output_ptr;
1962 :
1963 : /*
1964 : * Scan data for field,
1965 : *
1966 : * The loop starts in "not quote" mode and then toggles between that
1967 : * and "in quote" mode. The loop exits normally if it is in "not
1968 : * quote" mode and a delimiter or line end is seen.
1969 : */
1970 : for (;;)
1971 114 : {
1972 : char c;
1973 :
1974 : /* Not in quote */
1975 : for (;;)
1976 : {
1977 1666 : end_ptr = cur_ptr;
1978 1666 : if (cur_ptr >= line_end_ptr)
1979 254 : goto endfield;
1980 1412 : c = *cur_ptr++;
1981 : /* unquoted field delimiter */
1982 1412 : if (c == delimc)
1983 : {
1984 270 : found_delim = true;
1985 270 : goto endfield;
1986 : }
1987 : /* start of quoted field (or part of field) */
1988 1142 : if (c == quotec)
1989 : {
1990 114 : saw_quote = true;
1991 114 : break;
1992 : }
1993 : /* Add c to output string */
1994 1028 : *output_ptr++ = c;
1995 : }
1996 :
1997 : /* In quote */
1998 : for (;;)
1999 : {
2000 710 : end_ptr = cur_ptr;
2001 710 : if (cur_ptr >= line_end_ptr)
2002 0 : ereport(ERROR,
2003 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2004 : errmsg("unterminated CSV quoted field")));
2005 :
2006 710 : c = *cur_ptr++;
2007 :
2008 : /* escape within a quoted field */
2009 710 : if (c == escapec)
2010 : {
2011 : /*
2012 : * peek at the next char if available, and escape it if it
2013 : * is an escape char or a quote char
2014 : */
2015 62 : if (cur_ptr < line_end_ptr)
2016 : {
2017 36 : char nextc = *cur_ptr;
2018 :
2019 36 : if (nextc == escapec || nextc == quotec)
2020 : {
2021 12 : *output_ptr++ = nextc;
2022 12 : cur_ptr++;
2023 12 : continue;
2024 : }
2025 : }
2026 : }
2027 :
2028 : /*
2029 : * end of quoted field. Must do this test after testing for
2030 : * escape in case quote char and escape char are the same
2031 : * (which is the common case).
2032 : */
2033 698 : if (c == quotec)
2034 114 : break;
2035 :
2036 : /* Add c to output string */
2037 584 : *output_ptr++ = c;
2038 : }
2039 : }
2040 524 : endfield:
2041 :
2042 : /* Terminate attribute value in output area */
2043 524 : *output_ptr++ = '\0';
2044 :
2045 : /* Check whether raw input matched null marker */
2046 524 : input_len = end_ptr - start_ptr;
2047 524 : if (!saw_quote && input_len == cstate->opts.null_print_len &&
2048 22 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
2049 22 : cstate->raw_fields[fieldno] = NULL;
2050 : /* Check whether raw input matched default marker */
2051 502 : else if (fieldno < list_length(cstate->attnumlist) &&
2052 502 : cstate->opts.default_print &&
2053 75 : input_len == cstate->opts.default_print_len &&
2054 21 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
2055 : {
2056 : /* fieldno is 0-index and attnum is 1-index */
2057 21 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
2058 :
2059 21 : if (cstate->defexprs[m] != NULL)
2060 : {
2061 : /* defaults contain entries for all physical attributes */
2062 18 : cstate->defaults[m] = true;
2063 : }
2064 : else
2065 : {
2066 3 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
2067 3 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
2068 :
2069 3 : ereport(ERROR,
2070 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2071 : errmsg("unexpected default marker in COPY data"),
2072 : errdetail("Column \"%s\" has no default value.",
2073 : NameStr(att->attname))));
2074 : }
2075 : }
2076 :
2077 521 : fieldno++;
2078 : /* Done if we hit EOL instead of a delim */
2079 521 : if (!found_delim)
2080 254 : break;
2081 : }
2082 :
2083 : /* Clean up state of attribute_buf */
2084 254 : output_ptr--;
2085 : Assert(*output_ptr == '\0');
2086 254 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2087 :
2088 254 : return fieldno;
2089 : }
2090 :
2091 :
2092 : /*
2093 : * Read a binary attribute
2094 : */
2095 : static Datum
2096 79 : CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
2097 : Oid typioparam, int32 typmod,
2098 : bool *isnull)
2099 : {
2100 : int32 fld_size;
2101 : Datum result;
2102 :
2103 79 : if (!CopyGetInt32(cstate, &fld_size))
2104 0 : ereport(ERROR,
2105 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2106 : errmsg("unexpected EOF in COPY data")));
2107 79 : if (fld_size == -1)
2108 : {
2109 15 : *isnull = true;
2110 15 : return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2111 : }
2112 64 : if (fld_size < 0)
2113 0 : ereport(ERROR,
2114 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2115 : errmsg("invalid field size")));
2116 :
2117 : /* reset attribute_buf to empty, and load raw data in it */
2118 64 : resetStringInfo(&cstate->attribute_buf);
2119 :
2120 64 : enlargeStringInfo(&cstate->attribute_buf, fld_size);
2121 64 : if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2122 64 : fld_size) != fld_size)
2123 0 : ereport(ERROR,
2124 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2125 : errmsg("unexpected EOF in COPY data")));
2126 :
2127 64 : cstate->attribute_buf.len = fld_size;
2128 64 : cstate->attribute_buf.data[fld_size] = '\0';
2129 :
2130 : /* Call the column type's binary input converter */
2131 64 : result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2132 : typioparam, typmod);
2133 :
2134 : /* Trouble if it didn't eat the whole buffer */
2135 64 : if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2136 1 : ereport(ERROR,
2137 : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
2138 : errmsg("incorrect binary data format")));
2139 :
2140 63 : *isnull = false;
2141 63 : return result;
2142 : }
|