Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * copyfromparse.c
4 : * Parse CSV/text/binary format for COPY FROM.
5 : *
6 : * This file contains routines to parse the text, CSV and binary input
7 : * formats. The main entry point is NextCopyFrom(), which parses the
8 : * next input line and returns it as Datums.
9 : *
10 : * In text/CSV mode, the parsing happens in multiple stages:
11 : *
12 : * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 : *                1.          2.            3.           4.
14 : *
15 : * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 : * places it into 'raw_buf'.
17 : *
18 : * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 : * the data in 'raw_buf' from client to server encoding, placing the
20 : * converted result in 'input_buf'.
21 : *
22 : * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 : * It is responsible for finding the next newline marker, taking quote and
24 : * escape characters into account according to the COPY options. The line
25 : * is copied into 'line_buf', with quotes and escape characters still
26 : * intact.
27 : *
28 : * 4. CopyReadAttributesText/CSV() function takes the input line from
29 : * 'line_buf', and splits it into fields, unescaping the data as required.
30 : * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 : * pointers to each field.
32 : *
33 : * If encoding conversion is not required, a shortcut is taken in step 2 to
34 : * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 : * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 : * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 : * the data is valid in the current encoding.
38 : *
39 : * In binary mode, the pipeline is much simpler. Input is loaded into
40 : * 'raw_buf', and encoding conversion is done in the datatype-specific
41 : * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 : * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
43 : * data when it's passed the receive function.
44 : *
45 : * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 : * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 : * and 'attribute_buf' are expanded on demand, to hold the longest line
48 : * encountered so far.
49 : *
50 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
51 : * Portions Copyright (c) 1994, Regents of the University of California
52 : *
53 : *
54 : * IDENTIFICATION
55 : * src/backend/commands/copyfromparse.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <ctype.h>
62 : #include <unistd.h>
63 : #include <sys/stat.h>
64 :
65 : #include "commands/copyapi.h"
66 : #include "commands/copyfrom_internal.h"
67 : #include "commands/progress.h"
68 : #include "executor/executor.h"
69 : #include "libpq/libpq.h"
70 : #include "libpq/pqformat.h"
71 : #include "mb/pg_wchar.h"
72 : #include "miscadmin.h"
73 : #include "pgstat.h"
74 : #include "port/pg_bswap.h"
75 : #include "utils/builtins.h"
76 : #include "utils/rel.h"
77 :
/*
 * Octal-digit helpers.
 *
 * ISOCTAL(c) is nonzero when 'c' is one of the characters '0'..'7'.  The
 * range test is folded into a single unsigned comparison so that the
 * argument is evaluated exactly once: any value below '0' wraps around to a
 * large unsigned number and is therefore rejected together with values
 * above '7'.  This makes the macro safe to use with arguments that have side
 * effects.  The argument is expected to be of type char or int.
 *
 * OCTVALUE(c) yields the numeric value of an octal digit character; it is
 * only meaningful when ISOCTAL(c) holds.
 */
#define ISOCTAL(c) ((unsigned int) ((c) - '0') <= 7U)
#define OCTVALUE(c) ((c) - '0')
80 :
/*
 * Helper macros shared by the code that walks input_buf and feeds line_buf.
 *
 * They are macros rather than functions because they issue "continue" and
 * "break" on behalf of the loop they are expanded in, because they operate
 * on variables of the expansion site (input_buf_ptr, copy_buf_len, hit_eof,
 * prev_raw_ptr, need_data, result, cstate), and to avoid function call
 * overhead in tight COPY loops.
 *
 * The usual "do {...} while(0)" wrapper cannot be used here, since it would
 * capture the continue/break.  Instead, each macro expands to one "if"
 * statement that is closed by "else ((void) 0)".  The trailing "else" makes
 * sure the "if" can never pair up with an "else" in the calling code, and
 * avoids compiler warnings about empty statements.
 * See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
 */

/*
 * If fewer than 'extralen' + 1 bytes remain in the buffer and EOF has not
 * been hit, rewind to the character fetched at the top of the loop, request
 * more data and restart the loop.  Rewinding keeps that character in the
 * buffer even if there is more than one read-ahead.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
{ \
	input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
	need_data = true; \
	continue; \
} else ((void) 0)

/*
 * Same buffer test, but for the case where EOF has been hit: consume the
 * remainder of the buffer (when 'extralen' is nonzero), set result to true
 * and leave the loop.
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
{ \
	if (extralen) \
		input_buf_ptr = copy_buf_len; /* consume the partial character */ \
	/* backslash just before EOF, treat as data char */ \
	result = true; \
	break; \
} else ((void) 0)

/*
 * Append the already-approved part of input_buf (everything between
 * cstate->input_buf_index and input_buf_ptr) to line_buf and advance
 * input_buf_index past it; must do this to be sure there is some room in
 * input_buf.
 */
#define REFILL_LINEBUF \
if (input_buf_ptr > cstate->input_buf_index) \
{ \
	appendBinaryStringInfo(&cstate->line_buf, \
						   cstate->input_buf + cstate->input_buf_index, \
						   input_buf_ptr - cstate->input_buf_index); \
	cstate->input_buf_index = input_buf_ptr; \
} else ((void) 0)
137 :
138 : /* NOTE: there's a copy of this in copyto.c */
139 : static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
140 :
141 :
142 : /* non-export function prototypes */
143 : static bool CopyReadLine(CopyFromState cstate, bool is_csv);
144 : static bool CopyReadLineText(CopyFromState cstate, bool is_csv);
145 : static int CopyReadAttributesText(CopyFromState cstate);
146 : static int CopyReadAttributesCSV(CopyFromState cstate);
147 : static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
148 : Oid typioparam, int32 typmod,
149 : bool *isnull);
150 : static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate,
151 : ExprContext *econtext,
152 : Datum *values,
153 : bool *nulls,
154 : bool is_csv);
155 : static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate,
156 : char ***fields,
157 : int *nfields,
158 : bool is_csv);
159 :
160 :
161 : /* Low-level communications functions */
162 : static int CopyGetData(CopyFromState cstate, void *databuf,
163 : int minread, int maxread);
164 : static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
165 : static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
166 : static void CopyLoadInputBuf(CopyFromState cstate);
167 : static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
168 :
169 : void
170 980 : ReceiveCopyBegin(CopyFromState cstate)
171 : {
172 : StringInfoData buf;
173 980 : int natts = list_length(cstate->attnumlist);
174 980 : int16 format = (cstate->opts.binary ? 1 : 0);
175 : int i;
176 :
177 980 : pq_beginmessage(&buf, PqMsg_CopyInResponse);
178 980 : pq_sendbyte(&buf, format); /* overall format */
179 980 : pq_sendint16(&buf, natts);
180 3282 : for (i = 0; i < natts; i++)
181 2302 : pq_sendint16(&buf, format); /* per-column formats */
182 980 : pq_endmessage(&buf);
183 980 : cstate->copy_src = COPY_FRONTEND;
184 980 : cstate->fe_msgbuf = makeStringInfo();
185 : /* We *must* flush here to ensure FE knows it can send. */
186 980 : pq_flush();
187 980 : }
188 :
189 : void
190 14 : ReceiveCopyBinaryHeader(CopyFromState cstate)
191 : {
192 : char readSig[11];
193 : int32 tmp;
194 :
195 : /* Signature */
196 14 : if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
197 14 : memcmp(readSig, BinarySignature, 11) != 0)
198 0 : ereport(ERROR,
199 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
200 : errmsg("COPY file signature not recognized")));
201 : /* Flags field */
202 14 : if (!CopyGetInt32(cstate, &tmp))
203 0 : ereport(ERROR,
204 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
205 : errmsg("invalid COPY file header (missing flags)")));
206 14 : if ((tmp & (1 << 16)) != 0)
207 0 : ereport(ERROR,
208 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
209 : errmsg("invalid COPY file header (WITH OIDS)")));
210 14 : tmp &= ~(1 << 16);
211 14 : if ((tmp >> 16) != 0)
212 0 : ereport(ERROR,
213 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
214 : errmsg("unrecognized critical flags in COPY file header")));
215 : /* Header extension length */
216 14 : if (!CopyGetInt32(cstate, &tmp) ||
217 14 : tmp < 0)
218 0 : ereport(ERROR,
219 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
220 : errmsg("invalid COPY file header (missing length)")));
221 : /* Skip extension header, if present */
222 14 : while (tmp-- > 0)
223 : {
224 0 : if (CopyReadBinaryData(cstate, readSig, 1) != 1)
225 0 : ereport(ERROR,
226 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
227 : errmsg("invalid COPY file header (wrong length)")));
228 : }
229 14 : }
230 :
231 : /*
232 : * CopyGetData reads data from the source (file or frontend)
233 : *
234 : * We attempt to read at least minread, and at most maxread, bytes from
235 : * the source. The actual number of bytes read is returned; if this is
236 : * less than minread, EOF was detected.
237 : *
238 : * Note: when copying from the frontend, we expect a proper EOF mark per
239 : * protocol; if the frontend simply drops the connection, we raise error.
240 : * It seems unwise to allow the COPY IN to complete normally in that case.
241 : *
242 : * NB: no data conversion is applied here.
243 : */
244 : static int
245 431730 : CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
246 : {
247 431730 : int bytesread = 0;
248 :
249 431730 : switch (cstate->copy_src)
250 : {
251 1068 : case COPY_FILE:
252 1068 : bytesread = fread(databuf, 1, maxread, cstate->copy_file);
253 1068 : if (ferror(cstate->copy_file))
254 0 : ereport(ERROR,
255 : (errcode_for_file_access(),
256 : errmsg("could not read from COPY file: %m")));
257 1068 : if (bytesread == 0)
258 416 : cstate->raw_reached_eof = true;
259 1068 : break;
260 402662 : case COPY_FRONTEND:
261 803798 : while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
262 : {
263 : int avail;
264 :
265 401914 : while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
266 : {
267 : /* Try to receive another message */
268 : int mtype;
269 : int maxmsglen;
270 :
271 401914 : readmessage:
272 401950 : HOLD_CANCEL_INTERRUPTS();
273 401950 : pq_startmsgread();
274 401950 : mtype = pq_getbyte();
275 401950 : if (mtype == EOF)
276 0 : ereport(ERROR,
277 : (errcode(ERRCODE_CONNECTION_FAILURE),
278 : errmsg("unexpected EOF on client connection with an open transaction")));
279 : /* Validate message type and set packet size limit */
280 : switch (mtype)
281 : {
282 401136 : case PqMsg_CopyData:
283 401136 : maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
284 401136 : break;
285 814 : case PqMsg_CopyDone:
286 : case PqMsg_CopyFail:
287 : case PqMsg_Flush:
288 : case PqMsg_Sync:
289 814 : maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
290 814 : break;
291 0 : default:
292 0 : ereport(ERROR,
293 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
294 : errmsg("unexpected message type 0x%02X during COPY from stdin",
295 : mtype)));
296 : maxmsglen = 0; /* keep compiler quiet */
297 : break;
298 : }
299 : /* Now collect the message body */
300 401950 : if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
301 0 : ereport(ERROR,
302 : (errcode(ERRCODE_CONNECTION_FAILURE),
303 : errmsg("unexpected EOF on client connection with an open transaction")));
304 401950 : RESUME_CANCEL_INTERRUPTS();
305 : /* ... and process it */
306 : switch (mtype)
307 : {
308 401136 : case PqMsg_CopyData:
309 401136 : break;
310 778 : case PqMsg_CopyDone:
311 : /* COPY IN correctly terminated by frontend */
312 778 : cstate->raw_reached_eof = true;
313 778 : return bytesread;
314 0 : case PqMsg_CopyFail:
315 0 : ereport(ERROR,
316 : (errcode(ERRCODE_QUERY_CANCELED),
317 : errmsg("COPY from stdin failed: %s",
318 : pq_getmsgstring(cstate->fe_msgbuf))));
319 : break;
320 36 : case PqMsg_Flush:
321 : case PqMsg_Sync:
322 :
323 : /*
324 : * Ignore Flush/Sync for the convenience of client
325 : * libraries (such as libpq) that may send those
326 : * without noticing that the command they just
327 : * sent was COPY.
328 : */
329 36 : goto readmessage;
330 803050 : default:
331 : Assert(false); /* NOT REACHED */
332 : }
333 : }
334 401136 : avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
335 401136 : if (avail > maxread)
336 0 : avail = maxread;
337 401136 : pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
338 401136 : databuf = (void *) ((char *) databuf + avail);
339 401136 : maxread -= avail;
340 401136 : bytesread += avail;
341 : }
342 401884 : break;
343 28000 : case COPY_CALLBACK:
344 28000 : bytesread = cstate->data_source_cb(databuf, minread, maxread);
345 28000 : break;
346 : }
347 :
348 430952 : return bytesread;
349 : }
350 :
351 :
352 : /*
353 : * These functions do apply some data conversion
354 : */
355 :
356 : /*
357 : * CopyGetInt32 reads an int32 that appears in network byte order
358 : *
359 : * Returns true if OK, false if EOF
360 : */
361 : static inline bool
362 186 : CopyGetInt32(CopyFromState cstate, int32 *val)
363 : {
364 : uint32 buf;
365 :
366 186 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
367 : {
368 0 : *val = 0; /* suppress compiler warning */
369 0 : return false;
370 : }
371 186 : *val = (int32) pg_ntoh32(buf);
372 186 : return true;
373 : }
374 :
375 : /*
376 : * CopyGetInt16 reads an int16 that appears in network byte order
377 : */
378 : static inline bool
379 42 : CopyGetInt16(CopyFromState cstate, int16 *val)
380 : {
381 : uint16 buf;
382 :
383 42 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
384 : {
385 0 : *val = 0; /* suppress compiler warning */
386 0 : return false;
387 : }
388 42 : *val = (int16) pg_ntoh16(buf);
389 42 : return true;
390 : }
391 :
392 :
393 : /*
394 : * Perform encoding conversion on data in 'raw_buf', writing the converted
395 : * data into 'input_buf'.
396 : *
397 : * On entry, there must be some data to convert in 'raw_buf'.
398 : */
399 : static void
400 861820 : CopyConvertBuf(CopyFromState cstate)
401 : {
402 : /*
403 : * If the file and server encoding are the same, no encoding conversion is
404 : * required. However, we still need to verify that the input is valid for
405 : * the encoding.
406 : */
407 861820 : if (!cstate->need_transcoding)
408 : {
409 : /*
410 : * When conversion is not required, input_buf and raw_buf are the
411 : * same. raw_buf_len is the total number of bytes in the buffer, and
412 : * input_buf_len tracks how many of those bytes have already been
413 : * verified.
414 : */
415 861736 : int preverifiedlen = cstate->input_buf_len;
416 861736 : int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
417 : int nverified;
418 :
419 861736 : if (unverifiedlen == 0)
420 : {
421 : /*
422 : * If no more raw data is coming, report the EOF to the caller.
423 : */
424 432370 : if (cstate->raw_reached_eof)
425 1502 : cstate->input_reached_eof = true;
426 432370 : return;
427 : }
428 :
429 : /*
430 : * Verify the new data, including any residual unverified bytes from
431 : * previous round.
432 : */
433 429366 : nverified = pg_encoding_verifymbstr(cstate->file_encoding,
434 429366 : cstate->raw_buf + preverifiedlen,
435 : unverifiedlen);
436 429366 : if (nverified == 0)
437 : {
438 : /*
439 : * Could not verify anything.
440 : *
441 : * If there is no more raw input data coming, it means that there
442 : * was an incomplete multi-byte sequence at the end. Also, if
443 : * there's "enough" input left, we should be able to verify at
444 : * least one character, and a failure to do so means that we've
445 : * hit an invalid byte sequence.
446 : */
447 0 : if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
448 0 : cstate->input_reached_error = true;
449 0 : return;
450 : }
451 429366 : cstate->input_buf_len += nverified;
452 : }
453 : else
454 : {
455 : /*
456 : * Encoding conversion is needed.
457 : */
458 : int nbytes;
459 : unsigned char *src;
460 : int srclen;
461 : unsigned char *dst;
462 : int dstlen;
463 : int convertedlen;
464 :
465 84 : if (RAW_BUF_BYTES(cstate) == 0)
466 : {
467 : /*
468 : * If no more raw data is coming, report the EOF to the caller.
469 : */
470 48 : if (cstate->raw_reached_eof)
471 12 : cstate->input_reached_eof = true;
472 48 : return;
473 : }
474 :
475 : /*
476 : * First, copy down any unprocessed data.
477 : */
478 36 : nbytes = INPUT_BUF_BYTES(cstate);
479 36 : if (nbytes > 0 && cstate->input_buf_index > 0)
480 0 : memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
481 : nbytes);
482 36 : cstate->input_buf_index = 0;
483 36 : cstate->input_buf_len = nbytes;
484 36 : cstate->input_buf[nbytes] = '\0';
485 :
486 36 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
487 36 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
488 36 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
489 36 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
490 :
491 : /*
492 : * Do the conversion. This might stop short, if there is an invalid
493 : * byte sequence in the input. We'll convert as much as we can in
494 : * that case.
495 : *
496 : * Note: Even if we hit an invalid byte sequence, we don't report the
497 : * error until all the valid bytes have been consumed. The input
498 : * might contain an end-of-input marker (\.), and we don't want to
499 : * report an error if the invalid byte sequence is after the
500 : * end-of-input marker. We might unnecessarily convert some data
501 : * after the end-of-input marker as long as it's valid for the
502 : * encoding, but that's harmless.
503 : */
504 36 : convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
505 : cstate->file_encoding,
506 : GetDatabaseEncoding(),
507 : src, srclen,
508 : dst, dstlen,
509 : true);
510 36 : if (convertedlen == 0)
511 : {
512 : /*
513 : * Could not convert anything. If there is no more raw input data
514 : * coming, it means that there was an incomplete multi-byte
515 : * sequence at the end. Also, if there is plenty of input left,
516 : * we should be able to convert at least one character, so a
517 : * failure to do so must mean that we've hit a byte sequence
518 : * that's invalid.
519 : */
520 24 : if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
521 12 : cstate->input_reached_error = true;
522 24 : return;
523 : }
524 12 : cstate->raw_buf_index += convertedlen;
525 12 : cstate->input_buf_len += strlen((char *) dst);
526 : }
527 : }
528 :
529 : /*
530 : * Report an encoding or conversion error.
531 : */
532 : static void
533 12 : CopyConversionError(CopyFromState cstate)
534 : {
535 : Assert(cstate->raw_buf_len > 0);
536 : Assert(cstate->input_reached_error);
537 :
538 12 : if (!cstate->need_transcoding)
539 : {
540 : /*
541 : * Everything up to input_buf_len was successfully verified, and
542 : * input_buf_len points to the invalid or incomplete character.
543 : */
544 0 : report_invalid_encoding(cstate->file_encoding,
545 0 : cstate->raw_buf + cstate->input_buf_len,
546 0 : cstate->raw_buf_len - cstate->input_buf_len);
547 : }
548 : else
549 : {
550 : /*
551 : * raw_buf_index points to the invalid or untranslatable character. We
552 : * let the conversion routine report the error, because it can provide
553 : * a more specific error message than we could here. An earlier call
554 : * to the conversion routine in CopyConvertBuf() detected that there
555 : * is an error, now we call the conversion routine again with
556 : * noError=false, to have it throw the error.
557 : */
558 : unsigned char *src;
559 : int srclen;
560 : unsigned char *dst;
561 : int dstlen;
562 :
563 12 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
564 12 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
565 12 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
566 12 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
567 :
568 12 : (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
569 : cstate->file_encoding,
570 : GetDatabaseEncoding(),
571 : src, srclen,
572 : dst, dstlen,
573 : false);
574 :
575 : /*
576 : * The conversion routine should have reported an error, so this
577 : * should not be reached.
578 : */
579 0 : elog(ERROR, "encoding conversion failed without error");
580 : }
581 : }
582 :
583 : /*
584 : * Load more data from data source to raw_buf.
585 : *
586 : * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
587 : * beginning of the buffer, and we load new data after that.
588 : */
589 : static void
590 430952 : CopyLoadRawBuf(CopyFromState cstate)
591 : {
592 : int nbytes;
593 : int inbytes;
594 :
595 : /*
596 : * In text mode, if encoding conversion is not required, raw_buf and
597 : * input_buf point to the same buffer. Their len/index better agree, too.
598 : */
599 430952 : if (cstate->raw_buf == cstate->input_buf)
600 : {
601 : Assert(!cstate->need_transcoding);
602 : Assert(cstate->raw_buf_index == cstate->input_buf_index);
603 : Assert(cstate->input_buf_len <= cstate->raw_buf_len);
604 : }
605 :
606 : /*
607 : * Copy down the unprocessed data if any.
608 : */
609 430952 : nbytes = RAW_BUF_BYTES(cstate);
610 430952 : if (nbytes > 0 && cstate->raw_buf_index > 0)
611 0 : memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
612 : nbytes);
613 430952 : cstate->raw_buf_len -= cstate->raw_buf_index;
614 430952 : cstate->raw_buf_index = 0;
615 :
616 : /*
617 : * If raw_buf and input_buf are in fact the same buffer, adjust the
618 : * input_buf variables, too.
619 : */
620 430952 : if (cstate->raw_buf == cstate->input_buf)
621 : {
622 430868 : cstate->input_buf_len -= cstate->input_buf_index;
623 430868 : cstate->input_buf_index = 0;
624 : }
625 :
626 : /* Load more data */
627 430952 : inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
628 430952 : 1, RAW_BUF_SIZE - cstate->raw_buf_len);
629 430952 : nbytes += inbytes;
630 430952 : cstate->raw_buf[nbytes] = '\0';
631 430952 : cstate->raw_buf_len = nbytes;
632 :
633 430952 : cstate->bytes_processed += inbytes;
634 430952 : pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
635 :
636 430952 : if (inbytes == 0)
637 1538 : cstate->raw_reached_eof = true;
638 430952 : }
639 :
640 : /*
641 : * CopyLoadInputBuf loads some more data into input_buf
642 : *
643 : * On return, at least one more input character is loaded into
644 : * input_buf, or input_reached_eof is set.
645 : *
646 : * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
647 : * of the buffer and then we load more data after that.
648 : */
649 : static void
650 430904 : CopyLoadInputBuf(CopyFromState cstate)
651 : {
652 430904 : int nbytes = INPUT_BUF_BYTES(cstate);
653 :
654 : /*
655 : * The caller has updated input_buf_index to indicate how much of the
656 : * input has been consumed and isn't needed anymore. If input_buf is the
657 : * same physical area as raw_buf, update raw_buf_index accordingly.
658 : */
659 430904 : if (cstate->raw_buf == cstate->input_buf)
660 : {
661 : Assert(!cstate->need_transcoding);
662 : Assert(cstate->input_buf_index >= cstate->raw_buf_index);
663 430868 : cstate->raw_buf_index = cstate->input_buf_index;
664 : }
665 :
666 : for (;;)
667 : {
668 : /* If we now have some unconverted data, try to convert it */
669 861820 : CopyConvertBuf(cstate);
670 :
671 : /* If we now have some more input bytes ready, return them */
672 861820 : if (INPUT_BUF_BYTES(cstate) > nbytes)
673 429378 : return;
674 :
675 : /*
676 : * If we reached an invalid byte sequence, or we're at an incomplete
677 : * multi-byte character but there is no more raw input data, report
678 : * conversion error.
679 : */
680 432442 : if (cstate->input_reached_error)
681 12 : CopyConversionError(cstate);
682 :
683 : /* no more input, and everything has been converted */
684 432430 : if (cstate->input_reached_eof)
685 1514 : break;
686 :
687 : /* Try to load more raw data */
688 : Assert(!cstate->raw_reached_eof);
689 430916 : CopyLoadRawBuf(cstate);
690 : }
691 : }
692 :
693 : /*
694 : * CopyReadBinaryData
695 : *
696 : * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
697 : * and writes them to 'dest'. Returns the number of bytes read (which
698 : * would be less than 'nbytes' only if we reach EOF).
699 : */
700 : static int
701 382 : CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
702 : {
703 382 : int copied_bytes = 0;
704 :
705 382 : if (RAW_BUF_BYTES(cstate) >= nbytes)
706 : {
707 : /* Enough bytes are present in the buffer. */
708 346 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
709 346 : cstate->raw_buf_index += nbytes;
710 346 : copied_bytes = nbytes;
711 : }
712 : else
713 : {
714 : /*
715 : * Not enough bytes in the buffer, so must read from the file. Need
716 : * to loop since 'nbytes' could be larger than the buffer size.
717 : */
718 : do
719 : {
720 : int copy_bytes;
721 :
722 : /* Load more data if buffer is empty. */
723 36 : if (RAW_BUF_BYTES(cstate) == 0)
724 : {
725 36 : CopyLoadRawBuf(cstate);
726 36 : if (cstate->raw_reached_eof)
727 12 : break; /* EOF */
728 : }
729 :
730 : /* Transfer some bytes. */
731 24 : copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
732 24 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
733 24 : cstate->raw_buf_index += copy_bytes;
734 24 : dest += copy_bytes;
735 24 : copied_bytes += copy_bytes;
736 24 : } while (copied_bytes < nbytes);
737 : }
738 :
739 382 : return copied_bytes;
740 : }
741 :
742 : /*
743 : * This function is exposed for use by extensions that read raw fields in the
744 : * next line. See NextCopyFromRawFieldsInternal() for details.
745 : */
746 : bool
747 0 : NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
748 : {
749 0 : return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
750 0 : cstate->opts.csv_mode);
751 : }
752 :
753 : /*
754 : * Workhorse for NextCopyFromRawFields().
755 : *
756 : * Read raw fields in the next line for COPY FROM in text or csv mode. Return
757 : * false if no more lines.
758 : *
759 : * An internal temporary buffer is returned via 'fields'. It is valid until
760 : * the next call of the function. Since the function returns all raw fields
761 : * in the input file, 'nfields' could be different from the number of columns
762 : * in the relation.
763 : *
764 : * NOTE: force_not_null option are not applied to the returned fields.
765 : *
766 : * We use pg_attribute_always_inline to reduce function call overhead
767 : * and to help compilers to optimize away the 'is_csv' condition when called
768 : * by internal functions such as CopyFromTextLikeOneRow().
769 : */
770 : static pg_attribute_always_inline bool
771 1258142 : NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
772 : {
773 : int fldct;
774 : bool done;
775 :
776 : /* only available for text or csv input */
777 : Assert(!cstate->opts.binary);
778 :
779 : /* on input check that the header line is correct if needed */
780 1258142 : if (cstate->cur_lineno == 0 && cstate->opts.header_line)
781 : {
782 : ListCell *cur;
783 : TupleDesc tupDesc;
784 :
785 120 : tupDesc = RelationGetDescr(cstate->rel);
786 :
787 120 : cstate->cur_lineno++;
788 120 : done = CopyReadLine(cstate, is_csv);
789 :
790 120 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
791 : {
792 : int fldnum;
793 :
794 76 : if (is_csv)
795 10 : fldct = CopyReadAttributesCSV(cstate);
796 : else
797 66 : fldct = CopyReadAttributesText(cstate);
798 :
799 76 : if (fldct != list_length(cstate->attnumlist))
800 24 : ereport(ERROR,
801 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
802 : errmsg("wrong number of fields in header line: got %d, expected %d",
803 : fldct, list_length(cstate->attnumlist))));
804 :
805 52 : fldnum = 0;
806 158 : foreach(cur, cstate->attnumlist)
807 : {
808 126 : int attnum = lfirst_int(cur);
809 : char *colName;
810 126 : Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
811 :
812 : Assert(fldnum < cstate->max_fields);
813 :
814 126 : colName = cstate->raw_fields[fldnum++];
815 126 : if (colName == NULL)
816 6 : ereport(ERROR,
817 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
818 : errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
819 : fldnum, cstate->opts.null_print, NameStr(attr->attname))));
820 :
821 120 : if (namestrcmp(&attr->attname, colName) != 0)
822 : {
823 14 : ereport(ERROR,
824 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
825 : errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
826 : fldnum, colName, NameStr(attr->attname))));
827 : }
828 : }
829 : }
830 :
831 76 : if (done)
832 0 : return false;
833 : }
834 :
835 1258098 : cstate->cur_lineno++;
836 :
837 : /* Actually read the line into memory here */
838 1258098 : done = CopyReadLine(cstate, is_csv);
839 :
840 : /*
841 : * EOF at start of line means we're done. If we see EOF after some
842 : * characters, we act as though it was newline followed by EOF, ie,
843 : * process the line and then exit loop on next iteration.
844 : */
845 1258074 : if (done && cstate->line_buf.len == 0)
846 1544 : return false;
847 :
848 : /* Parse the line into de-escaped field values */
849 1256530 : if (is_csv)
850 464 : fldct = CopyReadAttributesCSV(cstate);
851 : else
852 1256066 : fldct = CopyReadAttributesText(cstate);
853 :
854 1256518 : *fields = cstate->raw_fields;
855 1256518 : *nfields = fldct;
856 1256518 : return true;
857 : }
858 :
859 : /*
860 : * Read next tuple from file for COPY FROM. Return false if no more tuples.
861 : *
862 : * 'econtext' is used to evaluate default expression for each column that is
863 : * either not read from the file or is using the DEFAULT option of COPY FROM.
864 : * It can be NULL when no default values are used, i.e. when all columns are
865 : * read from the file, and DEFAULT option is unset.
866 : *
867 : * 'values' and 'nulls' arrays must be the same length as columns of the
868 : * relation passed to BeginCopyFrom. This function fills the arrays.
869 : */
870 : bool
871 1258184 : NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
872 : Datum *values, bool *nulls)
873 : {
874 : TupleDesc tupDesc;
875 : AttrNumber num_phys_attrs,
876 1258184 : num_defaults = cstate->num_defaults;
877 : int i;
878 1258184 : int *defmap = cstate->defmap;
879 1258184 : ExprState **defexprs = cstate->defexprs;
880 :
881 1258184 : tupDesc = RelationGetDescr(cstate->rel);
882 1258184 : num_phys_attrs = tupDesc->natts;
883 :
884 : /* Initialize all values for row to NULL */
885 5863626 : MemSet(values, 0, num_phys_attrs * sizeof(Datum));
886 1258184 : MemSet(nulls, true, num_phys_attrs * sizeof(bool));
887 1402292 : MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
888 :
889 : /* Get one row from source */
890 1258184 : if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
891 1556 : return false;
892 :
893 : /*
894 : * Now compute and insert any defaults available for the columns not
895 : * provided by the input data. Anything not processed here or above will
896 : * remain NULL.
897 : */
898 1317002 : for (i = 0; i < num_defaults; i++)
899 : {
900 : /*
901 : * The caller must supply econtext and have switched into the
902 : * per-tuple memory context in it.
903 : */
904 : Assert(econtext != NULL);
905 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
906 :
907 60530 : values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
908 60530 : &nulls[defmap[i]]);
909 : }
910 :
911 1256472 : return true;
912 : }
913 :
914 : /* Implementation of the per-row callback for text format */
915 : bool
916 1257468 : CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
917 : bool *nulls)
918 : {
919 1257468 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
920 : }
921 :
922 : /* Implementation of the per-row callback for CSV format */
923 : bool
924 674 : CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
925 : bool *nulls)
926 : {
927 674 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
928 : }
929 :
930 : /*
931 : * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
932 : *
933 : * We use pg_attribute_always_inline to reduce function call overhead
934 : * and to help compilers to optimize away the 'is_csv' condition.
935 : */
936 : static pg_attribute_always_inline bool
937 1258142 : CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext,
938 : Datum *values, bool *nulls, bool is_csv)
939 : {
940 : TupleDesc tupDesc;
941 : AttrNumber attr_count;
942 1258142 : FmgrInfo *in_functions = cstate->in_functions;
943 1258142 : Oid *typioparams = cstate->typioparams;
944 1258142 : ExprState **defexprs = cstate->defexprs;
945 : char **field_strings;
946 : ListCell *cur;
947 : int fldct;
948 : int fieldno;
949 : char *string;
950 :
951 1258142 : tupDesc = RelationGetDescr(cstate->rel);
952 1258142 : attr_count = list_length(cstate->attnumlist);
953 :
954 : /* read raw fields in the next line */
955 1258142 : if (!NextCopyFromRawFieldsInternal(cstate, &field_strings, &fldct, is_csv))
956 1544 : return false;
957 :
958 : /* check for overflowing fields */
959 1256518 : if (attr_count > 0 && fldct > attr_count)
960 18 : ereport(ERROR,
961 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
962 : errmsg("extra data after last expected column")));
963 :
964 1256500 : fieldno = 0;
965 :
966 : /* Loop to read the user attributes on the line. */
967 5730594 : foreach(cur, cstate->attnumlist)
968 : {
969 4474278 : int attnum = lfirst_int(cur);
970 4474278 : int m = attnum - 1;
971 4474278 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
972 :
973 4474278 : if (fieldno >= fldct)
974 18 : ereport(ERROR,
975 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
976 : errmsg("missing data for column \"%s\"",
977 : NameStr(att->attname))));
978 4474260 : string = field_strings[fieldno++];
979 :
980 4474260 : if (cstate->convert_select_flags &&
981 20 : !cstate->convert_select_flags[m])
982 : {
983 : /* ignore input field, leaving column as NULL */
984 10 : continue;
985 : }
986 :
987 4474250 : if (is_csv)
988 : {
989 962 : if (string == NULL &&
990 44 : cstate->opts.force_notnull_flags[m])
991 : {
992 : /*
993 : * FORCE_NOT_NULL option is set and column is NULL - convert
994 : * it to the NULL string.
995 : */
996 28 : string = cstate->opts.null_print;
997 : }
998 934 : else if (string != NULL && cstate->opts.force_null_flags[m]
999 50 : && strcmp(string, cstate->opts.null_print) == 0)
1000 : {
1001 : /*
1002 : * FORCE_NULL option is set and column matches the NULL
1003 : * string. It must have been quoted, or otherwise the string
1004 : * would already have been set to NULL. Convert it to NULL as
1005 : * specified.
1006 : */
1007 26 : string = NULL;
1008 : }
1009 : }
1010 :
1011 4474250 : cstate->cur_attname = NameStr(att->attname);
1012 4474250 : cstate->cur_attval = string;
1013 :
1014 4474250 : if (string != NULL)
1015 4469406 : nulls[m] = false;
1016 :
1017 4474250 : if (cstate->defaults[m])
1018 : {
1019 : /* We must have switched into the per-tuple memory context */
1020 : Assert(econtext != NULL);
1021 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
1022 :
1023 60 : values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
1024 : }
1025 :
1026 : /*
1027 : * If ON_ERROR is specified with IGNORE, skip rows with soft errors
1028 : */
1029 4474152 : else if (!InputFunctionCallSafe(&in_functions[m],
1030 : string,
1031 4474190 : typioparams[m],
1032 : att->atttypmod,
1033 4474190 : (Node *) cstate->escontext,
1034 4474190 : &values[m]))
1035 : {
1036 : Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP);
1037 :
1038 128 : cstate->num_errors++;
1039 :
1040 128 : if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE)
1041 : {
1042 : /*
1043 : * Since we emit line number and column info in the below
1044 : * notice message, we suppress error context information other
1045 : * than the relation name.
1046 : */
1047 : Assert(!cstate->relname_only);
1048 42 : cstate->relname_only = true;
1049 :
1050 42 : if (cstate->cur_attval)
1051 : {
1052 : char *attval;
1053 :
1054 36 : attval = CopyLimitPrintoutLength(cstate->cur_attval);
1055 36 : ereport(NOTICE,
1056 : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1057 : cstate->cur_lineno,
1058 : cstate->cur_attname,
1059 : attval));
1060 36 : pfree(attval);
1061 : }
1062 : else
1063 6 : ereport(NOTICE,
1064 : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
1065 : cstate->cur_lineno,
1066 : cstate->cur_attname));
1067 :
1068 : /* reset relname_only */
1069 42 : cstate->relname_only = false;
1070 : }
1071 :
1072 128 : return true;
1073 : }
1074 :
1075 4474084 : cstate->cur_attname = NULL;
1076 4474084 : cstate->cur_attval = NULL;
1077 : }
1078 :
1079 : Assert(fieldno == attr_count);
1080 :
1081 1256316 : return true;
1082 : }
1083 :
1084 : /* Implementation of the per-row callback for binary format */
1085 : bool
1086 42 : CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
1087 : bool *nulls)
1088 : {
1089 : TupleDesc tupDesc;
1090 : AttrNumber attr_count;
1091 42 : FmgrInfo *in_functions = cstate->in_functions;
1092 42 : Oid *typioparams = cstate->typioparams;
1093 : int16 fld_count;
1094 : ListCell *cur;
1095 :
1096 42 : tupDesc = RelationGetDescr(cstate->rel);
1097 42 : attr_count = list_length(cstate->attnumlist);
1098 :
1099 42 : cstate->cur_lineno++;
1100 :
1101 42 : if (!CopyGetInt16(cstate, &fld_count))
1102 : {
1103 : /* EOF detected (end of file, or protocol-level EOF) */
1104 0 : return false;
1105 : }
1106 :
1107 42 : if (fld_count == -1)
1108 : {
1109 : /*
1110 : * Received EOF marker. Wait for the protocol-level EOF, and complain
1111 : * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1112 : * that we correctly handle CopyFail, if client chooses to send that
1113 : * now. When copying from file, we could ignore the rest of the file
1114 : * like in text mode, but we choose to be consistent with the COPY
1115 : * FROM STDIN case.
1116 : */
1117 : char dummy;
1118 :
1119 12 : if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
1120 0 : ereport(ERROR,
1121 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1122 : errmsg("received copy data after EOF marker")));
1123 12 : return false;
1124 : }
1125 :
1126 30 : if (fld_count != attr_count)
1127 0 : ereport(ERROR,
1128 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1129 : errmsg("row field count is %d, expected %d",
1130 : (int) fld_count, attr_count)));
1131 :
1132 186 : foreach(cur, cstate->attnumlist)
1133 : {
1134 158 : int attnum = lfirst_int(cur);
1135 158 : int m = attnum - 1;
1136 158 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1137 :
1138 158 : cstate->cur_attname = NameStr(att->attname);
1139 314 : values[m] = CopyReadBinaryAttribute(cstate,
1140 158 : &in_functions[m],
1141 158 : typioparams[m],
1142 : att->atttypmod,
1143 : &nulls[m]);
1144 156 : cstate->cur_attname = NULL;
1145 : }
1146 :
1147 28 : return true;
1148 : }
1149 :
1150 : /*
1151 : * Read the next input line and stash it in line_buf.
1152 : *
1153 : * Result is true if read was terminated by EOF, false if terminated
1154 : * by newline. The terminating newline or EOF marker is not included
1155 : * in the final value of line_buf.
1156 : */
1157 : static bool
1158 1258218 : CopyReadLine(CopyFromState cstate, bool is_csv)
1159 : {
1160 : bool result;
1161 :
1162 1258218 : resetStringInfo(&cstate->line_buf);
1163 1258218 : cstate->line_buf_valid = false;
1164 :
1165 : /* Parse data and transfer into line_buf */
1166 1258218 : result = CopyReadLineText(cstate, is_csv);
1167 :
1168 1258194 : if (result)
1169 : {
1170 : /*
1171 : * Reached EOF. In protocol version 3, we should ignore anything
1172 : * after \. up to the protocol end of copy data. (XXX maybe better
1173 : * not to treat \. as special?)
1174 : */
1175 1544 : if (cstate->copy_src == COPY_FRONTEND)
1176 : {
1177 : int inbytes;
1178 :
1179 : do
1180 : {
1181 778 : inbytes = CopyGetData(cstate, cstate->input_buf,
1182 : 1, INPUT_BUF_SIZE);
1183 778 : } while (inbytes > 0);
1184 778 : cstate->input_buf_index = 0;
1185 778 : cstate->input_buf_len = 0;
1186 778 : cstate->raw_buf_index = 0;
1187 778 : cstate->raw_buf_len = 0;
1188 : }
1189 : }
1190 : else
1191 : {
1192 : /*
1193 : * If we didn't hit EOF, then we must have transferred the EOL marker
1194 : * to line_buf along with the data. Get rid of it.
1195 : */
1196 1256650 : switch (cstate->eol_type)
1197 : {
1198 1256650 : case EOL_NL:
1199 : Assert(cstate->line_buf.len >= 1);
1200 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1201 1256650 : cstate->line_buf.len--;
1202 1256650 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1203 1256650 : break;
1204 0 : case EOL_CR:
1205 : Assert(cstate->line_buf.len >= 1);
1206 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1207 0 : cstate->line_buf.len--;
1208 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1209 0 : break;
1210 0 : case EOL_CRNL:
1211 : Assert(cstate->line_buf.len >= 2);
1212 : Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1213 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1214 0 : cstate->line_buf.len -= 2;
1215 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1216 0 : break;
1217 0 : case EOL_UNKNOWN:
1218 : /* shouldn't get here */
1219 : Assert(false);
1220 0 : break;
1221 : }
1222 1258194 : }
1223 :
1224 : /* Now it's safe to use the buffer in error messages */
1225 1258194 : cstate->line_buf_valid = true;
1226 :
1227 1258194 : return result;
1228 : }
1229 :
1230 : /*
1231 : * CopyReadLineText - inner loop of CopyReadLine for text mode
 *
 * Despite the name, this is used for both text and CSV input; 'is_csv'
 * selects the quote-aware behaviour.
 *
 * Scans input_buf from cstate->input_buf_index for the end of the current
 * line, reloading the buffer through CopyLoadInputBuf() as needed.
 *
 * Returns true when the loop ends because no input bytes remain or because
 * an end-of-copy marker was accepted; returns false when it ends at a line
 * terminator.
 *
 * NOTE(review): REFILL_LINEBUF and the IF_NEED_REFILL_* macros are defined
 * outside this function.  They appear to use this function's local
 * variables by name and to 'continue'/'break' the main loop, and may also
 * set 'result' -- confirm before renaming locals or restructuring the loop.
1232 : */
1233 : static bool
1234 1258218 : CopyReadLineText(CopyFromState cstate, bool is_csv)
1235 : {
1236 : char *copy_input_buf;
1237 : int input_buf_ptr;
1238 : int copy_buf_len;
/* need_data: when true, the loop top reloads input even if bytes remain */
1239 1258218 : bool need_data = false;
/* hit_eof: copy of cstate->input_reached_eof taken after the latest load */
1240 1258218 : bool hit_eof = false;
/* result: the return value; true means EOF rather than a line terminator */
1241 1258218 : bool result = false;
1242 :
1243 : /* CSV variables */
1244 1258218 : bool in_quote = false,
1245 1258218 : last_was_esc = false;
1246 1258218 : char quotec = '\0';
1247 1258218 : char escapec = '\0';
1248 :
1249 1258218 : if (is_csv)
1250 : {
1251 720 : quotec = cstate->opts.quote[0];
1252 720 : escapec = cstate->opts.escape[0];
1253 : /* ignore special escape processing if it's the same as quotec */
1254 720 : if (quotec == escapec)
1255 526 : escapec = '\0';
1256 : }
1257 :
1258 : /*
1259 : * The objective of this loop is to transfer the entire next input line
1260 : * into line_buf. Hence, we only care for detecting newlines (\r and/or
1261 : * \n) and the end-of-copy marker (\.).
1262 : *
1263 : * In CSV mode, \r and \n inside a quoted field are just part of the data
1264 : * value and are put in line_buf. We keep just enough state to know if we
1265 : * are currently in a quoted field or not.
1266 : *
1267 : * The input has already been converted to the database encoding. All
1268 : * supported server encodings have the property that all bytes in a
1269 : * multi-byte sequence have the high bit set, so a multibyte character
1270 : * cannot contain any newline or escape characters embedded in the
1271 : * multibyte sequence. Therefore, we can process the input byte-by-byte,
1272 : * regardless of the encoding.
1273 : *
1274 : * For speed, we try to move data from input_buf to line_buf in chunks
1275 : * rather than one character at a time. input_buf_ptr points to the next
1276 : * character to examine; any characters from input_buf_index to
1277 : * input_buf_ptr have been determined to be part of the line, but not yet
1278 : * transferred to line_buf.
1279 : *
1280 : * For a little extra speed within the loop, we copy input_buf and
1281 : * input_buf_len into local variables.
1282 : */
1283 1258218 : copy_input_buf = cstate->input_buf;
1284 1258218 : input_buf_ptr = cstate->input_buf_index;
1285 1258218 : copy_buf_len = cstate->input_buf_len;
1286 :
1287 : for (;;)
1288 25107846 : {
/* prev_raw_ptr: buffer position of the character fetched in this iteration */
1289 : int prev_raw_ptr;
1290 : char c;
1291 :
1292 : /*
1293 : * Load more data if needed.
1294 : *
1295 : * TODO: We could just force four bytes of read-ahead and avoid the
1296 : * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(). That was
1297 : * unsafe with the old v2 COPY protocol, but we don't support that
1298 : * anymore.
1299 : */
1300 26366064 : if (input_buf_ptr >= copy_buf_len || need_data)
1301 : {
1302 430904 : REFILL_LINEBUF;
1303 :
1304 430904 : CopyLoadInputBuf(cstate);
1305 : /* update our local variables */
1306 430892 : hit_eof = cstate->input_reached_eof;
1307 430892 : input_buf_ptr = cstate->input_buf_index;
1308 430892 : copy_buf_len = cstate->input_buf_len;
1309 :
1310 : /*
1311 : * If we are completely out of data, break out of the loop,
1312 : * reporting EOF.
1313 : */
1314 430892 : if (INPUT_BUF_BYTES(cstate) <= 0)
1315 : {
1316 1514 : result = true;
1317 1514 : break;
1318 : }
1319 429378 : need_data = false;
1320 : }
1321 :
1322 : /* OK to fetch a character */
1323 26364538 : prev_raw_ptr = input_buf_ptr;
1324 26364538 : c = copy_input_buf[input_buf_ptr++];
1325 :
1326 26364538 : if (is_csv)
1327 : {
1328 : /*
1329 : * If character is '\r', we may need to look ahead below. Force
1330 : * fetch of the next character if we don't already have it. We
1331 : * need to do this before changing CSV state, in case '\r' is also
1332 : * the quote or escape character.
1333 : */
1334 5418 : if (c == '\r')
1335 : {
1336 36 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1337 : }
1338 :
1339 : /*
1340 : * Dealing with quotes and escapes here is mildly tricky. If the
1341 : * quote char is also the escape char, there's no problem - we
1342 : * just use the char as a toggle. If they are different, we need
1343 : * to ensure that we only take account of an escape inside a
1344 : * quoted field and immediately preceding a quote char, and not
1345 : * the second in an escape-escape sequence.
1346 : */
1347 5418 : if (in_quote && c == escapec)
1348 48 : last_was_esc = !last_was_esc;
1349 5418 : if (c == quotec && !last_was_esc)
1350 508 : in_quote = !in_quote;
1351 5418 : if (c != escapec)
1352 5364 : last_was_esc = false;
1353 :
1354 : /*
1355 : * Updating the line count for embedded CR and/or LF chars is
1356 : * necessarily a little fragile - this test is probably about the
1357 : * best we can do. (XXX it's arguable whether we should do this
1358 : * at all --- is cur_lineno a physical or logical count?)
1359 : */
1360 5418 : if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
1361 36 : cstate->cur_lineno++;
1362 : }
1363 :
1364 : /* Process \r */
1365 26364538 : if (c == '\r' && (!is_csv || !in_quote))
1366 : {
1367 : /* Check for \r\n on first line, _and_ handle \r\n. */
1368 0 : if (cstate->eol_type == EOL_UNKNOWN ||
1369 0 : cstate->eol_type == EOL_CRNL)
1370 : {
1371 : /*
1372 : * If need more data, go back to loop top to load it.
1373 : *
1374 : * Note that if we are at EOF, c will wind up as '\0' because
1375 : * of the guaranteed pad of input_buf.
1376 : */
1377 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1378 :
1379 : /* get next char */
1380 0 : c = copy_input_buf[input_buf_ptr];
1381 :
1382 0 : if (c == '\n')
1383 : {
1384 0 : input_buf_ptr++; /* eat newline */
1385 0 : cstate->eol_type = EOL_CRNL; /* in case not set yet */
1386 : }
1387 : else
1388 : {
1389 : /* found \r, but no \n */
1390 0 : if (cstate->eol_type == EOL_CRNL)
1391 0 : ereport(ERROR,
1392 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1393 : !is_csv ?
1394 : errmsg("literal carriage return found in data") :
1395 : errmsg("unquoted carriage return found in data"),
1396 : !is_csv ?
1397 : errhint("Use \"\\r\" to represent carriage return.") :
1398 : errhint("Use quoted CSV field to represent carriage return.")));
1399 :
1400 : /*
1401 : * if we got here, it is the first line and we didn't find
1402 : * \n, so don't consume the peeked character
1403 : */
1404 0 : cstate->eol_type = EOL_CR;
1405 : }
1406 : }
1407 0 : else if (cstate->eol_type == EOL_NL)
1408 0 : ereport(ERROR,
1409 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1410 : !is_csv ?
1411 : errmsg("literal carriage return found in data") :
1412 : errmsg("unquoted carriage return found in data"),
1413 : !is_csv ?
1414 : errhint("Use \"\\r\" to represent carriage return.") :
1415 : errhint("Use quoted CSV field to represent carriage return.")));
1416 : /* If reach here, we have found the line terminator */
1417 0 : break;
1418 : }
1419 :
1420 : /* Process \n */
1421 26364538 : if (c == '\n' && (!is_csv || !in_quote))
1422 : {
1423 1256650 : if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
1424 0 : ereport(ERROR,
1425 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1426 : !is_csv ?
1427 : errmsg("literal newline found in data") :
1428 : errmsg("unquoted newline found in data"),
1429 : !is_csv ?
1430 : errhint("Use \"\\n\" to represent newline.") :
1431 : errhint("Use quoted CSV field to represent newline.")));
1432 1256650 : cstate->eol_type = EOL_NL; /* in case not set yet */
1433 : /* If reach here, we have found the line terminator */
1434 1256650 : break;
1435 : }
1436 :
1437 : /*
1438 : * Process backslash, except in CSV mode where backslash is a normal
1439 : * character.
1440 : */
1441 25107888 : if (c == '\\' && !is_csv)
1442 : {
1443 : char c2;
1444 :
1445 8036 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1446 8036 : IF_NEED_REFILL_AND_EOF_BREAK(0);
1447 :
1448 : /* -----
1449 : * get next character
1450 : * Note: we do not change c so if it isn't \., we can fall
1451 : * through and continue processing.
1452 : * -----
1453 : */
1454 8036 : c2 = copy_input_buf[input_buf_ptr];
1455 :
1456 8036 : if (c2 == '.')
1457 : {
1458 42 : input_buf_ptr++; /* consume the '.' */
1459 42 : if (cstate->eol_type == EOL_CRNL)
1460 : {
1461 : /* Get the next character */
1462 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1463 : /* if hit_eof, c2 will become '\0' */
1464 0 : c2 = copy_input_buf[input_buf_ptr++];
1465 :
1466 0 : if (c2 == '\n')
1467 0 : ereport(ERROR,
1468 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1469 : errmsg("end-of-copy marker does not match previous newline style")));
1470 0 : else if (c2 != '\r')
1471 0 : ereport(ERROR,
1472 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1473 : errmsg("end-of-copy marker is not alone on its line")));
1474 : }
1475 :
1476 : /* Get the next character */
1477 42 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1478 : /* if hit_eof, c2 will become '\0' */
1479 42 : c2 = copy_input_buf[input_buf_ptr++];
1480 :
1481 42 : if (c2 != '\r' && c2 != '\n')
1482 6 : ereport(ERROR,
1483 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1484 : errmsg("end-of-copy marker is not alone on its line")));
1485 :
/* the marker's terminator must match the newline style already in use */
1486 36 : if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
1487 36 : (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
1488 36 : (cstate->eol_type == EOL_CR && c2 != '\r'))
1489 0 : ereport(ERROR,
1490 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1491 : errmsg("end-of-copy marker does not match previous newline style")));
1492 :
1493 : /*
1494 : * If there is any data on this line before the \., complain.
1495 : */
1496 36 : if (cstate->line_buf.len > 0 ||
1497 36 : prev_raw_ptr > cstate->input_buf_index)
1498 6 : ereport(ERROR,
1499 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1500 : errmsg("end-of-copy marker is not alone on its line")));
1501 :
1502 : /*
1503 : * Discard the \. and newline, then report EOF.
1504 : */
1505 30 : cstate->input_buf_index = input_buf_ptr;
1506 30 : result = true; /* report EOF */
1507 30 : break;
1508 : }
1509 : else
1510 : {
1511 : /*
1512 : * If we are here, it means we found a backslash followed by
1513 : * something other than a period. In non-CSV mode, anything
1514 : * after a backslash is special, so we skip over that second
1515 : * character too. If we didn't do that \\. would be
1516 : * considered an eof-of copy, while in non-CSV mode it is a
1517 : * literal backslash followed by a period.
1518 : */
1519 7994 : input_buf_ptr++;
1520 : }
1521 : }
1522 : } /* end of outer loop */
1523 :
1524 : /*
1525 : * Transfer any still-uncopied data to line_buf.
1526 : */
1527 1258194 : REFILL_LINEBUF;
1528 :
1529 1258194 : return result;
1530 : }
1531 :
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Decimal digits map to 0-9; letters are folded to lower case and mapped
 * relative to 'a', so 'a'/'A' gives 10 and 'f'/'F' gives 15.  No validation
 * is done here: the result is only meaningful for a valid hex digit.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char digit = (unsigned char) hex;

	if (!isdigit(digit))
		return tolower(digit) - 'a' + 10;

	return hex - '0';
}
1543 :
1544 : /*
1545 : * Parse the current line into separate attributes (fields),
1546 : * performing de-escaping as needed.
1547 : *
1548 : * The input is in line_buf. We use attribute_buf to hold the result
1549 : * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1550 : * string, or NULL when the input matches the null marker string.
1551 : * This array is expanded as necessary.
1552 : *
1553 : * (Note that the caller cannot check for nulls since the returned
1554 : * string would be the post-de-escaping equivalent, which may look
1555 : * the same as some valid data string.)
1556 : *
1557 : * delim is the column delimiter string (must be just one byte for now).
1558 : * null_print is the null marker string. Note that this is compared to
1559 : * the pre-de-escaped input string.
1560 : *
1561 : * The return value is the number of fields actually read.
1562 : */
1563 : static int
1564 1256132 : CopyReadAttributesText(CopyFromState cstate)
1565 : {
1566 1256132 : char delimc = cstate->opts.delim[0];
1567 : int fieldno;
1568 : char *output_ptr;
1569 : char *cur_ptr;
1570 : char *line_end_ptr;
1571 :
1572 : /*
1573 : * We need a special case for zero-column tables: check that the input
1574 : * line is empty, and return.
1575 : */
1576 1256132 : if (cstate->max_fields <= 0)
1577 : {
1578 8 : if (cstate->line_buf.len != 0)
1579 0 : ereport(ERROR,
1580 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1581 : errmsg("extra data after last expected column")));
1582 8 : return 0;
1583 : }
1584 :
1585 1256124 : resetStringInfo(&cstate->attribute_buf);
1586 :
1587 : /*
1588 : * The de-escaped attributes will certainly not be longer than the input
1589 : * data line, so we can just force attribute_buf to be large enough and
1590 : * then transfer data without any checks for enough space. We need to do
1591 : * it this way because enlarging attribute_buf mid-stream would invalidate
1592 : * pointers already stored into cstate->raw_fields[].
1593 : */
1594 1256124 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1595 8 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1596 1256124 : output_ptr = cstate->attribute_buf.data;
1597 :
1598 : /* set pointer variables for loop */
1599 1256124 : cur_ptr = cstate->line_buf.data;
1600 1256124 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1601 :
1602 : /* Outer loop iterates over fields */
1603 1256124 : fieldno = 0;
1604 : for (;;)
1605 3217542 : {
1606 4473666 : bool found_delim = false;
1607 : char *start_ptr;
1608 : char *end_ptr;
1609 : int input_len;
1610 4473666 : bool saw_non_ascii = false;
1611 :
1612 : /* Make sure there is enough space for the next value */
1613 4473666 : if (fieldno >= cstate->max_fields)
1614 : {
1615 36 : cstate->max_fields *= 2;
1616 36 : cstate->raw_fields =
1617 36 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1618 : }
1619 :
1620 : /* Remember start of field on both input and output sides */
1621 4473666 : start_ptr = cur_ptr;
1622 4473666 : cstate->raw_fields[fieldno] = output_ptr;
1623 :
1624 : /*
1625 : * Scan data for field.
1626 : *
1627 : * Note that in this loop, we are scanning to locate the end of field
1628 : * and also speculatively performing de-escaping. Once we find the
1629 : * end-of-field, we can match the raw field contents against the null
1630 : * marker string. Only after that comparison fails do we know that
1631 : * de-escaping is actually the right thing to do; therefore we *must
1632 : * not* throw any syntax errors before we've done the null-marker
1633 : * check.
1634 : */
1635 : for (;;)
1636 21884888 : {
1637 : char c;
1638 :
1639 26358554 : end_ptr = cur_ptr;
1640 26358554 : if (cur_ptr >= line_end_ptr)
1641 1256118 : break;
1642 25102436 : c = *cur_ptr++;
1643 25102436 : if (c == delimc)
1644 : {
1645 3217548 : found_delim = true;
1646 3217548 : break;
1647 : }
1648 21884888 : if (c == '\\')
1649 : {
1650 7994 : if (cur_ptr >= line_end_ptr)
1651 0 : break;
1652 7994 : c = *cur_ptr++;
1653 7994 : switch (c)
1654 : {
1655 12 : case '0':
1656 : case '1':
1657 : case '2':
1658 : case '3':
1659 : case '4':
1660 : case '5':
1661 : case '6':
1662 : case '7':
1663 : {
1664 : /* handle \013 */
1665 : int val;
1666 :
1667 12 : val = OCTVALUE(c);
1668 12 : if (cur_ptr < line_end_ptr)
1669 : {
1670 6 : c = *cur_ptr;
1671 6 : if (ISOCTAL(c))
1672 : {
1673 0 : cur_ptr++;
1674 0 : val = (val << 3) + OCTVALUE(c);
1675 0 : if (cur_ptr < line_end_ptr)
1676 : {
1677 0 : c = *cur_ptr;
1678 0 : if (ISOCTAL(c))
1679 : {
1680 0 : cur_ptr++;
1681 0 : val = (val << 3) + OCTVALUE(c);
1682 : }
1683 : }
1684 : }
1685 : }
1686 12 : c = val & 0377;
1687 12 : if (c == '\0' || IS_HIGHBIT_SET(c))
1688 12 : saw_non_ascii = true;
1689 : }
1690 12 : break;
1691 12 : case 'x':
1692 : /* Handle \x3F */
1693 12 : if (cur_ptr < line_end_ptr)
1694 : {
1695 6 : char hexchar = *cur_ptr;
1696 :
1697 6 : if (isxdigit((unsigned char) hexchar))
1698 : {
1699 0 : int val = GetDecimalFromHex(hexchar);
1700 :
1701 0 : cur_ptr++;
1702 0 : if (cur_ptr < line_end_ptr)
1703 : {
1704 0 : hexchar = *cur_ptr;
1705 0 : if (isxdigit((unsigned char) hexchar))
1706 : {
1707 0 : cur_ptr++;
1708 0 : val = (val << 4) + GetDecimalFromHex(hexchar);
1709 : }
1710 : }
1711 0 : c = val & 0xff;
1712 0 : if (c == '\0' || IS_HIGHBIT_SET(c))
1713 0 : saw_non_ascii = true;
1714 : }
1715 : }
1716 12 : break;
1717 0 : case 'b':
1718 0 : c = '\b';
1719 0 : break;
1720 0 : case 'f':
1721 0 : c = '\f';
1722 0 : break;
1723 3050 : case 'n':
1724 3050 : c = '\n';
1725 3050 : break;
1726 0 : case 'r':
1727 0 : c = '\r';
1728 0 : break;
1729 0 : case 't':
1730 0 : c = '\t';
1731 0 : break;
1732 0 : case 'v':
1733 0 : c = '\v';
1734 0 : break;
1735 :
1736 : /*
1737 : * in all other cases, take the char after '\'
1738 : * literally
1739 : */
1740 : }
1741 21876894 : }
1742 :
1743 : /* Add c to output string */
1744 21884888 : *output_ptr++ = c;
1745 : }
1746 :
1747 : /* Check whether raw input matched null marker */
1748 4473666 : input_len = end_ptr - start_ptr;
1749 4473666 : if (input_len == cstate->opts.null_print_len &&
1750 245810 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1751 4808 : cstate->raw_fields[fieldno] = NULL;
1752 : /* Check whether raw input matched default marker */
1753 4468858 : else if (fieldno < list_length(cstate->attnumlist) &&
1754 4468816 : cstate->opts.default_print &&
1755 114 : input_len == cstate->opts.default_print_len &&
1756 30 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1757 24 : {
1758 : /* fieldno is 0-indexed and attnum is 1-indexed */
1759 30 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1760 :
1761 30 : if (cstate->defexprs[m] != NULL)
1762 : {
1763 : /* defaults contain entries for all physical attributes */
1764 24 : cstate->defaults[m] = true;
1765 : }
1766 : else
1767 : {
1768 6 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1769 6 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1770 :
1771 6 : ereport(ERROR,
1772 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1773 : errmsg("unexpected default marker in COPY data"),
1774 : errdetail("Column \"%s\" has no default value.",
1775 : NameStr(att->attname))));
1776 : }
1777 : }
1778 : else
1779 : {
1780 : /*
1781 : * At this point we know the field is supposed to contain data.
1782 : *
1783 : * If we de-escaped any non-7-bit-ASCII chars, make sure the
1784 : * resulting string is valid data for the db encoding.
1785 : */
1786 4468828 : if (saw_non_ascii)
1787 : {
1788 0 : char *fld = cstate->raw_fields[fieldno];
1789 :
1790 0 : pg_verifymbstr(fld, output_ptr - fld, false);
1791 : }
1792 : }
1793 :
1794 : /* Terminate attribute value in output area */
1795 4473660 : *output_ptr++ = '\0';
1796 :
1797 4473660 : fieldno++;
1798 : /* Done if we hit EOL instead of a delim */
1799 4473660 : if (!found_delim)
1800 1256118 : break;
1801 : }
1802 :
1803 : /* Clean up state of attribute_buf */
1804 1256118 : output_ptr--;
1805 : Assert(*output_ptr == '\0');
1806 1256118 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1807 :
1808 1256118 : return fieldno;
1809 : }
1810 :
1811 : /*
1812 : * Parse the current line into separate attributes (fields),
1813 : * performing de-escaping as needed. This has exactly the same API as
1814 : * CopyReadAttributesText, except we parse the fields according to
1815 : * "standard" (i.e. common) CSV usage.
1816 : */
1817 : static int
1818 474 : CopyReadAttributesCSV(CopyFromState cstate)
1819 : {
1820 474 : char delimc = cstate->opts.delim[0];
1821 474 : char quotec = cstate->opts.quote[0];
1822 474 : char escapec = cstate->opts.escape[0];
1823 : int fieldno;
1824 : char *output_ptr;
1825 : char *cur_ptr;
1826 : char *line_end_ptr;
1827 :
1828 : /*
1829 : * We need a special case for zero-column tables: check that the input
1830 : * line is empty, and return.
1831 : */
1832 474 : if (cstate->max_fields <= 0)
1833 : {
1834 0 : if (cstate->line_buf.len != 0)
1835 0 : ereport(ERROR,
1836 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1837 : errmsg("extra data after last expected column")));
1838 0 : return 0;
1839 : }
1840 :
1841 474 : resetStringInfo(&cstate->attribute_buf);
1842 :
1843 : /*
1844 : * The de-escaped attributes will certainly not be longer than the input
1845 : * data line, so we can just force attribute_buf to be large enough and
1846 : * then transfer data without any checks for enough space. We need to do
1847 : * it this way because enlarging attribute_buf mid-stream would invalidate
1848 : * pointers already stored into cstate->raw_fields[].
1849 : */
1850 474 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1851 0 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1852 474 : output_ptr = cstate->attribute_buf.data;
1853 :
1854 : /* set pointer variables for loop */
1855 474 : cur_ptr = cstate->line_buf.data;
1856 474 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1857 :
1858 : /* Outer loop iterates over fields */
1859 474 : fieldno = 0;
1860 : for (;;)
1861 530 : {
1862 1004 : bool found_delim = false;
1863 1004 : bool saw_quote = false;
1864 : char *start_ptr;
1865 : char *end_ptr;
1866 : int input_len;
1867 :
1868 : /* Make sure there is enough space for the next value */
1869 1004 : if (fieldno >= cstate->max_fields)
1870 : {
1871 0 : cstate->max_fields *= 2;
1872 0 : cstate->raw_fields =
1873 0 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1874 : }
1875 :
1876 : /* Remember start of field on both input and output sides */
1877 1004 : start_ptr = cur_ptr;
1878 1004 : cstate->raw_fields[fieldno] = output_ptr;
1879 :
1880 : /*
1881 : * Scan data for field,
1882 : *
1883 : * The loop starts in "not quote" mode and then toggles between that
1884 : * and "in quote" mode. The loop exits normally if it is in "not
1885 : * quote" mode and a delimiter or line end is seen.
1886 : */
1887 : for (;;)
1888 222 : {
1889 : char c;
1890 :
1891 : /* Not in quote */
1892 : for (;;)
1893 : {
1894 3230 : end_ptr = cur_ptr;
1895 3230 : if (cur_ptr >= line_end_ptr)
1896 468 : goto endfield;
1897 2762 : c = *cur_ptr++;
1898 : /* unquoted field delimiter */
1899 2762 : if (c == delimc)
1900 : {
1901 536 : found_delim = true;
1902 536 : goto endfield;
1903 : }
1904 : /* start of quoted field (or part of field) */
1905 2226 : if (c == quotec)
1906 : {
1907 222 : saw_quote = true;
1908 222 : break;
1909 : }
1910 : /* Add c to output string */
1911 2004 : *output_ptr++ = c;
1912 : }
1913 :
1914 : /* In quote */
1915 : for (;;)
1916 : {
1917 1390 : end_ptr = cur_ptr;
1918 1390 : if (cur_ptr >= line_end_ptr)
1919 0 : ereport(ERROR,
1920 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1921 : errmsg("unterminated CSV quoted field")));
1922 :
1923 1390 : c = *cur_ptr++;
1924 :
1925 : /* escape within a quoted field */
1926 1390 : if (c == escapec)
1927 : {
1928 : /*
1929 : * peek at the next char if available, and escape it if it
1930 : * is an escape char or a quote char
1931 : */
1932 118 : if (cur_ptr < line_end_ptr)
1933 : {
1934 72 : char nextc = *cur_ptr;
1935 :
1936 72 : if (nextc == escapec || nextc == quotec)
1937 : {
1938 24 : *output_ptr++ = nextc;
1939 24 : cur_ptr++;
1940 24 : continue;
1941 : }
1942 : }
1943 : }
1944 :
1945 : /*
1946 : * end of quoted field. Must do this test after testing for
1947 : * escape in case quote char and escape char are the same
1948 : * (which is the common case).
1949 : */
1950 1366 : if (c == quotec)
1951 222 : break;
1952 :
1953 : /* Add c to output string */
1954 1144 : *output_ptr++ = c;
1955 : }
1956 : }
1957 1004 : endfield:
1958 :
1959 : /* Terminate attribute value in output area */
1960 1004 : *output_ptr++ = '\0';
1961 :
1962 : /* Check whether raw input matched null marker */
1963 1004 : input_len = end_ptr - start_ptr;
1964 1004 : if (!saw_quote && input_len == cstate->opts.null_print_len &&
1965 44 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1966 44 : cstate->raw_fields[fieldno] = NULL;
1967 : /* Check whether raw input matched default marker */
1968 960 : else if (fieldno < list_length(cstate->attnumlist) &&
1969 960 : cstate->opts.default_print &&
1970 150 : input_len == cstate->opts.default_print_len &&
1971 42 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1972 : {
1973 : /* fieldno is 0-index and attnum is 1-index */
1974 42 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1975 :
1976 42 : if (cstate->defexprs[m] != NULL)
1977 : {
1978 : /* defaults contain entries for all physical attributes */
1979 36 : cstate->defaults[m] = true;
1980 : }
1981 : else
1982 : {
1983 6 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1984 6 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1985 :
1986 6 : ereport(ERROR,
1987 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1988 : errmsg("unexpected default marker in COPY data"),
1989 : errdetail("Column \"%s\" has no default value.",
1990 : NameStr(att->attname))));
1991 : }
1992 : }
1993 :
1994 998 : fieldno++;
1995 : /* Done if we hit EOL instead of a delim */
1996 998 : if (!found_delim)
1997 468 : break;
1998 : }
1999 :
2000 : /* Clean up state of attribute_buf */
2001 468 : output_ptr--;
2002 : Assert(*output_ptr == '\0');
2003 468 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2004 :
2005 468 : return fieldno;
2006 : }
2007 :
2008 :
2009 : /*
2010 : * Read a binary attribute
2011 : */
2012 : static Datum
2013 158 : CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
2014 : Oid typioparam, int32 typmod,
2015 : bool *isnull)
2016 : {
2017 : int32 fld_size;
2018 : Datum result;
2019 :
2020 158 : if (!CopyGetInt32(cstate, &fld_size))
2021 0 : ereport(ERROR,
2022 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2023 : errmsg("unexpected EOF in COPY data")));
2024 158 : if (fld_size == -1)
2025 : {
2026 30 : *isnull = true;
2027 30 : return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2028 : }
2029 128 : if (fld_size < 0)
2030 0 : ereport(ERROR,
2031 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2032 : errmsg("invalid field size")));
2033 :
2034 : /* reset attribute_buf to empty, and load raw data in it */
2035 128 : resetStringInfo(&cstate->attribute_buf);
2036 :
2037 128 : enlargeStringInfo(&cstate->attribute_buf, fld_size);
2038 128 : if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2039 128 : fld_size) != fld_size)
2040 0 : ereport(ERROR,
2041 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2042 : errmsg("unexpected EOF in COPY data")));
2043 :
2044 128 : cstate->attribute_buf.len = fld_size;
2045 128 : cstate->attribute_buf.data[fld_size] = '\0';
2046 :
2047 : /* Call the column type's binary input converter */
2048 128 : result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2049 : typioparam, typmod);
2050 :
2051 : /* Trouble if it didn't eat the whole buffer */
2052 128 : if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2053 2 : ereport(ERROR,
2054 : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
2055 : errmsg("incorrect binary data format")));
2056 :
2057 126 : *isnull = false;
2058 126 : return result;
2059 : }
|