Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * copyfromparse.c
4 : * Parse CSV/text/binary format for COPY FROM.
5 : *
6 : * This file contains routines to parse the text, CSV and binary input
7 : * formats. The main entry point is NextCopyFrom(), which parses the
8 : * next input line and returns it as Datums.
9 : *
10 : * In text/CSV mode, the parsing happens in multiple stages:
11 : *
12 : * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 : * 1. 2. 3. 4.
14 : *
15 : * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 : * places it into 'raw_buf'.
17 : *
18 : * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 : * the data in 'raw_buf' from client to server encoding, placing the
20 : * converted result in 'input_buf'.
21 : *
22 : * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 : * It is responsible for finding the next newline marker, taking quote and
24 : * escape characters into account according to the COPY options. The line
25 : * is copied into 'line_buf', with quotes and escape characters still
26 : * intact.
27 : *
28 : * 4. CopyReadAttributesText/CSV() function takes the input line from
29 : * 'line_buf', and splits it into fields, unescaping the data as required.
30 : * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 : * pointers to each field.
32 : *
33 : * If encoding conversion is not required, a shortcut is taken in step 2 to
34 : * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 : * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 : * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 : * the data is valid in the current encoding.
38 : *
39 : * In binary mode, the pipeline is much simpler. Input is loaded into
40 : * 'raw_buf', and encoding conversion is done in the datatype-specific
41 : * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 : * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
 * data when it's passed to the receive function.
44 : *
45 : * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 : * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 : * and 'attribute_buf' are expanded on demand, to hold the longest line
48 : * encountered so far.
49 : *
50 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
51 : * Portions Copyright (c) 1994, Regents of the University of California
52 : *
53 : *
54 : * IDENTIFICATION
55 : * src/backend/commands/copyfromparse.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <ctype.h>
62 : #include <unistd.h>
63 : #include <sys/stat.h>
64 :
65 : #include "commands/copyapi.h"
66 : #include "commands/copyfrom_internal.h"
67 : #include "commands/progress.h"
68 : #include "executor/executor.h"
69 : #include "libpq/libpq.h"
70 : #include "libpq/pqformat.h"
71 : #include "mb/pg_wchar.h"
72 : #include "miscadmin.h"
73 : #include "pgstat.h"
74 : #include "port/pg_bswap.h"
75 : #include "utils/builtins.h"
76 : #include "utils/rel.h"
77 :
/* Nonzero when 'c' is one of the octal digit characters '0' through '7'. */
#define ISOCTAL(c) (!((c) < '0' || (c) > '7'))
/* Numeric value of the digit character 'c' (its offset from '0'). */
#define OCTVALUE(c) ((c) - '0')
80 :
/*
 * Helper macros for the code that scans input_buf and fills line_buf.
 *
 * They are macros rather than functions because they issue 'continue' or
 * 'break' on behalf of the loop they are expanded in, and to avoid function
 * call overhead in tight COPY loops.  They refer to names that are not
 * defined here (input_buf_ptr, copy_buf_len, hit_eof, prev_raw_ptr,
 * need_data, result, cstate), so those must be in scope at the point of use.
 *
 * Each macro expands to one "if (...) {...} else ((void) 0)" statement.  The
 * customary "do {...} while (0)" wrapper cannot be used, because it would
 * capture the continue/break.  The trailing "else ((void) 0)" keeps the "if"
 * from pairing with an "else" in the calling code and avoids compiler
 * warnings about empty statements.  See
 * http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
 */

/*
 * If fewer than 'extralen' bytes beyond input_buf_ptr are available and EOF
 * has not been hit, back up to the character read at the top of the loop
 * (so it stays in the buffer even after more than one read-ahead), request
 * more data, and restart the loop.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
{ \
    /* undo fetch */ \
    input_buf_ptr = prev_raw_ptr; \
    need_data = true; \
    continue; \
} else ((void) 0)

/*
 * If fewer than 'extralen' bytes beyond input_buf_ptr are available and EOF
 * has been hit, consume the remainder of the buffer, set result to true and
 * leave the loop.
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
{ \
    /* consume the partial character */ \
    if (extralen) \
        input_buf_ptr = copy_buf_len; \
    /* backslash just before EOF, treat as data char */ \
    result = true; \
    break; \
} else ((void) 0)

/*
 * Move the already-approved bytes, i.e. those between
 * cstate->input_buf_index and input_buf_ptr, into line_buf.  This must be
 * done to be sure there is some room in input_buf.
 */
#define REFILL_LINEBUF \
if (input_buf_ptr > cstate->input_buf_index) \
{ \
    appendBinaryStringInfo(&cstate->line_buf, \
                           cstate->input_buf + cstate->input_buf_index, \
                           input_buf_ptr - cstate->input_buf_index); \
    cstate->input_buf_index = input_buf_ptr; \
} else ((void) 0)
137 :
/*
 * 11-byte signature that starts a binary COPY file.  The literal spells the
 * first ten bytes; its implicit terminating NUL supplies the eleventh.
 *
 * NOTE: there's a copy of this in copyto.c
 */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n";
140 :
141 :
142 : /* non-export function prototypes */
143 : static bool CopyReadLine(CopyFromState cstate, bool is_csv);
144 : static bool CopyReadLineText(CopyFromState cstate, bool is_csv);
145 : static int CopyReadAttributesText(CopyFromState cstate);
146 : static int CopyReadAttributesCSV(CopyFromState cstate);
147 : static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
148 : Oid typioparam, int32 typmod,
149 : bool *isnull);
150 : static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate,
151 : ExprContext *econtext,
152 : Datum *values,
153 : bool *nulls,
154 : bool is_csv);
155 : static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate,
156 : char ***fields,
157 : int *nfields,
158 : bool is_csv);
159 :
160 :
161 : /* Low-level communications functions */
162 : static int CopyGetData(CopyFromState cstate, void *databuf,
163 : int minread, int maxread);
164 : static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
165 : static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
166 : static void CopyLoadInputBuf(CopyFromState cstate);
167 : static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
168 :
169 : void
170 1104 : ReceiveCopyBegin(CopyFromState cstate)
171 : {
172 : StringInfoData buf;
173 1104 : int natts = list_length(cstate->attnumlist);
174 1104 : int16 format = (cstate->opts.binary ? 1 : 0);
175 : int i;
176 :
177 1104 : pq_beginmessage(&buf, PqMsg_CopyInResponse);
178 1104 : pq_sendbyte(&buf, format); /* overall format */
179 1104 : pq_sendint16(&buf, natts);
180 3980 : for (i = 0; i < natts; i++)
181 2876 : pq_sendint16(&buf, format); /* per-column formats */
182 1104 : pq_endmessage(&buf);
183 1104 : cstate->copy_src = COPY_FRONTEND;
184 1104 : cstate->fe_msgbuf = makeStringInfo();
185 : /* We *must* flush here to ensure FE knows it can send. */
186 1104 : pq_flush();
187 1104 : }
188 :
189 : void
190 14 : ReceiveCopyBinaryHeader(CopyFromState cstate)
191 : {
192 : char readSig[11];
193 : int32 tmp;
194 :
195 : /* Signature */
196 14 : if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
197 14 : memcmp(readSig, BinarySignature, 11) != 0)
198 0 : ereport(ERROR,
199 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
200 : errmsg("COPY file signature not recognized")));
201 : /* Flags field */
202 14 : if (!CopyGetInt32(cstate, &tmp))
203 0 : ereport(ERROR,
204 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
205 : errmsg("invalid COPY file header (missing flags)")));
206 14 : if ((tmp & (1 << 16)) != 0)
207 0 : ereport(ERROR,
208 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
209 : errmsg("invalid COPY file header (WITH OIDS)")));
210 14 : tmp &= ~(1 << 16);
211 14 : if ((tmp >> 16) != 0)
212 0 : ereport(ERROR,
213 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
214 : errmsg("unrecognized critical flags in COPY file header")));
215 : /* Header extension length */
216 14 : if (!CopyGetInt32(cstate, &tmp) ||
217 14 : tmp < 0)
218 0 : ereport(ERROR,
219 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
220 : errmsg("invalid COPY file header (missing length)")));
221 : /* Skip extension header, if present */
222 14 : while (tmp-- > 0)
223 : {
224 0 : if (CopyReadBinaryData(cstate, readSig, 1) != 1)
225 0 : ereport(ERROR,
226 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
227 : errmsg("invalid COPY file header (wrong length)")));
228 : }
229 14 : }
230 :
231 : /*
232 : * CopyGetData reads data from the source (file or frontend)
233 : *
234 : * We attempt to read at least minread, and at most maxread, bytes from
235 : * the source. The actual number of bytes read is returned; if this is
236 : * less than minread, EOF was detected.
237 : *
238 : * Note: when copying from the frontend, we expect a proper EOF mark per
239 : * protocol; if the frontend simply drops the connection, we raise error.
240 : * It seems unwise to allow the COPY IN to complete normally in that case.
241 : *
242 : * NB: no data conversion is applied here.
243 : */
244 : static int
245 431998 : CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
246 : {
247 431998 : int bytesread = 0;
248 :
249 431998 : switch (cstate->copy_src)
250 : {
251 1100 : case COPY_FILE:
252 1100 : pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ);
253 1100 : bytesread = fread(databuf, 1, maxread, cstate->copy_file);
254 1100 : pgstat_report_wait_end();
255 1100 : if (ferror(cstate->copy_file))
256 0 : ereport(ERROR,
257 : (errcode_for_file_access(),
258 : errmsg("could not read from COPY file: %m")));
259 1100 : if (bytesread == 0)
260 432 : cstate->raw_reached_eof = true;
261 1100 : break;
262 402894 : case COPY_FRONTEND:
263 804146 : while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
264 : {
265 : int avail;
266 :
267 803402 : while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
268 : {
269 : /* Try to receive another message */
270 : int mtype;
271 : int maxmsglen;
272 :
273 402150 : readmessage:
274 402150 : HOLD_CANCEL_INTERRUPTS();
275 402150 : pq_startmsgread();
276 402150 : mtype = pq_getbyte();
277 402150 : if (mtype == EOF)
278 0 : ereport(ERROR,
279 : (errcode(ERRCODE_CONNECTION_FAILURE),
280 : errmsg("unexpected EOF on client connection with an open transaction")));
281 : /* Validate message type and set packet size limit */
282 : switch (mtype)
283 : {
284 401252 : case PqMsg_CopyData:
285 401252 : maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
286 401252 : break;
287 894 : case PqMsg_CopyDone:
288 : case PqMsg_CopyFail:
289 : case PqMsg_Flush:
290 : case PqMsg_Sync:
291 894 : maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
292 894 : break;
293 4 : default:
294 4 : ereport(ERROR,
295 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
296 : errmsg("unexpected message type 0x%02X during COPY from stdin",
297 : mtype)));
298 : maxmsglen = 0; /* keep compiler quiet */
299 : break;
300 : }
301 : /* Now collect the message body */
302 402146 : if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
303 0 : ereport(ERROR,
304 : (errcode(ERRCODE_CONNECTION_FAILURE),
305 : errmsg("unexpected EOF on client connection with an open transaction")));
306 402146 : RESUME_CANCEL_INTERRUPTS();
307 : /* ... and process it */
308 : switch (mtype)
309 : {
310 401252 : case PqMsg_CopyData:
311 401252 : break;
312 894 : case PqMsg_CopyDone:
313 : /* COPY IN correctly terminated by frontend */
314 894 : cstate->raw_reached_eof = true;
315 894 : return bytesread;
316 0 : case PqMsg_CopyFail:
317 0 : ereport(ERROR,
318 : (errcode(ERRCODE_QUERY_CANCELED),
319 : errmsg("COPY from stdin failed: %s",
320 : pq_getmsgstring(cstate->fe_msgbuf))));
321 : break;
322 0 : case PqMsg_Flush:
323 : case PqMsg_Sync:
324 :
325 : /*
326 : * Ignore Flush/Sync for the convenience of client
327 : * libraries (such as libpq) that may send those
328 : * without noticing that the command they just
329 : * sent was COPY.
330 : */
331 0 : goto readmessage;
332 401252 : default:
333 : Assert(false); /* NOT REACHED */
334 : }
335 : }
336 401252 : avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
337 401252 : if (avail > maxread)
338 0 : avail = maxread;
339 401252 : pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
340 401252 : databuf = (char *) databuf + avail;
341 401252 : maxread -= avail;
342 401252 : bytesread += avail;
343 : }
344 401996 : break;
345 28004 : case COPY_CALLBACK:
346 28004 : bytesread = cstate->data_source_cb(databuf, minread, maxread);
347 28004 : break;
348 : }
349 :
350 431100 : return bytesread;
351 : }
352 :
353 :
354 : /*
355 : * These functions do apply some data conversion
356 : */
357 :
358 : /*
359 : * CopyGetInt32 reads an int32 that appears in network byte order
360 : *
361 : * Returns true if OK, false if EOF
362 : */
363 : static inline bool
364 186 : CopyGetInt32(CopyFromState cstate, int32 *val)
365 : {
366 : uint32 buf;
367 :
368 186 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
369 : {
370 0 : *val = 0; /* suppress compiler warning */
371 0 : return false;
372 : }
373 186 : *val = (int32) pg_ntoh32(buf);
374 186 : return true;
375 : }
376 :
377 : /*
378 : * CopyGetInt16 reads an int16 that appears in network byte order
379 : */
380 : static inline bool
381 42 : CopyGetInt16(CopyFromState cstate, int16 *val)
382 : {
383 : uint16 buf;
384 :
385 42 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
386 : {
387 0 : *val = 0; /* suppress compiler warning */
388 0 : return false;
389 : }
390 42 : *val = (int16) pg_ntoh16(buf);
391 42 : return true;
392 : }
393 :
394 :
395 : /*
396 : * Perform encoding conversion on data in 'raw_buf', writing the converted
397 : * data into 'input_buf'.
398 : *
399 : * On entry, there must be some data to convert in 'raw_buf'.
400 : */
401 : static void
402 862120 : CopyConvertBuf(CopyFromState cstate)
403 : {
404 : /*
405 : * If the file and server encoding are the same, no encoding conversion is
406 : * required. However, we still need to verify that the input is valid for
407 : * the encoding.
408 : */
409 862120 : if (!cstate->need_transcoding)
410 : {
411 : /*
412 : * When conversion is not required, input_buf and raw_buf are the
413 : * same. raw_buf_len is the total number of bytes in the buffer, and
414 : * input_buf_len tracks how many of those bytes have already been
415 : * verified.
416 : */
417 862036 : int preverifiedlen = cstate->input_buf_len;
418 862036 : int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
419 : int nverified;
420 :
421 862036 : if (unverifiedlen == 0)
422 : {
423 : /*
424 : * If no more raw data is coming, report the EOF to the caller.
425 : */
426 432550 : if (cstate->raw_reached_eof)
427 1530 : cstate->input_reached_eof = true;
428 432550 : return;
429 : }
430 :
431 : /*
432 : * Verify the new data, including any residual unverified bytes from
433 : * previous round.
434 : */
435 429486 : nverified = pg_encoding_verifymbstr(cstate->file_encoding,
436 429486 : cstate->raw_buf + preverifiedlen,
437 : unverifiedlen);
438 429486 : if (nverified == 0)
439 : {
440 : /*
441 : * Could not verify anything.
442 : *
443 : * If there is no more raw input data coming, it means that there
444 : * was an incomplete multi-byte sequence at the end. Also, if
445 : * there's "enough" input left, we should be able to verify at
446 : * least one character, and a failure to do so means that we've
447 : * hit an invalid byte sequence.
448 : */
449 0 : if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
450 0 : cstate->input_reached_error = true;
451 0 : return;
452 : }
453 429486 : cstate->input_buf_len += nverified;
454 : }
455 : else
456 : {
457 : /*
458 : * Encoding conversion is needed.
459 : */
460 : int nbytes;
461 : unsigned char *src;
462 : int srclen;
463 : unsigned char *dst;
464 : int dstlen;
465 : int convertedlen;
466 :
467 84 : if (RAW_BUF_BYTES(cstate) == 0)
468 : {
469 : /*
470 : * If no more raw data is coming, report the EOF to the caller.
471 : */
472 48 : if (cstate->raw_reached_eof)
473 12 : cstate->input_reached_eof = true;
474 48 : return;
475 : }
476 :
477 : /*
478 : * First, copy down any unprocessed data.
479 : */
480 36 : nbytes = INPUT_BUF_BYTES(cstate);
481 36 : if (nbytes > 0 && cstate->input_buf_index > 0)
482 0 : memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
483 : nbytes);
484 36 : cstate->input_buf_index = 0;
485 36 : cstate->input_buf_len = nbytes;
486 36 : cstate->input_buf[nbytes] = '\0';
487 :
488 36 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
489 36 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
490 36 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
491 36 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
492 :
493 : /*
494 : * Do the conversion. This might stop short, if there is an invalid
495 : * byte sequence in the input. We'll convert as much as we can in
496 : * that case.
497 : *
498 : * Note: Even if we hit an invalid byte sequence, we don't report the
499 : * error until all the valid bytes have been consumed. The input
500 : * might contain an end-of-input marker (\.), and we don't want to
501 : * report an error if the invalid byte sequence is after the
502 : * end-of-input marker. We might unnecessarily convert some data
503 : * after the end-of-input marker as long as it's valid for the
504 : * encoding, but that's harmless.
505 : */
506 36 : convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
507 : cstate->file_encoding,
508 : GetDatabaseEncoding(),
509 : src, srclen,
510 : dst, dstlen,
511 : true);
512 36 : if (convertedlen == 0)
513 : {
514 : /*
515 : * Could not convert anything. If there is no more raw input data
516 : * coming, it means that there was an incomplete multi-byte
517 : * sequence at the end. Also, if there is plenty of input left,
518 : * we should be able to convert at least one character, so a
519 : * failure to do so must mean that we've hit a byte sequence
520 : * that's invalid.
521 : */
522 24 : if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
523 12 : cstate->input_reached_error = true;
524 24 : return;
525 : }
526 12 : cstate->raw_buf_index += convertedlen;
527 12 : cstate->input_buf_len += strlen((char *) dst);
528 : }
529 : }
530 :
531 : /*
532 : * Report an encoding or conversion error.
533 : */
534 : static void
535 12 : CopyConversionError(CopyFromState cstate)
536 : {
537 : Assert(cstate->raw_buf_len > 0);
538 : Assert(cstate->input_reached_error);
539 :
540 12 : if (!cstate->need_transcoding)
541 : {
542 : /*
543 : * Everything up to input_buf_len was successfully verified, and
544 : * input_buf_len points to the invalid or incomplete character.
545 : */
546 0 : report_invalid_encoding(cstate->file_encoding,
547 0 : cstate->raw_buf + cstate->input_buf_len,
548 0 : cstate->raw_buf_len - cstate->input_buf_len);
549 : }
550 : else
551 : {
552 : /*
553 : * raw_buf_index points to the invalid or untranslatable character. We
554 : * let the conversion routine report the error, because it can provide
555 : * a more specific error message than we could here. An earlier call
556 : * to the conversion routine in CopyConvertBuf() detected that there
557 : * is an error, now we call the conversion routine again with
558 : * noError=false, to have it throw the error.
559 : */
560 : unsigned char *src;
561 : int srclen;
562 : unsigned char *dst;
563 : int dstlen;
564 :
565 12 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
566 12 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
567 12 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
568 12 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
569 :
570 12 : (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
571 : cstate->file_encoding,
572 : GetDatabaseEncoding(),
573 : src, srclen,
574 : dst, dstlen,
575 : false);
576 :
577 : /*
578 : * The conversion routine should have reported an error, so this
579 : * should not be reached.
580 : */
581 0 : elog(ERROR, "encoding conversion failed without error");
582 : }
583 : }
584 :
585 : /*
586 : * Load more data from data source to raw_buf.
587 : *
588 : * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
589 : * beginning of the buffer, and we load new data after that.
590 : */
591 : static void
592 431104 : CopyLoadRawBuf(CopyFromState cstate)
593 : {
594 : int nbytes;
595 : int inbytes;
596 :
597 : /*
598 : * In text mode, if encoding conversion is not required, raw_buf and
599 : * input_buf point to the same buffer. Their len/index better agree, too.
600 : */
601 431104 : if (cstate->raw_buf == cstate->input_buf)
602 : {
603 : Assert(!cstate->need_transcoding);
604 : Assert(cstate->raw_buf_index == cstate->input_buf_index);
605 : Assert(cstate->input_buf_len <= cstate->raw_buf_len);
606 : }
607 :
608 : /*
609 : * Copy down the unprocessed data if any.
610 : */
611 431104 : nbytes = RAW_BUF_BYTES(cstate);
612 431104 : if (nbytes > 0 && cstate->raw_buf_index > 0)
613 0 : memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
614 : nbytes);
615 431104 : cstate->raw_buf_len -= cstate->raw_buf_index;
616 431104 : cstate->raw_buf_index = 0;
617 :
618 : /*
619 : * If raw_buf and input_buf are in fact the same buffer, adjust the
620 : * input_buf variables, too.
621 : */
622 431104 : if (cstate->raw_buf == cstate->input_buf)
623 : {
624 431020 : cstate->input_buf_len -= cstate->input_buf_index;
625 431020 : cstate->input_buf_index = 0;
626 : }
627 :
628 : /* Load more data */
629 431104 : inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
630 431104 : 1, RAW_BUF_SIZE - cstate->raw_buf_len);
631 431100 : nbytes += inbytes;
632 431100 : cstate->raw_buf[nbytes] = '\0';
633 431100 : cstate->raw_buf_len = nbytes;
634 :
635 431100 : cstate->bytes_processed += inbytes;
636 431100 : pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
637 :
638 431100 : if (inbytes == 0)
639 1566 : cstate->raw_reached_eof = true;
640 431100 : }
641 :
642 : /*
643 : * CopyLoadInputBuf loads some more data into input_buf
644 : *
645 : * On return, at least one more input character is loaded into
646 : * input_buf, or input_reached_eof is set.
647 : *
648 : * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
649 : * of the buffer and then we load more data after that.
650 : */
651 : static void
652 431056 : CopyLoadInputBuf(CopyFromState cstate)
653 : {
654 431056 : int nbytes = INPUT_BUF_BYTES(cstate);
655 :
656 : /*
657 : * The caller has updated input_buf_index to indicate how much of the
658 : * input has been consumed and isn't needed anymore. If input_buf is the
659 : * same physical area as raw_buf, update raw_buf_index accordingly.
660 : */
661 431056 : if (cstate->raw_buf == cstate->input_buf)
662 : {
663 : Assert(!cstate->need_transcoding);
664 : Assert(cstate->input_buf_index >= cstate->raw_buf_index);
665 431020 : cstate->raw_buf_index = cstate->input_buf_index;
666 : }
667 :
668 : for (;;)
669 : {
670 : /* If we now have some unconverted data, try to convert it */
671 862120 : CopyConvertBuf(cstate);
672 :
673 : /* If we now have some more input bytes ready, return them */
674 862120 : if (INPUT_BUF_BYTES(cstate) > nbytes)
675 429498 : return;
676 :
677 : /*
678 : * If we reached an invalid byte sequence, or we're at an incomplete
679 : * multi-byte character but there is no more raw input data, report
680 : * conversion error.
681 : */
682 432622 : if (cstate->input_reached_error)
683 12 : CopyConversionError(cstate);
684 :
685 : /* no more input, and everything has been converted */
686 432610 : if (cstate->input_reached_eof)
687 1542 : break;
688 :
689 : /* Try to load more raw data */
690 : Assert(!cstate->raw_reached_eof);
691 431068 : CopyLoadRawBuf(cstate);
692 : }
693 : }
694 :
695 : /*
696 : * CopyReadBinaryData
697 : *
698 : * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
699 : * and writes them to 'dest'. Returns the number of bytes read (which
700 : * would be less than 'nbytes' only if we reach EOF).
701 : */
702 : static int
703 382 : CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
704 : {
705 382 : int copied_bytes = 0;
706 :
707 382 : if (RAW_BUF_BYTES(cstate) >= nbytes)
708 : {
709 : /* Enough bytes are present in the buffer. */
710 346 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
711 346 : cstate->raw_buf_index += nbytes;
712 346 : copied_bytes = nbytes;
713 : }
714 : else
715 : {
716 : /*
717 : * Not enough bytes in the buffer, so must read from the file. Need
718 : * to loop since 'nbytes' could be larger than the buffer size.
719 : */
720 : do
721 : {
722 : int copy_bytes;
723 :
724 : /* Load more data if buffer is empty. */
725 36 : if (RAW_BUF_BYTES(cstate) == 0)
726 : {
727 36 : CopyLoadRawBuf(cstate);
728 36 : if (cstate->raw_reached_eof)
729 12 : break; /* EOF */
730 : }
731 :
732 : /* Transfer some bytes. */
733 24 : copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
734 24 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
735 24 : cstate->raw_buf_index += copy_bytes;
736 24 : dest += copy_bytes;
737 24 : copied_bytes += copy_bytes;
738 24 : } while (copied_bytes < nbytes);
739 : }
740 :
741 382 : return copied_bytes;
742 : }
743 :
744 : /*
745 : * This function is exposed for use by extensions that read raw fields in the
746 : * next line. See NextCopyFromRawFieldsInternal() for details.
747 : */
748 : bool
749 0 : NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
750 : {
751 0 : return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
752 0 : cstate->opts.csv_mode);
753 : }
754 :
755 : /*
756 : * Workhorse for NextCopyFromRawFields().
757 : *
758 : * Read raw fields in the next line for COPY FROM in text or csv mode. Return
759 : * false if no more lines.
760 : *
761 : * An internal temporary buffer is returned via 'fields'. It is valid until
762 : * the next call of the function. Since the function returns all raw fields
763 : * in the input file, 'nfields' could be different from the number of columns
764 : * in the relation.
765 : *
766 : * NOTE: force_not_null option are not applied to the returned fields.
767 : *
768 : * We use pg_attribute_always_inline to reduce function call overhead
769 : * and to help compilers to optimize away the 'is_csv' condition when called
770 : * by internal functions such as CopyFromTextLikeOneRow().
771 : */
772 : static pg_attribute_always_inline bool
773 1267380 : NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
774 : {
775 : int fldct;
776 1267380 : bool done = false;
777 :
778 : /* only available for text or csv input */
779 : Assert(!cstate->opts.binary);
780 :
781 : /* on input check that the header line is correct if needed */
782 1267380 : if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE)
783 : {
784 : ListCell *cur;
785 : TupleDesc tupDesc;
786 148 : int lines_to_skip = cstate->opts.header_line;
787 :
788 : /* If set to "match", one header line is skipped */
789 148 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
790 76 : lines_to_skip = 1;
791 :
792 148 : tupDesc = RelationGetDescr(cstate->rel);
793 :
794 346 : for (int i = 0; i < lines_to_skip; i++)
795 : {
796 206 : cstate->cur_lineno++;
797 206 : if ((done = CopyReadLine(cstate, is_csv)))
798 8 : break;
799 : }
800 :
801 148 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
802 : {
803 : int fldnum;
804 :
805 76 : if (is_csv)
806 10 : fldct = CopyReadAttributesCSV(cstate);
807 : else
808 66 : fldct = CopyReadAttributesText(cstate);
809 :
810 76 : if (fldct != list_length(cstate->attnumlist))
811 24 : ereport(ERROR,
812 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
813 : errmsg("wrong number of fields in header line: got %d, expected %d",
814 : fldct, list_length(cstate->attnumlist))));
815 :
816 52 : fldnum = 0;
817 158 : foreach(cur, cstate->attnumlist)
818 : {
819 126 : int attnum = lfirst_int(cur);
820 : char *colName;
821 126 : Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
822 :
823 : Assert(fldnum < cstate->max_fields);
824 :
825 126 : colName = cstate->raw_fields[fldnum++];
826 126 : if (colName == NULL)
827 6 : ereport(ERROR,
828 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
829 : errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
830 : fldnum, cstate->opts.null_print, NameStr(attr->attname))));
831 :
832 120 : if (namestrcmp(&attr->attname, colName) != 0)
833 : {
834 14 : ereport(ERROR,
835 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
836 : errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
837 : fldnum, colName, NameStr(attr->attname))));
838 : }
839 : }
840 : }
841 :
842 104 : if (done)
843 8 : return false;
844 : }
845 :
846 1267328 : cstate->cur_lineno++;
847 :
848 : /* Actually read the line into memory here */
849 1267328 : done = CopyReadLine(cstate, is_csv);
850 :
851 : /*
852 : * EOF at start of line means we're done. If we see EOF after some
853 : * characters, we act as though it was newline followed by EOF, ie,
854 : * process the line and then exit loop on next iteration.
855 : */
856 1267300 : if (done && cstate->line_buf.len == 0)
857 1684 : return false;
858 :
859 : /* Parse the line into de-escaped field values */
860 1265616 : if (is_csv)
861 498 : fldct = CopyReadAttributesCSV(cstate);
862 : else
863 1265118 : fldct = CopyReadAttributesText(cstate);
864 :
865 1265604 : *fields = cstate->raw_fields;
866 1265604 : *nfields = fldct;
867 1265604 : return true;
868 : }
869 :
870 : /*
871 : * Read next tuple from file for COPY FROM. Return false if no more tuples.
872 : *
873 : * 'econtext' is used to evaluate default expression for each column that is
874 : * either not read from the file or is using the DEFAULT option of COPY FROM.
875 : * It can be NULL when no default values are used, i.e. when all columns are
876 : * read from the file, and DEFAULT option is unset.
877 : *
878 : * 'values' and 'nulls' arrays must be the same length as columns of the
879 : * relation passed to BeginCopyFrom. This function fills the arrays.
880 : */
881 : bool
882 1267422 : NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
883 : Datum *values, bool *nulls)
884 : {
885 : TupleDesc tupDesc;
886 : AttrNumber num_phys_attrs,
887 1267422 : num_defaults = cstate->num_defaults;
888 : int i;
889 1267422 : int *defmap = cstate->defmap;
890 1267422 : ExprState **defexprs = cstate->defexprs;
891 :
892 1267422 : tupDesc = RelationGetDescr(cstate->rel);
893 1267422 : num_phys_attrs = tupDesc->natts;
894 :
895 : /* Initialize all values for row to NULL */
896 5937100 : MemSet(values, 0, num_phys_attrs * sizeof(Datum));
897 1267422 : MemSet(nulls, true, num_phys_attrs * sizeof(bool));
898 1411530 : MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
899 :
900 : /* Get one row from source */
901 1267422 : if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
902 1704 : return false;
903 :
904 : /*
905 : * Now compute and insert any defaults available for the columns not
906 : * provided by the input data. Anything not processed here or above will
907 : * remain NULL.
908 : */
909 1326088 : for (i = 0; i < num_defaults; i++)
910 : {
911 : /*
912 : * The caller must supply econtext and have switched into the
913 : * per-tuple memory context in it.
914 : */
915 : Assert(econtext != NULL);
916 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
917 :
918 60530 : values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
919 60530 : &nulls[defmap[i]]);
920 : }
921 :
922 1265558 : return true;
923 : }
924 :
925 : /* Implementation of the per-row callback for text format */
926 : bool
927 1266638 : CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
928 : bool *nulls)
929 : {
930 1266638 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
931 : }
932 :
933 : /* Implementation of the per-row callback for CSV format */
934 : bool
935 742 : CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
936 : bool *nulls)
937 : {
938 742 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
939 : }
940 :
941 : /*
942 : * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
943 : *
944 : * We use pg_attribute_always_inline to reduce function call overhead
945 : * and to help compilers to optimize away the 'is_csv' condition.
946 : */
947 : static pg_attribute_always_inline bool
948 1267380 : CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext,
949 : Datum *values, bool *nulls, bool is_csv)
950 : {
951 : TupleDesc tupDesc;
952 : AttrNumber attr_count;
953 1267380 : FmgrInfo *in_functions = cstate->in_functions;
954 1267380 : Oid *typioparams = cstate->typioparams;
955 1267380 : ExprState **defexprs = cstate->defexprs;
956 : char **field_strings;
957 : ListCell *cur;
958 : int fldct;
959 : int fieldno;
960 : char *string;
961 :
962 1267380 : tupDesc = RelationGetDescr(cstate->rel);
963 1267380 : attr_count = list_length(cstate->attnumlist);
964 :
965 : /* read raw fields in the next line */
966 1267380 : if (!NextCopyFromRawFieldsInternal(cstate, &field_strings, &fldct, is_csv))
967 1692 : return false;
968 :
969 : /* check for overflowing fields */
970 1265604 : if (attr_count > 0 && fldct > attr_count)
971 18 : ereport(ERROR,
972 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
973 : errmsg("extra data after last expected column")));
974 :
975 1265586 : fieldno = 0;
976 :
977 : /* Loop to read the user attributes on the line. */
978 5794174 : foreach(cur, cstate->attnumlist)
979 : {
980 4528772 : int attnum = lfirst_int(cur);
981 4528772 : int m = attnum - 1;
982 4528772 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
983 :
984 4528772 : if (fieldno >= fldct)
985 18 : ereport(ERROR,
986 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
987 : errmsg("missing data for column \"%s\"",
988 : NameStr(att->attname))));
989 4528754 : string = field_strings[fieldno++];
990 :
991 4528754 : if (cstate->convert_select_flags &&
992 20 : !cstate->convert_select_flags[m])
993 : {
994 : /* ignore input field, leaving column as NULL */
995 10 : continue;
996 : }
997 :
998 4528744 : if (is_csv)
999 : {
1000 1000 : if (string == NULL &&
1001 44 : cstate->opts.force_notnull_flags[m])
1002 : {
1003 : /*
1004 : * FORCE_NOT_NULL option is set and column is NULL - convert
1005 : * it to the NULL string.
1006 : */
1007 28 : string = cstate->opts.null_print;
1008 : }
1009 972 : else if (string != NULL && cstate->opts.force_null_flags[m]
1010 50 : && strcmp(string, cstate->opts.null_print) == 0)
1011 : {
1012 : /*
1013 : * FORCE_NULL option is set and column matches the NULL
1014 : * string. It must have been quoted, or otherwise the string
1015 : * would already have been set to NULL. Convert it to NULL as
1016 : * specified.
1017 : */
1018 26 : string = NULL;
1019 : }
1020 : }
1021 :
1022 4528744 : cstate->cur_attname = NameStr(att->attname);
1023 4528744 : cstate->cur_attval = string;
1024 :
1025 4528744 : if (string != NULL)
1026 4523880 : nulls[m] = false;
1027 :
1028 4528744 : if (cstate->defaults[m])
1029 : {
1030 : /* We must have switched into the per-tuple memory context */
1031 : Assert(econtext != NULL);
1032 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
1033 :
1034 60 : values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
1035 : }
1036 :
1037 : /*
1038 : * If ON_ERROR is specified with IGNORE, skip rows with soft errors
1039 : */
1040 4528646 : else if (!InputFunctionCallSafe(&in_functions[m],
1041 : string,
1042 4528684 : typioparams[m],
1043 : att->atttypmod,
1044 4528684 : (Node *) cstate->escontext,
1045 4528684 : &values[m]))
1046 : {
1047 : Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP);
1048 :
1049 128 : cstate->num_errors++;
1050 :
1051 128 : if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE)
1052 : {
1053 : /*
1054 : * Since we emit line number and column info in the below
1055 : * notice message, we suppress error context information other
1056 : * than the relation name.
1057 : */
1058 : Assert(!cstate->relname_only);
1059 42 : cstate->relname_only = true;
1060 :
1061 42 : if (cstate->cur_attval)
1062 : {
1063 : char *attval;
1064 :
1065 36 : attval = CopyLimitPrintoutLength(cstate->cur_attval);
1066 36 : ereport(NOTICE,
1067 : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1068 : cstate->cur_lineno,
1069 : cstate->cur_attname,
1070 : attval));
1071 36 : pfree(attval);
1072 : }
1073 : else
1074 6 : ereport(NOTICE,
1075 : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
1076 : cstate->cur_lineno,
1077 : cstate->cur_attname));
1078 :
1079 : /* reset relname_only */
1080 42 : cstate->relname_only = false;
1081 : }
1082 :
1083 128 : return true;
1084 : }
1085 :
1086 4528578 : cstate->cur_attname = NULL;
1087 4528578 : cstate->cur_attval = NULL;
1088 : }
1089 :
1090 : Assert(fieldno == attr_count);
1091 :
1092 1265402 : return true;
1093 : }
1094 :
1095 : /* Implementation of the per-row callback for binary format */
1096 : bool
1097 42 : CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
1098 : bool *nulls)
1099 : {
1100 : TupleDesc tupDesc;
1101 : AttrNumber attr_count;
1102 42 : FmgrInfo *in_functions = cstate->in_functions;
1103 42 : Oid *typioparams = cstate->typioparams;
1104 : int16 fld_count;
1105 : ListCell *cur;
1106 :
1107 42 : tupDesc = RelationGetDescr(cstate->rel);
1108 42 : attr_count = list_length(cstate->attnumlist);
1109 :
1110 42 : cstate->cur_lineno++;
1111 :
1112 42 : if (!CopyGetInt16(cstate, &fld_count))
1113 : {
1114 : /* EOF detected (end of file, or protocol-level EOF) */
1115 0 : return false;
1116 : }
1117 :
1118 42 : if (fld_count == -1)
1119 : {
1120 : /*
1121 : * Received EOF marker. Wait for the protocol-level EOF, and complain
1122 : * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1123 : * that we correctly handle CopyFail, if client chooses to send that
1124 : * now. When copying from file, we could ignore the rest of the file
1125 : * like in text mode, but we choose to be consistent with the COPY
1126 : * FROM STDIN case.
1127 : */
1128 : char dummy;
1129 :
1130 12 : if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
1131 0 : ereport(ERROR,
1132 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1133 : errmsg("received copy data after EOF marker")));
1134 12 : return false;
1135 : }
1136 :
1137 30 : if (fld_count != attr_count)
1138 0 : ereport(ERROR,
1139 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1140 : errmsg("row field count is %d, expected %d",
1141 : fld_count, attr_count)));
1142 :
1143 186 : foreach(cur, cstate->attnumlist)
1144 : {
1145 158 : int attnum = lfirst_int(cur);
1146 158 : int m = attnum - 1;
1147 158 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1148 :
1149 158 : cstate->cur_attname = NameStr(att->attname);
1150 314 : values[m] = CopyReadBinaryAttribute(cstate,
1151 158 : &in_functions[m],
1152 158 : typioparams[m],
1153 : att->atttypmod,
1154 : &nulls[m]);
1155 156 : cstate->cur_attname = NULL;
1156 : }
1157 :
1158 28 : return true;
1159 : }
1160 :
1161 : /*
1162 : * Read the next input line and stash it in line_buf.
1163 : *
1164 : * Result is true if read was terminated by EOF, false if terminated
1165 : * by newline. The terminating newline or EOF marker is not included
1166 : * in the final value of line_buf.
1167 : */
1168 : static bool
1169 1267534 : CopyReadLine(CopyFromState cstate, bool is_csv)
1170 : {
1171 : bool result;
1172 :
1173 1267534 : resetStringInfo(&cstate->line_buf);
1174 1267534 : cstate->line_buf_valid = false;
1175 :
1176 : /* Parse data and transfer into line_buf */
1177 1267534 : result = CopyReadLineText(cstate, is_csv);
1178 :
1179 1267506 : if (result)
1180 : {
1181 : /*
1182 : * Reached EOF. In protocol version 3, we should ignore anything
1183 : * after \. up to the protocol end of copy data. (XXX maybe better
1184 : * not to treat \. as special?)
1185 : */
1186 1692 : if (cstate->copy_src == COPY_FRONTEND)
1187 : {
1188 : int inbytes;
1189 :
1190 : do
1191 : {
1192 894 : inbytes = CopyGetData(cstate, cstate->input_buf,
1193 : 1, INPUT_BUF_SIZE);
1194 894 : } while (inbytes > 0);
1195 894 : cstate->input_buf_index = 0;
1196 894 : cstate->input_buf_len = 0;
1197 894 : cstate->raw_buf_index = 0;
1198 894 : cstate->raw_buf_len = 0;
1199 : }
1200 : }
1201 : else
1202 : {
1203 : /*
1204 : * If we didn't hit EOF, then we must have transferred the EOL marker
1205 : * to line_buf along with the data. Get rid of it.
1206 : */
1207 1265814 : switch (cstate->eol_type)
1208 : {
1209 1265814 : case EOL_NL:
1210 : Assert(cstate->line_buf.len >= 1);
1211 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1212 1265814 : cstate->line_buf.len--;
1213 1265814 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1214 1265814 : break;
1215 0 : case EOL_CR:
1216 : Assert(cstate->line_buf.len >= 1);
1217 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1218 0 : cstate->line_buf.len--;
1219 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1220 0 : break;
1221 0 : case EOL_CRNL:
1222 : Assert(cstate->line_buf.len >= 2);
1223 : Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1224 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1225 0 : cstate->line_buf.len -= 2;
1226 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1227 0 : break;
1228 0 : case EOL_UNKNOWN:
1229 : /* shouldn't get here */
1230 : Assert(false);
1231 0 : break;
1232 : }
1233 : }
1234 :
1235 : /* Now it's safe to use the buffer in error messages */
1236 1267506 : cstate->line_buf_valid = true;
1237 :
1238 1267506 : return result;
1239 : }
1240 :
1241 : /*
1242 : * CopyReadLineText - inner loop of CopyReadLine for text mode
 *
 * The same loop also serves CSV input: when 'is_csv' is true, the quote and
 * escape characters from cstate->opts are tracked so that \r and \n inside
 * a quoted field do not end the line, and backslash is treated as an
 * ordinary character.
 *
 * Returns the value of 'result'.  Within this function it is set to true
 * when the input buffer is found empty after a reload, and when an
 * end-of-copy marker (\.) has been accepted; it is still false when the
 * loop is left because a line terminator was found.
 *
 * NOTE(review): REFILL_LINEBUF and the IF_NEED_REFILL_* macros are defined
 * outside this function.  They presumably read and update the locals
 * declared below and may 'continue' or 'break' the main loop, so renaming
 * those locals or reshaping the loop should be checked against the macro
 * definitions first.
1243 : */
1244 : static bool
1245 1267534 : CopyReadLineText(CopyFromState cstate, bool is_csv)
1246 : {
1247 : char *copy_input_buf;
1248 : int input_buf_ptr;
1249 : int copy_buf_len;
1250 1267534 : bool need_data = false;
1251 1267534 : bool hit_eof = false;
1252 1267534 : bool result = false;
1253 :
1254 : /* CSV variables */
1255 1267534 : bool in_quote = false,
1256 1267534 : last_was_esc = false;
1257 1267534 : char quotec = '\0';
1258 1267534 : char escapec = '\0';
1259 :
1260 1267534 : if (is_csv)
1261 : {
1262 866 : quotec = cstate->opts.quote[0];
1263 866 : escapec = cstate->opts.escape[0];
1264 : /* ignore special escape processing if it's the same as quotec */
1265 866 : if (quotec == escapec)
1266 672 : escapec = '\0';
1267 : }
1268 :
1269 : /*
1270 : * The objective of this loop is to transfer the entire next input line
1271 : * into line_buf. Hence, we only care for detecting newlines (\r and/or
1272 : * \n) and the end-of-copy marker (\.).
1273 : *
1274 : * In CSV mode, \r and \n inside a quoted field are just part of the data
1275 : * value and are put in line_buf. We keep just enough state to know if we
1276 : * are currently in a quoted field or not.
1277 : *
1278 : * The input has already been converted to the database encoding. All
1279 : * supported server encodings have the property that all bytes in a
1280 : * multi-byte sequence have the high bit set, so a multibyte character
1281 : * cannot contain any newline or escape characters embedded in the
1282 : * multibyte sequence. Therefore, we can process the input byte-by-byte,
1283 : * regardless of the encoding.
1284 : *
1285 : * For speed, we try to move data from input_buf to line_buf in chunks
1286 : * rather than one character at a time. input_buf_ptr points to the next
1287 : * character to examine; any characters from input_buf_index to
1288 : * input_buf_ptr have been determined to be part of the line, but not yet
1289 : * transferred to line_buf.
1290 : *
1291 : * For a little extra speed within the loop, we copy input_buf and
1292 : * input_buf_len into local variables.
1293 : */
1294 1267534 : copy_input_buf = cstate->input_buf;
1295 1267534 : input_buf_ptr = cstate->input_buf_index;
1296 1267534 : copy_buf_len = cstate->input_buf_len;
1297 :
1298 : for (;;)
1299 25530130 : {
1300 : int prev_raw_ptr;
1301 : char c;
1302 :
1303 : /*
1304 : * Load more data if needed.
1305 : *
1306 : * TODO: We could just force four bytes of read-ahead and avoid the
1307 : * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(). That was
1308 : * unsafe with the old v2 COPY protocol, but we don't support that
1309 : * anymore.
1310 : */
1311 26797664 : if (input_buf_ptr >= copy_buf_len || need_data)
1312 : {
1313 431056 : REFILL_LINEBUF;
1314 :
1315 431056 : CopyLoadInputBuf(cstate);
1316 : /* update our local variables */
1317 431040 : hit_eof = cstate->input_reached_eof;
1318 431040 : input_buf_ptr = cstate->input_buf_index;
1319 431040 : copy_buf_len = cstate->input_buf_len;
1320 :
1321 : /*
1322 : * If we are completely out of data, break out of the loop,
1323 : * reporting EOF.
1324 : */
1325 431040 : if (INPUT_BUF_BYTES(cstate) <= 0)
1326 : {
1327 1542 : result = true;
1328 1542 : break;
1329 : }
1330 429498 : need_data = false;
1331 : }
1332 :
1333 : /* OK to fetch a character */
1334 26796106 : prev_raw_ptr = input_buf_ptr;
1335 26796106 : c = copy_input_buf[input_buf_ptr++];
1336 :
1337 26796106 : if (is_csv)
1338 : {
1339 : /*
1340 : * If character is '\r', we may need to look ahead below. Force
1341 : * fetch of the next character if we don't already have it. We
1342 : * need to do this before changing CSV state, in case '\r' is also
1343 : * the quote or escape character.
1344 : */
1345 6808 : if (c == '\r')
1346 : {
1347 36 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1348 : }
1349 :
1350 : /*
1351 : * Dealing with quotes and escapes here is mildly tricky. If the
1352 : * quote char is also the escape char, there's no problem - we
1353 : * just use the char as a toggle. If they are different, we need
1354 : * to ensure that we only take account of an escape inside a
1355 : * quoted field and immediately preceding a quote char, and not
1356 : * the second in an escape-escape sequence.
1357 : */
1358 6808 : if (in_quote && c == escapec)
1359 48 : last_was_esc = !last_was_esc;
1360 6808 : if (c == quotec && !last_was_esc)
1361 508 : in_quote = !in_quote;
1362 6808 : if (c != escapec)
1363 6754 : last_was_esc = false;
1364 :
1365 : /*
1366 : * Updating the line count for embedded CR and/or LF chars is
1367 : * necessarily a little fragile - this test is probably about the
1368 : * best we can do. (XXX it's arguable whether we should do this
1369 : * at all --- is cur_lineno a physical or logical count?)
1370 : */
1371 6808 : if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
1372 36 : cstate->cur_lineno++;
1373 : }
1374 :
1375 : /* Process \r */
1376 26796106 : if (c == '\r' && (!is_csv || !in_quote))
1377 : {
1378 : /* Check for \r\n on first line, _and_ handle \r\n. */
1379 0 : if (cstate->eol_type == EOL_UNKNOWN ||
1380 0 : cstate->eol_type == EOL_CRNL)
1381 : {
1382 : /*
1383 : * If need more data, go back to loop top to load it.
1384 : *
1385 : * Note that if we are at EOF, c will wind up as '\0' because
1386 : * of the guaranteed pad of input_buf.
1387 : */
1388 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1389 :
1390 : /* get next char */
1391 0 : c = copy_input_buf[input_buf_ptr];
1392 :
1393 0 : if (c == '\n')
1394 : {
1395 0 : input_buf_ptr++; /* eat newline */
1396 0 : cstate->eol_type = EOL_CRNL; /* in case not set yet */
1397 : }
1398 : else
1399 : {
1400 : /* found \r, but no \n */
1401 0 : if (cstate->eol_type == EOL_CRNL)
1402 0 : ereport(ERROR,
1403 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1404 : !is_csv ?
1405 : errmsg("literal carriage return found in data") :
1406 : errmsg("unquoted carriage return found in data"),
1407 : !is_csv ?
1408 : errhint("Use \"\\r\" to represent carriage return.") :
1409 : errhint("Use quoted CSV field to represent carriage return.")));
1410 :
1411 : /*
1412 : * if we got here, it is the first line and we didn't find
1413 : * \n, so don't consume the peeked character
1414 : */
1415 0 : cstate->eol_type = EOL_CR;
1416 : }
1417 : }
1418 0 : else if (cstate->eol_type == EOL_NL)
1419 0 : ereport(ERROR,
1420 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1421 : !is_csv ?
1422 : errmsg("literal carriage return found in data") :
1423 : errmsg("unquoted carriage return found in data"),
1424 : !is_csv ?
1425 : errhint("Use \"\\r\" to represent carriage return.") :
1426 : errhint("Use quoted CSV field to represent carriage return.")));
1427 : /* If reach here, we have found the line terminator */
1428 0 : break;
1429 : }
1430 :
1431 : /* Process \n */
1432 26796106 : if (c == '\n' && (!is_csv || !in_quote))
1433 : {
1434 1265814 : if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
1435 0 : ereport(ERROR,
1436 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1437 : !is_csv ?
1438 : errmsg("literal newline found in data") :
1439 : errmsg("unquoted newline found in data"),
1440 : !is_csv ?
1441 : errhint("Use \"\\n\" to represent newline.") :
1442 : errhint("Use quoted CSV field to represent newline.")));
1443 1265814 : cstate->eol_type = EOL_NL; /* in case not set yet */
1444 : /* If reach here, we have found the line terminator */
1445 1265814 : break;
1446 : }
1447 :
1448 : /*
1449 : * Process backslash, except in CSV mode where backslash is a normal
1450 : * character.
1451 : */
1452 25530292 : if (c == '\\' && !is_csv)
1453 : {
1454 : char c2;
1455 :
1456 8176 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1457 8176 : IF_NEED_REFILL_AND_EOF_BREAK(0);
1458 :
1459 : /* -----
1460 : * get next character
1461 : * Note: we do not change c so if it isn't \., we can fall
1462 : * through and continue processing.
1463 : * -----
1464 : */
1465 8176 : c2 = copy_input_buf[input_buf_ptr];
1466 :
1467 8176 : if (c2 == '.')
1468 : {
1469 162 : input_buf_ptr++; /* consume the '.' */
1470 162 : if (cstate->eol_type == EOL_CRNL)
1471 : {
1472 : /* Get the next character */
1473 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1474 : /* if hit_eof, c2 will become '\0' */
1475 0 : c2 = copy_input_buf[input_buf_ptr++];
1476 :
1477 0 : if (c2 == '\n')
1478 0 : ereport(ERROR,
1479 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1480 : errmsg("end-of-copy marker does not match previous newline style")));
1481 0 : else if (c2 != '\r')
1482 0 : ereport(ERROR,
1483 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1484 : errmsg("end-of-copy marker is not alone on its line")));
1485 : }
1486 :
1487 : /* Get the next character */
1488 162 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1489 : /* if hit_eof, c2 will become '\0' */
1490 162 : c2 = copy_input_buf[input_buf_ptr++];
1491 :
1492 162 : if (c2 != '\r' && c2 != '\n')
1493 6 : ereport(ERROR,
1494 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1495 : errmsg("end-of-copy marker is not alone on its line")));
1496 :
1497 156 : if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
1498 156 : (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
1499 156 : (cstate->eol_type == EOL_CR && c2 != '\r'))
1500 0 : ereport(ERROR,
1501 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1502 : errmsg("end-of-copy marker does not match previous newline style")));
1503 :
1504 : /*
1505 : * If there is any data on this line before the \., complain.
 *
 * Data already moved to line_buf shows up as line_buf.len > 0;
 * data examined but not yet moved shows up as the backslash
 * position (prev_raw_ptr) lying past input_buf_index.
1506 : */
1507 156 : if (cstate->line_buf.len > 0 ||
1508 156 : prev_raw_ptr > cstate->input_buf_index)
1509 6 : ereport(ERROR,
1510 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1511 : errmsg("end-of-copy marker is not alone on its line")));
1512 :
1513 : /*
1514 : * Discard the \. and newline, then report EOF.
1515 : */
1516 150 : cstate->input_buf_index = input_buf_ptr;
1517 150 : result = true; /* report EOF */
1518 150 : break;
1519 : }
1520 : else
1521 : {
1522 : /*
1523 : * If we are here, it means we found a backslash followed by
1524 : * something other than a period. In non-CSV mode, anything
1525 : * after a backslash is special, so we skip over that second
1526 : * character too. If we didn't do that \\. would be
1527 : * considered an eof-of copy, while in non-CSV mode it is a
1528 : * literal backslash followed by a period.
1529 : */
1530 8014 : input_buf_ptr++;
1531 : }
1532 : }
1533 : } /* end of outer loop */
1534 :
1535 : /*
1536 : * Transfer any still-uncopied data to line_buf.
1537 : */
1538 1267506 : REFILL_LINEBUF;
1539 :
1540 1267506 : return result;
1541 : }
1542 :
/*
 * GetDecimalFromHex
 *		Convert one hexadecimal digit character to its numeric value.
 *
 * Decimal digits map to 0..9.  Any other character is lower-cased with
 * pg_ascii_tolower() and measured from 'a', so 'a'/'A' gives 10 and 'f'/'F'
 * gives 15.  No validation is done here; the callers in this file check the
 * character with isxdigit() first.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char uc = (unsigned char) hex;

	if (!isdigit(uc))
		return pg_ascii_tolower(uc) - 'a' + 10;

	return hex - '0';
}
1554 :
1555 : /*
1556 : * Parse the current line into separate attributes (fields),
1557 : * performing de-escaping as needed.
1558 : *
1559 : * The input is in line_buf. We use attribute_buf to hold the result
1560 : * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1561 : * string, or NULL when the input matches the null marker string.
1562 : * This array is expanded as necessary.
1563 : *
1564 : * (Note that the caller cannot check for nulls since the returned
1565 : * string would be the post-de-escaping equivalent, which may look
1566 : * the same as some valid data string.)
1567 : *
1568 : * delim is the column delimiter string (must be just one byte for now).
1569 : * null_print is the null marker string. Note that this is compared to
1570 : * the pre-de-escaped input string.
1571 : *
1572 : * The return value is the number of fields actually read.
 *
 * Further details, as implemented below:
 *
 * - delim, null_print and default_print are not parameters; they are
 *   taken from cstate->opts.
 *
 * - When cstate->max_fields <= 0 (the zero-column case), the line must be
 *   empty: an error is raised otherwise, and 0 is returned without
 *   touching attribute_buf.
 *
 * - A field whose raw text equals the default marker, when one is
 *   configured and the field position lies within attnumlist, flags its
 *   column in cstate->defaults[].  If that column has no default
 *   expression, an error is raised instead.
 *
 * - If an octal (\ooo) or hex (\xhh) escape yields a NUL byte or a byte
 *   with the high bit set, the finished field is checked with
 *   pg_verifymbstr(), unless it matched the null or default marker.
1573 : */
1574 : static int
1575 1265184 : CopyReadAttributesText(CopyFromState cstate)
1576 : {
1577 1265184 : char delimc = cstate->opts.delim[0];
1578 : int fieldno;
1579 : char *output_ptr;
1580 : char *cur_ptr;
1581 : char *line_end_ptr;
1582 :
1583 : /*
1584 : * We need a special case for zero-column tables: check that the input
1585 : * line is empty, and return.
1586 : */
1587 1265184 : if (cstate->max_fields <= 0)
1588 : {
1589 8 : if (cstate->line_buf.len != 0)
1590 0 : ereport(ERROR,
1591 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1592 : errmsg("extra data after last expected column")));
1593 8 : return 0;
1594 : }
1595 :
1596 1265176 : resetStringInfo(&cstate->attribute_buf);
1597 :
1598 : /*
1599 : * The de-escaped attributes will certainly not be longer than the input
1600 : * data line, so we can just force attribute_buf to be large enough and
1601 : * then transfer data without any checks for enough space. We need to do
1602 : * it this way because enlarging attribute_buf mid-stream would invalidate
1603 : * pointers already stored into cstate->raw_fields[].
1604 : */
1605 1265176 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1606 8 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1607 1265176 : output_ptr = cstate->attribute_buf.data;
1608 :
1609 : /* set pointer variables for loop */
1610 1265176 : cur_ptr = cstate->line_buf.data;
1611 1265176 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1612 :
1613 : /* Outer loop iterates over fields */
1614 1265176 : fieldno = 0;
1615 : for (;;)
1616 3262946 : {
1617 4528122 : bool found_delim = false;
1618 : char *start_ptr;
1619 : char *end_ptr;
1620 : int input_len;
1621 4528122 : bool saw_non_ascii = false;
1622 :
1623 : /* Make sure there is enough space for the next value */
1624 4528122 : if (fieldno >= cstate->max_fields)
1625 : {
1626 36 : cstate->max_fields *= 2;
1627 36 : cstate->raw_fields =
1628 36 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1629 : }
1630 :
1631 : /* Remember start of field on both input and output sides */
1632 4528122 : start_ptr = cur_ptr;
1633 4528122 : cstate->raw_fields[fieldno] = output_ptr;
1634 :
1635 : /*
1636 : * Scan data for field.
1637 : *
1638 : * Note that in this loop, we are scanning to locate the end of field
1639 : * and also speculatively performing de-escaping. Once we find the
1640 : * end-of-field, we can match the raw field contents against the null
1641 : * marker string. Only after that comparison fails do we know that
1642 : * de-escaping is actually the right thing to do; therefore we *must
1643 : * not* throw any syntax errors before we've done the null-marker
1644 : * check.
 *
 * end_ptr is refreshed at the top of every iteration, so on exit it
 * marks the end of the raw field: the delimiter itself, or the end
 * of the line.
1645 : */
1646 : for (;;)
1647 22260490 : {
1648 : char c;
1649 :
1650 26788612 : end_ptr = cur_ptr;
1651 26788612 : if (cur_ptr >= line_end_ptr)
1652 1265170 : break;
1653 25523442 : c = *cur_ptr++;
1654 25523442 : if (c == delimc)
1655 : {
1656 3262952 : found_delim = true;
1657 3262952 : break;
1658 : }
1659 22260490 : if (c == '\\')
1660 : {
1661 8014 : if (cur_ptr >= line_end_ptr)
1662 0 : break;
1663 8014 : c = *cur_ptr++;
1664 8014 : switch (c)
1665 : {
1666 12 : case '0':
1667 : case '1':
1668 : case '2':
1669 : case '3':
1670 : case '4':
1671 : case '5':
1672 : case '6':
1673 : case '7':
1674 : {
1675 : /* handle \013 */
1676 : int val;
1677 :
1678 12 : val = OCTVALUE(c);
1679 12 : if (cur_ptr < line_end_ptr)
1680 : {
1681 6 : c = *cur_ptr;
1682 6 : if (ISOCTAL(c))
1683 : {
1684 0 : cur_ptr++;
1685 0 : val = (val << 3) + OCTVALUE(c);
1686 0 : if (cur_ptr < line_end_ptr)
1687 : {
1688 0 : c = *cur_ptr;
1689 0 : if (ISOCTAL(c))
1690 : {
1691 0 : cur_ptr++;
1692 0 : val = (val << 3) + OCTVALUE(c);
1693 : }
1694 : }
1695 : }
1696 : }
1697 12 : c = val & 0377;
1698 12 : if (c == '\0' || IS_HIGHBIT_SET(c))
1699 12 : saw_non_ascii = true;
1700 : }
1701 12 : break;
1702 12 : case 'x':
1703 : /* Handle \x3F */
1704 12 : if (cur_ptr < line_end_ptr)
1705 : {
1706 6 : char hexchar = *cur_ptr;
1707 :
1708 6 : if (isxdigit((unsigned char) hexchar))
1709 : {
1710 0 : int val = GetDecimalFromHex(hexchar);
1711 :
1712 0 : cur_ptr++;
1713 0 : if (cur_ptr < line_end_ptr)
1714 : {
1715 0 : hexchar = *cur_ptr;
1716 0 : if (isxdigit((unsigned char) hexchar))
1717 : {
1718 0 : cur_ptr++;
1719 0 : val = (val << 4) + GetDecimalFromHex(hexchar);
1720 : }
1721 : }
1722 0 : c = val & 0xff;
1723 0 : if (c == '\0' || IS_HIGHBIT_SET(c))
1724 0 : saw_non_ascii = true;
1725 : }
1726 : }
1727 12 : break;
1728 0 : case 'b':
1729 0 : c = '\b';
1730 0 : break;
1731 0 : case 'f':
1732 0 : c = '\f';
1733 0 : break;
1734 3050 : case 'n':
1735 3050 : c = '\n';
1736 3050 : break;
1737 0 : case 'r':
1738 0 : c = '\r';
1739 0 : break;
1740 0 : case 't':
1741 0 : c = '\t';
1742 0 : break;
1743 0 : case 'v':
1744 0 : c = '\v';
1745 0 : break;
1746 :
1747 : /*
1748 : * in all other cases, take the char after '\'
1749 : * literally
 *
 * (this includes "\x" not followed by a hex digit, which
 * leaves c as 'x')
1750 : */
1751 : }
1752 : }
1753 :
1754 : /* Add c to output string */
1755 22260490 : *output_ptr++ = c;
1756 : }
1757 :
1758 : /* Check whether raw input matched null marker */
1759 4528122 : input_len = end_ptr - start_ptr;
1760 4528122 : if (input_len == cstate->opts.null_print_len &&
1761 250864 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1762 4828 : cstate->raw_fields[fieldno] = NULL;
1763 : /* Check whether raw input matched default marker */
1764 4523294 : else if (fieldno < list_length(cstate->attnumlist) &&
1765 4523252 : cstate->opts.default_print &&
1766 114 : input_len == cstate->opts.default_print_len &&
1767 30 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1768 24 : {
1769 : /* fieldno is 0-indexed and attnum is 1-indexed */
1770 30 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1771 :
1772 30 : if (cstate->defexprs[m] != NULL)
1773 : {
1774 : /* defaults contain entries for all physical attributes */
1775 24 : cstate->defaults[m] = true;
1776 : }
1777 : else
1778 : {
1779 6 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1780 6 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1781 :
1782 6 : ereport(ERROR,
1783 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1784 : errmsg("unexpected default marker in COPY data"),
1785 : errdetail("Column \"%s\" has no default value.",
1786 : NameStr(att->attname))));
1787 : }
1788 : }
1789 : else
1790 : {
1791 : /*
1792 : * At this point we know the field is supposed to contain data.
1793 : *
1794 : * If we de-escaped any non-7-bit-ASCII chars, make sure the
1795 : * resulting string is valid data for the db encoding.
1796 : */
1797 4523264 : if (saw_non_ascii)
1798 : {
1799 0 : char *fld = cstate->raw_fields[fieldno];
1800 :
1801 0 : pg_verifymbstr(fld, output_ptr - fld, false);
1802 : }
1803 : }
1804 :
1805 : /* Terminate attribute value in output area */
1806 4528116 : *output_ptr++ = '\0';
1807 :
1808 4528116 : fieldno++;
1809 : /* Done if we hit EOL instead of a delim */
1810 4528116 : if (!found_delim)
1811 1265170 : break;
1812 : }
1813 :
1814 : /* Clean up state of attribute_buf: step back over the last terminator so that len does not count it */
1815 1265170 : output_ptr--;
1816 : Assert(*output_ptr == '\0');
1817 1265170 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1818 :
1819 1265170 : return fieldno;
1820 : }
1821 :
1822 : /*
1823 : * Parse the current line into separate attributes (fields),
1824 : * performing de-escaping as needed. This has exactly the same API as
1825 : * CopyReadAttributesText, except we parse the fields according to
1826 : * "standard" (i.e. common) CSV usage.
1827 : */
1828 : static int
1829 508 : CopyReadAttributesCSV(CopyFromState cstate)
1830 : {
1831 508 : char delimc = cstate->opts.delim[0];
1832 508 : char quotec = cstate->opts.quote[0];
1833 508 : char escapec = cstate->opts.escape[0];
1834 : int fieldno;
1835 : char *output_ptr;
1836 : char *cur_ptr;
1837 : char *line_end_ptr;
1838 :
1839 : /*
1840 : * We need a special case for zero-column tables: check that the input
1841 : * line is empty, and return.
1842 : */
1843 508 : if (cstate->max_fields <= 0)
1844 : {
1845 0 : if (cstate->line_buf.len != 0)
1846 0 : ereport(ERROR,
1847 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1848 : errmsg("extra data after last expected column")));
1849 0 : return 0;
1850 : }
1851 :
1852 508 : resetStringInfo(&cstate->attribute_buf);
1853 :
1854 : /*
1855 : * The de-escaped attributes will certainly not be longer than the input
1856 : * data line, so we can just force attribute_buf to be large enough and
1857 : * then transfer data without any checks for enough space. We need to do
1858 : * it this way because enlarging attribute_buf mid-stream would invalidate
1859 : * pointers already stored into cstate->raw_fields[].
1860 : */
1861 508 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1862 0 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1863 508 : output_ptr = cstate->attribute_buf.data;
1864 :
1865 : /* set pointer variables for loop */
1866 508 : cur_ptr = cstate->line_buf.data;
1867 508 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1868 :
1869 : /* Outer loop iterates over fields */
1870 508 : fieldno = 0;
1871 : for (;;)
1872 534 : {
1873 1042 : bool found_delim = false;
1874 1042 : bool saw_quote = false;
1875 : char *start_ptr;
1876 : char *end_ptr;
1877 : int input_len;
1878 :
1879 : /* Make sure there is enough space for the next value */
1880 1042 : if (fieldno >= cstate->max_fields)
1881 : {
1882 0 : cstate->max_fields *= 2;
1883 0 : cstate->raw_fields =
1884 0 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1885 : }
1886 :
1887 : /* Remember start of field on both input and output sides */
1888 1042 : start_ptr = cur_ptr;
1889 1042 : cstate->raw_fields[fieldno] = output_ptr;
1890 :
1891 : /*
1892 : * Scan data for field,
1893 : *
1894 : * The loop starts in "not quote" mode and then toggles between that
1895 : * and "in quote" mode. The loop exits normally if it is in "not
1896 : * quote" mode and a delimiter or line end is seen.
1897 : */
1898 : for (;;)
1899 222 : {
1900 : char c;
1901 :
1902 : /* Not in quote */
1903 : for (;;)
1904 : {
1905 3320 : end_ptr = cur_ptr;
1906 3320 : if (cur_ptr >= line_end_ptr)
1907 502 : goto endfield;
1908 2818 : c = *cur_ptr++;
1909 : /* unquoted field delimiter */
1910 2818 : if (c == delimc)
1911 : {
1912 540 : found_delim = true;
1913 540 : goto endfield;
1914 : }
1915 : /* start of quoted field (or part of field) */
1916 2278 : if (c == quotec)
1917 : {
1918 222 : saw_quote = true;
1919 222 : break;
1920 : }
1921 : /* Add c to output string */
1922 2056 : *output_ptr++ = c;
1923 : }
1924 :
1925 : /* In quote */
1926 : for (;;)
1927 : {
1928 1390 : end_ptr = cur_ptr;
1929 1390 : if (cur_ptr >= line_end_ptr)
1930 0 : ereport(ERROR,
1931 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1932 : errmsg("unterminated CSV quoted field")));
1933 :
1934 1390 : c = *cur_ptr++;
1935 :
1936 : /* escape within a quoted field */
1937 1390 : if (c == escapec)
1938 : {
1939 : /*
1940 : * peek at the next char if available, and escape it if it
1941 : * is an escape char or a quote char
1942 : */
1943 118 : if (cur_ptr < line_end_ptr)
1944 : {
1945 72 : char nextc = *cur_ptr;
1946 :
1947 72 : if (nextc == escapec || nextc == quotec)
1948 : {
1949 24 : *output_ptr++ = nextc;
1950 24 : cur_ptr++;
1951 24 : continue;
1952 : }
1953 : }
1954 : }
1955 :
1956 : /*
1957 : * end of quoted field. Must do this test after testing for
1958 : * escape in case quote char and escape char are the same
1959 : * (which is the common case).
1960 : */
1961 1366 : if (c == quotec)
1962 222 : break;
1963 :
1964 : /* Add c to output string */
1965 1144 : *output_ptr++ = c;
1966 : }
1967 : }
1968 1042 : endfield:
1969 :
1970 : /* Terminate attribute value in output area */
1971 1042 : *output_ptr++ = '\0';
1972 :
1973 : /* Check whether raw input matched null marker */
1974 1042 : input_len = end_ptr - start_ptr;
1975 1042 : if (!saw_quote && input_len == cstate->opts.null_print_len &&
1976 44 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1977 44 : cstate->raw_fields[fieldno] = NULL;
1978 : /* Check whether raw input matched default marker */
1979 998 : else if (fieldno < list_length(cstate->attnumlist) &&
1980 998 : cstate->opts.default_print &&
1981 150 : input_len == cstate->opts.default_print_len &&
1982 42 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1983 : {
1984 : /* fieldno is 0-index and attnum is 1-index */
1985 42 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1986 :
1987 42 : if (cstate->defexprs[m] != NULL)
1988 : {
1989 : /* defaults contain entries for all physical attributes */
1990 36 : cstate->defaults[m] = true;
1991 : }
1992 : else
1993 : {
1994 6 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1995 6 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1996 :
1997 6 : ereport(ERROR,
1998 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1999 : errmsg("unexpected default marker in COPY data"),
2000 : errdetail("Column \"%s\" has no default value.",
2001 : NameStr(att->attname))));
2002 : }
2003 : }
2004 :
2005 1036 : fieldno++;
2006 : /* Done if we hit EOL instead of a delim */
2007 1036 : if (!found_delim)
2008 502 : break;
2009 : }
2010 :
2011 : /* Clean up state of attribute_buf */
2012 502 : output_ptr--;
2013 : Assert(*output_ptr == '\0');
2014 502 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2015 :
2016 502 : return fieldno;
2017 : }
2018 :
2019 :
2020 : /*
2021 : * Read one attribute (field) of a binary-format COPY row.
     : *
     : * A 32-bit field size is fetched first with CopyGetInt32().  A size of
     : * -1 marks the field as NULL: *isnull is set to true, the receive
     : * function is still invoked (with a NULL buffer), and its result is
     : * returned.  Any other negative size is rejected.  Otherwise exactly
     : * fld_size bytes are loaded into cstate->attribute_buf with
     : * CopyReadBinaryData(), the buffer is NUL-terminated, and it is handed
     : * to the receive function through ReceiveFunctionCall().
     : *
     : * cstate: COPY FROM state; supplies the input and the attribute_buf
     : *		scratch buffer, whose previous contents are discarded on the
     : *		non-NULL path.
     : * flinfo, typioparam, typmod: passed through unchanged to
     : *		ReceiveFunctionCall().
     : * isnull: output flag; true when the size word was -1, false otherwise.
     : *
     : * Returns the Datum produced by ReceiveFunctionCall().
     : *
     : * Raises ERROR with ERRCODE_BAD_COPY_FILE_FORMAT if the size word or the
     : * field data cannot be read in full, or if the size is negative and not
     : * -1; raises ERROR with ERRCODE_INVALID_BINARY_REPRESENTATION if the
     : * receive function leaves attribute_buf.cursor different from
     : * attribute_buf.len.
2022 : */
2023 : static Datum
2024 158 : CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
2025 : Oid typioparam, int32 typmod,
2026 : bool *isnull)
2027 : {
2028 : int32 fld_size;
2029 : Datum result;
2030 :
2031 158 : if (!CopyGetInt32(cstate, &fld_size)) /* could not read the size word */
2032 0 : ereport(ERROR,
2033 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2034 : errmsg("unexpected EOF in COPY data")));
2035 158 : if (fld_size == -1) /* -1 means NULL: no data bytes are read for this field */
2036 : {
2037 30 : *isnull = true;
2038 30 : return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2039 : }
2040 128 : if (fld_size < 0) /* -1 was handled above, so any other negative size is invalid */
2041 0 : ereport(ERROR,
2042 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2043 : errmsg("invalid field size")));
2044 :
2045 : /* Reset attribute_buf to empty, then load the field's raw bytes into it */
2046 128 : resetStringInfo(&cstate->attribute_buf);
2047 :
2048 128 : enlargeStringInfo(&cstate->attribute_buf, fld_size);
2049 128 : if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2050 128 : fld_size) != fld_size) /* a short read means the input ended mid-field */
2051 0 : ereport(ERROR,
2052 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2053 : errmsg("unexpected EOF in COPY data")));
2054 :
2055 128 : cstate->attribute_buf.len = fld_size;
2056 128 : cstate->attribute_buf.data[fld_size] = '\0'; /* NOTE(review): assumes enlargeStringInfo() reserved a byte past fld_size for this terminator -- confirm in stringinfo.h */
2057 :
2058 : /* Call the column type's binary input converter on the loaded bytes */
2059 128 : result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2060 : typioparam, typmod);
2061 :
2062 : /*
     :  * Trouble if it didn't eat the whole buffer: the cursor must have
     :  * advanced to exactly the end of the field data.
     :  */
2063 128 : if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2064 2 : ereport(ERROR,
2065 : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
2066 : errmsg("incorrect binary data format")));
2067 :
2068 126 : *isnull = false;
2069 126 : return result;
2070 : }
|