LCOV - code coverage report
Current view: top level - src/backend/commands - copyfromparse.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 513 628 81.7 %
Date: 2025-01-18 04:15:08 Functions: 17 18 94.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * copyfromparse.c
       4             :  *      Parse CSV/text/binary format for COPY FROM.
       5             :  *
       6             :  * This file contains routines to parse the text, CSV and binary input
       7             :  * formats.  The main entry point is NextCopyFrom(), which parses the
       8             :  * next input line and returns it as Datums.
       9             :  *
      10             :  * In text/CSV mode, the parsing happens in multiple stages:
      11             :  *
      12             :  * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
      13             :  *                1.          2.            3.           4.
      14             :  *
      15             :  * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
      16             :  *    places it into 'raw_buf'.
      17             :  *
      18             :  * 2. CopyConvertBuf() calls the encoding conversion function to convert
      19             :  *    the data in 'raw_buf' from client to server encoding, placing the
      20             :  *    converted result in 'input_buf'.
      21             :  *
      22             :  * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
      23             :  *    It is responsible for finding the next newline marker, taking quote and
      24             :  *    escape characters into account according to the COPY options.  The line
      25             :  *    is copied into 'line_buf', with quotes and escape characters still
      26             :  *    intact.
      27             :  *
      28             :  * 4. CopyReadAttributesText/CSV() function takes the input line from
      29             :  *    'line_buf', and splits it into fields, unescaping the data as required.
      30             :  *    The fields are stored in 'attribute_buf', and 'raw_fields' array holds
      31             :  *    pointers to each field.
      32             :  *
      33             :  * If encoding conversion is not required, a shortcut is taken in step 2 to
      34             :  * avoid copying the data unnecessarily.  The 'input_buf' pointer is set to
      35             :  * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
      36             :  * directly into 'input_buf'.  CopyConvertBuf() then merely validates that
      37             :  * the data is valid in the current encoding.
      38             :  *
      39             :  * In binary mode, the pipeline is much simpler.  Input is loaded into
      40             :  * 'raw_buf', and encoding conversion is done in the datatype-specific
      41             :  * receive functions, if required.  'input_buf' and 'line_buf' are not used,
      42             :  * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
      43             :  * data when it's passed to the receive function.
      44             :  *
      45             :  * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE).  'input_buf' is also
      46             :  * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required.  'line_buf'
      47             :  * and 'attribute_buf' are expanded on demand, to hold the longest line
      48             :  * encountered so far.
      49             :  *
      50             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      51             :  * Portions Copyright (c) 1994, Regents of the University of California
      52             :  *
      53             :  *
      54             :  * IDENTIFICATION
      55             :  *    src/backend/commands/copyfromparse.c
      56             :  *
      57             :  *-------------------------------------------------------------------------
      58             :  */
      59             : #include "postgres.h"
      60             : 
      61             : #include <ctype.h>
      62             : #include <unistd.h>
      63             : #include <sys/stat.h>
      64             : 
      65             : #include "commands/copy.h"
      66             : #include "commands/copyfrom_internal.h"
      67             : #include "commands/progress.h"
      68             : #include "executor/executor.h"
      69             : #include "libpq/libpq.h"
      70             : #include "libpq/pqformat.h"
      71             : #include "mb/pg_wchar.h"
      72             : #include "miscadmin.h"
      73             : #include "pgstat.h"
      74             : #include "port/pg_bswap.h"
      75             : #include "utils/builtins.h"
      76             : #include "utils/rel.h"
      77             : 
      78             : #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
      79             : #define OCTVALUE(c) ((c) - '0')
      80             : 
      81             : /*
      82             :  * These macros centralize code used to process line_buf and input_buf buffers.
      83             :  * They are macros because they often do continue/break control and to avoid
      84             :  * function call overhead in tight COPY loops.
      85             :  *
      86             :  * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
      87             :  * prevent the continue/break processing from working.  We end the "if (1)"
      88             :  * with "else ((void) 0)" to ensure the "if" does not unintentionally match
      89             :  * any "else" in the calling code, and to avoid any compiler warnings about
      90             :  * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
      91             :  */
      92             : 
      93             : /*
      94             :  * If input_buf_ptr + extralen reaches copy_buf_len and EOF has not been hit, undo the fetch (rewind to prev_raw_ptr), set need_data and 'continue' the caller's loop.
      95             :  * This keeps the character read at the top of the loop in the buffer even if there is more than one read-ahead.
      96             :  */
      97             : #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
      98             : if (1) \
      99             : { \
     100             :     if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
     101             :     { \
     102             :         input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
     103             :         need_data = true; \
     104             :         continue; \
     105             :     } \
     106             : } else ((void) 0)
     107             : 
     108             : /* If input_buf_ptr + extralen reaches copy_buf_len and EOF has been hit: consume the remainder of the buffer (when extralen is nonzero), set result = true, and break */
     109             : #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
     110             : if (1) \
     111             : { \
     112             :     if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
     113             :     { \
     114             :         if (extralen) \
     115             :             input_buf_ptr = copy_buf_len; /* consume the partial character */ \
     116             :         /* backslash just before EOF, treat as data char */ \
     117             :         result = true; \
     118             :         break; \
     119             :     } \
     120             : } else ((void) 0)
     121             : 
     122             : /*
     123             :  * Transfer any approved data (the bytes from input_buf_index up to input_buf_ptr) to line_buf and advance input_buf_index past them;
     124             :  * must do this to be sure there is some room in input_buf.  Does nothing when input_buf_ptr has not moved past input_buf_index.
     125             :  */
     126             : #define REFILL_LINEBUF \
     127             : if (1) \
     128             : { \
     129             :     if (input_buf_ptr > cstate->input_buf_index) \
     130             :     { \
     131             :         appendBinaryStringInfo(&cstate->line_buf, \
     132             :                              cstate->input_buf + cstate->input_buf_index, \
     133             :                                input_buf_ptr - cstate->input_buf_index); \
     134             :         cstate->input_buf_index = input_buf_ptr; \
     135             :     } \
     136             : } else ((void) 0)
     137             : 
     138             : /* 11-byte signature that ReceiveCopyBinaryHeader() compares against the start of binary input.  NOTE: there's a copy of this in copyto.c */
     139             : static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
     140             : 
     141             : 
     142             : /* non-export function prototypes (all static, private to this file) */
     143             : static bool CopyReadLine(CopyFromState cstate);
     144             : static bool CopyReadLineText(CopyFromState cstate);
     145             : static int  CopyReadAttributesText(CopyFromState cstate);
     146             : static int  CopyReadAttributesCSV(CopyFromState cstate);
     147             : static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
     148             :                                      Oid typioparam, int32 typmod,
     149             :                                      bool *isnull);
     150             : 
     151             : 
     152             : /* Low-level communications functions */
     153             : static int  CopyGetData(CopyFromState cstate, void *databuf,
     154             :                         int minread, int maxread);  /* returns bytes read; fewer than minread means EOF */
     155             : static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);  /* false on EOF */
     156             : static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);  /* false on EOF */
     157             : static void CopyLoadInputBuf(CopyFromState cstate);
     158             : static int  CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
     159             : /* ReceiveCopyBegin: send a CopyInResponse message (overall format, column count, per-column formats), switch the COPY source to the frontend, and flush. */
     160             : void
     161         920 : ReceiveCopyBegin(CopyFromState cstate)
     162             : {
     163             :     StringInfoData buf;
     164         920 :     int         natts = list_length(cstate->attnumlist);    /* one per-column format code is sent for each entry */
     165         920 :     int16       format = (cstate->opts.binary ? 1 : 0);    /* 1 when opts.binary is set, else 0 */
     166             :     int         i;
     167             : 
     168         920 :     pq_beginmessage(&buf, PqMsg_CopyInResponse);
     169         920 :     pq_sendbyte(&buf, format);  /* overall format */
     170         920 :     pq_sendint16(&buf, natts);
     171        3102 :     for (i = 0; i < natts; i++)
     172        2182 :         pq_sendint16(&buf, format); /* per-column formats; same code as the overall format */
     173         920 :     pq_endmessage(&buf);
     174         920 :     cstate->copy_src = COPY_FRONTEND;   /* CopyGetData() switches on this to pick the read path */
     175         920 :     cstate->fe_msgbuf = makeStringInfo();   /* message buffer that CopyGetData() fills via pq_getmessage() */
     176             :     /* We *must* flush here to ensure FE knows it can send. */
     177         920 :     pq_flush();
     178         920 : }
     179             : /* ReceiveCopyBinaryHeader: read and validate the binary COPY header (signature, flags field, header extension); reports ERROR with ERRCODE_BAD_COPY_FILE_FORMAT on any mismatch or short read. */
     180             : void
     181          14 : ReceiveCopyBinaryHeader(CopyFromState cstate)
     182             : {
     183             :     char        readSig[11];
     184             :     int32       tmp;
     185             : 
     186             :     /* Signature: must be exactly the 11 bytes of BinarySignature */
     187          14 :     if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
     188          14 :         memcmp(readSig, BinarySignature, 11) != 0)
     189           0 :         ereport(ERROR,
     190             :                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     191             :                  errmsg("COPY file signature not recognized")));
     192             :     /* Flags field */
     193          14 :     if (!CopyGetInt32(cstate, &tmp))
     194           0 :         ereport(ERROR,
     195             :                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     196             :                  errmsg("invalid COPY file header (missing flags)")));
     197          14 :     if ((tmp & (1 << 16)) != 0) /* bit 16 set: rejected with the WITH OIDS message below */
     198           0 :         ereport(ERROR,
     199             :                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     200             :                  errmsg("invalid COPY file header (WITH OIDS)")));
     201          14 :     tmp &= ~(1 << 16);
     202          14 :     if ((tmp >> 16) != 0)   /* any other bit at position 16 or above is rejected as an unrecognized critical flag */
     203           0 :         ereport(ERROR,
     204             :                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     205             :                  errmsg("unrecognized critical flags in COPY file header")));
     206             :     /* Header extension length; a short read or a negative value is an error */
     207          14 :     if (!CopyGetInt32(cstate, &tmp) ||
     208          14 :         tmp < 0)
     209           0 :         ereport(ERROR,
     210             :                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     211             :                  errmsg("invalid COPY file header (missing length)")));
     212             :     /* Skip extension header, if present: read it one byte at a time, reusing readSig as scratch space */
     213          14 :     while (tmp-- > 0)
     214             :     {
     215           0 :         if (CopyReadBinaryData(cstate, readSig, 1) != 1)
     216           0 :             ereport(ERROR,
     217             :                     (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     218             :                      errmsg("invalid COPY file header (wrong length)")));
     219             :     }
     220          14 : }
     221             : 
     222             : /*
     223             :  * CopyGetData reads data from the source (file or frontend)
     224             :  *
     225             :  * We attempt to read at least minread, and at most maxread, bytes from
     226             :  * the source.  The actual number of bytes read is returned; if this is
     227             :  * less than minread, EOF was detected.
     228             :  *
     229             :  * Note: when copying from the frontend, we expect a proper EOF mark per
     230             :  * protocol; if the frontend simply drops the connection, we raise error.
     231             :  * It seems unwise to allow the COPY IN to complete normally in that case.
     232             :  *
     233             :  * NB: no data conversion is applied here.
     234             :  */
     235             : static int
     236      431510 : CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
     237             : {
     238      431510 :     int         bytesread = 0;
     239             : 
     240      431510 :     switch (cstate->copy_src)
     241             :     {
     242        1068 :         case COPY_FILE:
     243        1068 :             bytesread = fread(databuf, 1, maxread, cstate->copy_file);
     244        1068 :             if (ferror(cstate->copy_file))
     245           0 :                 ereport(ERROR,
     246             :                         (errcode_for_file_access(),
     247             :                          errmsg("could not read from COPY file: %m")));
     248        1068 :             if (bytesread == 0) /* only a read of zero bytes marks EOF on this path */
     249         416 :                 cstate->raw_reached_eof = true;
     250        1068 :             break;
     251      402500 :         case COPY_FRONTEND:
     252      803582 :             while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
     253             :             {
     254             :                 int         avail;
     255             : 
     256      401806 :                 while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len) /* current message fully consumed */
     257             :                 {
     258             :                     /* Try to receive another message */
     259             :                     int         mtype;
     260             :                     int         maxmsglen;
     261             : 
     262      401806 :             readmessage:
     263      401806 :                     HOLD_CANCEL_INTERRUPTS();
     264      401806 :                     pq_startmsgread();
     265      401806 :                     mtype = pq_getbyte();
     266      401806 :                     if (mtype == EOF)
     267           0 :                         ereport(ERROR,
     268             :                                 (errcode(ERRCODE_CONNECTION_FAILURE),
     269             :                                  errmsg("unexpected EOF on client connection with an open transaction")));
     270             :                     /* Validate message type and set packet size limit */
     271             :                     switch (mtype)
     272             :                     {
     273      401082 :                         case PqMsg_CopyData:
     274      401082 :                             maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
     275      401082 :                             break;
     276         724 :                         case PqMsg_CopyDone:
     277             :                         case PqMsg_CopyFail:
     278             :                         case PqMsg_Flush:
     279             :                         case PqMsg_Sync:
     280         724 :                             maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
     281         724 :                             break;
     282           0 :                         default:
     283           0 :                             ereport(ERROR,
     284             :                                     (errcode(ERRCODE_PROTOCOL_VIOLATION),
     285             :                                      errmsg("unexpected message type 0x%02X during COPY from stdin",
     286             :                                             mtype)));
     287             :                             maxmsglen = 0;  /* keep compiler quiet */
     288             :                             break;
     289             :                     }
     290             :                     /* Now collect the message body */
     291      401806 :                     if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
     292           0 :                         ereport(ERROR,
     293             :                                 (errcode(ERRCODE_CONNECTION_FAILURE),
     294             :                                  errmsg("unexpected EOF on client connection with an open transaction")));
     295      401806 :                     RESUME_CANCEL_INTERRUPTS();
     296             :                     /* ... and process it */
     297             :                     switch (mtype)
     298             :                     {
     299      401082 :                         case PqMsg_CopyData:
     300      401082 :                             break;
     301         724 :                         case PqMsg_CopyDone:
     302             :                             /* COPY IN correctly terminated by frontend */
     303         724 :                             cstate->raw_reached_eof = true;
     304         724 :                             return bytesread;   /* returns whatever was gathered so far; may be less than minread */
     305           0 :                         case PqMsg_CopyFail:
     306           0 :                             ereport(ERROR,
     307             :                                     (errcode(ERRCODE_QUERY_CANCELED),
     308             :                                      errmsg("COPY from stdin failed: %s",
     309             :                                             pq_getmsgstring(cstate->fe_msgbuf))));
     310             :                             break;
     311           0 :                         case PqMsg_Flush:
     312             :                         case PqMsg_Sync:
     313             : 
     314             :                             /*
     315             :                              * Ignore Flush/Sync for the convenience of client
     316             :                              * libraries (such as libpq) that may send those
     317             :                              * without noticing that the command they just
     318             :                              * sent was COPY.
     319             :                              */
     320           0 :                             goto readmessage;
     321      802888 :                         default:
     322             :                             Assert(false);  /* NOT REACHED */
     323             :                     }
     324             :                 }
     325      401082 :                 avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
     326      401082 :                 if (avail > maxread)    /* never copy more than the caller still has room for */
     327           0 :                     avail = maxread;
     328      401082 :                 pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
     329      401082 :                 databuf = (void *) ((char *) databuf + avail);  /* advance the output position past the copied bytes */
     330      401082 :                 maxread -= avail;
     331      401082 :                 bytesread += avail;
     332             :             }
     333      401776 :             break;
     334       27942 :         case COPY_CALLBACK:
     335       27942 :             bytesread = cstate->data_source_cb(databuf, minread, maxread);  /* raw_reached_eof is not updated on this path here */
     336       27942 :             break;
     337             :     }
     338             : 
     339      430786 :     return bytesread;
     340             : }
     341             : 
     342             : 
     343             : /*
     344             :  * These functions do apply some data conversion
     345             :  */
     346             : 
     347             : /*
     348             :  * CopyGetInt32 reads an int32 that appears in network byte order
     349             :  *
     350             :  * Returns true if OK, false if EOF (in which case *val is set to 0)
     351             :  */
     352             : static inline bool
     353         186 : CopyGetInt32(CopyFromState cstate, int32 *val)
     354             : {
     355             :     uint32      buf;
     356             : 
     357         186 :     if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))  /* short read is reported as EOF */
     358             :     {
     359           0 :         *val = 0;               /* suppress compiler warning */
     360           0 :         return false;
     361             :     }
     362         186 :     *val = (int32) pg_ntoh32(buf);  /* convert from network byte order */
     363         186 :     return true;
     364             : }
     365             : 
     366             : /*
     367             :  * CopyGetInt16 reads an int16 that appears in network byte order; returns true if OK, false if EOF (in which case *val is set to 0)
     368             :  */
     369             : static inline bool
     370          42 : CopyGetInt16(CopyFromState cstate, int16 *val)
     371             : {
     372             :     uint16      buf;
     373             : 
     374          42 :     if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))  /* short read is reported as EOF */
     375             :     {
     376           0 :         *val = 0;               /* suppress compiler warning */
     377           0 :         return false;
     378             :     }
     379          42 :     *val = (int16) pg_ntoh16(buf);  /* convert from network byte order */
     380          42 :     return true;
     381             : }
     382             : 
     383             : 
     384             : /*
     385             :  * Perform encoding conversion on data in 'raw_buf', writing the converted
     386             :  * data into 'input_buf'.
     387             :  * Sets input_reached_eof when no unprocessed raw data is left and raw_reached_eof is set; sets input_reached_error when nothing could be verified/converted at EOF or with ample input left.
     388             :  * On entry, there must be some data to convert in 'raw_buf'.
     389             :  */
     390             : static void
     391      861488 : CopyConvertBuf(CopyFromState cstate)
     392             : {
     393             :     /*
     394             :      * If the file and server encoding are the same, no encoding conversion is
     395             :      * required.  However, we still need to verify that the input is valid for
     396             :      * the encoding.
     397             :      */
     398      861488 :     if (!cstate->need_transcoding)
     399             :     {
     400             :         /*
     401             :          * When conversion is not required, input_buf and raw_buf are the
     402             :          * same.  raw_buf_len is the total number of bytes in the buffer, and
     403             :          * input_buf_len tracks how many of those bytes have already been
     404             :          * verified.
     405             :          */
     406      861404 :         int         preverifiedlen = cstate->input_buf_len;
     407      861404 :         int         unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
     408             :         int         nverified;
     409             : 
     410      861404 :         if (unverifiedlen == 0)
     411             :         {
     412             :             /*
     413             :              * If no more raw data is coming, report the EOF to the caller.
     414             :              */
     415      432136 :             if (cstate->raw_reached_eof)
     416        1434 :                 cstate->input_reached_eof = true;
     417      432136 :             return;
     418             :         }
     419             : 
     420             :         /*
     421             :          * Verify the new data, including any residual unverified bytes from
     422             :          * previous round.
     423             :          */
     424      429268 :         nverified = pg_encoding_verifymbstr(cstate->file_encoding,
     425      429268 :                                             cstate->raw_buf + preverifiedlen,
     426             :                                             unverifiedlen);
     427      429268 :         if (nverified == 0)
     428             :         {
     429             :             /*
     430             :              * Could not verify anything.
     431             :              *
     432             :              * If there is no more raw input data coming, it means that there
     433             :              * was an incomplete multi-byte sequence at the end.  Also, if
     434             :              * there's "enough" input left, we should be able to verify at
     435             :              * least one character, and a failure to do so means that we've
     436             :              * hit an invalid byte sequence.
     437             :              */
     438           0 :             if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
     439           0 :                 cstate->input_reached_error = true;
     440           0 :             return;             /* otherwise return without flagging an error */
     441             :         }
     442      429268 :         cstate->input_buf_len += nverified; /* extend the verified prefix by what was just checked */
     443             :     }
     444             :     else
     445             :     {
     446             :         /*
     447             :          * Encoding conversion is needed.
     448             :          */
     449             :         int         nbytes;
     450             :         unsigned char *src;
     451             :         int         srclen;
     452             :         unsigned char *dst;
     453             :         int         dstlen;
     454             :         int         convertedlen;
     455             : 
     456          84 :         if (RAW_BUF_BYTES(cstate) == 0)
     457             :         {
     458             :             /*
     459             :              * If no more raw data is coming, report the EOF to the caller.
     460             :              */
     461          48 :             if (cstate->raw_reached_eof)
     462          12 :                 cstate->input_reached_eof = true;
     463          48 :             return;
     464             :         }
     465             : 
     466             :         /*
     467             :          * First, copy down any unprocessed data.
     468             :          */
     469          36 :         nbytes = INPUT_BUF_BYTES(cstate);
     470          36 :         if (nbytes > 0 && cstate->input_buf_index > 0)
     471           0 :             memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
     472             :                     nbytes);
     473          36 :         cstate->input_buf_index = 0;
     474          36 :         cstate->input_buf_len = nbytes;
     475          36 :         cstate->input_buf[nbytes] = '\0';   /* re-terminate input_buf after the move */
     476             : 
     477          36 :         src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
     478          36 :         srclen = cstate->raw_buf_len - cstate->raw_buf_index;
     479          36 :         dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
     480          36 :         dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;    /* NOTE(review): the + 1 presumably covers a terminator byte allocated beyond INPUT_BUF_SIZE -- confirm against the buffer allocation */
     481             : 
     482             :         /*
     483             :          * Do the conversion.  This might stop short, if there is an invalid
     484             :          * byte sequence in the input.  We'll convert as much as we can in
     485             :          * that case.
     486             :          *
     487             :          * Note: Even if we hit an invalid byte sequence, we don't report the
     488             :          * error until all the valid bytes have been consumed.  The input
     489             :          * might contain an end-of-input marker (\.), and we don't want to
     490             :          * report an error if the invalid byte sequence is after the
     491             :          * end-of-input marker.  We might unnecessarily convert some data
     492             :          * after the end-of-input marker as long as it's valid for the
     493             :          * encoding, but that's harmless.
     494             :          */
     495          36 :         convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
     496             :                                                      cstate->file_encoding,
     497             :                                                      GetDatabaseEncoding(),
     498             :                                                      src, srclen,
     499             :                                                      dst, dstlen,
     500             :                                                      true);
     501          36 :         if (convertedlen == 0)
     502             :         {
     503             :             /*
     504             :              * Could not convert anything.  If there is no more raw input data
     505             :              * coming, it means that there was an incomplete multi-byte
     506             :              * sequence at the end.  Also, if there is plenty of input left,
     507             :              * we should be able to convert at least one character, so a
     508             :              * failure to do so must mean that we've hit a byte sequence
     509             :              * that's invalid.
     510             :              */
     511          24 :             if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
     512          12 :                 cstate->input_reached_error = true;
     513          24 :             return;             /* otherwise return without flagging an error */
     514             :         }
     515          12 :         cstate->raw_buf_index += convertedlen;  /* convertedlen counts source bytes consumed from raw_buf */
     516          12 :         cstate->input_buf_len += strlen((char *) dst);  /* strlen() relies on the converted output at dst being NUL-terminated */
     517             :     }
     518             : }
     519             : 
     520             : /*
     521             :  * Report an encoding or conversion error.
     522             :  */
     523             : static void
     524          12 : CopyConversionError(CopyFromState cstate)
     525             : {
     526             :     Assert(cstate->raw_buf_len > 0);
     527             :     Assert(cstate->input_reached_error);
     528             : 
     529          12 :     if (!cstate->need_transcoding)
     530             :     {
     531             :         /*
     532             :          * Everything up to input_buf_len was successfully verified, and
     533             :          * input_buf_len points to the invalid or incomplete character.
     534             :          */
     535           0 :         report_invalid_encoding(cstate->file_encoding,
     536           0 :                                 cstate->raw_buf + cstate->input_buf_len,
     537           0 :                                 cstate->raw_buf_len - cstate->input_buf_len);
     538             :     }
     539             :     else
     540             :     {
     541             :         /*
     542             :          * raw_buf_index points to the invalid or untranslatable character. We
     543             :          * let the conversion routine report the error, because it can provide
     544             :          * a more specific error message than we could here.  An earlier call
     545             :          * to the conversion routine in CopyConvertBuf() detected that there
     546             :          * is an error, now we call the conversion routine again with
     547             :          * noError=false, to have it throw the error.
     548             :          */
     549             :         unsigned char *src;
     550             :         int         srclen;
     551             :         unsigned char *dst;
     552             :         int         dstlen;
     553             : 
     554          12 :         src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
     555          12 :         srclen = cstate->raw_buf_len - cstate->raw_buf_index;
     556          12 :         dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
     557          12 :         dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
     558             : 
     559          12 :         (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
     560             :                                              cstate->file_encoding,
     561             :                                              GetDatabaseEncoding(),
     562             :                                              src, srclen,
     563             :                                              dst, dstlen,
     564             :                                              false);
     565             : 
     566             :         /*
     567             :          * The conversion routine should have reported an error, so this
     568             :          * should not be reached.
     569             :          */
     570           0 :         elog(ERROR, "encoding conversion failed without error");
     571             :     }
     572             : }
     573             : 
     574             : /*
     575             :  * Load more data from data source to raw_buf.
     576             :  *
     577             :  * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
     578             :  * beginning of the buffer, and we load new data after that.
     579             :  */
     580             : static void
     581      430786 : CopyLoadRawBuf(CopyFromState cstate)
     582             : {
     583             :     int         nbytes;
     584             :     int         inbytes;
     585             : 
     586             :     /*
     587             :      * In text mode, if encoding conversion is not required, raw_buf and
     588             :      * input_buf point to the same buffer.  Their len/index better agree, too.
     589             :      */
     590      430786 :     if (cstate->raw_buf == cstate->input_buf)
     591             :     {
     592             :         Assert(!cstate->need_transcoding);
     593             :         Assert(cstate->raw_buf_index == cstate->input_buf_index);
     594             :         Assert(cstate->input_buf_len <= cstate->raw_buf_len);
     595             :     }
     596             : 
     597             :     /*
     598             :      * Copy down the unprocessed data if any.
     599             :      */
     600      430786 :     nbytes = RAW_BUF_BYTES(cstate);
     601      430786 :     if (nbytes > 0 && cstate->raw_buf_index > 0)
     602           0 :         memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
     603             :                 nbytes);
     604      430786 :     cstate->raw_buf_len -= cstate->raw_buf_index;
     605      430786 :     cstate->raw_buf_index = 0;
     606             : 
     607             :     /*
     608             :      * If raw_buf and input_buf are in fact the same buffer, adjust the
     609             :      * input_buf variables, too.
     610             :      */
     611      430786 :     if (cstate->raw_buf == cstate->input_buf)
     612             :     {
     613      430702 :         cstate->input_buf_len -= cstate->input_buf_index;
     614      430702 :         cstate->input_buf_index = 0;
     615             :     }
     616             : 
     617             :     /* Load more data */
     618      430786 :     inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
     619      430786 :                           1, RAW_BUF_SIZE - cstate->raw_buf_len);
     620      430786 :     nbytes += inbytes;
     621      430786 :     cstate->raw_buf[nbytes] = '\0';
     622      430786 :     cstate->raw_buf_len = nbytes;
     623             : 
     624      430786 :     cstate->bytes_processed += inbytes;
     625      430786 :     pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
     626             : 
     627      430786 :     if (inbytes == 0)
     628        1470 :         cstate->raw_reached_eof = true;
     629      430786 : }
     630             : 
     631             : /*
     632             :  * CopyLoadInputBuf loads some more data into input_buf
     633             :  *
     634             :  * On return, at least one more input character is loaded into
     635             :  * input_buf, or input_reached_eof is set.
     636             :  *
     637             :  * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
     638             :  * of the buffer and then we load more data after that.
     639             :  */
     640             : static void
     641      430738 : CopyLoadInputBuf(CopyFromState cstate)
     642             : {
     643      430738 :     int         nbytes = INPUT_BUF_BYTES(cstate);
     644             : 
     645             :     /*
     646             :      * The caller has updated input_buf_index to indicate how much of the
     647             :      * input has been consumed and isn't needed anymore.  If input_buf is the
     648             :      * same physical area as raw_buf, update raw_buf_index accordingly.
     649             :      */
     650      430738 :     if (cstate->raw_buf == cstate->input_buf)
     651             :     {
     652             :         Assert(!cstate->need_transcoding);
     653             :         Assert(cstate->input_buf_index >= cstate->raw_buf_index);
     654      430702 :         cstate->raw_buf_index = cstate->input_buf_index;
     655             :     }
     656             : 
     657             :     for (;;)
     658             :     {
     659             :         /* If we now have some unconverted data, try to convert it */
     660      861488 :         CopyConvertBuf(cstate);
     661             : 
     662             :         /* If we now have some more input bytes ready, return them */
     663      861488 :         if (INPUT_BUF_BYTES(cstate) > nbytes)
     664      429280 :             return;
     665             : 
     666             :         /*
     667             :          * If we reached an invalid byte sequence, or we're at an incomplete
     668             :          * multi-byte character but there is no more raw input data, report
     669             :          * conversion error.
     670             :          */
     671      432208 :         if (cstate->input_reached_error)
     672          12 :             CopyConversionError(cstate);
     673             : 
     674             :         /* no more input, and everything has been converted */
     675      432196 :         if (cstate->input_reached_eof)
     676        1446 :             break;
     677             : 
     678             :         /* Try to load more raw data */
     679             :         Assert(!cstate->raw_reached_eof);
     680      430750 :         CopyLoadRawBuf(cstate);
     681             :     }
     682             : }
     683             : 
     684             : /*
     685             :  * CopyReadBinaryData
     686             :  *
     687             :  * Reads up to 'nbytes' bytes from the data source via cstate->raw_buf
     688             :  * and writes them to 'dest'.  Returns the number of bytes read (which
     689             :  * would be less than 'nbytes' only if we reach EOF).
     690             :  */
     691             : static int
     692         382 : CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
     693             : {
     694         382 :     int         copied_bytes = 0;
     695             : 
     696         382 :     if (RAW_BUF_BYTES(cstate) >= nbytes)
     697             :     {
     698             :         /* Enough bytes are present in the buffer. */
     699         346 :         memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
     700         346 :         cstate->raw_buf_index += nbytes;
     701         346 :         copied_bytes = nbytes;
     702             :     }
     703             :     else
     704             :     {
     705             :         /*
     706             :          * Not enough bytes in the buffer, so must read from the file.  Need
     707             :          * to loop since 'nbytes' could be larger than the buffer size.
     708             :          */
     709             :         do
     710             :         {
     711             :             int         copy_bytes;
     712             : 
     713             :             /* Load more data if buffer is empty. */
     714          36 :             if (RAW_BUF_BYTES(cstate) == 0)
     715             :             {
     716          36 :                 CopyLoadRawBuf(cstate);
     717          36 :                 if (cstate->raw_reached_eof)
     718          12 :                     break;      /* EOF */
     719             :             }
     720             : 
     721             :             /* Transfer some bytes. */
     722          24 :             copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
     723          24 :             memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
     724          24 :             cstate->raw_buf_index += copy_bytes;
     725          24 :             dest += copy_bytes;
     726          24 :             copied_bytes += copy_bytes;
     727          24 :         } while (copied_bytes < nbytes);
     728             :     }
     729             : 
     730         382 :     return copied_bytes;
     731             : }
     732             : 
     733             : /*
     734             :  * Read raw fields in the next line for COPY FROM in text or csv mode.
     735             :  * Return false if no more lines.
     736             :  *
     737             :  * An internal temporary buffer is returned via 'fields'. It is valid until
     738             :  * the next call of the function. Since the function returns all raw fields
     739             :  * in the input file, 'nfields' could be different from the number of columns
     740             :  * in the relation.
     741             :  *
     742             :  * NOTE: the force_not_null option is not applied to the returned fields.
     743             :  */
     744             : bool
     745     1257952 : NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
     746             : {
     747             :     int         fldct;
     748             :     bool        done;
     749             : 
     750             :     /* only available for text or csv input */
     751             :     Assert(!cstate->opts.binary);
     752             : 
     753             :     /* on the first call, read the header line and, if requested, check it */
     754     1257952 :     if (cstate->cur_lineno == 0 && cstate->opts.header_line)
     755             :     {
     756             :         ListCell   *cur;
     757             :         TupleDesc   tupDesc;
     758             : 
     759         120 :         tupDesc = RelationGetDescr(cstate->rel);
     760             : 
     761         120 :         cstate->cur_lineno++;
     762         120 :         done = CopyReadLine(cstate);
     763             : 
     764         120 :         if (cstate->opts.header_line == COPY_HEADER_MATCH)
     765             :         {
     766             :             int         fldnum;
     767             : 
     768          76 :             if (cstate->opts.csv_mode)
     769          10 :                 fldct = CopyReadAttributesCSV(cstate);
     770             :             else
     771          66 :                 fldct = CopyReadAttributesText(cstate);
     772             : 
     773          76 :             if (fldct != list_length(cstate->attnumlist))
     774          24 :                 ereport(ERROR,
     775             :                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     776             :                          errmsg("wrong number of fields in header line: got %d, expected %d",
     777             :                                 fldct, list_length(cstate->attnumlist))));
     778             : 
     779          52 :             fldnum = 0;
     780         158 :             foreach(cur, cstate->attnumlist)
     781             :             {
     782         126 :                 int         attnum = lfirst_int(cur);
     783             :                 char       *colName;
     784         126 :                 Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
     785             : 
     786             :                 Assert(fldnum < cstate->max_fields);
     787             : 
     788         126 :                 colName = cstate->raw_fields[fldnum++];
     789         126 :                 if (colName == NULL)
     790           6 :                     ereport(ERROR,
     791             :                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     792             :                              errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
     793             :                                     fldnum, cstate->opts.null_print, NameStr(attr->attname))));
     794             : 
     795         120 :                 if (namestrcmp(&attr->attname, colName) != 0)
     796             :                 {
     797          14 :                     ereport(ERROR,
     798             :                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     799             :                              errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
     800             :                                     fldnum, colName, NameStr(attr->attname))));
     801             :                 }
     802             :             }
     803             :         }
     804             : 
     805          76 :         if (done)
     806           0 :             return false;
     807             :     }
     808             : 
     809     1257908 :     cstate->cur_lineno++;
     810             : 
     811             :     /* Actually read the line into memory here */
     812     1257908 :     done = CopyReadLine(cstate);
     813             : 
     814             :     /*
     815             :      * EOF at start of line means we're done.  If we see EOF after some
     816             :      * characters, we act as though it was newline followed by EOF, ie,
     817             :      * process the line and then exit loop on next iteration.
     818             :      */
     819     1257884 :     if (done && cstate->line_buf.len == 0)
     820        1476 :         return false;
     821             : 
     822             :     /* Parse the line into de-escaped field values */
     823     1256408 :     if (cstate->opts.csv_mode)
     824         464 :         fldct = CopyReadAttributesCSV(cstate);
     825             :     else
     826     1255944 :         fldct = CopyReadAttributesText(cstate);
     827             : 
     828     1256396 :     *fields = cstate->raw_fields;
     829     1256396 :     *nfields = fldct;
     830     1256396 :     return true;
     831             : }
     832             : 
     833             : /*
     834             :  * Read next tuple from file for COPY FROM. Return false if no more tuples.
     835             :  *
     836             :  * 'econtext' is used to evaluate default expression for each column that is
     837             :  * either not read from the file or is using the DEFAULT option of COPY FROM.
     838             :  * It can be NULL when no default values are used, i.e. when all columns are
     839             :  * read from the file, and DEFAULT option is unset.
     840             :  *
     841             :  * 'values' and 'nulls' arrays must be the same length as columns of the
     842             :  * relation passed to BeginCopyFrom. This function fills the arrays.
     843             :  */
     844             : bool
     845     1257994 : NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
     846             :              Datum *values, bool *nulls)
     847             : {
     848             :     TupleDesc   tupDesc;
     849             :     AttrNumber  num_phys_attrs,
     850             :                 attr_count,
     851     1257994 :                 num_defaults = cstate->num_defaults;
     852     1257994 :     FmgrInfo   *in_functions = cstate->in_functions;
     853     1257994 :     Oid        *typioparams = cstate->typioparams;
     854             :     int         i;
     855     1257994 :     int        *defmap = cstate->defmap;
     856     1257994 :     ExprState **defexprs = cstate->defexprs;
     857             : 
     858     1257994 :     tupDesc = RelationGetDescr(cstate->rel);
     859     1257994 :     num_phys_attrs = tupDesc->natts;
     860     1257994 :     attr_count = list_length(cstate->attnumlist);
     861             : 
     862             :     /* Initialize all values for row to NULL */
     863     5863012 :     MemSet(values, 0, num_phys_attrs * sizeof(Datum));
     864     1257994 :     MemSet(nulls, true, num_phys_attrs * sizeof(bool));
     865     1402102 :     MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
     866             : 
     867     1257994 :     if (!cstate->opts.binary)
     868             :     {
     869             :         char      **field_strings;
     870             :         ListCell   *cur;
     871             :         int         fldct;
     872             :         int         fieldno;
     873             :         char       *string;
     874             : 
     875             :         /* read raw fields in the next line */
     876     1257952 :         if (!NextCopyFromRawFields(cstate, &field_strings, &fldct))
     877        1592 :             return false;
     878             : 
     879             :         /* check for overflowing fields */
     880     1256396 :         if (attr_count > 0 && fldct > attr_count)
     881          18 :             ereport(ERROR,
     882             :                     (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     883             :                      errmsg("extra data after last expected column")));
     884             : 
     885     1256378 :         fieldno = 0;
     886             : 
     887             :         /* Loop to read the user attributes on the line. */
     888     5730288 :         foreach(cur, cstate->attnumlist)
     889             :         {
     890     4474082 :             int         attnum = lfirst_int(cur);
     891     4474082 :             int         m = attnum - 1;
     892     4474082 :             Form_pg_attribute att = TupleDescAttr(tupDesc, m);
     893             : 
     894     4474082 :             if (fieldno >= fldct)
     895          18 :                 ereport(ERROR,
     896             :                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
     897             :                          errmsg("missing data for column \"%s\"",
     898             :                                 NameStr(att->attname))));
     899     4474064 :             string = field_strings[fieldno++];
     900             : 
     901     4474064 :             if (cstate->convert_select_flags &&
     902          20 :                 !cstate->convert_select_flags[m])
     903             :             {
     904             :                 /* ignore input field, leaving column as NULL */
     905          10 :                 continue;
     906             :             }
     907             : 
     908     4474054 :             if (cstate->opts.csv_mode)
     909             :             {
     910         962 :                 if (string == NULL &&
     911          44 :                     cstate->opts.force_notnull_flags[m])
     912             :                 {
     913             :                     /*
     914             :                      * FORCE_NOT_NULL option is set and column is NULL -
     915             :                      * convert it to the NULL string.
     916             :                      */
     917          28 :                     string = cstate->opts.null_print;
     918             :                 }
     919         934 :                 else if (string != NULL && cstate->opts.force_null_flags[m]
     920          50 :                          && strcmp(string, cstate->opts.null_print) == 0)
     921             :                 {
     922             :                     /*
     923             :                      * FORCE_NULL option is set and column matches the NULL
     924             :                      * string. It must have been quoted, or otherwise the
     925             :                      * string would already have been set to NULL. Convert it
     926             :                      * to NULL as specified.
     927             :                      */
     928          26 :                     string = NULL;
     929             :                 }
     930             :             }
     931             : 
     932     4474054 :             cstate->cur_attname = NameStr(att->attname);
     933     4474054 :             cstate->cur_attval = string;
     934             : 
     935     4474054 :             if (string != NULL)
     936     4469210 :                 nulls[m] = false;
     937             : 
     938     4474054 :             if (cstate->defaults[m])
     939             :             {
     940             :                 /*
     941             :                  * The caller must supply econtext and have switched into the
     942             :                  * per-tuple memory context in it.
     943             :                  */
     944             :                 Assert(econtext != NULL);
     945             :                 Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
     946             : 
     947          60 :                 values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
     948             :             }
     949             : 
     950             :             /*
     951             :              * If ON_ERROR is specified with IGNORE, skip rows with soft
     952             :              * errors
     953             :              */
     954     4473956 :             else if (!InputFunctionCallSafe(&in_functions[m],
     955             :                                             string,
     956     4473994 :                                             typioparams[m],
     957             :                                             att->atttypmod,
     958     4473994 :                                             (Node *) cstate->escontext,
     959     4473994 :                                             &values[m]))
     960             :             {
     961             :                 Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP);
     962             : 
     963         116 :                 cstate->num_errors++;
     964             : 
     965         116 :                 if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE)
     966             :                 {
     967             :                     /*
     968             :                      * Since we emit line number and column info in the below
     969             :                      * notice message, we suppress error context information
     970             :                      * other than the relation name.
     971             :                      */
     972             :                     Assert(!cstate->relname_only);
     973          42 :                     cstate->relname_only = true;
     974             : 
     975          42 :                     if (cstate->cur_attval)
     976             :                     {
     977             :                         char       *attval;
     978             : 
     979          36 :                         attval = CopyLimitPrintoutLength(cstate->cur_attval);
     980          36 :                         ereport(NOTICE,
     981             :                                 errmsg("skipping row due to data type incompatibility at line %llu for column \"%s\": \"%s\"",
     982             :                                        (unsigned long long) cstate->cur_lineno,
     983             :                                        cstate->cur_attname,
     984             :                                        attval));
     985          36 :                         pfree(attval);
     986             :                     }
     987             :                     else
     988           6 :                         ereport(NOTICE,
     989             :                                 errmsg("skipping row due to data type incompatibility at line %llu for column \"%s\": null input",
     990             :                                        (unsigned long long) cstate->cur_lineno,
     991             :                                        cstate->cur_attname));
     992             : 
     993             :                     /* reset relname_only */
     994          42 :                     cstate->relname_only = false;
     995             :                 }
     996             : 
     997         116 :                 return true;
     998             :             }
     999             : 
    1000     4473900 :             cstate->cur_attname = NULL;
    1001     4473900 :             cstate->cur_attval = NULL;
    1002             :         }
    1003             : 
    1004             :         Assert(fieldno == attr_count);
    1005             :     }
    1006             :     else
    1007             :     {
    1008             :         /* binary */
    1009             :         int16       fld_count;
    1010             :         ListCell   *cur;
    1011             : 
    1012          42 :         cstate->cur_lineno++;
    1013             : 
    1014          42 :         if (!CopyGetInt16(cstate, &fld_count))
    1015             :         {
    1016             :             /* EOF detected (end of file, or protocol-level EOF) */
    1017          12 :             return false;
    1018             :         }
    1019             : 
    1020          42 :         if (fld_count == -1)
    1021             :         {
    1022             :             /*
    1023             :              * Received EOF marker.  Wait for the protocol-level EOF, and
    1024             :              * complain if it doesn't come immediately.  In COPY FROM STDIN,
    1025             :              * this ensures that we correctly handle CopyFail, if client
    1026             :              * chooses to send that now.  When copying from file, we could
    1027             :              * ignore the rest of the file like in text mode, but we choose to
    1028             :              * be consistent with the COPY FROM STDIN case.
    1029             :              */
    1030             :             char        dummy;
    1031             : 
    1032          12 :             if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
    1033           0 :                 ereport(ERROR,
    1034             :                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1035             :                          errmsg("received copy data after EOF marker")));
    1036          12 :             return false;
    1037             :         }
    1038             : 
    1039          30 :         if (fld_count != attr_count)
    1040           0 :             ereport(ERROR,
    1041             :                     (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1042             :                      errmsg("row field count is %d, expected %d",
    1043             :                             (int) fld_count, attr_count)));
    1044             : 
    1045         186 :         foreach(cur, cstate->attnumlist)
    1046             :         {
    1047         158 :             int         attnum = lfirst_int(cur);
    1048         158 :             int         m = attnum - 1;
    1049         158 :             Form_pg_attribute att = TupleDescAttr(tupDesc, m);
    1050             : 
    1051         158 :             cstate->cur_attname = NameStr(att->attname);
    1052         314 :             values[m] = CopyReadBinaryAttribute(cstate,
    1053         158 :                                                 &in_functions[m],
    1054         158 :                                                 typioparams[m],
    1055             :                                                 att->atttypmod,
    1056             :                                                 &nulls[m]);
    1057         156 :             cstate->cur_attname = NULL;
    1058             :         }
    1059             :     }
    1060             : 
    1061             :     /*
    1062             :      * Now compute and insert any defaults available for the columns not
    1063             :      * provided by the input data.  Anything not processed here or above will
    1064             :      * remain NULL.
    1065             :      */
    1066     1316764 :     for (i = 0; i < num_defaults; i++)
    1067             :     {
    1068             :         /*
    1069             :          * The caller must supply econtext and have switched into the
    1070             :          * per-tuple memory context in it.
    1071             :          */
    1072             :         Assert(econtext != NULL);
    1073             :         Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
    1074             : 
    1075       60530 :         values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
    1076       60530 :                                          &nulls[defmap[i]]);
    1077             :     }
    1078             : 
    1079     1256234 :     return true;
    1080             : }
    1081             : 
    1082             : /*
    1083             :  * Read the next input line and stash it in line_buf.
    1084             :  *
    1085             :  * Result is true if read was terminated by EOF, false if terminated
    1086             :  * by newline.  The terminating newline or EOF marker is not included
    1087             :  * in the final value of line_buf.
                     :  *
                     :  * line_buf is reset on entry and line_buf_valid is kept false while
                     :  * the line is being assembled; it is set to true only after the EOL
                     :  * marker has been stripped, just before returning.
                     :  *
                     :  * The actual scanning is delegated to CopyReadLineText().  On EOF
                     :  * from a COPY_FRONTEND source, CopyGetData() is called repeatedly
                     :  * until it returns no more bytes, and the input_buf and raw_buf
                     :  * index/length fields are then zeroed.  Otherwise the trailing EOL
                     :  * marker is cut off line_buf according to cstate->eol_type: one
                     :  * byte for EOL_NL and EOL_CR, two bytes for EOL_CRNL.  EOL_UNKNOWN
                     :  * is asserted to be unreachable at that point.
    1088             :  */
    1089             : static bool
    1090     1258028 : CopyReadLine(CopyFromState cstate)
    1091             : {
    1092             :     bool        result;
    1093             : 
    1094     1258028 :     resetStringInfo(&cstate->line_buf);
    1095     1258028 :     cstate->line_buf_valid = false;
    1096             : 
    1097             :     /* Parse data and transfer into line_buf */
    1098     1258028 :     result = CopyReadLineText(cstate);
    1099             : 
    1100     1258004 :     if (result)
    1101             :     {
    1102             :         /*
    1103             :          * Reached EOF.  In protocol version 3, we should ignore anything
    1104             :          * after \. up to the protocol end of copy data.  (XXX maybe better
    1105             :          * not to treat \. as special?)
    1106             :          */
    1107        1476 :         if (cstate->copy_src == COPY_FRONTEND)
    1108             :         {
    1109             :             int         inbytes;
    1110             : 
    1111             :             do
    1112             :             {
    1113         724 :                 inbytes = CopyGetData(cstate, cstate->input_buf,
    1114             :                                       1, INPUT_BUF_SIZE);
    1115         724 :             } while (inbytes > 0);
    1116         724 :             cstate->input_buf_index = 0;
    1117         724 :             cstate->input_buf_len = 0;
    1118         724 :             cstate->raw_buf_index = 0;
    1119         724 :             cstate->raw_buf_len = 0;
    1120             :         }
    1121             :     }
    1122             :     else
    1123             :     {
    1124             :         /*
    1125             :          * If we didn't hit EOF, then we must have transferred the EOL marker
    1126             :          * to line_buf along with the data.  Get rid of it.
    1127             :          */
    1128     1256528 :         switch (cstate->eol_type)
    1129             :         {
    1130     1256528 :             case EOL_NL:
    1131             :                 Assert(cstate->line_buf.len >= 1);
    1132             :                 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
    1133     1256528 :                 cstate->line_buf.len--;
    1134     1256528 :                 cstate->line_buf.data[cstate->line_buf.len] = '\0';
    1135     1256528 :                 break;
    1136           0 :             case EOL_CR:
    1137             :                 Assert(cstate->line_buf.len >= 1);
    1138             :                 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
    1139           0 :                 cstate->line_buf.len--;
    1140           0 :                 cstate->line_buf.data[cstate->line_buf.len] = '\0';
    1141           0 :                 break;
    1142           0 :             case EOL_CRNL:
    1143             :                 Assert(cstate->line_buf.len >= 2);
    1144             :                 Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
    1145             :                 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
    1146           0 :                 cstate->line_buf.len -= 2;
    1147           0 :                 cstate->line_buf.data[cstate->line_buf.len] = '\0';
    1148           0 :                 break;
    1149           0 :             case EOL_UNKNOWN:
    1150             :                 /* shouldn't get here */
    1151             :                 Assert(false);
    1152           0 :                 break;
    1153             :         }
    1154     1258004 :     }
    1155             : 
    1156             :     /* Now it's safe to use the buffer in error messages */
    1157     1258004 :     cstate->line_buf_valid = true;
    1158             : 
    1159     1258004 :     return result;
    1160             : }
    1161             : 
    1162             : /*
    1163             :  * CopyReadLineText - inner loop of CopyReadLine for text mode
                     :  *
                     :  * Despite the name, this also handles CSV input: when
                     :  * cstate->opts.csv_mode is set, quote/escape state is tracked so that
                     :  * line terminators inside a quoted field are kept as data, and a
                     :  * backslash is not treated specially.
                     :  *
                     :  * Scans input_buf (loading more through CopyLoadInputBuf() as needed)
                     :  * until a line terminator, the end-of-copy marker (\.) or the end of
                     :  * the input is found.  Returns true when the input ended (no bytes
                     :  * left after a load, or a valid \. line was seen) and false when a
                     :  * line terminator was found.  The IF_NEED_REFILL_* macros used below
                     :  * are defined elsewhere in this file and can also restart or leave
                     :  * the loop; see their definitions for the exact behavior.
                     :  *
                     :  * Side effects visible here: cstate->eol_type is set from the first
                     :  * line terminator seen, and a later terminator of a different style
                     :  * raises ERRCODE_BAD_COPY_FILE_FORMAT; cstate->cur_lineno is bumped
                     :  * for line terminators embedded in quoted CSV fields; when \. is
                     :  * accepted, cstate->input_buf_index is advanced past the marker and
                     :  * its newline.
    1164             :  */
    1165             : static bool
    1166     1258028 : CopyReadLineText(CopyFromState cstate)
    1167             : {
    1168             :     char       *copy_input_buf;
    1169             :     int         input_buf_ptr;
    1170             :     int         copy_buf_len;
    1171     1258028 :     bool        need_data = false;
    1172     1258028 :     bool        hit_eof = false;
    1173     1258028 :     bool        result = false;
    1174             : 
    1175             :     /* CSV variables */
    1176     1258028 :     bool        in_quote = false,
    1177     1258028 :                 last_was_esc = false;
    1178     1258028 :     char        quotec = '\0';
    1179     1258028 :     char        escapec = '\0';
    1180             : 
    1181     1258028 :     if (cstate->opts.csv_mode)
    1182             :     {
    1183         720 :         quotec = cstate->opts.quote[0];
    1184         720 :         escapec = cstate->opts.escape[0];
    1185             :         /* ignore special escape processing if it's the same as quotec */
    1186         720 :         if (quotec == escapec)
    1187         526 :             escapec = '\0';
    1188             :     }
    1189             : 
    1190             :     /*
    1191             :      * The objective of this loop is to transfer the entire next input line
    1192             :      * into line_buf.  Hence, we only care for detecting newlines (\r and/or
    1193             :      * \n) and the end-of-copy marker (\.).
    1194             :      *
    1195             :      * In CSV mode, \r and \n inside a quoted field are just part of the data
    1196             :      * value and are put in line_buf.  We keep just enough state to know if we
    1197             :      * are currently in a quoted field or not.
    1198             :      *
    1199             :      * The input has already been converted to the database encoding.  All
    1200             :      * supported server encodings have the property that all bytes in a
    1201             :      * multi-byte sequence have the high bit set, so a multibyte character
    1202             :      * cannot contain any newline or escape characters embedded in the
    1203             :      * multibyte sequence.  Therefore, we can process the input byte-by-byte,
    1204             :      * regardless of the encoding.
    1205             :      *
    1206             :      * For speed, we try to move data from input_buf to line_buf in chunks
    1207             :      * rather than one character at a time.  input_buf_ptr points to the next
    1208             :      * character to examine; any characters from input_buf_index to
    1209             :      * input_buf_ptr have been determined to be part of the line, but not yet
    1210             :      * transferred to line_buf.
    1211             :      *
    1212             :      * For a little extra speed within the loop, we copy input_buf and
    1213             :      * input_buf_len into local variables.
    1214             :      */
    1215     1258028 :     copy_input_buf = cstate->input_buf;
    1216     1258028 :     input_buf_ptr = cstate->input_buf_index;
    1217     1258028 :     copy_buf_len = cstate->input_buf_len;
    1218             : 
    1219             :     for (;;)
    1220    25107062 :     {
    1221             :         int         prev_raw_ptr;
    1222             :         char        c;
    1223             : 
    1224             :         /*
    1225             :          * Load more data if needed.
    1226             :          *
    1227             :          * TODO: We could just force four bytes of read-ahead and avoid the
    1228             :          * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE().  That was
    1229             :          * unsafe with the old v2 COPY protocol, but we don't support that
    1230             :          * anymore.
    1231             :          */
    1232    26365090 :         if (input_buf_ptr >= copy_buf_len || need_data)
    1233             :         {
    1234      430738 :             REFILL_LINEBUF;
    1235             : 
    1236      430738 :             CopyLoadInputBuf(cstate);
    1237             :             /* update our local variables */
    1238      430726 :             hit_eof = cstate->input_reached_eof;
    1239      430726 :             input_buf_ptr = cstate->input_buf_index;
    1240      430726 :             copy_buf_len = cstate->input_buf_len;
    1241             : 
    1242             :             /*
    1243             :              * If we are completely out of data, break out of the loop,
    1244             :              * reporting EOF.
    1245             :              */
    1246      430726 :             if (INPUT_BUF_BYTES(cstate) <= 0)
    1247             :             {
    1248        1446 :                 result = true;
    1249        1446 :                 break;
    1250             :             }
    1251      429280 :             need_data = false;
    1252             :         }
    1253             : 
    1254             :         /* OK to fetch a character */
    1255    26363632 :         prev_raw_ptr = input_buf_ptr;
    1256    26363632 :         c = copy_input_buf[input_buf_ptr++];
    1257             : 
    1258    26363632 :         if (cstate->opts.csv_mode)
    1259             :         {
    1260             :             /*
    1261             :              * If character is '\r', we may need to look ahead below.  Force
    1262             :              * fetch of the next character if we don't already have it.  We
    1263             :              * need to do this before changing CSV state, in case '\r' is also
    1264             :              * the quote or escape character.
    1265             :              */
    1266        5418 :             if (c == '\r')
    1267             :             {
    1268          36 :                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
    1269             :             }
    1270             : 
    1271             :             /*
    1272             :              * Dealing with quotes and escapes here is mildly tricky. If the
    1273             :              * quote char is also the escape char, there's no problem - we
    1274             :              * just use the char as a toggle. If they are different, we need
    1275             :              * to ensure that we only take account of an escape inside a
    1276             :              * quoted field and immediately preceding a quote char, and not
    1277             :              * the second in an escape-escape sequence.
    1278             :              */
    1279        5418 :             if (in_quote && c == escapec)
    1280          48 :                 last_was_esc = !last_was_esc;
    1281        5418 :             if (c == quotec && !last_was_esc)
    1282         508 :                 in_quote = !in_quote;
    1283        5418 :             if (c != escapec)
    1284        5364 :                 last_was_esc = false;
    1285             : 
    1286             :             /*
    1287             :              * Updating the line count for embedded CR and/or LF chars is
    1288             :              * necessarily a little fragile - this test is probably about the
    1289             :              * best we can do.  (XXX it's arguable whether we should do this
    1290             :              * at all --- is cur_lineno a physical or logical count?)
    1291             :              */
    1292        5418 :             if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
    1293          36 :                 cstate->cur_lineno++;
    1294             :         }
    1295             : 
    1296             :         /* Process \r */
    1297    26363632 :         if (c == '\r' && (!cstate->opts.csv_mode || !in_quote))
    1298             :         {
    1299             :             /* Check for \r\n on first line, _and_ handle \r\n. */
    1300           0 :             if (cstate->eol_type == EOL_UNKNOWN ||
    1301           0 :                 cstate->eol_type == EOL_CRNL)
    1302             :             {
    1303             :                 /*
    1304             :                  * If need more data, go back to loop top to load it.
    1305             :                  *
    1306             :                  * Note that if we are at EOF, c will wind up as '\0' because
    1307             :                  * of the guaranteed pad of input_buf.
    1308             :                  */
    1309           0 :                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
    1310             : 
    1311             :                 /* get next char */
    1312           0 :                 c = copy_input_buf[input_buf_ptr];
    1313             : 
    1314           0 :                 if (c == '\n')
    1315             :                 {
    1316           0 :                     input_buf_ptr++;    /* eat newline */
    1317           0 :                     cstate->eol_type = EOL_CRNL; /* in case not set yet */
    1318             :                 }
    1319             :                 else
    1320             :                 {
    1321             :                     /* found \r, but no \n */
    1322           0 :                     if (cstate->eol_type == EOL_CRNL)
    1323           0 :                         ereport(ERROR,
    1324             :                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1325             :                                  !cstate->opts.csv_mode ?
    1326             :                                  errmsg("literal carriage return found in data") :
    1327             :                                  errmsg("unquoted carriage return found in data"),
    1328             :                                  !cstate->opts.csv_mode ?
    1329             :                                  errhint("Use \"\\r\" to represent carriage return.") :
    1330             :                                  errhint("Use quoted CSV field to represent carriage return.")));
    1331             : 
    1332             :                     /*
    1333             :                      * if we got here, it is the first line and we didn't find
    1334             :                      * \n, so don't consume the peeked character
    1335             :                      */
    1336           0 :                     cstate->eol_type = EOL_CR;
    1337             :                 }
    1338             :             }
    1339           0 :             else if (cstate->eol_type == EOL_NL)
    1340           0 :                 ereport(ERROR,
    1341             :                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1342             :                          !cstate->opts.csv_mode ?
    1343             :                          errmsg("literal carriage return found in data") :
    1344             :                          errmsg("unquoted carriage return found in data"),
    1345             :                          !cstate->opts.csv_mode ?
    1346             :                          errhint("Use \"\\r\" to represent carriage return.") :
    1347             :                          errhint("Use quoted CSV field to represent carriage return.")));
    1348             :             /* If reach here, we have found the line terminator */
    1349           0 :             break;
    1350             :         }
    1351             : 
    1352             :         /* Process \n */
    1353    26363632 :         if (c == '\n' && (!cstate->opts.csv_mode || !in_quote))
    1354             :         {
    1355     1256528 :             if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
    1356           0 :                 ereport(ERROR,
    1357             :                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1358             :                          !cstate->opts.csv_mode ?
    1359             :                          errmsg("literal newline found in data") :
    1360             :                          errmsg("unquoted newline found in data"),
    1361             :                          !cstate->opts.csv_mode ?
    1362             :                          errhint("Use \"\\n\" to represent newline.") :
    1363             :                          errhint("Use quoted CSV field to represent newline.")));
    1364     1256528 :             cstate->eol_type = EOL_NL;   /* in case not set yet */
    1365             :             /* If reach here, we have found the line terminator */
    1366     1256528 :             break;
    1367             :         }
    1368             : 
    1369             :         /*
    1370             :          * Process backslash, except in CSV mode where backslash is a normal
    1371             :          * character.
    1372             :          */
    1373    25107104 :         if (c == '\\' && !cstate->opts.csv_mode)
    1374             :         {
    1375             :             char        c2;
    1376             : 
    1377        8036 :             IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
    1378        8036 :             IF_NEED_REFILL_AND_EOF_BREAK(0);
    1379             : 
    1380             :             /* -----
    1381             :              * get next character
    1382             :              * Note: we do not change c so if it isn't \., we can fall
    1383             :              * through and continue processing.
    1384             :              * -----
    1385             :              */
    1386        8036 :             c2 = copy_input_buf[input_buf_ptr];
    1387             : 
    1388        8036 :             if (c2 == '.')
    1389             :             {
    1390          42 :                 input_buf_ptr++;    /* consume the '.' */
    1391          42 :                 if (cstate->eol_type == EOL_CRNL)
    1392             :                 {
    1393             :                     /* Get the next character */
    1394           0 :                     IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
    1395             :                     /* if hit_eof, c2 will become '\0' */
    1396           0 :                     c2 = copy_input_buf[input_buf_ptr++];
    1397             : 
    1398           0 :                     if (c2 == '\n')
    1399           0 :                         ereport(ERROR,
    1400             :                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1401             :                                  errmsg("end-of-copy marker does not match previous newline style")));
    1402           0 :                     else if (c2 != '\r')
    1403           0 :                         ereport(ERROR,
    1404             :                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1405             :                                  errmsg("end-of-copy marker is not alone on its line")));
    1406             :                 }
    1407             : 
    1408             :                 /* Get the next character */
    1409          42 :                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
    1410             :                 /* if hit_eof, c2 will become '\0' */
    1411          42 :                 c2 = copy_input_buf[input_buf_ptr++];
    1412             : 
    1413          42 :                 if (c2 != '\r' && c2 != '\n')
    1414           6 :                     ereport(ERROR,
    1415             :                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1416             :                              errmsg("end-of-copy marker is not alone on its line")));
    1417             : 
    1418          36 :                 if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
    1419          36 :                     (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
    1420          36 :                     (cstate->eol_type == EOL_CR && c2 != '\r'))
    1421           0 :                     ereport(ERROR,
    1422             :                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1423             :                              errmsg("end-of-copy marker does not match previous newline style")));
    1424             : 
    1425             :                 /*
    1426             :                  * If there is any data on this line before the \., complain.
    1427             :                  */
    1428          36 :                 if (cstate->line_buf.len > 0 ||
    1429          36 :                     prev_raw_ptr > cstate->input_buf_index)
    1430           6 :                     ereport(ERROR,
    1431             :                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1432             :                              errmsg("end-of-copy marker is not alone on its line")));
    1433             : 
    1434             :                 /*
    1435             :                  * Discard the \. and newline, then report EOF.
    1436             :                  */
    1437          30 :                 cstate->input_buf_index = input_buf_ptr;
    1438          30 :                 result = true;  /* report EOF */
    1439          30 :                 break;
    1440             :             }
    1441             :             else
    1442             :             {
    1443             :                 /*
    1444             :                  * If we are here, it means we found a backslash followed by
    1445             :                  * something other than a period.  In non-CSV mode, anything
    1446             :                  * after a backslash is special, so we skip over that second
    1447             :                  * character too.  If we didn't do that \\. would be
    1448             :                  * considered an end-of-copy marker, while in non-CSV mode it
    1449             :                  * is a literal backslash followed by a period.
    1450             :                  */
    1451        7994 :                 input_buf_ptr++;
    1452             :             }
    1453             :         }
    1454             :     }                           /* end of outer loop */
    1455             : 
    1456             :     /*
    1457             :      * Transfer any still-uncopied data to line_buf.
    1458             :      */
    1459     1258004 :     REFILL_LINEBUF;
    1460             : 
    1461     1258004 :     return result;
    1462             : }
    1463             : 
    1464             : /*
    1465             :  *  Return decimal value for a hexadecimal digit
                     :  *
                     :  *  No validation is done here: any character that is not a decimal
                     :  *  digit is treated as a letter and mapped relative to 'a' after
                     :  *  lower-casing, so the result is only meaningful for 0-9, a-f and
                     :  *  A-F.  Callers are expected to have checked the character (for
                     :  *  example with isxdigit()) beforehand.
                     :  *
                     :  *  hex is cast to unsigned char before the <ctype.h> calls because
                     :  *  passing a negative char value to those functions is undefined.
    1466             :  */
    1467             : static int
    1468           0 : GetDecimalFromHex(char hex)
    1469             : {
    1470           0 :     if (isdigit((unsigned char) hex))
    1471           0 :         return hex - '0';
    1472             :     else
    1473           0 :         return tolower((unsigned char) hex) - 'a' + 10;
    1474             : }
    1475             : 
    1476             : /*
    1477             :  * Parse the current line into separate attributes (fields),
    1478             :  * performing de-escaping as needed.
    1479             :  *
    1480             :  * The input is in line_buf.  We use attribute_buf to hold the result
    1481             :  * strings.  cstate->raw_fields[k] is set to point to the k'th attribute
    1482             :  * string, or NULL when the input matches the null marker string.
    1483             :  * This array is expanded as necessary.
    1484             :  *
    1485             :  * (Note that the caller cannot check for nulls since the returned
    1486             :  * string would be the post-de-escaping equivalent, which may look
    1487             :  * the same as some valid data string.)
    1488             :  *
    1489             :  * delim is the column delimiter string (must be just one byte for now).
    1490             :  * null_print is the null marker string.  Note that this is compared to
    1491             :  * the pre-de-escaped input string.
    1492             :  *
    1493             :  * The return value is the number of fields actually read.
    1494             :  */
    1495             : static int
    1496     1256010 : CopyReadAttributesText(CopyFromState cstate)
    1497             : {
    1498     1256010 :     char        delimc = cstate->opts.delim[0];
    1499             :     int         fieldno;
    1500             :     char       *output_ptr;
    1501             :     char       *cur_ptr;
    1502             :     char       *line_end_ptr;
    1503             : 
    1504             :     /*
    1505             :      * We need a special case for zero-column tables: check that the input
    1506             :      * line is empty, and return.
    1507             :      */
    1508     1256010 :     if (cstate->max_fields <= 0)
    1509             :     {
    1510           8 :         if (cstate->line_buf.len != 0)
    1511           0 :             ereport(ERROR,
    1512             :                     (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1513             :                      errmsg("extra data after last expected column")));
    1514           8 :         return 0;
    1515             :     }
    1516             : 
    1517     1256002 :     resetStringInfo(&cstate->attribute_buf);
    1518             : 
    1519             :     /*
    1520             :      * The de-escaped attributes will certainly not be longer than the input
    1521             :      * data line, so we can just force attribute_buf to be large enough and
    1522             :      * then transfer data without any checks for enough space.  We need to do
    1523             :      * it this way because enlarging attribute_buf mid-stream would invalidate
    1524             :      * pointers already stored into cstate->raw_fields[].
    1525             :      */
    1526     1256002 :     if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
    1527           8 :         enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
    1528     1256002 :     output_ptr = cstate->attribute_buf.data;
    1529             : 
    1530             :     /* set pointer variables for loop */
    1531     1256002 :     cur_ptr = cstate->line_buf.data;
    1532     1256002 :     line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
    1533             : 
    1534             :     /* Outer loop iterates over fields */
    1535     1256002 :     fieldno = 0;
    1536             :     for (;;)
    1537     3217432 :     {
    1538     4473434 :         bool        found_delim = false;
    1539             :         char       *start_ptr;
    1540             :         char       *end_ptr;
    1541             :         int         input_len;
    1542     4473434 :         bool        saw_non_ascii = false;
    1543             : 
    1544             :         /* Make sure there is enough space for the next value */
    1545     4473434 :         if (fieldno >= cstate->max_fields)
    1546             :         {
    1547          36 :             cstate->max_fields *= 2;
    1548          36 :             cstate->raw_fields =
    1549          36 :                 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
    1550             :         }
    1551             : 
    1552             :         /* Remember start of field on both input and output sides */
    1553     4473434 :         start_ptr = cur_ptr;
    1554     4473434 :         cstate->raw_fields[fieldno] = output_ptr;
    1555             : 
    1556             :         /*
    1557             :          * Scan data for field.
    1558             :          *
    1559             :          * Note that in this loop, we are scanning to locate the end of field
    1560             :          * and also speculatively performing de-escaping.  Once we find the
    1561             :          * end-of-field, we can match the raw field contents against the null
    1562             :          * marker string.  Only after that comparison fails do we know that
    1563             :          * de-escaping is actually the right thing to do; therefore we *must
    1564             :          * not* throw any syntax errors before we've done the null-marker
    1565             :          * check.
    1566             :          */
    1567             :         for (;;)
    1568    21884214 :         {
    1569             :             char        c;
    1570             : 
    1571    26357648 :             end_ptr = cur_ptr;
    1572    26357648 :             if (cur_ptr >= line_end_ptr)
    1573     1255996 :                 break;
    1574    25101652 :             c = *cur_ptr++;
    1575    25101652 :             if (c == delimc)
    1576             :             {
    1577     3217438 :                 found_delim = true;
    1578     3217438 :                 break;
    1579             :             }
    1580    21884214 :             if (c == '\\')
    1581             :             {
    1582        7994 :                 if (cur_ptr >= line_end_ptr)
    1583           0 :                     break;
    1584        7994 :                 c = *cur_ptr++;
    1585        7994 :                 switch (c)
    1586             :                 {
    1587          12 :                     case '0':
    1588             :                     case '1':
    1589             :                     case '2':
    1590             :                     case '3':
    1591             :                     case '4':
    1592             :                     case '5':
    1593             :                     case '6':
    1594             :                     case '7':
    1595             :                         {
    1596             :                             /* handle \013 */
    1597             :                             int         val;
    1598             : 
    1599          12 :                             val = OCTVALUE(c);
    1600          12 :                             if (cur_ptr < line_end_ptr)
    1601             :                             {
    1602           6 :                                 c = *cur_ptr;
    1603           6 :                                 if (ISOCTAL(c))
    1604             :                                 {
    1605           0 :                                     cur_ptr++;
    1606           0 :                                     val = (val << 3) + OCTVALUE(c);
    1607           0 :                                     if (cur_ptr < line_end_ptr)
    1608             :                                     {
    1609           0 :                                         c = *cur_ptr;
    1610           0 :                                         if (ISOCTAL(c))
    1611             :                                         {
    1612           0 :                                             cur_ptr++;
    1613           0 :                                             val = (val << 3) + OCTVALUE(c);
    1614             :                                         }
    1615             :                                     }
    1616             :                                 }
    1617             :                             }
    1618          12 :                             c = val & 0377;
    1619          12 :                             if (c == '\0' || IS_HIGHBIT_SET(c))
    1620          12 :                                 saw_non_ascii = true;
    1621             :                         }
    1622          12 :                         break;
    1623          12 :                     case 'x':
    1624             :                         /* Handle \x3F */
    1625          12 :                         if (cur_ptr < line_end_ptr)
    1626             :                         {
    1627           6 :                             char        hexchar = *cur_ptr;
    1628             : 
    1629           6 :                             if (isxdigit((unsigned char) hexchar))
    1630             :                             {
    1631           0 :                                 int         val = GetDecimalFromHex(hexchar);
    1632             : 
    1633           0 :                                 cur_ptr++;
    1634           0 :                                 if (cur_ptr < line_end_ptr)
    1635             :                                 {
    1636           0 :                                     hexchar = *cur_ptr;
    1637           0 :                                     if (isxdigit((unsigned char) hexchar))
    1638             :                                     {
    1639           0 :                                         cur_ptr++;
    1640           0 :                                         val = (val << 4) + GetDecimalFromHex(hexchar);
    1641             :                                     }
    1642             :                                 }
    1643           0 :                                 c = val & 0xff;
    1644           0 :                                 if (c == '\0' || IS_HIGHBIT_SET(c))
    1645           0 :                                     saw_non_ascii = true;
    1646             :                             }
    1647             :                         }
    1648          12 :                         break;
    1649           0 :                     case 'b':
    1650           0 :                         c = '\b';
    1651           0 :                         break;
    1652           0 :                     case 'f':
    1653           0 :                         c = '\f';
    1654           0 :                         break;
    1655        3050 :                     case 'n':
    1656        3050 :                         c = '\n';
    1657        3050 :                         break;
    1658           0 :                     case 'r':
    1659           0 :                         c = '\r';
    1660           0 :                         break;
    1661           0 :                     case 't':
    1662           0 :                         c = '\t';
    1663           0 :                         break;
    1664           0 :                     case 'v':
    1665           0 :                         c = '\v';
    1666           0 :                         break;
    1667             : 
    1668             :                         /*
    1669             :                          * in all other cases, take the char after '\'
    1670             :                          * literally
    1671             :                          */
    1672             :                 }
    1673    21876220 :             }
    1674             : 
    1675             :             /* Add c to output string */
    1676    21884214 :             *output_ptr++ = c;
    1677             :         }
    1678             : 
    1679             :         /* Check whether raw input matched null marker */
    1680     4473434 :         input_len = end_ptr - start_ptr;
    1681     4473434 :         if (input_len == cstate->opts.null_print_len &&
    1682      245764 :             strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
    1683        4808 :             cstate->raw_fields[fieldno] = NULL;
    1684             :         /* Check whether raw input matched default marker */
    1685     4468626 :         else if (fieldno < list_length(cstate->attnumlist) &&
    1686     4468584 :                  cstate->opts.default_print &&
    1687         114 :                  input_len == cstate->opts.default_print_len &&
    1688          30 :                  strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
    1689          24 :         {
    1690             :             /* fieldno is 0-indexed and attnum is 1-indexed */
    1691          30 :             int         m = list_nth_int(cstate->attnumlist, fieldno) - 1;
    1692             : 
    1693          30 :             if (cstate->defexprs[m] != NULL)
    1694             :             {
    1695             :                 /* defaults contain entries for all physical attributes */
    1696          24 :                 cstate->defaults[m] = true;
    1697             :             }
    1698             :             else
    1699             :             {
    1700           6 :                 TupleDesc   tupDesc = RelationGetDescr(cstate->rel);
    1701           6 :                 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
    1702             : 
    1703           6 :                 ereport(ERROR,
    1704             :                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1705             :                          errmsg("unexpected default marker in COPY data"),
    1706             :                          errdetail("Column \"%s\" has no default value.",
    1707             :                                    NameStr(att->attname))));
    1708             :             }
    1709             :         }
    1710             :         else
    1711             :         {
    1712             :             /*
    1713             :              * At this point we know the field is supposed to contain data.
    1714             :              *
    1715             :              * If we de-escaped any non-7-bit-ASCII chars, make sure the
    1716             :              * resulting string is valid data for the db encoding.
    1717             :              */
    1718     4468596 :             if (saw_non_ascii)
    1719             :             {
    1720           0 :                 char       *fld = cstate->raw_fields[fieldno];
    1721             : 
    1722           0 :                 pg_verifymbstr(fld, output_ptr - fld, false);
    1723             :             }
    1724             :         }
    1725             : 
    1726             :         /* Terminate attribute value in output area */
    1727     4473428 :         *output_ptr++ = '\0';
    1728             : 
    1729     4473428 :         fieldno++;
    1730             :         /* Done if we hit EOL instead of a delim */
    1731     4473428 :         if (!found_delim)
    1732     1255996 :             break;
    1733             :     }
    1734             : 
    1735             :     /* Clean up state of attribute_buf */
    1736     1255996 :     output_ptr--;
    1737             :     Assert(*output_ptr == '\0');
    1738     1255996 :     cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
    1739             : 
    1740     1255996 :     return fieldno;
    1741             : }
    1742             : 
    1743             : /*
    1744             :  * Parse the current line (cstate->line_buf) into separate attributes
    1745             :  * (fields) per "standard" (i.e. common) CSV usage, de-escaping as needed.
    1746             :  * This has exactly the same API as CopyReadAttributesText: it returns the
    1747             :  * number of fields and leaves raw_fields[] pointing into attribute_buf.
    1748             :  */
    1749             : static int
    1750         474 : CopyReadAttributesCSV(CopyFromState cstate)
    1751             : {
    1752         474 :     char        delimc = cstate->opts.delim[0];
    1753         474 :     char        quotec = cstate->opts.quote[0];
    1754         474 :     char        escapec = cstate->opts.escape[0];
    1755             :     int         fieldno;
    1756             :     char       *output_ptr;
    1757             :     char       *cur_ptr;
    1758             :     char       *line_end_ptr;
    1759             : 
    1760             :     /*
    1761             :      * We need a special case for zero-column tables: check that the input
    1762             :      * line is empty, and return.
    1763             :      */
    1764         474 :     if (cstate->max_fields <= 0)
    1765             :     {
    1766           0 :         if (cstate->line_buf.len != 0)
    1767           0 :             ereport(ERROR,
    1768             :                     (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1769             :                      errmsg("extra data after last expected column")));
    1770           0 :         return 0;
    1771             :     }
    1772             : 
    1773         474 :     resetStringInfo(&cstate->attribute_buf);
    1774             : 
    1775             :     /*
    1776             :      * The de-escaped attributes will certainly not be longer than the input
    1777             :      * data line, so we can just force attribute_buf to be large enough and
    1778             :      * then transfer data without any checks for enough space.  We need to do
    1779             :      * it this way because enlarging attribute_buf mid-stream would invalidate
    1780             :      * pointers already stored into cstate->raw_fields[].
    1781             :      */
    1782         474 :     if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
    1783           0 :         enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
    1784         474 :     output_ptr = cstate->attribute_buf.data;
    1785             : 
    1786             :     /* set pointer variables for loop */
    1787         474 :     cur_ptr = cstate->line_buf.data;
    1788         474 :     line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
    1789             : 
    1790             :     /* Outer loop iterates over fields */
    1791         474 :     fieldno = 0;
    1792             :     for (;;)
    1793         530 :     {
    1794        1004 :         bool        found_delim = false;
    1795        1004 :         bool        saw_quote = false;
    1796             :         char       *start_ptr;
    1797             :         char       *end_ptr;
    1798             :         int         input_len;
    1799             : 
    1800             :         /* Make sure raw_fields[] has room for the next value; double it if not */
    1801        1004 :         if (fieldno >= cstate->max_fields)
    1802             :         {
    1803           0 :             cstate->max_fields *= 2;
    1804           0 :             cstate->raw_fields =
    1805           0 :                 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
    1806             :         }
    1807             : 
    1808             :         /* Remember start of field on both input and output sides */
    1809        1004 :         start_ptr = cur_ptr;
    1810        1004 :         cstate->raw_fields[fieldno] = output_ptr;
    1811             : 
    1812             :         /*
    1813             :          * Scan data for field.
    1814             :          *
    1815             :          * The loop starts in "not quote" mode and then toggles between that
    1816             :          * and "in quote" mode. The loop exits normally if it is in "not
    1817             :          * quote" mode and a delimiter or line end is seen.
    1818             :          */
    1819             :         for (;;)
    1820         222 :         {
    1821             :             char        c;
    1822             : 
    1823             :             /* Not in quote */
    1824             :             for (;;)
    1825             :             {
    1826        3230 :                 end_ptr = cur_ptr;	/* raw field ends before the next char read */
    1827        3230 :                 if (cur_ptr >= line_end_ptr)
    1828         468 :                     goto endfield;
    1829        2762 :                 c = *cur_ptr++;
    1830             :                 /* unquoted field delimiter */
    1831        2762 :                 if (c == delimc)
    1832             :                 {
    1833         536 :                     found_delim = true;
    1834         536 :                     goto endfield;
    1835             :                 }
    1836             :                 /* start of quoted field (or part of field) */
    1837        2226 :                 if (c == quotec)
    1838             :                 {
    1839         222 :                     saw_quote = true;
    1840         222 :                     break;
    1841             :                 }
    1842             :                 /* Add c to output string */
    1843        2004 :                 *output_ptr++ = c;
    1844             :             }
    1845             : 
    1846             :             /* In quote */
    1847             :             for (;;)
    1848             :             {
    1849        1390 :                 end_ptr = cur_ptr;
    1850        1390 :                 if (cur_ptr >= line_end_ptr)
    1851           0 :                     ereport(ERROR,
    1852             :                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1853             :                              errmsg("unterminated CSV quoted field")));
    1854             : 
    1855        1390 :                 c = *cur_ptr++;
    1856             : 
    1857             :                 /* escape within a quoted field */
    1858        1390 :                 if (c == escapec)
    1859             :                 {
    1860             :                     /*
    1861             :                      * peek at the next char if available, and escape it if it
    1862             :                      * is an escape char or a quote char
    1863             :                      */
    1864         118 :                     if (cur_ptr < line_end_ptr)
    1865             :                     {
    1866          72 :                         char        nextc = *cur_ptr;
    1867             : 
    1868          72 :                         if (nextc == escapec || nextc == quotec)
    1869             :                         {
    1870          24 :                             *output_ptr++ = nextc;
    1871          24 :                             cur_ptr++;
    1872          24 :                             continue;
    1873             :                         }
    1874             :                     }
    1875             :                 }
    1876             : 
    1877             :                 /*
    1878             :                  * end of quoted field. Must do this test after testing for
    1879             :                  * escape in case quote char and escape char are the same
    1880             :                  * (which is the common case).
    1881             :                  */
    1882        1366 :                 if (c == quotec)
    1883         222 :                     break;
    1884             : 
    1885             :                 /* Add c to output string */
    1886        1144 :                 *output_ptr++ = c;
    1887             :             }
    1888             :         }
    1889        1004 : endfield:
    1890             : 
    1891             :         /* Terminate attribute value in output area */
    1892        1004 :         *output_ptr++ = '\0';
    1893             : 
    1894             :         /* Check whether raw input matched null marker (unquoted fields only) */
    1895        1004 :         input_len = end_ptr - start_ptr;
    1896        1004 :         if (!saw_quote && input_len == cstate->opts.null_print_len &&
    1897          44 :             strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
    1898          44 :             cstate->raw_fields[fieldno] = NULL;
    1899             :         /* Check whether raw input matched default marker */
    1900         960 :         else if (fieldno < list_length(cstate->attnumlist) &&
    1901         960 :                  cstate->opts.default_print &&
    1902         150 :                  input_len == cstate->opts.default_print_len &&
    1903          42 :                  strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
    1904             :         {
    1905             :             /* fieldno is 0-indexed and attnum is 1-indexed */
    1906          42 :             int         m = list_nth_int(cstate->attnumlist, fieldno) - 1;
    1907             : 
    1908          42 :             if (cstate->defexprs[m] != NULL)
    1909             :             {
    1910             :                 /* defaults contain entries for all physical attributes */
    1911          36 :                 cstate->defaults[m] = true;
    1912             :             }
    1913             :             else
    1914             :             {
    1915           6 :                 TupleDesc   tupDesc = RelationGetDescr(cstate->rel);
    1916           6 :                 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
    1917             : 
    1918           6 :                 ereport(ERROR,
    1919             :                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1920             :                          errmsg("unexpected default marker in COPY data"),
    1921             :                          errdetail("Column \"%s\" has no default value.",
    1922             :                                    NameStr(att->attname))));
    1923             :             }
    1924             :         }
    1925             : 
    1926         998 :         fieldno++;
    1927             :         /* Done if we hit EOL instead of a delim */
    1928         998 :         if (!found_delim)
    1929         468 :             break;
    1930             :     }
    1931             : 
    1932             :     /* Clean up state of attribute_buf: back up over the last terminator so len excludes it */
    1933         468 :     output_ptr--;
    1934             :     Assert(*output_ptr == '\0');
    1935         468 :     cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
    1936             : 
    1937         468 :     return fieldno;
    1938             : }
    1939             : 
    1940             : 
    1941             : /*
    1942             :  * Read a binary attribute: an int32 length word (-1 means NULL) followed by that many data bytes; sets *isnull and returns the receive function's result
    1943             :  */
    1944             : static Datum
    1945         158 : CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
    1946             :                         Oid typioparam, int32 typmod,
    1947             :                         bool *isnull)
    1948             : {
    1949             :     int32       fld_size;
    1950             :     Datum       result;
    1951             : 
    1952         158 :     if (!CopyGetInt32(cstate, &fld_size))
    1953           0 :         ereport(ERROR,
    1954             :                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1955             :                  errmsg("unexpected EOF in COPY data")));
    1956         158 :     if (fld_size == -1)	/* length word of -1 marks a NULL field; no data bytes follow */
    1957             :     {
    1958          30 :         *isnull = true;
    1959          30 :         return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);	/* receive function is still invoked, with a NULL buffer */
    1960             :     }
    1961         128 :     if (fld_size < 0)	/* any other negative length is malformed input */
    1962           0 :         ereport(ERROR,
    1963             :                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1964             :                  errmsg("invalid field size")));
    1965             : 
    1966             :     /* reset attribute_buf to empty, and load raw data in it */
    1967         128 :     resetStringInfo(&cstate->attribute_buf);
    1968             : 
    1969         128 :     enlargeStringInfo(&cstate->attribute_buf, fld_size);
    1970         128 :     if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
    1971         128 :                            fld_size) != fld_size)
    1972           0 :         ereport(ERROR,
    1973             :                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
    1974             :                  errmsg("unexpected EOF in COPY data")));
    1975             : 
    1976         128 :     cstate->attribute_buf.len = fld_size;
    1977         128 :     cstate->attribute_buf.data[fld_size] = '\0';	/* terminator goes just past the field data */
    1978             : 
    1979             :     /* Call the column type's binary input converter */
    1980         128 :     result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
    1981             :                                  typioparam, typmod);
    1982             : 
    1983             :     /* Error out if the converter did not consume the whole field (cursor must reach len) */
    1984         128 :     if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
    1985           2 :         ereport(ERROR,
    1986             :                 (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
    1987             :                  errmsg("incorrect binary data format")));
    1988             : 
    1989         126 :     *isnull = false;
    1990         126 :     return result;
    1991             : }

Generated by: LCOV version 1.14