LCOV - code coverage report
Current view: top level - src/fe_utils - mbprint.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 135 176 76.7 %
Date: 2024-11-21 08:14:44 Functions: 7 8 87.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * Multibyte character printing support for frontend code
       4             :  *
       5             :  *
       6             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  * src/fe_utils/mbprint.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "postgres_fe.h"
      14             : 
      15             : #include "fe_utils/mbprint.h"
      16             : 
      17             : #include "libpq-fe.h"
      18             : 
      19             : 
      20             : /*
      21             :  * To avoid version-skew problems, this file must not use declarations
      22             :  * from pg_wchar.h: the encoding IDs we are dealing with are determined
      23             :  * by the libpq.so we are linked with, and that might not match the
      24             :  * numbers we see at compile time.  (If this file were inside libpq,
      25             :  * the problem would go away...)
      26             :  *
      27             :  * Hence, we have our own definition of pg_wchar, and we get the values
      28             :  * of any needed encoding IDs on-the-fly.
      29             :  */
      30             : 
      31             : typedef unsigned int pg_wchar;
      32             : 
      33             : static int
      34     4903756 : pg_get_utf8_id(void)
      35             : {
      36             :     static int  utf8_id = -1;
      37             : 
      38     4903756 :     if (utf8_id < 0)
      39       13170 :         utf8_id = pg_char_to_encoding("utf8");
      40     4903756 :     return utf8_id;
      41             : }
      42             : 
      43             : #define PG_UTF8     pg_get_utf8_id()
      44             : 
      45             : 
      46             : /*
      47             :  * Convert a UTF-8 character to a Unicode code point.
      48             :  * This is a one-character version of pg_utf2wchar_with_len.
      49             :  *
      50             :  * No error checks here, c must point to a long-enough string.
      51             :  */
      52             : static pg_wchar
      53           0 : utf8_to_unicode(const unsigned char *c)
      54             : {
      55           0 :     if ((*c & 0x80) == 0)
      56           0 :         return (pg_wchar) c[0];
      57           0 :     else if ((*c & 0xe0) == 0xc0)
      58           0 :         return (pg_wchar) (((c[0] & 0x1f) << 6) |
      59           0 :                            (c[1] & 0x3f));
      60           0 :     else if ((*c & 0xf0) == 0xe0)
      61           0 :         return (pg_wchar) (((c[0] & 0x0f) << 12) |
      62           0 :                            ((c[1] & 0x3f) << 6) |
      63           0 :                            (c[2] & 0x3f));
      64           0 :     else if ((*c & 0xf8) == 0xf0)
      65           0 :         return (pg_wchar) (((c[0] & 0x07) << 18) |
      66           0 :                            ((c[1] & 0x3f) << 12) |
      67           0 :                            ((c[2] & 0x3f) << 6) |
      68           0 :                            (c[3] & 0x3f));
      69             :     else
      70             :         /* that is an invalid code on purpose */
      71           0 :         return 0xffffffff;
      72             : }
      73             : 
      74             : 
      75             : /*
      76             :  * Unicode 3.1 compliant validation : for each category, it checks the
      77             :  * combination of each byte to make sure it maps to a valid range. It also
      78             :  * returns -1 for the following UCS values: ucs > 0x10ffff ucs & 0xfffe =
      79             :  * 0xfffe 0xfdd0 < ucs < 0xfdef ucs & 0xdb00 = 0xd800 (surrogates)
      80             :  */
      81             : static int
      82    18350148 : utf_charcheck(const unsigned char *c)
      83             : {
      84    18350148 :     if ((*c & 0x80) == 0)
      85    18348820 :         return 1;
      86        1328 :     else if ((*c & 0xe0) == 0xc0)
      87             :     {
      88             :         /* two-byte char */
      89        1148 :         if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
      90        1148 :             return 2;
      91           0 :         return -1;
      92             :     }
      93         180 :     else if ((*c & 0xf0) == 0xe0)
      94             :     {
      95             :         /* three-byte char */
      96         156 :         if (((c[1] & 0xc0) == 0x80) &&
      97         156 :             (((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
      98         156 :             ((c[2] & 0xc0) == 0x80))
      99             :         {
     100         156 :             int         z = c[0] & 0x0f;
     101         156 :             int         yx = ((c[1] & 0x3f) << 6) | (c[0] & 0x3f);
     102         156 :             int         lx = yx & 0x7f;
     103             : 
     104             :             /* check 0xfffe/0xffff, 0xfdd0..0xfedf range, surrogates */
     105         156 :             if (((z == 0x0f) &&
     106           0 :                  (((yx & 0xffe) == 0xffe) ||
     107         156 :                   (((yx & 0xf80) == 0xd80) && (lx >= 0x30) && (lx <= 0x4f)))) ||
     108           0 :                 ((z == 0x0d) && ((yx & 0xb00) == 0x800)))
     109           0 :                 return -1;
     110         156 :             return 3;
     111             :         }
     112           0 :         return -1;
     113             :     }
     114          24 :     else if ((*c & 0xf8) == 0xf0)
     115             :     {
     116          24 :         int         u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);
     117             : 
     118             :         /* four-byte char */
     119          24 :         if (((c[1] & 0xc0) == 0x80) &&
     120          24 :             (u > 0x00) && (u <= 0x10) &&
     121          24 :             ((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
     122             :         {
     123             :             /* test for 0xzzzzfffe/0xzzzzfffff */
     124          24 :             if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
     125           0 :                 ((c[3] & 0x3e) == 0x3e))
     126           0 :                 return -1;
     127          24 :             return 4;
     128             :         }
     129           0 :         return -1;
     130             :     }
     131           0 :     return -1;
     132             : }
     133             : 
     134             : 
     135             : static void
     136     4900600 : mb_utf_validate(unsigned char *pwcs)
     137             : {
     138     4900600 :     unsigned char *p = pwcs;
     139             : 
     140    23250748 :     while (*pwcs)
     141             :     {
     142             :         int         len;
     143             : 
     144    18350148 :         if ((len = utf_charcheck(pwcs)) > 0)
     145             :         {
     146    18350148 :             if (p != pwcs)
     147             :             {
     148             :                 int         i;
     149             : 
     150           0 :                 for (i = 0; i < len; i++)
     151           0 :                     *p++ = *pwcs++;
     152             :             }
     153             :             else
     154             :             {
     155    18350148 :                 pwcs += len;
     156    18350148 :                 p += len;
     157             :             }
     158             :         }
     159             :         else
     160             :             /* we skip the char */
     161           0 :             pwcs++;
     162             :     }
     163     4900600 :     if (p != pwcs)
     164           0 :         *p = '\0';
     165     4900600 : }
     166             : 
     167             : /*
     168             :  * public functions : wcswidth and mbvalidate
     169             :  */
     170             : 
     171             : /*
     172             :  * pg_wcswidth is the dumb display-width function.
     173             :  * It assumes that everything will appear on one line.
     174             :  * OTOH it is easier to use than pg_wcssize if this applies to you.
     175             :  */
     176             : int
     177        4138 : pg_wcswidth(const char *pwcs, size_t len, int encoding)
     178             : {
     179        4138 :     int         width = 0;
     180             : 
     181       42026 :     while (len > 0)
     182             :     {
     183             :         int         chlen,
     184             :                     chwidth;
     185             : 
     186       37888 :         chlen = PQmblen(pwcs, encoding);
     187       37888 :         if (len < (size_t) chlen)
     188           0 :             break;              /* Invalid string */
     189             : 
     190       37888 :         chwidth = PQdsplen(pwcs, encoding);
     191       37888 :         if (chwidth > 0)
     192       37888 :             width += chwidth;
     193             : 
     194       37888 :         pwcs += chlen;
     195       37888 :         len -= chlen;
     196             :     }
     197        4138 :     return width;
     198             : }
     199             : 
     200             : /*
     201             :  * pg_wcssize takes the given string in the given encoding and returns three
     202             :  * values:
     203             :  *    result_width: Width in display characters of the longest line in string
     204             :  *    result_height: Number of lines in display output
     205             :  *    result_format_size: Number of bytes required to store formatted
     206             :  *      representation of string
     207             :  *
     208             :  * This MUST be kept in sync with pg_wcsformat!
     209             :  */
     210             : void
     211     2320138 : pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
     212             :            int *result_width, int *result_height, int *result_format_size)
     213             : {
     214             :     int         w,
     215     2320138 :                 chlen = 0,
     216     2320138 :                 linewidth = 0;
     217     2320138 :     int         width = 0;
     218     2320138 :     int         height = 1;
     219     2320138 :     int         format_size = 0;
     220             : 
     221    27733908 :     for (; *pwcs && len > 0; pwcs += chlen)
     222             :     {
     223    25413770 :         chlen = PQmblen((const char *) pwcs, encoding);
     224    25413770 :         if (len < (size_t) chlen)
     225           0 :             break;
     226    25413770 :         w = PQdsplen((const char *) pwcs, encoding);
     227             : 
     228    25413770 :         if (chlen == 1)         /* single-byte char */
     229             :         {
     230    25411114 :             if (*pwcs == '\n')  /* Newline */
     231             :             {
     232       37442 :                 if (linewidth > width)
     233        9020 :                     width = linewidth;
     234       37442 :                 linewidth = 0;
     235       37442 :                 height += 1;
     236       37442 :                 format_size += 1;   /* For NUL char */
     237             :             }
     238    25373672 :             else if (*pwcs == '\r') /* Linefeed */
     239             :             {
     240          16 :                 linewidth += 2;
     241          16 :                 format_size += 2;
     242             :             }
     243    25373656 :             else if (*pwcs == '\t') /* Tab */
     244             :             {
     245             :                 do
     246             :                 {
     247        3036 :                     linewidth++;
     248        3036 :                     format_size++;
     249        3036 :                 } while (linewidth % 8 != 0);
     250             :             }
     251    25373268 :             else if (w < 0)      /* Other control char */
     252             :             {
     253         144 :                 linewidth += 4;
     254         144 :                 format_size += 4;
     255             :             }
     256             :             else                /* Output it as-is */
     257             :             {
     258    25373124 :                 linewidth += w;
     259    25373124 :                 format_size += 1;
     260             :             }
     261             :         }
     262        2656 :         else if (w < 0)          /* Non-ascii control char */
     263             :         {
     264           0 :             linewidth += 6;     /* \u0000 */
     265           0 :             format_size += 6;
     266             :         }
     267             :         else                    /* All other chars */
     268             :         {
     269        2656 :             linewidth += w;
     270        2656 :             format_size += chlen;
     271             :         }
     272    25413770 :         len -= chlen;
     273             :     }
     274     2320138 :     if (linewidth > width)
     275     2144802 :         width = linewidth;
     276     2320138 :     format_size += 1;           /* For NUL char */
     277             : 
     278             :     /* Set results */
     279     2320138 :     if (result_width)
     280     2320138 :         *result_width = width;
     281     2320138 :     if (result_height)
     282     2320138 :         *result_height = height;
     283     2320138 :     if (result_format_size)
     284     2314620 :         *result_format_size = format_size;
     285     2320138 : }
     286             : 
     287             : /*
     288             :  *  Format a string into one or more "struct lineptr" lines.
     289             :  *  lines[i].ptr == NULL indicates the end of the array.
     290             :  *
     291             :  * This MUST be kept in sync with pg_wcssize!
     292             :  */
     293             : void
     294     1280168 : pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
     295             :              struct lineptr *lines, int count)
     296             : {
     297             :     int         w,
     298     1280168 :                 chlen = 0;
     299     1280168 :     int         linewidth = 0;
     300     1280168 :     unsigned char *ptr = lines->ptr; /* Pointer to data area */
     301             : 
     302    14835588 :     for (; *pwcs && len > 0; pwcs += chlen)
     303             :     {
     304    13555420 :         chlen = PQmblen((const char *) pwcs, encoding);
     305    13555420 :         if (len < (size_t) chlen)
     306           0 :             break;
     307    13555420 :         w = PQdsplen((const char *) pwcs, encoding);
     308             : 
     309    13555420 :         if (chlen == 1)         /* single-byte char */
     310             :         {
     311    13554092 :             if (*pwcs == '\n')  /* Newline */
     312             :             {
     313       19954 :                 *ptr++ = '\0';
     314       19954 :                 lines->width = linewidth;
     315       19954 :                 linewidth = 0;
     316       19954 :                 lines++;
     317       19954 :                 count--;
     318       19954 :                 if (count <= 0)
     319           0 :                     exit(1);    /* Screwup */
     320             : 
     321             :                 /* make next line point to remaining memory */
     322       19954 :                 lines->ptr = ptr;
     323             :             }
     324    13534138 :             else if (*pwcs == '\r') /* Linefeed */
     325             :             {
     326           8 :                 strcpy((char *) ptr, "\\r");
     327           8 :                 linewidth += 2;
     328           8 :                 ptr += 2;
     329             :             }
     330    13534130 :             else if (*pwcs == '\t') /* Tab */
     331             :             {
     332             :                 do
     333             :                 {
     334        1518 :                     *ptr++ = ' ';
     335        1518 :                     linewidth++;
     336        1518 :                 } while (linewidth % 8 != 0);
     337             :             }
     338    13533936 :             else if (w < 0)      /* Other control char */
     339             :             {
     340          72 :                 sprintf((char *) ptr, "\\x%02X", *pwcs);
     341          72 :                 linewidth += 4;
     342          72 :                 ptr += 4;
     343             :             }
     344             :             else                /* Output it as-is */
     345             :             {
     346    13533864 :                 linewidth += w;
     347    13533864 :                 *ptr++ = *pwcs;
     348             :             }
     349             :         }
     350        1328 :         else if (w < 0)          /* Non-ascii control char */
     351             :         {
     352           0 :             if (encoding == PG_UTF8)
     353           0 :                 sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
     354             :             else
     355             :             {
     356             :                 /*
     357             :                  * This case cannot happen in the current code because only
     358             :                  * UTF-8 signals multibyte control characters. But we may need
     359             :                  * to support it at some stage
     360             :                  */
     361           0 :                 sprintf((char *) ptr, "\\u????");
     362             :             }
     363           0 :             ptr += 6;
     364           0 :             linewidth += 6;
     365             :         }
     366             :         else                    /* All other chars */
     367             :         {
     368             :             int         i;
     369             : 
     370        4188 :             for (i = 0; i < chlen; i++)
     371        2860 :                 *ptr++ = pwcs[i];
     372        1328 :             linewidth += w;
     373             :         }
     374    13555420 :         len -= chlen;
     375             :     }
     376     1280168 :     lines->width = linewidth;
     377     1280168 :     *ptr++ = '\0';              /* Terminate formatted string */
     378             : 
     379     1280168 :     if (count <= 0)
     380           0 :         exit(1);                /* Screwup */
     381             : 
     382     1280168 :     (lines + 1)->ptr = NULL; /* terminate line array */
     383     1280168 : }
     384             : 
     385             : 
     386             : /*
     387             :  * Encoding validation: delete any unvalidatable characters from the string
     388             :  *
     389             :  * This seems redundant with existing functionality elsewhere?
     390             :  */
     391             : unsigned char *
     392     4903756 : mbvalidate(unsigned char *pwcs, int encoding)
     393             : {
     394     4903756 :     if (encoding == PG_UTF8)
     395     4900600 :         mb_utf_validate(pwcs);
     396             :     else
     397             :     {
     398             :         /*
     399             :          * other encodings needing validation should add their own routines
     400             :          * here
     401             :          */
     402             :     }
     403             : 
     404     4903756 :     return pwcs;
     405             : }

Generated by: LCOV version 1.14