LCOV - code coverage report
Current view: top level - src/backend/tsearch - ts_locale.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 80 99 80.8 %
Date: 2024-12-12 15:15:20 Functions: 10 11 90.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * ts_locale.c
       4             :  *      locale compatibility layer for tsearch
       5             :  *
       6             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/tsearch/ts_locale.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : #include "postgres.h"
      15             : 
      16             : #include "common/string.h"
      17             : #include "storage/fd.h"
      18             : #include "tsearch/ts_locale.h"
      19             : 
      20             : static void tsearch_readline_callback(void *arg);  /* error-context callback installed by tsearch_readline_begin(); defined later in this file */
      21             : 
      22             : 
      23             : /*
      24             :  * The reason these functions use a 3-wchar_t output buffer, not 2 as you
      25             :  * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
      26             :  * getting from char2wchar() is UTF16 not UTF32.  A single input character
      27             :  * may therefore produce a surrogate pair rather than just one wchar_t;
      28             :  * we also need room for a trailing null.  When we do get a surrogate pair,
      29             :  * we pass just the first code to iswdigit() etc, so that these functions will
      30             :  * always return false for characters outside the Basic Multilingual Plane.
      31             :  */
      32             : #define WC_BUF_LEN  3  /* room for a possible surrogate pair (2 wchar_t) plus the trailing null */
      33             : /* t_isdigit --- returns the isdigit()/iswdigit() result for the character starting at ptr */
      34             : int
      35       20604 : t_isdigit(const char *ptr)
      36             : {
      37       20604 :     int         clen = pg_mblen(ptr);  /* length of the character at ptr as reported by pg_mblen(); passed to char2wchar() as the input length */
      38             :     wchar_t     character[WC_BUF_LEN];
      39       20604 :     pg_locale_t mylocale = 0;   /* TODO: no real locale is supplied yet; 0 is what char2wchar() receives */
      40             : 
      41       20604 :     if (clen == 1 || database_ctype_is_c)
      42       20604 :         return isdigit(TOUCHAR(ptr));  /* length-1 character, or database_ctype_is_c set: use the plain <ctype.h> test */
      43             : 
      44           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      45             : 
      46           0 :     return iswdigit((wint_t) character[0]);  /* only the first wchar_t is examined, even if a surrogate pair was produced */
      47             : }
      48             : /* t_isspace --- returns the isspace()/iswspace() result for the character starting at ptr */
      49             : int
      50      925428 : t_isspace(const char *ptr)
      51             : {
      52      925428 :     int         clen = pg_mblen(ptr);  /* length of the character at ptr as reported by pg_mblen(); passed to char2wchar() as the input length */
      53             :     wchar_t     character[WC_BUF_LEN];
      54      925428 :     pg_locale_t mylocale = 0;   /* TODO: no real locale is supplied yet; 0 is what char2wchar() receives */
      55             : 
      56      925428 :     if (clen == 1 || database_ctype_is_c)
      57      912724 :         return isspace(TOUCHAR(ptr));  /* length-1 character, or database_ctype_is_c set: use the plain <ctype.h> test */
      58             : 
      59       12704 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      60             : 
      61       12704 :     return iswspace((wint_t) character[0]);  /* only the first wchar_t is examined, even if a surrogate pair was produced */
      62             : }
      63             : /* t_isalpha --- returns the isalpha()/iswalpha() result for the character starting at ptr */
      64             : int
      65       10284 : t_isalpha(const char *ptr)
      66             : {
      67       10284 :     int         clen = pg_mblen(ptr);  /* length of the character at ptr as reported by pg_mblen(); passed to char2wchar() as the input length */
      68             :     wchar_t     character[WC_BUF_LEN];
      69       10284 :     pg_locale_t mylocale = 0;   /* TODO: no real locale is supplied yet; 0 is what char2wchar() receives */
      70             : 
      71       10284 :     if (clen == 1 || database_ctype_is_c)
      72       10284 :         return isalpha(TOUCHAR(ptr));  /* length-1 character, or database_ctype_is_c set: use the plain <ctype.h> test */
      73             : 
      74           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      75             : 
      76           0 :     return iswalpha((wint_t) character[0]);  /* only the first wchar_t is examined, even if a surrogate pair was produced */
      77             : }
      78             : /* t_isalnum --- returns the isalnum()/iswalnum() result for the character starting at ptr */
      79             : int
      80     2767776 : t_isalnum(const char *ptr)
      81             : {
      82     2767776 :     int         clen = pg_mblen(ptr);  /* length of the character at ptr as reported by pg_mblen(); passed to char2wchar() as the input length */
      83             :     wchar_t     character[WC_BUF_LEN];
      84     2767776 :     pg_locale_t mylocale = 0;   /* TODO: no real locale is supplied yet; 0 is what char2wchar() receives */
      85             : 
      86     2767776 :     if (clen == 1 || database_ctype_is_c)
      87     2767776 :         return isalnum(TOUCHAR(ptr));  /* length-1 character, or database_ctype_is_c set: use the plain <ctype.h> test */
      88             : 
      89           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      90             : 
      91           0 :     return iswalnum((wint_t) character[0]);  /* only the first wchar_t is examined, even if a surrogate pair was produced */
      92             : }
      93             : /* t_isprint --- returns the isprint()/iswprint() result for the character starting at ptr */
      94             : int
      95        4150 : t_isprint(const char *ptr)
      96             : {
      97        4150 :     int         clen = pg_mblen(ptr);  /* length of the character at ptr as reported by pg_mblen(); passed to char2wchar() as the input length */
      98             :     wchar_t     character[WC_BUF_LEN];
      99        4150 :     pg_locale_t mylocale = 0;   /* TODO: no real locale is supplied yet; 0 is what char2wchar() receives */
     100             : 
     101        4150 :     if (clen == 1 || database_ctype_is_c)
     102        4150 :         return isprint(TOUCHAR(ptr));  /* length-1 character, or database_ctype_is_c set: use the plain <ctype.h> test */
     103             : 
     104           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
     105             : 
     106           0 :     return iswprint((wint_t) character[0]);  /* only the first wchar_t is examined, even if a surrogate pair was produced */
     107             : }
     108             : 
     109             : 
     110             : /*
     111             :  * Set up to read a file using tsearch_readline().  This facility is
     112             :  * better than just reading the file directly because it provides error
     113             :  * context pointing to the specific line where a problem is detected.
     114             :  *
     115             :  * Expected usage is:
     116             :  *
     117             :  *      tsearch_readline_state trst;
     118             :  *
     119             :  *      if (!tsearch_readline_begin(&trst, filename))
     120             :  *          ereport(ERROR,
     121             :  *                  (errcode(ERRCODE_CONFIG_FILE_ERROR),
     122             :  *                   errmsg("could not open stop-word file \"%s\": %m",
     123             :  *                          filename)));
     124             :  *      while ((line = tsearch_readline(&trst)) != NULL)
     125             :  *          process line;
     126             :  *      tsearch_readline_end(&trst);
     127             :  *
     128             :  * Note that the caller supplies the ereport() for file open failure;
     129             :  * this is so that a custom message can be provided.  The filename string
     130             :  * passed to tsearch_readline_begin() must remain valid through
     131             :  * tsearch_readline_end().
     132             :  */
     133             : bool
     134         556 : tsearch_readline_begin(tsearch_readline_state *stp,
     135             :                        const char *filename)
     136             : {
     137         556 :     if ((stp->fp = AllocateFile(filename, "r")) == NULL)
     138           0 :         return false;  /* open failed: nothing else in *stp is set up and no callback is installed */
     139         556 :     stp->filename = filename;  /* the pointer is stored, not copied */
     140         556 :     stp->lineno = 0;  /* tsearch_readline() increments before reading, so the first line is reported as 1 */
     141         556 :     initStringInfo(&stp->buf);
     142         556 :     stp->curline = NULL;  /* no line text yet; the callback omits the line text while this is NULL */
     143             :     /* Push our callback onto the error context stack so ereport() can add file/line context */
     144         556 :     stp->cb.callback = tsearch_readline_callback;
     145         556 :     stp->cb.arg = stp;
     146         556 :     stp->cb.previous = error_context_stack;  /* saved so tsearch_readline_end() can restore it */
     147         556 :     error_context_stack = &stp->cb;
     148         556 :     return true;
     149             : }
     150             : 
     151             : /*
     152             :  * Read the next line from a tsearch data file (expected to be in UTF-8), and
     153             :  * convert it to database encoding if needed. The returned string is palloc'd.
     154             :  * NULL return means EOF.
     155             :  */
     156             : char *
     157       25466 : tsearch_readline(tsearch_readline_state *stp)
     158             : {
     159             :     char       *recoded;  /* result of pg_any_to_server(); may be the same pointer as stp->buf.data */
     160             : 
     161             :     /* Advance line number to use in error reports */
     162       25466 :     stp->lineno++;
     163             : 
     164             :     /* Discard the previous call's curline; it does not describe the line about to be read */
     165       25466 :     if (stp->curline)
     166             :     {
     167       24910 :         if (stp->curline != stp->buf.data)
     168           0 :             pfree(stp->curline);  /* it was a separate conversion result, so release it now */
     169       24910 :         stp->curline = NULL;
     170             :     }
     171             : 
     172             :     /* Collect next line, if there is one */
     173       25466 :     if (!pg_get_line_buf(stp->fp, &stp->buf))
     174         470 :         return NULL;  /* no line obtained; curline is already NULL at this point */
     175             : 
     176             :     /* Validate the input as UTF-8, then convert to DB encoding if needed */
     177       24996 :     recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
     178             : 
     179             :     /* Save the correctly-encoded string for possible error reports */
     180       24996 :     stp->curline = recoded;      /* might be equal to buf.data */
     181             : 
     182             :     /*
     183             :      * We always return a freshly pstrdup'd string.  This is clearly necessary
     184             :      * if pg_any_to_server() returned buf.data, and we need a second copy even
     185             :      * if encoding conversion did occur.  The caller is entitled to pfree the
     186             :      * returned string at any time, which would leave curline pointing to
     187             :      * recycled storage, causing problems if an error occurs after that point.
     188             :      * (It's preferable to return the result of pstrdup instead of the output
     189             :      * of pg_any_to_server, because the conversion result tends to be
     190             :      * over-allocated.  Since callers might save the result string directly
     191             :      * into a long-lived dictionary structure, we don't want it to be a larger
     192             :      * palloc chunk than necessary.  We'll reclaim the conversion result on
     193             :      * the next call.)
     194             :      */
     195       24996 :     return pstrdup(recoded);  /* caller owns this copy; curline keeps pointing at recoded */
     196             : }
     197             : 
     198             : /*
     199             :  * Close down after reading a file with tsearch_readline()
     200             :  */
     201             : void
     202         556 : tsearch_readline_end(tsearch_readline_state *stp)
     203             : {
     204             :     /* Suppress use of curline in any error reported below */
     205         556 :     if (stp->curline)
     206             :     {
     207          86 :         if (stp->curline != stp->buf.data)
     208           0 :             pfree(stp->curline);  /* separate conversion result from the last tsearch_readline() call */
     209          86 :         stp->curline = NULL;
     210             :     }
     211             : 
     212             :     /* Release other resources */
     213         556 :     pfree(stp->buf.data);  /* buffer set up by initStringInfo() in tsearch_readline_begin() */
     214         556 :     FreeFile(stp->fp);  /* counterpart of the AllocateFile() call in tsearch_readline_begin() */
     215             : 
     216             :     /* Pop the error context stack */
     217         556 :     error_context_stack = stp->cb.previous;  /* restores the value saved in tsearch_readline_begin() */
     218         556 : }
     219             : 
     220             : /*
     221             :  * Error context callback for errors occurring while reading a tsearch
     222             :  * configuration file.
     223             :  */
     224             : static void
     225           0 : tsearch_readline_callback(void *arg)
     226             : {
     227           0 :     tsearch_readline_state *stp = (tsearch_readline_state *) arg;  /* arg is the stp stored in cb.arg by tsearch_readline_begin() */
     228             : 
     229             :     /*
     230             :      * We can't include the text of the config line for errors that occur
     231             :      * during tsearch_readline() itself.  The major cause of such errors is
     232             :      * encoding violations, and we daren't try to print error messages
     233             :      * containing badly-encoded data.
     234             :      */
     235           0 :     if (stp->curline)  /* non-NULL only after tsearch_readline() has stored a converted line */
     236           0 :         errcontext("line %d of configuration file \"%s\": \"%s\"",
     237             :                    stp->lineno,
     238             :                    stp->filename,
     239             :                    stp->curline);
     240             :     else
     241           0 :         errcontext("line %d of configuration file \"%s\"",
     242             :                    stp->lineno,
     243             :                    stp->filename);
     244           0 : }
     245             : 
     246             : 
     247             : /*
     248             :  * lowerstr --- fold null-terminated string to lower case
     249             :  *
     250             :  * Returned string is palloc'd
     251             :  */
     252             : char *
     253       13844 : lowerstr(const char *str)
     254             : {
     255       13844 :     return lowerstr_with_len(str, strlen(str));  /* strlen() is why str must be null-terminated here */
     256             : }
     257             : 
     258             : /*
     259             :  * lowerstr_with_len --- fold string to lower case
     260             :  *
     261             :  * Input string need not be null-terminated.
     262             :  *
     263             :  * Returned string is palloc'd
     264             :  */
     265             : char *
     266      283710 : lowerstr_with_len(const char *str, int len)
     267             : {
     268             :     char       *out;
     269      283710 :     pg_locale_t mylocale = 0;   /* TODO: no real locale is supplied yet; 0 is passed to char2wchar()/wchar2char() */
     270             : 
     271      283710 :     if (len == 0)
     272           0 :         return pstrdup("");  /* empty input: hand back a separately allocated empty string */
     273             : 
     274             :     /*
     275             :      * Use wide char code only when max encoding length > 1 and ctype != C.
     276             :      * Some operating systems fail with multi-byte encodings and a C locale.
     277             :      * Also, for a C locale there is no need to process as multibyte. From
     278             :      * backend/utils/adt/oracle_compat.c Teodor
     279             :      */
     280      283710 :     if (pg_database_encoding_max_length() > 1 && !database_ctype_is_c)
     281      273996 :     {
     282             :         wchar_t    *wstr,
     283             :                    *wptr;
     284             :         int         wlen;
     285             : 
     286             :         /*
     287             :          * alloc number of wchar_t for worst case, len contains number of
     288             :          * bytes >= number of characters and alloc 1 wchar_t for 0, because
     289             :          * wchar2char wants zero-terminated string
     290             :          */
     291      273996 :         wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
     292             : 
     293      273996 :         wlen = char2wchar(wstr, len + 1, str, len, mylocale);  /* wlen is used below as the wide-character count when sizing the output */
     294             :         Assert(wlen <= len);
     295             : 
     296     2305520 :         while (*wptr)  /* walks until the terminating zero wchar_t, folding in place */
     297             :         {
     298     2031524 :             *wptr = towlower((wint_t) *wptr);
     299     2031524 :             wptr++;
     300             :         }
     301             : 
     302             :         /*
     303             :          * Alloc result string for worst case + '\0'
     304             :          */
     305      273996 :         len = pg_database_encoding_max_length() * wlen + 1;  /* len is reused from here on as the output buffer size */
     306      273996 :         out = (char *) palloc(len);
     307             : 
     308      273996 :         wlen = wchar2char(out, wstr, len, mylocale);  /* wlen is reused for wchar2char()'s result; negative is treated as failure below */
     309             : 
     310      273996 :         pfree(wstr);  /* wide buffer is released before the result is checked */
     311             : 
     312      273996 :         if (wlen < 0)
     313           0 :             ereport(ERROR,
     314             :                     (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     315             :                      errmsg("conversion from wchar_t to server encoding failed: %m")));
     316             :         Assert(wlen < len);
     317             :     }
     318             :     else
     319             :     {
     320        9714 :         const char *ptr = str;
     321             :         char       *outptr;
     322             : 
     323        9714 :         outptr = out = (char *) palloc(sizeof(char) * (len + 1));
     324       56420 :         while ((ptr - str) < len && *ptr)  /* stops after len bytes or at a zero byte, whichever comes first */
     325             :         {
     326       46706 :             *outptr++ = tolower(TOUCHAR(ptr));  /* byte-at-a-time folding */
     327       46706 :             ptr++;
     328             :         }
     329        9714 :         *outptr = '\0';
     330             :     }
     331             : 
     332      283710 :     return out;
     333             : }

Generated by: LCOV version 1.14