LCOV - code coverage report
Current view: top level - src/backend/tsearch - ts_locale.c (source / functions) Hit Total Coverage
Test: PostgreSQL 15devel Lines: 72 97 74.2 %
Date: 2021-12-09 03:08:47 Functions: 9 10 90.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * ts_locale.c
       4             :  *      locale compatibility layer for tsearch
       5             :  *
       6             :  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/tsearch/ts_locale.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : #include "postgres.h"
      15             : 
      16             : #include "catalog/pg_collation.h"
      17             : #include "common/string.h"
      18             : #include "storage/fd.h"
      19             : #include "tsearch/ts_locale.h"
      20             : #include "tsearch/ts_public.h"
      21             : 
      22             : static void tsearch_readline_callback(void *arg);
      23             : 
      24             : 
      25             : /*
      26             :  * The reason these functions use a 3-wchar_t output buffer, not 2 as you
      27             :  * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
      28             :  * getting from char2wchar() is UTF16 not UTF32.  A single input character
      29             :  * may therefore produce a surrogate pair rather than just one wchar_t;
      30             :  * we also need room for a trailing null.  When we do get a surrogate pair,
      31             :  * we pass just the first code to iswdigit() etc, so that these functions will
      32             :  * always return false for characters outside the Basic Multilingual Plane.
      33             :  */
      34             : #define WC_BUF_LEN  3
      35             : 
      36             : int
      37      782530 : t_isdigit(const char *ptr)
      38             : {
      39      782530 :     int         clen = pg_mblen(ptr);
      40             :     wchar_t     character[WC_BUF_LEN];
      41      782530 :     Oid         collation = DEFAULT_COLLATION_OID;  /* TODO */
      42      782530 :     pg_locale_t mylocale = 0;   /* TODO */
      43             : 
      44      782530 :     if (clen == 1 || lc_ctype_is_c(collation))
      45      782530 :         return isdigit(TOUCHAR(ptr));
      46             : 
      47           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      48             : 
      49           0 :     return iswdigit((wint_t) character[0]);
      50             : }
      51             : 
      52             : int
      53      616922 : t_isspace(const char *ptr)
      54             : {
      55      616922 :     int         clen = pg_mblen(ptr);
      56             :     wchar_t     character[WC_BUF_LEN];
      57      616922 :     Oid         collation = DEFAULT_COLLATION_OID;  /* TODO */
      58      616922 :     pg_locale_t mylocale = 0;   /* TODO */
      59             : 
      60      616922 :     if (clen == 1 || lc_ctype_is_c(collation))
      61      616922 :         return isspace(TOUCHAR(ptr));
      62             : 
      63           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      64             : 
      65           0 :     return iswspace((wint_t) character[0]);
      66             : }
      67             : 
      68             : int
      69     2741870 : t_isalpha(const char *ptr)
      70             : {
      71     2741870 :     int         clen = pg_mblen(ptr);
      72             :     wchar_t     character[WC_BUF_LEN];
      73     2741870 :     Oid         collation = DEFAULT_COLLATION_OID;  /* TODO */
      74     2741870 :     pg_locale_t mylocale = 0;   /* TODO */
      75             : 
      76     2741870 :     if (clen == 1 || lc_ctype_is_c(collation))
      77     2741870 :         return isalpha(TOUCHAR(ptr));
      78             : 
      79           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      80             : 
      81           0 :     return iswalpha((wint_t) character[0]);
      82             : }
      83             : 
      84             : int
      85        2922 : t_isprint(const char *ptr)
      86             : {
      87        2922 :     int         clen = pg_mblen(ptr);
      88             :     wchar_t     character[WC_BUF_LEN];
      89        2922 :     Oid         collation = DEFAULT_COLLATION_OID;  /* TODO */
      90        2922 :     pg_locale_t mylocale = 0;   /* TODO */
      91             : 
      92        2922 :     if (clen == 1 || lc_ctype_is_c(collation))
      93        2922 :         return isprint(TOUCHAR(ptr));
      94             : 
      95           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      96             : 
      97           0 :     return iswprint((wint_t) character[0]);
      98             : }
      99             : 
     100             : 
     101             : /*
     102             :  * Set up to read a file using tsearch_readline().  This facility is
     103             :  * better than just reading the file directly because it provides error
     104             :  * context pointing to the specific line where a problem is detected.
     105             :  *
     106             :  * Expected usage is:
     107             :  *
     108             :  *      tsearch_readline_state trst;
     109             :  *
     110             :  *      if (!tsearch_readline_begin(&trst, filename))
     111             :  *          ereport(ERROR,
     112             :  *                  (errcode(ERRCODE_CONFIG_FILE_ERROR),
     113             :  *                   errmsg("could not open stop-word file \"%s\": %m",
     114             :  *                          filename)));
     115             :  *      while ((line = tsearch_readline(&trst)) != NULL)
     116             :  *          process line;
     117             :  *      tsearch_readline_end(&trst);
     118             :  *
     119             :  * Note that the caller supplies the ereport() for file open failure;
     120             :  * this is so that a custom message can be provided.  The filename string
     121             :  * passed to tsearch_readline_begin() must remain valid through
     122             :  * tsearch_readline_end().
     123             :  */
     124             : bool
     125         398 : tsearch_readline_begin(tsearch_readline_state *stp,
     126             :                        const char *filename)
     127             : {
     128         398 :     if ((stp->fp = AllocateFile(filename, "r")) == NULL)
     129           0 :         return false;
     130         398 :     stp->filename = filename;
     131         398 :     stp->lineno = 0;
     132         398 :     initStringInfo(&stp->buf);
     133         398 :     stp->curline = NULL;
     134             :     /* Setup error traceback support for ereport() */
     135         398 :     stp->cb.callback = tsearch_readline_callback;
     136         398 :     stp->cb.arg = (void *) stp;
     137         398 :     stp->cb.previous = error_context_stack;
     138         398 :     error_context_stack = &stp->cb;
     139         398 :     return true;
     140             : }
     141             : 
     142             : /*
     143             :  * Read the next line from a tsearch data file (expected to be in UTF-8), and
     144             :  * convert it to database encoding if needed. The returned string is palloc'd.
     145             :  * NULL return means EOF.
     146             :  */
     147             : char *
     148       16802 : tsearch_readline(tsearch_readline_state *stp)
     149             : {
     150             :     char       *recoded;
     151             : 
     152             :     /* Advance line number to use in error reports */
     153       16802 :     stp->lineno++;
     154             : 
     155             :     /* Clear curline, it's no longer relevant */
     156       16802 :     if (stp->curline)
     157             :     {
     158       16404 :         if (stp->curline != stp->buf.data)
     159           0 :             pfree(stp->curline);
     160       16404 :         stp->curline = NULL;
     161             :     }
     162             : 
     163             :     /* Collect next line, if there is one */
     164       16802 :     if (!pg_get_line_buf(stp->fp, &stp->buf))
     165         338 :         return NULL;
     166             : 
     167             :     /* Validate the input as UTF-8, then convert to DB encoding if needed */
     168       16464 :     recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
     169             : 
     170             :     /* Save the correctly-encoded string for possible error reports */
     171       16464 :     stp->curline = recoded;      /* might be equal to buf.data */
     172             : 
     173             :     /*
     174             :      * We always return a freshly pstrdup'd string.  This is clearly necessary
     175             :      * if pg_any_to_server() returned buf.data, and we need a second copy even
     176             :      * if encoding conversion did occur.  The caller is entitled to pfree the
     177             :      * returned string at any time, which would leave curline pointing to
     178             :      * recycled storage, causing problems if an error occurs after that point.
     179             :      * (It's preferable to return the result of pstrdup instead of the output
     180             :      * of pg_any_to_server, because the conversion result tends to be
     181             :      * over-allocated.  Since callers might save the result string directly
     182             :      * into a long-lived dictionary structure, we don't want it to be a larger
     183             :      * palloc chunk than necessary.  We'll reclaim the conversion result on
     184             :      * the next call.)
     185             :      */
     186       16464 :     return pstrdup(recoded);
     187             : }
     188             : 
     189             : /*
     190             :  * Close down after reading a file with tsearch_readline()
     191             :  */
     192             : void
     193         398 : tsearch_readline_end(tsearch_readline_state *stp)
     194             : {
     195             :     /* Suppress use of curline in any error reported below */
     196         398 :     if (stp->curline)
     197             :     {
     198          60 :         if (stp->curline != stp->buf.data)
     199           0 :             pfree(stp->curline);
     200          60 :         stp->curline = NULL;
     201             :     }
     202             : 
     203             :     /* Release other resources */
     204         398 :     pfree(stp->buf.data);
     205         398 :     FreeFile(stp->fp);
     206             : 
     207             :     /* Pop the error context stack */
     208         398 :     error_context_stack = stp->cb.previous;
     209         398 : }
     210             : 
     211             : /*
     212             :  * Error context callback for errors occurring while reading a tsearch
     213             :  * configuration file.
     214             :  */
     215             : static void
     216           0 : tsearch_readline_callback(void *arg)
     217             : {
     218           0 :     tsearch_readline_state *stp = (tsearch_readline_state *) arg;
     219             : 
     220             :     /*
     221             :      * We can't include the text of the config line for errors that occur
     222             :      * during tsearch_readline() itself.  The major cause of such errors is
     223             :      * encoding violations, and we daren't try to print error messages
     224             :      * containing badly-encoded data.
     225             :      */
     226           0 :     if (stp->curline)
     227           0 :         errcontext("line %d of configuration file \"%s\": \"%s\"",
     228             :                    stp->lineno,
     229             :                    stp->filename,
     230             :                    stp->curline);
     231             :     else
     232           0 :         errcontext("line %d of configuration file \"%s\"",
     233             :                    stp->lineno,
     234             :                    stp->filename);
     235           0 : }
     236             : 
     237             : 
     238             : /*
     239             :  * lowerstr --- fold null-terminated string to lower case
     240             :  *
     241             :  * Returned string is palloc'd
     242             :  */
     243             : char *
     244       10138 : lowerstr(const char *str)
     245             : {
     246       10138 :     return lowerstr_with_len(str, strlen(str));
     247             : }
     248             : 
     249             : /*
     250             :  * lowerstr_with_len --- fold string to lower case
     251             :  *
     252             :  * Input string need not be null-terminated.
     253             :  *
     254             :  * Returned string is palloc'd
     255             :  */
     256             : char *
     257      271770 : lowerstr_with_len(const char *str, int len)
     258             : {
     259             :     char       *out;
     260      271770 :     Oid         collation = DEFAULT_COLLATION_OID;  /* TODO */
     261      271770 :     pg_locale_t mylocale = 0;   /* TODO */
     262             : 
     263      271770 :     if (len == 0)
     264           0 :         return pstrdup("");
     265             : 
     266             :     /*
     267             :      * Use wide char code only when max encoding length > 1 and ctype != C.
     268             :      * Some operating systems fail with multi-byte encodings and a C locale.
     269             :      * Also, for a C locale there is no need to process as multibyte. From
     270             :      * backend/utils/adt/oracle_compat.c Teodor
     271             :      */
     272      271770 :     if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
     273      271770 :     {
     274             :         wchar_t    *wstr,
     275             :                    *wptr;
     276             :         int         wlen;
     277             : 
     278             :         /*
     279             :          * alloc number of wchar_t for worst case, len contains number of
     280             :          * bytes >= number of characters and alloc 1 wchar_t for 0, because
     281             :          * wchar2char wants zero-terminated string
     282             :          */
     283      271770 :         wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
     284             : 
     285      271770 :         wlen = char2wchar(wstr, len + 1, str, len, mylocale);
     286             :         Assert(wlen <= len);
     287             : 
     288     2282632 :         while (*wptr)
     289             :         {
     290     2010862 :             *wptr = towlower((wint_t) *wptr);
     291     2010862 :             wptr++;
     292             :         }
     293             : 
     294             :         /*
     295             :          * Alloc result string for worst case + '\0'
     296             :          */
     297      271770 :         len = pg_database_encoding_max_length() * wlen + 1;
     298      271770 :         out = (char *) palloc(len);
     299             : 
     300      271770 :         wlen = wchar2char(out, wstr, len, mylocale);
     301             : 
     302      271770 :         pfree(wstr);
     303             : 
     304      271770 :         if (wlen < 0)
     305           0 :             ereport(ERROR,
     306             :                     (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     307             :                      errmsg("conversion from wchar_t to server encoding failed: %m")));
     308             :         Assert(wlen < len);
     309             :     }
     310             :     else
     311             :     {
     312           0 :         const char *ptr = str;
     313             :         char       *outptr;
     314             : 
     315           0 :         outptr = out = (char *) palloc(sizeof(char) * (len + 1));
     316           0 :         while ((ptr - str) < len && *ptr)
     317             :         {
     318           0 :             *outptr++ = tolower(TOUCHAR(ptr));
     319           0 :             ptr++;
     320             :         }
     321           0 :         *outptr = '\0';
     322             :     }
     323             : 
     324      271770 :     return out;
     325             : }

Generated by: LCOV version 1.14