LCOV - PostgreSQL 19devel - src/backend/utils/adt/like

LCOV - code coverage report

Current view:	top level - src/backend/utils/adt - like_match.c (source / functions)		Coverage	Total	Hit
Test:	PostgreSQL 19devel	Lines:	90.2 %	132	119
Test Date:	2026-02-17 17:20:33	Functions:	83.3 %	6	5
Legend:	Lines: hit not hit

            Line data    Source code

       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * like_match.c
       4              :  *    LIKE pattern matching internal code.
       5              :  *
       6              :  * This file is included by like.c four times, to provide matching code for
       7              :  * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
       8              :  * and (4) case insensitive matches in single-byte encodings.
       9              :  * (UTF8 is a special case because we can use a much more efficient version
      10              :  * of NextChar than can be used for general multi-byte encodings.)
      11              :  *
      12              :  * Before the inclusion, we need to define the following macros:
      13              :  *
      14              :  * NextChar
      15              :  * MatchText - name of function wanted
      16              :  * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
      17              :  * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars
      18              :  *
      19              :  * Copyright (c) 1996-2026, PostgreSQL Global Development Group
      20              :  *
      21              :  * IDENTIFICATION
      22              :  *  src/backend/utils/adt/like_match.c
      23              :  *
      24              :  *-------------------------------------------------------------------------
      25              :  */
      26              : 
      27              : /*
      28              :  *  Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
      29              :  *  Rich $alz is now <rsalz@bbn.com>.
      30              :  *  Special thanks to Lars Mathiesen <thorinn@diku.dk> for the
      31              :  *  LIKE_ABORT code.
      32              :  *
      33              :  *  This code was shamelessly stolen from the "pql" code by myself and
      34              :  *  slightly modified :)
      35              :  *
      36              :  *  All references to the word "star" were replaced by "percent"
      37              :  *  All references to the word "wild" were replaced by "like"
      38              :  *
      39              :  *  All the nice shell RE matching stuff was replaced by just "_" and "%"
      40              :  *
      41              :  *  As I don't have a copy of the SQL standard handy I wasn't sure whether
      42              :  *  to leave in the '\' escape character handling.
      43              :  *
      44              :  *  Keith Parks. <keith@mtcc.demon.co.uk>
      45              :  *
      46              :  *  SQL lets you specify the escape character by saying
      47              :  *  LIKE <pattern> ESCAPE <escape character>. We are a small operation
      48              :  *  so we force you to use '\'. - ay 7/95
      49              :  *
      50              :  *  Now we have the like_escape() function that converts patterns with
      51              :  *  any specified escape character (or none at all) to the internal
      52              :  *  default escape character, which is still '\'. - tgl 9/2000
      53              :  *
      54              :  * The code is rewritten to avoid requiring null-terminated strings,
      55              :  * which in turn allows us to leave out some memcpy() operations.
      56              :  * This code should be faster and take less memory, but no promises...
      57              :  * - thomas 2000-08-06
      58              :  */
      59              : 
      60              : 
      61              : /*--------------------
      62              :  *  Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
      63              :  *
      64              :  *  LIKE_TRUE: they match
      65              :  *  LIKE_FALSE: they don't match
      66              :  *  LIKE_ABORT: not only don't they match, but the text is too short.
      67              :  *
      68              :  * If LIKE_ABORT is returned, then no suffix of the text can match the
      69              :  * pattern either, so an upper-level % scan can stop scanning now.
      70              :  *--------------------
      71              :  */
      72              : 
      73              : /*
      74              :  * MATCH_LOWER is defined for ILIKE in the C locale as an optimization. Other
      75              :  * locales must casefold the inputs before matching.
      76              :  */
      77              : #ifdef MATCH_LOWER
      78              : #define GETCHAR(t) pg_ascii_tolower(t)
      79              : #else
      80              : #define GETCHAR(t) (t)
      81              : #endif
      82              : 
      83              : static int
      84       645176 : MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
      85              : {
      86              :     /* Fast path for match-everything pattern */
      87       645176 :     if (plen == 1 && *p == '%')
      88          122 :         return LIKE_TRUE;
      89              : 
      90              :     /* Since this function recurses, it could be driven to stack overflow */
      91       645054 :     check_stack_depth();
      92              : 
      93              :     /*
      94              :      * In this loop, we advance by char when matching wildcards (and thus on
      95              :      * recursive entry to this function we are properly char-synced). On other
      96              :      * occasions it is safe to advance by byte, as the text and pattern will
      97              :      * be in lockstep. This allows us to perform all comparisons between the
      98              :      * text and pattern on a byte by byte basis, even for multi-byte
      99              :      * encodings.
     100              :      */
     101       956072 :     while (tlen > 0 && plen > 0)
     102              :     {
     103       950807 :         if (*p == '\\')
     104              :         {
     105              :             /* Next pattern byte must match literally, whatever it is */
     106         6473 :             NextByte(p, plen);
     107              :             /* ... and there had better be one, per SQL standard */
     108         6473 :             if (plen <= 0)
     109            0 :                 ereport(ERROR,
     110              :                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     111              :                          errmsg("LIKE pattern must not end with escape character")));
     112         6473 :             if (GETCHAR(*p) != GETCHAR(*t))
     113         1725 :                 return LIKE_FALSE;
     114              :         }
     115       944334 :         else if (*p == '%')
     116              :         {
     117              :             char        firstpat;
     118              : 
     119              :             /*
     120              :              * % processing is essentially a search for a text position at
     121              :              * which the remainder of the text matches the remainder of the
     122              :              * pattern, using a recursive call to check each potential match.
     123              :              *
     124              :              * If there are wildcards immediately following the %, we can skip
     125              :              * over them first, using the idea that any sequence of N _'s and
     126              :              * one or more %'s is equivalent to N _'s and one % (ie, it will
     127              :              * match any sequence of at least N text characters).  In this way
     128              :              * we will always run the recursive search loop using a pattern
     129              :              * fragment that begins with a literal character-to-match, thereby
     130              :              * not recursing more than we have to.
     131              :              */
     132        83640 :             NextByte(p, plen);
     133              : 
     134        83891 :             while (plen > 0)
     135              :             {
     136        66170 :                 if (*p == '%')
     137            9 :                     NextByte(p, plen);
     138        66161 :                 else if (*p == '_')
     139              :                 {
     140              :                     /* If not enough text left to match the pattern, ABORT */
     141          245 :                     if (tlen <= 0)
     142            3 :                         return LIKE_ABORT;
     143          242 :                     NextChar(t, tlen);
     144          242 :                     NextByte(p, plen);
     145              :                 }
     146              :                 else
     147        65916 :                     break;      /* Reached a non-wildcard pattern char */
     148              :             }
     149              : 
     150              :             /*
     151              :              * If we're at end of pattern, match: we have a trailing % which
     152              :              * matches any remaining text string.
     153              :              */
     154        83637 :             if (plen <= 0)
     155        17721 :                 return LIKE_TRUE;
     156              : 
     157              :             /*
     158              :              * Otherwise, scan for a text position at which we can match the
     159              :              * rest of the pattern.  The first remaining pattern char is known
     160              :              * to be a regular or escaped literal character, so we can compare
     161              :              * the first pattern byte to each text byte to avoid recursing
     162              :              * more than we have to.  This fact also guarantees that we don't
     163              :              * have to consider a match to the zero-length substring at the
     164              :              * end of the text.  With a nondeterministic collation, we can't
     165              :              * rely on the first bytes being equal, so we have to recurse in
     166              :              * any case.
     167              :              */
     168        65916 :             if (*p == '\\')
     169              :             {
     170            2 :                 if (plen < 2)
     171            0 :                     ereport(ERROR,
     172              :                             (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     173              :                              errmsg("LIKE pattern must not end with escape character")));
     174            2 :                 firstpat = GETCHAR(p[1]);
     175              :             }
     176              :             else
     177        65914 :                 firstpat = GETCHAR(*p);
     178              : 
     179      1924175 :             while (tlen > 0)
     180              :             {
     181      1872793 :                 if (GETCHAR(*t) == firstpat || (locale && !locale->deterministic))
     182              :                 {
     183        54209 :                     int         matched = MatchText(t, tlen, p, plen, locale);
     184              : 
     185        54209 :                     if (matched != LIKE_FALSE)
     186        14534 :                         return matched; /* TRUE or ABORT */
     187              :                 }
     188              : 
     189      1858283 :                 NextChar(t, tlen);
     190              :             }
     191              : 
     192              :             /*
     193              :              * End of text with no match, so no point in trying later places
     194              :              * to start matching this pattern.
     195              :              */
     196        51382 :             return LIKE_ABORT;
     197              :         }
     198       860694 :         else if (*p == '_')
     199              :         {
     200              :             /* _ matches any single character, and we know there is one */
     201         6178 :             NextChar(t, tlen);
     202         6169 :             NextByte(p, plen);
     203         6169 :             continue;
     204              :         }
     205       854525 :         else if (locale && !locale->deterministic)
     206              :         {
     207              :             /*
     208              :              * For nondeterministic locales, we find the next substring of the
     209              :              * pattern that does not contain wildcards and try to find a
     210              :              * matching substring in the text.  Crucially, we cannot do this
     211              :              * character by character, as in the normal case, but must do it
     212              :              * substring by substring, partitioned by the wildcard characters.
     213              :              * (This is per SQL standard.)
     214              :              */
     215              :             const char *p1;
     216              :             size_t      p1len;
     217              :             const char *t1;
     218              :             size_t      t1len;
     219              :             bool        found_escape;
     220              :             const char *subpat;
     221              :             size_t      subpatlen;
     222          141 :             char       *buf = NULL;
     223              : 
     224              :             /*
     225              :              * Determine next substring of pattern without wildcards.  p is
     226              :              * the start of the subpattern, p1 is one past the last byte. Also
     227              :              * track if we found an escape character.
     228              :              */
     229          141 :             p1 = p;
     230          141 :             p1len = plen;
     231          141 :             found_escape = false;
     232          417 :             while (p1len > 0)
     233              :             {
     234          345 :                 if (*p1 == '\\')
     235              :                 {
     236            6 :                     found_escape = true;
     237            6 :                     NextByte(p1, p1len);
     238            6 :                     if (p1len == 0)
     239            3 :                         ereport(ERROR,
     240              :                                 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     241              :                                  errmsg("LIKE pattern must not end with escape character")));
     242              :                 }
     243          339 :                 else if (*p1 == '_' || *p1 == '%')
     244              :                     break;
     245          276 :                 NextByte(p1, p1len);
     246              :             }
     247              : 
     248              :             /*
     249              :              * If we found an escape character, then make an unescaped copy of
     250              :              * the subpattern.
     251              :              */
     252          138 :             if (found_escape)
     253              :             {
     254              :                 char       *b;
     255              : 
     256            3 :                 b = buf = palloc(p1 - p);
     257           15 :                 for (const char *c = p; c < p1; c++)
     258              :                 {
     259           12 :                     if (*c == '\\')
     260              :                         ;
     261              :                     else
     262            9 :                         *(b++) = *c;
     263              :                 }
     264              : 
     265            3 :                 subpat = buf;
     266            3 :                 subpatlen = b - buf;
     267              :             }
     268              :             else
     269              :             {
     270          135 :                 subpat = p;
     271          135 :                 subpatlen = p1 - p;
     272              :             }
     273              : 
     274              :             /*
     275              :              * Shortcut: If this is the end of the pattern, then the rest of
     276              :              * the text has to match the rest of the pattern.
     277              :              */
     278          138 :             if (p1len == 0)
     279              :             {
     280              :                 int         cmp;
     281              : 
     282           72 :                 cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale);
     283              : 
     284           72 :                 if (buf)
     285            3 :                     pfree(buf);
     286           72 :                 if (cmp == 0)
     287           45 :                     return LIKE_TRUE;
     288              :                 else
     289           27 :                     return LIKE_FALSE;
     290              :             }
     291              : 
     292              :             /*
     293              :              * Now build a substring of the text and try to match it against
     294              :              * the subpattern.  t is the start of the text, t1 is one past the
     295              :              * last byte.  We start with a zero-length string.
     296              :              */
     297           66 :             t1 = t;
     298           66 :             t1len = tlen;
     299              :             for (;;)
     300          129 :             {
     301              :                 int         cmp;
     302              : 
     303          195 :                 CHECK_FOR_INTERRUPTS();
     304              : 
     305          195 :                 cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
     306              : 
     307              :                 /*
     308              :                  * If we found a match, we have to test if the rest of pattern
     309              :                  * can match against the rest of the string.  Otherwise we
     310              :                  * have to continue here and try matching with a longer substring.
     311              :                  * (This is similar to the recursion for the '%' wildcard
     312              :                  * above.)
     313              :                  *
     314              :                  * Note that we can't just wind forward p and t and continue
     315              :                  * with the main loop.  This would fail for example with
     316              :                  *
     317              :                  * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents
     318              :                  *
     319              :                  * You'd find that t=\0061 matches p=\00E4, but then the rest
     320              :                  * won't match; but t=\0061\0308 also matches p=\00E4, and
     321              :                  * then the rest will match.
     322              :                  */
     323          195 :                 if (cmp == 0)
     324              :                 {
     325           51 :                     int         matched = MatchText(t1, t1len, p1, p1len, locale);
     326              : 
     327           51 :                     if (matched == LIKE_TRUE)
     328              :                     {
     329           45 :                         if (buf)
     330            0 :                             pfree(buf);
     331           45 :                         return matched;
     332              :                     }
     333              :                 }
     334              : 
     335              :                 /*
     336              :                  * Didn't match.  If we used up the whole text, then the match
     337              :                  * fails.  Otherwise, try again with a longer substring.
     338              :                  */
     339          150 :                 if (t1len == 0)
     340              :                 {
     341           21 :                     if (buf)
     342            0 :                         pfree(buf);
     343           21 :                     return LIKE_FALSE;
     344              :                 }
     345              :                 else
     346          144 :                     NextChar(t1, t1len);
     347              :             }
     348              :         }
     349       854384 :         else if (GETCHAR(*p) != GETCHAR(*t))
     350              :         {
     351              :             /* non-wildcard pattern char fails to match text char */
     352       554283 :             return LIKE_FALSE;
     353              :         }
     354              : 
     355              :         /*
     356              :          * Pattern and text match, so advance.
     357              :          *
     358              :          * It is safe to use NextByte instead of NextChar here, even for
     359              :          * multi-byte character sets, because we are not following immediately
     360              :          * after a wildcard character. If we are in the middle of a multibyte
     361              :          * character, we must already have matched at least one byte of the
     362              :          * character from both text and pattern; so we cannot get out-of-sync
     363              :          * on character boundaries.  And we know that no backend-legal
     364              :          * encoding allows ASCII characters such as '%' to appear as non-first
     365              :          * bytes of characters, so we won't mistakenly detect a new wildcard.
     366              :          */
     367       304849 :         NextByte(t, tlen);
     368       304849 :         NextByte(p, plen);
     369              :     }
     370              : 
     371         5265 :     if (tlen > 0)
     372          153 :         return LIKE_FALSE;      /* end of pattern, but not of text */
     373              : 
     374              :     /*
     375              :      * End of text, but perhaps not of pattern.  Match iff the remaining
     376              :      * pattern can match a zero-length string, ie, it's zero or more %'s.
     377              :      */
     378         5406 :     while (plen > 0 && *p == '%')
     379          294 :         NextByte(p, plen);
     380         5112 :     if (plen <= 0)
     381         2336 :         return LIKE_TRUE;
     382              : 
     383              :     /*
     384              :      * End of text with no match, so no point in trying later places to start
     385              :      * matching this pattern.
     386              :      */
     387         2776 :     return LIKE_ABORT;
     388              : }                               /* MatchText() */
     389              : 
     390              : /*
     391              :  * like_escape() --- given a pattern and an ESCAPE string,
     392              :  * convert the pattern to use Postgres' standard backslash escape convention.
     393              :  */
     394              : #ifdef do_like_escape
     395              : 
     396              : static text *
     397          112 : do_like_escape(text *pat, text *esc)
     398              : {
     399              :     text       *result;
     400              :     char       *p,
     401              :                *e,
     402              :                *r;
     403              :     int         plen,
     404              :                 elen;
     405              :     bool        afterescape;
     406              : 
     407          112 :     p = VARDATA_ANY(pat);
     408          112 :     plen = VARSIZE_ANY_EXHDR(pat);
     409          112 :     e = VARDATA_ANY(esc);
     410          112 :     elen = VARSIZE_ANY_EXHDR(esc);
     411              : 
     412              :     /*
     413              :      * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
     414              :      * trying to calculate the size more accurately than that.
     415              :      */
     416          112 :     result = (text *) palloc(plen * 2 + VARHDRSZ);
     417          112 :     r = VARDATA(result);
     418              : 
     419          112 :     if (elen == 0)
     420              :     {
     421              :         /*
     422              :          * No escape character is wanted.  Double any backslashes in the
     423              :          * pattern to make them act like ordinary characters.
     424              :          */
     425           64 :         while (plen > 0)
     426              :         {
     427           48 :             if (*p == '\\')
     428            0 :                 *r++ = '\\';
     429           96 :             CopyAdvChar(r, p, plen);
     430              :         }
     431              :     }
     432              :     else
     433              :     {
     434              :         /*
     435              :          * The specified escape must be only a single character.
     436              :          */
     437           96 :         NextChar(e, elen);
     438           96 :         if (elen != 0)
     439            0 :             ereport(ERROR,
     440              :                     (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     441              :                      errmsg("invalid escape string"),
     442              :                      errhint("Escape string must be empty or one character.")));
     443              : 
     444           96 :         e = VARDATA_ANY(esc);
     445           96 :         elen = VARSIZE_ANY_EXHDR(esc);
     446              : 
     447              :         /*
     448              :          * If specified escape is '\', just copy the pattern as-is.
     449              :          */
     450           96 :         if (*e == '\\')
     451              :         {
     452            0 :             memcpy(result, pat, VARSIZE_ANY(pat));
     453            0 :             return result;
     454              :         }
     455              : 
     456              :         /*
     457              :          * Otherwise, convert occurrences of the specified escape character to
     458              :          * '\', and double occurrences of '\' --- unless they immediately
     459              :          * follow an escape character!
     460              :          */
     461           96 :         afterescape = false;
     462          582 :         while (plen > 0)
     463              :         {
     464          486 :             if (CHAREQ(p, plen, e, elen) && !afterescape)
     465              :             {
     466           96 :                 *r++ = '\\';
     467           96 :                 NextChar(p, plen);
     468           96 :                 afterescape = true;
     469              :             }
     470          390 :             else if (*p == '\\')
     471              :             {
     472            0 :                 *r++ = '\\';
     473            0 :                 if (!afterescape)
     474            0 :                     *r++ = '\\';
     475            0 :                 NextChar(p, plen);
     476            0 :                 afterescape = false;
     477              :             }
     478              :             else
     479              :             {
     480          762 :                 CopyAdvChar(r, p, plen);
     481          390 :                 afterescape = false;
     482              :             }
     483              :         }
     484              :     }
     485              : 
     486          112 :     SET_VARSIZE(result, r - ((char *) result));
     487              : 
     488          112 :     return result;
     489              : }
     490              : #endif                          /* do_like_escape */
     491              : 
     492              : #ifdef CHAREQ
     493              : #undef CHAREQ
     494              : #endif
     495              : 
     496              : #undef NextChar
     497              : #undef CopyAdvChar
     498              : #undef MatchText
     499              : 
     500              : #ifdef do_like_escape
     501              : #undef do_like_escape
     502              : #endif
     503              : 
     504              : #undef GETCHAR
     505              : 
     506              : #ifdef MATCH_LOWER
     507              : #undef MATCH_LOWER
     508              : 
     509              : #endif

Generated by: LCOV version 2.0-1