LCOV - code coverage report
Current view: top level - src/backend/utils/adt - like_match.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 118 131 90.1 %
Date: 2025-12-23 12:18:01 Functions: 5 6 83.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * like_match.c
       4             :  *    LIKE pattern matching internal code.
       5             :  *
       6             :  * This file is included by like.c four times, to provide matching code for
       7             :  * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
       8             :  * and (4) case insensitive matches in single-byte encodings.
       9             :  * (UTF8 is a special case because we can use a much more efficient version
      10             :  * of NextChar than can be used for general multi-byte encodings.)
      11             :  *
      12             :  * Before the inclusion, we need to define the following macros:
      13             :  *
      14             :  * NextChar
      15             :  * MatchText - name of function wanted
      16             :  * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
      17             :  * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars
      18             :  *
      19             :  * Copyright (c) 1996-2025, PostgreSQL Global Development Group
      20             :  *
      21             :  * IDENTIFICATION
      22             :  *  src/backend/utils/adt/like_match.c
      23             :  *
      24             :  *-------------------------------------------------------------------------
      25             :  */
      26             : 
      27             : /*
      28             :  *  Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
      29             :  *  Rich $alz is now <rsalz@bbn.com>.
      30             :  *  Special thanks to Lars Mathiesen <thorinn@diku.dk> for the
      31             :  *  LIKE_ABORT code.
      32             :  *
      33             :  *  This code was shamelessly stolen from the "pql" code by myself and
      34             :  *  slightly modified :)
      35             :  *
      36             :  *  All references to the word "star" were replaced by "percent"
      37             :  *  All references to the word "wild" were replaced by "like"
      38             :  *
      39             :  *  All the nice shell RE matching stuff was replaced by just "_" and "%"
      40             :  *
      41             :  *  As I don't have a copy of the SQL standard handy I wasn't sure whether
      42             :  *  to leave in the '\' escape character handling.
      43             :  *
      44             :  *  Keith Parks. <keith@mtcc.demon.co.uk>
      45             :  *
      46             :  *  SQL lets you specify the escape character by saying
      47             :  *  LIKE <pattern> ESCAPE <escape character>. We are a small operation
      48             :  *  so we force you to use '\'. - ay 7/95
      49             :  *
      50             :  *  Now we have the like_escape() function that converts patterns with
      51             :  *  any specified escape character (or none at all) to the internal
      52             :  *  default escape character, which is still '\'. - tgl 9/2000
      53             :  *
      54             :  * The code is rewritten to avoid requiring null-terminated strings,
      55             :  * which in turn allows us to leave out some memcpy() operations.
      56             :  * This code should be faster and take less memory, but no promises...
      57             :  * - thomas 2000-08-06
      58             :  */
      59             : 
      60             : 
      61             : /*--------------------
      62             :  *  Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
      63             :  *
      64             :  *  LIKE_TRUE: they match
      65             :  *  LIKE_FALSE: they don't match
      66             :  *  LIKE_ABORT: not only don't they match, but the text is too short.
      67             :  *
      68             :  * If LIKE_ABORT is returned, then no suffix of the text can match the
      69             :  * pattern either, so an upper-level % scan can stop scanning now.
      70             :  *--------------------
      71             :  */
      72             : 
      73             : /*
      74             :  * MATCH_LOWER is defined for ILIKE in the C locale as an optimization. Other
      75             :  * locales must casefold the inputs before matching.  With MATCH_LOWER, GETCHAR passes each compared byte through pg_ascii_tolower(); otherwise it is the identity.
      76             :  */
      77             : #ifdef MATCH_LOWER
      78             : #define GETCHAR(t) pg_ascii_tolower(t)
      79             : #else
      80             : #define GETCHAR(t) (t)
      81             : #endif
      82             : 
      83             : static int
      84     1281820 : MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
      85             : {
      86             :     /* Fast path for match-everything pattern */
      87     1281820 :     if (plen == 1 && *p == '%')
      88         244 :         return LIKE_TRUE;
      89             : 
      90             :     /* Since this function recurses, it could be driven to stack overflow */
      91     1281576 :     check_stack_depth();
      92             : 
      93             :     /*
      94             :      * In this loop, we advance by char when matching wildcards (and thus on
      95             :      * recursive entry to this function we are properly char-synced). On other
      96             :      * occasions it is safe to advance by byte, as the text and pattern will
      97             :      * be in lockstep. This allows us to perform all comparisons between the
      98             :      * text and pattern on a byte by byte basis, even for multi-byte
      99             :      * encodings.
     100             :      */
     101     1907744 :     while (tlen > 0 && plen > 0)
     102             :     {
     103     1897202 :         if (*p == '\\')
     104             :         {
     105             :             /* Next pattern byte must match literally, whatever it is */
     106       12966 :             NextByte(p, plen);
     107             :             /* ... and there had better be one, per SQL standard */
     108       12966 :             if (plen <= 0)
     109           0 :                 ereport(ERROR,
     110             :                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     111             :                          errmsg("LIKE pattern must not end with escape character")));
     112       12966 :             if (GETCHAR(*p) != GETCHAR(*t))
     113        3442 :                 return LIKE_FALSE;
     114             :         }
     115     1884236 :         else if (*p == '%')
     116             :         {
     117             :             char        firstpat;
     118             : 
     119             :             /*
     120             :              * % processing is essentially a search for a text position at
     121             :              * which the remainder of the text matches the remainder of the
     122             :              * pattern, using a recursive call to check each potential match.
     123             :              *
     124             :              * If there are wildcards immediately following the %, we can skip
     125             :              * over them first, using the idea that any sequence of N _'s and
     126             :              * one or more %'s is equivalent to N _'s and one % (ie, it will
     127             :              * match any sequence of at least N text characters).  In this way
     128             :              * we will always run the recursive search loop using a pattern
     129             :              * fragment that begins with a literal character-to-match, thereby
     130             :              * not recursing more than we have to.
     131             :              */
     132      167456 :             NextByte(p, plen);
     133             : 
     134      167954 :             while (plen > 0)
     135             :             {
     136      132080 :                 if (*p == '%')
     137          18 :                     NextByte(p, plen);
     138      132062 :                 else if (*p == '_')
     139             :                 {
     140             :                     /* If not enough text left to match the pattern, ABORT */
     141         486 :                     if (tlen <= 0)
     142           6 :                         return LIKE_ABORT;
     143         480 :                     NextChar(t, tlen);
     144         480 :                     NextByte(p, plen);
     145             :                 }
     146             :                 else
     147      131576 :                     break;      /* Reached a non-wildcard pattern char */
     148             :             }
     149             : 
     150             :             /*
     151             :              * If we're at end of pattern, match: we have a trailing % which
     152             :              * matches any remaining text string.
     153             :              */
     154      167450 :             if (plen <= 0)
     155       35874 :                 return LIKE_TRUE;
     156             : 
     157             :             /*
     158             :              * Otherwise, scan for a text position at which we can match the
     159             :              * rest of the pattern.  The first remaining pattern char is known
     160             :              * to be a regular or escaped literal character, so we can compare
     161             :              * the first pattern byte to each text byte to avoid recursing
     162             :              * more than we have to.  This fact also guarantees that we don't
     163             :              * have to consider a match to the zero-length substring at the
     164             :              * end of the text.  With a nondeterministic collation, we can't
     165             :              * rely on the first bytes being equal, so we have to recurse in
     166             :              * any case.
     167             :              */
     168      131576 :             if (*p == '\\')
     169             :             {
     170           4 :                 if (plen < 2)
     171           0 :                     ereport(ERROR,
     172             :                             (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     173             :                              errmsg("LIKE pattern must not end with escape character")));
     174           4 :                 firstpat = GETCHAR(p[1]);
     175             :             }
     176             :             else
     177      131572 :                 firstpat = GETCHAR(*p);
     178             : 
     179     3840798 :             while (tlen > 0)
     180             :             {
     181     3738234 :                 if (GETCHAR(*t) == firstpat || (locale && !locale->deterministic))
     182             :                 {
     183      108140 :                     int         matched = MatchText(t, tlen, p, plen, locale);
     184             : 
     185      108140 :                     if (matched != LIKE_FALSE)
     186       29012 :                         return matched; /* TRUE or ABORT */
     187             :                 }
     188             : 
     189     3709270 :                 NextChar(t, tlen);
     190             :             }
     191             : 
     192             :             /*
     193             :              * End of text with no match, so no point in trying later places
     194             :              * to start matching this pattern.
     195             :              */
     196      102564 :             return LIKE_ABORT;
     197             :         }
     198     1716780 :         else if (*p == '_')
     199             :         {
     200             :             /* _ matches any single character, and we know there is one */
     201       13212 :             NextChar(t, tlen);
     202       13194 :             NextByte(p, plen);
     203       13194 :             continue;
     204             :         }
     205     1703586 :         else if (locale && !locale->deterministic)
     206             :         {
     207             :             /*
     208             :              * For nondeterministic locales, we find the next substring of the
     209             :              * pattern that does not contain wildcards and try to find a
     210             :              * matching substring in the text.  Crucially, we cannot do this
     211             :              * character by character, as in the normal case, but must do it
     212             :              * substring by substring, partitioned by the wildcard characters.
     213             :              * (This is per SQL standard.)
     214             :              */
     215             :             const char *p1;
     216             :             size_t      p1len;
     217             :             const char *t1;
     218             :             size_t      t1len;
     219             :             bool        found_escape;
     220             :             const char *subpat;
     221             :             size_t      subpatlen;
     222         282 :             char       *buf = NULL;
     223             : 
     224             :             /*
     225             :              * Determine next substring of pattern without wildcards.  p is
     226             :              * the start of the subpattern, p1 is one past the last byte. Also
     227             :              * track if we found an escape character.
     228             :              */
     229         282 :             p1 = p;
     230         282 :             p1len = plen;
     231         282 :             found_escape = false;
     232         834 :             while (p1len > 0)
     233             :             {
     234         690 :                 if (*p1 == '\\')
     235             :                 {
     236          12 :                     found_escape = true;
     237          12 :                     NextByte(p1, p1len);
     238          12 :                     if (p1len == 0)
     239           6 :                         ereport(ERROR,
     240             :                                 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     241             :                                  errmsg("LIKE pattern must not end with escape character")));
     242             :                 }
     243         678 :                 else if (*p1 == '_' || *p1 == '%')
     244             :                     break;
     245         552 :                 NextByte(p1, p1len);
     246             :             }
     247             : 
     248             :             /*
     249             :              * If we found an escape character, then make an unescaped copy of
     250             :              * the subpattern.  NOTE(review): the copy loop below drops every backslash byte, so an escaped backslash pair leaves no literal backslash in the copy -- confirm this is intended.
     251             :              */
     252         276 :             if (found_escape)
     253             :             {
     254             :                 char       *b;
     255             : 
     256           6 :                 b = buf = palloc(p1 - p);
     257          30 :                 for (const char *c = p; c < p1; c++)
     258             :                 {
     259          24 :                     if (*c == '\\')
     260             :                         ;
     261             :                     else
     262          18 :                         *(b++) = *c;
     263             :                 }
     264             : 
     265           6 :                 subpat = buf;
     266           6 :                 subpatlen = b - buf;
     267             :             }
     268             :             else
     269             :             {
     270         270 :                 subpat = p;
     271         270 :                 subpatlen = p1 - p;
     272             :             }
     273             : 
     274             :             /*
     275             :              * Shortcut: If this is the end of the pattern, then the rest of
     276             :              * the text has to match the rest of the pattern.
     277             :              */
     278         276 :             if (p1len == 0)
     279             :             {
     280             :                 int         cmp;
     281             : 
     282         144 :                 cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale);
     283             : 
     284         144 :                 if (buf)
     285           6 :                     pfree(buf);
     286         144 :                 if (cmp == 0)
     287          90 :                     return LIKE_TRUE;
     288             :                 else
     289          54 :                     return LIKE_FALSE;
     290             :             }
     291             : 
     292             :             /*
     293             :              * Now build a substring of the text and try to match it against
     294             :              * the subpattern.  t is the start of the text, t1 is one past the
     295             :              * last byte.  We start with a zero-length string.
     296             :              */
     297         132 :             t1 = t;
     298         132 :             t1len = tlen;
     299             :             for (;;)
     300         258 :             {
     301             :                 int         cmp;
     302             : 
     303         390 :                 CHECK_FOR_INTERRUPTS();
     304             : 
     305         390 :                 cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
     306             : 
     307             :                 /*
     308             :                  * If we found a match, we have to test if the rest of pattern
     309             :                  * can match against the rest of the string.  Otherwise we
     310             :                  * have to continue here and try matching with a longer substring.
     311             :                  * (This is similar to the recursion for the '%' wildcard
     312             :                  * above.)
     313             :                  *
     314             :                  * Note that we can't just wind forward p and t and continue
     315             :                  * with the main loop.  This would fail for example with
     316             :                  *
     317             :                  * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents
     318             :                  *
     319             :                  * You'd find that t=\0061 matches p=\00E4, but then the rest
     320             :                  * won't match; but t=\0061\0308 also matches p=\00E4, and
     321             :                  * then the rest will match.
     322             :                  */
     323         390 :                 if (cmp == 0)
     324             :                 {
     325         102 :                     int         matched = MatchText(t1, t1len, p1, p1len, locale);
     326             : 
     327         102 :                     if (matched == LIKE_TRUE)
     328             :                     {
     329          90 :                         if (buf)
     330           0 :                             pfree(buf);
     331          90 :                         return matched;
     332             :                     }
     333             :                 }
     334             : 
     335             :                 /*
     336             :                  * Didn't match.  If we used up the whole text, then the match
     337             :                  * fails.  Otherwise, try again with a longer substring.
     338             :                  */
     339         300 :                 if (t1len == 0)
     340             :                 {
     341          42 :                     if (buf)
     342           0 :                         pfree(buf);
     343          42 :                     return LIKE_FALSE;
     344             :                 }
     345             :                 else
     346         288 :                     NextChar(t1, t1len);
     347             :             }
     348             :         }
     349     1703304 :         else if (GETCHAR(*p) != GETCHAR(*t))
     350             :         {
     351             :             /* non-wildcard pattern char fails to match text char */
     352     1099854 :             return LIKE_FALSE;
     353             :         }
     354             : 
     355             :         /*
     356             :          * Pattern and text match, so advance.
     357             :          *
     358             :          * It is safe to use NextByte instead of NextChar here, even for
     359             :          * multi-byte character sets, because we are not following immediately
     360             :          * after a wildcard character. If we are in the middle of a multibyte
     361             :          * character, we must already have matched at least one byte of the
     362             :          * character from both text and pattern; so we cannot get out-of-sync
     363             :          * on character boundaries.  And we know that no backend-legal
     364             :          * encoding allows ASCII characters such as '%' to appear as non-first
     365             :          * bytes of characters, so we won't mistakenly detect a new wildcard.
     366             :          */
     367      612974 :         NextByte(t, tlen);
     368      612974 :         NextByte(p, plen);
     369             :     }
     370             : 
     371       10542 :     if (tlen > 0)
     372         306 :         return LIKE_FALSE;      /* end of pattern, but not of text */
     373             : 
     374             :     /*
     375             :      * End of text, but perhaps not of pattern.  Match iff the remaining
     376             :      * pattern can match a zero-length string, ie, it's zero or more %'s.
     377             :      */
     378       10828 :     while (plen > 0 && *p == '%')
     379         592 :         NextByte(p, plen);
     380       10236 :     if (plen <= 0)
     381        4672 :         return LIKE_TRUE;
     382             : 
     383             :     /*
     384             :      * End of text with no match, so no point in trying later places to start
     385             :      * matching this pattern.
     386             :      */
     387        5564 :     return LIKE_ABORT;
     388             : }                               /* MatchText() */
     389             : 
     390             : /*
     391             :  * like_escape() --- given a pattern and an ESCAPE string,
     392             :  * convert the pattern to use Postgres' standard backslash escape convention.
     393             :  */
     394             : #ifdef do_like_escape
     395             : 
     396             : static text *
     397         224 : do_like_escape(text *pat, text *esc)
     398             : {
     399             :     text       *result;
     400             :     char       *p,
     401             :                *e,
     402             :                *r;
     403             :     int         plen,
     404             :                 elen;
     405             :     bool        afterescape;
     406             : 
     407         224 :     p = VARDATA_ANY(pat);
     408         224 :     plen = VARSIZE_ANY_EXHDR(pat);
     409         224 :     e = VARDATA_ANY(esc);
     410         224 :     elen = VARSIZE_ANY_EXHDR(esc);
     411             : 
     412             :     /*
     413             :      * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
     414             :      * trying to calculate the size more accurately than that.
     415             :      */
     416         224 :     result = (text *) palloc(plen * 2 + VARHDRSZ);
     417         224 :     r = VARDATA(result);
     418             : 
     419         224 :     if (elen == 0)
     420             :     {
     421             :         /*
     422             :          * No escape character is wanted.  Double any backslashes in the
     423             :          * pattern to make them act like ordinary characters.
     424             :          */
     425         128 :         while (plen > 0)
     426             :         {
     427          96 :             if (*p == '\\')
     428           0 :                 *r++ = '\\';
     429         192 :             CopyAdvChar(r, p, plen);
     430             :         }
     431             :     }
     432             :     else
     433             :     {
     434             :         /*
     435             :          * The specified escape must be only a single character: step over one character and require that nothing is left.
     436             :          */
     437         192 :         NextChar(e, elen);
     438         192 :         if (elen != 0)
     439           0 :             ereport(ERROR,
     440             :                     (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     441             :                      errmsg("invalid escape string"),
     442             :                      errhint("Escape string must be empty or one character.")));
     443             : 
     444         192 :         e = VARDATA_ANY(esc);    /* rewind e to the start of the escape string after the NextChar above */
     445             : 
     446             :         /*
     447             :          * If the specified escape is '\', the pattern already uses the backslash convention, so just copy it as-is.
     448             :          */
     449         192 :         if (*e == '\\')
     450             :         {
     451           0 :             memcpy(result, pat, VARSIZE_ANY(pat));
     452           0 :             return result;
     453             :         }
     454             : 
     455             :         /*
     456             :          * Otherwise, convert occurrences of the specified escape character to
     457             :          * '\', and double occurrences of '\' --- unless they immediately
     458             :          * follow an escape character!
     459             :          */
     460         192 :         afterescape = false;
     461        1164 :         while (plen > 0)
     462             :         {
     463         972 :             if (CHAREQ(p, e) && !afterescape)
     464             :             {
     465         192 :                 *r++ = '\\';
     466         192 :                 NextChar(p, plen);
     467         192 :                 afterescape = true;
     468             :             }
     469         780 :             else if (*p == '\\')
     470             :             {
     471           0 :                 *r++ = '\\';
     472           0 :                 if (!afterescape)
     473           0 :                     *r++ = '\\';
     474           0 :                 NextChar(p, plen);
     475           0 :                 afterescape = false;
     476             :             }
     477             :             else
     478             :             {
     479        1524 :                 CopyAdvChar(r, p, plen);
     480         780 :                 afterescape = false;
     481             :             }
     482             :         }
     483             :     }
     484             : 
     485         224 :     SET_VARSIZE(result, r - ((char *) result));    /* r is just past the last byte written */
     486             : 
     487         224 :     return result;
     488             : }
     489             : #endif                          /* do_like_escape */
     490             : 
     491             : #ifdef CHAREQ
     492             : #undef CHAREQ
     493             : #endif                          /* CHAREQ */
     494             : 
     495             : #undef NextChar
     496             : #undef CopyAdvChar
     497             : #undef MatchText
     498             : 
     499             : #ifdef do_like_escape
     500             : #undef do_like_escape
     501             : #endif                          /* do_like_escape */
     502             : 
     503             : #undef GETCHAR
     504             : 
     505             : #ifdef MATCH_LOWER
     506             : #undef MATCH_LOWER
     507             : 
     508             : #endif                          /* MATCH_LOWER */

Generated by: LCOV version 1.16