LCOV - code coverage report
Current view: top level - src/backend/utils/adt - like_match.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 118 131 90.1 %
Date: 2024-12-12 17:14:55 Functions: 4 6 66.7 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * like_match.c
       4             :  *    LIKE pattern matching internal code.
       5             :  *
       6             :  * This file is included by like.c four times, to provide matching code for
       7             :  * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
       8             :  * and (4) case insensitive matches in single-byte encodings.
       9             :  * (UTF8 is a special case because we can use a much more efficient version
      10             :  * of NextChar than can be used for general multi-byte encodings.)
      11             :  *
      12             :  * Before the inclusion, we need to define the following macros:
      13             :  *
      14             :  * NextChar
      15             :  * MatchText - to name of function wanted
      16             :  * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
      17             :  * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars
      18             :  *
      19             :  * Copyright (c) 1996-2024, PostgreSQL Global Development Group
      20             :  *
      21             :  * IDENTIFICATION
      22             :  *  src/backend/utils/adt/like_match.c
      23             :  *
      24             :  *-------------------------------------------------------------------------
      25             :  */
      26             : 
      27             : /*
      28             :  *  Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
      29             :  *  Rich $alz is now <rsalz@bbn.com>.
      30             :  *  Special thanks to Lars Mathiesen <thorinn@diku.dk> for the
      31             :  *  LIKE_ABORT code.
      32             :  *
      33             :  *  This code was shamelessly stolen from the "pql" code by myself and
      34             :  *  slightly modified :)
      35             :  *
      36             :  *  All references to the word "star" were replaced by "percent"
      37             :  *  All references to the word "wild" were replaced by "like"
      38             :  *
      39             :  *  All the nice shell RE matching stuff was replaced by just "_" and "%"
      40             :  *
      41             :  *  As I don't have a copy of the SQL standard handy I wasn't sure whether
      42             :  *  to leave in the '\' escape character handling.
      43             :  *
      44             :  *  Keith Parks. <keith@mtcc.demon.co.uk>
      45             :  *
      46             :  *  SQL lets you specify the escape character by saying
      47             :  *  LIKE <pattern> ESCAPE <escape character>. We are a small operation
      48             :  *  so we force you to use '\'. - ay 7/95
      49             :  *
      50             :  *  Now we have the like_escape() function that converts patterns with
      51             :  *  any specified escape character (or none at all) to the internal
      52             :  *  default escape character, which is still '\'. - tgl 9/2000
      53             :  *
      54             :  * The code is rewritten to avoid requiring null-terminated strings,
      55             :  * which in turn allows us to leave out some memcpy() operations.
      56             :  * This code should be faster and take less memory, but no promises...
      57             :  * - thomas 2000-08-06
      58             :  */
      59             : 
      60             : 
      61             : /*--------------------
      62             :  *  Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
      63             :  *
      64             :  *  LIKE_TRUE: they match
      65             :  *  LIKE_FALSE: they don't match
      66             :  *  LIKE_ABORT: not only don't they match, but the text is too short.
      67             :  *
      68             :  * If LIKE_ABORT is returned, then no suffix of the text can match the
      69             :  * pattern either, so an upper-level % scan can stop scanning now.
      70             :  *--------------------
      71             :  */
      72             : 
      73             : #ifdef MATCH_LOWER
      74             : #define GETCHAR(t, locale) MATCH_LOWER(t, locale)
      75             : #else
      76             : #define GETCHAR(t, locale) (t)
      77             : #endif
      78             : 
      79             : static int
      80     1201096 : MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
      81             : {
      82             :     /* Fast path for match-everything pattern */
      83     1201096 :     if (plen == 1 && *p == '%')
      84         244 :         return LIKE_TRUE;
      85             : 
      86             :     /* Since this function recurses, it could be driven to stack overflow */
      87     1200852 :     check_stack_depth();
      88             : 
      89             :     /*
      90             :      * In this loop, we advance by char when matching wildcards (and thus on
      91             :      * recursive entry to this function we are properly char-synced). On other
      92             :      * occasions it is safe to advance by byte, as the text and pattern will
      93             :      * be in lockstep. This allows us to perform all comparisons between the
      94             :      * text and pattern on a byte by byte basis, even for multi-byte
      95             :      * encodings.
      96             :      */
      97     1804194 :     while (tlen > 0 && plen > 0)
      98             :     {
      99     1793752 :         if (*p == '\\')
     100             :         {
     101             :             /* Next pattern byte must match literally, whatever it is */
     102       12382 :             NextByte(p, plen);
     103             :             /* ... and there had better be one, per SQL standard */
     104       12382 :             if (plen <= 0)
     105           0 :                 ereport(ERROR,
     106             :                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     107             :                          errmsg("LIKE pattern must not end with escape character")));
     108       12382 :             if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
     109        3408 :                 return LIKE_FALSE;
     110             :         }
     111     1781370 :         else if (*p == '%')
     112             :         {
     113             :             char        firstpat;
     114             : 
     115             :             /*
     116             :              * % processing is essentially a search for a text position at
     117             :              * which the remainder of the text matches the remainder of the
     118             :              * pattern, using a recursive call to check each potential match.
     119             :              *
     120             :              * If there are wildcards immediately following the %, we can skip
     121             :              * over them first, using the idea that any sequence of N _'s and
     122             :              * one or more %'s is equivalent to N _'s and one % (ie, it will
     123             :              * match any sequence of at least N text characters).  In this way
     124             :              * we will always run the recursive search loop using a pattern
     125             :              * fragment that begins with a literal character-to-match, thereby
     126             :              * not recursing more than we have to.
     127             :              */
     128      165564 :             NextByte(p, plen);
     129             : 
     130      166070 :             while (plen > 0)
     131             :             {
     132      131668 :                 if (*p == '%')
     133          18 :                     NextByte(p, plen);
     134      131650 :                 else if (*p == '_')
     135             :                 {
     136             :                     /* If not enough text left to match the pattern, ABORT */
     137         494 :                     if (tlen <= 0)
     138           6 :                         return LIKE_ABORT;
     139         488 :                     NextChar(t, tlen);
     140         488 :                     NextByte(p, plen);
     141             :                 }
     142             :                 else
     143      131156 :                     break;      /* Reached a non-wildcard pattern char */
     144             :             }
     145             : 
     146             :             /*
     147             :              * If we're at end of pattern, match: we have a trailing % which
     148             :              * matches any remaining text string.
     149             :              */
     150      165558 :             if (plen <= 0)
     151       34402 :                 return LIKE_TRUE;
     152             : 
     153             :             /*
     154             :              * Otherwise, scan for a text position at which we can match the
     155             :              * rest of the pattern.  The first remaining pattern char is known
     156             :              * to be a regular or escaped literal character, so we can compare
     157             :              * the first pattern byte to each text byte to avoid recursing
     158             :              * more than we have to.  This fact also guarantees that we don't
     159             :              * have to consider a match to the zero-length substring at the
     160             :              * end of the text.  With a nondeterministic collation, we can't
     161             :              * rely on the first bytes being equal, so we have to recurse in
     162             :              * any case.
     163             :              */
     164      131156 :             if (*p == '\\')
     165             :             {
     166           4 :                 if (plen < 2)
     167           0 :                     ereport(ERROR,
     168             :                             (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     169             :                              errmsg("LIKE pattern must not end with escape character")));
     170           4 :                 firstpat = GETCHAR(p[1], locale);
     171             :             }
     172             :             else
     173      131152 :                 firstpat = GETCHAR(*p, locale);
     174             : 
     175     3841224 :             while (tlen > 0)
     176             :             {
     177     3738996 :                 if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic))
     178             :                 {
     179      107342 :                     int         matched = MatchText(t, tlen, p, plen, locale);
     180             : 
     181      107342 :                     if (matched != LIKE_FALSE)
     182       28928 :                         return matched; /* TRUE or ABORT */
     183             :                 }
     184             : 
     185     3710116 :                 NextChar(t, tlen);
     186             :             }
     187             : 
     188             :             /*
     189             :              * End of text with no match, so no point in trying later places
     190             :              * to start matching this pattern.
     191             :              */
     192      102228 :             return LIKE_ABORT;
     193             :         }
     194     1615806 :         else if (*p == '_')
     195             :         {
     196             :             /* _ matches any single character, and we know there is one */
     197       10736 :             NextChar(t, tlen);
     198       10718 :             NextByte(p, plen);
     199       10718 :             continue;
     200             :         }
     201     1605088 :         else if (locale && !locale->deterministic)
     202             :         {
     203             :             /*
     204             :              * For nondeterministic locales, we find the next substring of the
     205             :              * pattern that does not contain wildcards and try to find a
     206             :              * matching substring in the text.  Crucially, we cannot do this
     207             :              * character by character, as in the normal case, but must do it
     208             :              * substring by substring, partitioned by the wildcard characters.
     209             :              * (This is per SQL standard.)
     210             :              */
     211             :             const char *p1;
     212             :             size_t      p1len;
     213             :             const char *t1;
     214             :             size_t      t1len;
     215             :             bool        found_escape;
     216             :             const char *subpat;
     217             :             size_t      subpatlen;
     218         282 :             char       *buf = NULL;
     219             : 
     220             :             /*
     221             :              * Determine next substring of pattern without wildcards.  p is
     222             :              * the start of the subpattern, p1 is one past the last byte. Also
     223             :              * track if we found an escape character.
     224             :              */
     225         282 :             p1 = p;
     226         282 :             p1len = plen;
     227         282 :             found_escape = false;
     228         810 :             while (p1len > 0)
     229             :             {
     230         666 :                 if (*p1 == '\\')
     231             :                 {
     232          12 :                     found_escape = true;
     233          12 :                     NextByte(p1, p1len);
     234          12 :                     if (p1len == 0)
     235           6 :                         ereport(ERROR,
     236             :                                 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     237             :                                  errmsg("LIKE pattern must not end with escape character")));
     238             :                 }
     239         654 :                 else if (*p1 == '_' || *p1 == '%')
     240             :                     break;
     241         528 :                 NextByte(p1, p1len);
     242             :             }
     243             : 
     244             :             /*
     245             :              * If we found an escape character, then make an unescaped copy of
     246             :              * the subpattern.
     247             :              */
     248         276 :             if (found_escape)
     249             :             {
     250             :                 char       *b;
     251             : 
     252           6 :                 b = buf = palloc(p1 - p);
     253          30 :                 for (const char *c = p; c < p1; c++)
     254             :                 {
     255          24 :                     if (*c == '\\')
     256             :                         ;
     257             :                     else
     258          18 :                         *(b++) = *c;
     259             :                 }
     260             : 
     261           6 :                 subpat = buf;
     262           6 :                 subpatlen = b - buf;
     263             :             }
     264             :             else
     265             :             {
     266         270 :                 subpat = p;
     267         270 :                 subpatlen = p1 - p;
     268             :             }
     269             : 
     270             :             /*
     271             :              * Shortcut: If this is the end of the pattern, then the rest of
     272             :              * the text has to match the rest of the pattern.
     273             :              */
     274         276 :             if (p1len == 0)
     275             :             {
     276             :                 int         cmp;
     277             : 
     278         144 :                 cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale);
     279             : 
     280         144 :                 if (buf)
     281           6 :                     pfree(buf);
     282         144 :                 if (cmp == 0)
     283          90 :                     return LIKE_TRUE;
     284             :                 else
     285          54 :                     return LIKE_FALSE;
     286             :             }
     287             : 
     288             :             /*
     289             :              * Now build a substring of the text and try to match it against
     290             :              * the subpattern.  t is the start of the text, t1 is one past the
     291             :              * last byte.  We start with a zero-length string.
     292             :              */
     293         132 :             t1 = t;
     294         132 :             t1len = tlen;
     295             :             for (;;)
     296         258 :             {
     297             :                 int         cmp;
     298             : 
     299         390 :                 CHECK_FOR_INTERRUPTS();
     300             : 
     301         390 :                 cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
     302             : 
     303             :                 /*
     304             :                  * If we found a match, we have to test if the rest of pattern
     305             :                  * can match against the rest of the string.  Otherwise we
     306             :                  * have to continue here try matching with a longer substring.
     307             :                  * (This is similar to the recursion for the '%' wildcard
     308             :                  * above.)
     309             :                  *
     310             :                  * Note that we can't just wind forward p and t and continue
     311             :                  * with the main loop.  This would fail for example with
     312             :                  *
     313             :                  * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents
     314             :                  *
     315             :                  * You'd find that t=\0061 matches p=\00E4, but then the rest
     316             :                  * won't match; but t=\0061\0308 also matches p=\00E4, and
     317             :                  * then the rest will match.
     318             :                  */
     319         390 :                 if (cmp == 0)
     320             :                 {
     321         102 :                     int         matched = MatchText(t1, t1len, p1, p1len, locale);
     322             : 
     323         102 :                     if (matched == LIKE_TRUE)
     324             :                     {
     325          90 :                         if (buf)
     326           0 :                             pfree(buf);
     327          90 :                         return matched;
     328             :                     }
     329             :                 }
     330             : 
     331             :                 /*
     332             :                  * Didn't match.  If we used up the whole text, then the match
     333             :                  * fails.  Otherwise, try again with a longer substring.
     334             :                  */
     335         300 :                 if (t1len == 0)
     336             :                 {
     337          42 :                     if (buf)
     338           0 :                         pfree(buf);
     339          42 :                     return LIKE_FALSE;
     340             :                 }
     341             :                 else
     342         288 :                     NextChar(t1, t1len);
     343             :             }
     344             :         }
     345     1604806 :         else if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
     346             :         {
     347             :             /* non-wildcard pattern char fails to match text char */
     348     1021156 :             return LIKE_FALSE;
     349             :         }
     350             : 
     351             :         /*
     352             :          * Pattern and text match, so advance.
     353             :          *
     354             :          * It is safe to use NextByte instead of NextChar here, even for
     355             :          * multi-byte character sets, because we are not following immediately
     356             :          * after a wildcard character. If we are in the middle of a multibyte
     357             :          * character, we must already have matched at least one byte of the
     358             :          * character from both text and pattern; so we cannot get out-of-sync
     359             :          * on character boundaries.  And we know that no backend-legal
     360             :          * encoding allows ASCII characters such as '%' to appear as non-first
     361             :          * bytes of characters, so we won't mistakenly detect a new wildcard.
     362             :          */
     363      592624 :         NextByte(t, tlen);
     364      592624 :         NextByte(p, plen);
     365             :     }
     366             : 
     367       10442 :     if (tlen > 0)
     368         318 :         return LIKE_FALSE;      /* end of pattern, but not of text */
     369             : 
     370             :     /*
     371             :      * End of text, but perhaps not of pattern.  Match iff the remaining
     372             :      * pattern can match a zero-length string, ie, it's zero or more %'s.
     373             :      */
     374       10696 :     while (plen > 0 && *p == '%')
     375         572 :         NextByte(p, plen);
     376       10124 :     if (plen <= 0)
     377        4644 :         return LIKE_TRUE;
     378             : 
     379             :     /*
     380             :      * End of text with no match, so no point in trying later places to start
     381             :      * matching this pattern.
     382             :      */
     383        5480 :     return LIKE_ABORT;
     384             : }                               /* MatchText() */
     385             : 
     386             : /*
     387             :  * like_escape() --- given a pattern and an ESCAPE string,
     388             :  * convert the pattern to use Postgres' standard backslash escape convention.
     389             :  */
     390             : #ifdef do_like_escape
     391             : 
     392             : static text *
     393         224 : do_like_escape(text *pat, text *esc)
     394             : {
     395             :     text       *result;
     396             :     char       *p,
     397             :                *e,
     398             :                *r;
     399             :     int         plen,
     400             :                 elen;
     401             :     bool        afterescape;
     402             : 
     403         224 :     p = VARDATA_ANY(pat);
     404         224 :     plen = VARSIZE_ANY_EXHDR(pat);
     405         224 :     e = VARDATA_ANY(esc);
     406         224 :     elen = VARSIZE_ANY_EXHDR(esc);
     407             : 
     408             :     /*
     409             :      * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
     410             :      * trying to calculate the size more accurately than that.
     411             :      */
     412         224 :     result = (text *) palloc(plen * 2 + VARHDRSZ);
     413         224 :     r = VARDATA(result);
     414             : 
     415         224 :     if (elen == 0)
     416             :     {
     417             :         /*
     418             :          * No escape character is wanted.  Double any backslashes in the
     419             :          * pattern to make them act like ordinary characters.
     420             :          */
     421         128 :         while (plen > 0)
     422             :         {
     423          96 :             if (*p == '\\')
     424           0 :                 *r++ = '\\';
     425         192 :             CopyAdvChar(r, p, plen);
     426             :         }
     427             :     }
     428             :     else
     429             :     {
     430             :         /*
     431             :          * The specified escape must be only a single character.
     432             :          */
     433         192 :         NextChar(e, elen);
     434         192 :         if (elen != 0)
     435           0 :             ereport(ERROR,
     436             :                     (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     437             :                      errmsg("invalid escape string"),
     438             :                      errhint("Escape string must be empty or one character.")));
     439             : 
     440         192 :         e = VARDATA_ANY(esc);
     441             : 
     442             :         /*
     443             :          * If specified escape is '\', just copy the pattern as-is.
     444             :          */
     445         192 :         if (*e == '\\')
     446             :         {
     447           0 :             memcpy(result, pat, VARSIZE_ANY(pat));
     448           0 :             return result;
     449             :         }
     450             : 
     451             :         /*
     452             :          * Otherwise, convert occurrences of the specified escape character to
     453             :          * '\', and double occurrences of '\' --- unless they immediately
     454             :          * follow an escape character!
     455             :          */
     456         192 :         afterescape = false;
     457        1164 :         while (plen > 0)
     458             :         {
     459         972 :             if (CHAREQ(p, e) && !afterescape)
     460             :             {
     461         192 :                 *r++ = '\\';
     462         192 :                 NextChar(p, plen);
     463         192 :                 afterescape = true;
     464             :             }
     465         780 :             else if (*p == '\\')
     466             :             {
     467           0 :                 *r++ = '\\';
     468           0 :                 if (!afterescape)
     469           0 :                     *r++ = '\\';
     470           0 :                 NextChar(p, plen);
     471           0 :                 afterescape = false;
     472             :             }
     473             :             else
     474             :             {
     475        1524 :                 CopyAdvChar(r, p, plen);
     476         780 :                 afterescape = false;
     477             :             }
     478             :         }
     479             :     }
     480             : 
     481         224 :     SET_VARSIZE(result, r - ((char *) result));
     482             : 
     483         224 :     return result;
     484             : }
     485             : #endif                          /* do_like_escape */
     486             : 
     487             : #ifdef CHAREQ
     488             : #undef CHAREQ
     489             : #endif
     490             : 
     491             : #undef NextChar
     492             : #undef CopyAdvChar
     493             : #undef MatchText
     494             : 
     495             : #ifdef do_like_escape
     496             : #undef do_like_escape
     497             : #endif
     498             : 
     499             : #undef GETCHAR
     500             : 
     501             : #ifdef MATCH_LOWER
     502             : #undef MATCH_LOWER
     503             : 
     504             : #endif

Generated by: LCOV version 1.14