LCOV - code coverage report
Current view: top level - contrib/fuzzystrmatch - fuzzystrmatch.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 125 279 44.8 %
Date: 2026-02-02 14:17:46 Functions: 19 21 90.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * fuzzystrmatch.c
       3             :  *
       4             :  * Functions for "fuzzy" comparison of strings
       5             :  *
       6             :  * Joe Conway <mail@joeconway.com>
       7             :  *
       8             :  * contrib/fuzzystrmatch/fuzzystrmatch.c
       9             :  * Copyright (c) 2001-2026, PostgreSQL Global Development Group
      10             :  * ALL RIGHTS RESERVED;
      11             :  *
      12             :  * metaphone()
      13             :  * -----------
      14             :  * Modified for PostgreSQL by Joe Conway.
      15             :  * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
      16             :  * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
      17             :  * Metaphone was originally created by Lawrence Philips and presented in article
      18             :  * in "Computer Language" December 1990 issue.
      19             :  *
      20             :  * Permission to use, copy, modify, and distribute this software and its
      21             :  * documentation for any purpose, without fee, and without a written agreement
      22             :  * is hereby granted, provided that the above copyright notice and this
      23             :  * paragraph and the following two paragraphs appear in all copies.
      24             :  *
      25             :  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
      26             :  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
      27             :  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
      28             :  * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
      29             :  * POSSIBILITY OF SUCH DAMAGE.
      30             :  *
      31             :  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
      32             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      33             :  * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
      34             :  * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
      35             :  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
      36             :  *
      37             :  */
      38             : 
      39             : #include "postgres.h"
      40             : 
      41             : #include <ctype.h>
      42             : 
      43             : #include "utils/builtins.h"
      44             : #include "utils/varlena.h"
      45             : #include "varatt.h"
      46             : 
      47           4 : PG_MODULE_MAGIC_EXT(
      48             :                     .name = "fuzzystrmatch",
      49             :                     .version = PG_VERSION
      50             : );
      51             : 
      52             : /*
      53             :  * Soundex
      54             :  */
      55             : static void _soundex(const char *instr, char *outstr);
      56             : 
      57             : #define SOUNDEX_LEN 4
      58             : 
      59             : /*                                  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
      60             : static const char *const soundex_table = "01230120022455012623010202";
      61             : 
      62             : static char
      63         254 : soundex_code(char letter)
      64             : {
      65         254 :     letter = pg_ascii_toupper((unsigned char) letter);
      66             :     /* Defend against non-ASCII letters */
      67         254 :     if (letter >= 'A' && letter <= 'Z')
      68         252 :         return soundex_table[letter - 'A'];
      69           2 :     return letter;
      70             : }
      71             : 
      72             : /*
      73             :  * Metaphone
      74             :  */
      75             : #define MAX_METAPHONE_STRLEN        255
      76             : 
      77             : /*
      78             :  * Original code by Michael G Schwern starts here.
      79             :  * Code slightly modified for use as PostgreSQL function.
      80             :  */
      81             : 
      82             : 
      83             : /**************************************************************************
      84             :     metaphone -- Breaks English phrases down into their phonemes.
      85             : 
      86             :     Input
      87             :         word            --  An English word to be phonized
      88             :         max_phonemes    --  How many phonemes to calculate.  If 0, then it
      89             :                             will phonize the entire phrase.
      90             :         phoned_word     --  The final phonized word.  (We'll allocate the
      91             :                             memory.)
      92             :     Output
      93             :         error   --  A simple error flag, returns true or false
      94             : 
      95             :     NOTES:  ALL non-alpha characters are ignored, this includes whitespace,
      96             :     although non-alpha characters will break up phonemes.
      97             : ****************************************************************************/
      98             : 
      99             : 
     100             : /*  I add modifications to the traditional metaphone algorithm that you
     101             :     might find in books.  Define this if you want metaphone to behave
     102             :     traditionally */
     103             : #undef USE_TRADITIONAL_METAPHONE
     104             : 
     105             : /* Special encodings */
     106             : #define  SH     'X'
     107             : #define  TH     '0'
     108             : 
     109             : static char Lookahead(char *word, int how_far);
     110             : static void _metaphone(char *word, int max_phonemes, char **phoned_word);
     111             : 
     112             : /* Metachar.h ... little bits about characters for metaphone */
     113             : 
     114             : 
     115             : /*-- Character encoding array & accessing macros --*/
     116             : /* Stolen directly out of the book... */
     117             : static const char _codes[26] = {
     118             :     1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
     119             : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
     120             : };
     121             : 
     122             : static int
     123           2 : getcode(char c)
     124             : {
     125           2 :     c = pg_ascii_toupper((unsigned char) c);
     126             :     /* Defend against non-ASCII letters */
     127           2 :     if (c >= 'A' && c <= 'Z')
     128           2 :         return _codes[c - 'A'];
     129             : 
     130           0 :     return 0;
     131             : }
     132             : 
     133             : static bool
     134         132 : ascii_isalpha(char c)
     135             : {
     136         226 :     return (c >= 'A' && c <= 'Z') ||
     137          94 :         (c >= 'a' && c <= 'z');
     138             : }
     139             : 
     140             : #define isvowel(c)  (getcode(c) & 1)    /* AEIOU */
     141             : 
     142             : /* These letters are passed through unchanged */
     143             : #define NOCHANGE(c) (getcode(c) & 2)    /* FJMNR */
     144             : 
     145             : /* These form diphthongs when preceding H */
     146             : #define AFFECTH(c)  (getcode(c) & 4)    /* CGPST */
     147             : 
     148             : /* These make C and G soft */
     149             : #define MAKESOFT(c) (getcode(c) & 8)    /* EIY */
     150             : 
     151             : /* These prevent GH from becoming F */
     152             : #define NOGHTOF(c)  (getcode(c) & 16)   /* BDH */
     153             : 
     154           4 : PG_FUNCTION_INFO_V1(levenshtein_with_costs);
     155             : Datum
     156           2 : levenshtein_with_costs(PG_FUNCTION_ARGS)
     157             : {
     158           2 :     text       *src = PG_GETARG_TEXT_PP(0);
     159           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
     160           2 :     int         ins_c = PG_GETARG_INT32(2);
     161           2 :     int         del_c = PG_GETARG_INT32(3);
     162           2 :     int         sub_c = PG_GETARG_INT32(4);
     163             :     const char *s_data;
     164             :     const char *t_data;
     165             :     int         s_bytes,
     166             :                 t_bytes;
     167             : 
     168             :     /* Extract a pointer to the actual character data */
     169           2 :     s_data = VARDATA_ANY(src);
     170           2 :     t_data = VARDATA_ANY(dst);
     171             :     /* Determine length of each string in bytes */
     172           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     173           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     174             : 
     175           2 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     176             :                                        ins_c, del_c, sub_c, false));
     177             : }
     178             : 
     179             : 
     180           4 : PG_FUNCTION_INFO_V1(levenshtein);
     181             : Datum
     182           2 : levenshtein(PG_FUNCTION_ARGS)
     183             : {
     184           2 :     text       *src = PG_GETARG_TEXT_PP(0);
     185           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
     186             :     const char *s_data;
     187             :     const char *t_data;
     188             :     int         s_bytes,
     189             :                 t_bytes;
     190             : 
     191             :     /* Extract a pointer to the actual character data */
     192           2 :     s_data = VARDATA_ANY(src);
     193           2 :     t_data = VARDATA_ANY(dst);
     194             :     /* Determine length of each string in bytes */
     195           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     196           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     197             : 
     198           2 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     199             :                                        1, 1, 1, false));
     200             : }
     201             : 
     202             : 
     203           2 : PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
     204             : Datum
     205           0 : levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
     206             : {
     207           0 :     text       *src = PG_GETARG_TEXT_PP(0);
     208           0 :     text       *dst = PG_GETARG_TEXT_PP(1);
     209           0 :     int         ins_c = PG_GETARG_INT32(2);
     210           0 :     int         del_c = PG_GETARG_INT32(3);
     211           0 :     int         sub_c = PG_GETARG_INT32(4);
     212           0 :     int         max_d = PG_GETARG_INT32(5);
     213             :     const char *s_data;
     214             :     const char *t_data;
     215             :     int         s_bytes,
     216             :                 t_bytes;
     217             : 
     218             :     /* Extract a pointer to the actual character data */
     219           0 :     s_data = VARDATA_ANY(src);
     220           0 :     t_data = VARDATA_ANY(dst);
     221             :     /* Determine length of each string in bytes */
     222           0 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     223           0 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     224             : 
     225           0 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     226             :                                                   t_data, t_bytes,
     227             :                                                   ins_c, del_c, sub_c,
     228             :                                                   max_d, false));
     229             : }
     230             : 
     231             : 
     232           4 : PG_FUNCTION_INFO_V1(levenshtein_less_equal);
     233             : Datum
     234           4 : levenshtein_less_equal(PG_FUNCTION_ARGS)
     235             : {
     236           4 :     text       *src = PG_GETARG_TEXT_PP(0);
     237           4 :     text       *dst = PG_GETARG_TEXT_PP(1);
     238           4 :     int         max_d = PG_GETARG_INT32(2);
     239             :     const char *s_data;
     240             :     const char *t_data;
     241             :     int         s_bytes,
     242             :                 t_bytes;
     243             : 
     244             :     /* Extract a pointer to the actual character data */
     245           4 :     s_data = VARDATA_ANY(src);
     246           4 :     t_data = VARDATA_ANY(dst);
     247             :     /* Determine length of each string in bytes */
     248           4 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     249           4 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     250             : 
     251           4 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     252             :                                                   t_data, t_bytes,
     253             :                                                   1, 1, 1,
     254             :                                                   max_d, false));
     255             : }
     256             : 
     257             : 
     258             : /*
     259             :  * Calculates the metaphone of an input string.
     260             :  * Returns number of characters requested
     261             :  * (suggested value is 4)
     262             :  */
     263           4 : PG_FUNCTION_INFO_V1(metaphone);
     264             : Datum
     265           2 : metaphone(PG_FUNCTION_ARGS)
     266             : {
     267           2 :     char       *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
     268           2 :     size_t      str_i_len = strlen(str_i);
     269             :     int         reqlen;
     270             :     char       *metaph;
     271             : 
     272             :     /* return an empty string if we receive one */
     273           2 :     if (!(str_i_len > 0))
     274           0 :         PG_RETURN_TEXT_P(cstring_to_text(""));
     275             : 
     276           2 :     if (str_i_len > MAX_METAPHONE_STRLEN)
     277           0 :         ereport(ERROR,
     278             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     279             :                  errmsg("argument exceeds the maximum length of %d bytes",
     280             :                         MAX_METAPHONE_STRLEN)));
     281             : 
     282           2 :     reqlen = PG_GETARG_INT32(1);
     283           2 :     if (reqlen > MAX_METAPHONE_STRLEN)
     284           0 :         ereport(ERROR,
     285             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     286             :                  errmsg("output exceeds the maximum length of %d bytes",
     287             :                         MAX_METAPHONE_STRLEN)));
     288             : 
     289           2 :     if (!(reqlen > 0))
     290           0 :         ereport(ERROR,
     291             :                 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
     292             :                  errmsg("output cannot be empty string")));
     293             : 
     294           2 :     _metaphone(str_i, reqlen, &metaph);
     295           2 :     PG_RETURN_TEXT_P(cstring_to_text(metaph));
     296             : }
     297             : 
     298             : 
     299             : /*
     300             :  * Original code by Michael G Schwern starts here.
     301             :  * Code slightly modified for use as PostgreSQL
     302             :  * function (palloc, etc).
     303             :  */
     304             : 
     305             : /* I suppose I could have been using a character pointer instead of
     306             :  * accessing the array directly... */
     307             : 
     308             : /* Look at the next letter in the word */
     309             : #define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx+1]))
     310             : /* Look at the current letter in the word */
     311             : #define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx]))
     312             : /* Go N letters back. */
     313             : #define Look_Back_Letter(n) \
     314             :     (w_idx >= (n) ? pg_ascii_toupper((unsigned char) word[w_idx-(n)]) : '\0')
     315             : /* Previous letter.  I dunno, should this return null on failure? */
     316             : #define Prev_Letter (Look_Back_Letter(1))
     317             : /* Look two letters down.  It makes sure you don't walk off the string. */
     318             : #define After_Next_Letter \
     319             :     (Next_Letter != '\0' ? pg_ascii_toupper((unsigned char) word[w_idx+2]) : '\0')
     320             : #define Look_Ahead_Letter(n) pg_ascii_toupper((unsigned char) Lookahead(word+w_idx, n))
     321             : 
     322             : 
     323             : /* Allows us to safely look ahead an arbitrary # of letters */
     324             : /* I probably could have just used strlen... */
     325             : static char
     326           0 : Lookahead(char *word, int how_far)
     327             : {
     328           0 :     char        letter_ahead = '\0';    /* null by default */
     329             :     int         idx;
     330             : 
     331           0 :     for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
     332             :     /* Edge forward in the string... */
     333             : 
     334           0 :     letter_ahead = word[idx];   /* idx will be either == to how_far or at the
     335             :                                  * end of the string */
     336           0 :     return letter_ahead;
     337             : }
     338             : 
     339             : 
     340             : /* phonize one letter */
     341             : #define Phonize(c)  do {(*phoned_word)[p_idx++] = c;} while (0)
     342             : /* Slap a null character on the end of the phoned word */
     343             : #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
     344             : /* How long is the phoned word? */
     345             : #define Phone_Len   (p_idx)
     346             : 
     347             : /* Note if a letter is a 'break' in the word */
     348             : #define Isbreak(c)  (!ascii_isalpha((unsigned char) (c)))
     349             : 
     350             : 
     351             : static void
     352           2 : _metaphone(char *word,          /* IN */
     353             :            int max_phonemes,
     354             :            char **phoned_word)  /* OUT */
     355             : {
     356           2 :     int         w_idx = 0;      /* point in the phonization we're at. */
     357           2 :     int         p_idx = 0;      /* end of the phoned phrase */
     358             : 
     359             :     /*-- Parameter checks --*/
     360             : 
     361             :     /*
     362             :      * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
     363             :      */
     364             : 
     365             :     /* Zero or negative phoneme length is meaningless */
     366           2 :     if (!(max_phonemes > 0))
     367             :         /* internal error */
     368           0 :         elog(ERROR, "metaphone: Requested output length must be > 0");
     369             : 
     370             :     /* Empty/null string is meaningless */
     371           2 :     if ((word == NULL) || !(strlen(word) > 0))
     372             :         /* internal error */
     373           0 :         elog(ERROR, "metaphone: Input string length must be > 0");
     374             : 
     375             :     /*-- Allocate memory for our phoned_phrase --*/
     376           2 :     if (max_phonemes == 0)
     377             :     {                           /* Assume largest possible */
     378           0 :         *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
     379             :     }
     380             :     else
     381             :     {
     382           2 :         *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
     383             :     }
     384             : 
     385             :     /*-- The first phoneme has to be processed specially. --*/
     386             :     /* Find our first letter */
     387           2 :     for (; !ascii_isalpha((unsigned char) (Curr_Letter)); w_idx++)
     388             :     {
     389             :         /* On the off chance we were given nothing but crap... */
     390           0 :         if (Curr_Letter == '\0')
     391             :         {
     392           0 :             End_Phoned_Word;
     393           0 :             return;
     394             :         }
     395             :     }
     396             : 
     397           2 :     switch (Curr_Letter)
     398             :     {
     399             :             /* AE becomes E */
     400           0 :         case 'A':
     401           0 :             if (Next_Letter == 'E')
     402             :             {
     403           0 :                 Phonize('E');
     404           0 :                 w_idx += 2;
     405             :             }
     406             :             /* Remember, preserve vowels at the beginning */
     407             :             else
     408             :             {
     409           0 :                 Phonize('A');
     410           0 :                 w_idx++;
     411             :             }
     412           0 :             break;
     413             :             /* [GKP]N becomes N */
     414           2 :         case 'G':
     415             :         case 'K':
     416             :         case 'P':
     417           2 :             if (Next_Letter == 'N')
     418             :             {
     419           0 :                 Phonize('N');
     420           0 :                 w_idx += 2;
     421             :             }
     422           2 :             break;
     423             : 
     424             :             /*
     425             :              * WH becomes H, WR becomes R; W is kept if followed by a vowel
     426             :              */
     427           0 :         case 'W':
     428           0 :             if (Next_Letter == 'H' ||
     429           0 :                 Next_Letter == 'R')
     430             :             {
     431           0 :                 Phonize(Next_Letter);
     432           0 :                 w_idx += 2;
     433             :             }
     434           0 :             else if (isvowel(Next_Letter))
     435             :             {
     436           0 :                 Phonize('W');
     437           0 :                 w_idx += 2;
     438             :             }
     439             :             /* else ignore */
     440           0 :             break;
     441             :             /* X becomes S */
     442           0 :         case 'X':
     443           0 :             Phonize('S');
     444           0 :             w_idx++;
     445           0 :             break;
     446             :             /* Vowels are kept */
     447             : 
     448             :             /*
     449             :              * We did A already case 'A': case 'a':
     450             :              */
     451           0 :         case 'E':
     452             :         case 'I':
     453             :         case 'O':
     454             :         case 'U':
     455           0 :             Phonize(Curr_Letter);
     456           0 :             w_idx++;
     457           0 :             break;
     458           0 :         default:
     459             :             /* do nothing */
     460           0 :             break;
     461             :     }
     462             : 
     463             : 
     464             : 
     465             :     /* On to the metaphoning */
     466          12 :     for (; Curr_Letter != '\0' &&
     467          10 :          (max_phonemes == 0 || Phone_Len < max_phonemes);
     468          10 :          w_idx++)
     469             :     {
     470             :         /*
     471             :          * How many letters to skip because an earlier encoding handled
     472             :          * multiple letters
     473             :          */
     474          10 :         unsigned short int skip_letter = 0;
     475             : 
     476             : 
     477             :         /*
     478             :          * THOUGHT:  It would be nice if, rather than having things like...
     479             :          * well, SCI.  For SCI you encode the S, then have to remember to skip
     480             :          * the C.  So the phoneme SCI invades both S and C.  It would be
     481             :          * better, IMHO, to skip the C from the S part of the encoding. Hell,
     482             :          * I'm trying it.
     483             :          */
     484             : 
     485             :         /* Ignore non-alphas */
     486          10 :         if (!ascii_isalpha((unsigned char) (Curr_Letter)))
     487           0 :             continue;
     488             : 
     489             :         /* Drop duplicates, except CC */
     490          10 :         if (Curr_Letter == Prev_Letter &&
     491           0 :             Curr_Letter != 'C')
     492           0 :             continue;
     493             : 
     494          10 :         switch (Curr_Letter)
     495             :         {
     496             :                 /* B -> B unless in MB */
     497           2 :             case 'B':
     498           2 :                 if (Prev_Letter != 'M')
     499           0 :                     Phonize('B');
     500           2 :                 break;
     501             : 
     502             :                 /*
     503             :                  * 'sh' if -CIA- or -CH, but not SCH, except SCHW (SCHW is
     504             :                  * handled in S); S if -CI-, -CE- or -CY-; dropped if -SCI-,
     505             :                  * -SCE-, -SCY- (handled in S); else K
     506             :                  */
     507           0 :             case 'C':
     508           0 :                 if (MAKESOFT(Next_Letter))
     509             :                 {               /* C[IEY] */
     510           0 :                     if (After_Next_Letter == 'A' &&
     511           0 :                         Next_Letter == 'I')
     512             :                     {           /* CIA */
     513           0 :                         Phonize(SH);
     514             :                     }
     515             :                     /* SC[IEY] */
     516           0 :                     else if (Prev_Letter == 'S')
     517             :                     {
     518             :                         /* Dropped */
     519             :                     }
     520             :                     else
     521           0 :                         Phonize('S');
     522             :                 }
     523           0 :                 else if (Next_Letter == 'H')
     524             :                 {
     525             : #ifndef USE_TRADITIONAL_METAPHONE
     526           0 :                     if (After_Next_Letter == 'R' ||
     527           0 :                         Prev_Letter == 'S')
     528             :                     {           /* Christ, School */
     529           0 :                         Phonize('K');
     530             :                     }
     531             :                     else
     532           0 :                         Phonize(SH);
     533             : #else
     534             :                     Phonize(SH);
     535             : #endif
     536           0 :                     skip_letter++;
     537             :                 }
     538             :                 else
     539           0 :                     Phonize('K');
     540           0 :                 break;
     541             : 
     542             :                 /*
     543             :                  * J if in -DGE-, -DGI- or -DGY- else T
     544             :                  */
     545           0 :             case 'D':
     546           0 :                 if (Next_Letter == 'G' &&
     547           0 :                     MAKESOFT(After_Next_Letter))
     548             :                 {
     549           0 :                     Phonize('J');
     550           0 :                     skip_letter++;
     551             :                 }
     552             :                 else
     553           0 :                     Phonize('T');
     554           0 :                 break;
     555             : 
     556             :                 /*
     557             :                  * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
     558             :                  * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
     559             :                  * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
     560             :                  * else K
     561             :                  */
     562           2 :             case 'G':
     563           2 :                 if (Next_Letter == 'H')
     564             :                 {
     565           0 :                     if (!(NOGHTOF(Look_Back_Letter(3)) ||
     566           0 :                           Look_Back_Letter(4) == 'H'))
     567             :                     {
     568           0 :                         Phonize('F');
     569           0 :                         skip_letter++;
     570             :                     }
     571             :                     else
     572             :                     {
     573             :                         /* silent */
     574             :                     }
     575             :                 }
     576           2 :                 else if (Next_Letter == 'N')
     577             :                 {
     578           0 :                     if (Isbreak(After_Next_Letter) ||
     579           0 :                         (After_Next_Letter == 'E' &&
     580           0 :                          Look_Ahead_Letter(3) == 'D'))
     581             :                     {
     582             :                         /* dropped */
     583             :                     }
     584             :                     else
     585           0 :                         Phonize('K');
     586             :                 }
     587           2 :                 else if (MAKESOFT(Next_Letter) &&
     588           0 :                          Prev_Letter != 'G')
     589           0 :                     Phonize('J');
     590             :                 else
     591           2 :                     Phonize('K');
     592           2 :                 break;
     593             :                 /* H if before a vowel and not after C,G,P,S,T */
     594           0 :             case 'H':
     595           0 :                 if (isvowel(Next_Letter) &&
     596           0 :                     !AFFECTH(Prev_Letter))
     597           0 :                     Phonize('H');
     598           0 :                 break;
     599             : 
     600             :                 /*
     601             :                  * dropped if after C else K
     602             :                  */
     603           0 :             case 'K':
     604           0 :                 if (Prev_Letter != 'C')
     605           0 :                     Phonize('K');
     606           0 :                 break;
     607             : 
     608             :                 /*
     609             :                  * F if before H else P
     610             :                  */
     611           0 :             case 'P':
     612           0 :                 if (Next_Letter == 'H')
     613           0 :                     Phonize('F');
     614             :                 else
     615           0 :                     Phonize('P');
     616           0 :                 break;
     617             : 
     618             :                 /*
     619             :                  * K
     620             :                  */
     621           0 :             case 'Q':
     622           0 :                 Phonize('K');
     623           0 :                 break;
     624             : 
     625             :                 /*
     626             :                  * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
     627             :                  */
     628           0 :             case 'S':
     629           0 :                 if (Next_Letter == 'I' &&
     630           0 :                     (After_Next_Letter == 'O' ||
     631           0 :                      After_Next_Letter == 'A'))
     632           0 :                     Phonize(SH);
     633           0 :                 else if (Next_Letter == 'H')
     634             :                 {
     635           0 :                     Phonize(SH);
     636           0 :                     skip_letter++;
     637             :                 }
     638             : #ifndef USE_TRADITIONAL_METAPHONE
     639           0 :                 else if (Next_Letter == 'C' &&
     640           0 :                          Look_Ahead_Letter(2) == 'H' &&
     641           0 :                          Look_Ahead_Letter(3) == 'W')
     642             :                 {
     643           0 :                     Phonize(SH);
     644           0 :                     skip_letter += 2;
     645             :                 }
     646             : #endif
     647             :                 else
     648           0 :                     Phonize('S');
     649           0 :                 break;
     650             : 
     651             :                 /*
     652             :                  * 'sh' in -TIA- or -TIO- else 'th' before H else T
     653             :                  */
     654           0 :             case 'T':
     655           0 :                 if (Next_Letter == 'I' &&
     656           0 :                     (After_Next_Letter == 'O' ||
     657           0 :                      After_Next_Letter == 'A'))
     658           0 :                     Phonize(SH);
     659           0 :                 else if (Next_Letter == 'H')
     660             :                 {
     661           0 :                     Phonize(TH);
     662           0 :                     skip_letter++;
     663             :                 }
     664             :                 else
     665           0 :                     Phonize('T');
     666           0 :                 break;
     667             :                 /* F */
     668           0 :             case 'V':
     669           0 :                 Phonize('F');
     670           0 :                 break;
     671             :                 /* W before a vowel, else dropped */
     672           0 :             case 'W':
     673           0 :                 if (isvowel(Next_Letter))
     674           0 :                     Phonize('W');
     675           0 :                 break;
     676             :                 /* KS */
     677           0 :             case 'X':
     678           0 :                 Phonize('K');
     679           0 :                 if (max_phonemes == 0 || Phone_Len < max_phonemes)
     680           0 :                     Phonize('S');
     681           0 :                 break;
     682             :                 /* Y if followed by a vowel */
     683           0 :             case 'Y':
     684           0 :                 if (isvowel(Next_Letter))
     685           0 :                     Phonize('Y');
     686           0 :                 break;
     687             :                 /* S */
     688           0 :             case 'Z':
     689           0 :                 Phonize('S');
     690           0 :                 break;
     691             :                 /* No transformation */
     692           2 :             case 'F':
     693             :             case 'J':
     694             :             case 'L':
     695             :             case 'M':
     696             :             case 'N':
     697             :             case 'R':
     698           2 :                 Phonize(Curr_Letter);
     699           2 :                 break;
     700           4 :             default:
     701             :                 /* nothing */
     702           4 :                 break;
     703             :         }                       /* END SWITCH */
     704             : 
     705          10 :         w_idx += skip_letter;
     706             :     }                           /* END FOR */
     707             : 
     708           2 :     End_Phoned_Word;
     709             : }                               /* END metaphone */
     710             : 
     711             : 
     712             : /*
     713             :  * SQL function: soundex(text) returns text
                     :  *
                     :  * Converts argument 0 to a C string, lets _soundex() write the code for it
                     :  * into a local buffer of SOUNDEX_LEN + 1 bytes, and returns the buffer's
                     :  * contents as text.  When the input has no character accepted by
                     :  * ascii_isalpha(), _soundex() fills the buffer with NUL bytes, so the
                     :  * result is built from an empty C string.
     714             :  */
     715           6 : PG_FUNCTION_INFO_V1(soundex);
     716             : 
     717             : Datum
     718          16 : soundex(PG_FUNCTION_ARGS)
     719             : {
     720             :     char        outstr[SOUNDEX_LEN + 1];    /* filled in by _soundex() */
     721             :     char       *arg;
     722             : 
     723          16 :     arg = text_to_cstring(PG_GETARG_TEXT_PP(0));    /* argument 0 as a C string */
     724             : 
     725          16 :     _soundex(arg, outstr);
     726             : 
     727          16 :     PG_RETURN_TEXT_P(cstring_to_text(outstr));
     728             : }
     729             : /*
                     :  * _soundex - write the soundex code of instr into outstr
                     :  *
                     :  * instr:  NUL-terminated input string; must not be NULL (asserted).
                     :  * outstr: output buffer; up to SOUNDEX_LEN + 1 bytes are written, so it
                     :  *         must be at least that large.  Must not be NULL (asserted).
                     :  *
                     :  * Leading characters rejected by ascii_isalpha() are skipped.  If nothing
                     :  * is left after that, outstr is filled with SOUNDEX_LEN + 1 NUL bytes.
                     :  * Otherwise outstr receives the first remaining character, upper-cased
                     :  * with pg_ascii_toupper(), followed by soundex_code() values of later
                     :  * characters, padded with '0' up to SOUNDEX_LEN characters and then
                     :  * NUL-terminated.  Later characters rejected by ascii_isalpha() produce no
                     :  * output, but they still act as the "previous character" in the duplicate
                     :  * check for the character that follows them.
                     :  */
     730             : static void
     731          32 : _soundex(const char *instr, char *outstr)
     732             : {
     733             :     int         count;  /* number of characters stored in outstr so far */
     734             : 
     735             :     Assert(instr);
     736             :     Assert(outstr);
     737             : 
     738             :     /* Skip leading non-alphabetic characters */
     739          32 :     while (*instr && !ascii_isalpha((unsigned char) *instr))
     740           0 :         ++instr;
     741             : 
     742             :     /* If no string left, return a buffer of all NUL bytes (not '0' characters) */
     743          32 :     if (!*instr)
     744             :     {
     745           6 :         memset(outstr, '\0', SOUNDEX_LEN + 1);
     746           6 :         return;
     747             :     }
     748             : 
     749             :     /* Take the first letter as is */
     750          26 :     *outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++);
     751             : 
     752          26 :     count = 1;  /* the first letter has been stored */
     753         120 :     while (*instr && count < SOUNDEX_LEN)   /* stop at end of input or when the code is full */
     754             :     {
     755         186 :         if (ascii_isalpha((unsigned char) *instr) &&
     756          92 :             soundex_code(*instr) != soundex_code(*(instr - 1)))
     757             :         {   /* code differs from that of the immediately preceding input character */
     758          70 :             *outstr = soundex_code(*instr);
     759          70 :             if (*outstr != '0') /* a '0' code is not kept: the slot is overwritten later */
     760             :             {
     761          46 :                 ++outstr;
     762          46 :                 ++count;
     763             :             }
     764             :         }
     765          94 :         ++instr;
     766             :     }
     767             : 
     768             :     /* Fill with 0's */
     769          58 :     while (count < SOUNDEX_LEN)
     770             :     {
     771          32 :         *outstr = '0';
     772          32 :         ++outstr;
     773          32 :         ++count;
     774             :     }
     775             : 
     776             :     /* And null-terminate */
     777          26 :     *outstr = '\0';
     778             : }
     779             : /*
                     :  * difference - count matching positions in the soundex codes of two texts
                     :  *
                     :  * Runs _soundex() on arguments 0 and 1 and returns, via PG_RETURN_INT32,
                     :  * the number of the first SOUNDEX_LEN buffer positions at which the two
                     :  * results hold equal bytes; the value therefore lies between 0 and
                     :  * SOUNDEX_LEN.  When neither input has a character accepted by
                     :  * ascii_isalpha(), both buffers are all NUL bytes and every position
                     :  * matches, giving SOUNDEX_LEN.
                     :  */
     780           4 : PG_FUNCTION_INFO_V1(difference);
     781             : 
     782             : Datum
     783           8 : difference(PG_FUNCTION_ARGS)
     784             : {
     785             :     char        sndx1[SOUNDEX_LEN + 1],
     786             :                 sndx2[SOUNDEX_LEN + 1];
     787             :     int         i,
     788             :                 result; /* count of positions where the two codes agree */
     789             : 
     790           8 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
     791           8 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
     792             : 
     793           8 :     result = 0;
     794          40 :     for (i = 0; i < SOUNDEX_LEN; i++)   /* the terminating NUL is not compared */
     795             :     {
     796          32 :         if (sndx1[i] == sndx2[i])
     797          20 :             result++;
     798             :     }
     799             : 
     800           8 :     PG_RETURN_INT32(result);
     801             : }

Generated by: LCOV version 1.16