LCOV - code coverage report
Current view: top level - contrib/fuzzystrmatch - fuzzystrmatch.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 123 277 44.4 %
Date: 2025-04-01 16:15:31 Functions: 18 20 90.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * fuzzystrmatch.c
       3             :  *
       4             :  * Functions for "fuzzy" comparison of strings
       5             :  *
       6             :  * Joe Conway <mail@joeconway.com>
       7             :  *
       8             :  * contrib/fuzzystrmatch/fuzzystrmatch.c
       9             :  * Copyright (c) 2001-2025, PostgreSQL Global Development Group
      10             :  * ALL RIGHTS RESERVED;
      11             :  *
      12             :  * metaphone()
      13             :  * -----------
      14             :  * Modified for PostgreSQL by Joe Conway.
      15             :  * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
      16             :  * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
      17             :  * Metaphone was originally created by Lawrence Philips and presented in article
      18             :  * in "Computer Language" December 1990 issue.
      19             :  *
      20             :  * Permission to use, copy, modify, and distribute this software and its
      21             :  * documentation for any purpose, without fee, and without a written agreement
      22             :  * is hereby granted, provided that the above copyright notice and this
      23             :  * paragraph and the following two paragraphs appear in all copies.
      24             :  *
      25             :  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
      26             :  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
      27             :  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
      28             :  * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
      29             :  * POSSIBILITY OF SUCH DAMAGE.
      30             :  *
      31             :  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
      32             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      33             :  * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
      34             :  * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
      35             :  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
      36             :  *
      37             :  */
      38             : 
      39             : #include "postgres.h"
      40             : 
      41             : #include <ctype.h>
      42             : 
      43             : #include "utils/builtins.h"
      44             : #include "utils/varlena.h"
      45             : #include "varatt.h"
      46             : 
      47           4 : PG_MODULE_MAGIC_EXT(     /* hands the module name and PG_VERSION to PostgreSQL's module-magic macro (defined in server headers, not in this file) */
      48             :                     .name = "fuzzystrmatch",
      49             :                     .version = PG_VERSION
      50             : );
      51             : 
      52             : /*
      53             :  * Soundex
      54             :  */
      55             : static void _soundex(const char *instr, char *outstr);	/* forward declaration; the definition is not in this part of the listing */
      56             : 
      57             : #define SOUNDEX_LEN 4	/* NOTE(review): presumably the length of a Soundex code; its uses are not visible here -- confirm */
      58             : 
      59             : /*                                  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
      60             : static const char *const soundex_table = "01230120022455012623010202";	/* 26 digit characters, one per letter A..Z; soundex_code() indexes it with (letter - 'A') */
      61             : 
      62             : static char	/* soundex_code: return the soundex_table digit character for a letter, or the upper-cased input itself when it is not in A..Z */
      63         254 : soundex_code(char letter)
      64             : {
      65         254 :     letter = toupper((unsigned char) letter);	/* cast to unsigned char first, as <ctype.h> functions require */
      66             :     /* Defend against non-ASCII letters: only 'A'..'Z' may index the 26-entry soundex_table; everything else falls through and is returned as-is */
      67         254 :     if (letter >= 'A' && letter <= 'Z')
      68         252 :         return soundex_table[letter - 'A'];
      69           2 :     return letter;
      70             : }
      71             : 
      72             : /*
      73             :  * Metaphone
      74             :  */
      75             : #define MAX_METAPHONE_STRLEN        255	/* limit, in bytes, that metaphone() enforces on both its input length and its requested output length */
      76             : 
      77             : /*
      78             :  * Original code by Michael G Schwern starts here.
      79             :  * Code slightly modified for use as PostgreSQL function.
      80             :  */
      81             : 
      82             : 
      83             : /**************************************************************************
      84             :     metaphone -- Breaks english phrases down into their phonemes.
      85             : 
      86             :     Input
      87             :         word            --  An english word to be phonized
      88             :         max_phonemes    --  How many phonemes to calculate.  If 0, then it
      89             :                             will phonize the entire phrase.  (As written, _metaphone() raises an internal error for any value <= 0.)
      90             :         phoned_word     --  The final phonized word.  (We'll allocate the
      91             :                             memory.)
      92             :     Output
      93             :         error   --  A simple error flag, returns true or false
      94             : 
      95             :     NOTES:  ALL non-alpha characters are ignored, this includes whitespace,
      96             :     although non-alpha characters will break up phonemes.
      97             : ****************************************************************************/
      98             : 
      99             : 
     100             : /*  I add modifications to the traditional metaphone algorithm that you
     101             :     might find in books.  Define this if you want metaphone to behave
     102             :     traditionally */
     103             : #undef USE_TRADITIONAL_METAPHONE
     104             : 
     105             : /* Special encodings: output characters that stand for a sound rather than for the letter itself */
     106             : #define  SH     'X'	/* emitted by _metaphone() for the 'sh' sound (e.g. the -CIA-, -CH-, -SIO-, -SIA- cases) */
     107             : #define  TH     '0'	/* NOTE(review): presumably the 'th' sound; its uses are outside the visible part of the listing -- confirm */
     108             : 
     109             : static char Lookahead(char *word, int how_far);	/* forward declaration; used by the Look_Ahead_Letter macro */
     110             : static void _metaphone(char *word, int max_phonemes, char **phoned_word);	/* forward declaration; called from metaphone() */
     111             : 
     112             : /* Metachar.h ... little bits about characters for metaphone */
     113             : 
     114             : 
     115             : /*-- Character encoding array & accessing macros --*/
     116             : /* Stolen directly out of the book...  One flag byte per letter a..z, read through getcode(): bit 1 = vowel (isvowel), 2 = passed through unchanged (NOCHANGE), 4 = forms a diphthong before H (AFFECTH), 8 = makes C and G soft (MAKESOFT), 16 = prevents GH from becoming F (NOGHTOF) */
     117             : static const char _codes[26] = {
     118             :     1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
     119             : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
     120             : };
     121             : 
     122             : static int	/* getcode: return the _codes flag byte for an ASCII letter (either case), or 0 for any other character */
     123           2 : getcode(char c)
     124             : {
     125           2 :     if (isalpha((unsigned char) c))
     126             :     {
     127           2 :         c = toupper((unsigned char) c);
     128             :         /* Defend against non-ASCII letters: isalpha() can accept characters outside 'A'..'Z', and those must not be used to index the 26-entry _codes array */
     129           2 :         if (c >= 'A' && c <= 'Z')
     130           2 :             return _codes[c - 'A'];
     131             :     }
     132           0 :     return 0;	/* not a letter, or a letter outside A..Z: no flags set */
     133             : }
     134             : 
     135             : #define isvowel(c)  (getcode(c) & 1)    /* AEIOU; like the macros below, yields the masked bit (nonzero, not necessarily 1) when set */
     136             : 
     137             : /* These letters are passed through unchanged */
     138             : #define NOCHANGE(c) (getcode(c) & 2)    /* FJLMNR (the _codes table sets this bit for L as well) */
     139             : 
     140             : /* These form diphthongs when preceding H */
     141             : #define AFFECTH(c)  (getcode(c) & 4)    /* CGPST */
     142             : 
     143             : /* These make C and G soft */
     144             : #define MAKESOFT(c) (getcode(c) & 8)    /* EIY */
     145             : 
     146             : /* These prevent GH from becoming F */
     147             : #define NOGHTOF(c)  (getcode(c) & 16)   /* BDH */
     148             : 
     149           4 : PG_FUNCTION_INFO_V1(levenshtein_with_costs);	/* levenshtein_with_costs(src, dst, ins_c, del_c, sub_c): returns varstr_levenshtein() computed with caller-supplied costs */
     150             : Datum
     151           2 : levenshtein_with_costs(PG_FUNCTION_ARGS)
     152             : {
     153           2 :     text       *src = PG_GETARG_TEXT_PP(0);
     154           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
     155           2 :     int         ins_c = PG_GETARG_INT32(2);	/* arguments 2..4 are forwarded as the ins_c, del_c and sub_c parameters */
     156           2 :     int         del_c = PG_GETARG_INT32(3);
     157           2 :     int         sub_c = PG_GETARG_INT32(4);
     158             :     const char *s_data;
     159             :     const char *t_data;
     160             :     int         s_bytes,
     161             :                 t_bytes;
     162             : 
     163             :     /* Extract a pointer to the actual character data */
     164           2 :     s_data = VARDATA_ANY(src);
     165           2 :     t_data = VARDATA_ANY(dst);
     166             :     /* Determine length of each string in bytes; the strings are handed on as pointer + byte length pairs */
     167           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     168           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     169             : 
     170           2 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     171             :                                        ins_c, del_c, sub_c, false));	/* NOTE(review): the final false is believed to be varstr_levenshtein's "trusted" flag -- confirm in utils/varlena.h */
     172             : }
     173             : 
     174             : 
     175           4 : PG_FUNCTION_INFO_V1(levenshtein);	/* levenshtein(src, dst): returns varstr_levenshtein() with every cost argument fixed at 1 */
     176             : Datum
     177           2 : levenshtein(PG_FUNCTION_ARGS)
     178             : {
     179           2 :     text       *src = PG_GETARG_TEXT_PP(0);
     180           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
     181             :     const char *s_data;
     182             :     const char *t_data;
     183             :     int         s_bytes,
     184             :                 t_bytes;
     185             : 
     186             :     /* Extract a pointer to the actual character data */
     187           2 :     s_data = VARDATA_ANY(src);
     188           2 :     t_data = VARDATA_ANY(dst);
     189             :     /* Determine length of each string in bytes; the strings are handed on as pointer + byte length pairs */
     190           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     191           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     192             : 
     193           2 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     194             :                                        1, 1, 1, false));	/* the three 1s occupy the positions where levenshtein_with_costs passes ins_c, del_c, sub_c */
     195             : }
     196             : 
     197             : 
     198           2 : PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);	/* levenshtein_less_equal_with_costs(src, dst, ins_c, del_c, sub_c, max_d): returns varstr_levenshtein_less_equal() with caller-supplied costs and bound */
     199             : Datum
     200           0 : levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
     201             : {
     202           0 :     text       *src = PG_GETARG_TEXT_PP(0);
     203           0 :     text       *dst = PG_GETARG_TEXT_PP(1);
     204           0 :     int         ins_c = PG_GETARG_INT32(2);	/* arguments 2..4 are forwarded as the ins_c, del_c and sub_c parameters */
     205           0 :     int         del_c = PG_GETARG_INT32(3);
     206           0 :     int         sub_c = PG_GETARG_INT32(4);
     207           0 :     int         max_d = PG_GETARG_INT32(5);	/* bound forwarded unchanged to varstr_levenshtein_less_equal() */
     208             :     const char *s_data;
     209             :     const char *t_data;
     210             :     int         s_bytes,
     211             :                 t_bytes;
     212             : 
     213             :     /* Extract a pointer to the actual character data */
     214           0 :     s_data = VARDATA_ANY(src);
     215           0 :     t_data = VARDATA_ANY(dst);
     216             :     /* Determine length of each string in bytes; the strings are handed on as pointer + byte length pairs */
     217           0 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     218           0 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     219             : 
     220           0 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     221             :                                                   t_data, t_bytes,
     222             :                                                   ins_c, del_c, sub_c,
     223             :                                                   max_d, false));
     224             : }
     225             : 
     226             : 
     227           4 : PG_FUNCTION_INFO_V1(levenshtein_less_equal);	/* levenshtein_less_equal(src, dst, max_d): returns varstr_levenshtein_less_equal() with every cost argument fixed at 1 */
     228             : Datum
     229           4 : levenshtein_less_equal(PG_FUNCTION_ARGS)
     230             : {
     231           4 :     text       *src = PG_GETARG_TEXT_PP(0);
     232           4 :     text       *dst = PG_GETARG_TEXT_PP(1);
     233           4 :     int         max_d = PG_GETARG_INT32(2);	/* bound forwarded unchanged to varstr_levenshtein_less_equal() */
     234             :     const char *s_data;
     235             :     const char *t_data;
     236             :     int         s_bytes,
     237             :                 t_bytes;
     238             : 
     239             :     /* Extract a pointer to the actual character data */
     240           4 :     s_data = VARDATA_ANY(src);
     241           4 :     t_data = VARDATA_ANY(dst);
     242             :     /* Determine length of each string in bytes; the strings are handed on as pointer + byte length pairs */
     243           4 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     244           4 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     245             : 
     246           4 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     247             :                                                   t_data, t_bytes,
     248             :                                                   1, 1, 1,
     249             :                                                   max_d, false));
     250             : }
     251             : 
     252             : 
     253             : /*
     254             :  * Calculates the metaphone code of the input string (argument 0).
     255             :  * Argument 1 is the maximum number of output characters (suggested value is 4);
     256             :  * it must be in 1..MAX_METAPHONE_STRLEN.  An empty input returns an empty string.
     257             :  */
     258           4 : PG_FUNCTION_INFO_V1(metaphone);
     259             : Datum
     260           2 : metaphone(PG_FUNCTION_ARGS)
     261             : {
     262           2 :     char       *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
     263           2 :     size_t      str_i_len = strlen(str_i);	/* measured with strlen(), i.e. in bytes */
     264             :     int         reqlen;
     265             :     char       *metaph;
     266             : 
     267             :     /* return an empty string if we receive one (this happens before the second argument is read or validated) */
     268           2 :     if (!(str_i_len > 0))
     269           0 :         PG_RETURN_TEXT_P(cstring_to_text(""));
     270             : 
     271           2 :     if (str_i_len > MAX_METAPHONE_STRLEN)	/* reject over-long input */
     272           0 :         ereport(ERROR,
     273             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     274             :                  errmsg("argument exceeds the maximum length of %d bytes",
     275             :                         MAX_METAPHONE_STRLEN)));
     276             : 
     277           2 :     reqlen = PG_GETARG_INT32(1);
     278           2 :     if (reqlen > MAX_METAPHONE_STRLEN)	/* reject an over-long requested output */
     279           0 :         ereport(ERROR,
     280             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     281             :                  errmsg("output exceeds the maximum length of %d bytes",
     282             :                         MAX_METAPHONE_STRLEN)));
     283             : 
     284           2 :     if (!(reqlen > 0))	/* zero and negative lengths are both rejected here */
     285           0 :         ereport(ERROR,
     286             :                 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
     287             :                  errmsg("output cannot be empty string")));
     288             : 
     289           2 :     _metaphone(str_i, reqlen, &metaph);	/* _metaphone() allocates the result buffer and stores its address in metaph */
     290           2 :     PG_RETURN_TEXT_P(cstring_to_text(metaph));
     291             : }
     292             : 
     293             : 
     294             : /*
     295             :  * Original code by Michael G Schwern starts here.
     296             :  * Code slightly modified for use as PostgreSQL
     297             :  * function (palloc, etc).
     298             :  */
     299             : 
     300             : /* I suppose I could have been using a character pointer instead of
     301             :  * accessing the array directly... */
     302             : 
     303             : /* Look at the next letter in the word (upper-cased).  Like the macros below, this expands to code that uses local variables named word and w_idx. */
     304             : #define Next_Letter (toupper((unsigned char) word[w_idx+1]))
     305             : /* Look at the current letter in the word (upper-cased) */
     306             : #define Curr_Letter (toupper((unsigned char) word[w_idx]))
     307             : /* Go N letters back; yields '\0' when fewer than N letters precede the current position. */
     308             : #define Look_Back_Letter(n) \
     309             :     (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
     310             : /* Previous letter; '\0' when the current position is the start of the word (see Look_Back_Letter). */
     311             : #define Prev_Letter (Look_Back_Letter(1))
     312             : /* Look two letters down.  It makes sure you don't walk off the string: word[w_idx+2] is read only if the next letter is not the terminator. */
     313             : #define After_Next_Letter \
     314             :     (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
     315             : #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))	/* upper-cased letter n positions ahead, fetched through the bounds-checking Lookahead() */
     316             : 
     317             : 
     318             : /* Allows us to safely look ahead an arbitrary # of letters: returns the character how_far positions into word, or '\0' if the string ends before that */
     319             : /* I probably could have just used strlen... */
     320             : static char
     321           0 : Lookahead(char *word, int how_far)
     322             : {
     323           0 :     char        letter_ahead = '\0';    /* null by default */
     324             :     int         idx;
     325             : 
     326           0 :     for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
     327             :     /* Edge forward in the string...  (the loop above has an empty body; it only advances idx) */
     328             : 
     329           0 :     letter_ahead = word[idx];   /* idx will be either == to how_far or at the
     330             :                                  * end of the string */
     331           0 :     return letter_ahead;
     332             : }
     333             : 
     334             : 
     335             : /* phonize one letter: append c to *phoned_word and advance p_idx (no bounds check is done here) */
     336             : #define Phonize(c)  do {(*phoned_word)[p_idx++] = c;} while (0)
     337             : /* Slap a null character on the end of the phoned word (p_idx is not advanced) */
     338             : #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
     339             : /* How long is the phoned word? */
     340             : #define Phone_Len   (p_idx)
     341             : 
     342             : /* Note if a letter is a 'break' in the word, i.e. any non-alphabetic character */
     343             : #define Isbreak(c)  (!isalpha((unsigned char) (c)))
     344             : 
     345             : 
     346             : static void
     347           2 : _metaphone(char *word,          /* IN */
     348             :            int max_phonemes,
     349             :            char **phoned_word)  /* OUT */
     350             : {
     351           2 :     int         w_idx = 0;      /* point in the phonization we're at. */
     352           2 :     int         p_idx = 0;      /* end of the phoned phrase */
     353             : 
     354             :     /*-- Parameter checks --*/
     355             : 
     356             :     /*
     357             :      * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
     358             :      */
     359             : 
     360             :     /* Negative phoneme length is meaningless */
     361           2 :     if (!(max_phonemes > 0))
     362             :         /* internal error */
     363           0 :         elog(ERROR, "metaphone: Requested output length must be > 0");
     364             : 
     365             :     /* Empty/null string is meaningless */
     366           2 :     if ((word == NULL) || !(strlen(word) > 0))
     367             :         /* internal error */
     368           0 :         elog(ERROR, "metaphone: Input string length must be > 0");
     369             : 
     370             :     /*-- Allocate memory for our phoned_phrase --*/
     371           2 :     if (max_phonemes == 0)
     372             :     {                           /* Assume largest possible */
     373           0 :         *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
     374             :     }
     375             :     else
     376             :     {
     377           2 :         *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
     378             :     }
     379             : 
     380             :     /*-- The first phoneme has to be processed specially. --*/
     381             :     /* Find our first letter */
     382           2 :     for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
     383             :     {
     384             :         /* On the off chance we were given nothing but crap... */
     385           0 :         if (Curr_Letter == '\0')
     386             :         {
     387           0 :             End_Phoned_Word;
     388           0 :             return;
     389             :         }
     390             :     }
     391             : 
     392           2 :     switch (Curr_Letter)
     393             :     {
     394             :             /* AE becomes E */
     395           0 :         case 'A':
     396           0 :             if (Next_Letter == 'E')
     397             :             {
     398           0 :                 Phonize('E');
     399           0 :                 w_idx += 2;
     400             :             }
     401             :             /* Remember, preserve vowels at the beginning */
     402             :             else
     403             :             {
     404           0 :                 Phonize('A');
     405           0 :                 w_idx++;
     406             :             }
     407           0 :             break;
     408             :             /* [GKP]N becomes N */
     409           2 :         case 'G':
     410             :         case 'K':
     411             :         case 'P':
     412           2 :             if (Next_Letter == 'N')
     413             :             {
     414           0 :                 Phonize('N');
     415           0 :                 w_idx += 2;
     416             :             }
     417           2 :             break;
     418             : 
     419             :             /*
     420             :              * WH becomes H, WR becomes R; otherwise W is kept only if followed by a vowel
     421             :              */
     422           0 :         case 'W':
     423           0 :             if (Next_Letter == 'H' ||
     424           0 :                 Next_Letter == 'R')
     425             :             {
     426           0 :                 Phonize(Next_Letter);
     427           0 :                 w_idx += 2;
     428             :             }
     429           0 :             else if (isvowel(Next_Letter))
     430             :             {
     431           0 :                 Phonize('W');
     432           0 :                 w_idx += 2;
     433             :             }
     434             :             /* else ignore */
     435           0 :             break;
     436             :             /* X becomes S */
     437           0 :         case 'X':
     438           0 :             Phonize('S');
     439           0 :             w_idx++;
     440           0 :             break;
     441             :             /* Vowels are kept */
     442             : 
     443             :             /*
     444             :              * We did A already case 'A': case 'a':
     445             :              */
     446           0 :         case 'E':
     447             :         case 'I':
     448             :         case 'O':
     449             :         case 'U':
     450           0 :             Phonize(Curr_Letter);
     451           0 :             w_idx++;
     452           0 :             break;
     453           0 :         default:
     454             :             /* do nothing */
     455           0 :             break;
     456             :     }
     457             : 
     458             : 
     459             : 
     460             :     /* On to the metaphoning */
     461          12 :     for (; Curr_Letter != '\0' &&
     462          10 :          (max_phonemes == 0 || Phone_Len < max_phonemes);
     463          10 :          w_idx++)
     464             :     {
     465             :         /*
     466             :          * How many letters to skip because an earlier encoding handled
     467             :          * multiple letters
     468             :          */
     469          10 :         unsigned short int skip_letter = 0;
     470             : 
     471             : 
     472             :         /*
     473             :          * THOUGHT:  It would be nice if, rather than having things like...
     474             :          * well, SCI.  For SCI you encode the S, then have to remember to skip
     475             :          * the C.  So the phoneme SCI invades both S and C.  It would be
     476             :          * better, IMHO, to skip the C from the S part of the encoding. Hell,
     477             :          * I'm trying it.
     478             :          */
     479             : 
     480             :         /* Ignore non-alphas */
     481          10 :         if (!isalpha((unsigned char) (Curr_Letter)))
     482           0 :             continue;
     483             : 
     484             :         /* Drop duplicates, except CC */
     485          10 :         if (Curr_Letter == Prev_Letter &&
     486           0 :             Curr_Letter != 'C')
     487           0 :             continue;
     488             : 
     489          10 :         switch (Curr_Letter)
     490             :         {
     491             :                 /* B -> B unless in MB */
     492           2 :             case 'B':
     493           2 :                 if (Prev_Letter != 'M')
     494           0 :                     Phonize('B');
     495           2 :                 break;
     496             : 
     497             :                 /*
     498             :                  * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
     499             :                  * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
     500             :                  * SCE-, -SCY- (handled in S) else K
     501             :                  */
     502           0 :             case 'C':
     503           0 :                 if (MAKESOFT(Next_Letter))
     504             :                 {               /* C[IEY] */
     505           0 :                     if (After_Next_Letter == 'A' &&
     506           0 :                         Next_Letter == 'I')
     507             :                     {           /* CIA */
     508           0 :                         Phonize(SH);
     509             :                     }
     510             :                     /* SC[IEY] */
     511           0 :                     else if (Prev_Letter == 'S')
     512             :                     {
     513             :                         /* Dropped */
     514             :                     }
     515             :                     else
     516           0 :                         Phonize('S');
     517             :                 }
     518           0 :                 else if (Next_Letter == 'H')
     519             :                 {
     520             : #ifndef USE_TRADITIONAL_METAPHONE
     521           0 :                     if (After_Next_Letter == 'R' ||
     522           0 :                         Prev_Letter == 'S')
     523             :                     {           /* Christ, School */
     524           0 :                         Phonize('K');
     525             :                     }
     526             :                     else
     527           0 :                         Phonize(SH);
     528             : #else
     529             :                     Phonize(SH);
     530             : #endif
     531           0 :                     skip_letter++;
     532             :                 }
     533             :                 else
     534           0 :                     Phonize('K');
     535           0 :                 break;
     536             : 
     537             :                 /*
     538             :                  * J if in -DGE-, -DGI- or -DGY- else T
     539             :                  */
     540           0 :             case 'D':
     541           0 :                 if (Next_Letter == 'G' &&
     542           0 :                     MAKESOFT(After_Next_Letter))
     543             :                 {
     544           0 :                     Phonize('J');
     545           0 :                     skip_letter++;
     546             :                 }
     547             :                 else
     548           0 :                     Phonize('T');
     549           0 :                 break;
     550             : 
     551             :                 /*
     552             :                  * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
     553             :                  * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
     554             :                  * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
     555             :                  * else K
     556             :                  */
     557           2 :             case 'G':
     558           2 :                 if (Next_Letter == 'H')
     559             :                 {
     560           0 :                     if (!(NOGHTOF(Look_Back_Letter(3)) ||
     561           0 :                           Look_Back_Letter(4) == 'H'))
     562             :                     {
     563           0 :                         Phonize('F');
     564           0 :                         skip_letter++;
     565             :                     }
     566             :                     else
     567             :                     {
     568             :                         /* silent */
     569             :                     }
     570             :                 }
     571           2 :                 else if (Next_Letter == 'N')
     572             :                 {
     573           0 :                     if (Isbreak(After_Next_Letter) ||
     574           0 :                         (After_Next_Letter == 'E' &&
     575           0 :                          Look_Ahead_Letter(3) == 'D'))
     576             :                     {
     577             :                         /* dropped */
     578             :                     }
     579             :                     else
     580           0 :                         Phonize('K');
     581             :                 }
     582           2 :                 else if (MAKESOFT(Next_Letter) &&
     583           0 :                          Prev_Letter != 'G')
     584           0 :                     Phonize('J');
     585             :                 else
     586           2 :                     Phonize('K');
     587           2 :                 break;
     588             :                 /* H if before a vowel and not after C,G,P,S,T */
     589           0 :             case 'H':
     590           0 :                 if (isvowel(Next_Letter) &&
     591           0 :                     !AFFECTH(Prev_Letter))
     592           0 :                     Phonize('H');
     593           0 :                 break;
     594             : 
     595             :                 /*
     596             :                  * dropped if after C else K
     597             :                  */
     598           0 :             case 'K':
     599           0 :                 if (Prev_Letter != 'C')
     600           0 :                     Phonize('K');
     601           0 :                 break;
     602             : 
     603             :                 /*
     604             :                  * F if before H else P
     605             :                  */
     606           0 :             case 'P':
     607           0 :                 if (Next_Letter == 'H')
     608           0 :                     Phonize('F');
     609             :                 else
     610           0 :                     Phonize('P');
     611           0 :                 break;
     612             : 
     613             :                 /*
     614             :                  * K
     615             :                  */
     616           0 :             case 'Q':
     617           0 :                 Phonize('K');
     618           0 :                 break;
     619             : 
     620             :                 /*
     621             :                  * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
     622             :                  */
     623           0 :             case 'S':
     624           0 :                 if (Next_Letter == 'I' &&
     625           0 :                     (After_Next_Letter == 'O' ||
     626           0 :                      After_Next_Letter == 'A'))
     627           0 :                     Phonize(SH);
     628           0 :                 else if (Next_Letter == 'H')
     629             :                 {
     630           0 :                     Phonize(SH);
     631           0 :                     skip_letter++;
     632             :                 }
     633             : #ifndef USE_TRADITIONAL_METAPHONE
     634           0 :                 else if (Next_Letter == 'C' &&
     635           0 :                          Look_Ahead_Letter(2) == 'H' &&
     636           0 :                          Look_Ahead_Letter(3) == 'W')
     637             :                 {
     638           0 :                     Phonize(SH);
     639           0 :                     skip_letter += 2;
     640             :                 }
     641             : #endif
     642             :                 else
     643           0 :                     Phonize('S');
     644           0 :                 break;
     645             : 
     646             :                 /*
     647             :                  * 'sh' in -TIA- or -TIO- else 'th' before H else T
     648             :                  */
     649           0 :             case 'T':
     650           0 :                 if (Next_Letter == 'I' &&
     651           0 :                     (After_Next_Letter == 'O' ||
     652           0 :                      After_Next_Letter == 'A'))
     653           0 :                     Phonize(SH);
     654           0 :                 else if (Next_Letter == 'H')
     655             :                 {
     656           0 :                     Phonize(TH);
     657           0 :                     skip_letter++;
     658             :                 }
     659             :                 else
     660           0 :                     Phonize('T');
     661           0 :                 break;
     662             :                 /* F */
     663           0 :             case 'V':
     664           0 :                 Phonize('F');
     665           0 :                 break;
     666             :                 /* W before a vowel, else dropped */
     667           0 :             case 'W':
     668           0 :                 if (isvowel(Next_Letter))
     669           0 :                     Phonize('W');
     670           0 :                 break;
     671             :                 /* KS */
     672           0 :             case 'X':
     673           0 :                 Phonize('K');
     674           0 :                 if (max_phonemes == 0 || Phone_Len < max_phonemes)
     675           0 :                     Phonize('S');
     676           0 :                 break;
     677             :                 /* Y if followed by a vowel */
     678           0 :             case 'Y':
     679           0 :                 if (isvowel(Next_Letter))
     680           0 :                     Phonize('Y');
     681           0 :                 break;
     682             :                 /* S */
     683           0 :             case 'Z':
     684           0 :                 Phonize('S');
     685           0 :                 break;
     686             :                 /* No transformation */
     687           2 :             case 'F':
     688             :             case 'J':
     689             :             case 'L':
     690             :             case 'M':
     691             :             case 'N':
     692             :             case 'R':
     693           2 :                 Phonize(Curr_Letter);
     694           2 :                 break;
     695           4 :             default:
     696             :                 /* nothing */
     697           4 :                 break;
     698             :         }                       /* END SWITCH */
     699             : 
     700          10 :         w_idx += skip_letter;
     701             :     }                           /* END FOR */
     702             : 
     703           2 :     End_Phoned_Word;
     704             : }                               /* END metaphone */
     705             : 
     706             : 
     707             : /*
     708             :  * SQL function: soundex(text) returns text
                     :  *
                     :  * Converts argument 0 to a C string, has _soundex() write the code for it
                     :  * into a local buffer of SOUNDEX_LEN + 1 bytes, and returns that buffer's
                     :  * contents as text.
     709             :  */
     710           6 : PG_FUNCTION_INFO_V1(soundex);
     711             : 
     712             : Datum
     713          16 : soundex(PG_FUNCTION_ARGS)
     714             : {
     715             :     char        outstr[SOUNDEX_LEN + 1];    /* code plus terminating NUL */
     716             :     char       *arg;
     717             :     /* Fetch argument 0 as a C string for _soundex() */
     718          16 :     arg = text_to_cstring(PG_GETARG_TEXT_PP(0));
     719             :     /* outstr comes back NUL-terminated; empty if arg has no alphabetic char */
     720          16 :     _soundex(arg, outstr);
     721             : 
     722          16 :     PG_RETURN_TEXT_P(cstring_to_text(outstr));
     723             : }
     724             : /*
                     :  * _soundex: write the Soundex code of instr into outstr.
                     :  *
                     :  * instr  - NUL-terminated input string; must not be NULL.
                     :  * outstr - output buffer; must not be NULL and must have room for
                     :  *          SOUNDEX_LEN + 1 bytes.
                     :  *
                     :  * Leading non-alphabetic characters are skipped.  If nothing is left after
                     :  * that, outstr is filled with NUL bytes (i.e. it reads as an empty string).
                     :  * Otherwise outstr receives the upper-cased first letter followed by codes
                     :  * from soundex_code(), padded with '0' to SOUNDEX_LEN characters, and a
                     :  * terminating NUL.
                     :  */
     725             : static void
     726          32 : _soundex(const char *instr, char *outstr)
     727             : {
     728             :     int         count;          /* characters kept in outstr so far */
     729             : 
     730             :     Assert(instr);
     731             :     Assert(outstr);
     732             : 
     733             :     /* Skip leading non-alphabetic characters */
     734          32 :     while (*instr && !isalpha((unsigned char) *instr))
     735           0 :         ++instr;
     736             : 
     737             :     /* Nothing alphabetic left: fill the buffer with NUL bytes (empty string) */
     738          32 :     if (!*instr)
     739             :     {
     740           6 :         memset(outstr, '\0', SOUNDEX_LEN + 1);
     741           6 :         return;
     742             :     }
     743             : 
     744             :     /* Emit the first letter itself (upper-cased) rather than its code */
     745          26 :     *outstr++ = (char) toupper((unsigned char) *instr++);
     746             :     /*
                     :      * Encode the remaining characters until outstr holds SOUNDEX_LEN
                     :      * characters or the input ends.  A character is considered only if it
                     :      * is alphabetic and its soundex_code() differs from that of the input
                     :      * character immediately before it (whatever that character was).
                     :      */
     747          26 :     count = 1;
     748         120 :     while (*instr && count < SOUNDEX_LEN)
     749             :     {
     750         186 :         if (isalpha((unsigned char) *instr) &&
     751          92 :             soundex_code(*instr) != soundex_code(*(instr - 1)))
     752             :         {
     753          70 :             *outstr = soundex_code(*instr);     /* tentatively store the code */
     754          70 :             if (*outstr != '0')                 /* '0' is not kept: slot gets overwritten */
     755             :             {
     756          46 :                 ++outstr;
     757          46 :                 ++count;
     758             :             }
     759             :         }
     760          94 :         ++instr;
     761             :     }
     762             : 
     763             :     /* Pad with '0' characters up to SOUNDEX_LEN */
     764          58 :     while (count < SOUNDEX_LEN)
     765             :     {
     766          32 :         *outstr = '0';
     767          32 :         ++outstr;
     768          32 :         ++count;
     769             :     }
     770             : 
     771             :     /* And null-terminate */
     772          26 :     *outstr = '\0';
     773             : }
     774             : /*
                     :  * SQL function: difference(text, text) returns int (via PG_RETURN_INT32)
                     :  *
                     :  * Computes the Soundex code of each argument with _soundex() and returns
                     :  * the number of positions (out of SOUNDEX_LEN) at which the two codes hold
                     :  * the same character, so the result ranges from 0 to SOUNDEX_LEN.
                     :  *
                     :  * Note: when neither argument contains an alphabetic character, _soundex()
                     :  * leaves both buffers all-NUL, so every position matches and the result
                     :  * is SOUNDEX_LEN.
                     :  */
     775           4 : PG_FUNCTION_INFO_V1(difference);
     776             : 
     777             : Datum
     778           8 : difference(PG_FUNCTION_ARGS)
     779             : {
     780             :     char        sndx1[SOUNDEX_LEN + 1],
     781             :                 sndx2[SOUNDEX_LEN + 1];
     782             :     int         i,
     783             :                 result;
     784             :     /* Compute the Soundex code of each argument */
     785           8 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
     786           8 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
     787             :     /* Count the positions at which the two codes agree */
     788           8 :     result = 0;
     789          40 :     for (i = 0; i < SOUNDEX_LEN; i++)
     790             :     {
     791          32 :         if (sndx1[i] == sndx2[i])
     792          20 :             result++;
     793             :     }
     794             : 
     795           8 :     PG_RETURN_INT32(result);
     796             : }

Generated by: LCOV version 1.14