LCOV - code coverage report
Current view: top level - contrib/fuzzystrmatch - fuzzystrmatch.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 123 277 44.4 %
Date: 2024-04-19 23:11:07 Functions: 18 20 90.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * fuzzystrmatch.c
       3             :  *
       4             :  * Functions for "fuzzy" comparison of strings
       5             :  *
       6             :  * Joe Conway <mail@joeconway.com>
       7             :  *
       8             :  * contrib/fuzzystrmatch/fuzzystrmatch.c
       9             :  * Copyright (c) 2001-2024, PostgreSQL Global Development Group
      10             :  * ALL RIGHTS RESERVED;
      11             :  *
      12             :  * metaphone()
      13             :  * -----------
      14             :  * Modified for PostgreSQL by Joe Conway.
      15             :  * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
      16             :  * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
      17             :  * Metaphone was originally created by Lawrence Philips and presented in article
      18             :  * in "Computer Language" December 1990 issue.
      19             :  *
      20             :  * Permission to use, copy, modify, and distribute this software and its
      21             :  * documentation for any purpose, without fee, and without a written agreement
      22             :  * is hereby granted, provided that the above copyright notice and this
      23             :  * paragraph and the following two paragraphs appear in all copies.
      24             :  *
      25             :  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
      26             :  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
      27             :  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
      28             :  * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
      29             :  * POSSIBILITY OF SUCH DAMAGE.
      30             :  *
      31             :  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
      32             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      33             :  * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
      34             :  * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
      35             :  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
      36             :  *
      37             :  */
      38             : 
      39             : #include "postgres.h"
      40             : 
      41             : #include <ctype.h>
      42             : 
      43             : #include "mb/pg_wchar.h"
      44             : #include "utils/builtins.h"
      45             : #include "utils/varlena.h"
      46             : #include "varatt.h"
      47             : 
      48           4 : PG_MODULE_MAGIC;
      49             : 
      50             : /*
      51             :  * Soundex
      52             :  */
      53             : static void _soundex(const char *instr, char *outstr);
      54             : 
      55             : #define SOUNDEX_LEN 4
      56             : 
      57             : /*                                  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
      58             : static const char *const soundex_table = "01230120022455012623010202";
      59             : 
      60             : static char
      61         254 : soundex_code(char letter)
      62             : {
      63         254 :     letter = toupper((unsigned char) letter);
      64             :     /* Defend against non-ASCII letters */
      65         254 :     if (letter >= 'A' && letter <= 'Z')
      66         252 :         return soundex_table[letter - 'A'];
      67           2 :     return letter;
      68             : }
      69             : 
      70             : /*
      71             :  * Metaphone
      72             :  */
      73             : #define MAX_METAPHONE_STRLEN        255
      74             : 
      75             : /*
      76             :  * Original code by Michael G Schwern starts here.
      77             :  * Code slightly modified for use as PostgreSQL function.
      78             :  */
      79             : 
      80             : 
      81             : /**************************************************************************
      82             :     metaphone -- Breaks english phrases down into their phonemes.
      83             : 
      84             :     Input
      85             :         word            --  An english word to be phonized
      86             :         max_phonemes    --  How many phonemes to calculate.  If 0, then it
      87             :                             will phonize the entire phrase.
      88             :         phoned_word     --  The final phonized word.  (We'll allocate the
      89             :                             memory.)
      90             :     Output
      91             :         error   --  A simple error flag, returns true or false
      92             : 
      93             :     NOTES:  ALL non-alpha characters are ignored, this includes whitespace,
      94             :     although non-alpha characters will break up phonemes.
      95             : ****************************************************************************/
      96             : 
      97             : 
      98             : /*  I add modifications to the traditional metaphone algorithm that you
      99             :     might find in books.  Define this if you want metaphone to behave
     100             :     traditionally */
     101             : #undef USE_TRADITIONAL_METAPHONE
     102             : 
     103             : /* Special encodings */
     104             : #define  SH     'X'
     105             : #define  TH     '0'
     106             : 
     107             : static char Lookahead(char *word, int how_far);
     108             : static void _metaphone(char *word, int max_phonemes, char **phoned_word);
     109             : 
     110             : /* Metachar.h ... little bits about characters for metaphone */
     111             : 
     112             : 
     113             : /*-- Character encoding array & accessing macros --*/
     114             : /* Stolen directly out of the book... */
     115             : static const char _codes[26] = {
     116             :     1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
     117             : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
     118             : };
     119             : 
     120             : static int
     121           2 : getcode(char c)
     122             : {
     123           2 :     if (isalpha((unsigned char) c))
     124             :     {
     125           2 :         c = toupper((unsigned char) c);
     126             :         /* Defend against non-ASCII letters */
     127           2 :         if (c >= 'A' && c <= 'Z')
     128           2 :             return _codes[c - 'A'];
     129             :     }
     130           0 :     return 0;
     131             : }
     132             : 
     133             : #define isvowel(c)  (getcode(c) & 1)    /* AEIOU */
     134             : 
     135             : /* These letters are passed through unchanged */
     136             : #define NOCHANGE(c) (getcode(c) & 2)    /* FJMNR */
     137             : 
     138             : /* These form diphthongs when preceding H */
     139             : #define AFFECTH(c)  (getcode(c) & 4)    /* CGPST */
     140             : 
     141             : /* These make C and G soft */
     142             : #define MAKESOFT(c) (getcode(c) & 8)    /* EIY */
     143             : 
     144             : /* These prevent GH from becoming F */
     145             : #define NOGHTOF(c)  (getcode(c) & 16)   /* BDH */
     146             : 
     147           4 : PG_FUNCTION_INFO_V1(levenshtein_with_costs);
     148             : Datum
     149           2 : levenshtein_with_costs(PG_FUNCTION_ARGS)
     150             : {
     151           2 :     text       *src = PG_GETARG_TEXT_PP(0);
     152           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
     153           2 :     int         ins_c = PG_GETARG_INT32(2);
     154           2 :     int         del_c = PG_GETARG_INT32(3);
     155           2 :     int         sub_c = PG_GETARG_INT32(4);
     156             :     const char *s_data;
     157             :     const char *t_data;
     158             :     int         s_bytes,
     159             :                 t_bytes;
     160             : 
     161             :     /* Extract a pointer to the actual character data */
     162           2 :     s_data = VARDATA_ANY(src);
     163           2 :     t_data = VARDATA_ANY(dst);
     164             :     /* Determine length of each string in bytes */
     165           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     166           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     167             : 
     168           2 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     169             :                                        ins_c, del_c, sub_c, false));
     170             : }
     171             : 
     172             : 
     173           4 : PG_FUNCTION_INFO_V1(levenshtein);
     174             : Datum
     175           2 : levenshtein(PG_FUNCTION_ARGS)
     176             : {
     177           2 :     text       *src = PG_GETARG_TEXT_PP(0);
     178           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
     179             :     const char *s_data;
     180             :     const char *t_data;
     181             :     int         s_bytes,
     182             :                 t_bytes;
     183             : 
     184             :     /* Extract a pointer to the actual character data */
     185           2 :     s_data = VARDATA_ANY(src);
     186           2 :     t_data = VARDATA_ANY(dst);
     187             :     /* Determine length of each string in bytes */
     188           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     189           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     190             : 
     191           2 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     192             :                                        1, 1, 1, false));
     193             : }
     194             : 
     195             : 
     196           2 : PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
     197             : Datum
     198           0 : levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
     199             : {
     200           0 :     text       *src = PG_GETARG_TEXT_PP(0);
     201           0 :     text       *dst = PG_GETARG_TEXT_PP(1);
     202           0 :     int         ins_c = PG_GETARG_INT32(2);
     203           0 :     int         del_c = PG_GETARG_INT32(3);
     204           0 :     int         sub_c = PG_GETARG_INT32(4);
     205           0 :     int         max_d = PG_GETARG_INT32(5);
     206             :     const char *s_data;
     207             :     const char *t_data;
     208             :     int         s_bytes,
     209             :                 t_bytes;
     210             : 
     211             :     /* Extract a pointer to the actual character data */
     212           0 :     s_data = VARDATA_ANY(src);
     213           0 :     t_data = VARDATA_ANY(dst);
     214             :     /* Determine length of each string in bytes */
     215           0 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     216           0 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     217             : 
     218           0 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     219             :                                                   t_data, t_bytes,
     220             :                                                   ins_c, del_c, sub_c,
     221             :                                                   max_d, false));
     222             : }
     223             : 
     224             : 
     225           4 : PG_FUNCTION_INFO_V1(levenshtein_less_equal);
     226             : Datum
     227           4 : levenshtein_less_equal(PG_FUNCTION_ARGS)
     228             : {
     229           4 :     text       *src = PG_GETARG_TEXT_PP(0);
     230           4 :     text       *dst = PG_GETARG_TEXT_PP(1);
     231           4 :     int         max_d = PG_GETARG_INT32(2);
     232             :     const char *s_data;
     233             :     const char *t_data;
     234             :     int         s_bytes,
     235             :                 t_bytes;
     236             : 
     237             :     /* Extract a pointer to the actual character data */
     238           4 :     s_data = VARDATA_ANY(src);
     239           4 :     t_data = VARDATA_ANY(dst);
     240             :     /* Determine length of each string in bytes */
     241           4 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     242           4 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     243             : 
     244           4 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     245             :                                                   t_data, t_bytes,
     246             :                                                   1, 1, 1,
     247             :                                                   max_d, false));
     248             : }
     249             : 
     250             : 
     251             : /*
     252             :  * Calculates the metaphone of an input string.
     253             :  * Returns number of characters requested
     254             :  * (suggested value is 4)
     255             :  */
     256           4 : PG_FUNCTION_INFO_V1(metaphone);
     257             : Datum
     258           2 : metaphone(PG_FUNCTION_ARGS)
     259             : {
     260           2 :     char       *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
     261           2 :     size_t      str_i_len = strlen(str_i);
     262             :     int         reqlen;
     263             :     char       *metaph;
     264             : 
     265             :     /* return an empty string if we receive one */
     266           2 :     if (!(str_i_len > 0))
     267           0 :         PG_RETURN_TEXT_P(cstring_to_text(""));
     268             : 
     269           2 :     if (str_i_len > MAX_METAPHONE_STRLEN)
     270           0 :         ereport(ERROR,
     271             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     272             :                  errmsg("argument exceeds the maximum length of %d bytes",
     273             :                         MAX_METAPHONE_STRLEN)));
     274             : 
     275           2 :     reqlen = PG_GETARG_INT32(1);
     276           2 :     if (reqlen > MAX_METAPHONE_STRLEN)
     277           0 :         ereport(ERROR,
     278             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     279             :                  errmsg("output exceeds the maximum length of %d bytes",
     280             :                         MAX_METAPHONE_STRLEN)));
     281             : 
     282           2 :     if (!(reqlen > 0))
     283           0 :         ereport(ERROR,
     284             :                 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
     285             :                  errmsg("output cannot be empty string")));
     286             : 
     287           2 :     _metaphone(str_i, reqlen, &metaph);
     288           2 :     PG_RETURN_TEXT_P(cstring_to_text(metaph));
     289             : }
     290             : 
     291             : 
     292             : /*
     293             :  * Original code by Michael G Schwern starts here.
     294             :  * Code slightly modified for use as PostgreSQL
     295             :  * function (palloc, etc).
     296             :  */
     297             : 
     298             : /* I suppose I could have been using a character pointer instead of
     299             :  * accessing the array directly... */
     300             : 
     301             : /* Look at the next letter in the word */
     302             : #define Next_Letter (toupper((unsigned char) word[w_idx+1]))
     303             : /* Look at the current letter in the word */
     304             : #define Curr_Letter (toupper((unsigned char) word[w_idx]))
     305             : /* Go N letters back. */
     306             : #define Look_Back_Letter(n) \
     307             :     (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
     308             : /* Previous letter.  I dunno, should this return null on failure? */
     309             : #define Prev_Letter (Look_Back_Letter(1))
     310             : /* Look two letters down.  It makes sure you don't walk off the string. */
     311             : #define After_Next_Letter \
     312             :     (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
     313             : #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
     314             : 
     315             : 
     316             : /* Allows us to safely look ahead an arbitrary # of letters */
     317             : /* I probably could have just used strlen... */
     318             : static char
     319           0 : Lookahead(char *word, int how_far)
     320             : {
     321           0 :     char        letter_ahead = '\0';    /* null by default */
     322             :     int         idx;
     323             : 
     324           0 :     for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
     325             :     /* Edge forward in the string... */
     326             : 
     327           0 :     letter_ahead = word[idx];   /* idx will be either == to how_far or at the
     328             :                                  * end of the string */
     329           0 :     return letter_ahead;
     330             : }
     331             : 
     332             : 
     333             : /* phonize one letter */
     334             : #define Phonize(c)  do {(*phoned_word)[p_idx++] = c;} while (0)
     335             : /* Slap a null character on the end of the phoned word */
     336             : #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
     337             : /* How long is the phoned word? */
     338             : #define Phone_Len   (p_idx)
     339             : 
     340             : /* Note whether a letter is a 'break' in the word */
     341             : #define Isbreak(c)  (!isalpha((unsigned char) (c)))
     342             : 
     343             : 
     344             : static void
     345           2 : _metaphone(char *word,          /* IN */
     346             :            int max_phonemes,
     347             :            char **phoned_word)  /* OUT */
     348             : {
     349           2 :     int         w_idx = 0;      /* point in the phonization we're at. */
     350           2 :     int         p_idx = 0;      /* end of the phoned phrase */
     351             : 
     352             :     /*-- Parameter checks --*/
     353             : 
     354             :     /*
     355             :      * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
     356             :      */
     357             : 
     358             :     /* Negative phoneme length is meaningless */
     359           2 :     if (!(max_phonemes > 0))
     360             :         /* internal error */
     361           0 :         elog(ERROR, "metaphone: Requested output length must be > 0");
     362             : 
     363             :     /* Empty/null string is meaningless */
     364           2 :     if ((word == NULL) || !(strlen(word) > 0))
     365             :         /* internal error */
     366           0 :         elog(ERROR, "metaphone: Input string length must be > 0");
     367             : 
     368             :     /*-- Allocate memory for our phoned_phrase --*/
     369           2 :     if (max_phonemes == 0)
     370             :     {                           /* Assume largest possible */
     371           0 :         *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
     372             :     }
     373             :     else
     374             :     {
     375           2 :         *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
     376             :     }
     377             : 
     378             :     /*-- The first phoneme has to be processed specially. --*/
     379             :     /* Find our first letter */
     380           2 :     for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
     381             :     {
     382             :         /* On the off chance we were given nothing but crap... */
     383           0 :         if (Curr_Letter == '\0')
     384             :         {
     385           0 :             End_Phoned_Word;
     386           0 :             return;
     387             :         }
     388             :     }
     389             : 
     390           2 :     switch (Curr_Letter)
     391             :     {
     392             :             /* AE becomes E */
     393           0 :         case 'A':
     394           0 :             if (Next_Letter == 'E')
     395             :             {
     396           0 :                 Phonize('E');
     397           0 :                 w_idx += 2;
     398             :             }
     399             :             /* Remember, preserve vowels at the beginning */
     400             :             else
     401             :             {
     402           0 :                 Phonize('A');
     403           0 :                 w_idx++;
     404             :             }
     405           0 :             break;
     406             :             /* [GKP]N becomes N */
     407           2 :         case 'G':
     408             :         case 'K':
     409             :         case 'P':
     410           2 :             if (Next_Letter == 'N')
     411             :             {
     412           0 :                 Phonize('N');
     413           0 :                 w_idx += 2;
     414             :             }
     415           2 :             break;
     416             : 
     417             :             /*
     418             :              * WH becomes H, WR becomes R W if followed by a vowel
     419             :              */
     420           0 :         case 'W':
     421           0 :             if (Next_Letter == 'H' ||
     422           0 :                 Next_Letter == 'R')
     423             :             {
     424           0 :                 Phonize(Next_Letter);
     425           0 :                 w_idx += 2;
     426             :             }
     427           0 :             else if (isvowel(Next_Letter))
     428             :             {
     429           0 :                 Phonize('W');
     430           0 :                 w_idx += 2;
     431             :             }
     432             :             /* else ignore */
     433           0 :             break;
     434             :             /* X becomes S */
     435           0 :         case 'X':
     436           0 :             Phonize('S');
     437           0 :             w_idx++;
     438           0 :             break;
     439             :             /* Vowels are kept */
     440             : 
     441             :             /*
     442             :              * We did A already case 'A': case 'a':
     443             :              */
     444           0 :         case 'E':
     445             :         case 'I':
     446             :         case 'O':
     447             :         case 'U':
     448           0 :             Phonize(Curr_Letter);
     449           0 :             w_idx++;
     450           0 :             break;
     451           0 :         default:
     452             :             /* do nothing */
     453           0 :             break;
     454             :     }
     455             : 
     456             : 
     457             : 
     458             :     /* On to the metaphoning */
     459          12 :     for (; Curr_Letter != '\0' &&
     460          10 :          (max_phonemes == 0 || Phone_Len < max_phonemes);
     461          10 :          w_idx++)
     462             :     {
     463             :         /*
     464             :          * How many letters to skip because an earlier encoding handled
     465             :          * multiple letters
     466             :          */
     467          10 :         unsigned short int skip_letter = 0;
     468             : 
     469             : 
     470             :         /*
     471             :          * THOUGHT:  It would be nice if, rather than having things like...
     472             :          * well, SCI.  For SCI you encode the S, then have to remember to skip
     473             :          * the C.  So the phonome SCI invades both S and C.  It would be
     474             :          * better, IMHO, to skip the C from the S part of the encoding. Hell,
     475             :          * I'm trying it.
     476             :          */
     477             : 
     478             :         /* Ignore non-alphas */
     479          10 :         if (!isalpha((unsigned char) (Curr_Letter)))
     480           0 :             continue;
     481             : 
     482             :         /* Drop duplicates, except CC */
     483          10 :         if (Curr_Letter == Prev_Letter &&
     484           0 :             Curr_Letter != 'C')
     485           0 :             continue;
     486             : 
     487          10 :         switch (Curr_Letter)
     488             :         {
     489             :                 /* B -> B unless in MB */
     490           2 :             case 'B':
     491           2 :                 if (Prev_Letter != 'M')
     492           0 :                     Phonize('B');
     493           2 :                 break;
     494             : 
     495             :                 /*
     496             :                  * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
     497             :                  * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
     498             :                  * SCE-, -SCY- (handed in S) else K
     499             :                  */
     500           0 :             case 'C':
     501           0 :                 if (MAKESOFT(Next_Letter))
     502             :                 {               /* C[IEY] */
     503           0 :                     if (After_Next_Letter == 'A' &&
     504           0 :                         Next_Letter == 'I')
     505             :                     {           /* CIA */
     506           0 :                         Phonize(SH);
     507             :                     }
     508             :                     /* SC[IEY] */
     509           0 :                     else if (Prev_Letter == 'S')
     510             :                     {
     511             :                         /* Dropped */
     512             :                     }
     513             :                     else
     514           0 :                         Phonize('S');
     515             :                 }
     516           0 :                 else if (Next_Letter == 'H')
     517             :                 {
     518             : #ifndef USE_TRADITIONAL_METAPHONE
     519           0 :                     if (After_Next_Letter == 'R' ||
     520           0 :                         Prev_Letter == 'S')
     521             :                     {           /* Christ, School */
     522           0 :                         Phonize('K');
     523             :                     }
     524             :                     else
     525           0 :                         Phonize(SH);
     526             : #else
     527             :                     Phonize(SH);
     528             : #endif
     529           0 :                     skip_letter++;
     530             :                 }
     531             :                 else
     532           0 :                     Phonize('K');
     533           0 :                 break;
     534             : 
     535             :                 /*
     536             :                  * J if in -DGE-, -DGI- or -DGY- else T
     537             :                  */
     538           0 :             case 'D':
     539           0 :                 if (Next_Letter == 'G' &&
     540           0 :                     MAKESOFT(After_Next_Letter))
     541             :                 {
     542           0 :                     Phonize('J');
     543           0 :                     skip_letter++;
     544             :                 }
     545             :                 else
     546           0 :                     Phonize('T');
     547           0 :                 break;
     548             : 
     549             :                 /*
     550             :                  * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
     551             :                  * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
     552             :                  * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
     553             :                  * else K
     554             :                  */
     555           2 :             case 'G':
     556           2 :                 if (Next_Letter == 'H')
     557             :                 {
     558           0 :                     if (!(NOGHTOF(Look_Back_Letter(3)) ||
     559           0 :                           Look_Back_Letter(4) == 'H'))
     560             :                     {
     561           0 :                         Phonize('F');
     562           0 :                         skip_letter++;
     563             :                     }
     564             :                     else
     565             :                     {
     566             :                         /* silent */
     567             :                     }
     568             :                 }
     569           2 :                 else if (Next_Letter == 'N')
     570             :                 {
     571           0 :                     if (Isbreak(After_Next_Letter) ||
     572           0 :                         (After_Next_Letter == 'E' &&
     573           0 :                          Look_Ahead_Letter(3) == 'D'))
     574             :                     {
     575             :                         /* dropped */
     576             :                     }
     577             :                     else
     578           0 :                         Phonize('K');
     579             :                 }
     580           2 :                 else if (MAKESOFT(Next_Letter) &&
     581           0 :                          Prev_Letter != 'G')
     582           0 :                     Phonize('J');
     583             :                 else
     584           2 :                     Phonize('K');
     585           2 :                 break;
     586             :                 /* H if before a vowel and not after C,G,P,S,T */
     587           0 :             case 'H':
     588           0 :                 if (isvowel(Next_Letter) &&
     589           0 :                     !AFFECTH(Prev_Letter))
     590           0 :                     Phonize('H');
     591           0 :                 break;
     592             : 
     593             :                 /*
     594             :                  * dropped if after C else K
     595             :                  */
     596           0 :             case 'K':
     597           0 :                 if (Prev_Letter != 'C')
     598           0 :                     Phonize('K');
     599           0 :                 break;
     600             : 
     601             :                 /*
     602             :                  * F if before H else P
     603             :                  */
     604           0 :             case 'P':
     605           0 :                 if (Next_Letter == 'H')
     606           0 :                     Phonize('F');
     607             :                 else
     608           0 :                     Phonize('P');
     609           0 :                 break;
     610             : 
     611             :                 /*
     612             :                  * K
     613             :                  */
     614           0 :             case 'Q':
     615           0 :                 Phonize('K');
     616           0 :                 break;
     617             : 
     618             :                 /*
     619             :                  * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
     620             :                  */
     621           0 :             case 'S':
     622           0 :                 if (Next_Letter == 'I' &&
     623           0 :                     (After_Next_Letter == 'O' ||
     624           0 :                      After_Next_Letter == 'A'))
     625           0 :                     Phonize(SH);
     626           0 :                 else if (Next_Letter == 'H')
     627             :                 {
     628           0 :                     Phonize(SH);
     629           0 :                     skip_letter++;
     630             :                 }
     631             : #ifndef USE_TRADITIONAL_METAPHONE
     632           0 :                 else if (Next_Letter == 'C' &&
     633           0 :                          Look_Ahead_Letter(2) == 'H' &&
     634           0 :                          Look_Ahead_Letter(3) == 'W')
     635             :                 {
     636           0 :                     Phonize(SH);
     637           0 :                     skip_letter += 2;
     638             :                 }
     639             : #endif
     640             :                 else
     641           0 :                     Phonize('S');
     642           0 :                 break;
     643             : 
     644             :                 /*
     645             :                  * 'sh' in -TIA- or -TIO- else 'th' before H else T
     646             :                  */
     647           0 :             case 'T':
     648           0 :                 if (Next_Letter == 'I' &&
     649           0 :                     (After_Next_Letter == 'O' ||
     650           0 :                      After_Next_Letter == 'A'))
     651           0 :                     Phonize(SH);
     652           0 :                 else if (Next_Letter == 'H')
     653             :                 {
     654           0 :                     Phonize(TH);
     655           0 :                     skip_letter++;
     656             :                 }
     657             :                 else
     658           0 :                     Phonize('T');
     659           0 :                 break;
     660             :                 /* F */
     661           0 :             case 'V':
     662           0 :                 Phonize('F');
     663           0 :                 break;
     664             :                 /* W before a vowel, else dropped */
     665           0 :             case 'W':
     666           0 :                 if (isvowel(Next_Letter))
     667           0 :                     Phonize('W');
     668           0 :                 break;
     669             :                 /* KS */
     670           0 :             case 'X':
     671           0 :                 Phonize('K');
     672           0 :                 if (max_phonemes == 0 || Phone_Len < max_phonemes)
     673           0 :                     Phonize('S');
     674           0 :                 break;
     675             :                 /* Y if followed by a vowel */
     676           0 :             case 'Y':
     677           0 :                 if (isvowel(Next_Letter))
     678           0 :                     Phonize('Y');
     679           0 :                 break;
     680             :                 /* S */
     681           0 :             case 'Z':
     682           0 :                 Phonize('S');
     683           0 :                 break;
     684             :                 /* No transformation */
     685           2 :             case 'F':
     686             :             case 'J':
     687             :             case 'L':
     688             :             case 'M':
     689             :             case 'N':
     690             :             case 'R':
     691           2 :                 Phonize(Curr_Letter);
     692           2 :                 break;
     693           4 :             default:
     694             :                 /* nothing */
     695           4 :                 break;
     696             :         }                       /* END SWITCH */
     697             : 
     698          10 :         w_idx += skip_letter;
     699             :     }                           /* END FOR */
     700             : 
     701           2 :     End_Phoned_Word;
     702             : }                               /* END metaphone */
     703             : 
     704             : 
     705             : /*
     706             :  * SQL function: soundex(text) returns text
                     :  *
                     :  * Converts argument 0 to a C string, lets _soundex() write the code into
                     :  * a local buffer, and returns that buffer's contents as text.
                     :  *
                     :  * When the input holds no alphabetic character, _soundex() fills the
                     :  * buffer with NUL bytes, so the C string handed to cstring_to_text() is
                     :  * empty.
     707             :  */
     708           6 : PG_FUNCTION_INFO_V1(soundex);
     709             : 
     710             : Datum
     711          16 : soundex(PG_FUNCTION_ARGS)
     712             : {
     713             :     char        outstr[SOUNDEX_LEN + 1];    /* code characters plus NUL */
     714             :     char       *arg;
     715             : 
     716          16 :     arg = text_to_cstring(PG_GETARG_TEXT_PP(0));
     717             : 
     718          16 :     _soundex(arg, outstr);
     719             : 
     720          16 :     PG_RETURN_TEXT_P(cstring_to_text(outstr));
     721             : }
     722             : /*
                     :  * _soundex: write the Soundex code of instr into outstr.
                     :  *
                     :  * instr is a NUL-terminated string; outstr must have room for
                     :  * SOUNDEX_LEN + 1 bytes.  Both pointers are expected to be non-NULL
                     :  * (checked with Assert only).
                     :  *
                     :  * If instr contains no alphabetic character, outstr is filled with
                     :  * SOUNDEX_LEN + 1 NUL bytes.  Otherwise outstr receives SOUNDEX_LEN
                     :  * characters followed by a NUL: the first letter folded to upper case,
                     :  * then the codes of later letters, padded on the right with '0'.
                     :  *
                     :  * NOTE(review): soundex_code() is defined elsewhere in this file; '0' is
                     :  * presumably its value for letters that carry no code -- confirm there.
                     :  */
     723             : static void
     724          32 : _soundex(const char *instr, char *outstr)
     725             : {
     726             :     int         count;
     727             : 
     728             :     Assert(instr);
     729             :     Assert(outstr);
     730             : 
     731             :     /* Skip leading non-alphabetic characters */
     732          32 :     while (*instr && !isalpha((unsigned char) *instr))
     733           0 :         ++instr;
     734             : 
     735             :     /* No letter found: fill the whole buffer with NUL bytes, not '0' digits */
     736          32 :     if (!*instr)
     737             :     {
     738           6 :         memset(outstr, '\0', SOUNDEX_LEN + 1);
     739           6 :         return;
     740             :     }
     741             : 
     742             :     /* First output character: the first letter, folded to upper case */
     743          26 :     *outstr++ = (char) toupper((unsigned char) *instr++);
     744             :     /*
                     :      * count is the number of characters kept in outstr so far.
                     :      *
                     :      * Each remaining letter is stored when its code differs from the code
                     :      * of the character immediately before it in the input.  That previous
                     :      * character is passed to soundex_code() whether or not it is a letter.
                     :      * instr - 1 is always valid here, because instr was advanced past the
                     :      * first letter above.
                     :      *
                     :      * The output position only advances when the stored code is not '0',
                     :      * so a stored '0' is overwritten by the next stored code, by the
                     :      * padding, or by the terminator.
                     :      */
     745          26 :     count = 1;
     746         120 :     while (*instr && count < SOUNDEX_LEN)
     747             :     {
     748         186 :         if (isalpha((unsigned char) *instr) &&
     749          92 :             soundex_code(*instr) != soundex_code(*(instr - 1)))
     750             :         {
     751          70 :             *outstr = soundex_code(*instr);
     752          70 :             if (*outstr != '0')
     753             :             {
     754          46 :                 ++outstr;
     755          46 :                 ++count;
     756             :             }
     757             :         }
     758          94 :         ++instr;
     759             :     }
     760             : 
     761             :     /* Pad with '0' characters until SOUNDEX_LEN characters are present */
     762          58 :     while (count < SOUNDEX_LEN)
     763             :     {
     764          32 :         *outstr = '0';
     765          32 :         ++outstr;
     766          32 :         ++count;
     767             :     }
     768             : 
     769             :     /* And null-terminate */
     770          26 :     *outstr = '\0';
     771             : }
     772             : /*
                     :  * difference: compare the Soundex codes of arguments 0 and 1.
                     :  *
                     :  * Both arguments are converted to C strings and encoded with _soundex().
                     :  * The result, returned through PG_RETURN_INT32, is the number of
                     :  * positions in 0 .. SOUNDEX_LEN - 1 at which the two codes hold the same
                     :  * character, so it ranges from 0 to SOUNDEX_LEN.
                     :  *
                     :  * _soundex() leaves an all-NUL buffer for an input without letters, so
                     :  * two such inputs agree at every position and yield SOUNDEX_LEN.
                     :  */
     773           4 : PG_FUNCTION_INFO_V1(difference);
     774             : 
     775             : Datum
     776           8 : difference(PG_FUNCTION_ARGS)
     777             : {
     778             :     char        sndx1[SOUNDEX_LEN + 1],
     779             :                 sndx2[SOUNDEX_LEN + 1];
     780             :     int         i,
     781             :                 result;
     782             : 
     783           8 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
     784           8 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
     785             :     /* Count the positions at which the two codes agree */
     786           8 :     result = 0;
     787          40 :     for (i = 0; i < SOUNDEX_LEN; i++)
     788             :     {
     789          32 :         if (sndx1[i] == sndx2[i])
     790          20 :             result++;
     791             :     }
     792             : 
     793           8 :     PG_RETURN_INT32(result);
     794             : }

Generated by: LCOV version 1.14