LCOV - code coverage report
Current view: top level - contrib/fuzzystrmatch - fuzzystrmatch.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 117 255 45.9 %
Date: 2019-11-15 23:07:02 Functions: 18 20 90.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * fuzzystrmatch.c
       3             :  *
       4             :  * Functions for "fuzzy" comparison of strings
       5             :  *
       6             :  * Joe Conway <mail@joeconway.com>
       7             :  *
       8             :  * contrib/fuzzystrmatch/fuzzystrmatch.c
       9             :  * Copyright (c) 2001-2019, PostgreSQL Global Development Group
      10             :  * ALL RIGHTS RESERVED;
      11             :  *
      12             :  * metaphone()
      13             :  * -----------
      14             :  * Modified for PostgreSQL by Joe Conway.
      15             :  * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
      16             :  * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
      17             :  * Metaphone was originally created by Lawrence Philips and presented in article
      18             :  * in "Computer Language" December 1990 issue.
      19             :  *
      20             :  * Permission to use, copy, modify, and distribute this software and its
      21             :  * documentation for any purpose, without fee, and without a written agreement
      22             :  * is hereby granted, provided that the above copyright notice and this
      23             :  * paragraph and the following two paragraphs appear in all copies.
      24             :  *
      25             :  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
      26             :  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
      27             :  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
      28             :  * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
      29             :  * POSSIBILITY OF SUCH DAMAGE.
      30             :  *
      31             :  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
      32             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      33             :  * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
      34             :  * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
      35             :  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
      36             :  *
      37             :  */
      38             : 
      39             : #include "postgres.h"
      40             : 
      41             : #include <ctype.h>
      42             : 
      43             : #include "mb/pg_wchar.h"
      44             : #include "utils/builtins.h"
      45             : #include "utils/varlena.h"
      46             : 
      47           2 : PG_MODULE_MAGIC;
      48             : 
      49             : /*
      50             :  * Soundex
      51             :  */
      52             : static void _soundex(const char *instr, char *outstr);   /* definition is not visible in this excerpt */
      53             : 
      54             : #define SOUNDEX_LEN 4   /* NOTE(review): presumably the length of a Soundex code; its users are outside this excerpt */
      55             : 
      56             : /*                                  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
      57             : static const char *soundex_table = "01230120022455012623010202";    /* one digit per letter; soundex_code() indexes it with (letter - 'A') */
      58             : 
      59             : static char    /* Map one character to its digit in soundex_table; non-A..Z input is returned upper-cased. */
      60         254 : soundex_code(char letter)
      61             : {
      62         254 :     letter = toupper((unsigned char) letter);   /* cast to unsigned char before calling a <ctype.h> function */
      63             :     /* Defend against non-ASCII letters: only 'A'..'Z' may index the 26-character soundex_table */
      64         254 :     if (letter >= 'A' && letter <= 'Z')
      65         252 :         return soundex_table[letter - 'A'];
      66           2 :     return letter;  /* not A..Z: hand back the (upper-cased) character itself */
      67             : }
      68             : 
      69             : /*
      70             :  * Metaphone
      71             :  */
      72             : #define MAX_METAPHONE_STRLEN        255 /* metaphone() rejects inputs longer than this (in bytes) and requested output lengths above it */
      73             : 
      74             : /*
      75             :  * Original code by Michael G Schwern starts here.
      76             :  * Code slightly modified for use as PostgreSQL function.
      77             :  */
      78             : 
      79             : 
      80             : /**************************************************************************
      81             :     metaphone -- Breaks english phrases down into their phonemes.
      82             : 
      83             :     Input
      84             :         word            --  An english word to be phonized
      85             :         max_phonemes    --  How many phonemes to calculate.  If 0, then it
      86             :                             will phonize the entire phrase.
      87             :         phoned_word     --  The final phonized word.  (We'll allocate the
      88             :                             memory.)
      89             :     Output
      90             :         error   --  A simple error flag, returns true or false
      91             : 
      92             :     NOTES:  ALL non-alpha characters are ignored, this includes whitespace,
      93             :     although non-alpha characters will break up phonemes.
      94             : ****************************************************************************/
      95             : 
      96             : 
      97             : /*  I add modifications to the traditional metaphone algorithm that you
      98             :     might find in books.  Define this if you want metaphone to behave
      99             :     traditionally */
     100             : #undef USE_TRADITIONAL_METAPHONE
     101             : 
     102             : /* Special encodings */
     103             : #define  SH     'X'     /* code emitted by Phonize(SH) for the 'sh' sound */
     104             : #define  TH     '0'     /* NOTE(review): presumably the code for the 'th' sound; its use is outside this excerpt */
     105             : 
     106             : static char Lookahead(char *word, int how_far);     /* bounded look-ahead helper used by Look_Ahead_Letter */
     107             : static void _metaphone(char *word, int max_phonemes, char **phoned_word);   /* worker called by metaphone() */
     108             : 
     109             : /* Metachar.h ... little bits about characters for metaphone */
     110             : 
     111             : 
     112             : /*-- Character encoding array & accessing macros --*/
     113             : /* Stolen directly out of the book... */
     114             : static const char _codes[26] = {    /* per-letter bit flags read by getcode(): 1 isvowel, 2 NOCHANGE, 4 AFFECTH, 8 MAKESOFT, 16 NOGHTOF */
     115             :     1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
     116             : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
     117             : };
     118             : 
     119             : static int     /* Return the _codes flag bits for letter c, or 0 when c is not an ASCII letter. */
     120           2 : getcode(char c)
     121             : {
     122           2 :     if (isalpha((unsigned char) c))
     123             :     {
     124           2 :         c = toupper((unsigned char) c);
     125             :         /* Defend against non-ASCII letters: isalpha() passed, but only 'A'..'Z' may index _codes[26] */
     126           2 :         if (c >= 'A' && c <= 'Z')
     127           2 :             return _codes[c - 'A'];
     128             :     }
     129           0 :     return 0;   /* no flags set: every flag-testing macro below evaluates to false */
     130             : }
     131             : 
     132             : #define isvowel(c)  (getcode(c) & 1)    /* AEIOU (Y is not flagged as a vowel in _codes) */
     133             : 
     134             : /* These letters are passed through unchanged */
     135             : #define NOCHANGE(c) (getcode(c) & 2)    /* FJLMNR (_codes sets this bit for L as well) */
     136             : 
     137             : /* These form diphthongs when preceding H */
     138             : #define AFFECTH(c)  (getcode(c) & 4)    /* CGPST */
     139             : 
     140             : /* These make C and G soft */
     141             : #define MAKESOFT(c) (getcode(c) & 8)    /* EIY */
     142             : 
     143             : /* These prevent GH from becoming F */
     144             : #define NOGHTOF(c)  (getcode(c) & 16)   /* BDH */
     145             : 
     146           4 : PG_FUNCTION_INFO_V1(levenshtein_with_costs);
     147             : Datum  /* SQL-callable wrapper: args 0 and 1 are the two strings, args 2-4 are ins_c, del_c and sub_c */
     148           2 : levenshtein_with_costs(PG_FUNCTION_ARGS)
     149             : {
     150           2 :     text       *src = PG_GETARG_TEXT_PP(0);
     151           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
     152           2 :     int         ins_c = PG_GETARG_INT32(2);     /* presumably the insertion cost */
     153           2 :     int         del_c = PG_GETARG_INT32(3);     /* presumably the deletion cost */
     154           2 :     int         sub_c = PG_GETARG_INT32(4);     /* presumably the substitution cost */
     155             :     const char *s_data;
     156             :     const char *t_data;
     157             :     int         s_bytes,
     158             :                 t_bytes;
     159             : 
     160             :     /* Extract a pointer to the actual character data */
     161           2 :     s_data = VARDATA_ANY(src);
     162           2 :     t_data = VARDATA_ANY(dst);
     163             :     /* Determine length of each string in bytes */
     164           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     165           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     166             :     /* All the work is delegated to varstr_levenshtein(); its result is returned as an int32 */
     167           2 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     168             :                                        ins_c, del_c, sub_c, false));
     169             : }
     170             : 
     171             : 
     172           4 : PG_FUNCTION_INFO_V1(levenshtein);
     173             : Datum  /* SQL-callable wrapper: same as levenshtein_with_costs() but with all three costs fixed at 1 */
     174           2 : levenshtein(PG_FUNCTION_ARGS)
     175             : {
     176           2 :     text       *src = PG_GETARG_TEXT_PP(0);
     177           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
     178             :     const char *s_data;
     179             :     const char *t_data;
     180             :     int         s_bytes,
     181             :                 t_bytes;
     182             : 
     183             :     /* Extract a pointer to the actual character data */
     184           2 :     s_data = VARDATA_ANY(src);
     185           2 :     t_data = VARDATA_ANY(dst);
     186             :     /* Determine length of each string in bytes */
     187           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     188           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     189             :     /* Unit costs (1, 1, 1) are passed where levenshtein_with_costs() passes ins_c, del_c, sub_c */
     190           2 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     191             :                                        1, 1, 1, false));
     192             : }
     193             : 
     194             : 
     195           2 : PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
     196             : Datum  /* SQL-callable wrapper: args 0 and 1 are the strings, args 2-4 the costs, arg 5 is max_d */
     197           0 : levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
     198             : {
     199           0 :     text       *src = PG_GETARG_TEXT_PP(0);
     200           0 :     text       *dst = PG_GETARG_TEXT_PP(1);
     201           0 :     int         ins_c = PG_GETARG_INT32(2);     /* presumably the insertion cost */
     202           0 :     int         del_c = PG_GETARG_INT32(3);     /* presumably the deletion cost */
     203           0 :     int         sub_c = PG_GETARG_INT32(4);     /* presumably the substitution cost */
     204           0 :     int         max_d = PG_GETARG_INT32(5);     /* NOTE(review): presumably the distance bound; semantics live in varstr_levenshtein_less_equal() */
     205             :     const char *s_data;
     206             :     const char *t_data;
     207             :     int         s_bytes,
     208             :                 t_bytes;
     209             : 
     210             :     /* Extract a pointer to the actual character data */
     211           0 :     s_data = VARDATA_ANY(src);
     212           0 :     t_data = VARDATA_ANY(dst);
     213             :     /* Determine length of each string in bytes */
     214           0 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     215           0 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     216             :     /* All the work is delegated to varstr_levenshtein_less_equal(); its result is returned as an int32 */
     217           0 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     218             :                                                   t_data, t_bytes,
     219             :                                                   ins_c, del_c, sub_c,
     220             :                                                   max_d, false));
     221             : }
     222             : 
     223             : 
     224           4 : PG_FUNCTION_INFO_V1(levenshtein_less_equal);
     225             : Datum  /* SQL-callable wrapper: same as levenshtein_less_equal_with_costs() but with all three costs fixed at 1 */
     226           4 : levenshtein_less_equal(PG_FUNCTION_ARGS)
     227             : {
     228           4 :     text       *src = PG_GETARG_TEXT_PP(0);
     229           4 :     text       *dst = PG_GETARG_TEXT_PP(1);
     230           4 :     int         max_d = PG_GETARG_INT32(2);     /* NOTE(review): presumably the distance bound; semantics live in varstr_levenshtein_less_equal() */
     231             :     const char *s_data;
     232             :     const char *t_data;
     233             :     int         s_bytes,
     234             :                 t_bytes;
     235             : 
     236             :     /* Extract a pointer to the actual character data */
     237           4 :     s_data = VARDATA_ANY(src);
     238           4 :     t_data = VARDATA_ANY(dst);
     239             :     /* Determine length of each string in bytes */
     240           4 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     241           4 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     242             :     /* Unit costs (1, 1, 1) are passed ahead of max_d */
     243           4 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     244             :                                                   t_data, t_bytes,
     245             :                                                   1, 1, 1,
     246             :                                                   max_d, false));
     247             : }
     248             : 
     249             : 
     250             : /*
     251             :  * Calculates the metaphone code of the input string (argument 0).
     252             :  * Argument 1 is the requested output length; it must be in the range
     253             :  * 1..MAX_METAPHONE_STRLEN (suggested value is 4).  Empty input returns "".
     254             :  */
     255           4 : PG_FUNCTION_INFO_V1(metaphone);
     256             : Datum
     257           2 : metaphone(PG_FUNCTION_ARGS)
     258             : {
     259           2 :     char       *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
     260           2 :     size_t      str_i_len = strlen(str_i);  /* length in bytes, as counted by strlen() */
     261             :     int         reqlen;
     262             :     char       *metaph;
     263             : 
     264             :     /* return an empty string if we receive one */
     265           2 :     if (!(str_i_len > 0))
     266           0 :         PG_RETURN_TEXT_P(cstring_to_text(""));
     267             :     /* reject over-long input before doing any work */
     268           2 :     if (str_i_len > MAX_METAPHONE_STRLEN)
     269           0 :         ereport(ERROR,
     270             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     271             :                  errmsg("argument exceeds the maximum length of %d bytes",
     272             :                         MAX_METAPHONE_STRLEN)));
     273             :     /* validate the requested output length: upper bound first, then must be positive */
     274           2 :     reqlen = PG_GETARG_INT32(1);
     275           2 :     if (reqlen > MAX_METAPHONE_STRLEN)
     276           0 :         ereport(ERROR,
     277             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     278             :                  errmsg("output exceeds the maximum length of %d bytes",
     279             :                         MAX_METAPHONE_STRLEN)));
     280             : 
     281           2 :     if (!(reqlen > 0))
     282           0 :         ereport(ERROR,
     283             :                 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
     284             :                  errmsg("output cannot be empty string")));
     285             :     /* _metaphone() allocates the result buffer and stores its address in metaph */
     286           2 :     _metaphone(str_i, reqlen, &metaph);
     287           2 :     PG_RETURN_TEXT_P(cstring_to_text(metaph));
     288             : }
     289             : 
     290             : 
     291             : /*
     292             :  * Original code by Michael G Schwern starts here.
     293             :  * Code slightly modified for use as PostgreSQL
     294             :  * function (palloc, etc).
     295             :  */
     296             : 
     297             : /* I suppose I could have been using a character pointer instead of
     298             :  * accessing the array directly... */
     299             : 
     300             : /* Look at the next letter in the word (upper-cased; relies on the caller's word and w_idx) */
     301             : #define Next_Letter (toupper((unsigned char) word[w_idx+1]))
     302             : /* Look at the current letter in the word (upper-cased) */
     303             : #define Curr_Letter (toupper((unsigned char) word[w_idx]))
     304             : /* Go N letters back; yields '\0' when fewer than N letters precede the current one. */
     305             : #define Look_Back_Letter(n) \
     306             :     (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
     307             : /* Previous letter, or '\0' when the current letter is the first one in the string. */
     308             : #define Prev_Letter (Look_Back_Letter(1))
     309             : /* Look two letters down.  It makes sure you don't walk off the string. */
     310             : #define After_Next_Letter \
     311             :     (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
     312             : #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))  /* n letters ahead via Lookahead(), which stops at the terminator */
     313             : 
     314             : 
     315             : /* Allows us to safely look ahead an arbitrary # of letters: returns word[how_far], or '\0' if the string ends first */
     316             : /* I probably could have just used strlen... */
     317             : static char
     318           0 : Lookahead(char *word, int how_far)
     319             : {
     320           0 :     char        letter_ahead = '\0';    /* null by default */
     321             :     int         idx;
     322             : 
     323           0 :     for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
     324             :     /* Edge forward in the string... (the loop above has an intentionally empty body) */
     325             : 
     326           0 :     letter_ahead = word[idx];   /* idx will be either == to how_far or at the
     327             :                                  * end of the string */
     328           0 :     return letter_ahead;
     329             : }
     330             : 
     331             : 
     332             : /* phonize one letter: store c at (*phoned_word)[p_idx] and advance p_idx (no bounds check here) */
     333             : #define Phonize(c)  do {(*phoned_word)[p_idx++] = c;} while (0)
     334             : /* Slap a null character on the end of the phoned word (p_idx is not advanced) */
     335             : #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
     336             : /* How long is the phoned word? */
     337             : #define Phone_Len   (p_idx)
     338             : 
     339             : /* Note if a letter is a 'break' in the word: any non-alphabetic character, including '\0' */
     340             : #define Isbreak(c)  (!isalpha((unsigned char) (c)))
     341             : 
     342             : 
     343             : static void
     344           2 : _metaphone(char *word,          /* IN */
     345             :            int max_phonemes,
     346             :            char **phoned_word)  /* OUT */
     347             : {
     348           2 :     int         w_idx = 0;      /* point in the phonization we're at. */
     349           2 :     int         p_idx = 0;      /* end of the phoned phrase */
     350             : 
     351             :     /*-- Parameter checks --*/
     352             : 
     353             :     /*
     354             :      * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
     355             :      */
     356             : 
     357             :     /* Negative phoneme length is meaningless */
     358           2 :     if (!(max_phonemes > 0))
     359             :         /* internal error */
     360           0 :         elog(ERROR, "metaphone: Requested output length must be > 0");
     361             : 
     362             :     /* Empty/null string is meaningless */
     363           2 :     if ((word == NULL) || !(strlen(word) > 0))
     364             :         /* internal error */
     365           0 :         elog(ERROR, "metaphone: Input string length must be > 0");
     366             : 
     367             :     /*-- Allocate memory for our phoned_phrase --*/
     368           2 :     if (max_phonemes == 0)
     369             :     {                           /* Assume largest possible */
     370           0 :         *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
     371             :     }
     372             :     else
     373             :     {
     374           2 :         *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
     375             :     }
     376             : 
     377             :     /*-- The first phoneme has to be processed specially. --*/
     378             :     /* Find our first letter */
     379           2 :     for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
     380             :     {
     381             :         /* On the off chance we were given nothing but crap... */
     382           0 :         if (Curr_Letter == '\0')
     383             :         {
     384           0 :             End_Phoned_Word;
     385           0 :             return;
     386             :         }
     387             :     }
     388             : 
     389           2 :     switch (Curr_Letter)
     390             :     {
     391             :             /* AE becomes E */
     392             :         case 'A':
     393           0 :             if (Next_Letter == 'E')
     394             :             {
     395           0 :                 Phonize('E');
     396           0 :                 w_idx += 2;
     397             :             }
     398             :             /* Remember, preserve vowels at the beginning */
     399             :             else
     400             :             {
     401           0 :                 Phonize('A');
     402           0 :                 w_idx++;
     403             :             }
     404           0 :             break;
     405             :             /* [GKP]N becomes N */
     406             :         case 'G':
     407             :         case 'K':
     408             :         case 'P':
     409           2 :             if (Next_Letter == 'N')
     410             :             {
     411           0 :                 Phonize('N');
     412           0 :                 w_idx += 2;
     413             :             }
     414           2 :             break;
     415             : 
     416             :             /*
     417             :              * WH becomes H, WR becomes R, W is kept if followed by a vowel
     418             :              */
     419             :         case 'W':
     420           0 :             if (Next_Letter == 'H' ||
     421           0 :                 Next_Letter == 'R')
     422             :             {
     423           0 :                 Phonize(Next_Letter);
     424           0 :                 w_idx += 2;
     425             :             }
     426           0 :             else if (isvowel(Next_Letter))
     427             :             {
     428           0 :                 Phonize('W');
     429           0 :                 w_idx += 2;
     430             :             }
     431             :             /* else ignore */
     432           0 :             break;
     433             :             /* X becomes S */
     434             :         case 'X':
     435           0 :             Phonize('S');
     436           0 :             w_idx++;
     437           0 :             break;
     438             :             /* Vowels are kept */
     439             : 
     440             :             /*
     441             :              * We did A already case 'A': case 'a':
     442             :              */
     443             :         case 'E':
     444             :         case 'I':
     445             :         case 'O':
     446             :         case 'U':
     447           0 :             Phonize(Curr_Letter);
     448           0 :             w_idx++;
     449           0 :             break;
     450             :         default:
     451             :             /* do nothing */
     452           0 :             break;
     453             :     }
     454             : 
     455             : 
     456             : 
     457             :     /* On to the metaphoning */
     458          14 :     for (; Curr_Letter != '\0' &&
     459          10 :          (max_phonemes == 0 || Phone_Len < max_phonemes);
     460          10 :          w_idx++)
     461             :     {
     462             :         /*
     463             :          * How many letters to skip because an earlier encoding handled
     464             :          * multiple letters
     465             :          */
     466          10 :         unsigned short int skip_letter = 0;
     467             : 
     468             : 
     469             :         /*
     470             :          * THOUGHT:  It would be nice if, rather than having things like...
     471             :          * well, SCI.  For SCI you encode the S, then have to remember to skip
     472             :          * the C.  So the phoneme SCI invades both S and C.  It would be
     473             :          * better, IMHO, to skip the C from the S part of the encoding. Hell,
     474             :          * I'm trying it.
     475             :          */
     476             : 
     477             :         /* Ignore non-alphas */
     478          10 :         if (!isalpha((unsigned char) (Curr_Letter)))
     479           0 :             continue;
     480             : 
     481             :         /* Drop duplicates, except CC */
     482          10 :         if (Curr_Letter == Prev_Letter &&
     483           0 :             Curr_Letter != 'C')
     484           0 :             continue;
     485             : 
     486          10 :         switch (Curr_Letter)
     487             :         {
     488             :                 /* B -> B unless in MB */
     489             :             case 'B':
     490           2 :                 if (Prev_Letter != 'M')
     491           0 :                     Phonize('B');
     492           2 :                 break;
     493             : 
     494             :                 /*
     495             :                  * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
     496             :                  * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
     497             :                  * SCE-, -SCY- (handled in S) else K
     498             :                  */
     499             :             case 'C':
     500           0 :                 if (MAKESOFT(Next_Letter))
     501             :                 {               /* C[IEY] */
     502           0 :                     if (After_Next_Letter == 'A' &&
     503           0 :                         Next_Letter == 'I')
     504             :                     {           /* CIA */
     505           0 :                         Phonize(SH);
     506             :                     }
     507             :                     /* SC[IEY] */
     508           0 :                     else if (Prev_Letter == 'S')
     509             :                     {
     510             :                         /* Dropped */
     511             :                     }
     512             :                     else
     513           0 :                         Phonize('S');
     514             :                 }
     515           0 :                 else if (Next_Letter == 'H')
     516             :                 {
     517             : #ifndef USE_TRADITIONAL_METAPHONE
     518           0 :                     if (After_Next_Letter == 'R' ||
     519           0 :                         Prev_Letter == 'S')
     520             :                     {           /* Christ, School */
     521           0 :                         Phonize('K');
     522             :                     }
     523             :                     else
     524           0 :                         Phonize(SH);
     525             : #else
     526             :                     Phonize(SH);
     527             : #endif
     528           0 :                     skip_letter++;
     529             :                 }
     530             :                 else
     531           0 :                     Phonize('K');
     532           0 :                 break;
     533             : 
     534             :                 /*
     535             :                  * J if in -DGE-, -DGI- or -DGY- else T
     536             :                  */
     537             :             case 'D':
     538           0 :                 if (Next_Letter == 'G' &&
     539           0 :                     MAKESOFT(After_Next_Letter))
     540             :                 {
     541           0 :                     Phonize('J');
     542           0 :                     skip_letter++;
     543             :                 }
     544             :                 else
     545           0 :                     Phonize('T');
     546           0 :                 break;
     547             : 
     548             :                 /*
     549             :                  * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
     550             :                  * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
     551             :                  * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
     552             :                  * else K
     553             :                  */
     554             :             case 'G':
     555           2 :                 if (Next_Letter == 'H')
     556             :                 {
     557           0 :                     if (!(NOGHTOF(Look_Back_Letter(3)) ||
     558           0 :                           Look_Back_Letter(4) == 'H'))
     559             :                     {
     560           0 :                         Phonize('F');
     561           0 :                         skip_letter++;
     562             :                     }
     563             :                     else
     564             :                     {
     565             :                         /* silent */
     566             :                     }
     567             :                 }
     568           2 :                 else if (Next_Letter == 'N')
     569             :                 {
     570           0 :                     if (Isbreak(After_Next_Letter) ||
     571           0 :                         (After_Next_Letter == 'E' &&
     572           0 :                          Look_Ahead_Letter(3) == 'D'))
     573             :                     {
     574             :                         /* dropped */
     575             :                     }
     576             :                     else
     577           0 :                         Phonize('K');
     578             :                 }
     579           2 :                 else if (MAKESOFT(Next_Letter) &&
     580           0 :                          Prev_Letter != 'G')
     581           0 :                     Phonize('J');
     582             :                 else
     583           2 :                     Phonize('K');
     584           2 :                 break;
     585             :                 /* H if before a vowel and not after C,G,P,S,T */
     586             :             case 'H':
     587           0 :                 if (isvowel(Next_Letter) &&
     588           0 :                     !AFFECTH(Prev_Letter))
     589           0 :                     Phonize('H');
     590           0 :                 break;
     591             : 
     592             :                 /*
     593             :                  * dropped if after C else K
     594             :                  */
     595             :             case 'K':
     596           0 :                 if (Prev_Letter != 'C')
     597           0 :                     Phonize('K');
     598           0 :                 break;
     599             : 
     600             :                 /*
     601             :                  * F if before H else P
     602             :                  */
     603             :             case 'P':
     604           0 :                 if (Next_Letter == 'H')
     605           0 :                     Phonize('F');
     606             :                 else
     607           0 :                     Phonize('P');
     608           0 :                 break;
     609             : 
     610             :                 /*
     611             :                  * K
     612             :                  */
     613             :             case 'Q':
     614           0 :                 Phonize('K');
     615           0 :                 break;
     616             : 
     617             :                 /*
     618             :                  * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
     619             :                  */
     620             :             case 'S':
     621           0 :                 if (Next_Letter == 'I' &&
     622           0 :                     (After_Next_Letter == 'O' ||
     623           0 :                      After_Next_Letter == 'A'))
     624           0 :                     Phonize(SH);
     625           0 :                 else if (Next_Letter == 'H')
     626             :                 {
     627           0 :                     Phonize(SH);
     628           0 :                     skip_letter++;
     629             :                 }
     630             : #ifndef USE_TRADITIONAL_METAPHONE
     631           0 :                 else if (Next_Letter == 'C' &&
     632           0 :                          Look_Ahead_Letter(2) == 'H' &&
     633           0 :                          Look_Ahead_Letter(3) == 'W')
     634             :                 {
     635           0 :                     Phonize(SH);
     636           0 :                     skip_letter += 2;
     637             :                 }
     638             : #endif
     639             :                 else
     640           0 :                     Phonize('S');
     641           0 :                 break;
     642             : 
     643             :                 /*
     644             :                  * 'sh' in -TIA- or -TIO- else 'th' before H else T
     645             :                  */
     646             :             case 'T':
     647           0 :                 if (Next_Letter == 'I' &&
     648           0 :                     (After_Next_Letter == 'O' ||
     649           0 :                      After_Next_Letter == 'A'))
     650           0 :                     Phonize(SH);
     651           0 :                 else if (Next_Letter == 'H')
     652             :                 {
     653           0 :                     Phonize(TH);
     654           0 :                     skip_letter++;
     655             :                 }
     656             :                 else
     657           0 :                     Phonize('T');
     658           0 :                 break;
     659             :                 /* F */
     660             :             case 'V':
     661           0 :                 Phonize('F');
     662           0 :                 break;
     663             :                 /* W before a vowel, else dropped */
     664             :             case 'W':
     665           0 :                 if (isvowel(Next_Letter))
     666           0 :                     Phonize('W');
     667           0 :                 break;
     668             :                 /* KS */
     669             :             case 'X':
     670           0 :                 Phonize('K');
     671           0 :                 if (max_phonemes == 0 || Phone_Len < max_phonemes)
     672           0 :                     Phonize('S');
     673           0 :                 break;
     674             :                 /* Y if followed by a vowel */
     675             :             case 'Y':
     676           0 :                 if (isvowel(Next_Letter))
     677           0 :                     Phonize('Y');
     678           0 :                 break;
     679             :                 /* S */
     680             :             case 'Z':
     681           0 :                 Phonize('S');
     682           0 :                 break;
     683             :                 /* No transformation */
     684             :             case 'F':
     685             :             case 'J':
     686             :             case 'L':
     687             :             case 'M':
     688             :             case 'N':
     689             :             case 'R':
     690           2 :                 Phonize(Curr_Letter);
     691           2 :                 break;
     692             :             default:
     693             :                 /* nothing */
     694           4 :                 break;
     695             :         }                       /* END SWITCH */
     696             : 
     697          10 :         w_idx += skip_letter;
     698             :     }                           /* END FOR */
     699             : 
     700           2 :     End_Phoned_Word;
     701             : 
     702           2 :     return;
     703             : }                               /* END metaphone */
     704             : 
     705             : 
     706             : /*
     707             :  * SQL function: soundex(text) returns text
     708             :  */
     709           6 : PG_FUNCTION_INFO_V1(soundex);
     710             : 
     711             : Datum
     712          14 : soundex(PG_FUNCTION_ARGS)
     713             : {
     714             :     char        outstr[SOUNDEX_LEN + 1];
     715             :     char       *arg;
     716             : 
     717          14 :     arg = text_to_cstring(PG_GETARG_TEXT_PP(0));
     718             : 
     719          14 :     _soundex(arg, outstr);
     720             : 
     721          14 :     PG_RETURN_TEXT_P(cstring_to_text(outstr));
     722             : }
     723             : 
     724             : static void
     725          26 : _soundex(const char *instr, char *outstr)
     726             : {
     727             :     int         count;
     728             : 
     729             :     AssertArg(instr);
     730             :     AssertArg(outstr);
     731             : 
     732          26 :     outstr[SOUNDEX_LEN] = '\0';
     733             : 
     734             :     /* Skip leading non-alphabetic characters */
     735          52 :     while (!isalpha((unsigned char) instr[0]) && instr[0])
     736           0 :         ++instr;
     737             : 
     738             :     /* No string left */
     739          26 :     if (!instr[0])
     740             :     {
     741           0 :         outstr[0] = (char) 0;
     742           0 :         return;
     743             :     }
     744             : 
     745             :     /* Take the first letter as is */
     746          26 :     *outstr++ = (char) toupper((unsigned char) *instr++);
     747             : 
     748          26 :     count = 1;
     749         146 :     while (*instr && count < SOUNDEX_LEN)
     750             :     {
     751         186 :         if (isalpha((unsigned char) *instr) &&
     752          92 :             soundex_code(*instr) != soundex_code(*(instr - 1)))
     753             :         {
     754          70 :             *outstr = soundex_code(instr[0]);
     755          70 :             if (*outstr != '0')
     756             :             {
     757          46 :                 ++outstr;
     758          46 :                 ++count;
     759             :             }
     760             :         }
     761          94 :         ++instr;
     762             :     }
     763             : 
     764             :     /* Fill with 0's */
     765          84 :     while (count < SOUNDEX_LEN)
     766             :     {
     767          32 :         *outstr = '0';
     768          32 :         ++outstr;
     769          32 :         ++count;
     770             :     }
     771             : }
     772             : 
     773           4 : PG_FUNCTION_INFO_V1(difference);
     774             : 
     775             : Datum
     776           6 : difference(PG_FUNCTION_ARGS)
     777             : {
     778             :     char        sndx1[SOUNDEX_LEN + 1],
     779             :                 sndx2[SOUNDEX_LEN + 1];
     780             :     int         i,
     781             :                 result;
     782             : 
     783           6 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
     784           6 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
     785             : 
     786           6 :     result = 0;
     787          30 :     for (i = 0; i < SOUNDEX_LEN; i++)
     788             :     {
     789          24 :         if (sndx1[i] == sndx2[i])
     790          12 :             result++;
     791             :     }
     792             : 
     793           6 :     PG_RETURN_INT32(result);
     794             : }

Generated by: LCOV version 1.13