LCOV - code coverage report
Current view: top level - contrib/fuzzystrmatch - daitch_mokotoff.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 155 162 95.7 %
Date: 2024-04-23 09:11:01 Functions: 12 12 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Daitch-Mokotoff Soundex
       3             :  *
       4             :  * Copyright (c) 2023-2024, PostgreSQL Global Development Group
       5             :  *
       6             :  * This module was originally sponsored by Finance Norway /
       7             :  * Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
       8             :  *
       9             :  * The implementation of the Daitch-Mokotoff Soundex System aims at correctness
      10             :  * and high performance, and can be summarized as follows:
      11             :  *
      12             :  * - The processing of each phoneme is initiated by an O(1) table lookup.
      13             :  * - For phonemes containing more than one character, a coding tree is traversed
      14             :  *   to process the complete phoneme.
      15             :  * - The (alternate) soundex codes are produced digit by digit in-place in
      16             :  *   another tree structure.
      17             :  *
      18             :  * References:
      19             :  *
      20             :  * https://www.avotaynu.com/soundex.htm
      21             :  * https://www.jewishgen.org/InfoFiles/Soundex.html
      22             :  * https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex
      23             :  * https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php)
      24             :  * https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java)
      25             :  * https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm)
      26             :  *
      27             :  * A few notes on other implementations:
      28             :  *
      29             :  * - All other known implementations have the same unofficial rules for "UE",
      30             :  *   these are also adapted by this implementation (0, 1, NC).
      31             :  * - The only other known implementation which is capable of generating all
      32             :  *   correct soundex codes in all cases is the JOS Soundex Calculator at
      33             :  *   https://www.jewishgen.org/jos/jossound.htm
      34             :  * - "J" is considered (only) a vowel in dmlat.php
      35             :  * - The official rules for "RS" are commented out in dmlat.php
      36             :  * - Identical code digits for adjacent letters are not collapsed correctly in
      37             :  *   dmsoundex.php when double digit codes are involved. E.g. "BESST" yields
      38             :  *   744300 instead of 743000 as for "BEST".
      39             :  * - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java
      40             :  * - "Y" is not considered a vowel in DaitchMokotoffSoundex.java
      41             : */
      42             : 
      43             : #include "postgres.h"
      44             : 
      45             : #include "catalog/pg_type.h"
      46             : #include "mb/pg_wchar.h"
      47             : #include "utils/array.h"
      48             : #include "utils/builtins.h"
      49             : #include "utils/memutils.h"
      50             : 
      51             : 
      52             : /*
      53             :  * The soundex coding chart table is adapted from
      54             :  * https://www.jewishgen.org/InfoFiles/Soundex.html
      55             :  * See daitch_mokotoff_header.pl for details.
      56             : */
      57             : 
      58             : /* Generated coding chart table */
      59             : #include "daitch_mokotoff.h"
      60             : 
      61             : #define DM_CODE_DIGITS 6
      62             : 
      63             : /* Node in soundex code tree */
      64             : typedef struct dm_node
      65             : {
      66             :     int         soundex_length; /* Length of generated soundex code */
      67             :     char        soundex[DM_CODE_DIGITS];    /* Soundex code */
      68             :     int         is_leaf;        /* Candidate for complete soundex code */
      69             :     int         last_update;    /* Letter number for last update of node */
      70             :     char        code_digit;     /* Last code digit, 0 - 9 */
      71             : 
      72             :     /*
      73             :      * One or two alternate code digits leading to this node. If there are two
      74             :      * digits, one of them is always an 'X'. Repeated code digits and 'X' lead
      75             :      * back to the same node.
      76             :      */
      77             :     char        prev_code_digits[2];
      78             :     /* One or two alternate code digits moving forward. */
      79             :     char        next_code_digits[2];
      80             :     /* ORed together code index(es) used to reach current node. */
      81             :     int         prev_code_index;
      82             :     int         next_code_index;
      83             :     /* Possible nodes branching out from this node - digits 0-9. */
      84             :     struct dm_node *children[10];
      85             :     /* Next node in linked list. Alternating index for each iteration. */
      86             :     struct dm_node *next[2];
      87             : } dm_node;
      88             : 
      89             : /* Template for new node in soundex code tree. */
      90             : static const dm_node start_node = {
      91             :     .soundex_length = 0,
      92             :     .soundex = "000000",      /* Six digits */
      93             :     .is_leaf = 0,
      94             :     .last_update = 0,
      95             :     .code_digit = '\0',
      96             :     .prev_code_digits = {'\0', '\0'},
      97             :     .next_code_digits = {'\0', '\0'},
      98             :     .prev_code_index = 0,
      99             :     .next_code_index = 0,
     100             :     .children = {NULL},
     101             :     .next = {NULL}
     102             : };
     103             : 
     104             : /* Dummy soundex codes at end of input. */
     105             : static const dm_codes end_codes[2] =
     106             : {
     107             :     {
     108             :         "X", "X", "X"
     109             :     }
     110             : };
     111             : 
     112             : /* Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF. */
     113             : static const char iso8859_1_to_ascii_upper[] =
     114             : "`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~                                  !                             ?AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDSAAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY";
     115             : 
     116             : /* Internal C implementation */
     117             : static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex);
     118             : 
     119             : 
     120           6 : PG_FUNCTION_INFO_V1(daitch_mokotoff);
     121             : 
     122             : Datum
     123          70 : daitch_mokotoff(PG_FUNCTION_ARGS)
     124             : {
     125          70 :     text       *arg = PG_GETARG_TEXT_PP(0);
     126             :     Datum       retval;
     127             :     char       *string;
     128             :     ArrayBuildState *soundex;
     129             :     MemoryContext old_ctx,
     130             :                 tmp_ctx;
     131             : 
     132             :     /* Work in a temporary context to simplify cleanup. */
     133          70 :     tmp_ctx = AllocSetContextCreate(CurrentMemoryContext,
     134             :                                     "daitch_mokotoff temporary context",
     135             :                                     ALLOCSET_DEFAULT_SIZES);
     136          70 :     old_ctx = MemoryContextSwitchTo(tmp_ctx);
     137             : 
     138             :     /* We must convert the string to UTF-8 if it isn't already. */
     139          70 :     string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg),
     140             :                               PG_UTF8);
     141             : 
     142             :     /* The result is built in this ArrayBuildState. */
     143          70 :     soundex = initArrayResult(TEXTOID, tmp_ctx, false);
     144             : 
     145          70 :     if (!daitch_mokotoff_coding(string, soundex))
     146             :     {
     147             :         /* No encodable characters in input */
     148           0 :         MemoryContextSwitchTo(old_ctx);
     149           0 :         MemoryContextDelete(tmp_ctx);
     150           0 :         PG_RETURN_NULL();
     151             :     }
     152             : 
     153          70 :     retval = makeArrayResult(soundex, old_ctx);
     154             : 
     155          70 :     MemoryContextSwitchTo(old_ctx);
     156          70 :     MemoryContextDelete(tmp_ctx);
     157             : 
     158          70 :     PG_RETURN_DATUM(retval);
     159             : }
     160             : 
     161             : 
     162             : /* Initialize soundex code tree node for next code digit. */
     163             : static void
     164        1328 : initialize_node(dm_node *node, int last_update)
     165             : {
     166        1328 :     if (node->last_update < last_update)
     167             :     {
     168         926 :         node->prev_code_digits[0] = node->next_code_digits[0];
     169         926 :         node->prev_code_digits[1] = node->next_code_digits[1];
     170         926 :         node->next_code_digits[0] = '\0';
     171         926 :         node->next_code_digits[1] = '\0';
     172         926 :         node->prev_code_index = node->next_code_index;
     173         926 :         node->next_code_index = 0;
     174         926 :         node->is_leaf = 0;
     175         926 :         node->last_update = last_update;
     176             :     }
     177        1328 : }
     178             : 
     179             : 
     180             : /* Update soundex code tree node with next code digit. */
     181             : static void
     182         802 : add_next_code_digit(dm_node *node, int code_index, char code_digit)
     183             : {
     184             :     /* OR in index 1 or 2. */
     185         802 :     node->next_code_index |= code_index;
     186             : 
     187         802 :     if (!node->next_code_digits[0])
     188         720 :         node->next_code_digits[0] = code_digit;
     189          82 :     else if (node->next_code_digits[0] != code_digit)
     190           4 :         node->next_code_digits[1] = code_digit;
     191         802 : }
     192             : 
     193             : 
     194             : /* Mark soundex code tree node as leaf. */
     195             : static void
     196         776 : set_leaf(dm_node *first_node[2], dm_node *last_node[2],
     197             :          dm_node *node, int ix_node)
     198             : {
     199         776 :     if (!node->is_leaf)
     200             :     {
     201         698 :         node->is_leaf = 1;
     202             : 
     203         698 :         if (first_node[ix_node] == NULL)
     204         436 :             first_node[ix_node] = node;
     205             :         else
     206         262 :             last_node[ix_node]->next[ix_node] = node;
     207             : 
     208         698 :         last_node[ix_node] = node;
     209         698 :         node->next[ix_node] = NULL;
     210             :     }
     211         776 : }
     212             : 
     213             : 
     214             : /* Find next node corresponding to code digit, or create a new node. */
     215             : static dm_node *
     216         512 : find_or_create_child_node(dm_node *parent, char code_digit,
     217             :                           ArrayBuildState *soundex)
     218             : {
     219         512 :     int         i = code_digit - '0';
     220         512 :     dm_node   **nodes = parent->children;
     221         512 :     dm_node    *node = nodes[i];
     222             : 
     223         512 :     if (node)
     224             :     {
     225             :         /* Found existing child node. Skip completed nodes. */
     226          42 :         return node->soundex_length < DM_CODE_DIGITS ? node : NULL;
     227             :     }
     228             : 
     229             :     /* Create new child node. */
     230         470 :     node = palloc_object(dm_node);
     231         470 :     nodes[i] = node;
     232             : 
     233         470 :     *node = start_node;
     234         470 :     memcpy(node->soundex, parent->soundex, sizeof(parent->soundex));
     235         470 :     node->soundex_length = parent->soundex_length;
     236         470 :     node->soundex[node->soundex_length++] = code_digit;
     237         470 :     node->code_digit = code_digit;
     238         470 :     node->next_code_index = node->prev_code_index;
     239             : 
     240         470 :     if (node->soundex_length < DM_CODE_DIGITS)
     241             :     {
     242         438 :         return node;
     243             :     }
     244             :     else
     245             :     {
     246             :         /* Append completed soundex code to output array. */
     247          32 :         text       *out = cstring_to_text_with_len(node->soundex,
     248             :                                                    DM_CODE_DIGITS);
     249             : 
     250          32 :         accumArrayResult(soundex,
     251             :                          PointerGetDatum(out),
     252             :                          false,
     253             :                          TEXTOID,
     254             :                          CurrentMemoryContext);
     255          32 :         return NULL;
     256             :     }
     257             : }
     258             : 
     259             : 
     260             : /* Update node for next code digit(s). */
     261             : static void
     262         848 : update_node(dm_node *first_node[2], dm_node *last_node[2],
     263             :             dm_node *node, int ix_node,
     264             :             int letter_no, int prev_code_index, int next_code_index,
     265             :             const char *next_code_digits, int digit_no,
     266             :             ArrayBuildState *soundex)
     267             : {
     268             :     int         i;
     269         848 :     char        next_code_digit = next_code_digits[digit_no];
     270         848 :     int         num_dirty_nodes = 0;
     271             :     dm_node    *dirty_nodes[2];
     272             : 
     273         848 :     initialize_node(node, letter_no);
     274             : 
     275         848 :     if (node->prev_code_index && !(node->prev_code_index & prev_code_index))
     276             :     {
     277             :         /*
     278             :          * If the sound (vowel / consonant) of this letter encoding doesn't
     279             :          * correspond to the coding index of the previous letter, we skip this
     280             :          * letter encoding. Note that currently, only "J" can be either a
     281             :          * vowel or a consonant.
     282             :          */
     283          16 :         return;
     284             :     }
     285             : 
     286         832 :     if (next_code_digit == 'X' ||
     287         518 :         (digit_no == 0 &&
     288         518 :          (node->prev_code_digits[0] == next_code_digit ||
     289         486 :           node->prev_code_digits[1] == next_code_digit)))
     290             :     {
     291             :         /* The code digit is the same as one of the previous (i.e. not added). */
     292         322 :         dirty_nodes[num_dirty_nodes++] = node;
     293             :     }
     294             : 
     295         832 :     if (next_code_digit != 'X' &&
     296         518 :         (digit_no > 0 ||
     297         518 :          node->prev_code_digits[0] != next_code_digit ||
     298          32 :          node->prev_code_digits[1]))
     299             :     {
     300             :         /* The code digit is different from one of the previous (i.e. added). */
     301         512 :         node = find_or_create_child_node(node, next_code_digit, soundex);
     302         512 :         if (node)
     303             :         {
     304         480 :             initialize_node(node, letter_no);
     305         480 :             dirty_nodes[num_dirty_nodes++] = node;
     306             :         }
     307             :     }
     308             : 
     309        1634 :     for (i = 0; i < num_dirty_nodes; i++)
     310             :     {
     311             :         /* Add code digit leading to the current node. */
     312         802 :         add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit);
     313             : 
     314         802 :         if (next_code_digits[++digit_no])
     315             :         {
     316          26 :             update_node(first_node, last_node, dirty_nodes[i], ix_node,
     317             :                         letter_no, prev_code_index, next_code_index,
     318             :                         next_code_digits, digit_no,
     319             :                         soundex);
     320             :         }
     321             :         else
     322             :         {
     323             :             /* Add incomplete leaf node to linked list. */
     324         776 :             set_leaf(first_node, last_node, dirty_nodes[i], ix_node);
     325             :         }
     326             :     }
     327             : }
     328             : 
     329             : 
     330             : /* Update soundex tree leaf nodes. */
     331             : static void
     332         446 : update_leaves(dm_node *first_node[2], int *ix_node, int letter_no,
     333             :               const dm_codes *codes, const dm_codes *next_codes,
     334             :               ArrayBuildState *soundex)
     335             : {
     336             :     int         i,
     337             :                 j,
     338             :                 code_index;
     339             :     dm_node    *node,
     340             :                *last_node[2];
     341             :     const dm_code *code,
     342             :                *next_code;
     343         446 :     int         ix_node_next = (*ix_node + 1) & 1;  /* Alternating index: 0, 1 */
     344             : 
     345             :     /* Initialize for new linked list of leaves. */
     346         446 :     first_node[ix_node_next] = NULL;
     347         446 :     last_node[ix_node_next] = NULL;
     348             : 
     349             :     /* Process all nodes. */
     350        1086 :     for (node = first_node[*ix_node]; node; node = node->next[*ix_node])
     351             :     {
     352             :         /* One or two alternate code sequences. */
     353        1380 :         for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++)
     354             :         {
     355             :             /* Coding for previous letter - before vowel: 1, all other: 2 */
     356         740 :             int         prev_code_index = (code[0][0] > '1') + 1;
     357             : 
     358             :             /* One or two alternate next code sequences. */
     359        1562 :             for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++)
     360             :             {
     361             :                 /* Determine which code to use. */
     362         822 :                 if (letter_no == 0)
     363             :                 {
     364             :                     /* This is the first letter. */
     365          96 :                     code_index = 0;
     366             :                 }
     367         726 :                 else if (next_code[0][0] <= '1')
     368             :                 {
     369             :                     /* The next letter is a vowel. */
     370         194 :                     code_index = 1;
     371             :                 }
     372             :                 else
     373             :                 {
     374             :                     /* All other cases. */
     375         532 :                     code_index = 2;
     376             :                 }
     377             : 
     378             :                 /* One or two sequential code digits. */
     379         822 :                 update_node(first_node, last_node, node, ix_node_next,
     380             :                             letter_no, prev_code_index, code_index,
     381         822 :                             code[code_index], 0,
     382             :                             soundex);
     383             :             }
     384             :         }
     385             :     }
     386             : 
     387         446 :     *ix_node = ix_node_next;
     388         446 : }
     389             : 
     390             : 
     391             : /*
     392             :  * Return next character, converted from UTF-8 to uppercase ASCII.
     393             :  * *ix is the current string index and is incremented by the character length.
     394             :  */
     395             : static char
     396         906 : read_char(const unsigned char *str, int *ix)
     397             : {
     398             :     /* Substitute character for skipped code points. */
     399         906 :     const char  na = '\x1a';
     400             :     pg_wchar    c;
     401             : 
     402             :     /* Decode UTF-8 character to ISO 10646 code point. */
     403         906 :     str += *ix;
     404         906 :     c = utf8_to_unicode(str);
     405             : 
     406             :     /* Advance *ix, but (for safety) not if we've reached end of string. */
     407         906 :     if (c)
     408         788 :         *ix += pg_utf_mblen(str);
     409             : 
     410             :     /* Convert. */
     411         906 :     if (c >= (unsigned char) '[' && c <= (unsigned char) ']')
     412             :     {
     413             :         /* ASCII characters [, \, and ] are reserved for conversions below. */
     414           0 :         return na;
     415             :     }
     416         906 :     else if (c < 0x60)
     417             :     {
     418             :         /* Other non-lowercase ASCII characters can be used as-is. */
     419         308 :         return (char) c;
     420             :     }
     421         598 :     else if (c < 0x100)
     422             :     {
     423             :         /* ISO-8859-1 code point; convert to upper-case ASCII via table. */
     424         590 :         return iso8859_1_to_ascii_upper[c - 0x60];
     425             :     }
     426             :     else
     427             :     {
     428             :         /* Conversion of non-ASCII characters in the coding chart. */
     429           8 :         switch (c)
     430             :         {
     431           2 :             case 0x0104:        /* LATIN CAPITAL LETTER A WITH OGONEK */
     432             :             case 0x0105:        /* LATIN SMALL LETTER A WITH OGONEK */
     433           2 :                 return '[';
     434           2 :             case 0x0118:        /* LATIN CAPITAL LETTER E WITH OGONEK */
     435             :             case 0x0119:        /* LATIN SMALL LETTER E WITH OGONEK */
     436           2 :                 return '\\';
     437           4 :             case 0x0162:        /* LATIN CAPITAL LETTER T WITH CEDILLA */
     438             :             case 0x0163:        /* LATIN SMALL LETTER T WITH CEDILLA */
     439             :             case 0x021A:        /* LATIN CAPITAL LETTER T WITH COMMA BELOW */
     440             :             case 0x021B:        /* LATIN SMALL LETTER T WITH COMMA BELOW */
     441           4 :                 return ']';
     442           0 :             default:
     443           0 :                 return na;
     444             :         }
     445             :     }
     446             : }
     447             : 
     448             : 
     449             : /* Read next ASCII character, skipping any characters not in [A-\]]. */
     450             : static char
     451         898 : read_valid_char(const char *str, int *ix)
     452             : {
     453             :     char        c;
     454             : 
     455         906 :     while ((c = read_char((const unsigned char *) str, ix)) != '\0')
     456             :     {
     457         788 :         if (c >= 'A' && c <= ']')
     458         780 :             break;
     459             :     }
     460             : 
     461         898 :     return c;
     462             : }
     463             : 
     464             : 
     465             : /* Return sound coding for "letter" (letter sequence) */
     466             : static const dm_codes *
     467         516 : read_letter(const char *str, int *ix)
     468             : {
     469             :     char        c,
     470             :                 cmp;
     471             :     int         i,
     472             :                 j;
     473             :     const dm_letter *letters;
     474             :     const dm_codes *codes;
     475             : 
     476             :     /* First letter in sequence. */
     477         516 :     if ((c = read_valid_char(str, ix)) == '\0')
     478          70 :         return NULL;
     479             : 
     480         446 :     letters = &letter_[c - 'A'];
     481         446 :     codes = letters->codes;
     482         446 :     i = *ix;
     483             : 
     484             :     /* Any subsequent letters in sequence. */
     485         530 :     while ((letters = letters->letters) && (c = read_valid_char(str, &i)))
     486             :     {
     487        1216 :         for (j = 0; (cmp = letters[j].letter); j++)
     488             :         {
     489         966 :             if (cmp == c)
     490             :             {
     491             :                 /* Letter found. */
     492          84 :                 letters = &letters[j];
     493          84 :                 if (letters->codes)
     494             :                 {
     495             :                     /* Coding for letter sequence found. */
     496          80 :                     codes = letters->codes;
     497          80 :                     *ix = i;
     498             :                 }
     499          84 :                 break;
     500             :             }
     501             :         }
     502         334 :         if (!cmp)
     503             :         {
     504             :             /* The sequence of letters has no coding. */
     505         250 :             break;
     506             :         }
     507             :     }
     508             : 
     509         446 :     return codes;
     510             : }
     511             : 
     512             : 
     513             : /*
     514             :  * Generate all Daitch-Mokotoff soundex codes for word,
     515             :  * adding them to the "soundex" ArrayBuildState.
     516             :  * Returns false if string has no encodable characters, else true.
     517             :  */
     518             : static bool
     519          70 : daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex)
     520             : {
     521          70 :     int         i = 0;
     522          70 :     int         letter_no = 0;
     523          70 :     int         ix_node = 0;
     524             :     const dm_codes *codes,
     525             :                *next_codes;
     526             :     dm_node    *first_node[2],
     527             :                *node;
     528             : 
     529             :     /* First letter. */
     530          70 :     if (!(codes = read_letter(word, &i)))
     531             :     {
     532             :         /* No encodable character in input. */
     533           0 :         return false;
     534             :     }
     535             : 
     536             :     /* Starting point. */
     537          70 :     first_node[ix_node] = palloc_object(dm_node);
     538          70 :     *first_node[ix_node] = start_node;
     539             : 
     540             :     /*
     541             :      * Loop until either the word input is exhausted, or all generated soundex
     542             :      * codes are completed to six digits.
     543             :      */
     544         516 :     while (codes && first_node[ix_node])
     545             :     {
     546         446 :         next_codes = read_letter(word, &i);
     547             : 
     548             :         /* Update leaf nodes. */
     549         446 :         update_leaves(first_node, &ix_node, letter_no,
     550             :                       codes, next_codes ? next_codes : end_codes,
     551             :                       soundex);
     552             : 
     553         446 :         codes = next_codes;
     554         446 :         letter_no++;
     555             :     }
     556             : 
     557             :     /* Append all remaining (incomplete) soundex codes to output array. */
     558         198 :     for (node = first_node[ix_node]; node; node = node->next[ix_node])
     559             :     {
     560         128 :         text       *out = cstring_to_text_with_len(node->soundex,
     561             :                                                    DM_CODE_DIGITS);
     562             : 
     563         128 :         accumArrayResult(soundex,
     564             :                          PointerGetDatum(out),
     565             :                          false,
     566             :                          TEXTOID,
     567             :                          CurrentMemoryContext);
     568             :     }
     569             : 
     570          70 :     return true;
     571             : }

Generated by: LCOV version 1.14