LCOV - code coverage report
Current view: top level - contrib/fuzzystrmatch - daitch_mokotoff.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 95.7 % 162 155
Test Date: 2026-03-03 14:15:12 Functions: 100.0 % 12 12
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*
       2              :  * Daitch-Mokotoff Soundex
       3              :  *
       4              :  * Copyright (c) 2023-2026, PostgreSQL Global Development Group
       5              :  *
       6              :  * This module was originally sponsored by Finance Norway /
       7              :  * Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
       8              :  *
       9              :  * The implementation of the Daitch-Mokotoff Soundex System aims at correctness
      10              :  * and high performance, and can be summarized as follows:
      11              :  *
      12              :  * - The processing of each phoneme is initiated by an O(1) table lookup.
      13              :  * - For phonemes containing more than one character, a coding tree is traversed
      14              :  *   to process the complete phoneme.
      15              :  * - The (alternate) soundex codes are produced digit by digit in-place in
      16              :  *   another tree structure.
      17              :  *
      18              :  * References:
      19              :  *
      20              :  * https://www.avotaynu.com/soundex.htm
      21              :  * https://www.jewishgen.org/InfoFiles/Soundex.html
      22              :  * https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex
      23              :  * https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php)
      24              :  * https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java)
      25              :  * https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm)
      26              :  *
      27              :  * A few notes on other implementations:
      28              :  *
      29              :  * - All other known implementations have the same unofficial rules for "UE",
      30              :  *   these are also adapted by this implementation (0, 1, NC).
      31              :  * - The only other known implementation which is capable of generating all
      32              :  *   correct soundex codes in all cases is the JOS Soundex Calculator at
      33              :  *   https://www.jewishgen.org/jos/jossound.htm
      34              :  * - "J" is considered (only) a vowel in dmlat.php
      35              :  * - The official rules for "RS" are commented out in dmlat.php
      36              :  * - Identical code digits for adjacent letters are not collapsed correctly in
      37              :  *   dmsoundex.php when double digit codes are involved. E.g. "BESST" yields
      38              :  *   744300 instead of 743000 as for "BEST".
      39              :  * - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java
      40              :  * - "Y" is not considered a vowel in DaitchMokotoffSoundex.java
      41              : */
      42              : 
      43              : #include "postgres.h"
      44              : 
      45              : #include "catalog/pg_type.h"
      46              : #include "mb/pg_wchar.h"
      47              : #include "utils/array.h"
      48              : #include "utils/builtins.h"
      49              : #include "utils/memutils.h"
      50              : 
      51              : 
      52              : /*
      53              :  * The soundex coding chart table is adapted from
      54              :  * https://www.jewishgen.org/InfoFiles/Soundex.html
      55              :  * See daitch_mokotoff_header.pl for details.
      56              : */
      57              : 
      58              : /* Generated coding chart table */
      59              : #include "daitch_mokotoff.h"
      60              : 
      61              : #define DM_CODE_DIGITS 6
      62              : 
      63              : /* Node in soundex code tree */
      64              : typedef struct dm_node
      65              : {
      66              :     int         soundex_length; /* Length of generated soundex code */
      67              :     char        soundex[DM_CODE_DIGITS];    /* Soundex code */
      68              :     int         is_leaf;        /* Candidate for complete soundex code */
      69              :     int         last_update;    /* Letter number for last update of node */
      70              :     char        code_digit;     /* Last code digit, 0 - 9 */
      71              : 
      72              :     /*
      73              :      * One or two alternate code digits leading to this node. If there are two
      74              :      * digits, one of them is always an 'X'. Repeated code digits and 'X' lead
      75              :      * back to the same node.
      76              :      */
      77              :     char        prev_code_digits[2];
      78              :     /* One or two alternate code digits moving forward. */
      79              :     char        next_code_digits[2];
      80              :     /* ORed together code index(es) used to reach current node. */
      81              :     int         prev_code_index;
      82              :     int         next_code_index;
      83              :     /* Possible nodes branching out from this node - digits 0-9. */
      84              :     struct dm_node *children[10];
      85              :     /* Next node in linked list. Alternating index for each iteration. */
      86              :     struct dm_node *next[2];
      87              : } dm_node;
      88              : 
      89              : /* Template for new node in soundex code tree. */
      90              : static const dm_node start_node = {
      91              :     .soundex_length = 0,
      92              :     .soundex = "000000",      /* Six digits */
      93              :     .is_leaf = 0,
      94              :     .last_update = 0,
      95              :     .code_digit = '\0',
      96              :     .prev_code_digits = {'\0', '\0'},
      97              :     .next_code_digits = {'\0', '\0'},
      98              :     .prev_code_index = 0,
      99              :     .next_code_index = 0,
     100              :     .children = {NULL},
     101              :     .next = {NULL}
     102              : };
     103              : 
     104              : /* Dummy soundex codes at end of input. */
     105              : static const dm_codes end_codes[2] =
     106              : {
     107              :     {
     108              :         "X", "X", "X"
     109              :     }
     110              : };
     111              : 
     112              : /* Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF. */
     113              : static const char iso8859_1_to_ascii_upper[] =
     114              : "`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~                                  !                             ?AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDSAAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY";
     115              : 
     116              : /* Internal C implementation */
     117              : static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex);
     118              : 
     119              : 
     120            3 : PG_FUNCTION_INFO_V1(daitch_mokotoff);
     121              : 
     122              : Datum
     123           35 : daitch_mokotoff(PG_FUNCTION_ARGS)
     124              : {
     125           35 :     text       *arg = PG_GETARG_TEXT_PP(0);
     126              :     Datum       retval;
     127              :     char       *string;
     128              :     ArrayBuildState *soundex;
     129              :     MemoryContext old_ctx,
     130              :                 tmp_ctx;
     131              : 
     132              :     /* Work in a temporary context to simplify cleanup. */
     133           35 :     tmp_ctx = AllocSetContextCreate(CurrentMemoryContext,
     134              :                                     "daitch_mokotoff temporary context",
     135              :                                     ALLOCSET_DEFAULT_SIZES);
     136           35 :     old_ctx = MemoryContextSwitchTo(tmp_ctx);
     137              : 
     138              :     /* We must convert the string to UTF-8 if it isn't already. */
     139           35 :     string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg),
     140              :                               PG_UTF8);
     141              : 
     142              :     /* The result is built in this ArrayBuildState. */
     143           35 :     soundex = initArrayResult(TEXTOID, tmp_ctx, false);
     144              : 
     145           35 :     if (!daitch_mokotoff_coding(string, soundex))
     146              :     {
     147              :         /* No encodable characters in input */
     148            0 :         MemoryContextSwitchTo(old_ctx);
     149            0 :         MemoryContextDelete(tmp_ctx);
     150            0 :         PG_RETURN_NULL();
     151              :     }
     152              : 
     153           35 :     retval = makeArrayResult(soundex, old_ctx);
     154              : 
     155           35 :     MemoryContextSwitchTo(old_ctx);
     156           35 :     MemoryContextDelete(tmp_ctx);
     157              : 
     158           35 :     PG_RETURN_DATUM(retval);
     159              : }
     160              : 
     161              : 
     162              : /* Initialize soundex code tree node for next code digit. */
     163              : static void
     164          664 : initialize_node(dm_node *node, int last_update)
     165              : {
     166          664 :     if (node->last_update < last_update)
     167              :     {
     168          463 :         node->prev_code_digits[0] = node->next_code_digits[0];
     169          463 :         node->prev_code_digits[1] = node->next_code_digits[1];
     170          463 :         node->next_code_digits[0] = '\0';
     171          463 :         node->next_code_digits[1] = '\0';
     172          463 :         node->prev_code_index = node->next_code_index;
     173          463 :         node->next_code_index = 0;
     174          463 :         node->is_leaf = 0;
     175          463 :         node->last_update = last_update;
     176              :     }
     177          664 : }
     178              : 
     179              : 
     180              : /* Update soundex code tree node with next code digit. */
     181              : static void
     182          401 : add_next_code_digit(dm_node *node, int code_index, char code_digit)
     183              : {
     184              :     /* OR in index 1 or 2. */
     185          401 :     node->next_code_index |= code_index;
     186              : 
     187          401 :     if (!node->next_code_digits[0])
     188          360 :         node->next_code_digits[0] = code_digit;
     189           41 :     else if (node->next_code_digits[0] != code_digit)
     190            2 :         node->next_code_digits[1] = code_digit;
     191          401 : }
     192              : 
     193              : 
     194              : /* Mark soundex code tree node as leaf. */
     195              : static void
     196          388 : set_leaf(dm_node *first_node[2], dm_node *last_node[2],
     197              :          dm_node *node, int ix_node)
     198              : {
     199          388 :     if (!node->is_leaf)
     200              :     {
     201          349 :         node->is_leaf = 1;
     202              : 
     203          349 :         if (first_node[ix_node] == NULL)
     204          218 :             first_node[ix_node] = node;
     205              :         else
     206          131 :             last_node[ix_node]->next[ix_node] = node;
     207              : 
     208          349 :         last_node[ix_node] = node;
     209          349 :         node->next[ix_node] = NULL;
     210              :     }
     211          388 : }
     212              : 
     213              : 
     214              : /* Find next node corresponding to code digit, or create a new node. */
     215              : static dm_node *
     216          256 : find_or_create_child_node(dm_node *parent, char code_digit,
     217              :                           ArrayBuildState *soundex)
     218              : {
     219          256 :     int         i = code_digit - '0';
     220          256 :     dm_node   **nodes = parent->children;
     221          256 :     dm_node    *node = nodes[i];
     222              : 
     223          256 :     if (node)
     224              :     {
     225              :         /* Found existing child node. Skip completed nodes. */
     226           21 :         return node->soundex_length < DM_CODE_DIGITS ? node : NULL;
     227              :     }
     228              : 
     229              :     /* Create new child node. */
     230          235 :     node = palloc_object(dm_node);
     231          235 :     nodes[i] = node;
     232              : 
     233          235 :     *node = start_node;
     234          235 :     memcpy(node->soundex, parent->soundex, sizeof(parent->soundex));
     235          235 :     node->soundex_length = parent->soundex_length;
     236          235 :     node->soundex[node->soundex_length++] = code_digit;
     237          235 :     node->code_digit = code_digit;
     238          235 :     node->next_code_index = node->prev_code_index;
     239              : 
     240          235 :     if (node->soundex_length < DM_CODE_DIGITS)
     241              :     {
     242          219 :         return node;
     243              :     }
     244              :     else
     245              :     {
     246              :         /* Append completed soundex code to output array. */
     247           16 :         text       *out = cstring_to_text_with_len(node->soundex,
     248              :                                                    DM_CODE_DIGITS);
     249              : 
     250           16 :         accumArrayResult(soundex,
     251              :                          PointerGetDatum(out),
     252              :                          false,
     253              :                          TEXTOID,
     254              :                          CurrentMemoryContext);
     255           16 :         return NULL;
     256              :     }
     257              : }
     258              : 
     259              : 
     260              : /* Update node for next code digit(s). */
     261              : static void
     262          424 : update_node(dm_node *first_node[2], dm_node *last_node[2],
     263              :             dm_node *node, int ix_node,
     264              :             int letter_no, int prev_code_index, int next_code_index,
     265              :             const char *next_code_digits, int digit_no,
     266              :             ArrayBuildState *soundex)
     267              : {
     268              :     int         i;
     269          424 :     char        next_code_digit = next_code_digits[digit_no];
     270          424 :     int         num_dirty_nodes = 0;
     271              :     dm_node    *dirty_nodes[2];
     272              : 
     273          424 :     initialize_node(node, letter_no);
     274              : 
     275          424 :     if (node->prev_code_index && !(node->prev_code_index & prev_code_index))
     276              :     {
     277              :         /*
     278              :          * If the sound (vowel / consonant) of this letter encoding doesn't
     279              :          * correspond to the coding index of the previous letter, we skip this
     280              :          * letter encoding. Note that currently, only "J" can be either a
     281              :          * vowel or a consonant.
     282              :          */
     283            8 :         return;
     284              :     }
     285              : 
     286          416 :     if (next_code_digit == 'X' ||
     287          259 :         (digit_no == 0 &&
     288          259 :          (node->prev_code_digits[0] == next_code_digit ||
     289          243 :           node->prev_code_digits[1] == next_code_digit)))
     290              :     {
     291              :         /* The code digit is the same as one of the previous (i.e. not added). */
     292          161 :         dirty_nodes[num_dirty_nodes++] = node;
     293              :     }
     294              : 
     295          416 :     if (next_code_digit != 'X' &&
     296          259 :         (digit_no > 0 ||
     297          259 :          node->prev_code_digits[0] != next_code_digit ||
     298           16 :          node->prev_code_digits[1]))
     299              :     {
     300              :         /* The code digit is different from one of the previous (i.e. added). */
     301          256 :         node = find_or_create_child_node(node, next_code_digit, soundex);
     302          256 :         if (node)
     303              :         {
     304          240 :             initialize_node(node, letter_no);
     305          240 :             dirty_nodes[num_dirty_nodes++] = node;
     306              :         }
     307              :     }
     308              : 
     309          817 :     for (i = 0; i < num_dirty_nodes; i++)
     310              :     {
     311              :         /* Add code digit leading to the current node. */
     312          401 :         add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit);
     313              : 
     314          401 :         if (next_code_digits[++digit_no])
     315              :         {
     316           13 :             update_node(first_node, last_node, dirty_nodes[i], ix_node,
     317              :                         letter_no, prev_code_index, next_code_index,
     318              :                         next_code_digits, digit_no,
     319              :                         soundex);
     320              :         }
     321              :         else
     322              :         {
     323              :             /* Add incomplete leaf node to linked list. */
     324          388 :             set_leaf(first_node, last_node, dirty_nodes[i], ix_node);
     325              :         }
     326              :     }
     327              : }
     328              : 
     329              : 
     330              : /* Update soundex tree leaf nodes. */
     331              : static void
     332          223 : update_leaves(dm_node *first_node[2], int *ix_node, int letter_no,
     333              :               const dm_codes *codes, const dm_codes *next_codes,
     334              :               ArrayBuildState *soundex)
     335              : {
     336              :     int         i,
     337              :                 j,
     338              :                 code_index;
     339              :     dm_node    *node,
     340              :                *last_node[2];
     341              :     const dm_code *code,
     342              :                *next_code;
     343          223 :     int         ix_node_next = (*ix_node + 1) & 1;  /* Alternating index: 0, 1 */
     344              : 
     345              :     /* Initialize for new linked list of leaves. */
     346          223 :     first_node[ix_node_next] = NULL;
     347          223 :     last_node[ix_node_next] = NULL;
     348              : 
     349              :     /* Process all nodes. */
     350          543 :     for (node = first_node[*ix_node]; node; node = node->next[*ix_node])
     351              :     {
     352              :         /* One or two alternate code sequences. */
     353          690 :         for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++)
     354              :         {
     355              :             /* Coding for previous letter - before vowel: 1, all other: 2 */
     356          370 :             int         prev_code_index = (code[0][0] > '1') + 1;
     357              : 
     358              :             /* One or two alternate next code sequences. */
     359          781 :             for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++)
     360              :             {
     361              :                 /* Determine which code to use. */
     362          411 :                 if (letter_no == 0)
     363              :                 {
     364              :                     /* This is the first letter. */
     365           48 :                     code_index = 0;
     366              :                 }
     367          363 :                 else if (next_code[0][0] <= '1')
     368              :                 {
     369              :                     /* The next letter is a vowel. */
     370           97 :                     code_index = 1;
     371              :                 }
     372              :                 else
     373              :                 {
     374              :                     /* All other cases. */
     375          266 :                     code_index = 2;
     376              :                 }
     377              : 
     378              :                 /* One or two sequential code digits. */
     379          411 :                 update_node(first_node, last_node, node, ix_node_next,
     380              :                             letter_no, prev_code_index, code_index,
     381          411 :                             code[code_index], 0,
     382              :                             soundex);
     383              :             }
     384              :         }
     385              :     }
     386              : 
     387          223 :     *ix_node = ix_node_next;
     388          223 : }
     389              : 
     390              : 
     391              : /*
     392              :  * Return next character, converted from UTF-8 to uppercase ASCII.
     393              :  * *ix is the current string index and is incremented by the character length.
     394              :  */
     395              : static char
     396          453 : read_char(const unsigned char *str, int *ix)
     397              : {
     398              :     /* Substitute character for skipped code points. */
     399          453 :     const char  na = '\x1a';
     400              :     pg_wchar    c;
     401              : 
     402              :     /* Decode UTF-8 character to ISO 10646 code point. */
     403          453 :     str += *ix;
     404          453 :     c = utf8_to_unicode(str);
     405              : 
     406              :     /* Advance *ix, but (for safety) not if we've reached end of string. */
     407          453 :     if (c)
     408          394 :         *ix += pg_utf_mblen(str);
     409              : 
     410              :     /* Convert. */
     411          453 :     if (c >= (unsigned char) '[' && c <= (unsigned char) ']')
     412              :     {
     413              :         /* ASCII characters [, \, and ] are reserved for conversions below. */
     414            0 :         return na;
     415              :     }
     416          453 :     else if (c < 0x60)
     417              :     {
     418              :         /* Other non-lowercase ASCII characters can be used as-is. */
     419          154 :         return (char) c;
     420              :     }
     421          299 :     else if (c < 0x100)
     422              :     {
     423              :         /* ISO-8859-1 code point; convert to upper-case ASCII via table. */
     424          295 :         return iso8859_1_to_ascii_upper[c - 0x60];
     425              :     }
     426              :     else
     427              :     {
     428              :         /* Conversion of non-ASCII characters in the coding chart. */
     429            4 :         switch (c)
     430              :         {
     431            1 :             case 0x0104:        /* LATIN CAPITAL LETTER A WITH OGONEK */
     432              :             case 0x0105:        /* LATIN SMALL LETTER A WITH OGONEK */
     433            1 :                 return '[';
     434            1 :             case 0x0118:        /* LATIN CAPITAL LETTER E WITH OGONEK */
     435              :             case 0x0119:        /* LATIN SMALL LETTER E WITH OGONEK */
     436            1 :                 return '\\';
     437            2 :             case 0x0162:        /* LATIN CAPITAL LETTER T WITH CEDILLA */
     438              :             case 0x0163:        /* LATIN SMALL LETTER T WITH CEDILLA */
     439              :             case 0x021A:        /* LATIN CAPITAL LETTER T WITH COMMA BELOW */
     440              :             case 0x021B:        /* LATIN SMALL LETTER T WITH COMMA BELOW */
     441            2 :                 return ']';
     442            0 :             default:
     443            0 :                 return na;
     444              :         }
     445              :     }
     446              : }
     447              : 
     448              : 
     449              : /* Read next ASCII character, skipping any characters not in [A-\]]. */
     450              : static char
     451          449 : read_valid_char(const char *str, int *ix)
     452              : {
     453              :     char        c;
     454              : 
     455          453 :     while ((c = read_char((const unsigned char *) str, ix)) != '\0')
     456              :     {
     457          394 :         if (c >= 'A' && c <= ']')
     458          390 :             break;
     459              :     }
     460              : 
     461          449 :     return c;
     462              : }
     463              : 
     464              : 
     465              : /* Return sound coding for "letter" (letter sequence) */
     466              : static const dm_codes *
     467          258 : read_letter(const char *str, int *ix)
     468              : {
     469              :     char        c,
     470              :                 cmp;
     471              :     int         i,
     472              :                 j;
     473              :     const dm_letter *letters;
     474              :     const dm_codes *codes;
     475              : 
     476              :     /* First letter in sequence. */
     477          258 :     if ((c = read_valid_char(str, ix)) == '\0')
     478           35 :         return NULL;
     479              : 
     480          223 :     letters = &letter_[c - 'A'];
     481          223 :     codes = letters->codes;
     482          223 :     i = *ix;
     483              : 
     484              :     /* Any subsequent letters in sequence. */
     485          265 :     while ((letters = letters->letters) && (c = read_valid_char(str, &i)))
     486              :     {
     487          608 :         for (j = 0; (cmp = letters[j].letter); j++)
     488              :         {
     489          483 :             if (cmp == c)
     490              :             {
     491              :                 /* Letter found. */
     492           42 :                 letters = &letters[j];
     493           42 :                 if (letters->codes)
     494              :                 {
     495              :                     /* Coding for letter sequence found. */
     496           40 :                     codes = letters->codes;
     497           40 :                     *ix = i;
     498              :                 }
     499           42 :                 break;
     500              :             }
     501              :         }
     502          167 :         if (!cmp)
     503              :         {
     504              :             /* The sequence of letters has no coding. */
     505          125 :             break;
     506              :         }
     507              :     }
     508              : 
     509          223 :     return codes;
     510              : }
     511              : 
     512              : 
     513              : /*
     514              :  * Generate all Daitch-Mokotoff soundex codes for word,
     515              :  * adding them to the "soundex" ArrayBuildState.
     516              :  * Returns false if string has no encodable characters, else true.
     517              :  */
     518              : static bool
     519           35 : daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex)
     520              : {
     521           35 :     int         i = 0;
     522           35 :     int         letter_no = 0;
     523           35 :     int         ix_node = 0;
     524              :     const dm_codes *codes,
     525              :                *next_codes;
     526              :     dm_node    *first_node[2],
     527              :                *node;
     528              : 
     529              :     /* First letter. */
     530           35 :     if (!(codes = read_letter(word, &i)))
     531              :     {
     532              :         /* No encodable character in input. */
     533            0 :         return false;
     534              :     }
     535              : 
     536              :     /* Starting point. */
     537           35 :     first_node[ix_node] = palloc_object(dm_node);
     538           35 :     *first_node[ix_node] = start_node;
     539              : 
     540              :     /*
     541              :      * Loop until either the word input is exhausted, or all generated soundex
     542              :      * codes are completed to six digits.
     543              :      */
     544          258 :     while (codes && first_node[ix_node])
     545              :     {
     546          223 :         next_codes = read_letter(word, &i);
     547              : 
     548              :         /* Update leaf nodes. */
     549          223 :         update_leaves(first_node, &ix_node, letter_no,
     550              :                       codes, next_codes ? next_codes : end_codes,
     551              :                       soundex);
     552              : 
     553          223 :         codes = next_codes;
     554          223 :         letter_no++;
     555              :     }
     556              : 
     557              :     /* Append all remaining (incomplete) soundex codes to output array. */
     558           99 :     for (node = first_node[ix_node]; node; node = node->next[ix_node])
     559              :     {
     560           64 :         text       *out = cstring_to_text_with_len(node->soundex,
     561              :                                                    DM_CODE_DIGITS);
     562              : 
     563           64 :         accumArrayResult(soundex,
     564              :                          PointerGetDatum(out),
     565              :                          false,
     566              :                          TEXTOID,
     567              :                          CurrentMemoryContext);
     568              :     }
     569              : 
     570           35 :     return true;
     571              : }
        

Generated by: LCOV version 2.0-1