in src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs [92:399]
public virtual string GetMetaphone(string txt)
{
    // Computes the Metaphone code of txt.
    //
    //  * null or empty input yields "".
    //  * A single-character input yields that character upper-cased (no encoding rules applied).
    //  * Anything longer is upper-cased with LOCALE_ENGLISH, has its leading letter pair normalized
    //    (KN/GN/PN, AE, WR, WH, X), and is then encoded letter by letter until either the working
    //    string is exhausted or the code has reached MaxCodeLen characters.
    if (string.IsNullOrEmpty(txt))
    {
        return "";
    }
    // single character is itself
    if (txt.Length == 1)
    {
        return LOCALE_ENGLISH.TextInfo.ToUpper(txt);
    }

    char[] inwd = LOCALE_ENGLISH.TextInfo.ToUpper(txt).ToCharArray();

    StringBuilder local = new StringBuilder(40); // working copy the rules are applied to
    StringBuilder code = new StringBuilder(10); // output

    // Handle the exceptions for the initial two characters. Every exception either copies the
    // whole word or drops its first character, so only that decision is taken in the switch.
    bool dropFirst;
    switch (inwd[0])
    {
        case 'K':
        case 'G':
        case 'P': // KN, GN, PN -> N
            dropFirst = inwd[1] == 'N';
            break;
        case 'A': // AE -> E
            dropFirst = inwd[1] == 'E';
            break;
        case 'W': // WR -> R, WH -> W
            if (inwd[1] == 'R')
            {
                dropFirst = true;
            }
            else if (inwd[1] == 'H')
            {
                // Drop the W and turn the H into W; inwd is not read again after the copy below.
                inwd[1] = 'W';
                dropFirst = true;
            }
            else
            {
                dropFirst = false;
            }
            break;
        case 'X': // initial X becomes S
            inwd[0] = 'S';
            dropFirst = false;
            break;
        default:
            dropFirst = false;
            break;
    }

    if (dropFirst)
    {
        local.Append(inwd, 1, inwd.Length - 1);
    }
    else
    {
        local.Append(inwd);
    }
    // now local has working string with initials fixed

    int wdsz = local.Length;
    int n = 0;

    while (code.Length < this.MaxCodeLen &&
           n < wdsz)
    { // max code size of 4 works well
        char symb = local[n];

        // remove duplicate letters except C
        if (symb != 'C' && IsPreviousChar(local, n, symb))
        {
            n++;
        }
        else
        { // not dup
            switch (symb)
            {
                case 'A':
                case 'E':
                case 'I':
                case 'O':
                case 'U':
                    // only use vowel if leading char
                    if (n == 0)
                    {
                        code.Append(symb);
                    }
                    break;

                case 'B':
                    // B is silent if word ends in MB
                    if (!(IsPreviousChar(local, n, 'M') &&
                          IsLastChar(wdsz, n)))
                    {
                        code.Append(symb);
                    }
                    break;

                case 'C': // lots of C special cases
                    // discard if SCI, SCE or SCY
                    // (FRONTV is declared outside this block; presumably the letters E, I, Y)
                    if (IsPreviousChar(local, n, 'S') &&
                        !IsLastChar(wdsz, n) &&
                        FRONTV.IndexOf(local[n + 1]) >= 0)
                    {
                        break;
                    }
                    if (RegionMatch(local, n, "CIA"))
                    { // "CIA" -> X
                        code.Append('X');
                        break;
                    }
                    if (!IsLastChar(wdsz, n) &&
                        FRONTV.IndexOf(local[n + 1]) >= 0)
                    { // CI,CE,CY -> S
                        code.Append('S');
                        break;
                    }
                    if (IsPreviousChar(local, n, 'S') &&
                        IsNextChar(local, n, 'H'))
                    { // SCH -> SK
                        code.Append('K');
                        break;
                    }
                    if (IsNextChar(local, n, 'H'))
                    { // detect CH
                        // NOTE(review): the comments that used to sit on these two branches read
                        // "CH consonant -> K" and "CHvowel -> X", which is the opposite of what the
                        // IsVowel test selects. The behavior is intentionally left untouched.
                        if (n == 0 &&
                            wdsz >= 3 &&
                            IsVowel(local, 2))
                        { // word-initial CH with IsVowel true for the third character -> K
                            code.Append('K');
                        }
                        else
                        { // every other CH -> X
                            code.Append('X');
                        }
                    }
                    else
                    {
                        code.Append('K');
                    }
                    break;

                case 'D':
                    if (!IsLastChar(wdsz, n + 1) &&
                        IsNextChar(local, n, 'G') &&
                        FRONTV.IndexOf(local[n + 2]) >= 0)
                    { // DGE DGI DGY -> J
                        code.Append('J');
                        // Together with the n++ after the switch this steps past D, G and the
                        // following front vowel.
                        n += 2;
                    }
                    else
                    {
                        code.Append('T');
                    }
                    break;

                case 'G': // GH silent at end or before consonant
                    if (IsLastChar(wdsz, n + 1) &&
                        IsNextChar(local, n, 'H'))
                    {
                        break;
                    }
                    if (!IsLastChar(wdsz, n + 1) &&
                        IsNextChar(local, n, 'H') &&
                        !IsVowel(local, n + 2))
                    {
                        break;
                    }
                    if (n > 0 &&
                        (RegionMatch(local, n, "GN") ||
                         RegionMatch(local, n, "GNED")))
                    {
                        break; // silent G
                    }
                    // A "hard G" flag used to be derived here from IsPreviousChar(local, n, 'G').
                    // That test can never succeed at this point: a G preceded by G is consumed by
                    // the duplicate-letter check at the top of the loop, so the flag was always
                    // false and has been removed.
                    if (!IsLastChar(wdsz, n) &&
                        FRONTV.IndexOf(local[n + 1]) >= 0)
                    {
                        code.Append('J');
                    }
                    else
                    {
                        code.Append('K');
                    }
                    break;

                case 'H':
                    if (IsLastChar(wdsz, n))
                    {
                        break; // terminal H
                    }
                    // H directly after one of the VARSON letters is silent
                    // (VARSON is declared outside this block)
                    if (n > 0 &&
                        VARSON.IndexOf(local[n - 1]) >= 0)
                    {
                        break;
                    }
                    if (IsVowel(local, n + 1))
                    {
                        code.Append('H'); // Hvowel
                    }
                    break;

                case 'F':
                case 'J':
                case 'L':
                case 'M':
                case 'N':
                case 'R':
                    code.Append(symb);
                    break;

                case 'K':
                    // initial K is always kept; elsewhere K is dropped when it follows C
                    if (n == 0 || !IsPreviousChar(local, n, 'C'))
                    {
                        code.Append(symb);
                    }
                    break;

                case 'P':
                    if (IsNextChar(local, n, 'H'))
                    {
                        // PH -> F
                        code.Append('F');
                    }
                    else
                    {
                        code.Append(symb);
                    }
                    break;

                case 'Q':
                    code.Append('K');
                    break;

                case 'S':
                    // SH, SIO, SIA -> X
                    if (RegionMatch(local, n, "SH") ||
                        RegionMatch(local, n, "SIO") ||
                        RegionMatch(local, n, "SIA"))
                    {
                        code.Append('X');
                    }
                    else
                    {
                        code.Append('S');
                    }
                    break;

                case 'T':
                    if (RegionMatch(local, n, "TIA") ||
                        RegionMatch(local, n, "TIO"))
                    { // TIA, TIO -> X
                        code.Append('X');
                        break;
                    }
                    if (RegionMatch(local, n, "TCH"))
                    {
                        // Silent if in "TCH"
                        break;
                    }
                    // substitute numeral 0 for TH (resembles theta after all)
                    if (RegionMatch(local, n, "TH"))
                    {
                        code.Append('0');
                    }
                    else
                    {
                        code.Append('T');
                    }
                    break;

                case 'V':
                    code.Append('F');
                    break;

                case 'W':
                case 'Y': // silent if not followed by vowel
                    if (!IsLastChar(wdsz, n) &&
                        IsVowel(local, n + 1))
                    {
                        code.Append(symb);
                    }
                    break;

                case 'X':
                    // X -> KS; this is the one rule that appends two characters at once
                    code.Append('K');
                    code.Append('S');
                    break;

                case 'Z':
                    code.Append('S');
                    break;

                default:
                    // do nothing
                    break;
            } // end switch
            n++;
        } // end else from symb != 'C'

        // A two-character append (X -> KS) can overshoot the limit by one; trim it back. This stays
        // inside the loop so that it never runs when the loop body is never entered (for example
        // when MaxCodeLen is not positive).
        if (code.Length > this.MaxCodeLen)
        {
            code.Length = this.MaxCodeLen;
        }
    }
    return code.ToString();
}