name_utils.js

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// Reference: https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc

"use strict";

/**
 * Helpers for splitting a full name into given/middle/family parts and for
 * joining those parts back into a full name, with dedicated handling for
 * Chinese, Japanese and Korean (CJK) names.
 *
 * NOTE(review): declared with `var` as in the original; presumably the binding
 * has to be visible to whatever loads this script, so confirm before changing
 * it to `const`.
 */
var NameUtils = {
  // Honorifics/titles stripped from the front of a name (lower case, without
  // a trailing period).
  NAME_PREFIXES: [
    "1lt",
    "1st",
    "2lt",
    "2nd",
    "3rd",
    "admiral",
    "capt",
    "captain",
    "col",
    "cpt",
    "dr",
    "gen",
    "general",
    "lcdr",
    "lt",
    "ltc",
    "ltg",
    "ltjg",
    "maj",
    "major",
    "mg",
    "mr",
    "mrs",
    "ms",
    "pastor",
    "prof",
    "rep",
    "reverend",
    "rev",
    "sen",
    "st",
  ],

  // Suffixes stripped from the end of a name (lower case, without a trailing
  // period).
  NAME_SUFFIXES: [
    "b.a",
    "ba",
    "d.d.s",
    "dds",
    "i",
    "ii",
    "iii",
    "iv",
    "ix",
    "jr",
    "m.a",
    "m.d",
    "ma",
    "md",
    "ms",
    "ph.d",
    "phd",
    "sr",
    "v",
    "vi",
    "vii",
    "viii",
    "x",
  ],

  // Tokens that are treated as part of the family name when they directly
  // precede it (e.g. "van der Berg").
  FAMILY_NAME_PREFIXES: [
    "d'",
    "de",
    "del",
    "der",
    "di",
    "la",
    "le",
    "mc",
    "san",
    "st",
    "ter",
    "van",
    "von",
  ],

  WHITESPACE: [
    "\u0009", // CHARACTER TABULATION
    "\u000A", // LINE FEED (LF)
    "\u000B", // LINE TABULATION
    "\u000C", // FORM FEED (FF)
    "\u000D", // CARRIAGE RETURN (CR)
    "\u0020", // SPACE
    "\u0085", // NEXT LINE (NEL)
    "\u00A0", // NO-BREAK SPACE
    "\u1680", // OGHAM SPACE MARK
    "\u2000", // EN QUAD
    "\u2001", // EM QUAD
    "\u2002", // EN SPACE
    "\u2003", // EM SPACE
    "\u2004", // THREE-PER-EM SPACE
    "\u2005", // FOUR-PER-EM SPACE
    "\u2006", // SIX-PER-EM SPACE
    "\u2007", // FIGURE SPACE
    "\u2008", // PUNCTUATION SPACE
    "\u2009", // THIN SPACE
    "\u200A", // HAIR SPACE
    "\u2028", // LINE SEPARATOR
    "\u2029", // PARAGRAPH SEPARATOR
    "\u202F", // NARROW NO-BREAK SPACE
    "\u205F", // MEDIUM MATHEMATICAL SPACE
    "\u3000", // IDEOGRAPHIC SPACE
  ],

  // The middle dot is used as a separator for foreign names in Japanese.
  MIDDLE_DOT: [
    "\u30FB", // KATAKANA MIDDLE DOT
    "\u00B7", // A (common?) typo for "KATAKANA MIDDLE DOT"
  ],

  // Character-class fragments; joined and wrapped in "[...]" to build a RegExp.
  CJK_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3040-\u309F", // Hiragana
    "\u30A0-\u30FF", // Katakana
    "\u3105-\u312C", // Bopomofo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\u31F0-\u31FF", // Katakana Phonetic Extensions
    "\u3200-\u32FF", // Enclosed CJK Letters and Months
    "\u3400-\u4DBF", // CJK unified ideographs Extension A
    "\u4E00-\u9FFF", // CJK Unified Ideographs
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
    "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
  ],

  // Character-class fragments covering Hangul only (subset of CJK_RANGE).
  HANGUL_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
  ],

  // The common and non-ambiguous CJK surnames (last names) that have more than
  // one character.
  COMMON_CJK_MULTI_CHAR_SURNAMES: [
    // Korean, taken from the list of surnames:
    // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
    "남궁",
    "사공",
    "서문",
    "선우",
    "제갈",
    "황보",
    "독고",
    "망절",

    // Chinese, taken from the top 10 Chinese 2-character surnames:
    // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
    // Simplified Chinese (mostly mainland China)
    "欧阳",
    "令狐",
    "皇甫",
    "上官",
    "司徒",
    "诸葛",
    "司马",
    "宇文",
    "呼延",
    "端木",
    // Traditional Chinese (mostly Taiwan)
    "張簡",
    "歐陽",
    "諸葛",
    "申屠",
    "尉遲",
    "司馬",
    "軒轅",
    "夏侯",
  ],

  // All Korean surnames that have more than one character, even the
  // rare/ambiguous ones.
  KOREAN_MULTI_CHAR_SURNAMES: [
    "강전",
    "남궁",
    "독고",
    "동방",
    "망절",
    "사공",
    "서문",
    "선우",
    "소봉",
    "어금",
    "장곡",
    "제갈",
    "황목",
    "황보",
  ],

  /**
   * Returns true if |set| contains |token|, modulo a final period.
   * The token is lower-cased before the lookup; the entries of |set| are
   * compared as-is.
   *
   * @param {string[]} set - Candidate strings.
   * @param {string} token - Token to look up.
   * @returns {boolean}
   */
  _containsString(set, token) {
    const target = token.replace(/\.$/, "").toLowerCase();
    return set.includes(target);
  },

  /**
   * Removes common name prefixes from |nameTokens|.
   *
   * @param {string[]} nameTokens
   * @returns {string[]} The tokens starting at the first one that is not a
   *   known prefix, or an empty array if every token is a prefix.
   */
  _stripPrefixes(nameTokens) {
    const firstNonPrefix = nameTokens.findIndex(
      token => !this._containsString(this.NAME_PREFIXES, token)
    );
    return firstNonPrefix === -1 ? [] : nameTokens.slice(firstNonPrefix);
  },

  /**
   * Removes common name suffixes from |nameTokens|.
   *
   * @param {string[]} nameTokens
   * @returns {string[]} The tokens up to and including the last one that is
   *   not a known suffix, or an empty array if every token is a suffix.
   */
  _stripSuffixes(nameTokens) {
    for (let i = nameTokens.length - 1; i >= 0; i--) {
      if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) {
        return nameTokens.slice(0, i + 1);
      }
    }
    return [];
  },

  /**
   * Tells whether |name| looks like a CJK name.
   *
   * @param {string} name
   * @returns {boolean} False for empty/missing input.
   */
  _isCJKName(name) {
    // The name is considered to be a CJK name if it is only CJK characters,
    // spaces, and "middle dot" separators, with at least one CJK character, and
    // no more than 2 words.
    //
    // Chinese and Japanese names are usually spelled out using the Han
    // characters (logographs), which constitute the "CJK Unified Ideographs"
    // block in Unicode, also referred to as Unihan. Korean names are usually
    // spelled out in the Korean alphabet (Hangul), although they do have a Han
    // equivalent as well.
    if (!name) {
      // An empty name has no CJK character; a missing one is treated the same
      // instead of throwing while iterating.
      return false;
    }

    const reCJK = new RegExp(`[${this.CJK_RANGE.join("")}]`);
    let previousWasCJK = false;
    let wordCount = 0;

    for (const c of name) {
      const isMiddleDot = this.MIDDLE_DOT.includes(c);
      // The katakana middle dot lies inside the Katakana range, so it must be
      // excluded explicitly to count as a separator rather than a letter.
      const isCJK = !isMiddleDot && reCJK.test(c);
      if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) {
        return false;
      }
      // A new word starts at every transition from non-CJK to CJK.
      if (isCJK && !previousWasCJK) {
        wordCount++;
      }
      previousWasCJK = isCJK;
    }

    return wordCount > 0 && wordCount < 3;
  },

  /**
   * Tries to split a Chinese, Japanese, or Korean name into its given name &
   * surname parts. If splitting did not work for whatever reason, returns null.
   *
   * @param {string[]} nameTokens - The name, already split on separators.
   * @returns {?{given: string, middle: string, family: string}}
   */
  _splitCJKName(nameTokens) {
    // The convention for CJK languages is to put the surname (last name) first,
    // and the given name (first name) second. In a continuous text, there is
    // normally no space between the two parts of the name. When entering their
    // name into a field, though, some people add a space to disambiguate. CJK
    // names (almost) never have a middle name.
    const nameParts = {
      given: "",
      middle: "",
      family: "",
    };

    if (nameTokens.length === 1) {
      // There is no space between the surname and given name. Try to infer
      // where to separate between the two. Most Chinese and Korean surnames
      // have only one character, but there are a few that have 2. If the name
      // does not start with a surname from a known list, default to one
      // character.
      const name = nameTokens[0];
      const reHangulName = new RegExp(
        `^[${this.HANGUL_RANGE.join("")}${this.WHITESPACE.join("")}]+$`
      );
      const isKorean = reHangulName.test(name);

      // 4-character Korean names are more likely to be 2/2 than 1/3, so use
      // the full list of Korean 2-char surnames. (instead of only the common
      // ones)
      const multiCharSurnames =
        isKorean && name.length > 3
          ? this.KOREAN_MULTI_CHAR_SURNAMES
          : this.COMMON_CJK_MULTI_CHAR_SURNAMES;

      // Default to 1 character if the surname is not in the list.
      const surnameLength = multiCharSurnames.some(surname =>
        name.startsWith(surname)
      )
        ? 2
        : 1;

      nameParts.family = name.slice(0, surnameLength);
      nameParts.given = name.slice(surnameLength);
    } else if (nameTokens.length === 2) {
      // The user entered a space between the two name parts. This makes our job
      // easier. Family name first, given name second.
      nameParts.family = nameTokens[0];
      nameParts.given = nameTokens[1];
    } else {
      return null;
    }

    return nameParts;
  },

  /**
   * Splits a full name into given, middle and family parts.
   *
   * @param {string} name - The full name. Empty or missing input yields
   *   all-empty parts.
   * @returns {{given: string, middle: string, family: string}}
   */
  splitName(name) {
    const nameParts = {
      given: "",
      middle: "",
      family: "",
    };

    if (!name) {
      // Nothing to split; avoid throwing on null/undefined.
      return nameParts;
    }

    // Tokens are separated by spaces, commas, ideographic spaces and middle
    // dots.
    let nameTokens = name.trim().split(/[ ,\u3000\u30FB\u00B7]+/);
    nameTokens = this._stripPrefixes(nameTokens);

    if (this._isCJKName(name)) {
      const parts = this._splitCJKName(nameTokens);
      if (parts) {
        return parts;
      }
    }

    // Don't assume "Ma" is a suffix in John Ma.
    if (nameTokens.length > 2) {
      nameTokens = this._stripSuffixes(nameTokens);
    }

    if (!nameTokens.length) {
      // Bad things have happened; just assume the whole thing is a given name.
      nameParts.given = name;
      return nameParts;
    }

    // Only one token, assume given name.
    if (nameTokens.length === 1) {
      nameParts.given = nameTokens[0];
      return nameParts;
    }

    // 2 or more tokens. Grab the family, which is the last word plus any
    // recognizable family prefixes.
    const familyTokens = [nameTokens.pop()];
    while (nameTokens.length) {
      const lastToken = nameTokens[nameTokens.length - 1];
      if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
        break;
      }
      familyTokens.unshift(lastToken);
      nameTokens.pop();
    }
    nameParts.family = familyTokens.join(" ");

    // Take the last remaining token as the middle name (if there are at least 2
    // tokens).
    if (nameTokens.length >= 2) {
      nameParts.middle = nameTokens.pop();
    }

    // Remainder is given name.
    nameParts.given = nameTokens.join(" ");

    return nameParts;
  },

  /**
   * Joins name parts into a full name. When both the given and family names
   * are CJK and there is no middle name, the result is family name directly
   * followed by given name; otherwise the non-empty parts are joined with a
   * space in given/middle/family order.
   *
   * @param {{given: ?string, middle: ?string, family: ?string}} parts -
   *   Missing (null/undefined) parts are treated as empty.
   * @returns {string}
   */
  joinNameParts({given, middle, family}) {
    if (this._isCJKName(given) && this._isCJKName(family) && !middle) {
      return family + given;
    }
    return [given, middle, family]
      .filter(part => Boolean(part && part.length))
      .join(" ");
  },
};

name_utils.js (190 lines of code) (raw):