// auto-param-new-string-helper/categorizeNewStrings.js
/**
* (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
*
* @emails oncall+i18n_fbt_js
* @format
* @noflow
*/
const {
areNewAndLegacyLeavesIdentical,
forEachPairOfNewAndLegacyPhrase,
} = require('./newAndLegacyPhraseComparisonUtil');
const path = require('path');
const process = require('process');
// CLI usage:
//   node categorizeNewStrings.js <legacyPhrasesFile> <newPhrasesFile> [--return-new-hash-key]
// Both phrase files are resolved against the current working directory and
// loaded via require(), so they must be JSON files (or JS modules).
const legacyPhrasesFile = path.resolve(process.argv[2]);
const newPhrasesFile = path.resolve(process.argv[3]);
// When the flag is passed, phrases whose leaves are identical but whose
// `hash_key` changed (e.g. due to added `tokenAliases`) are also reported,
// under SAME_HASH_BUT_UPDATED_HASH_KEY.
const shouldReturnPhraseWithNewHashKey =
  process.argv[4] === '--return-new-hash-key';
const legacyPhrases = require(legacyPhrasesFile);
const newPhrases = require(newPhrasesFile);
// Property names for the new (`hashToLeaf`) and legacy (`hashToText`) phrase
// shapes; see convertLegacyPhraseToNewShape() at the bottom of this file.
const HASH_TO_LEAF = 'hashToLeaf';
const HASH_TO_TEXT = 'hashToText';
// Please keep this in sync with https://fburl.com/code/3tjesmtm
// Every category a changed string can be filed under. Frozen because it is a
// shared constant map that the rest of this script only ever reads/iterates.
const CATEGORY = Object.freeze({
  SAME_HASH_BUT_ADDITIONAL_IMPLICIT_VARIATIONS:
    'SAME_HASH_BUT_ADDITIONAL_IMPLICIT_VARIATIONS',
  UPDATED_TEXT_DUE_TO_VARIATIONS_IN_INNER_STRING:
    'UPDATED_TEXT_DUE_TO_VARIATIONS_IN_INNER_STRING',
  UPDATED_TEXT_DUE_TO_REPLACING_HIDDEN_TOKEN_WITH_VARIATIONS:
    'UPDATED_TEXT_DUE_TO_REPLACING_HIDDEN_TOKEN_WITH_VARIATIONS',
  UPDATED_TEXT_DUE_TO_HIDDEN_INNER_STRING_TOKEN_NAME:
    'UPDATED_TEXT_DUE_TO_HIDDEN_INNER_STRING_TOKEN_NAME',
  // NOTE: "TRIALING" (sic, for "TRAILING") is a historical typo. It is
  // preserved on purpose: the key's string value is emitted in the output and
  // must stay in sync with the downstream consumer linked above.
  UPDATED_TEXT_DUE_TO_LEADING_OR_TRIALING_SPACE:
    'UPDATED_TEXT_DUE_TO_LEADING_OR_TRIALING_SPACE',
  UPDATED_TEXT_DUE_TO_FIXING_UNPRESERVED_WHITESPACES:
    'UPDATED_TEXT_DUE_TO_FIXING_UNPRESERVED_WHITESPACES',
  UPDATED_TEXT_DUE_TO_OTHER_REASON: 'UPDATED_TEXT_DUE_TO_OTHER_REASON',
  UPDATED_DESC_DUE_TO_SPACES_IN_FRONT_OF_INNER_STRINGS:
    'UPDATED_DESC_DUE_TO_SPACES_IN_FRONT_OF_INNER_STRINGS',
  UPDATED_DESC_DUE_TO_HIDDEN_FBT_PARAM_TOKEN_NAME:
    'UPDATED_DESC_DUE_TO_HIDDEN_FBT_PARAM_TOKEN_NAME',
  UPDATED_DESC_DUE_TO_VARIATIONS: 'UPDATED_DESC_DUE_TO_VARIATIONS',
  UPDATED_DESC_DUE_TO_HIDDEN_TOKEN_AND_ADDED_VARIATIONS:
    'UPDATED_DESC_DUE_TO_HIDDEN_TOKEN_AND_ADDED_VARIATIONS',
  UPDATED_DESC_DUE_TO_OTHER_REASON: 'UPDATED_DESC_DUE_TO_OTHER_REASON',
  // A new string's `hash_key` could be different from the old one for two reasons:
  // 1. the new string has updated text or description
  // 2. the new string has the same text and description as the legacy string,
  //    but it now has `tokenAliases` because it contains inner strings
  //
  // Category `SAME_HASH_BUT_UPDATED_HASH_KEY` only covers the second type.
  SAME_HASH_BUT_UPDATED_HASH_KEY: 'SAME_HASH_BUT_UPDATED_HASH_KEY',
});
/**
 * Compare each pair of new phrase and legacy phrase, and categorize any new string
 * found in the legacy phrase.
 * Read https://fburl.com/gdoc/d7evsumh for detailed categorization results and
 * examples.
 */
const categoryToCnt = {};
const newStringByCategory = {};
const addedPhrases = [];
const removedPhrases = [];
// Seed one bucket per category so every category appears in the final JSON,
// even when it ends up empty. (`for...of`, not `.map`: side effects only.)
for (const category of Object.keys(CATEGORY)) {
  categoryToCnt[category] = 0;
  newStringByCategory[category] = {};
}
forEachPairOfNewAndLegacyPhrase(
  newPhrases,
  legacyPhrases,
  // Phrase exists in both collections: diff its strings.
  (newPhrase, legacyPhrase) =>
    categorizeStringsInPhrase(newPhrase, legacyPhrase, newStringByCategory),
  // Phrase only exists in the new collection.
  (newPhrase, _) => addedPhrases.push(newPhrase),
  // Phrase only exists in the legacy collection.
  (_, legacyPhrase) => removedPhrases.push(legacyPhrase),
);
// Tally per-category counts after categorization (overwrites the zero seeds).
for (const category of Object.keys(CATEGORY)) {
  categoryToCnt[category] = Object.keys(newStringByCategory[category]).length;
}
console.log(
  JSON.stringify(
    {categoryToCnt, newStringByCategory, addedPhrases, removedPhrases},
    null,
    2,
  ),
);
/**
 * Compare a phrase with its legacy phrase and categorize every new string
 * found in it, writing each result into `newStringByCategory` keyed by the
 * new string's hash.
 *
 * @param newPhrase phrase emitted by the new string collection script
 * @param legacyPhraseMaybeInLegacyShape matching legacy phrase; converted
 *   in place to the new shape when needed
 * @param newStringByCategory CATEGORY name -> {hash: result} map to mutate
 */
function categorizeStringsInPhrase(
  newPhrase,
  legacyPhraseMaybeInLegacyShape,
  newStringByCategory,
) /*: void */ {
  const legacyPhrase = convertLegacyPhraseToNewShape(
    legacyPhraseMaybeInLegacyShape,
  );
  const legacyLeaves = legacyPhrase[HASH_TO_LEAF];
  const newLeaves = newPhrase[HASH_TO_LEAF];
  if (areNewAndLegacyLeavesIdentical(newLeaves, legacyLeaves)) {
    // Texts and descs are unchanged; the phrase can still differ in its
    // `hash_key` or in the number of variation dimensions (`jsfbt.m`).
    const commonResult = {
      project: newPhrase.project,
      filepath: newPhrase.filepath,
      lineBeg: newPhrase.line_beg,
      lineEnd: newPhrase.line_end,
      hashKey: newPhrase.hash_key,
      legacyHashKey: legacyPhrase.hash_key,
    };
    if (
      newPhrase.hash_key !== legacyPhrase.hash_key &&
      shouldReturnPhraseWithNewHashKey
    ) {
      for (const [hash, {desc, text}] of Object.entries(newLeaves)) {
        newStringByCategory[CATEGORY.SAME_HASH_BUT_UPDATED_HASH_KEY][hash] = {
          ...commonResult,
          legacyHash: hash,
          text,
          legacyText: text,
          desc,
          legacyDesc: desc,
        };
      }
      return;
    }
    const {m: mNew} = newPhrase.jsfbt;
    const {m: mOld} = legacyPhrase.jsfbt;
    if (mNew.length > mOld.length) {
      // Same strings, but the new phrase tracks additional implicit
      // variation dimensions.
      for (const [hash, {desc, text}] of Object.entries(newLeaves)) {
        newStringByCategory[
          CATEGORY.SAME_HASH_BUT_ADDITIONAL_IMPLICIT_VARIATIONS
        ][hash] = {
          legacyHash: hash,
          text,
          legacyText: text,
          desc,
          legacyDesc: desc,
          legacyM: JSON.stringify(mOld),
          m: JSON.stringify(mNew),
          ...commonResult,
        };
      }
    }
    return;
  }
  // Categorize each leaf (which is potentially upgraded) in the new phrase.
  // A Set keeps the per-leaf membership check O(1) instead of scanning the
  // legacy texts array once per new leaf.
  const legacyTexts = new Set(
    Object.values(legacyLeaves).map(({text}) => text),
  );
  for (const newLeaf of Object.entries(newLeaves)) {
    if (legacyTexts.has(newLeaf[1].text)) {
      categorizeLeafWithUnchangedText(
        newLeaf,
        legacyLeaves,
        newStringByCategory,
        newPhrase,
        legacyPhrase,
      );
    } else {
      categorizeLeafWithUpdatedText(
        newLeaf,
        legacyLeaves,
        newStringByCategory,
        newPhrase,
        legacyPhrase,
      );
    }
  }
}
/**
 * Categorize a string (`newLeaf`) whose `text` is updated, i.e. no legacy
 * leaf carries exactly the same text. Each legacy leaf is compared against
 * the new leaf; the first case that matches wins and the function returns.
 * If no case matches any legacy leaf, the string is filed under
 * UPDATED_TEXT_DUE_TO_OTHER_REASON together with all legacy leaves so it can
 * be inspected manually.
 */
function categorizeLeafWithUpdatedText(
  newLeaf,
  legacyLeaves,
  newStringByCategory,
  newPhrase,
  legacyPhrase,
) /*: void */ {
  // Matches a named inner-string token, e.g. "{=some inner text}".
  const innerStringTokenRegex = /{=[^}]+}/g;
  const spaceRegex = /\s/g;
  const {m: mNew} = newPhrase.jsfbt;
  const {m: mOld} = legacyPhrase.jsfbt;
  const [hash, {desc, text}] = newLeaf;
  for (const [
    legacyHash,
    {desc: legacyDesc, text: legacyText},
  ] of Object.entries(legacyLeaves)) {
    const result = {
      legacyHash,
      text,
      legacyText,
      desc,
      legacyDesc,
      hashKey: newPhrase.hash_key,
      legacyHashKey: legacyPhrase.hash_key,
      project: newPhrase.project,
      filepath: newPhrase.filepath,
      lineBeg: newPhrase.line_beg,
      lineEnd: newPhrase.line_end,
    };
    // CASE 1:
    // The inner string token was mis-represented as the hidden token "{=}"
    // in the legacy string, while the new string carries the real token name.
    if (
      legacyText.includes('{=}') &&
      // After stripping the named tokens from the new text and the hidden
      // "{=}" tokens from the legacy text, the two should be identical.
      text.replace(innerStringTokenRegex, '') === legacyText.replace(/{=}/g, '')
    ) {
      if (mNew.length > mOld.length) {
        // If the new phrase has new variations, they must be in an inner string.
        newStringByCategory[
          CATEGORY.UPDATED_TEXT_DUE_TO_REPLACING_HIDDEN_TOKEN_WITH_VARIATIONS
        ][hash] = result;
      } else {
        newStringByCategory[
          CATEGORY.UPDATED_TEXT_DUE_TO_HIDDEN_INNER_STRING_TOKEN_NAME
        ][hash] = result;
      }
      return;
    }
    // CASE 2:
    // The new string differs from the legacy string due to updated text variation.
    // When an fbt callsite has string variation arguments nested inside of inner
    // strings, the new string collection script extracts more variation strings.
    if (
      text.replace(innerStringTokenRegex, '') ===
      // With inner-string and hidden tokens removed, the legacy string should
      // equal the new string.
      legacyText.replace(innerStringTokenRegex, '').replace(/{=}/g, '')
    ) {
      newStringByCategory[
        CATEGORY.UPDATED_TEXT_DUE_TO_VARIATIONS_IN_INNER_STRING
      ][hash] = result;
      return;
    }
    // CASE 3:
    // Some legacy strings have leading/trailing space(s), whereas new strings
    // are always trimmed on both ends.
    if (text === legacyText.trim()) {
      newStringByCategory[
        CATEGORY.UPDATED_TEXT_DUE_TO_LEADING_OR_TRIALING_SPACE
      ][hash] = result;
      return;
    }
    // CASE 4:
    // Whitespace (including \t and \n) was sometimes not preserved in the old
    // string.
    if (text.replace(spaceRegex, '') === legacyText.replace(spaceRegex, '')) {
      newStringByCategory[
        CATEGORY.UPDATED_TEXT_DUE_TO_FIXING_UNPRESERVED_WHITESPACES
      ][hash] = result;
      return;
    }
  }
  // No legacy leaf matched any known pattern; keep all legacy leaves so the
  // change can be categorized by hand.
  newStringByCategory[CATEGORY.UPDATED_TEXT_DUE_TO_OTHER_REASON][hash] = {
    text,
    desc,
    legacyLeaves,
  };
}
/**
 * Categorize a string (`newLeaf`) whose `text` is identical to an existing
 * legacy string but whose `desc` (and therefore hash) is potentially
 * different.
 *
 * @throws when more than one legacy leaf shares the new leaf's text, since
 *   the pairing would then be ambiguous.
 */
function categorizeLeafWithUnchangedText(
  newLeaf,
  legacyLeaves,
  newStringByCategory,
  newPhrase,
  legacyPhrase,
) /*: void */ {
  const legacyLeavesWithSameText = Object.entries(legacyLeaves).filter(
    ([_, {text: legacyText}]) => newLeaf[1].text === legacyText,
  );
  if (legacyLeavesWithSameText.length !== 1) {
    // The caller only invokes us when at least one legacy leaf shares this
    // text, so in practice this fires when multiple legacy leaves collide.
    throw new Error(
      'A string extracted from OSS script should have at most one legacy string with the same text.',
    );
  }
  const legacyLeaf = legacyLeavesWithSameText[0];
  const [legacyHash, {text: legacyText}] = legacyLeaf;
  let {desc: legacyDesc} = legacyLeaf[1];
  const [hash, {text}] = newLeaf;
  let {desc} = newLeaf[1];
  if (hash === legacyHash) {
    // Same hash means same text AND desc: nothing changed for this leaf.
    return;
  }
  // Built from the ORIGINAL (un-normalized) descs on purpose, so the report
  // shows what the strings actually look like.
  const result = {
    legacyHash,
    text,
    legacyText,
    desc,
    legacyDesc,
    hashKey: newPhrase.hash_key,
    legacyHashKey: legacyPhrase.hash_key,
    project: newPhrase.project,
    filepath: newPhrase.filepath,
    lineBeg: newPhrase.line_beg,
    lineEnd: newPhrase.line_end,
  };
  // CASE 1:
  // When generating descriptions, the legacy script did not respect explicit
  // whitespace in front of inner strings, whereas the new script always
  // includes it. Normalize both descs (drop whitespace before "{=") and keep
  // the normalized values — the later cases compare against them too.
  desc = desc.replace(/\s*{=/g, '{=');
  legacyDesc = legacyDesc.replace(/\s*{=/g, '{=');
  if (desc === legacyDesc) {
    newStringByCategory[
      CATEGORY.UPDATED_DESC_DUE_TO_SPACES_IN_FRONT_OF_INNER_STRINGS
    ][hash] = result;
    return;
  }
  // CASE 2:
  // New string is created due to added variations in the description.
  const {m: mNew} = newPhrase.jsfbt;
  const {m: mOld} = legacyPhrase.jsfbt;
  const innerStringTokenRegex = /\s*{=[^}]+}\s*/g;
  if (
    mNew.length > mOld.length ||
    // This is the case where the `newPhrase` is an inner string and it
    // contains variations.
    desc.replace(innerStringTokenRegex, '') ===
      legacyDesc.replace(innerStringTokenRegex, '')
  ) {
    if (legacyDesc.includes('{=}')) {
      newStringByCategory[
        CATEGORY.UPDATED_DESC_DUE_TO_HIDDEN_TOKEN_AND_ADDED_VARIATIONS
      ][hash] = result;
    } else {
      newStringByCategory[CATEGORY.UPDATED_DESC_DUE_TO_VARIATIONS][hash] =
        result;
    }
    return;
  }
  // CASE 3:
  // fbt:param constructs are mis-represented as "{=}" in the string
  // description by the legacy script. In the latest version, "{=}" is
  // replaced with the actual param name, which causes new descriptions.
  if (legacyDesc.includes('{=}')) {
    newStringByCategory[
      CATEGORY.UPDATED_DESC_DUE_TO_HIDDEN_FBT_PARAM_TOKEN_NAME
    ][hash] = result;
    return;
  }
  // CASE 4:
  // Other small un-categorizable changes; attach the legacy jsfbt for
  // manual inspection.
  newStringByCategory[CATEGORY.UPDATED_DESC_DUE_TO_OTHER_REASON][hash] = {
    ...result,
    legacyJsfbt: legacyPhrase.jsfbt,
  };
}
/**
 * If the phrase is in legacy format, this function converts it to the new
 * phrase format:
 * 1. Replace `hashToText` with `hashToLeaf`
 * 2. Replace plain text `jsfbt` with `{t: {text: ..., desc: ....}, m: []}`
 * 3. Remove `desc` and `type`
 * A phrase that already has the new shape (no `desc`, no `hashToText`) is
 * returned untouched.
 *
 * Note: the phrase is mutated in place; the same object is returned.
 */
function convertLegacyPhraseToNewShape(legacyPhrase) {
  const {desc} = legacyPhrase;
  const hashToText = legacyPhrase[HASH_TO_TEXT];
  if (desc == null && hashToText == null) {
    return legacyPhrase;
  }
  // Only rebuild hashToLeaf when legacy hashToText data actually exists;
  // otherwise a phrase carrying a stray `desc` (but already in the new
  // shape) would have its existing hashToLeaf clobbered with `{}`.
  if (hashToText != null) {
    const hashToLeaf = {};
    // Object.entries (vs for...in) guarantees only own enumerable keys.
    for (const [hash, text] of Object.entries(hashToText)) {
      // Legacy phrases share a single phrase-level desc across all texts.
      hashToLeaf[hash] = {text, desc};
    }
    legacyPhrase[HASH_TO_LEAF] = hashToLeaf;
  }
  if (typeof legacyPhrase.jsfbt === 'string') {
    legacyPhrase.jsfbt = {
      t: {
        text: legacyPhrase.jsfbt,
        desc,
      },
      m: [],
    };
  }
  delete legacyPhrase.desc;
  delete legacyPhrase.type;
  return legacyPhrase;
}