in translation/translation.ts [43:186]
async function readData(dataFile: string) {
  // Vectorize the data.
  const inputTexts: string[] = [];
  const targetTexts: string[] = [];
  const inputCharacters = new Set<string>();
  const targetCharacters = new Set<string>();
  const fileStream = fs.createReadStream(dataFile);
  const rl = readline.createInterface({
    input: fileStream,
    output: process.stdout,
    terminal: false,
  });
  let lineNumber = 0;
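  // Read the data file line by line; each line holds a tab-separated
  // input/target sentence pair.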
  rl.on('line', line => {
    if (++lineNumber > args.num_samples) {
      rl.close();
      return;
    }
    let [inputText, targetText] = line.split('\t');
    // We use "tab" as the "start sequence" character for the targets, and
    // "\n" as the "end sequence" character.
    targetText = '\t' + targetText + '\n';
    inputTexts.push(inputText);
    targetTexts.push(targetText);
    for (const char of inputText) {
      if (!inputCharacters.has(char)) {
        inputCharacters.add(char);
      }
    }
    for (const char of targetText) {
      if (!targetCharacters.has(char)) {
        targetCharacters.add(char);
      }
    }
  });
  await new Promise(r => rl.on('close', r));
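
  // The sorted character lists define the one-hot vocabularies for the
  // encoder inputs and the decoder inputs/targets.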
  const inputCharacterList = [...inputCharacters].sort();
  const targetCharacterList = [...targetCharacters].sort();
  const numEncoderTokens = inputCharacterList.length;
  const numDecoderTokens = targetCharacterList.length;

  // Math.max(...lengths) can exceed the engine's argument/stack limit for
  // very large arrays, so find the maximum with a reduction instead.
  const maxEncoderSeqLength =
      inputTexts.map(text => text.length)
          .reduceRight((prev, curr) => curr > prev ? curr : prev, 0);
  const maxDecoderSeqLength =
      targetTexts.map(text => text.length)
          .reduceRight((prev, curr) => curr > prev ? curr : prev, 0);

  console.log('Number of samples:', inputTexts.length);
  console.log('Number of unique input tokens:', numEncoderTokens);
  console.log('Number of unique output tokens:', numDecoderTokens);
  console.log('Max sequence length for inputs:', maxEncoderSeqLength);
  console.log('Max sequence length for outputs:', maxDecoderSeqLength);
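
  // Build char -> integer lookup tables (the position of each character in
  // the sorted vocabulary) for the encoder and decoder.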
  const inputTokenIndex = inputCharacterList.reduce(
    (prev, curr, idx) => (prev[curr] = idx, prev),
    {} as {[char: string]: number},
  );
  const targetTokenIndex = targetCharacterList.reduce(
    (prev, curr, idx) => (prev[curr] = idx, prev),
    {} as {[char: string]: number},
  );

  // Save the token indices to file.
  const metadataJsonPath = path.join(
    args.artifacts_dir,
    'metadata.json',
  );
  if (!fs.existsSync(path.dirname(metadataJsonPath))) {
    mkdirp.sync(path.dirname(metadataJsonPath));
  }
  const metadata = {
    'input_token_index': inputTokenIndex,
    'target_token_index': targetTokenIndex,
    'max_encoder_seq_length': maxEncoderSeqLength,
    'max_decoder_seq_length': maxDecoderSeqLength,
  };
  fs.writeFileSync(metadataJsonPath, JSON.stringify(metadata));
  console.log('Saved metadata at: ', metadataJsonPath);
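
  // Allocate zero-filled rank-3 buffers of shape
  // [numSamples, maxSequenceLength, numTokens] for the one-hot encoded data.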
  const encoderInputDataBuf = tf.buffer<tf.Rank.R3>([
    inputTexts.length,
    maxEncoderSeqLength,
    numEncoderTokens,
  ]);
  const decoderInputDataBuf = tf.buffer<tf.Rank.R3>([
    inputTexts.length,
    maxDecoderSeqLength,
    numDecoderTokens,
  ]);
  const decoderTargetDataBuf = tf.buffer<tf.Rank.R3>([
    inputTexts.length,
    maxDecoderSeqLength,
    numDecoderTokens,
  ]);
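
  // Walk over every (inputText, targetText) pair and one-hot encode each
  // character into the corresponding buffer.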
  for (
    const [i, [inputText, targetText]]
    of (zip(inputTexts, targetTexts).entries() as
        IterableIterator<[number, [string, string]]>)
  ) {
    for (const [t, char] of inputText.split('').entries()) {
      // One-hot encode the input character:
      // encoderInputData[i, t, inputTokenIndex[char]] = 1.
      encoderInputDataBuf.set(1, i, t, inputTokenIndex[char]);
    }
    for (const [t, char] of targetText.split('').entries()) {
      // The decoder input is one-hot encoded at every timestep of the
      // target, including the start character.
      decoderInputDataBuf.set(1, i, t, targetTokenIndex[char]);
      if (t > 0) {
        // The decoder target is ahead of the decoder input by one timestep
        // and does not include the start character.
        decoderTargetDataBuf.set(1, i, t - 1, targetTokenIndex[char]);
      }
    }
  }
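
  // Materialize the filled buffers as (immutable) tensors.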
  const encoderInputData = encoderInputDataBuf.toTensor();
  const decoderInputData = decoderInputDataBuf.toTensor();
  const decoderTargetData = decoderTargetDataBuf.toTensor();

  return {
    inputTexts,
    maxEncoderSeqLength,
    maxDecoderSeqLength,
    numEncoderTokens,
    numDecoderTokens,
    inputTokenIndex,
    targetTokenIndex,
    encoderInputData,
    decoderInputData,
    decoderTargetData,
  };
}
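
// Illustrative usage sketch (an assumption, not part of this excerpt of
// translation.ts): the returned one-hot tensors can feed the fit() call of a
// two-input seq2seq LayersModel; `model` and the data file path below are
// hypothetical placeholders defined elsewhere.
//
//   const data = await readData('path/to/tab-separated-pairs.txt');
//   await model.fit(
//       [data.encoderInputData, data.decoderInputData],
//       data.decoderTargetData,
//       {batchSize: 64, epochs: 20, validationSplit: 0.2});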