async function readData()

in translation/translation.ts [43:186]


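/**
 * Reads a tab-delimited file of (input, target) text pairs, builds
 * per-character vocabularies and token-index maps for both sides, writes the
 * metadata to `metadata.json`, and one-hot encodes the pairs into the
 * encoder/decoder tensors used for seq2seq training.
 */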
async function readData(dataFile: string) {
  // Vectorize the data.
  const inputTexts: string[] = [];
  const targetTexts: string[] = [];

  const inputCharacters = new Set<string>();
  const targetCharacters = new Set<string>();

  const fileStream = fs.createReadStream(dataFile);
  const rl = readline.createInterface({
    input: fileStream,
    output: process.stdout,
    terminal: false,
  });

  let lineNumber = 0;
  rl.on('line', line => {
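    // Cap the dataset at `args.num_samples` lines; the guard also skips any
    // lines that were already buffered when rl.close() was called.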
    if (++lineNumber > args.num_samples) {
      rl.close();
      return;
    }

    let [inputText, targetText] = line.split('\t');
    // We use "tab" as the "start sequence" character for the targets, and
    // "\n" as the "end sequence" character.
    targetText = '\t' + targetText + '\n';

    inputTexts.push(inputText);
    targetTexts.push(targetText);

    // Collect vocabularies; Sets deduplicate repeated characters on their own.
    for (const char of inputText) {
      inputCharacters.add(char);
    }
    for (const char of targetText) {
      targetCharacters.add(char);
    }
  });

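  // Block until the readline interface closes, i.e. the file has been fully
  // consumed or the sample cap above triggered rl.close().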
  await new Promise<void>(resolve => rl.once('close', resolve));

  const inputCharacterList = [...inputCharacters].sort();
  const targetCharacterList = [...targetCharacters].sort();

  const numEncoderTokens = inputCharacterList.length;
  const numDecoderTokens = targetCharacterList.length;

  // Spreading a very large array into Math.max() can exceed the engine's
  // argument-count limit (a stack overflow), so compute the maximum with a
  // reduction instead.
  const maxEncoderSeqLength = inputTexts.map(text => text.length)
      .reduceRight((prev, curr) => curr > prev ? curr : prev, 0);
  const maxDecoderSeqLength = targetTexts.map(text => text.length)
      .reduceRight((prev, curr) => curr > prev ? curr : prev, 0);

  console.log('Number of samples:', inputTexts.length);
  console.log('Number of unique input tokens:', numEncoderTokens);
  console.log('Number of unique output tokens:', numDecoderTokens);
  console.log('Max sequence length for inputs:', maxEncoderSeqLength);
  console.log('Max sequence length for outputs:', maxDecoderSeqLength);

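  // Build char -> integer token-index maps for both vocabularies.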
  const inputTokenIndex = inputCharacterList.reduce(
    (prev, curr, idx) => (prev[curr] = idx, prev),
    {} as {[char: string]: number},
  );
  const targetTokenIndex = targetCharacterList.reduce(
    (prev, curr, idx) => (prev[curr] = idx, prev),
    {} as {[char: string]: number},
  );

  // Save the token indices to file.
  const metadataJsonPath = path.join(
    args.artifacts_dir,
    'metadata.json',
  );

  if (!fs.existsSync(path.dirname(metadataJsonPath))) {
    mkdirp.sync(path.dirname(metadataJsonPath));
  }

  const metadata = {
    'input_token_index': inputTokenIndex,
    'target_token_index': targetTokenIndex,
    'max_encoder_seq_length': maxEncoderSeqLength,
    'max_decoder_seq_length': maxDecoderSeqLength,
  };

  fs.writeFileSync(metadataJsonPath, JSON.stringify(metadata));
  console.log('Saved metadata at: ', metadataJsonPath);

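  // Allocate one-hot buffers of shape [numSamples, maxSeqLength, vocabSize]
  // for the encoder inputs, decoder inputs, and decoder targets.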
  const encoderInputDataBuf = tf.buffer<tf.Rank.R3>([
    inputTexts.length,
    maxEncoderSeqLength,
    numEncoderTokens,
  ]);
  const decoderInputDataBuf = tf.buffer<tf.Rank.R3>([
    inputTexts.length,
    maxDecoderSeqLength,
    numDecoderTokens,
  ]);
  const decoderTargetDataBuf = tf.buffer<tf.Rank.R3>([
    inputTexts.length,
    maxDecoderSeqLength,
    numDecoderTokens,
  ]);

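  // Fill the buffers. The decoder is trained with teacher forcing: its inputs
  // are the ground-truth target sequence, and its targets are the same
  // sequence shifted left by one timestep.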
  const textPairs = zip(inputTexts, targetTexts).entries() as
      IterableIterator<[number, [string, string]]>;
  for (const [i, [inputText, targetText]] of textPairs) {
    for (const [t, char] of inputText.split('').entries()) {
      // One-hot: encoderInputData[i, t, inputTokenIndex[char]] = 1.
      encoderInputDataBuf.set(1, i, t, inputTokenIndex[char]);
    }

    for (const [t, char] of targetText.split('').entries()) {
      // One-hot: decoderInputData[i, t, targetTokenIndex[char]] = 1.
      decoderInputDataBuf.set(1, i, t, targetTokenIndex[char]);
      if (t > 0) {
        // decoderTargetData is ahead of decoderInputData by one timestep
        // and does not include the start character.
        decoderTargetDataBuf.set(1, i, t - 1, targetTokenIndex[char]);
      }
    }
  }

  const encoderInputData = encoderInputDataBuf.toTensor();
  const decoderInputData = decoderInputDataBuf.toTensor();
  const decoderTargetData = decoderTargetDataBuf.toTensor();

  return {
    inputTexts,
    maxEncoderSeqLength,
    maxDecoderSeqLength,
    numEncoderTokens,
    numDecoderTokens,
    inputTokenIndex,
    targetTokenIndex,
    encoderInputData,
    decoderInputData,
    decoderTargetData,
  };
}
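
For illustration, here is a minimal sketch of how the returned tensors could
drive training. The `args.data_path`, `args.batch_size`, and `args.epochs`
fields and the seq2seq `model` are assumed stand-ins for the rest of the
script, not part of this excerpt.

// Hypothetical usage sketch; the `args.*` fields and `model` are assumptions.
const {encoderInputData, decoderInputData, decoderTargetData} =
    await readData(args.data_path);

// A two-input seq2seq tf.LayersModel takes an array of input tensors.
await model.fit(
    [encoderInputData, decoderInputData], decoderTargetData,
    {batchSize: args.batch_size, epochs: args.epochs, validationSplit: 0.2});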