async function getDataIterator()

in intent-classifier/training/train_tagger.js [54:161]

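The NDJSON input files themselves are not part of this excerpt. Judging from the destructuring below, each line of the embeddings file is a [token, embedding] tuple, and each line of the tagged-tokens file is a sentence whose element 0 is an array of {token, tag} objects. Hypothetical sample lines:

    ["weather", [0.0123, -0.0456, ...]]
    [[{"token": "weather", "tag": "O"}, {"token": "tomorrow", "tag": "O"}], ...]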

async function getDataIterator(
    embeddingsPath, taggedTokensPath, sequenceLength, batchSize) {
  // Load token embeddings and convert to tensors
  let tokenEmbeddingsTuples = await loadNDJSON(embeddingsPath);
  tokenEmbeddingsTuples = tokenEmbeddingsTuples.map(([token, embedding]) => {
    const embeddingAsTensor = tf.tensor1d(embedding);
    return [token, embeddingAsTensor];
  });
  // Add an 'embedding' for the __PAD__ token (TAGS[2]). We will encode it
  // with tf.ones.
  tokenEmbeddingsTuples.push([TAGS[2], tf.ones([EMBEDDING_SIZE])]);
  const tokenEmbeddings = new Map(tokenEmbeddingsTuples);
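  // Each map entry now holds a reusable tf.Tensor1D, e.g. a hypothetical
  // tokenEmbeddings.get('weather') would return a tensor of length
  // EMBEDDING_SIZE (512 for the Universal Sentence Encoder).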

  // Load the tagged intent tokens and convert the labels to one-hot vectors.
  // We will transform the queries into tensors dynamically in the generator
  // function below.
  const taggedIntentTokens = await loadNDJSON(taggedTokensPath);
  const labelOneHots = new Map(
      TAGS.map(tag => [tag, tf.oneHot(TAGS.indexOf(tag), TAGS.length)]));
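  // For illustration: if TAGS were ['O', 'LOC', '__PAD__'] (a hypothetical
  // tag set), labelOneHots.get('LOC') would hold the tensor [0, 1, 0].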

  console.log('Data Loaded');

  /**
   * Generator that yields batches of data for training. Each batch assembles
   * a tensor of shape [batchSize, sequenceLength, EMBEDDING_SIZE] that
   * represents a batch of queries whose tokens have been embedded with USE.
   */
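  // A yielded batch therefore has the form:
  //   {xs: Tensor3D [batchSize, sequenceLength, EMBEDDING_SIZE],
  //    ys: Tensor3D [batchSize, sequenceLength, TAGS.length]}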
  function* getNextBatch() {
    let xs = [];
    let ys = [];

    let toDispose = [];

    // Loop through all the tokenized sentences
    for (let idx = 0; idx < taggedIntentTokens.length; idx++) {
      const sentence = taggedIntentTokens[idx];
      const features = sentence[0];

      // Each example is converted to an array of length `sequenceLength`,
      // adding padding tokens if necessary and truncating the sentence
      // if it is too long.
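      // e.g. with sequenceLength = 5, a hypothetical 3-token sentence
      // ['set', 'an', 'alarm'] is padded to
      // ['set', 'an', 'alarm', '__PAD__', '__PAD__'], while a 7-token
      // sentence would be cut off after its first 5 tokens.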
      const exampleX = [];
      const exampleY = [];
      for (let index = 0; index < sequenceLength; index++) {
        let token;
        let tag;
        if (index < features.length) {
          const tuple = features[index];
          token = tuple.token;
          tag = tuple.tag;
        } else {
          // PADDING: use the __PAD__ entry (TAGS[2]) registered above.
          token = TAGS[2];
          tag = TAGS[2];
        }

        // Note that we reuse the tensors for a given token or tag.
        const tokenEmbedding = tokenEmbeddings.get(token);
        const tagOnehot = labelOneHots.get(tag);

        // tf.util.assert expects a message factory that returns a string,
        // and tf.util.assertShapesMatch takes a plain string prefix, so we
        // validate before pushing the tensors onto the example.
        tf.util.assert(
            tokenEmbedding != null,
            () => `Error getting token embedding for ${token}`);
        tf.util.assertShapesMatch(
            tokenEmbedding.shape, [EMBEDDING_SIZE],
            `Wrong shape for token embedding of ${token}`);

        tf.util.assert(
            tagOnehot != null,
            () => `Error getting label onehot for ${tag}`);
        tf.util.assertShapesMatch(
            tagOnehot.shape, [TAGS.length],
            `Wrong shape for label onehot for ${tag}`);

        exampleX.push(tokenEmbedding);
        exampleY.push(tagOnehot);
      }

      // Add an example to the batch
      const xStacked = tf.stack(exampleX);
      const yStacked = tf.stack(exampleY);
      xs.push(xStacked);
      ys.push(yStacked);

      // These per-example tensors are disposed after the batch tensor built
      // from them has been yielded below.
      toDispose.push(xStacked);
      toDispose.push(yStacked);

      // Once a full batch has accumulated, stack it and yield it.
      if (xs.length === batchSize) {
        const batchedXS = tf.stack(xs);
        const batchedYS = tf.stack(ys);

        yield {xs: batchedXS, ys: batchedYS};

        // The generator is suspended at the yield above, so this dispose
        // runs only after the consumer has requested the next batch.
        tf.dispose([batchedXS, batchedYS, toDispose]);
        toDispose = [];
        xs = [];
        ys = [];
      }
    }
    // Leftover examples that do not fill a final batch are dropped; release
    // their stacked tensors so they do not leak.
    tf.dispose(toDispose);
  }

  return getNextBatch;
}
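
A minimal sketch of how this iterator could be wired into training, assuming the same tf import used above (for example @tensorflow/tfjs-node), an already-compiled tagger model, and hypothetical paths and hyperparameters:

async function trainSketch(model) {
  const getNextBatch = await getDataIterator(
      'data/embeddings.ndjson',    // hypothetical path
      'data/tagged_tokens.ndjson', // hypothetical path
      30,                          // sequenceLength (hypothetical)
      32);                         // batchSize (hypothetical)

  // tf.data.generator invokes getNextBatch whenever a fresh iterator is
  // needed, so each epoch makes a new pass over the loaded data.
  const dataset = tf.data.generator(getNextBatch);
  await model.fitDataset(dataset, {epochs: 5});
}

Because getDataIterator loads and embeds the data once up front, calling it once and reusing the returned generator function keeps the per-epoch cost down.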