in intent-classifier/training/train_tagger.js [54:161]
async function getDataIterator(
embeddingsPath, taggedTokensPath, sequenceLength, batchSize) {
// Load token embeddings and convert to tensors
let tokenEmbeddingsTuples = await loadNDJSON(embeddingsPath);
tokenEmbeddingsTuples = tokenEmbeddingsTuples.map(([token, embedding]) => {
const embeddingAsTensor = tf.tensor1d(embedding);
return [token, embeddingAsTensor];
});
  // Add an 'embedding' for the __PAD__ token. We will encode it with tf.ones.
  const PAD_TOKEN = TAGS[2];  // '__PAD__'
  tokenEmbeddingsTuples.push([PAD_TOKEN, tf.ones([EMBEDDING_SIZE])]);
const tokenEmbeddings = new Map(tokenEmbeddingsTuples);
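  // For example (assuming a token like 'weather' appears in the embeddings
  // file), tokenEmbeddings.get('weather') returns a rank-1 tensor of shape
  // [EMBEDDING_SIZE].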
  // Load the tagged intent tokens and convert the labels to one-hot vectors.
  // The queries themselves are transformed into tensors dynamically in the
  // generator function below.
const taggedIntentTokens = await loadNDJSON(taggedTokensPath);
const labelOneHots = new Map(
TAGS.map(tag => [tag, tf.oneHot(TAGS.indexOf(tag), TAGS.length)]));
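  // For example, if TAGS were ['O', 'LOC', '__PAD__'] (illustrative values
  // only), labelOneHots.get('LOC') would hold the tensor [0, 1, 0].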
console.log('Data Loaded');
  /**
   * Generator that yields batches of training data. Each batch is an object
   * with an `xs` tensor of shape [batchSize, sequenceLength, EMBEDDING_SIZE],
   * holding queries whose tokens have been embedded with USE, and a `ys`
   * tensor of shape [batchSize, sequenceLength, TAGS.length] holding the
   * corresponding one-hot tags.
   */
function* getNextBatch() {
let xs = [];
let ys = [];
let toDispose = [];
// Loop through all the tokenized sentences
for (let idx = 0; idx < taggedIntentTokens.length; idx++) {
const sentence = taggedIntentTokens[idx];
const features = sentence[0];
// Each example is converted to an array of length `sequenceLength`,
// adding padding tokens if necessary and truncating the sentence
// if it is too long.
const exampleX = [];
const exampleY = [];
for (let index = 0; index < sequenceLength; index++) {
let token;
let tag;
if (index < features.length) {
const tuple = features[index];
token = tuple.token;
tag = tuple.tag;
      } else {
        // Pad short sentences out to sequenceLength with the __PAD__ token.
        token = PAD_TOKEN;
        tag = PAD_TOKEN;
      }
        // Note that we reuse the tensors for a given token or tag rather
        // than creating new ones on every lookup.
        const tokenEmbedding = tokenEmbeddings.get(token);
        const tagOneHot = labelOneHots.get(tag);
        tf.util.assert(
            tokenEmbedding != null,
            () => `Error getting token embedding for ${token}`);
        tf.util.assertShapesMatch(
            tokenEmbedding.shape, [EMBEDDING_SIZE],
            `Wrong shape for token embedding of ${token}:`);
        tf.util.assert(
            tagOneHot != null,
            () => `Error getting label one-hot for ${tag}`);
        tf.util.assertShapesMatch(
            tagOneHot.shape, [TAGS.length],
            `Wrong shape for label one-hot of ${tag}:`);
        exampleX.push(tokenEmbedding);
        exampleY.push(tagOneHot);
}
// Add an example to the batch
const xStacked = tf.stack(exampleX);
const yStacked = tf.stack(exampleY);
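      // exampleX holds sequenceLength tensors of shape [EMBEDDING_SIZE], so
      // xStacked has shape [sequenceLength, EMBEDDING_SIZE]; likewise
      // yStacked has shape [sequenceLength, TAGS.length].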
xs.push(xStacked);
ys.push(yStacked);
// We will dispose of these tensors once the higher rank tensor is
// created for the whole batch.
toDispose.push(xStacked);
toDispose.push(yStacked);
      // Once we have accumulated batchSize examples, stack them into a
      // batch and yield it.
      if (xs.length === batchSize) {
const batchedXS = tf.stack(xs);
const batchedYS = tf.stack(ys);
yield {xs: batchedXS, ys: batchedYS};
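        // By the time the generator resumes, the consumer has finished with
        // this batch, so the yielded tensors and the per-example stacks can
        // be disposed of safely.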
tf.dispose([batchedXS, batchedYS, toDispose]);
toDispose = [];
xs = [];
ys = [];
}
    }
    // Dispose of any stacked example tensors left over from a final partial
    // batch that was never yielded.
    tf.dispose(toDispose);
  }
return getNextBatch;
}
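
// Usage sketch (hypothetical names and call site). The returned generator
// function can be handed to tf.data.generator, and the yielded {xs, ys}
// batches feed directly into Model.fitDataset:
//
//   const makeBatchIterator = await getDataIterator(
//       embeddingsPath, taggedTokensPath, sequenceLength, batchSize);
//   const dataset = tf.data.generator(makeBatchIterator);
//   await model.fitDataset(dataset, {epochs: numEpochs});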