in packages/dispatcher/src/model/supervised/classifier/auto_active_learning/AppAutoActiveLearner.ts [293:491]
/**
 * Runs the auto-active-learning pipeline on raw LU content.
 *
 * Steps, in order:
 *   1. Featurize `luContent` with a fresh `NgramSubwordFeaturizer`.
 *   2. Optionally bootstrap-resample the utterances.
 *   3. Collect a small utterance set covering all intent/entity labels.
 *   4. Derive the seeding training set from that covering set.
 *   5. Train through `AutoActiveLearner.learn()`.
 *   6. Optionally cap the selected instances with a `ReservoirArraySampler`.
 *   7. Build a new LU data object filtered down to the selected instances.
 *
 * @param luContent LU content handed to
 *   `LuDataWithSubwordFeaturizer.createLuDataWithSubwordFeaturizer()`.
 * @param doBootstrapResampling When true, step 2 is executed.
 * @param brsDistribution Distribution passed to `BootstrapSamplerKeyMapDistribution`.
 * @param doAutoActiveLearning Forwarded to `AutoActiveLearner`; when false the
 *   initial per-category limit is replaced by -1.
 * @param aalLimitInitialNumberOfInstancesPerCategory Limit used when collecting
 *   the seeding set, also forwarded to `AutoActiveLearner`.
 * @param aalNumberOfInstancesPerIteration Forwarded to `AutoActiveLearner`.
 * @param aalInstanceSelectionThreshold Forwarded to `AutoActiveLearner`.
 * @param learnerParameterEpochs Forwarded to `AutoActiveLearner`.
 * @param learnerParameterMiniBatchSize Forwarded to `AutoActiveLearner`.
 * @param learnerParameterL1Regularization Forwarded to `AutoActiveLearner`.
 * @param learnerParameterL2Regularization Forwarded to `AutoActiveLearner`.
 * @param learnerParameterLossEarlyStopRatio Forwarded to `AutoActiveLearner`.
 * @param learnerParameterLearningRate Forwarded to `AutoActiveLearner`.
 * @param learnerParameterToCalculateOverallLossAfterEpoch Forwarded to `AutoActiveLearner`.
 * @param limitingSampleSize Only takes effect when it exceeds the number of
 *   seeding instances; the excess is then used as the reservoir sample size.
 * @returns The filtered LU data, the trained learner, the final selected
 *   instance indexes and the initial seeding instance indexes.
 */
public static async mainAutoActiveLearnerWithLuContent(
    luContent: string,
    doBootstrapResampling: boolean =
        AppAutoActiveLearner.defaultDoBootstrapResampling,
    brsDistribution: TMapStringKeyGenericValue<number> =
        DictionaryMapUtility.newTMapStringKeyGenericValue<number>(),
    doAutoActiveLearning: boolean =
        AutoActiveLearner.defaultDoAutoActiveLearning,
    aalLimitInitialNumberOfInstancesPerCategory: number =
        AutoActiveLearner.defaultAalLimitInitialNumberOfInstancesPerCategory,
    aalNumberOfInstancesPerIteration: number =
        AutoActiveLearner.defaultAalNumberOfInstancesPerIteration,
    aalInstanceSelectionThreshold: number =
        AutoActiveLearner.defaultAalInstanceSelectionThreshold,
    learnerParameterEpochs: number =
        AppSoftmaxRegressionSparse.defaultEpochs,
    learnerParameterMiniBatchSize: number =
        AppSoftmaxRegressionSparse.defaultMiniBatchSize,
    learnerParameterL1Regularization: number =
        AppSoftmaxRegressionSparse.defaultL1Regularization,
    learnerParameterL2Regularization: number =
        AppSoftmaxRegressionSparse.defaultL2Regularization,
    learnerParameterLossEarlyStopRatio: number =
        AppSoftmaxRegressionSparse.defaultLossEarlyStopRatio,
    learnerParameterLearningRate: number =
        AppSoftmaxRegressionSparse.defaultLearningRate,
    learnerParameterToCalculateOverallLossAfterEpoch: boolean =
        true,
    limitingSampleSize: number =
        DefaultLimitingSampleSize): Promise<{
        "newLuDataWithSubwordFeaturizer": LuDataWithSubwordFeaturizer,
        "learner": SoftmaxRegressionSparse,
        "seedingInstanceIndexArray": number[],
        "seedingInstanceIndexArrayInitial": number[],
    }> {
    // ---- Step 1: featurize the LU content.
    // NOTE(review): the meaning of the trailing `true` flag is defined by
    // createLuDataWithSubwordFeaturizer(), which is not visible here.
    let luData: LuDataWithSubwordFeaturizer =
        await LuDataWithSubwordFeaturizer.createLuDataWithSubwordFeaturizer(
            luContent,
            new NgramSubwordFeaturizer(),
            true);

    // ---- Step 2: optional bootstrap resampling.
    if (doBootstrapResampling) {
        const sampler: BootstrapSamplerKeyMapDistribution<number> =
            new BootstrapSamplerKeyMapDistribution<number>(
                brsDistribution,
                luData.getIntentInstanceIndexMapArray());
        // ---- NOTE-FOR-REFERENCE ---- const bootstrapSamplerKeyMap: BootstrapSamplerKeyMap<number> =
        // ---- NOTE-FOR-REFERENCE ----     new BootstrapSamplerKeyMap(
        // ---- NOTE-FOR-REFERENCE ----         dataWithSubwordFeaturizer.getIntentInstanceIndexMapArray());
        const intentInstanceIndexMapJson =
            Utility.mapToJsonSerialization(luData.getIntentInstanceIndexMapArray());
        Utility.debuggingLog(
            `luDataWithSubwordFeaturizer.getIntentInstanceIndexMapArray()=${intentInstanceIndexMapJson}`);
        const samplingNumberPerLabel = sampler.computeSamplingNumberInstancesPerLabel();
        Utility.debuggingLog(
            `bootstrapSamplerKeyMap.computeSamplingNumberInstancesPerLabel()=${samplingNumberPerLabel}`);
        // ---- NOTE-FOR-DEBUGGING ---- const samplingIndexArrayGenerator =
        // ---- NOTE-FOR-DEBUGGING ----     bootstrapSamplerKeyMap.sampleInstances();
        // ---- NOTE-FOR-DEBUGGING ---- for (const element of samplingIndexArrayGenerator) {
        // ---- NOTE-FOR-DEBUGGING ----     Utility.debuggingLog(`element of samplingIndexArrayGenerator=` +
        // ---- NOTE-FOR-DEBUGGING ----         `${element}`);
        // ---- NOTE-FOR-DEBUGGING ---- }
        // Drain the sampler into a concrete index array.
        const sampledIndexes: number[] = Array.from(sampler.sampleInstances());
        Utility.debuggingLog(`samplingIndexArray.length=${sampledIndexes.length}`);
        // The four -1 arguments stand for labelColumnIndex, textColumnIndex,
        // weightColumnIndex and linesToSkip, none of which is needed for
        // LuDataWithSubwordFeaturizer.
        const resampled: DataWithSubwordFeaturizer =
            (await luData.createDataFromSamplingExistingDataUtterances(
                luData,
                -1,
                -1,
                -1,
                -1,
                sampledIndexes,
                false)) as DataWithSubwordFeaturizer;
        // From here on the pipeline works on the resampled data.
        luData = resampled as LuDataWithSubwordFeaturizer;
    }

    // ---- Step 3: small utterance set covering every intent/entity label.
    const {
        smallUtteranceIndexIntentMapCoveringAllIntentEntityLabels: coveringIntentMap,
        smallUtteranceIndexEntityTypeMapCoveringAllIntentEntityLabels: coveringEntityTypeMap,
        smallUtteranceIndexSetCoveringAllIntentEntityLabels: coveringIndexSet,
        remainingUtteranceIndexSet: uncoveredIndexSet,
    } = luData.collectSmallUtteranceIndexSetCoveringAllIntentEntityLabels();
    Utility.debuggingLog(
        `smallUtteranceIndexIntentMapCoveringAllIntentEntityLabels=${
            Utility.stringMapSetToJson(coveringIntentMap)}`);
    Utility.debuggingLog(
        `smallUtteranceIndexEntityTypeMapCoveringAllIntentEntityLabels=${
            Utility.stringMapSetToJson(coveringEntityTypeMap)}`);
    Utility.debuggingLog(
        `smallUtteranceIndexSetCoveringAllIntentEntityLabels=${
            Utility.setToJsonSerialization(coveringIndexSet)}`);
    Utility.debuggingLog(
        `remainingUtteranceIndexSet=${
            Utility.setToJsonSerialization(uncoveredIndexSet)}`);
    Utility.debuggingLog(
        `smallUtteranceIndexSetCoveringAllIntentEntityLabels.size=${coveringIndexSet.size}`);
    Utility.debuggingLog(
        `remainingUtteranceIndexSet.size=${uncoveredIndexSet.size}`);

    // ---- Step 4: seeding training set.
    // Without auto active learning the per-category limit is replaced by -1.
    const initialLimitPerCategory: number =
        doAutoActiveLearning ? aalLimitInitialNumberOfInstancesPerCategory : -1;
    const {
        seedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels: seedingIntentMap,
        candidateUtteranceIndexSetSampled: candidatesSampled,
        candidateUtteranceIndexSetRemaining: candidatesRemaining,
    } = luData.collectUtteranceIndexSetSeedingIntentTrainingSet(
        coveringIntentMap,
        uncoveredIndexSet,
        initialLimitPerCategory);
    Utility.debuggingLog(
        `seedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels=${
            Utility.stringMapSetToJson(seedingIntentMap)}`);
    Utility.debuggingLog(
        `candidateUtteranceIndexSetSampled=${
            Utility.setToJsonSerialization(candidatesSampled)}`);
    Utility.debuggingLog(
        `candidateUtteranceIndexSetRemaining=${
            Utility.setToJsonSerialization(candidatesRemaining)}`);
    Utility.debuggingLog(
        `candidateUtteranceIndexSetSampled.size=${candidatesSampled.size}`);
    Utility.debuggingLog(
        `candidateUtteranceIndexSetRemaining.size=${candidatesRemaining.size}`);
    // Total number of seeding utterances across all intents (logged only).
    let seedingCount: number = 0;
    for (const [, indexSet] of seedingIntentMap) {
        seedingCount += indexSet.size;
    }
    Utility.debuggingLog(
        `countSeedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels=${seedingCount}`);

    // Flatten the per-intent seeding sets into one array, keeping map order.
    const seedingIndexes: number[] = [];
    for (const [, indexSet] of seedingIntentMap) {
        for (const utteranceIndex of indexSet) {
            seedingIndexes.push(utteranceIndex);
        }
    }
    Utility.debuggingLog(`seedingUtteranceIndexArray.length=${seedingIndexes.length}`);

    // ---- Step 5: train.
    // The learner receives a clone so the initial seeding array returned to
    // the caller stays a separate array object.
    const seedingInstanceIndexArray: number[] =
        Utility.cloneArray(seedingIndexes);
    const intentLabelIndexes: number[] =
        luData.getIntentLabelIndexArray();
    const utteranceFeatureIndexes: number[][] =
        luData.getUtteranceFeatureIndexArrays();
    const activeLearner: AutoActiveLearner =
        new AutoActiveLearner(
            doAutoActiveLearning,
            initialLimitPerCategory,
            aalNumberOfInstancesPerIteration,
            aalInstanceSelectionThreshold,
            learnerParameterEpochs,
            learnerParameterMiniBatchSize,
            learnerParameterL1Regularization,
            learnerParameterL2Regularization,
            learnerParameterLossEarlyStopRatio,
            learnerParameterLearningRate,
            learnerParameterToCalculateOverallLossAfterEpoch);
    const learned: {
        "seedingInstanceIndexArray": number[],
        "learner": SoftmaxRegressionSparse,
    } = activeLearner.learn(
        luData.getFeaturizerLabels(),
        luData.getFeaturizerLabelMap(),
        luData.getFeaturizer().getNumberLabels(),
        luData.getFeaturizer().getNumberFeatures(),
        intentLabelIndexes,
        utteranceFeatureIndexes,
        seedingInstanceIndexArray,
        Array.from(candidatesRemaining));
    let selectedIndexes: number[] = learned.seedingInstanceIndexArray;
    const learner: SoftmaxRegressionSparse = learned.learner;

    // ---- Step 6: optional cap on the number of selected instances.
    const preSelectedCount: number = seedingIndexes.length;
    if (limitingSampleSize > preSelectedCount) {
        // Only the budget left after the seeding instances is sampled.
        const remainingBudget: number = limitingSampleSize - preSelectedCount;
        // NOTE(review): presumably the second constructor argument marks where
        // the pre-selected entries end — confirm against ReservoirArraySampler.
        const reservoirSampler: ReservoirArraySampler<number> =
            new ReservoirArraySampler(
                selectedIndexes,
                preSelectedCount);
        selectedIndexes = Array.from(reservoirSampler.sampleInstances(remainingBudget));
    }

    // ---- Step 7: build the filtered data set and assemble the result.
    const newLuDataWithSubwordFeaturizer: LuDataWithSubwordFeaturizer =
        await LuDataWithSubwordFeaturizer.createLuDataWithSubwordFeaturizerFromFilteringExistingLuDataUtterances(
            luData,
            new Set<number>(selectedIndexes),
            false);
    return {
        newLuDataWithSubwordFeaturizer,
        learner,
        seedingInstanceIndexArray: selectedIndexes,
        seedingInstanceIndexArrayInitial: seedingIndexes,
    };
}