in packages/dispatcher/src/model/supervised/classifier/auto_active_learning/AppAutoActiveLearner.ts [520:733]
/**
 * Runs the auto-active-learning pipeline over columnar text content.
 *
 * Steps, in order:
 *  1. Build a ColumnarDataWithSubwordFeaturizer from `columnarContent`
 *     using a fresh NgramSubwordFeaturizer.
 *  2. Optionally replace that data with a bootstrap-resampled copy drawn
 *     through a BootstrapSamplerKeyMapDistribution seeded with
 *     `brsDistribution`.
 *  3. Collect the small utterance index set covering all intent/entity
 *     labels, plus the remaining index set.
 *  4. Pick the seeding training indices per intent (the per-category limit
 *     is forced to -1 when auto active learning is switched off).
 *  5. Train through AutoActiveLearner.learn(), starting from a clone of the
 *     seeding indices and the remaining candidate indices.
 *  6. When `limitingSampleSize` exceeds the number of initial seeding
 *     indices, re-sample the learner's selected indices through a
 *     ReservoirArraySampler.
 *  7. Build a new columnar data object filtered down to the selected
 *     indices.
 *
 * @param columnarContent Raw columnar content handed to the data factory.
 * @param labelColumnIndex Label column index, forwarded to the data factories.
 * @param textColumnIndex Text column index, forwarded to the data factories.
 * @param weightColumnIndex Weight column index, forwarded to the data factories.
 * @param linesToSkip Line-skip count, forwarded to the data factories.
 * @param doBootstrapResampling Whether step 2 is performed.
 * @param brsDistribution Distribution handed to the bootstrap sampler.
 * @param doAutoActiveLearning Forwarded to AutoActiveLearner; when false the
 *     initial per-category limit becomes -1.
 * @param aalLimitInitialNumberOfInstancesPerCategory Initial per-category limit.
 * @param aalNumberOfInstancesPerIteration Forwarded to AutoActiveLearner.
 * @param aalInstanceSelectionThreshold Forwarded to AutoActiveLearner.
 * @param learnerParameterEpochs Forwarded to AutoActiveLearner.
 * @param learnerParameterMiniBatchSize Forwarded to AutoActiveLearner.
 * @param learnerParameterL1Regularization Forwarded to AutoActiveLearner.
 * @param learnerParameterL2Regularization Forwarded to AutoActiveLearner.
 * @param learnerParameterLossEarlyStopRatio Forwarded to AutoActiveLearner.
 * @param learnerParameterLearningRate Forwarded to AutoActiveLearner.
 * @param learnerParameterToCalculateOverallLossAfterEpoch Forwarded to AutoActiveLearner.
 * @param limitingSampleSize Threshold and budget used by step 6.
 * @returns The filtered columnar data, the trained learner, the final
 *     selected instance indices, and the initial seeding indices.
 */
public static async mainAutoActiveLearnerWithColumnarContent(
    columnarContent: string,
    labelColumnIndex: number,
    textColumnIndex: number,
    weightColumnIndex: number,
    linesToSkip: number,
    doBootstrapResampling: boolean =
        AppAutoActiveLearner.defaultDoBootstrapResampling,
    brsDistribution: TMapStringKeyGenericValue<number> =
        DictionaryMapUtility.newTMapStringKeyGenericValue<number>(),
    doAutoActiveLearning: boolean =
        AutoActiveLearner.defaultDoAutoActiveLearning,
    aalLimitInitialNumberOfInstancesPerCategory: number =
        AutoActiveLearner.defaultAalLimitInitialNumberOfInstancesPerCategory,
    aalNumberOfInstancesPerIteration: number =
        AutoActiveLearner.defaultAalNumberOfInstancesPerIteration,
    aalInstanceSelectionThreshold: number =
        AutoActiveLearner.defaultAalInstanceSelectionThreshold,
    learnerParameterEpochs: number =
        AppSoftmaxRegressionSparse.defaultEpochs,
    learnerParameterMiniBatchSize: number =
        AppSoftmaxRegressionSparse.defaultMiniBatchSize,
    learnerParameterL1Regularization: number =
        AppSoftmaxRegressionSparse.defaultL1Regularization,
    learnerParameterL2Regularization: number =
        AppSoftmaxRegressionSparse.defaultL2Regularization,
    learnerParameterLossEarlyStopRatio: number =
        AppSoftmaxRegressionSparse.defaultLossEarlyStopRatio,
    learnerParameterLearningRate: number =
        AppSoftmaxRegressionSparse.defaultLearningRate,
    learnerParameterToCalculateOverallLossAfterEpoch: boolean =
        true,
    limitingSampleSize: number =
        DefaultLimitingSampleSize): Promise<{
        "newColumnarDataWithSubwordFeaturizer": ColumnarDataWithSubwordFeaturizer,
        "learner": SoftmaxRegressionSparse,
        "seedingInstanceIndexArray": number[],
        "seedingInstanceIndexArrayInitial": number[],
    }> {
    // ---- Step 1: featurize the raw columnar content. ------------------
    let featurizedData: ColumnarDataWithSubwordFeaturizer =
        ColumnarDataWithSubwordFeaturizer.createColumnarDataWithSubwordFeaturizer(
            columnarContent,
            new NgramSubwordFeaturizer(),
            labelColumnIndex,
            textColumnIndex,
            weightColumnIndex,
            linesToSkip,
            true);
    // ---- Step 2: optional bootstrap resampling. ------------------------
    if (doBootstrapResampling) {
        const sampler: BootstrapSamplerKeyMapDistribution<number> =
            new BootstrapSamplerKeyMapDistribution<number>(
                brsDistribution,
                featurizedData.getIntentInstanceIndexMapArray());
        // ---- NOTE-FOR-REFERENCE ---- const bootstrapSamplerKeyMap: BootstrapSamplerKeyMap<number> =
        // ---- NOTE-FOR-REFERENCE ----   new BootstrapSamplerKeyMap(
        // ---- NOTE-FOR-REFERENCE ----     dataWithSubwordFeaturizer.getIntentInstanceIndexMapArray());
        Utility.debuggingLog(
            `columnarDataWithSubwordFeaturizer.getIntentInstanceIndexMapArray()=` +
            `${Utility.mapToJsonSerialization(featurizedData.getIntentInstanceIndexMapArray())}`);
        Utility.debuggingLog(
            `bootstrapSamplerKeyMap.computeSamplingNumberInstancesPerLabel()=` +
            `${sampler.computeSamplingNumberInstancesPerLabel()}`);
        // ---- NOTE-FOR-DEBUGGING ---- const samplingIndexArrayGenerator =
        // ---- NOTE-FOR-DEBUGGING ----   bootstrapSamplerKeyMap.sampleInstances();
        // ---- NOTE-FOR-DEBUGGING ---- for (const element of samplingIndexArrayGenerator) {
        // ---- NOTE-FOR-DEBUGGING ----   Utility.debuggingLog(`element of samplingIndexArrayGenerator=` +
        // ---- NOTE-FOR-DEBUGGING ----     `${element}`);
        // ---- NOTE-FOR-DEBUGGING ---- }
        const drawnIndices: number[] = Array.from(sampler.sampleInstances());
        Utility.debuggingLog(
            `samplingIndexArray.length=${drawnIndices.length}`);
        const resampledData: DataWithSubwordFeaturizer =
            await featurizedData.createDataFromSamplingExistingDataUtterances(
                featurizedData,
                labelColumnIndex,
                textColumnIndex,
                weightColumnIndex,
                linesToSkip,
                drawnIndices,
                false) as DataWithSubwordFeaturizer;
        // From here on the resampled data stands in for the original data.
        featurizedData = resampledData as ColumnarDataWithSubwordFeaturizer;
    }
    // ---- Step 3: small index set covering every intent/entity label. ---
    const {
        smallUtteranceIndexIntentMapCoveringAllIntentEntityLabels: coveringByIntent,
        smallUtteranceIndexEntityTypeMapCoveringAllIntentEntityLabels: coveringByEntityType,
        smallUtteranceIndexSetCoveringAllIntentEntityLabels: coveringIndexSet,
        remainingUtteranceIndexSet: remainingIndexSet,
    } = featurizedData.collectSmallUtteranceIndexSetCoveringAllIntentEntityLabels();
    Utility.debuggingLog(
        `smallUtteranceIndexIntentMapCoveringAllIntentEntityLabels=` +
        `${Utility.stringMapSetToJson(coveringByIntent)}`);
    Utility.debuggingLog(
        `smallUtteranceIndexEntityTypeMapCoveringAllIntentEntityLabels=` +
        `${Utility.stringMapSetToJson(coveringByEntityType)}`);
    Utility.debuggingLog(
        `smallUtteranceIndexSetCoveringAllIntentEntityLabels=` +
        `${Utility.setToJsonSerialization(coveringIndexSet)}`);
    Utility.debuggingLog(
        `remainingUtteranceIndexSet=` +
        `${Utility.setToJsonSerialization(remainingIndexSet)}`);
    Utility.debuggingLog(
        `smallUtteranceIndexSetCoveringAllIntentEntityLabels.size=${coveringIndexSet.size}`);
    Utility.debuggingLog(
        `remainingUtteranceIndexSet.size=${remainingIndexSet.size}`);
    // ---- Step 4: choose the seeding training set. ----------------------
    // With auto active learning disabled, -1 is used as the per-category limit.
    const initialPerCategoryLimit: number =
        doAutoActiveLearning ? aalLimitInitialNumberOfInstancesPerCategory : -1;
    const {
        seedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels: seedingByIntent,
        candidateUtteranceIndexSetSampled: sampledCandidates,
        candidateUtteranceIndexSetRemaining: remainingCandidates,
    }: {
        "seedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels": Map<string, Set<number>>,
        "candidateUtteranceIndexSetSampled": Set<number>,
        "candidateUtteranceIndexSetRemaining": Set<number>,
    } = featurizedData.collectUtteranceIndexSetSeedingIntentTrainingSet(
        coveringByIntent,
        remainingIndexSet,
        initialPerCategoryLimit);
    Utility.debuggingLog(
        `seedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels=` +
        `${Utility.stringMapSetToJson(seedingByIntent)}`);
    Utility.debuggingLog(
        `candidateUtteranceIndexSetSampled=` +
        `${Utility.setToJsonSerialization(sampledCandidates)}`);
    Utility.debuggingLog(
        `candidateUtteranceIndexSetRemaining=` +
        `${Utility.setToJsonSerialization(remainingCandidates)}`);
    Utility.debuggingLog(
        `candidateUtteranceIndexSetSampled.size=${sampledCandidates.size}`);
    Utility.debuggingLog(
        `candidateUtteranceIndexSetRemaining.size=${remainingCandidates.size}`);
    // Flatten the per-intent seeding sets (in map order) and total their sizes.
    let seedingCount: number = 0;
    const initialSeedIndices: number[] = [];
    for (const [, indexSet] of seedingByIntent) {
        seedingCount += indexSet.size;
        for (const utteranceIndex of indexSet) {
            initialSeedIndices.push(utteranceIndex);
        }
    }
    Utility.debuggingLog(
        `countSeedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels=${seedingCount}`);
    Utility.debuggingLog(
        `seedingUtteranceIndexArray.length=${initialSeedIndices.length}`);
    // ---- Step 5: train through the auto active learner. ----------------
    // The learner receives a clone; the initial seeding array itself is
    // returned to the caller as seedingInstanceIndexArrayInitial.
    const seedIndicesForLearning: number[] =
        Utility.cloneArray(initialSeedIndices);
    const labelIndices: number[] =
        featurizedData.getIntentLabelIndexArray();
    const featureIndexArrays: number[][] =
        featurizedData.getUtteranceFeatureIndexArrays();
    const activeLearner: AutoActiveLearner =
        new AutoActiveLearner(
            doAutoActiveLearning,
            initialPerCategoryLimit,
            aalNumberOfInstancesPerIteration,
            aalInstanceSelectionThreshold,
            learnerParameterEpochs,
            learnerParameterMiniBatchSize,
            learnerParameterL1Regularization,
            learnerParameterL2Regularization,
            learnerParameterLossEarlyStopRatio,
            learnerParameterLearningRate,
            learnerParameterToCalculateOverallLossAfterEpoch);
    const learningOutcome: {
        "seedingInstanceIndexArray": number[],
        "learner": SoftmaxRegressionSparse,
    } = activeLearner.learn(
        featurizedData.getFeaturizerLabels(),
        featurizedData.getFeaturizerLabelMap(),
        featurizedData.getFeaturizer().getNumberLabels(),
        featurizedData.getFeaturizer().getNumberFeatures(),
        labelIndices,
        featureIndexArrays,
        seedIndicesForLearning,
        [...remainingCandidates]);
    const trainedLearner: SoftmaxRegressionSparse = learningOutcome.learner;
    let selectedInstanceIndices: number[] =
        learningOutcome.seedingInstanceIndexArray;
    // ---- Step 6: optional reservoir re-sampling of the selection. ------
    const preSelectedCount: number = initialSeedIndices.length;
    if (limitingSampleSize > preSelectedCount) {
        // The sampler is built from the selected indices together with the
        // initial seeding count, then asked for the budget left over after
        // the pre-selected instances.
        const reservoir: ReservoirArraySampler<number> = new ReservoirArraySampler(
            selectedInstanceIndices,
            preSelectedCount);
        selectedInstanceIndices = Array.from(
            reservoir.sampleInstances(limitingSampleSize - preSelectedCount));
    }
    // ---- Step 7: filter the data down to the selected instances. -------
    const filteredData: ColumnarDataWithSubwordFeaturizer =
        // tslint:disable-next-line: max-line-length
        ColumnarDataWithSubwordFeaturizer.createColumnarDataWithSubwordFeaturizerFromFilteringExistingColumnarDataUtterances(
            featurizedData,
            labelColumnIndex,
            textColumnIndex,
            weightColumnIndex,
            linesToSkip,
            new Set<number>(selectedInstanceIndices),
            false);
    return {
        newColumnarDataWithSubwordFeaturizer: filteredData,
        learner: trainedLearner,
        seedingInstanceIndexArray: selectedInstanceIndices,
        seedingInstanceIndexArrayInitial: initialSeedIndices,
    };
}