in packages/dispatcher/src/model/supervised/classifier/auto_active_learning/AppAutoActiveLearner.ts [735:1189]
/**
 * Command-line entry point for the auto-active-learning application.
 *
 * Steps, in order:
 *   1. Declare and parse the command-line options (unknown options are
 *      tolerated and only logged).
 *   2. Load the input dataset through
 *      DataWithSubwordFeaturizerUtility.LoadDataWithSubwordFeaturizer().
 *   3. When bootstrap resampling is enabled and a distribution file exists,
 *      build a label -> instance-count map from that file.
 *   4. Delegate the learning to
 *      AppAutoActiveLearner.mainAutoActiveLearnerWithDataWithSubwordFeaturizer().
 *   5. Optionally dump the serialized model and featurizer.
 *
 * NOTE: `outputFilename` is resolved and logged, but the LU/LUIS dump that
 * would consume it is currently commented out below.
 * NOTE(review): `bootstrapResamplingSampleSizeConfihuration` is parsed and
 * logged but is not forwarded to the learner in this method - confirm
 * whether that is intended.
 *
 * @returns the file names returned by Utility.dumpFile() for every artifact
 *          written (model and/or featurizer); empty when neither output
 *          option was supplied.
 */
public static async mainAutoActiveLearner(): Promise<string[]> {
    // -------------------------------------------------------------------
    const dateTimeBeginInString: string = (new Date()).toISOString();
    // -------------------------------------------------------------------
    const parser = new ArgumentParser({
        addHelp: true,
        description: "app_auto_active_learning",
        version: "0.0.1",
    });
    parser.addArgument(
        ["-f", "--filename"],
        {
            help: "an input data file",
            required: true,
        },
    );
    parser.addArgument(
        ["-t", "--filetype"],
        {
            help: "data file type",
            required: false,
        },
    );
    parser.addArgument(
        ["-o", "--outputFilename"],
        {
            help: "output data file",
            required: false,
        },
    );
    parser.addArgument(
        ["-m", "--outputModelFilename"],
        {
            help: "output serialized model file",
            required: false,
        },
    );
    parser.addArgument(
        ["-x", "--outputFeaturizerFilename"],
        {
            help: "output serialized featurizer file",
            required: false,
        },
    );
    parser.addArgument(
        ["-d", "--debug"],
        {
            defaultValue: false,
            help: "enable printing debug information",
            required: false,
        },
    );
    parser.addArgument(
        ["-brs", "--doBootstrapResampling"],
        {
            defaultValue: AppAutoActiveLearner.defaultDoBootstrapResampling,
            help: "whether to activate bootstrap resampling or not",
            required: false,
        },
    );
    parser.addArgument(
        ["-brdf", "--bootstrapResamplingDistributionFilename"],
        {
            defaultValue: "",
            help: "The file used to build a distribution for bootstrap resampling",
            required: false,
        },
    );
    // The option name below is misspelled ("Confihuration"); it is kept as-is
    // because renaming it would break existing command lines.
    parser.addArgument(
        ["-brss", "--bootstrapResamplingSampleSizeConfihuration"],
        {
            defaultValue: 1,
            help: "bootstrap resampling sample size configuration",
            required: false,
        },
    );
    parser.addArgument(
        ["-brdfli", "--bootstrapResamplingDistributionFileLabelColumnIndex"],
        {
            defaultValue: 0,
            help: "label column index for the bootstrap resampling file",
            required: false,
        },
    );
    parser.addArgument(
        ["-brdfti", "--bootstrapResamplingDistributionFileTextColumnIndex"],
        {
            defaultValue: 0,
            help: "text/utterance column index for the bootstrap resampling file",
            required: false,
        },
    );
    parser.addArgument(
        ["-brdfls", "--bootstrapResamplingDistributionFileLinesToSkip"],
        {
            defaultValue: 0,
            help: "number of lines to skip for the bootstrap resampling file",
            required: false,
        },
    );
    parser.addArgument(
        ["-aal", "--doAutoActiveLearning"],
        {
            defaultValue: AutoActiveLearner.defaultDoAutoActiveLearning,
            help: "whether to activate auto active learning or not",
            required: false,
        },
    );
    parser.addArgument(
        ["-aali", "--aalLimitInitialNumberOfInstancesPerCategory"],
        {
            defaultValue: AutoActiveLearner.defaultAalLimitInitialNumberOfInstancesPerCategory,
            help: "initial number of data instances per category for auto active learning",
            required: false,
        },
    );
    parser.addArgument(
        ["-aaln", "--aalNumberOfInstancesPerIteration"],
        {
            defaultValue: AutoActiveLearner.defaultAalNumberOfInstancesPerIteration,
            help: "number of data instances per iteration for auto active learning",
            required: false,
        },
    );
    parser.addArgument(
        ["-aalt", "--aalInstanceSelectionThreshold"],
        {
            defaultValue: AutoActiveLearner.defaultAalInstanceSelectionThreshold,
            help: "prediction threshold for selecting a new training instance from a candidate set",
            required: false,
        },
    );
    parser.addArgument(
        ["-le", "--learnerParameterEpochs"],
        {
            defaultValue: AppSoftmaxRegressionSparse.defaultEpochs,
            help: "number of epochs",
            required: false,
        },
    );
    parser.addArgument(
        ["-lb", "--learnerParameterMiniBatchSize"],
        {
            defaultValue: AppSoftmaxRegressionSparse.defaultMiniBatchSize,
            help: "mini batch size",
            required: false,
        },
    );
    parser.addArgument(
        ["-ll1", "--learnerParameterL1Regularization"],
        {
            defaultValue: AppSoftmaxRegressionSparse.defaultL1Regularization,
            help: "l1 regularization coefficient",
            required: false,
        },
    );
    parser.addArgument(
        ["-ll2", "--learnerParameterL2Regularization"],
        {
            defaultValue: AppSoftmaxRegressionSparse.defaultL2Regularization,
            help: "l2 regularization coefficient",
            required: false,
        },
    );
    parser.addArgument(
        ["-lesr", "--learnerParameterLossEarlyStopRatio"],
        {
            defaultValue: AppSoftmaxRegressionSparse.defaultLossEarlyStopRatio,
            help: "loss early stop ratio",
            required: false,
        },
    );
    parser.addArgument(
        ["-llr", "--learnerParameterLearningRate"],
        {
            defaultValue: AppSoftmaxRegressionSparse.defaultLearningRate,
            help: "learning rate",
            required: false,
        },
    );
    parser.addArgument(
        ["-ltl", "--learnerParameterToCalculateOverallLossAfterEpoch"],
        {
            defaultValue: true,
            help: "whether to calcualte loss after each epoch",
            required: false,
        },
    );
    parser.addArgument(
        ["-ss", "--limitingSampleSize"],
        {
            defaultValue: 0,
            help: "down sample the training instances to this limit",
            required: false,
        },
    );
    parser.addArgument(
        ["-li", "--labelColumnIndex"],
        {
            defaultValue: 0,
            help: "label column index",
            required: false,
        },
    );
    parser.addArgument(
        ["-ti", "--textColumnIndex"],
        {
            defaultValue: 1,
            help: "text/utterance column index",
            required: false,
        },
    );
    parser.addArgument(
        ["-wi", "--weightColumnIndex"],
        {
            defaultValue: -1,
            help: "weight column index",
            required: false,
        },
    );
    parser.addArgument(
        ["-ls", "--linesToSkip"],
        {
            defaultValue: 0,
            help: "number of lines to skip for the input file",
            required: false,
        },
    );
    // parseKnownArgs() yields [recognizedArguments, leftoverArguments].
    const parsedKnownArgs: any[] = parser.parseKnownArgs();
    const args: any = parsedKnownArgs[0];
    const unknownArgs: any = parsedKnownArgs[1];
    Utility.debuggingLog(
        `args=${Utility.jsonStringify(args)}`);
    Utility.debuggingLog(
        `unknownArgs=${Utility.jsonStringify(unknownArgs)}`);
    const debugFlag: boolean = Utility.toBoolean(args.debug);
    Utility.resetFlagToPrintDebuggingLogToConsole(debugFlag);
    // ---- NOTE-FOR-DEBUGGING ---- console.dir(args);
    // -------------------------------------------------------------------
    const filename: string =
        args.filename;
    if (!Utility.exists(filename)) {
        Utility.debuggingThrow(
            `The input dataset file ${filename} does not exist! process.cwd()=${process.cwd()}`);
    }
    const filetype: string =
        args.filetype;
    let outputFilename: string = args.outputFilename;
    if (outputFilename == null) {
        outputFilename = filename + ".lu";
    }
    // Boolean options carry no 'type'/'action', so a value supplied on the
    // command line arrives as a string; the string "false" is truthy.
    // Normalize them the same way the debug flag is normalized above.
    const doBootstrapResampling: boolean =
        Utility.toBoolean(args.doBootstrapResampling);
    const bootstrapResamplingDistributionFilename: string =
        args.bootstrapResamplingDistributionFilename;
    const bootstrapResamplingSampleSizeConfihuration: number =
        +args.bootstrapResamplingSampleSizeConfihuration;
    const doAutoActiveLearning: boolean =
        Utility.toBoolean(args.doAutoActiveLearning);
    // Unary '+' coerces the (possibly string) option values to numbers.
    const aalLimitInitialNumberOfInstancesPerCategory: number =
        +args.aalLimitInitialNumberOfInstancesPerCategory;
    const aalNumberOfInstancesPerIteration: number =
        +args.aalNumberOfInstancesPerIteration;
    const aalInstanceSelectionThreshold: number =
        +args.aalInstanceSelectionThreshold;
    const learnerParameterEpochs: number =
        +args.learnerParameterEpochs;
    const learnerParameterMiniBatchSize: number =
        +args.learnerParameterMiniBatchSize;
    const learnerParameterL1Regularization: number =
        +args.learnerParameterL1Regularization;
    const learnerParameterL2Regularization: number =
        +args.learnerParameterL2Regularization;
    const learnerParameterLossEarlyStopRatio: number =
        +args.learnerParameterLossEarlyStopRatio;
    const learnerParameterLearningRate: number =
        +args.learnerParameterLearningRate;
    const learnerParameterToCalculateOverallLossAfterEpoch: boolean =
        Utility.toBoolean(args.learnerParameterToCalculateOverallLossAfterEpoch);
    const limitingSampleSize: number =
        +args.limitingSampleSize;
    Utility.debuggingLog(
        `filename=${filename}`);
    Utility.debuggingLog(
        `outputFilename=${outputFilename}`);
    Utility.debuggingLog(
        `doBootstrapResampling=${doBootstrapResampling}`);
    Utility.debuggingLog(
        `bootstrapResamplingDistributionFilename=${bootstrapResamplingDistributionFilename}`);
    Utility.debuggingLog(
        `bootstrapResamplingSampleSizeConfihuration=${bootstrapResamplingSampleSizeConfihuration}`);
    Utility.debuggingLog(
        `doAutoActiveLearning=${doAutoActiveLearning}`);
    Utility.debuggingLog(
        `aalLimitInitialNumberOfInstancesPerCategory=${aalLimitInitialNumberOfInstancesPerCategory}`);
    Utility.debuggingLog(
        `aalNumberOfInstancesPerIteration=${aalNumberOfInstancesPerIteration}`);
    Utility.debuggingLog(
        `aalInstanceSelectionThreshold=${aalInstanceSelectionThreshold}`);
    Utility.debuggingLog(
        `learnerParameterEpochs=${learnerParameterEpochs}`);
    Utility.debuggingLog(
        `learnerParameterMiniBatchSize=${learnerParameterMiniBatchSize}`);
    Utility.debuggingLog(
        `learnerParameterL1Regularization=${learnerParameterL1Regularization}`);
    Utility.debuggingLog(
        `learnerParameterL2Regularization=${learnerParameterL2Regularization}`);
    Utility.debuggingLog(
        `learnerParameterLossEarlyStopRatio=${learnerParameterLossEarlyStopRatio}`);
    Utility.debuggingLog(
        `learnerParameterLearningRate=${learnerParameterLearningRate}`);
    Utility.debuggingLog(
        `learnerParameterToCalculateOverallLossAfterEpoch=${learnerParameterToCalculateOverallLossAfterEpoch}`);
    Utility.debuggingLog(
        `limitingSampleSize=${limitingSampleSize}`);
    const outputModelFilename: string =
        args.outputModelFilename;
    const outputFeaturizerFilename: string =
        args.outputFeaturizerFilename;
    // -------------------------------------------------------------------
    const labelColumnIndex: number = +args.labelColumnIndex;
    const textColumnIndex: number = +args.textColumnIndex;
    const weightColumnIndex: number = +args.weightColumnIndex;
    const linesToSkip: number = +args.linesToSkip;
    Utility.debuggingLog(
        `labelColumnIndex=${labelColumnIndex}`);
    Utility.debuggingLog(
        `textColumnIndex=${textColumnIndex}`);
    Utility.debuggingLog(
        `weightColumnIndex=${weightColumnIndex}`);
    Utility.debuggingLog(
        `linesToSkip=${linesToSkip}`);
    // -------------------------------------------------------------------
    const bootstrapResamplingDistributionFileLabelColumnIndex: number =
        +args.bootstrapResamplingDistributionFileLabelColumnIndex;
    const bootstrapResamplingDistributionFileTextColumnIndex: number =
        +args.bootstrapResamplingDistributionFileTextColumnIndex;
    const bootstrapResamplingDistributionFileLinesToSkip: number =
        +args.bootstrapResamplingDistributionFileLinesToSkip;
    Utility.debuggingLog(
        `bootstrapResamplingDistributionFileLabelColumnIndex=` +
        `${bootstrapResamplingDistributionFileLabelColumnIndex}`);
    Utility.debuggingLog(
        `bootstrapResamplingDistributionFileTextColumnIndex=` +
        `${bootstrapResamplingDistributionFileTextColumnIndex}`);
    Utility.debuggingLog(
        `bootstrapResamplingDistributionFileLinesToSkip=` +
        `${bootstrapResamplingDistributionFileLinesToSkip}`);
    // -------------------------------------------------------------------
    // Load and featurize the main input dataset.
    const dataWithSubwordFeaturizer: DataWithSubwordFeaturizer =
        await DataWithSubwordFeaturizerUtility.LoadDataWithSubwordFeaturizer(
            filename,
            null,
            true,
            filetype,
            labelColumnIndex,
            textColumnIndex,
            weightColumnIndex,
            linesToSkip);
    // -------------------------------------------------------------------
    // Label -> number of instances carrying that label in the distribution
    // file. Stays empty when resampling is off or the file does not exist.
    const bootstrapResamplingDistribution: Map<string, number> = new Map<string, number>();
    if (doBootstrapResampling) {
        if (Utility.exists(bootstrapResamplingDistributionFilename)) {
            // There is no command-line option for a weight column in the
            // distribution file. -1 is the default of --weightColumnIndex;
            // presumably it means "no weight column" - TODO confirm.
            // It must be passed explicitly so that the lines-to-skip value
            // lands in the lines-to-skip position (same argument order as
            // the call for the main dataset above) instead of being taken
            // for the weight column index.
            const bootstrapResamplingDistributionFileWeightColumnIndex: number = -1;
            const dataWithSubwordFeaturizerBootstrapResampling: DataWithSubwordFeaturizer =
                await DataWithSubwordFeaturizerUtility.LoadDataWithSubwordFeaturizer(
                    bootstrapResamplingDistributionFilename,
                    null,
                    true,
                    filetype,
                    bootstrapResamplingDistributionFileLabelColumnIndex,
                    bootstrapResamplingDistributionFileTextColumnIndex,
                    bootstrapResamplingDistributionFileWeightColumnIndex,
                    bootstrapResamplingDistributionFileLinesToSkip);
            const bootstrapResamplingIntentInstanceIndexMapArray: Map<string, number[]> =
                dataWithSubwordFeaturizerBootstrapResampling.getIntentInstanceIndexMapArray();
            for (const entry of bootstrapResamplingIntentInstanceIndexMapArray) {
                bootstrapResamplingDistribution.set(entry[0], entry[1].length);
            }
        }
    }
    // -------------------------------------------------------------------
    const aalResult: {
        "newDataWithSubwordFeaturizer": DataWithSubwordFeaturizer,
        "learner": SoftmaxRegressionSparse,
        "seedingInstanceIndexArray": number[],
        "seedingInstanceIndexArrayInitial": number[],
        } = await AppAutoActiveLearner.mainAutoActiveLearnerWithDataWithSubwordFeaturizer(
        dataWithSubwordFeaturizer,
        labelColumnIndex,
        textColumnIndex,
        weightColumnIndex,
        linesToSkip,
        doBootstrapResampling,
        bootstrapResamplingDistribution,
        doAutoActiveLearning,
        aalLimitInitialNumberOfInstancesPerCategory,
        aalNumberOfInstancesPerIteration,
        aalInstanceSelectionThreshold,
        learnerParameterEpochs,
        learnerParameterMiniBatchSize,
        learnerParameterL1Regularization,
        learnerParameterL2Regularization,
        learnerParameterLossEarlyStopRatio,
        learnerParameterLearningRate,
        learnerParameterToCalculateOverallLossAfterEpoch,
        limitingSampleSize);
    const newDataWithSubwordFeaturizer: DataWithSubwordFeaturizer =
        aalResult.newDataWithSubwordFeaturizer;
    const learner: SoftmaxRegressionSparse =
        aalResult.learner;
    // -------------------------------------------------------------------
    const outputFilenames: string[] = [];
    // tslint:disable-next-line: max-line-length
    // ---- NOTE-PACE-HOLDER ---- const outputFilenameDump: string = newDataWithSubwordFeaturizer.dumpLuLuisJsonStructureInLuFormat(
    // ---- NOTE-PACE-HOLDER ----     outputFilename);
    // ---- NOTE-PACE-HOLDER ---- outputFilenames.push(outputFilenameDump);
    // ---- NOTE-PACE-HOLDER ---- const outputFilenameLuis: string =
    // ---- NOTE-PACE-HOLDER ----     outputFilename + ".luis";
    // tslint:disable-next-line: max-line-length
    // ---- NOTE-PACE-HOLDER ---- const outputFilenameLuisAfterDumpfile = newDataWithSubwordFeaturizer.dumpLuLuisJsonStructure(
    // ---- NOTE-PACE-HOLDER ----     outputFilenameLuis, undefined, 4);
    // ---- NOTE-PACE-HOLDER ---- outputFilenames.push(outputFilenameLuisAfterDumpfile);
    // Persist the trained model only when an output path was requested.
    if (!Utility.isEmptyString(outputModelFilename)) {
        const outputModelFilenameAfterDumpfile: string = Utility.dumpFile(
            outputModelFilename,
            learner.serializeToJsonString(undefined, 4));
        outputFilenames.push(outputModelFilenameAfterDumpfile);
    }
    // Persist the featurizer only when an output path was requested.
    if (!Utility.isEmptyString(outputFeaturizerFilename)) {
        const outputFeaturizerFilenameAfterDumpfile: string = Utility.dumpFile(
            outputFeaturizerFilename,
            newDataWithSubwordFeaturizer.getFeaturizer().serializeToJsonString(undefined, 4));
        outputFilenames.push(outputFeaturizerFilenameAfterDumpfile);
    }
    // -------------------------------------------------------------------
    const dateTimeEndInString: string = (new Date()).toISOString();
    // -------------------------------------------------------------------
    Utility.debuggingLog(
        `dateTimeBeginInString=${dateTimeBeginInString}`);
    Utility.debuggingLog(
        `dateTimeEndInString=${dateTimeEndInString}`);
    // -------------------------------------------------------------------
    return outputFilenames;
}