public static async mainAutoActiveLearner()

in packages/dispatcher/src/model/supervised/classifier/auto_active_learning/AppAutoActiveLearner.ts [735:1189]


    /**
     * Command-line entry point for auto active learning.
     *
     * Steps, in order:
     *  1. Parse the command-line arguments; unrecognized arguments are collected
     *     separately by parseKnownArgs() and only logged.
     *  2. Load the input data file through
     *     DataWithSubwordFeaturizerUtility.LoadDataWithSubwordFeaturizer().
     *  3. If bootstrap resampling is enabled and the distribution file exists, load
     *     that file and record, per intent label, the number of instances it contains.
     *  4. Delegate to AppAutoActiveLearner.mainAutoActiveLearnerWithDataWithSubwordFeaturizer().
     *  5. Write the serialized learner and/or featurizer when the corresponding
     *     output filename was supplied.
     *
     * Utility.debuggingThrow() is invoked when the input data file does not exist.
     *
     * @returns the filenames reported by Utility.dumpFile() for every file written.
     */
    public static async mainAutoActiveLearner(): Promise<string[]> {
        // -------------------------------------------------------------------
        const dateTimeBeginInString: string = (new Date()).toISOString();
        // -------------------------------------------------------------------
        // ---- Command-line definition. No 'type' is declared on any option, so
        // ---- values supplied on the command line are converted explicitly below
        // ---- (unary plus for numbers, Utility.toBoolean() for flags).
        const parser = new ArgumentParser({
            addHelp: true,
            description: "app_auto_active_learning",
            version: "0.0.1",
        });
        parser.addArgument(
            ["-f", "--filename"],
            {
                help: "an input data file",
                required: true,
            },
        );
        parser.addArgument(
            ["-t", "--filetype"],
            {
                help: "data file type",
                required: false,
            },
        );
        parser.addArgument(
            ["-o", "--outputFilename"],
            {
                help: "output data file",
                required: false,
            },
        );
        parser.addArgument(
            ["-m", "--outputModelFilename"],
            {
                help: "output serialized model file",
                required: false,
            },
        );
        parser.addArgument(
            ["-x", "--outputFeaturizerFilename"],
            {
                help: "output serialized featurizer file",
                required: false,
            },
        );
        parser.addArgument(
            ["-d", "--debug"],
            {
                defaultValue: false,
                help: "enable printing debug information",
                required: false,
            },
        );
        // ---- Bootstrap-resampling options.
        parser.addArgument(
            ["-brs", "--doBootstrapResampling"],
            {
                defaultValue: AppAutoActiveLearner.defaultDoBootstrapResampling,
                help: "whether to activate bootstrap resampling or not",
                required: false,
            },
        );
        parser.addArgument(
            ["-brdf", "--bootstrapResamplingDistributionFilename"],
            {
                defaultValue: "",
                help: "The file used to build a distribution for bootstrap resampling",
                required: false,
            },
        );
        // ---- The option name below is misspelled ("Confihuration"); it is kept
        // ---- as-is because renaming it would break existing command lines.
        parser.addArgument(
            ["-brss", "--bootstrapResamplingSampleSizeConfihuration"],
            {
                defaultValue: 1,
                help: "bootstrap resampling sample size configuration",
                required: false,
            },
        );
        parser.addArgument(
            ["-brdfli", "--bootstrapResamplingDistributionFileLabelColumnIndex"],
            {
                defaultValue: 0,
                help: "label column index for the bootstrap resampling file",
                required: false,
            },
        );
        parser.addArgument(
            ["-brdfti", "--bootstrapResamplingDistributionFileTextColumnIndex"],
            {
                defaultValue: 0,
                help: "text/utterance column index for the bootstrap resampling file",
                required: false,
            },
        );
        parser.addArgument(
            ["-brdfls", "--bootstrapResamplingDistributionFileLinesToSkip"],
            {
                defaultValue: 0,
                help: "number of lines to skip for the bootstrap resampling file",
                required: false,
            },
        );
        // ---- Auto-active-learning options.
        parser.addArgument(
            ["-aal", "--doAutoActiveLearning"],
            {
                defaultValue: AutoActiveLearner.defaultDoAutoActiveLearning,
                help: "whether to activate auto active learning or not",
                required: false,
            },
        );
        parser.addArgument(
            ["-aali", "--aalLimitInitialNumberOfInstancesPerCategory"],
            {
                defaultValue: AutoActiveLearner.defaultAalLimitInitialNumberOfInstancesPerCategory,
                help: "initial number of data instances per category for auto active learning",
                required: false,
            },
        );
        parser.addArgument(
            ["-aaln", "--aalNumberOfInstancesPerIteration"],
            {
                defaultValue: AutoActiveLearner.defaultAalNumberOfInstancesPerIteration,
                help: "number of data instances per iteration for auto active learning",
                required: false,
            },
        );
        parser.addArgument(
            ["-aalt", "--aalInstanceSelectionThreshold"],
            {
                defaultValue: AutoActiveLearner.defaultAalInstanceSelectionThreshold,
                help: "prediction threshold for selecting a new training instance from a candidate set",
                required: false,
            },
        );
        // ---- Learner hyper-parameters.
        parser.addArgument(
            ["-le", "--learnerParameterEpochs"],
            {
                defaultValue: AppSoftmaxRegressionSparse.defaultEpochs,
                help: "number of epochs",
                required: false,
            },
        );
        parser.addArgument(
            ["-lb", "--learnerParameterMiniBatchSize"],
            {
                defaultValue: AppSoftmaxRegressionSparse.defaultMiniBatchSize,
                help: "mini batch size",
                required: false,
            },
        );
        parser.addArgument(
            ["-ll1", "--learnerParameterL1Regularization"],
            {
                defaultValue: AppSoftmaxRegressionSparse.defaultL1Regularization,
                help: "l1 regularization coefficient",
                required: false,
            },
        );
        parser.addArgument(
            ["-ll2", "--learnerParameterL2Regularization"],
            {
                defaultValue: AppSoftmaxRegressionSparse.defaultL2Regularization,
                help: "l2 regularization coefficient",
                required: false,
            },
        );
        parser.addArgument(
            ["-lesr", "--learnerParameterLossEarlyStopRatio"],
            {
                defaultValue: AppSoftmaxRegressionSparse.defaultLossEarlyStopRatio,
                help: "loss early stop ratio",
                required: false,
            },
        );
        parser.addArgument(
            ["-llr", "--learnerParameterLearningRate"],
            {
                defaultValue: AppSoftmaxRegressionSparse.defaultLearningRate,
                help: "learning rate",
                required: false,
            },
        );
        parser.addArgument(
            ["-ltl", "--learnerParameterToCalculateOverallLossAfterEpoch"],
            {
                defaultValue: true,
                help: "whether to calcualte loss after each epoch",
                required: false,
            },
        );
        // ---- Input-file layout options.
        parser.addArgument(
            ["-ss", "--limitingSampleSize"],
            {
                defaultValue: 0,
                help: "down sample the training instances to this limit",
                required: false,
            },
        );
        parser.addArgument(
            ["-li", "--labelColumnIndex"],
            {
                defaultValue: 0,
                help: "label column index",
                required: false,
            },
        );
        parser.addArgument(
            ["-ti", "--textColumnIndex"],
            {
                defaultValue: 1,
                help: "text/utterance column index",
                required: false,
            },
        );
        parser.addArgument(
            ["-wi", "--weightColumnIndex"],
            {
                defaultValue: -1,
                help: "weight column index",
                required: false,
            },
        );
        parser.addArgument(
            ["-ls", "--linesToSkip"],
            {
                defaultValue: 0,
                help: "number of lines to skip for the input file",
                required: false,
            },
        );
        // ---- parseKnownArgs() returns [recognized, unrecognized].
        const parsedKnownArgs: any[] = parser.parseKnownArgs();
        const args: any = parsedKnownArgs[0];
        const unknownArgs: any = parsedKnownArgs[1];
        Utility.debuggingLog(
            `args=${Utility.jsonStringify(args)}`);
        Utility.debuggingLog(
            `unknownArgs=${Utility.jsonStringify(unknownArgs)}`);
        const debugFlag: boolean = Utility.toBoolean(args.debug);
        Utility.resetFlagToPrintDebuggingLogToConsole(debugFlag);
        // ---- NOTE-FOR-DEBUGGING ----  console.dir(args);
        // -------------------------------------------------------------------
        const filename: string =
            args.filename;
        if (!Utility.exists(filename)) {
            Utility.debuggingThrow(
                `The input dataset file ${filename} does not exist! process.cwd()=${process.cwd()}`);
        }
        const filetype: string =
            args.filetype;
        // ---- outputFilename is only logged here: the code that would write to
        // ---- it is commented out further down.
        let outputFilename: string = args.outputFilename;
        if (outputFilename == null) {
            outputFilename = filename + ".lu";
        }
        // ---- Flags go through Utility.toBoolean(), exactly like --debug above.
        // ---- Reading them raw would treat a command-line string such as
        // ---- "false" as truthy.
        const doBootstrapResampling: boolean =
            Utility.toBoolean(args.doBootstrapResampling);
        const bootstrapResamplingDistributionFilename: string =
            args.bootstrapResamplingDistributionFilename;
        // ---- Parsed and logged only; it is not forwarded to the learner call below.
        const bootstrapResamplingSampleSizeConfihuration: number =
            +args.bootstrapResamplingSampleSizeConfihuration;
        const doAutoActiveLearning: boolean =
            Utility.toBoolean(args.doAutoActiveLearning);
        const aalLimitInitialNumberOfInstancesPerCategory: number =
            +args.aalLimitInitialNumberOfInstancesPerCategory;
        const aalNumberOfInstancesPerIteration: number =
            +args.aalNumberOfInstancesPerIteration;
        const aalInstanceSelectionThreshold: number =
            +args.aalInstanceSelectionThreshold;
        const learnerParameterEpochs: number =
            +args.learnerParameterEpochs;
        const learnerParameterMiniBatchSize: number =
            +args.learnerParameterMiniBatchSize;
        const learnerParameterL1Regularization: number =
            +args.learnerParameterL1Regularization;
        const learnerParameterL2Regularization: number =
            +args.learnerParameterL2Regularization;
        const learnerParameterLossEarlyStopRatio: number =
            +args.learnerParameterLossEarlyStopRatio;
        const learnerParameterLearningRate: number =
            +args.learnerParameterLearningRate;
        const learnerParameterToCalculateOverallLossAfterEpoch: boolean =
            Utility.toBoolean(args.learnerParameterToCalculateOverallLossAfterEpoch);
        const limitingSampleSize: number =
            +args.limitingSampleSize;
        Utility.debuggingLog(
            `filename=${filename}`);
        Utility.debuggingLog(
            `outputFilename=${outputFilename}`);
        Utility.debuggingLog(
            `doBootstrapResampling=${doBootstrapResampling}`);
        Utility.debuggingLog(
            `bootstrapResamplingDistributionFilename=${bootstrapResamplingDistributionFilename}`);
        Utility.debuggingLog(
            `bootstrapResamplingSampleSizeConfihuration=${bootstrapResamplingSampleSizeConfihuration}`);
        Utility.debuggingLog(
            `doAutoActiveLearning=${doAutoActiveLearning}`);
        Utility.debuggingLog(
            `aalLimitInitialNumberOfInstancesPerCategory=${aalLimitInitialNumberOfInstancesPerCategory}`);
        Utility.debuggingLog(
            `aalNumberOfInstancesPerIteration=${aalNumberOfInstancesPerIteration}`);
        Utility.debuggingLog(
            `aalInstanceSelectionThreshold=${aalInstanceSelectionThreshold}`);
        Utility.debuggingLog(
            `learnerParameterEpochs=${learnerParameterEpochs}`);
        Utility.debuggingLog(
            `learnerParameterMiniBatchSize=${learnerParameterMiniBatchSize}`);
        Utility.debuggingLog(
            `learnerParameterL1Regularization=${learnerParameterL1Regularization}`);
        Utility.debuggingLog(
            `learnerParameterL2Regularization=${learnerParameterL2Regularization}`);
        Utility.debuggingLog(
            `learnerParameterLossEarlyStopRatio=${learnerParameterLossEarlyStopRatio}`);
        Utility.debuggingLog(
            `learnerParameterLearningRate=${learnerParameterLearningRate}`);
        Utility.debuggingLog(
            `learnerParameterToCalculateOverallLossAfterEpoch=${learnerParameterToCalculateOverallLossAfterEpoch}`);
        Utility.debuggingLog(
            `limitingSampleSize=${limitingSampleSize}`);
        const outputModelFilename: string =
            args.outputModelFilename;
        const outputFeaturizerFilename: string =
            args.outputFeaturizerFilename;
        // -------------------------------------------------------------------
        const labelColumnIndex: number = +args.labelColumnIndex;
        const textColumnIndex: number = +args.textColumnIndex;
        const weightColumnIndex: number = +args.weightColumnIndex;
        const linesToSkip: number = +args.linesToSkip;
        Utility.debuggingLog(
            `labelColumnIndex=${labelColumnIndex}`);
        Utility.debuggingLog(
            `textColumnIndex=${textColumnIndex}`);
        Utility.debuggingLog(
            `weightColumnIndex=${weightColumnIndex}`);
        Utility.debuggingLog(
            `linesToSkip=${linesToSkip}`);
        // -------------------------------------------------------------------
        const bootstrapResamplingDistributionFileLabelColumnIndex: number =
            +args.bootstrapResamplingDistributionFileLabelColumnIndex;
        const bootstrapResamplingDistributionFileTextColumnIndex: number =
            +args.bootstrapResamplingDistributionFileTextColumnIndex;
        const bootstrapResamplingDistributionFileLinesToSkip: number =
            +args.bootstrapResamplingDistributionFileLinesToSkip;
        Utility.debuggingLog(
            `bootstrapResamplingDistributionFileLabelColumnIndex=` +
            `${bootstrapResamplingDistributionFileLabelColumnIndex}`);
        Utility.debuggingLog(
            `bootstrapResamplingDistributionFileTextColumnIndex=` +
            `${bootstrapResamplingDistributionFileTextColumnIndex}`);
        Utility.debuggingLog(
            `bootstrapResamplingDistributionFileLinesToSkip=` +
            `${bootstrapResamplingDistributionFileLinesToSkip}`);
        // -------------------------------------------------------------------
        // ---- Load the main input data file.
        const dataWithSubwordFeaturizer: DataWithSubwordFeaturizer =
            await DataWithSubwordFeaturizerUtility.LoadDataWithSubwordFeaturizer(
                filename,
                null,
                true,
                filetype,
                labelColumnIndex,
                textColumnIndex,
                weightColumnIndex,
                linesToSkip);
        // ---- The three values below are not consumed further in this method;
        // ---- the getter calls are retained as in the original flow.
        const intentsUtterancesWeights: {
            "intents": string[],
            "utterances": string[],
            "weights": number[] } =
            dataWithSubwordFeaturizer.getIntentsUtterancesWeights();
        const intentLabelIndexArray: number[] =
            dataWithSubwordFeaturizer.getIntentLabelIndexArray();
        const utteranceFeatureIndexArrays: number[][] =
            dataWithSubwordFeaturizer.getUtteranceFeatureIndexArrays();
        // -------------------------------------------------------------------
        // ---- Maps an intent label to the number of instances found for it in
        // ---- the distribution file. Stays empty when resampling is disabled
        // ---- or the distribution file does not exist.
        const bootstrapResamplingDistribution: Map<string, number> = new Map<string, number>();
        if (doBootstrapResampling) {
            if (Utility.exists(bootstrapResamplingDistributionFilename)) {
                // ---- Argument order mirrors the load of the main file above.
                // ---- The distribution file has no weight-column option, so -1
                // ---- (the default of --weightColumnIndex) is passed to keep the
                // ---- lines-to-skip value in the last position.
                // ---- NOTE(review): assumes the last two parameters are
                // ---- (weightColumnIndex, linesToSkip) as in the call above -
                // ---- confirm against LoadDataWithSubwordFeaturizer's signature.
                const dataWithSubwordFeaturizerBootstrapResampling: DataWithSubwordFeaturizer =
                    await DataWithSubwordFeaturizerUtility.LoadDataWithSubwordFeaturizer(
                        bootstrapResamplingDistributionFilename,
                        null,
                        true,
                        filetype,
                        bootstrapResamplingDistributionFileLabelColumnIndex,
                        bootstrapResamplingDistributionFileTextColumnIndex,
                        -1,
                        bootstrapResamplingDistributionFileLinesToSkip);
                const bootstrapResamplingIntentInstanceIndexMapArray: Map<string, number[]> =
                    dataWithSubwordFeaturizerBootstrapResampling.getIntentInstanceIndexMapArray();
                for (const [intentLabel, instanceIndexes] of bootstrapResamplingIntentInstanceIndexMapArray) {
                    bootstrapResamplingDistribution.set(intentLabel, instanceIndexes.length);
                }
            }
        }
        // -------------------------------------------------------------------
        // ---- Run the learner on the loaded data.
        const aalResult: {
            "newDataWithSubwordFeaturizer": DataWithSubwordFeaturizer,
            "learner": SoftmaxRegressionSparse,
            "seedingInstanceIndexArray": number[],
            "seedingInstanceIndexArrayInitial": number[],
            } = await AppAutoActiveLearner.mainAutoActiveLearnerWithDataWithSubwordFeaturizer(
            dataWithSubwordFeaturizer,
            labelColumnIndex,
            textColumnIndex,
            weightColumnIndex,
            linesToSkip,
            doBootstrapResampling,
            bootstrapResamplingDistribution,
            doAutoActiveLearning,
            aalLimitInitialNumberOfInstancesPerCategory,
            aalNumberOfInstancesPerIteration,
            aalInstanceSelectionThreshold,
            learnerParameterEpochs,
            learnerParameterMiniBatchSize,
            learnerParameterL1Regularization,
            learnerParameterL2Regularization,
            learnerParameterLossEarlyStopRatio,
            learnerParameterLearningRate,
            learnerParameterToCalculateOverallLossAfterEpoch,
            limitingSampleSize);
        const newDataWithSubwordFeaturizer: DataWithSubwordFeaturizer =
            aalResult.newDataWithSubwordFeaturizer;
        const learner: SoftmaxRegressionSparse =
            aalResult.learner;
        // -------------------------------------------------------------------
        // ---- Collects the name of every file actually written.
        const outputFilenames: string[] = [];
        // tslint:disable-next-line: max-line-length
        // ---- NOTE-PACE-HOLDER ---- const outputFilenameDump: string = newDataWithSubwordFeaturizer.dumpLuLuisJsonStructureInLuFormat(
        // ---- NOTE-PACE-HOLDER ----     outputFilename);
        // ---- NOTE-PACE-HOLDER ---- outputFilenames.push(outputFilenameDump);
        // ---- NOTE-PACE-HOLDER ---- const outputFilenameLuis: string =
        // ---- NOTE-PACE-HOLDER ----     outputFilename + ".luis";
        // tslint:disable-next-line: max-line-length
        // ---- NOTE-PACE-HOLDER ---- const outputFilenameLuisAfterDumpfile = newDataWithSubwordFeaturizer.dumpLuLuisJsonStructure(
        // ---- NOTE-PACE-HOLDER ----     outputFilenameLuis, undefined, 4);
        // ---- NOTE-PACE-HOLDER ---- outputFilenames.push(outputFilenameLuisAfterDumpfile);
        if (!Utility.isEmptyString(outputModelFilename)) {
            const outputModelFilenameAfterDumpfile: string = Utility.dumpFile(
                outputModelFilename,
                learner.serializeToJsonString(undefined, 4));
            outputFilenames.push(outputModelFilenameAfterDumpfile);
        }
        if (!Utility.isEmptyString(outputFeaturizerFilename)) {
            const outputFeaturizerFilenameAfterDumpfile: string = Utility.dumpFile(
                outputFeaturizerFilename,
                newDataWithSubwordFeaturizer.getFeaturizer().serializeToJsonString(undefined, 4));
            outputFilenames.push(outputFeaturizerFilenameAfterDumpfile);
        }
        // -------------------------------------------------------------------
        const dateTimeEndInString: string = (new Date()).toISOString();
        // -------------------------------------------------------------------
        Utility.debuggingLog(
            `dateTimeBeginInString=${dateTimeBeginInString}`);
        Utility.debuggingLog(
            `dateTimeEndInString=${dateTimeEndInString}`);
        // -------------------------------------------------------------------
        return outputFilenames;
    }