public static async mainAutoActiveLearnerWithLuContent()

in packages/dispatcher/src/model/supervised/classifier/auto_active_learning/AppAutoActiveLearner.ts [293:491]


    /**
     * Featurizes LU content, optionally bootstrap-resamples it, trains a
     * SoftmaxRegressionSparse learner through AutoActiveLearner, and returns
     * the learner together with a copy of the data filtered to the selected
     * instance indexes.
     *
     * Processing order:
     *   1. Create a LuDataWithSubwordFeaturizer from `luContent` using a fresh
     *      NgramSubwordFeaturizer.
     *   2. When `doBootstrapResampling` is true, replace that data with the
     *      result of createDataFromSamplingExistingDataUtterances() over the
     *      indexes drawn by a BootstrapSamplerKeyMapDistribution.
     *   3. Call collectSmallUtteranceIndexSetCoveringAllIntentEntityLabels()
     *      and feed its intent map and remaining set into
     *      collectUtteranceIndexSetSeedingIntentTrainingSet().
     *   4. Flatten the seeding map into an index array and call
     *      AutoActiveLearner.learn() with it and the remaining candidates.
     *   5. When `limitingSampleSize` exceeds the number of seeding utterances,
     *      pass the learned index array through a ReservoirArraySampler.
     *   6. Build the filtered LuDataWithSubwordFeaturizer from the final
     *      index array.
     *
     * @param luContent LU content handed to createLuDataWithSubwordFeaturizer().
     * @param doBootstrapResampling Enables step 2.
     * @param brsDistribution Forwarded to the BootstrapSamplerKeyMapDistribution
     *     constructor; only read when `doBootstrapResampling` is true.
     * @param doAutoActiveLearning Forwarded to AutoActiveLearner. When false,
     *     -1 is used in place of `aalLimitInitialNumberOfInstancesPerCategory`.
     * @param aalLimitInitialNumberOfInstancesPerCategory Forwarded to
     *     collectUtteranceIndexSetSeedingIntentTrainingSet() and to
     *     AutoActiveLearner (subject to the -1 override above).
     * @param aalNumberOfInstancesPerIteration Forwarded to AutoActiveLearner.
     * @param aalInstanceSelectionThreshold Forwarded to AutoActiveLearner.
     * @param learnerParameterEpochs Forwarded to AutoActiveLearner.
     * @param learnerParameterMiniBatchSize Forwarded to AutoActiveLearner.
     * @param learnerParameterL1Regularization Forwarded to AutoActiveLearner.
     * @param learnerParameterL2Regularization Forwarded to AutoActiveLearner.
     * @param learnerParameterLossEarlyStopRatio Forwarded to AutoActiveLearner.
     * @param learnerParameterLearningRate Forwarded to AutoActiveLearner.
     * @param learnerParameterToCalculateOverallLossAfterEpoch Forwarded to
     *     AutoActiveLearner.
     * @param limitingSampleSize Threshold and budget used in step 5.
     * @returns The filtered data (`newLuDataWithSubwordFeaturizer`), the
     *     trained `learner`, the final index array (`seedingInstanceIndexArray`),
     *     and the flattened seeding index array computed before learning
     *     (`seedingInstanceIndexArrayInitial`).
     */
    public static async mainAutoActiveLearnerWithLuContent(
        luContent: string,
        doBootstrapResampling: boolean =
            AppAutoActiveLearner.defaultDoBootstrapResampling,
        brsDistribution: TMapStringKeyGenericValue<number> =
            DictionaryMapUtility.newTMapStringKeyGenericValue<number>(),
        doAutoActiveLearning: boolean =
            AutoActiveLearner.defaultDoAutoActiveLearning,
        aalLimitInitialNumberOfInstancesPerCategory: number =
            AutoActiveLearner.defaultAalLimitInitialNumberOfInstancesPerCategory,
        aalNumberOfInstancesPerIteration: number =
            AutoActiveLearner.defaultAalNumberOfInstancesPerIteration,
        aalInstanceSelectionThreshold: number =
            AutoActiveLearner.defaultAalInstanceSelectionThreshold,
        learnerParameterEpochs: number =
            AppSoftmaxRegressionSparse.defaultEpochs,
        learnerParameterMiniBatchSize: number =
            AppSoftmaxRegressionSparse.defaultMiniBatchSize,
        learnerParameterL1Regularization: number =
            AppSoftmaxRegressionSparse.defaultL1Regularization,
        learnerParameterL2Regularization: number =
            AppSoftmaxRegressionSparse.defaultL2Regularization,
        learnerParameterLossEarlyStopRatio: number =
            AppSoftmaxRegressionSparse.defaultLossEarlyStopRatio,
        learnerParameterLearningRate: number =
            AppSoftmaxRegressionSparse.defaultLearningRate,
        learnerParameterToCalculateOverallLossAfterEpoch: boolean =
            true,
        limitingSampleSize: number =
            DefaultLimitingSampleSize): Promise<{
            "newLuDataWithSubwordFeaturizer": LuDataWithSubwordFeaturizer,
            "learner": SoftmaxRegressionSparse,
            "seedingInstanceIndexArray": number[],
            "seedingInstanceIndexArrayInitial": number[],
            }> {
        // ---- Step 1: featurize the LU content. --------------------------------
        let luData: LuDataWithSubwordFeaturizer =
            await LuDataWithSubwordFeaturizer.createLuDataWithSubwordFeaturizer(
                luContent,
                new NgramSubwordFeaturizer(),
                true);

        // ---- Step 2: optional bootstrap resampling. ---------------------------
        if (doBootstrapResampling) {
            // ---- NOTE-FOR-REFERENCE ---- a plain BootstrapSamplerKeyMap<number> built from
            // ---- NOTE-FOR-REFERENCE ---- getIntentInstanceIndexMapArray() alone was the
            // ---- NOTE-FOR-REFERENCE ---- alternative kept here as commented-out code.
            const bootstrapSampler: BootstrapSamplerKeyMapDistribution<number> =
                new BootstrapSamplerKeyMapDistribution<number>(
                    brsDistribution,
                    luData.getIntentInstanceIndexMapArray());
            const instanceIndexMapJson =
                Utility.mapToJsonSerialization(luData.getIntentInstanceIndexMapArray());
            Utility.debuggingLog(
                `luDataWithSubwordFeaturizer.getIntentInstanceIndexMapArray()=${instanceIndexMapJson}`);
            const perLabelSamplingCount =
                bootstrapSampler.computeSamplingNumberInstancesPerLabel();
            Utility.debuggingLog(
                `bootstrapSamplerKeyMap.computeSamplingNumberInstancesPerLabel()=${perLabelSamplingCount}`);
            // ---- NOTE-FOR-DEBUGGING ---- iterate bootstrapSampler.sampleInstances() and log
            // ---- NOTE-FOR-DEBUGGING ---- each element to inspect the drawn indexes one by one.
            const resampledIndexes: number[] =
                [...bootstrapSampler.sampleInstances()];
            Utility.debuggingLog(`samplingIndexArray.length=${resampledIndexes.length}`);
            const resampledData: DataWithSubwordFeaturizer =
                await luData.createDataFromSamplingExistingDataUtterances(
                    luData,
                    -1, // ---- NOTE-NO-NEED-FOR-LuDataWithSubwordFeaturizer ---- labelColumnIndex,
                    -1, // ---- NOTE-NO-NEED-FOR-LuDataWithSubwordFeaturizer ---- textColumnIndex,
                    -1, // ---- NOTE-NO-NEED-FOR-LuDataWithSubwordFeaturizer ---- weightColumnIndex,
                    -1, // ---- NOTE-NO-NEED-FOR-LuDataWithSubwordFeaturizer ---- linesToSkip,
                    resampledIndexes,
                    false) as DataWithSubwordFeaturizer;
            // From here on, every step works on the resampled data.
            luData = resampledData as LuDataWithSubwordFeaturizer;
        }

        // ---- Step 3a: collect the small covering set and what remains. -----
        const {
            smallUtteranceIndexIntentMapCoveringAllIntentEntityLabels: coveringIntentMap,
            smallUtteranceIndexEntityTypeMapCoveringAllIntentEntityLabels: coveringEntityTypeMap,
            smallUtteranceIndexSetCoveringAllIntentEntityLabels: coveringIndexSet,
            remainingUtteranceIndexSet: uncoveredIndexSet,
        } = luData.collectSmallUtteranceIndexSetCoveringAllIntentEntityLabels();
        const coveringIntentMapJson = Utility.stringMapSetToJson(coveringIntentMap);
        Utility.debuggingLog(
            `smallUtteranceIndexIntentMapCoveringAllIntentEntityLabels=${coveringIntentMapJson}`);
        const coveringEntityTypeMapJson = Utility.stringMapSetToJson(coveringEntityTypeMap);
        Utility.debuggingLog(
            `smallUtteranceIndexEntityTypeMapCoveringAllIntentEntityLabels=${coveringEntityTypeMapJson}`);
        const coveringIndexSetJson = Utility.setToJsonSerialization(coveringIndexSet);
        Utility.debuggingLog(
            `smallUtteranceIndexSetCoveringAllIntentEntityLabels=${coveringIndexSetJson}`);
        const uncoveredIndexSetJson = Utility.setToJsonSerialization(uncoveredIndexSet);
        Utility.debuggingLog(`remainingUtteranceIndexSet=${uncoveredIndexSetJson}`);
        Utility.debuggingLog(
            `smallUtteranceIndexSetCoveringAllIntentEntityLabels.size=${coveringIndexSet.size}`);
        Utility.debuggingLog(`remainingUtteranceIndexSet.size=${uncoveredIndexSet.size}`);

        // ---- Step 3b: derive the seeding map and the candidate sets. -------
        // With auto active learning switched off, -1 replaces the caller's
        // per-category limit for both the seeding call and the learner.
        const initialPerCategoryLimit: number =
            doAutoActiveLearning ? aalLimitInitialNumberOfInstancesPerCategory : -1;
        const {
            seedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels: seedingIntentMap,
            candidateUtteranceIndexSetSampled: sampledCandidates,
            candidateUtteranceIndexSetRemaining: remainingCandidates,
        }: {
            "seedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels": Map<string, Set<number>>,
            "candidateUtteranceIndexSetSampled": Set<number>,
            "candidateUtteranceIndexSetRemaining": Set<number>,
        } = luData.collectUtteranceIndexSetSeedingIntentTrainingSet(
            coveringIntentMap,
            uncoveredIndexSet,
            initialPerCategoryLimit);
        const seedingIntentMapJson = Utility.stringMapSetToJson(seedingIntentMap);
        Utility.debuggingLog(
            `seedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels=${seedingIntentMapJson}`);
        const sampledCandidatesJson = Utility.setToJsonSerialization(sampledCandidates);
        Utility.debuggingLog(`candidateUtteranceIndexSetSampled=${sampledCandidatesJson}`);
        const remainingCandidatesJson = Utility.setToJsonSerialization(remainingCandidates);
        Utility.debuggingLog(`candidateUtteranceIndexSetRemaining=${remainingCandidatesJson}`);
        Utility.debuggingLog(`candidateUtteranceIndexSetSampled.size=${sampledCandidates.size}`);
        Utility.debuggingLog(`candidateUtteranceIndexSetRemaining.size=${remainingCandidates.size}`);

        // ---- Step 4a: flatten the seeding map (in map iteration order). ----
        let seedingEntryCount: number = 0;
        const seedUtteranceIndexes: number[] = [];
        seedingIntentMap.forEach((indexSet: Set<number>) => {
            seedingEntryCount += indexSet.size;
            indexSet.forEach((utteranceIndex: number) => {
                seedUtteranceIndexes.push(utteranceIndex);
            });
        });
        Utility.debuggingLog(
            `countSeedingUtteranceIndexIntentMapCoveringAllIntentEntityLabels=${seedingEntryCount}`);
        Utility.debuggingLog(`seedingUtteranceIndexArray.length=${seedUtteranceIndexes.length}`);

        // ---- Step 4b: train. ----------------------------------------------
        // The learner gets a clone, so the flattened array above can still be
        // returned as the initial seeding array.
        const seedInstanceIndexes: number[] =
            Utility.cloneArray(seedUtteranceIndexes);
        const intentLabelIndexes: number[] =
            luData.getIntentLabelIndexArray();
        const featureIndexArrays: number[][] =
            luData.getUtteranceFeatureIndexArrays();
        const activeLearner: AutoActiveLearner =
            new AutoActiveLearner(
                doAutoActiveLearning,
                initialPerCategoryLimit,
                aalNumberOfInstancesPerIteration,
                aalInstanceSelectionThreshold,
                learnerParameterEpochs,
                learnerParameterMiniBatchSize,
                learnerParameterL1Regularization,
                learnerParameterL2Regularization,
                learnerParameterLossEarlyStopRatio,
                learnerParameterLearningRate,
                learnerParameterToCalculateOverallLossAfterEpoch);
        const {
            seedingInstanceIndexArray: learnedInstanceIndexes,
            learner,
        }: {
            "seedingInstanceIndexArray": number[],
            "learner": SoftmaxRegressionSparse,
        } = activeLearner.learn(
            luData.getFeaturizerLabels(),
            luData.getFeaturizerLabelMap(),
            luData.getFeaturizer().getNumberLabels(),
            luData.getFeaturizer().getNumberFeatures(),
            intentLabelIndexes,
            featureIndexArrays,
            seedInstanceIndexes,
            Array.from(remainingCandidates));

        // ---- Step 5: optional reservoir sampling of the learned indexes. ---
        const preSelectedCount: number = seedUtteranceIndexes.length;
        let selectedInstanceIndexes: number[] = learnedInstanceIndexes;
        if (limitingSampleSize > preSelectedCount) {
            // Only the budget left over after the seeding utterances is requested.
            const remainingBudget: number = limitingSampleSize - preSelectedCount;
            const reservoirSampler: ReservoirArraySampler<number> = new ReservoirArraySampler(
                selectedInstanceIndexes,
                preSelectedCount);
            selectedInstanceIndexes =
                [...reservoirSampler.sampleInstances(remainingBudget)];
        }

        // ---- Step 6: filter the data down to the final selection. ----------
        const filteredLuData: LuDataWithSubwordFeaturizer =
            await LuDataWithSubwordFeaturizer.createLuDataWithSubwordFeaturizerFromFilteringExistingLuDataUtterances(
                luData,
                new Set<number>(selectedInstanceIndexes),
                false);
        return {
            newLuDataWithSubwordFeaturizer: filteredLuData,
            learner,
            seedingInstanceIndexArray: selectedInstanceIndexes,
            seedingInstanceIndexArrayInitial: seedUtteranceIndexes,
        };
    }