def build_stacked_datasets()

in mozetl/taar/taar_ensemble.py [0:0]


def build_stacked_datasets(dataset, folds):
    """Build the stacked training RDDs for every fold in ``folds``.

    Each fold takes a turn as the held-out fold.  For that turn, every
    other fold (every element ``f`` of ``folds`` with ``f != fold``) is
    converted to an RDD by mapping ``to_stacked_row`` over ``f.rdd`` and
    dropping results that are ``None``.

    Args:
        dataset: Not referenced by this function; kept in the signature
            so existing callers continue to work.
        folds: Sized, re-iterable collection of objects exposing an
            ``.rdd`` attribute (presumably Spark DataFrames -- confirm
            against the caller).

    Returns:
        A list with one entry per fold, in the order of ``folds``.  Each
        entry is itself a list holding one filtered RDD per *other* fold,
        so with k distinct folds the result is k lists of k - 1 RDDs.
    """
    # Recommenders are loaded once per call, before any fold is touched.
    recommenders = load_recommenders()

    # The position of each recommender in this list is passed through
    # unchanged to `to_stacked_row`.
    recommend_fns = [
        recommenders[COLLABORATIVE].recommend,
        recommenders[SIMILARITY].recommend,
        recommenders[LOCALE].recommend,
    ]

    def stack_client_row(client_row):
        # Only the list of bound `recommend` methods is captured here.
        return to_stacked_row(recommend_fns, client_row)

    def has_result(stacked_row):
        return stacked_row is not None

    def stack_frame(frame):
        return frame.rdd.map(stack_client_row).filter(has_result)

    print("Number of folds: {}".format(len(folds)))

    per_fold_stacks = []
    for held_out in folds:
        # Select the training folds first, then build their RDDs.
        training_frames = [
            candidate for candidate in folds if candidate != held_out
        ]
        per_fold_stacks.append(
            [stack_frame(frame) for frame in training_frames]
        )
    return per_fold_stacks