# __preprocess_hps() — excerpt from datawig/_hpo.py

    def __preprocess_hps(self,
                         train_df: pd.DataFrame,
                         simple_imputer,
                         num_evals: int) -> pd.DataFrame:
        """
        Generates list of all possible combinations of hyperparameter from the nested hp dictionary.
        Requires the data to check whether the relevant columns are present and have the appropriate type.

        :param train_df: training data as dataframe
        :param simple_imputer: Parent instance of SimpleImputer
        :param num_evals: maximum number of hpo configurations to consider.

        :return: Data frame where each row is a hyperparameter configuration and each column is a parameter.
                    Column names have the form colum:parameter, e.g. title:max_tokens or global:learning rate.
        """

        # Default hyperparameter choices, keyed by scope: 'global' applies to the
        # whole model; 'string'/'categorical'/'numeric' apply to columns of that
        # type. Each leaf is a list of candidate values for the grid.
        default_hps = {
            'global': {
                'learning_rate': [4e-3],
                'weight_decay': [0],
                'num_epochs': [100],
                'patience': [5],
                'batch_size': [16],
                'final_fc_hidden_units': [[]],
            },
            'string': {
                'max_tokens': [],  # e.g. [2 ** exp for exp in [12, 15, 18]]
                'tokens': [],  # e.g. [['chars'], ['words']]
                'ngram_range': {
                    'words': [(1, 3)],
                    'chars': [(1, 5)],
                },
            },
            'categorical': {
                'max_tokens': [2 ** 12],
                'embed_dim': [10],
            },
            'numeric': {
                'normalize': [True],
                'numeric_latent_dim': [10],
                'numeric_hidden_layers': [1],
            },
        }

        # create empty dict if global hps not passed
        self.hps.setdefault('global', {})

        # Merge data type default parameters with the ones in self.hps, giving
        # precedence to the parameters specified in self.hps.
        # NOTE: use .get() so a missing type section in self.hps (e.g. the user
        # never passed 'numeric' hps) does not raise KeyError.
        for data_type in ('string', 'categorical', 'numeric'):
            user_type_hps = self.hps.get(data_type, {})
            for parameter_key in default_hps[data_type]:
                if parameter_key in user_type_hps:
                    default_hps[data_type][parameter_key] = user_type_hps[parameter_key]

        # add type to column dictionaries if it was not specified, does not support categorical types
        for column_name in simple_imputer.input_columns:
            if column_name not in self.hps.keys():
                self.hps[column_name] = {}
            if 'type' not in self.hps[column_name].keys():
                if is_numeric_dtype(train_df[column_name]):
                    self.hps[column_name]['type'] = ['numeric']
                else:
                    self.hps[column_name]['type'] = ['string']

            # merge column hyper parameters with feature type specific defaults
            for parameter_key, values in default_hps[self.hps[column_name]['type'][0]].items():
                if parameter_key not in self.hps[column_name]:
                    self.hps[column_name][parameter_key] = values

        # All of the data type specific parameters have been copied to the column
        # encoder parameters; drop the (possibly absent) type-level sections.
        # pop(..., None) tolerates sections the user never supplied.
        for data_type in ('string', 'numeric', 'categorical'):
            self.hps.pop(data_type, None)

        # merge global parameters with defaults
        for parameter_key, values in default_hps['global'].items():
            if parameter_key not in self.hps['global']:
                self.hps['global'][parameter_key] = values

        # Flatten {column: {param: values}} into {"column:param": values} and
        # sample up to num_evals configurations from the cartesian product.
        flat_dict = flatten_dict(self.hps)

        hp_df = pd.DataFrame(
            random_cartesian_product(list(flat_dict.values()), num=num_evals),
            columns=list(flat_dict.keys())
        )

        return hp_df