def __fit_hp()

in datawig/_hpo.py [0:0]


    def __fit_hp(self,
                 train_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 hp: pd.Series,
                 simple_imputer,
                 name: str,
                 user_defined_scores: list = None) -> pd.core.series.Series:
        """
        Initialise an Imputer for one hyperparameter configuration, fit it, and
        append the evaluation metrics to the configuration series.

        :param train_df: training data as dataframe
        :param test_df: test data as dataframe; if not provided, a ratio of test_split of the
                          training data are used as test data
        :param hp: pd.Series with hyperparameter configuration
        :param simple_imputer: SimpleImputer instance from which to inherit column names etc.
        :param name: identifier for the current setting of hps; defaults to a timestamp.
        :param user_defined_scores: list with entries (Callable, str), where callable is a function
                          accepting arguments (true, predicted, confidence). True is an array with the true labels,
                          predicted with the predicted labels and confidence is an array with the confidence score for
                          each prediction.
                          Default metrics are:
                          f1_weighted, f1_micro, f1_macro, f1_weighted_train
                          recall_weighted, recall_weighted_train, precision_weighted, precision_weighted_train,
                          coverage_at_90, coverage_at_90_train, empirical_precision_at_90,
                          ece_pre_calibration (ece: expected calibration error), ece_post_calibration, time [min].
                          A user defined function could look as follows:

                          def my_function(true, predicted, confidence):
                               return (true[confidence > .75] == predicted[confidence > .75]).mean()

                          uds = (my_function, 'empirical_precision_above_75')

        :return: Series with hpo parameters and results.

        """

        from . import Imputer  # needs to be imported here to avoid circular dependency
        # precision_score lives alongside f1_score/recall_score; imported locally so the
        # precision-metric fix below is self-contained.
        from sklearn.metrics import precision_score

        if not name:
            name = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        data_encoders = []
        data_featurizers = []

        # define column encoders and featurisers for each input column
        for input_column in simple_imputer.input_columns:

            # extract parameters for the current input column, take everything after the first colon
            col_parms = {':'.join(key.split(':')[1:]): val for key, val in hp.items() if key.startswith(input_column)}

            # define all input columns
            if col_parms['type'] == 'string':
                # iterate over multiple embeddings (chars + strings for the same column)
                for token in col_parms['tokens']:
                    encoder = TfIdfEncoder if simple_imputer.is_explainable else BowEncoder
                    # call kw. args. with: **{key: item for key, item in col_parms.items() if not key == 'type'})]
                    data_encoders += [encoder(input_columns=[input_column],
                                              output_column=input_column + '_' + token,
                                              tokens=token,
                                              ngram_range=col_parms['ngram_range:' + token],
                                              max_tokens=col_parms['max_tokens'])]
                    data_featurizers += [BowFeaturizer(field_name=input_column + '_' + token,
                                                       max_tokens=col_parms['max_tokens'])]

            elif col_parms['type'] == 'categorical':
                data_encoders += [CategoricalEncoder(input_columns=[input_column],
                                                     output_column=input_column + '_' + col_parms['type'],
                                                     max_tokens=col_parms['max_tokens'])]
                data_featurizers += [EmbeddingFeaturizer(field_name=input_column + '_' + col_parms['type'],
                                                         max_tokens=col_parms['max_tokens'],
                                                         embed_dim=col_parms['embed_dim'])]

            elif col_parms['type'] == 'numeric':
                data_encoders += [NumericalEncoder(input_columns=[input_column],
                                                   output_column=input_column + '_' + col_parms['type'],
                                                   normalize=col_parms['normalize'])]
                data_featurizers += [NumericalFeaturizer(field_name=input_column + '_' + col_parms['type'],
                                                         numeric_latent_dim=col_parms['numeric_latent_dim'],
                                                         numeric_hidden_layers=col_parms['numeric_hidden_layers'])]
            else:
                logger.warning('Found unknown column type. Candidates are string, categorical, numeric.')

        # Define separate encoder and featurizer for each column
        # Define output column. Associated parameters are not tuned.
        if is_numeric_dtype(train_df[simple_imputer.output_column]):
            label_column = [NumericalEncoder(simple_imputer.output_column)]
            logger.debug("Assuming numeric output column: {}".format(simple_imputer.output_column))
        else:
            label_column = [CategoricalEncoder(simple_imputer.output_column)]
            logger.debug("Assuming categorical output column: {}".format(simple_imputer.output_column))

        # note: .items(), not the deprecated .iteritems() (removed in pandas 2.0);
        # consistent with the per-column loop above
        global_parms = {key.split(':')[1]: val for key, val in hp.items() if key.startswith('global:')}

        hp_time = time.time()

        hp_imputer = Imputer(data_encoders=data_encoders,
                             data_featurizers=data_featurizers,
                             label_encoders=label_column,
                             output_path=self.output_path + name)

        hp_imputer.fit(train_df=train_df,
                       test_df=test_df,
                       ctx=get_context(),
                       learning_rate=global_parms['learning_rate'],
                       num_epochs=global_parms['num_epochs'],
                       patience=global_parms['patience'],
                       test_split=.1,
                       weight_decay=global_parms['weight_decay'],
                       batch_size=global_parms['batch_size'],
                       final_fc_hidden_units=global_parms['final_fc_hidden_units'],
                       calibrate=True)

        # add suitable metrics to hp series
        imputed = hp_imputer.predict(test_df)
        true = imputed[simple_imputer.output_column]
        predicted = imputed[simple_imputer.output_column + '_imputed']

        # evaluate train metrics on a capped sample to bound prediction cost
        imputed_train = hp_imputer.predict(train_df.sample(min(train_df.shape[0], int(1e4))))
        true_train = imputed_train[simple_imputer.output_column]
        predicted_train = imputed_train[simple_imputer.output_column + '_imputed']

        if is_numeric_dtype(train_df[simple_imputer.output_column]):
            hp['mse'] = mean_squared_error(true, predicted)
            hp['mse_train'] = mean_squared_error(true_train, predicted_train)
            # numeric imputation yields no class probabilities; user-defined scores
            # receive NaN confidence
            confidence = float('nan')
        else:
            confidence = imputed[simple_imputer.output_column + '_imputed_proba']
            confidence_train = imputed_train[simple_imputer.output_column + '_imputed_proba']
            hp['f1_micro'] = f1_score(true, predicted, average='micro')
            hp['f1_macro'] = f1_score(true, predicted, average='macro')
            hp['f1_weighted'] = f1_score(true, predicted, average='weighted')
            hp['f1_weighted_train'] = f1_score(true_train, predicted_train, average='weighted')
            # fixed: these previously recomputed f1_score under a precision label
            hp['precision_weighted'] = precision_score(true, predicted, average='weighted')
            hp['precision_weighted_train'] = precision_score(true_train, predicted_train, average='weighted')
            hp['recall_weighted'] = recall_score(true, predicted, average='weighted')
            hp['recall_weighted_train'] = recall_score(true_train, predicted_train, average='weighted')
            hp['coverage_at_90'] = (confidence > .9).mean()
            hp['coverage_at_90_train'] = (confidence_train > .9).mean()
            hp['empirical_precision_at_90'] = (predicted[confidence > .9] == true[confidence > .9]).mean()
            cal_info = hp_imputer.calibration_info
            # NOTE(review): original stored 'ece_post' under BOTH keys; 'ece_pre' is
            # presumably the intended pre-calibration error — fall back to 'ece_post'
            # if the imputer does not expose it. Confirm against Imputer.calibration_info.
            hp['ece_pre_calibration'] = cal_info.get('ece_pre', cal_info['ece_post'])
            hp['ece_post_calibration'] = cal_info['ece_post']

        # fixed: recorded for numeric outputs too (was unreachable in the mse branch)
        hp['time [min]'] = (time.time() - hp_time) / 60

        if user_defined_scores:
            for uds in user_defined_scores:
                hp[uds[1]] = uds[0](true=true, predicted=predicted, confidence=confidence)

        hp_imputer.save()

        return hp