in datawig/simple_imputer.py [0:0]
def fit_hpo(self,
train_df: pd.DataFrame,
test_df: pd.DataFrame = None,
hps: dict = None,
num_evals: int = 10,
max_running_hours: float = 96.0,
hpo_run_name: str = None,
user_defined_scores: list = None,
num_epochs: int = None,
patience: int = None,
test_split: float = .2,
weight_decay: List[float] = None,
batch_size: int = 16,
num_hash_bucket_candidates: List[float] = [2 ** exp for exp in [12, 15, 18]],
tokens_candidates: List[str] = ['words', 'chars'],
numeric_latent_dim_candidates: List[int] = None,
numeric_hidden_layers_candidates: List[int] = None,
final_fc_hidden_units: List[List[int]] = None,
learning_rate_candidates: List[float] = None,
normalize_numeric: bool = True,
hpo_max_train_samples: int = None,
ctx: mx.context = get_context()) -> Any:
"""
Fits an imputer model with hyperparameter optimization. Parameter combinations are sampled randomly from the specified grids.
Grids are specified either via the legacy *_candidates arguments
or, with more flexibility, via the nested dictionary hps.
:param train_df: training data as dataframe
:param test_df: test data as dataframe; if not provided, a fraction test_split of the
training data is used as test data
:param hps: nested dictionary where hps['global'][parameter_name] is a list of candidate values
for a global parameter and hps[column_name][parameter_name] is a list of candidate
values for each input column. Further, hps[column_name]['type'] is in
['numeric', 'categorical', 'string'] and is inferred if not provided
(see the example below).
:param num_evals: number of evaluations for random search
:param max_running_hours: Time before the hpo run is terminated in hours.
:param hpo_run_name: string to identify the current hpo run.
:param user_defined_scores: list with entries (Callable, str), where the callable is a
scoring function accepting the keyword arguments true, predicted and confidence.
Allows custom scoring functions (see the example below).
Below are parameters of the old implementation, kept for backwards compatibility.
:param num_epochs: maximum number of training epochs (default 10)
:param patience: used for early stopping; after [patience] epochs with no improvement,
training is stopped. (default 3)
:param test_split: if no test_df is provided this is the ratio of test data to be held
separate for determining model convergence
:param weight_decay: regularizer (default 0)
:param batch_size: mini-batch size for training (default 16)
:param num_hash_bucket_candidates: candidates for the number of hash buckets used for
string and categorical features (default [2**12, 2**15, 2**18])
:param tokens_candidates: candidates for tokenization (default ['words', 'chars'])
:param numeric_latent_dim_candidates: candidates for latent dimensionality of
numerical features (default [10, 50, 100])
:param numeric_hidden_layers_candidates: candidates for number of hidden layers of
numerical features (default [0, 1, 2])
:param final_fc_hidden_units: list of lists w/ dimensions for FC layers after the
final concatenation (NOTE: for HPO, this expects a list of lists)
:param learning_rate_candidates: candidates for learning rate (default [1e-1, 1e-2, 1e-3])
:param normalize_numeric: boolean indicating whether or not to normalize numeric values
:param hpo_max_train_samples: maximum number of training samples used during hyperparameter
optimization. This parameter is deprecated.
:param ctx: list of mxnet contexts (if no GPUs are available, defaults to [mx.cpu()]).
Users can also pass in a list of GPUs to be used, e.g. [mx.gpu(0), mx.gpu(2), mx.gpu(4)].
This parameter is deprecated.
:return: the fitted SimpleImputer instance (self)
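
Example (an illustrative sketch, not a prescribed configuration: the column name 'title',
the variables imputer and train_df, and the accuracy scorer are hypothetical; the column
type is inferred here because no 'type' key is given):

    hps = {
        'global': {'learning_rate': [1e-3, 1e-4], 'num_epochs': [10]},
        'title': {'max_tokens': [2 ** 15], 'tokens': [['words'], ['chars']]}
    }

    def accuracy(**kwargs):
        true, predicted = kwargs['true'], kwargs['predicted']
        return sum(t == p for t, p in zip(true, predicted)) / len(true)

    imputer.fit_hpo(train_df, hps=hps, user_defined_scores=[(accuracy, 'accuracy')])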
"""
# generate dictionary with default hyperparameter settings. Overwrite these defaults
# with configurations that were passed via this function's API wherever applicable.
default_hps = dict()
default_hps['global'] = dict()
if learning_rate_candidates:
default_hps['global']['learning_rate'] = learning_rate_candidates
if weight_decay:
default_hps['global']['weight_decay'] = weight_decay
if num_epochs:
default_hps['global']['num_epochs'] = [num_epochs]
if patience:
default_hps['global']['patience'] = [patience]
if batch_size:
default_hps['global']['batch_size'] = [batch_size]
if final_fc_hidden_units:
default_hps['global']['final_fc_hidden_units'] = final_fc_hidden_units
default_hps['string'] = {}
if num_hash_bucket_candidates:
default_hps['string']['max_tokens'] = num_hash_bucket_candidates
if tokens_candidates:
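# wrap each tokenization option so that every grid candidate is itself a
# (single-element) list of token types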
default_hps['string']['tokens'] = [[c] for c in tokens_candidates]
default_hps['categorical'] = {}
if num_hash_bucket_candidates:
default_hps['categorical']['max_tokens'] = num_hash_bucket_candidates
default_hps['numeric'] = {}
if normalize_numeric:
default_hps['numeric']['normalize'] = [normalize_numeric]
if numeric_latent_dim_candidates:
default_hps['numeric']['numeric_latent_dim'] = numeric_latent_dim_candidates
if numeric_hidden_layers_candidates:
default_hps['numeric']['numeric_hidden_layers'] = numeric_hidden_layers_candidates
if hps is None:
hps = {}
# give parameters in `hps` precedence over default parameters
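# precedence is resolved per top-level section: if `hps` already contains e.g. 'global',
# the entire default 'global' grid is dropped rather than merged key-by-key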
parameters_in_both = set(default_hps.keys()).intersection(set(hps.keys()))
for param in parameters_in_both:
del default_hps[param]
hps = merge_dicts(hps, default_hps)
if user_defined_scores is None:
user_defined_scores = []
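# if no explicit test set was provided, hold out a test_split fraction of the
# training rows for model selection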
if test_df is None:
train_df, test_df = random_split(train_df, [1 - test_split, test_split])
self.check_data_types(train_df)  # infer data types; sets self.string_columns and self.numeric_columns
self.hpo.tune(train_df, test_df, hps, num_evals, max_running_hours, user_defined_scores, hpo_run_name, self)
self.save()
return self