in datawig/simple_imputer.py [0:0]
def fit_hpo(self,
train_df: pd.DataFrame,
test_df: pd.DataFrame = None,
hps: dict = None,
num_evals: int = 10,
max_running_hours: float = 96.0,
hpo_run_name: str = None,
user_defined_scores: list = None,
num_epochs: int = None,
patience: int = None,
test_split: float = .2,
weight_decay: List[float] = None,
batch_size: int = 16,
num_hash_bucket_candidates: List[float] = [2 ** exp for exp in [12, 15, 18]],
tokens_candidates: List[str] = ['words', 'chars'],
numeric_latent_dim_candidates: List[int] = None,
numeric_hidden_layers_candidates: List[int] = None,
final_fc_hidden_units: List[List[int]] = None,
learning_rate_candidates: List[float] = None,
normalize_numeric: bool = True,
hpo_max_train_samples: int = None,
ctx: mx.context = get_context()) -> Any:
"""
Fits an imputer model with hyperparameter optimization. Parameter combinations are sampled randomly from the specified grids.
Grids are specified either via the legacy *_candidates arguments
or, with more flexibility, via the nested dictionary hps.
:param train_df: training data as dataframe
:param test_df: test data as dataframe; if not provided, a fraction test_split of the
training data is used as test data
:param hps: nested dictionary where hps['global'][parameter_name] is a list of candidate values
for a global parameter and hps[column_name][parameter_name] is a list of candidate
values for each input column. Further, hps[column_name]['type'] is in
['numeric', 'categorical', 'string'] and is inferred if not provided
(see the example below).
:param num_evals: number of evaluations for random search
:param max_running_hours: Time before the hpo run is terminated in hours.
:param hpo_run_name: string to identify the current hpo run.
:param user_defined_scores: list with entries (Callable, str), where the callable is a
scoring function accepting the keyword arguments true, predicted and confidence.
Allows custom scoring functions (see the example below).
Below are parameters of the old implementation, kept for backwards compatibility.
:param num_epochs: maximum number of training epochs (default 10)
:param patience: used for early stopping; after [patience] epochs with no improvement,
training is stopped. (default 3)
:param test_split: if no test_df is provided this is the ratio of test data to be held
separate for determining model convergence
:param weight_decay: regularizer (default 0)
:param batch_size: mini-batch size for training (default 16)
:param num_hash_bucket_candidates: candidates for the number of hash buckets used for
string and categorical features (default [2**12, 2**15, 2**18])
:param tokens_candidates: candidates for tokenization (default ['words', 'chars'])
:param numeric_latent_dim_candidates: candidates for latent dimensionality of
numerical features (default [10, 50, 100])
:param numeric_hidden_layers_candidates: candidates for number of hidden layers of
numerical features (default [0, 1, 2])
:param final_fc_hidden_units: list of lists w/ dimensions for FC layers after the
final concatenation (NOTE: for HPO, this expects a list of lists)
:param learning_rate_candidates: candidates for learning rate (default [1e-1, 1e-2, 1e-3])
:param normalize_numeric: boolean indicating whether or not to normalize numeric values
:param hpo_max_train_samples: maximum number of training samples used during hyperparameter
optimization. This parameter is deprecated.
:param ctx: list of mxnet contexts (if no GPUs are available, defaults to [mx.cpu()]).
Users can also pass in a list of GPUs to be used, e.g. [mx.gpu(0), mx.gpu(2), mx.gpu(4)].
This parameter is deprecated.
:return: the fitted SimpleImputer instance (self)
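
Example (an illustrative sketch, not a prescribed configuration: the column name 'title',
the variables imputer and train_df, and the accuracy scorer are hypothetical; the column
type is inferred here because no 'type' key is given):

    hps = {
        'global': {'learning_rate': [1e-3, 1e-4], 'num_epochs': [10]},
        'title': {'max_tokens': [2 ** 15], 'tokens': [['words'], ['chars']]}
    }

    def accuracy(**kwargs):
        true, predicted = kwargs['true'], kwargs['predicted']
        return sum(t == p for t, p in zip(true, predicted)) / len(true)

    imputer.fit_hpo(train_df, hps=hps, user_defined_scores=[(accuracy, 'accuracy')])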
"""
# generate dictionary with default hyperparameter settings. Overwrite these defaults
# with configurations that were passed via this function's API wherever applicable.
default_hps = dict()
default_hps['global'] = dict()
if learning_rate_candidates:
default_hps['global']['learning_rate'] = learning_rate_candidates
if weight_decay:
default_hps['global']['weight_decay'] = weight_decay
if num_epochs:
default_hps['global']['num_epochs'] = [num_epochs]
if patience:
default_hps['global']['patience'] = [patience]
if batch_size:
default_hps['global']['batch_size'] = [batch_size]
if final_fc_hidden_units:
default_hps['global']['final_fc_hidden_units'] = final_fc_hidden_units
default_hps['string'] = {}
if num_hash_bucket_candidates:
default_hps['string']['max_tokens'] = num_hash_bucket_candidates
if tokens_candidates:
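# wrap each tokenization option so that every grid candidate is itself a
# (single-element) list of token types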
default_hps['string']['tokens'] = [[c] for c in tokens_candidates]
default_hps['categorical'] = {}
if num_hash_bucket_candidates:
default_hps['categorical']['max_tokens'] = num_hash_bucket_candidates
default_hps['numeric'] = {}
if normalize_numeric:
default_hps['numeric']['normalize'] = [normalize_numeric]
if numeric_latent_dim_candidates:
default_hps['numeric']['numeric_latent_dim'] = numeric_latent_dim_candidates
if numeric_hidden_layers_candidates:
default_hps['numeric']['numeric_hidden_layers'] = numeric_hidden_layers_candidates
if hps is None:
hps = {}
# give parameters in `hps` precedence over default parameters
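# precedence is resolved per top-level section: if `hps` already contains e.g. 'global',
# the entire default 'global' grid is dropped rather than merged key-by-key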
parameters_in_both = set(default_hps.keys()).intersection(set(hps.keys()))
for param in parameters_in_both:
del default_hps[param]
hps = merge_dicts(hps, default_hps)
if user_defined_scores is None:
user_defined_scores = []
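# if no explicit test set was provided, hold out a test_split fraction of the
# training rows for model selection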
if test_df is None:
train_df, test_df = random_split(train_df, [1 - test_split, test_split])
self.check_data_types(train_df)  # infer data types; sets self.string_columns and self.numeric_columns
self.hpo.tune(train_df, test_df, hps, num_evals, max_running_hours, user_defined_scores, hpo_run_name, self)
self.save()
return self