in datawig/_hpo.py [0:0]
def __preprocess_hps(self,
                     train_df: pd.DataFrame,
                     simple_imputer,
                     num_evals) -> pd.DataFrame:
    """
    Generates a data frame of hyperparameter configurations from the nested hp dictionary.

    Requires the training data to check whether the relevant columns are present and
    have the appropriate type.

    :param train_df: training data as dataframe
    :param simple_imputer: parent instance of SimpleImputer (provides input_columns)
    :param num_evals: maximum number of hpo configurations to consider
    :return: Data frame where each row is a hyperparameter configuration and each
             column is a parameter. Column names have the form column:parameter,
             e.g. title:max_tokens or global:learning_rate.
    """
    # Default hyperparameter choices per column type (string, categorical, numeric)
    # and for training-level ("global") parameters.
    default_hps = {
        'global': {
            'learning_rate': [4e-3],
            'weight_decay': [0],
            'num_epochs': [100],
            'patience': [5],
            'batch_size': [16],
            'final_fc_hidden_units': [[]],
        },
        'string': {
            'max_tokens': [],  # [2 ** exp for exp in [12, 15, 18]]
            'tokens': [],  # [['chars'], ['words']]
            'ngram_range': {
                'words': [(1, 3)],
                'chars': [(1, 5)],
            },
        },
        'categorical': {
            'max_tokens': [2 ** 12],
            'embed_dim': [10],
        },
        'numeric': {
            'normalize': [True],
            'numeric_latent_dim': [10],
            'numeric_hidden_layers': [1],
        },
    }

    # Create empty dict if global hps were not passed.
    if 'global' not in self.hps:
        self.hps['global'] = {}

    # Merge data-type default parameters with the ones in self.hps, giving
    # precedence to the parameters specified in self.hps.
    # NOTE: guard against data types the user did not specify at all — indexing
    # self.hps[data_type] unconditionally raised a KeyError when e.g. 'string'
    # was absent from self.hps.
    for data_type in ['string', 'categorical', 'numeric']:
        user_type_hps = self.hps.get(data_type, {})
        for parameter_key in default_hps[data_type]:
            if parameter_key in user_type_hps:
                default_hps[data_type][parameter_key] = user_type_hps[parameter_key]

    # Add type to column dictionaries if it was not specified;
    # does not auto-detect categorical types.
    for column_name in simple_imputer.input_columns:
        if column_name not in self.hps:
            self.hps[column_name] = {}
        if 'type' not in self.hps[column_name]:
            if is_numeric_dtype(train_df[column_name]):
                self.hps[column_name]['type'] = ['numeric']
            else:
                self.hps[column_name]['type'] = ['string']

        # Merge column hyperparameters with feature-type-specific defaults.
        for parameter_key, values in default_hps[self.hps[column_name]['type'][0]].items():
            if parameter_key not in self.hps[column_name]:
                self.hps[column_name][parameter_key] = values

    # All data-type-specific parameters have been copied to the column encoder
    # parameters; drop the type-level entries. pop() avoids a KeyError for
    # entries the user never specified (plain `del` crashed in that case).
    for data_type in ['string', 'numeric', 'categorical']:
        self.hps.pop(data_type, None)

    # Merge global parameters with defaults.
    for parameter_key, values in default_hps['global'].items():
        if parameter_key not in self.hps['global']:
            self.hps['global'][parameter_key] = values

    # Flatten the nested dict into "column:parameter" keys and sample up to
    # num_evals configurations from the cartesian product of all value lists.
    flat_dict = flatten_dict(self.hps)
    hp_df = pd.DataFrame(
        random_cartesian_product(list(flat_dict.values()), num=num_evals),
        columns=list(flat_dict.keys())
    )
    return hp_df