in datawig/simple_imputer.py [0:0]
def fit(self,
train_df: pd.DataFrame,
test_df: pd.DataFrame = None,
ctx: mx.context = get_context(),
learning_rate: float = 4e-3,
num_epochs: int = 100,
patience: int = 5,
test_split: float = .1,
weight_decay: float = 0.,
batch_size: int = 16,
final_fc_hidden_units: List[int] = None,
calibrate: bool = True,
class_weights: dict = None,
instance_weights: list = None) -> Any:
"""
Trains and stores the imputer model
:param train_df: training data as dataframe
:param test_df: test data as dataframe; if not provided, a test_split fraction of the
training data is held out as test data
:param ctx: List of mxnet contexts (if no GPUs are available, defaults to [mx.cpu()])
User can also pass in a list of GPUs to be used, e.g. [mx.gpu(0), mx.gpu(2), mx.gpu(4)]
:param learning_rate: learning rate for stochastic gradient descent (default 4e-3)
:param num_epochs: maximal number of training epochs (default 100)
:param patience: used for early stopping; after [patience] epochs with no improvement,
training is stopped. (default 5)
:param test_split: if no test_df is provided, this is the fraction of the training data
held out for determining model convergence
:param weight_decay: weight decay regularizer (default 0)
:param batch_size: number of samples per mini-batch (default 16)
:param final_fc_hidden_units: list of dimensions for the fully connected layers after the final concatenation
:param calibrate: whether to calibrate the model's output probabilities after training (default True)
:param class_weights: Dictionary with labels as keys and weights as values.
Weighs each instance's contribution to the likelihood based on the corresponding class.
:param instance_weights: List of weights for each instance in train_df.
"""
# add weights to training data if provided
train_df = self.__add_weights_to_df(train_df, class_weights, instance_weights, in_place=False)
self.check_data_types(train_df)
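# check_data_types populates self.string_columns / self.numeric_columns, which determine
# the encoders and featurizers assembled below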
data_encoders = []
data_columns = []
if len(self.string_columns) > 0:
string_feature_column = "ngram_features-" + rand_string(10)
if self.is_explainable:
data_encoders += [TfIdfEncoder(input_columns=self.string_columns,
output_column=string_feature_column,
max_tokens=self.num_hash_buckets,
tokens=self.tokens)]
else:
data_encoders += [BowEncoder(input_columns=self.string_columns,
output_column=string_feature_column,
max_tokens=self.num_hash_buckets,
tokens=self.tokens)]
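# both encoder variants write into the same string feature column, so a single
# BowFeaturizer can consume their output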
data_columns += [
BowFeaturizer(field_name=string_feature_column, max_tokens=self.num_hash_buckets)]
if len(self.numeric_columns) > 0:
numerical_feature_column = "numerical_features-" + rand_string(10)
data_encoders += [NumericalEncoder(input_columns=self.numeric_columns,
output_column=numerical_feature_column)]
data_columns += [
NumericalFeaturizer(field_name=numerical_feature_column, numeric_latent_dim=self.numeric_latent_dim,
numeric_hidden_layers=self.numeric_hidden_layers)]
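# the label encoder decides the task type: a numeric output column is treated as regression
# on a normalized target, anything else as classification over at most num_labels classes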
if is_numeric_dtype(train_df[self.output_column]):
label_column = [NumericalEncoder(self.output_column, normalize=True)]
logger.debug("Assuming numeric output column: {}".format(self.output_column))
else:
label_column = [CategoricalEncoder(self.output_column, max_tokens=self.num_labels)]
logger.debug("Assuming categorical output column: {}".format(self.output_column))
# to make consecutive calls to .fit() continue where the previous call finished
if self.imputer is None:
self.imputer = Imputer(data_encoders=data_encoders,
data_featurizers=data_columns,
label_encoders=label_column,
output_path=self.output_path)
self.output_path = self.imputer.output_path
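# keep output_path in sync with the path the underlying Imputer actually uses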
self.imputer = self.imputer.fit(train_df, test_df, ctx, learning_rate, num_epochs, patience,
test_split,
weight_decay, batch_size,
final_fc_hidden_units=final_fc_hidden_units,
calibrate=calibrate)
self.save()
return self
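For orientation, a minimal usage sketch of this method follows. It assumes datawig's public
SimpleImputer constructor and predict() method; the dataframe and column names are hypothetical.

# usage sketch, not part of simple_imputer.py; column names are hypothetical
import pandas as pd
from datawig import SimpleImputer

df = pd.DataFrame({"title": ["red shoe", "blue shirt", "black sock", "green shirt"],
                   "price": [49.0, 19.0, 5.0, 21.0],
                   "category": ["footwear", "apparel", "footwear", "apparel"]})

imputer = SimpleImputer(input_columns=["title", "price"],
                        output_column="category",
                        output_path="imputer_model")

# trains on df, holding out a test_split fraction of the rows for early stopping,
# then persists the model under output_path via self.save()
imputer.fit(train_df=df, num_epochs=5, calibrate=False)
predictions = imputer.predict(df)

In practice train_df would be much larger; class_weights or instance_weights can be passed to
fit() to re-weight imbalanced labels or individual rows, as described in the docstring above.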