in tabular/src/autogluon/tabular/learner/default_learner.py [0:0]
def general_data_processing(self, X: DataFrame, X_val: DataFrame, X_unlabeled: DataFrame, holdout_frac: float, num_bag_folds: int):
""" General data processing steps used for all models. """
X = copy.deepcopy(X)
# TODO: We should probably uncomment the below lines, NaN label should be treated as just another value in multiclass classification -> We will have to remove missing, compute problem type, and add back missing if multiclass
# if self.problem_type == MULTICLASS:
# X[self.label] = X[self.label].fillna('')
# Remove all examples with missing labels from this dataset:
missinglabel_inds = list(X.index[X[self.label].isna()])  # vectorized; Series.iteritems() was removed in pandas 2.0
if len(missinglabel_inds) > 0:
logger.warning(f"Warning: Ignoring {len(missinglabel_inds)} (out of {len(X)}) training examples for which the label value in column '{self.label}' is missing")
X = X.drop(missinglabel_inds, axis=0)
if self.problem_type is None:
self.problem_type = self.infer_problem_type(X[self.label])
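# quantile_levels only makes sense for quantile regression: upgrade an
# inferred REGRESSION problem to QUANTILE, and reject classification problems.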
if self.quantile_levels is not None:
if self.problem_type == REGRESSION:
self.problem_type = QUANTILE
else:
raise ValueError("autogluon infers this to be classification problem for which quantile_levels "
"cannot be specified. If it is truly a quantile regression problem, "
"please specify:problem_type='quantile'")
# A labeled validation set was provided by the user, so no internal holdout split of X is needed.
if X_val is not None and self.label in X_val.columns:
holdout_frac = 1
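# log_loss and pac_score are undefined when a class is absent from the
# validation data, so multiclass runs require at least 2 examples per class;
# when bagging (and no custom groups), augment_rare_classes duplicates rows of
# rare classes so each class reaches that threshold.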
if (self.eval_metric is not None) and (self.eval_metric.name in ['log_loss', 'pac_score']) and (self.problem_type == MULTICLASS):
if num_bag_folds > 0:
self.threshold = 2
if self.groups is None:
X = augment_rare_classes(X, self.label, threshold=2)
else:
self.threshold = 1
self.threshold, holdout_frac, num_bag_folds = self.adjust_threshold_if_necessary(X[self.label], threshold=self.threshold, holdout_frac=holdout_frac, num_bag_folds=num_bag_folds)
# Gets labels prior to removal of infrequent classes
y_uncleaned = X[self.label].copy()
self.cleaner = Cleaner.construct(problem_type=self.problem_type, label=self.label, threshold=self.threshold)
X = self.cleaner.fit_transform(X) # TODO: Consider merging cleaner into label_cleaner
X, y = self.extract_label(X)
self.label_cleaner = LabelCleaner.construct(problem_type=self.problem_type, y=y, y_uncleaned=y_uncleaned, positive_class=self._positive_class)
y = self.label_cleaner.transform(y)
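# Sample-weight and group columns are part of X up to this point; pop them out
# so they are not treated as features downstream.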
X = self.set_predefined_weights(X, y)
X, w = extract_column(X, self.sample_weight)
X, groups = extract_column(X, self.groups)
if self.label_cleaner.num_classes is not None and self.problem_type != BINARY:
logger.log(20, f'Train Data Class Count: {self.label_cleaner.num_classes}')
if X_val is not None and self.label in X_val.columns:
X_val = self.cleaner.transform(X_val)
if len(X_val) == 0:
logger.warning('All X_val data contained low frequency classes, ignoring X_val and generating from subset of X')
X_val = None
y_val = None
w_val = None
else:
X_val, y_val = self.extract_label(X_val)
y_val = self.label_cleaner.transform(y_val)
X_val = self.set_predefined_weights(X_val, y_val)
X_val, w_val = extract_column(X_val, self.sample_weight)
else:
y_val = None
w_val = None
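# Feature generation: when validation or unlabeled data is present, it is
# concatenated with the training data so that a single fitted generator
# transforms every split consistently; the combined frame is split back below.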
# TODO: Move this up to top of data before removing data, this way our feature generator is better
logger.log(20, 'Using Feature Generators to preprocess the data ...')
if X_val is not None:
# Do this if working with SKLearn models, otherwise categorical features may perform very badly on the test set
logger.log(15, 'Performing general data preprocessing with merged train & validation data, so validation performance may not accurately reflect performance on new test data')
X_super = pd.concat([X, X_val, X_unlabeled], ignore_index=True)
if self.feature_generator.is_fit():
logger.log(20, f'{self.feature_generator.__class__.__name__} is already fit, so the training data will be processed via .transform() instead of .fit_transform().')
X_super = self.feature_generator.transform(X_super)
self.feature_generator.print_feature_metadata_info()
else:
if X_unlabeled is None:
y_super = pd.concat([y, y_val], ignore_index=True)
else:
y_unlabeled = pd.Series(np.nan, index=X_unlabeled.index)
y_super = pd.concat([y, y_val, y_unlabeled], ignore_index=True)
X_super = self.fit_transform_features(X_super, y_super, problem_type=self.label_cleaner.problem_type_transform, eval_metric=self.eval_metric)
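# Split the transformed frame back into train / validation / unlabeled
# partitions, restoring each partition's original index.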
X = X_super.head(len(X)).set_index(X.index)
X_val = X_super.head(len(X)+len(X_val)).tail(len(X_val)).set_index(X_val.index)
if X_unlabeled is not None:
X_unlabeled = X_super.tail(len(X_unlabeled)).set_index(X_unlabeled.index)
del X_super
else:
X_super = pd.concat([X, X_unlabeled], ignore_index=True)
if self.feature_generator.is_fit():
logger.log(20, f'{self.feature_generator.__class__.__name__} is already fit, so the training data will be processed via .transform() instead of .fit_transform().')
X_super = self.feature_generator.transform(X_super)
self.feature_generator.print_feature_metadata_info()
else:
if X_unlabeled is None:
y_super = y.reset_index(drop=True)
else:
y_unlabeled = pd.Series(np.nan, index=X_unlabeled.index)
y_super = pd.concat([y, y_unlabeled], ignore_index=True)
X_super = self.fit_transform_features(X_super, y_super, problem_type=self.label_cleaner.problem_type_transform, eval_metric=self.eval_metric)
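# Same split-back as in the branch above, minus the validation partition.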
X = X_super.head(len(X)).set_index(X.index)
if X_unlabeled is not None:
X_unlabeled = X_super.tail(len(X_unlabeled)).set_index(X_unlabeled.index)
del X_super
X, X_val = self.bundle_weights(X, w, X_val, w_val) # TODO: consider not bundling sample-weights inside X, X_val
return X, y, X_val, y_val, X_unlabeled, holdout_frac, num_bag_folds, groups
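# Illustrative call-site sketch (assumed, not from this file): the learner's
# fit path is expected to consume the outputs roughly as follows.
#
#   X, y, X_val, y_val, X_unlabeled, holdout_frac, num_bag_folds, groups = \
#       self.general_data_processing(X, X_val, X_unlabeled, holdout_frac, num_bag_folds)
#
# The tuple mirrors this method's return statement; names are taken from its signature.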