in src/autotrain/trainers/tabular/__main__.py [0:0]
def optimize(trial, model_name, xtrain, xvalid, ytrain, yvalid, eval_metric, task, preprocessor):
"""
Train and evaluate a tabular model for a single hyperparameter configuration, either sampled from an Optuna trial or passed in as a fixed dictionary.

Parameters:
trial (dict or optuna.trial.Trial): The trial object or dictionary containing hyperparameters.
model_name (str): The name of the model to be used (e.g., "xgboost").
xtrain (pd.DataFrame or np.ndarray): Training features.
xvalid (pd.DataFrame or np.ndarray): Validation features.
ytrain (pd.Series or np.ndarray): Training labels.
yvalid (pd.Series or np.ndarray): Validation labels.
eval_metric (str): The evaluation metric to be used for optimization.
task (str): The type of task (e.g., "binary_classification", "multi_class_classification", "single_column_regression").
preprocessor (object): The preprocessor object to be applied to the data.

Returns:
tuple or float: If trial is a dictionary, a tuple of (models, preprocessor, metric_dict).
Otherwise, the loss value derived from the evaluation metric.
"""
if isinstance(trial, dict):
params = trial
else:
params = utils.get_params(trial, model_name, task)
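# The full label set is needed for the multi-class metrics and for the one-hot expansion of class predictions below.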
labels = None
if task == "multi_class_classification":
labels = np.unique(ytrain)
metrics = utils.TabularMetrics(sub_task=task, labels=labels)
if task in ("binary_classification", "multi_class_classification", "single_column_regression"):
ytrain = ytrain.ravel()
yvalid = yvalid.ravel()
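# Fit the preprocessor on the training split and apply it to validation; if it rejects the raw values (e.g. NaNs), sanitize with nan_to_num and retry.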
if preprocessor is not None:
try:
xtrain = preprocessor.fit_transform(xtrain)
xvalid = preprocessor.transform(xvalid)
except ValueError:
logger.info("Preprocessing failed, using nan_to_num")
train_cols = xtrain.columns.tolist()
valid_cols = xvalid.columns.tolist()
xtrain = np.nan_to_num(xtrain)
xvalid = np.nan_to_num(xvalid)
# convert back to dataframe
xtrain = pd.DataFrame(xtrain, columns=train_cols)
xvalid = pd.DataFrame(xvalid, columns=valid_cols)
xtrain = preprocessor.fit_transform(xtrain)
xvalid = preprocessor.transform(xvalid)
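# XGBoost accepts the evaluation metric directly as a model parameter.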
if model_name == "xgboost":
params["eval_metric"] = eval_metric
_model = utils.TabularModel(model_name, preprocessor=None, sub_task=task, params=params)
model = _model.pipeline
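# _model.pipeline wraps the estimator in a pipeline whose final step is named "model" (hence the model__ prefix on the fit parameters below).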
models = []
if task in ("multi_label_classification", "multi_column_regression"):
# multi-output tasks: fit the pipeline once per target column and collect per-column predictions
ypred = []
models = [model] * ytrain.shape[1]
for idx, _m in enumerate(models):
if model_name == "xgboost":
_m.fit(
xtrain,
ytrain[:, idx],
model__eval_set=[(xvalid, yvalid[:, idx])],
model__verbose=False,
)
else:
_m.fit(xtrain, ytrain[:, idx])
if task == "multi_column_regression":
ypred_temp = _m.predict(xvalid)
else:
if _model.use_predict_proba:
ypred_temp = _m.predict_proba(xvalid)[:, 1]
else:
ypred_temp = _m.predict(xvalid)
ypred.append(ypred_temp)
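# Stack the per-target predictions into an (n_samples, n_targets) array for the metric computation.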
ypred = np.column_stack(ypred)
else:
models = [model]
if model_name == "xgboost":
model.fit(
xtrain,
ytrain,
model__eval_set=[(xvalid, yvalid)],
model__verbose=False,
)
else:
models[0].fit(xtrain, ytrain)
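# Use predicted class probabilities where the model provides them; otherwise fall back to plain predictions.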
if _model.use_predict_proba:
ypred = models[0].predict_proba(xvalid)
else:
ypred = models[0].predict(xvalid)
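# Some estimators return class labels rather than per-class probabilities; expand such outputs to a one-hot / two-column layout so the metrics receive a consistent shape.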
if task == "multi_class_classification":
if ypred.reshape(xvalid.shape[0], -1).shape[1] != len(labels):
ypred_ohe = np.zeros((xvalid.shape[0], len(labels)))
ypred_ohe[np.arange(xvalid.shape[0]), ypred] = 1
ypred = ypred_ohe
if task == "binary_classification":
if ypred.reshape(xvalid.shape[0], -1).shape[1] != 2:
ypred = np.column_stack([1 - ypred, ypred])
# calculate metric
metric_dict = metrics.calculate(yvalid, ypred)
# change eval_metric key to loss
if eval_metric in metric_dict:
metric_dict["loss"] = metric_dict[eval_metric]
logger.info(f"Metrics: {metric_dict}")
if isinstance(trial, dict):
return models, preprocessor, metric_dict
return metric_dict["loss"]
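# Illustrative call (a sketch, not part of this module): with prepared train/validation
# splits and a column preprocessor already built, running a fixed, hypothetical parameter
# set might look like the following; because `trial` is a dict, the fitted models,
# the preprocessor, and the metric dictionary are returned:
#
#   models, preprocessor, metric_dict = optimize(
#       trial={"learning_rate": 0.1, "n_estimators": 200},  # hypothetical values
#       model_name="xgboost",
#       xtrain=xtrain, xvalid=xvalid, ytrain=ytrain, yvalid=yvalid,
#       eval_metric="logloss",
#       task="binary_classification",
#       preprocessor=preprocessor,
#   )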