in src/autotrain/trainers/tabular/__main__.py [0:0]
def optimize(trial, model_name, xtrain, xvalid, ytrain, yvalid, eval_metric, task, preprocessor):
"""
Train and evaluate a tabular model for a single hyperparameter configuration, either sampled from an Optuna trial or passed in as a fixed dictionary.

Parameters:
trial (dict or optuna.trial.Trial): The trial object or dictionary containing hyperparameters.
model_name (str): The name of the model to be used (e.g., "xgboost").
xtrain (pd.DataFrame or np.ndarray): Training features.
xvalid (pd.DataFrame or np.ndarray): Validation features.
ytrain (pd.Series or np.ndarray): Training labels.
yvalid (pd.Series or np.ndarray): Validation labels.
eval_metric (str): The evaluation metric to be used for optimization.
task (str): The type of task (e.g., "binary_classification", "multi_class_classification", "single_column_regression").
preprocessor (object): The preprocessor object to be applied to the data.

Returns:
tuple or float: If trial is a dictionary, a tuple of (models, preprocessor, metric_dict).
Otherwise, the loss value derived from the evaluation metric.
"""
if isinstance(trial, dict):
params = trial
else:
params = utils.get_params(trial, model_name, task)
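# The full label set is needed for the multi-class metrics and for the one-hot expansion of class predictions below.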
labels = None
if task == "multi_class_classification":
labels = np.unique(ytrain)
metrics = utils.TabularMetrics(sub_task=task, labels=labels)
if task in ("binary_classification", "multi_class_classification", "single_column_regression"):
ytrain = ytrain.ravel()
yvalid = yvalid.ravel()
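# Fit the preprocessor on the training split and apply it to validation; if it rejects the raw values (e.g. NaNs), sanitize with nan_to_num and retry.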
if preprocessor is not None:
try:
xtrain = preprocessor.fit_transform(xtrain)
xvalid = preprocessor.transform(xvalid)
except ValueError:
logger.info("Preprocessing failed, using nan_to_num")
train_cols = xtrain.columns.tolist()
valid_cols = xvalid.columns.tolist()
xtrain = np.nan_to_num(xtrain)
xvalid = np.nan_to_num(xvalid)
# convert back to dataframe
xtrain = pd.DataFrame(xtrain, columns=train_cols)
xvalid = pd.DataFrame(xvalid, columns=valid_cols)
xtrain = preprocessor.fit_transform(xtrain)
xvalid = preprocessor.transform(xvalid)
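# XGBoost accepts the evaluation metric directly as a model parameter.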
if model_name == "xgboost":
params["eval_metric"] = eval_metric
_model = utils.TabularModel(model_name, preprocessor=None, sub_task=task, params=params)
model = _model.pipeline
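# _model.pipeline wraps the estimator in a pipeline whose final step is named "model" (hence the model__ prefix on the fit parameters below).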
models = []
if task in ("multi_label_classification", "multi_column_regression"):
# multi-output tasks: fit the pipeline once per target column and collect per-column predictions
ypred = []
models = [model] * ytrain.shape[1]
for idx, _m in enumerate(models):
if model_name == "xgboost":
_m.fit(
xtrain,
ytrain[:, idx],
model__eval_set=[(xvalid, yvalid[:, idx])],
model__verbose=False,
)
else:
_m.fit(xtrain, ytrain[:, idx])
if task == "multi_column_regression":
ypred_temp = _m.predict(xvalid)
else:
if _model.use_predict_proba:
ypred_temp = _m.predict_proba(xvalid)[:, 1]
else:
ypred_temp = _m.predict(xvalid)
ypred.append(ypred_temp)
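# Stack the per-target predictions into an (n_samples, n_targets) array for the metric computation.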
ypred = np.column_stack(ypred)
else:
models = [model]
if model_name == "xgboost":
model.fit(
xtrain,
ytrain,
model__eval_set=[(xvalid, yvalid)],
model__verbose=False,
)
else:
models[0].fit(xtrain, ytrain)
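# Use predicted class probabilities where the model provides them; otherwise fall back to plain predictions.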
if _model.use_predict_proba:
ypred = models[0].predict_proba(xvalid)
else:
ypred = models[0].predict(xvalid)
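# Some estimators return class labels rather than per-class probabilities; expand such outputs to a one-hot / two-column layout so the metrics receive a consistent shape.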
if task == "multi_class_classification":
if ypred.reshape(xvalid.shape[0], -1).shape[1] != len(labels):
ypred_ohe = np.zeros((xvalid.shape[0], len(labels)))
ypred_ohe[np.arange(xvalid.shape[0]), ypred] = 1
ypred = ypred_ohe
if task == "binary_classification":
if ypred.reshape(xvalid.shape[0], -1).shape[1] != 2:
ypred = np.column_stack([1 - ypred, ypred])
# calculate metric
metric_dict = metrics.calculate(yvalid, ypred)
# change eval_metric key to loss
if eval_metric in metric_dict:
metric_dict["loss"] = metric_dict[eval_metric]
logger.info(f"Metrics: {metric_dict}")
if isinstance(trial, dict):
return models, preprocessor, metric_dict
return metric_dict["loss"]
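# Illustrative call (a sketch, not part of this module): with prepared train/validation
# splits and a column preprocessor already built, running a fixed, hypothetical parameter
# set might look like the following; because `trial` is a dict, the fitted models,
# the preprocessor, and the metric dictionary are returned:
#
#   models, preprocessor, metric_dict = optimize(
#       trial={"learning_rate": 0.1, "n_estimators": 200},  # hypothetical values
#       model_name="xgboost",
#       xtrain=xtrain, xvalid=xvalid, ytrain=ytrain, yvalid=yvalid,
#       eval_metric="logloss",
#       task="binary_classification",
#       preprocessor=preprocessor,
#   )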