def prepare_data()

in assets/training/model_evaluation/src/utils.py [0:0]


def prepare_data(data, task, all_cols, label_column_name=None,
                 _has_multiple_output=False, extra_y_test_cols=None, batch_size=None,
                 file_ext=None):
    """Split a dataset into model inputs and ground truths and normalise them per task.

    The dataset is first passed through ``_clean_and_validate_dataset`` and then
    split column-wise into features (``X_test``) and ground truths (``y_test``).
    Task-specific post-processing is applied afterwards:

    * regression: ground truths are cast to ``np.float64``.
    * NER: features are reduced to a single column (``"tokens"`` when several
      columns are present) and token lists/arrays are joined with spaces.
    * question answering: ground truths given as ``{"text": [...]}`` dicts or as
      lists/arrays are reduced to their first element (``""`` when empty).
    * fill-mask: list/array ground truths are converted to tuples.
    * chat completion: CSV cells are parsed as JSON; when no label column was
      supplied, the trailing assistant message of every conversation is split
      off and used as ground truth.

    Args:
        data: Input dataset. Presumably a pandas DataFrame (or something
            ``_clean_and_validate_dataset`` turns into one) -- confirm with caller.
        task: Task identifier, compared against ``constants.TASK`` members.
        all_cols: Column names; forwarded to ``_clean_and_validate_dataset``.
            ``all_cols[0]`` is used as the conversation column for chat completion.
        label_column_name (optional): Name of the ground-truth column. Defaults to None.
        _has_multiple_output (bool, optional): When True, non-string ground truths
            are stringified (arrays are converted to lists first). Defaults to False.
        extra_y_test_cols (list, optional): Additional columns moved from the
            features into the ground truths. Defaults to None.
        batch_size (optional): Forwarded to ``_clean_and_validate_dataset``.
        file_ext (optional): Input file extension; only compared against
            ``SupportedFileExtensions.CSV`` for chat completion.

    Raises:
        DataLoaderException: Built through ``get_azureml_exception`` when
            regression labels cannot be cast to float, when NER data has several
            feature columns but no ``"tokens"`` column, when QnA / fill-mask
            ground truths have an unsupported type or a QnA dict lacks the
            ``"text"`` key, or when chat-completion CSV cells are not valid JSON.
        Other exceptions may propagate from ``_clean_and_validate_dataset``.

    Returns:
        tuple: ``(X_test, y_test)``. ``y_test`` is ``None`` when no ground truth
        could be determined, otherwise the ``.values`` array of the ground truths.
        For an empty dataset the cleaned data and ``None`` are returned as is.
    """
    data = _clean_and_validate_dataset(data, all_cols, batch_size)

    X_test, y_test = data, None
    if len(X_test) == 0:
        return X_test, y_test

    if extra_y_test_cols is not None and label_column_name is not None:
        # IF extra_y_test_cols is not None, label_column_name should also be not None;
        # extra_y_test_cols is accepted only for text-gen
        target_cols = extra_y_test_cols + [label_column_name]
        X_test, y_test = data.drop(target_cols, axis=1), data[target_cols]
    elif label_column_name is not None:
        X_test, y_test = data.drop(label_column_name, axis=1), data[label_column_name]
    elif extra_y_test_cols is not None:
        X_test, y_test = data.drop(extra_y_test_cols, axis=1), data[extra_y_test_cols]

    if task == constants.TASK.REGRESSION:
        if y_test is not None:
            try:
                y_test = y_test.astype(np.float64)
            except Exception as e:
                exception = get_azureml_exception(DataLoaderException, BadRegressionData, e,
                                                  error=repr(e), y_test_dtype=y_test.dtype)
                log_traceback(exception, logger)
                raise exception

    if task == constants.TASK.NER:
        if len(X_test.columns) > 1 and "tokens" not in X_test.columns:
            message = "Too many feature columns in dataset. Only 1 feature column should be passed for NER."
            exception = get_azureml_exception(DataLoaderException, BadInputData, None, error=message)
            log_traceback(exception, logger)
            raise exception
        if len(X_test.columns) > 1:
            # Select with a list so the result stays a DataFrame: selecting with a
            # bare label yields a Series, which has no ``.columns`` and made the
            # single-column handling below fail. Copy so the assignment below
            # does not write into a slice of the original frame.
            X_test = X_test[["tokens"]].copy()
        if len(X_test.columns) == 1:
            feature_col = X_test.columns[0]
            # Only the first row is inspected to decide how tokens are stored.
            first_value = X_test[feature_col].iloc[0]
            if isinstance(first_value, list):
                X_test[feature_col] = X_test[feature_col].apply(lambda x: " ".join(x))
            elif isinstance(first_value, np.ndarray):
                X_test[feature_col] = X_test[feature_col].apply(lambda x: " ".join(x.tolist()))

    if _has_multiple_output and y_test is not None and not isinstance(y_test.iloc[0], str):
        if isinstance(y_test.iloc[0], np.ndarray):
            y_test = y_test.apply(lambda x: x.tolist())
        y_test = y_test.astype(str)

    if task == constants.TASK.QnA and y_test is not None:
        if isinstance(y_test.iloc[0], dict):
            # Extracting only the first one for now
            # TODO: Fix this post PrP
            key = "text"
            try:
                y_test = y_test.apply(lambda x: x[key][0] if len(x[key]) > 0 else "")
            except KeyError as e:
                message = f"Ground Truths dict for Question-answering should contain key [{key}]. " + \
                          f"Found: {str(list(y_test.iloc[0].keys()))}."
                exception = get_azureml_exception(DataLoaderException, BadInputData, e, error=message)
                log_traceback(exception, logger, message)
                raise exception
        elif isinstance(y_test.iloc[0], (list, np.ndarray)):
            # Mirror the dict branch: an empty answer list maps to "" instead of
            # raising IndexError.
            y_test = y_test.apply(lambda x: x[0] if len(x) > 0 else "")
        if not isinstance(y_test.iloc[0], str):
            # str() is required: concatenating a type object to a str raises
            # TypeError and would hide the intended validation error.
            message = "Ground Truths for Question-answering should be a string or an array. " \
                      "Found: " + str(type(y_test.iloc[0]))
            exception = get_azureml_exception(DataLoaderException, BadInputData, None, error=message)
            log_traceback(exception, logger, message)
            raise exception

    if task == constants.TASK.FILL_MASK and y_test is not None:
        if isinstance(y_test.iloc[0], (np.ndarray, list)):
            y_test = y_test.apply(lambda x: tuple(x))
        if not isinstance(y_test.iloc[0], (str, tuple)):
            message = "Ground Truths for Fill-Mask should be a string or an array found " \
                      + str(type(y_test.iloc[0]))
            exception = get_azureml_exception(DataLoaderException, BadInputData, None, error=message)
            log_traceback(exception, logger, message)
            raise exception

    if task == constants.TASK.CHAT_COMPLETION:
        if file_ext == SupportedFileExtensions.CSV:
            # CSV cells hold the conversations as JSON text; decode every cell.
            try:
                X_test = X_test.applymap(json.loads)
            except Exception as e:
                message = "Incorrectly formatted JSON in CSV file."
                exception = get_azureml_exception(DataLoaderException, BadInputData, e, error=message)
                log_traceback(exception, logger, message)
                raise exception
        if y_test is None:
            col_name = all_cols[0]
            ground_truth = []
            for xt in X_test[col_name].tolist():
                # ``xt and`` guards against empty conversations (xt[0] would raise).
                if isinstance(xt, list) and xt and isinstance(xt[0], dict):
                    # The reference answer is the trailing *assistant* turn; a
                    # message without a role is treated as a user turn and is
                    # therefore not used as ground truth.
                    # NOTE(review): the previous check matched a trailing "user"
                    # turn, which would strip the prompt the model has to answer.
                    if xt[-1].get("role", "user") == "assistant":
                        ground_truth.append(xt[-1]["content"])
            # Only split off ground truths when every row provided one.
            if len(ground_truth) == X_test.shape[0]:
                y_test = pd.Series(ground_truth)
                X_test[col_name] = X_test[col_name].apply(lambda x: x[:-1])

    if y_test is not None:
        y_test = y_test.values

    return X_test, y_test