def prepare_data()

in assets/training/model_evaluation/src_distributed/data_utils.py [0:0]


def prepare_data(data, task, label_column_name=None, _has_multiple_output=False, extra_y_test_cols=None):
    """Split a dataset into features and ground truths and normalise them for a task.

    The columns named by ``label_column_name`` and/or ``extra_y_test_cols`` are
    moved out of the features into the ground truths. Task-specific clean-up is
    then applied for NER (features), and for question answering and fill-mask
    (ground truths).

    Type checks look only at the first row (``.iloc[0]``) and assume the rest of
    the column has the same type, so the relevant columns must not be empty.

    Args:
        data: pandas DataFrame holding the dataset. It is not modified in place.
        task: Task identifier, compared against ``constants.SupportedTask`` members.
        label_column_name (optional): Name of the ground-truth column. Defaults to None.
        _has_multiple_output (bool, optional): When True, ground truths whose first
            value is not a string are converted to strings (numpy arrays are
            turned into lists first). Defaults to False.
        extra_y_test_cols (list, optional): Extra columns to move from the features
            into the ground truths. Defaults to None.

    Raises:
        ValueError: If the task is NER and there is more than one feature column
            without a ``"tokens"`` column; or if question-answering ground truths
            are not strings after unwrapping; or if fill-mask ground truths are
            neither strings nor arrays.

    Returns:
        tuple: ``(X_test, y_test)`` where ``X_test`` is a DataFrame of features and
        ``y_test`` is the ``.values`` array of the ground truths, or None when no
        ground-truth column was requested.
    """
    X_test, y_test = data, None
    if extra_y_test_cols is not None and label_column_name is not None:
        # IF extra_y_test_cols is not None, label_column_name should also be not None;
        # extra_y_test_cols is accepted only for text-gen
        target_cols = extra_y_test_cols + [label_column_name]
        X_test, y_test = data.drop(target_cols, axis=1), data[target_cols]
    elif label_column_name is not None:
        X_test, y_test = data.drop(label_column_name, axis=1), data[label_column_name]
    elif extra_y_test_cols is not None:
        X_test, y_test = data.drop(extra_y_test_cols, axis=1), data[extra_y_test_cols]

    if task == constants.SupportedTask.NER:
        if len(X_test.columns) > 1:
            if "tokens" not in X_test.columns:
                message = "Too many feature columns in dataset. Only 1 feature column should be passed for NER."
                raise ValueError(message)
            # Double brackets keep a one-column DataFrame; single brackets would
            # return a Series, which has no ``.columns`` for the check below.
            X_test = X_test[["tokens"]]
        if len(X_test.columns) == 1:
            feature_col = X_test.columns[0]
            if isinstance(X_test[feature_col].iloc[0], (list, np.ndarray)):
                # Copy first: X_test may still be the caller's ``data`` object.
                X_test = X_test.copy()
                # Pre-tokenised input is joined back into a single string per row.
                X_test[feature_col] = X_test[feature_col].apply(
                    lambda tokens: " ".join(tokens.tolist() if isinstance(tokens, np.ndarray) else tokens)
                )

    if _has_multiple_output and y_test is not None and not isinstance(y_test.iloc[0], str):
        if isinstance(y_test.iloc[0], np.ndarray):
            y_test = y_test.apply(lambda x: x.tolist())
        y_test = y_test.astype(str)

    if task == constants.SupportedTask.QnA and y_test is not None:
        first_answer = y_test.iloc[0]
        if isinstance(first_answer, dict):
            # Extracting only the first one for now
            # TODO: Fix this post PrP
            y_test = y_test.apply(lambda x: x["text"][0] if len(x["text"]) > 0 else "")
        elif isinstance(first_answer, (list, np.ndarray)):
            # Same rule as the dict case: an empty answer list becomes "".
            y_test = y_test.apply(lambda x: x[0] if len(x) > 0 else "")
        if not isinstance(y_test.iloc[0], str):
            message = "Ground Truths for Question-answering should be a string or an array. " \
                      f"Found: {type(y_test.iloc[0]).__name__}"
            raise ValueError(message)

    if task == constants.SupportedTask.FILL_MASK and y_test is not None:
        if isinstance(y_test.iloc[0], (list, np.ndarray)):
            y_test = y_test.apply(tuple)
        if not isinstance(y_test.iloc[0], (str, tuple)):
            message = "Ground Truths for Fill-Mask should be a string or an array found " \
                      f"{type(y_test.iloc[0]).__name__}"
            raise ValueError(message)

    if y_test is not None:
        y_test = y_test.values

    return X_test, y_test