in assets/training/model_evaluation/src_distributed/data_utils.py [0:0]
def prepare_data(data, task, label_column_name=None, _has_multiple_output=False, extra_y_test_cols=None):
    """Split a dataset into model inputs and ground truths, normalised per task.

    Args:
        data (pd.DataFrame): Dataset holding the feature columns and, optionally,
            the ground-truth columns.
        task (str): Task identifier; compared against ``constants.SupportedTask`` members.
        label_column_name (str, optional): Name of the ground-truth column to split off
            from ``data``. Defaults to None.
        _has_multiple_output (bool, optional): When True, non-string ground truths are
            converted to their string representation. Defaults to False.
        extra_y_test_cols (list, optional): Additional columns to move from the features
            to the ground truths. Defaults to None.

    Raises:
        ValueError: If an NER dataset has more than one feature column and none of them
            is named "tokens", or if the Question-answering / Fill-Mask ground truths
            are of an unsupported type.

    Returns:
        tuple: ``(X_test, y_test)`` where ``X_test`` holds the feature columns and
            ``y_test`` is the ``.values`` array of the ground truths, or None when no
            ground-truth column was requested.
    """
    X_test, y_test = data, None
    if extra_y_test_cols is not None and label_column_name is not None:
        # IF extra_y_test_cols is not None, label_column_name should also be not None;
        # extra_y_test_cols is accepted only for text-gen
        y_columns = extra_y_test_cols + [label_column_name]
        X_test, y_test = data.drop(y_columns, axis=1), data[y_columns]
    elif label_column_name is not None:
        X_test, y_test = data.drop(label_column_name, axis=1), data[label_column_name]
    elif extra_y_test_cols is not None:
        X_test, y_test = data.drop(extra_y_test_cols, axis=1), data[extra_y_test_cols]

    if task == constants.SupportedTask.NER:
        # Normalise to a DataFrame first so the column checks below are always valid.
        if isinstance(X_test, pd.Series):
            X_test = X_test.to_frame()
        if len(X_test.columns) > 1:
            if "tokens" not in X_test.columns:
                message = "Too many feature columns in dataset. Only 1 feature column should be passed for NER."
                raise ValueError(message)
            # Double brackets keep a DataFrame; a plain ["tokens"] lookup returns a Series,
            # which has no ``.columns`` and made the next check raise AttributeError.
            X_test = X_test[["tokens"]]
        if len(X_test.columns) == 1:
            feature_column = X_test.columns[0]
            first_value = X_test[feature_column].iloc[0]
            # Token sequences are joined into one whitespace-separated string per row.
            # A new frame is built instead of assigning in place so that the caller's
            # ``data`` is not mutated when no label column was split off.
            if isinstance(first_value, list):
                X_test = X_test[feature_column].apply(lambda x: " ".join(x)).to_frame()
            elif isinstance(first_value, np.ndarray):
                X_test = X_test[feature_column].apply(lambda x: " ".join(x.tolist())).to_frame()

    if _has_multiple_output and y_test is not None and not isinstance(y_test.iloc[0], str):
        if isinstance(y_test.iloc[0], np.ndarray):
            y_test = y_test.apply(lambda x: x.tolist())
        y_test = y_test.astype(str)

    if task == constants.SupportedTask.QnA and y_test is not None:
        if isinstance(y_test.iloc[0], dict):
            # Extracting only the first one for now
            # TODO: Fix this post PrP
            y_test = y_test.apply(lambda x: x["text"][0] if len(x["text"]) > 0 else "")
        elif isinstance(y_test.iloc[0], (list, np.ndarray)):
            y_test = y_test.apply(lambda x: x[0])
        if not isinstance(y_test.iloc[0], str):
            # str() is required: concatenating a type object to a str raises TypeError,
            # which used to mask the intended ValueError.
            message = "Ground Truths for Question-answering should be a string or an array. " \
                      "Found: " + str(type(y_test.iloc[0]))
            raise ValueError(message)

    if task == constants.SupportedTask.FILL_MASK and y_test is not None:
        if isinstance(y_test.iloc[0], (np.ndarray, list)):
            y_test = y_test.apply(lambda x: tuple(x))
        if not isinstance(y_test.iloc[0], (str, tuple)):
            message = "Ground Truths for Fill-Mask should be a string or an array found " \
                      + str(type(y_test.iloc[0]))
            raise ValueError(message)

    if y_test is not None:
        y_test = y_test.values
    return X_test, y_test