in assets/training/model_evaluation/src/utils.py [0:0]
def prepare_data(data, task, all_cols, label_column_name=None,
                 _has_multiple_output=False, extra_y_test_cols=None, batch_size=None,
                 file_ext=None):
    """Split a dataset into features and ground truth and normalise both per task.

    The dataset is first passed through ``_clean_and_validate_dataset`` and is
    afterwards treated as a pandas DataFrame. Label column(s) are separated
    from the feature columns, then task-specific normalisation is applied.

    Args:
        data: Input dataset; forwarded to ``_clean_and_validate_dataset``.
        task: Task identifier, compared against the ``constants.TASK`` members
            REGRESSION, NER, QnA, FILL_MASK and CHAT_COMPLETION.
        all_cols: Column names forwarded to ``_clean_and_validate_dataset``.
            For chat completion the first entry is used as the conversation
            column when no label column is supplied.
        label_column_name (str, optional): Name of the ground-truth column.
            Defaults to None.
        _has_multiple_output (bool, optional): When true, ground truths whose
            first value is not a string are converted to strings.
            Defaults to False.
        extra_y_test_cols (list, optional): Additional columns to move from the
            features into the ground truth. Must be a list because it is
            concatenated with ``[label_column_name]``. Defaults to None.
        batch_size: Forwarded to ``_clean_and_validate_dataset``.
        file_ext: Input file extension; when it equals
            ``SupportedFileExtensions.CSV`` for chat completion, every feature
            cell is parsed with ``json.loads``.

    Raises:
        DataLoaderException: If regression labels cannot be cast to float64,
            if NER data has several feature columns and none is ``"tokens"``,
            if QnA / Fill-Mask ground truths have an unsupported type or a
            QnA dict lacks the ``"text"`` key, or if chat-completion CSV cells
            are not valid JSON.
        Anything raised by ``_clean_and_validate_dataset`` propagates unchanged.

    Returns:
        tuple: ``(X_test, y_test)``. ``X_test`` is a DataFrame of features;
        ``y_test`` is the ``.values`` array of the ground truth, or ``None``
        when no ground truth is available. For an empty dataset the cleaned
        data and ``None`` are returned immediately.
    """
    data = _clean_and_validate_dataset(data, all_cols, batch_size)

    X_test, y_test = data, None
    if len(X_test) == 0:
        return X_test, y_test

    if extra_y_test_cols is not None and label_column_name is not None:
        # IF extra_y_test_cols is not None, label_column_name should also be not None;
        # extra_y_test_cols is accepted only for text-gen
        target_cols = extra_y_test_cols + [label_column_name]
        X_test, y_test = data.drop(target_cols, axis=1), data[target_cols]
    elif label_column_name is not None:
        X_test, y_test = data.drop(label_column_name, axis=1), data[label_column_name]
    elif extra_y_test_cols is not None:
        X_test, y_test = data.drop(extra_y_test_cols, axis=1), data[extra_y_test_cols]

    if task == constants.TASK.REGRESSION and y_test is not None:
        try:
            y_test = y_test.astype(np.float64)
        except Exception as e:
            exception = get_azureml_exception(DataLoaderException, BadRegressionData, e,
                                              error=repr(e), y_test_dtype=y_test.dtype)
            log_traceback(exception, logger)
            raise exception

    if task == constants.TASK.NER:
        if len(X_test.columns) > 1:
            if "tokens" not in X_test.columns:
                message = "Too many feature columns in dataset. Only 1 feature column should be passed for NER."
                exception = get_azureml_exception(DataLoaderException, BadInputData, None, error=message)
                log_traceback(exception, logger)
                raise exception
            # Select with a list so the result stays a DataFrame; a plain
            # X_test["tokens"] yields a Series, which has no `.columns` and
            # would break the single-column handling below.
            X_test = X_test[["tokens"]].copy()
        if len(X_test.columns) == 1:
            feature_col = X_test.columns[0]
            first_value = X_test[feature_col].iloc[0]
            # Pre-tokenised input is joined back into a single string per row.
            if isinstance(first_value, list):
                X_test[feature_col] = X_test[feature_col].apply(lambda x: " ".join(x))
            elif isinstance(first_value, np.ndarray):
                X_test[feature_col] = X_test[feature_col].apply(lambda x: " ".join(x.tolist()))

    if _has_multiple_output and y_test is not None and not isinstance(y_test.iloc[0], str):
        if isinstance(y_test.iloc[0], np.ndarray):
            y_test = y_test.apply(lambda x: x.tolist())
        y_test = y_test.astype(str)

    if task == constants.TASK.QnA and y_test is not None:
        first_label = y_test.iloc[0]
        if isinstance(first_label, dict):
            # Extracting only the first one for now
            # TODO: Fix this post PrP
            key = "text"
            try:
                y_test = y_test.apply(lambda x: x[key][0] if len(x[key]) > 0 else "")
            except KeyError as e:
                message = f"Ground Truths dict for Question-answering should contain key [{key}]. " + \
                    f"Found: {str(list(y_test.iloc[0].keys()))}."
                exception = get_azureml_exception(DataLoaderException, BadInputData, e, error=message)
                log_traceback(exception, logger, message)
                raise exception
        elif isinstance(first_label, (list, np.ndarray)):
            # Mirror the dict branch: an empty answer list maps to "".
            y_test = y_test.apply(lambda x: x[0] if len(x) > 0 else "")
        if not isinstance(y_test.iloc[0], str):
            # The type must be formatted into the message; str + type raises TypeError.
            message = "Ground Truths for Question-answering should be a string or an array. " \
                      f"Found: {type(y_test.iloc[0]).__name__}"
            exception = get_azureml_exception(DataLoaderException, BadInputData, None, error=message)
            log_traceback(exception, logger, message)
            raise exception

    if task == constants.TASK.FILL_MASK and y_test is not None:
        if isinstance(y_test.iloc[0], (np.ndarray, list)):
            y_test = y_test.apply(lambda x: tuple(x))
        if not isinstance(y_test.iloc[0], (str, tuple)):
            # The type must be formatted into the message; str + type raises TypeError.
            message = "Ground Truths for Fill-Mask should be a string or an array found " \
                      f"{type(y_test.iloc[0]).__name__}"
            exception = get_azureml_exception(DataLoaderException, BadInputData, None, error=message)
            log_traceback(exception, logger, message)
            raise exception

    if task == constants.TASK.CHAT_COMPLETION:
        if file_ext == SupportedFileExtensions.CSV:
            try:
                X_test = X_test.applymap(json.loads)
            except Exception as e:
                message = "Incorrectly formatted JSON in CSV file."
                exception = get_azureml_exception(DataLoaderException, BadInputData, e, error=message)
                log_traceback(exception, logger, message)
                raise exception
        if y_test is None:
            # No label column: use the final assistant turn of each
            # conversation as ground truth and strip it from the input.
            col_name = all_cols[0]
            ground_truth = []
            for xt in X_test[col_name].tolist():
                # `xt` must be non-empty before indexing its first/last turn.
                if isinstance(xt, list) and xt and isinstance(xt[0], dict):
                    # A final turn without an explicit role is treated as the
                    # assistant's reply (matches the `.get` default).
                    if xt[-1].get("role", "assistant") == "assistant":
                        ground_truth.append(xt[-1]["content"])
            # Only adopt the extracted answers when every row provided one.
            if len(ground_truth) == X_test.shape[0]:
                y_test = pd.Series(ground_truth)
                X_test[col_name] = X_test[col_name].apply(lambda x: x[:-1])

    if y_test is not None:
        y_test = y_test.values

    return X_test, y_test