in src/autotrain/project.py [0:0]
def tabular_munge_data(params, local):
if isinstance(params.target_columns, str):
col_map_label = [params.target_columns]
else:
col_map_label = params.target_columns
task = params.task
if task == "classification" and len(col_map_label) > 1:
task = "tabular_multi_label_classification"
elif task == "classification" and len(col_map_label) == 1:
task = "tabular_multi_class_classification"
elif task == "regression" and len(col_map_label) > 1:
task = "tabular_multi_column_regression"
elif task == "regression" and len(col_map_label) == 1:
task = "tabular_single_column_regression"
else:
raise Exception("Please select a valid task.")
exts = ["csv", "jsonl"]
ext_to_use = None
for ext in exts:
path = f"{params.data_path}/{params.train_split}.{ext}"
if os.path.exists(path):
ext_to_use = ext
break
train_data_path = f"{params.data_path}/{params.train_split}.{ext_to_use}"
if params.valid_split is not None:
valid_data_path = f"{params.data_path}/{params.valid_split}.{ext_to_use}"
else:
valid_data_path = None
if os.path.exists(train_data_path):
dset = AutoTrainDataset(
train_data=[train_data_path],
task=task,
token=params.token,
project_name=params.project_name,
username=params.username,
column_mapping={"id": params.id_column, "label": col_map_label},
valid_data=[valid_data_path] if valid_data_path is not None else None,
percent_valid=None, # TODO: add to UI
local=local,
ext=ext_to_use,
)
params.data_path = dset.prepare()
params.valid_split = "validation"
params.id_column = "autotrain_id"
if len(col_map_label) == 1:
params.target_columns = ["autotrain_label"]
else:
params.target_columns = [f"autotrain_label_{i}" for i in range(len(col_map_label))]
return params