in services/worker/src/worker/job_runners/dataset/compatible_libraries.py [0:0]
def get_compatible_libraries_for_csv(dataset: str, hf_token: Optional[str], login_required: bool) -> CompatibleLibrary:
    """Build the loading-code snippets for a dataset resolved with the "csv" module.

    Every config must map each of its splits to exactly one data-files entry.
    When none of those entries contains a glob marker ("*" or "["), the
    snippets use pandas (`pd.read_csv`); otherwise they use dask
    (`dd.read_csv`). For pandas, a tab-separator argument is added when the
    first data file of a config contains ".tsv".

    Args:
        dataset: Dataset name, forwarded to the builder-config helper and
            inserted into the code templates.
        hf_token: Token forwarded to the builder-config helper.
        login_required: When true, `LOGIN_COMMENT` is inserted into the
            generated code; otherwise an empty string is used.

    Returns:
        A dict with the keys "language", "library", "function" and
        "loading_codes" (one entry per builder config).

    Raises:
        DatasetWithTooComplexDataFilesPatternsError: If any split of any
            config does not have exactly one data-files entry.
    """
    library: DatasetLibrary
    configs = get_builder_configs_with_simplified_data_files(dataset, module_name="csv", hf_token=hf_token)

    # First pass: reject any config whose splits are not reduced to one entry each.
    for cfg in configs:
        for split_files in cfg.data_files.values():
            if len(split_files) != 1:
                raise DatasetWithTooComplexDataFilesPatternsError(
                    f"Failed to simplify csv data files pattern: {cfg.data_files}"
                )

    # Second pass: one loading-code entry per config; "code" is filled in below.
    loading_codes: list[LoadingCode] = []
    for cfg in configs:
        split_to_file = {str(split_name): files[0] for split_name, files in cfg.data_files.items()}
        loading_codes.append(
            {
                "config_name": cfg.name,
                "arguments": {"splits": split_to_file},
                "code": "",
            }
        )

    # A "*" or "[" anywhere means at least one entry is a glob pattern, not a plain file.
    has_glob_pattern = any(
        "*" in path or "[" in path
        for entry in loading_codes
        for path in entry["arguments"]["splits"].values()
    )
    comment = LOGIN_COMMENT if login_required else ""

    if has_glob_pattern:
        library = "dask"
        function = "dd.read_csv"
        for entry in loading_codes:
            splits = entry["arguments"]["splits"]
            if len(splits) != 1:
                entry["code"] = DASK_CODE_SPLITS.format(
                    function=function,
                    dataset=dataset,
                    splits=splits,
                    first_split=next(iter(splits)),
                    comment=comment,
                )
                continue
            only_pattern = next(iter(splits.values()))
            entry["code"] = DASK_CODE.format(
                function=function, dataset=dataset, pattern=only_pattern, comment=comment
            )
    else:
        library = "pandas"
        function = "pd.read_csv"
        for entry in loading_codes:
            splits = entry["arguments"]["splits"]
            first_file = next(iter(splits.values()))
            # Substring match, so any file name containing ".tsv" gets the tab separator.
            args = ', sep="\\t"' if ".tsv" in first_file else ""
            if len(splits) != 1:
                entry["code"] = PANDAS_CODE_SPLITS.format(
                    function=function,
                    dataset=dataset,
                    splits=splits,
                    first_split=next(iter(splits)),
                    args=args,
                    comment=comment,
                )
                continue
            # With a single split, the first file is the only file.
            entry["code"] = PANDAS_CODE.format(
                function=function, dataset=dataset, data_file=first_file, args=args, comment=comment
            )

    return {"language": "python", "library": library, "function": function, "loading_codes": loading_codes}