src/sal/utils/data.py
import logging
import time
from pathlib import Path

from huggingface_hub import create_branch, list_repo_commits, repo_exists

logger = logging.getLogger(__name__)


def save_dataset(dataset, config):
    if config.push_to_hub:
        # Concurrent pushes can get rejected by the Hub, so retry the push several times.
        for _ in range(20):
            try:
                # Create the branch from the repo's initial commit.
                # This avoids branching from a commit on main that already has data.
                if repo_exists(config.hub_dataset_id, repo_type="dataset"):
                    initial_commit = list_repo_commits(
                        config.hub_dataset_id, repo_type="dataset"
                    )[-1]
                    create_branch(
                        repo_id=config.hub_dataset_id,
                        branch=config.revision,
                        revision=initial_commit.commit_id,
                        exist_ok=True,
                        repo_type="dataset",
                    )
                url = dataset.push_to_hub(
                    config.hub_dataset_id,
                    revision=config.revision,
                    split="train",
                    private=config.hub_dataset_private,
                    commit_message=f"Add {config.revision}",
                )
                break
            except Exception as e:
                logger.error(f"Error pushing dataset to the Hub: {e}")
                time.sleep(5)
        else:
            # All attempts failed; without this, `url` below would be undefined.
            raise RuntimeError("Failed to push dataset to the Hub after 20 attempts")
        logger.info(f"Pushed dataset to {url}")
    else:
        if config.output_dir is None:
            config.output_dir = f"data/{config.model_path}"
        Path(config.output_dir).mkdir(parents=True, exist_ok=True)
        dataset.to_json(
            f"{config.output_dir}/{config.approach}_completions.jsonl", lines=True
        )
        logger.info(
            f"Saved completions to {config.output_dir}/{config.approach}_completions.jsonl"
        )
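

# Minimal usage sketch (illustrative only): the SimpleNamespace below stands in for the
# project's real config object; its field names simply mirror the attributes that
# save_dataset reads, and the dataset content is made up for the example.
from types import SimpleNamespace

from datasets import Dataset

completions = Dataset.from_dict({"problem": ["1+1"], "completion": ["2"]})
config = SimpleNamespace(
    push_to_hub=False,      # take the local-save branch instead of pushing to the Hub
    output_dir=None,        # falls back to data/{model_path}
    model_path="my-model",
    approach="best_of_n",
    hub_dataset_id=None,
    hub_dataset_private=False,
    revision=None,
)
save_dataset(completions, config)
# -> writes data/my-model/best_of_n_completions.jsonl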