in api/run_eval.py [0:0]
def fetch_audio_urls(dataset_path, dataset, split, batch_size=100, max_retries=20):
    """Yield the rows of a dataset split from the Hugging Face datasets-server.

    The row count is read from the ``/size`` endpoint, then the ``/rows``
    endpoint is paged through ``batch_size`` rows at a time. Each element of
    the ``"rows"`` list in every response is yielded unchanged.

    If the ``HF_TOKEN`` environment variable is set (and non-empty) it is
    sent as a bearer token with every request; otherwise a single notice is
    printed and the requests are made anonymously.

    Args:
        dataset_path: Value sent as the ``dataset`` query parameter.
        dataset: Value sent as the ``config`` query parameter.
        split: Value sent as the ``split`` query parameter.
        batch_size: Maximum number of rows requested per ``/rows`` call.
        max_retries: Number of failed attempts tolerated per batch before
            giving up. At least one attempt is always made.

    Yields:
        The entries of the ``"rows"`` list of each ``/rows`` response, in
        order of increasing offset.

    Raises:
        requests.exceptions.HTTPError: If the ``/size`` request returns an
            error status.
        Exception: If one batch fails ``max_retries`` times in a row; the
            last underlying error is attached as ``__cause__``.
    """
    API_URL = "https://datasets-server.huggingface.co/rows"
    SIZE_URL = "https://datasets-server.huggingface.co/size"

    # Build the auth header once: it does not change between requests, and
    # the "not set" notice should appear once rather than on every attempt.
    headers = {}
    token = os.environ.get("HF_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"
    else:
        print("HF_TOKEN not set, might experience rate-limiting.")

    # Let requests encode the query string so names containing reserved
    # characters do not corrupt the URL.
    size_response = requests.get(
        SIZE_URL,
        params={"dataset": dataset_path, "config": dataset, "split": split},
        headers=headers,
    )
    size_response.raise_for_status()
    # NOTE(review): this reads the *config*-level row count; confirm it
    # equals the requested split's row count for multi-split configs.
    total_rows = size_response.json()["size"]["config"]["num_rows"]

    for offset in tqdm(range(0, total_rows, batch_size), desc="Fetching audio URLs"):
        params = {
            "dataset": dataset_path,
            "config": dataset,
            "split": split,
            "offset": offset,
            "length": min(batch_size, total_rows - offset),
        }

        failures = 0
        while True:
            try:
                # Pass the headers so the HF token is actually used.
                response = requests.get(API_URL, params=params, headers=headers)
                response.raise_for_status()
                rows = response.json()["rows"]
            except (requests.exceptions.RequestException, ValueError) as e:
                failures += 1
                print(
                    f"Error fetching data: {e}, retrying ({failures}/{max_retries})..."
                )
                if failures >= max_retries:
                    # Give up immediately; sleeping before raising only
                    # delays the error report.
                    raise Exception(
                        "Max retries exceeded while fetching data."
                    ) from e
                time.sleep(10)
            else:
                break

        # Yield outside the try block so only the network call is guarded.
        yield from rows