# utils/find_corpus.py
def get_huggingface_parallel(source: str, target: str):
    """
    Print a table of parallel datasets from the Hugging Face Hub for a language pair.

    Results are ordered by declared dataset size (largest first), with download
    count as the tie-breaker. Datasets judged low quality (few downloads) are
    filtered out via is_useful_dataset.

    Args:
        source: source language code used in the Hub language filter.
        target: target language code used in the Hub language filter.
    """
    # Imported lazily so the module stays importable without huggingface_hub.
    from huggingface_hub import DatasetFilter, HfApi

    hub = HfApi()
    # Query the Hub for datasets tagged with both languages.
    matches = list(
        hub.list_datasets(
            filter=DatasetFilter(
                #
                language=[source, target],
            )
        )
    )
    # Single composite key: size bucket descending, then downloads descending.
    # Equivalent to the two-pass stable sort (downloads, then size).
    matches.sort(
        key=lambda entry: (
            -HF_DATASET_SIZES.get(get_size(entry.tags), 0),
            -entry.downloads,
        )
    )

    print("")
    print(
        "┌────────────────────────────────────────────────────────────────────────────────────────────────────┐"
    )
    print(
        f"│ huggingface parallel data https://huggingface.co/datasets?language=language:{source},language:{target}"
    )
    print(
        "└────────────────────────────────────────────────────────────────────────────────────────────────────┘"
    )

    # Build the table rows explicitly: header first, then one row per dataset
    # that passes the quality filter.
    rows = [["ID", "Size", "Downloads"]]
    for entry in matches:
        if not is_useful_dataset(entry):
            continue
        rows.append(
            [
                #
                f"https://huggingface.co/datasets/{entry.id}",
                get_size(entry.tags),
                entry.downloads,
            ]
        )
    print_table(rows)