def get_huggingface_parallel()

in utils/find_corpus.py [0:0]


def get_huggingface_parallel(source: str, target: str):
    """
    Print Hugging Face parallel datasets for a language pair, largest first.

    Results are ordered primarily by declared size category and secondarily
    by download count. Datasets with few downloads are ignored as they are
    probably low quality and not trustworthy.
    """
    from huggingface_hub import DatasetFilter, HfApi

    hf_api = HfApi()

    language_filter = DatasetFilter(
        #
        language=[source, target],
    )
    matches = list(hf_api.list_datasets(filter=language_filter))

    # Largest size category first; ties broken by download count (descending).
    matches.sort(
        key=lambda entry: (
            -HF_DATASET_SIZES.get(get_size(entry.tags), 0),
            -entry.downloads,
        )
    )

    print("")
    print(
        "┌────────────────────────────────────────────────────────────────────────────────────────────────────┐"
    )
    print(
        f"│ huggingface parallel data https://huggingface.co/datasets?language=language:{source},language:{target}"
    )
    print(
        "└────────────────────────────────────────────────────────────────────────────────────────────────────┘"
    )

    # One row per dataset that survives the usefulness filter.
    rows = [
        [
            #
            f"https://huggingface.co/datasets/{entry.id}",
            get_size(entry.tags),
            entry.downloads,
        ]
        for entry in matches
        if is_useful_dataset(entry)
    ]
    print_table([["ID", "Size", "Downloads"], *rows])