def make_corpus()

in cc_net/tools/make_dmoz_corpus.py [0:0]


def make_corpus(file: Path, tags_file: Path = None, output: Path = None) -> None:
    """
    Write a fastText-style labelled corpus from the tagged documents of a shard.

    Each document read from `file` is matched against the tags loaded from
    `tags_file`, first by its exact url, then by its source domain.
    Documents with no match, with an empty tag collection, or whose
    flattened text is 200 characters or shorter are skipped. Every kept
    document produces one output line: the tags, each prefixed with
    `__label__`, followed by the lowercased single-line text.

    Arguments:
        - file: CC shard file
        - tags_file: dmoz tagging file, (like the one produced by `dl`);
          handed as-is to `load_tags`
        - output: destination handed as-is to `jsonql.open_write`
          (NOTE(review): behavior for the default `None` depends on
          `jsonql.open_write` -- confirm there)
    """
    url2tags = load_tags(tags_file)
    with jsonql.open_write(output) as o:
        for document in jsonql.read_jsons(file):
            if not document:
                continue

            # Both fields are read up front, before any lookup is attempted.
            url, domain = document["url"], document["source_domain"]

            # An exact url match takes precedence over a domain-wide match.
            for key in (url, domain):
                if key in url2tags:
                    tags = url2tags[key]
                    break
            else:
                # Neither key is tagged: move on to the next document.
                continue

            if not tags:
                continue

            labels = " ".join("__label__" + tag for tag in tags)
            flattened = document["tokenized"].replace("\n", " ")
            content = flattened.lower()
            if len(content) <= 200:
                # Too short to be kept as a training example.
                continue
            print(labels, content, file=o)  # type: ignore