in cc_net/tools/make_dmoz_corpus.py [0:0]
from pathlib import Path
from typing import Optional

from cc_net import jsonql

# `load_tags` is defined elsewhere in this module.


def make_corpus(
    file: Path, tags_file: Optional[Path] = None, output: Optional[Path] = None
) -> None:
"""
Loads a tags file and create a training dataset using the given webpages.
Arguments:
- file: CC shard file
- tags_file: dmoz tagging file, (like the one produced by `dl`)
- output: ""
"""
    url2tags = load_tags(tags_file)
    with jsonql.open_write(output) as o:
        for document in jsonql.read_jsons(file):
            if not document:
                continue
            url = document["url"]
            domain = document["source_domain"]

            # Prefer an exact URL match, fall back to the source domain, else skip.
            if url in url2tags:
                tags = url2tags[url]
            elif domain in url2tags:
                tags = url2tags[domain]
            else:
                continue

            if len(tags) == 0:
                continue

            # Emit one fastText-formatted line: labels first, then the flattened,
            # lowercased text. Very short documents (<= 200 chars) are dropped.
            fasttext_tags = ["__label__" + tag for tag in tags]
            content = document["tokenized"].replace("\n", " ").lower()
            if len(content) > 200:
                print(" ".join(fasttext_tags), content, file=o)  # type: ignore