in dpr_scale/utils/ccnews_stats.py [0:0]
def process_json_line(line_str: str):
    """Parse one JSON-encoded record and count its sentences and words.

    Args:
        line_str: A single JSON document. It is expected to decode to an
            object with "language", "text", "title" and "url" fields.

    Returns:
        ``None`` when the record is not English ("language" != "en") or when
        any of "text", "title" or "url" is null or absent. Otherwise a tuple
        ``(url, sentence_count, word_count)`` where the counts are the lengths
        of the results of ``split_text_into_sentences`` and ``word_tokenize``
        applied to the record's text.

    Raises:
        json.JSONDecodeError: If ``line_str`` is not valid JSON.
    """
    record = json.loads(line_str)

    # Use .get() so a record that omits a field is skipped exactly like one
    # whose field is explicitly null, instead of aborting with KeyError.
    if record.get("language") != "en":
        return None
    if any(record.get(field) is None for field in ("text", "title", "url")):
        return None

    text = record["text"]
    sentences = split_text_into_sentences(text, language="en")
    words = word_tokenize(text)
    return record["url"], len(sentences), len(words)