in voxpopuli/get_lm_data.py [0:0]
def load_from_tsv_gz(in_file: Path) -> List[str]:
    """Collect the cleaned ``normed_text`` of every ``train`` row in a gzipped table.

    The archive is read as pipe-delimited text with a header row and with
    CSV quoting disabled, so quote characters inside a field are kept
    verbatim. Only rows whose ``split`` column equals ``"train"`` are kept;
    each kept row's ``normed_text`` is passed through ``str.translate`` with
    the module-level ``REMOVE_TRANSLATOR`` table.

    Args:
        in_file: Path to the gzip-compressed, ``|``-delimited file.

    Returns:
        The translated ``normed_text`` strings, in file order.

    Raises:
        KeyError: If the header has no ``split`` or ``normed_text`` column.
    """
    # Decode explicitly instead of relying on the locale's default encoding,
    # which differs between platforms and would garble or reject non-ASCII
    # text. NOTE(review): assumes the archive is UTF-8 -- confirm against the
    # data source. newline="" hands line-ending handling to the csv module,
    # as its documentation recommends for file objects.
    with gzip.open(in_file, "rt", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(
            f,
            delimiter="|",
            quotechar=None,
            doublequote=False,
            lineterminator="\n",
            quoting=csv.QUOTE_NONE,
        )
        return [
            row["normed_text"].translate(REMOVE_TRANSLATOR)
            for row in reader
            if row["split"] == "train"
        ]