recipes/self_training/pseudo_labeling/dataset_utils.py (43 lines of code) (raw):
from __future__ import absolute_import, division, print_function, unicode_literals
class Transcript(object):
    """
    A single list-file entry: sample ID, audio path, duration and transcript.

    Identity is defined solely by ``sid``: two instances with the same ``sid``
    compare equal and hash identically, regardless of their other fields.
    """

    def __init__(self, sid, path, duration, transcript=""):
        self.sid = sid
        self.path = path
        self.duration = duration
        self.transcript = transcript

    def __repr__(self):
        return "Transcript(sid={!r}, path={!r}, duration={!r}, transcript={!r})".format(
            self.sid, self.path, self.duration, self.transcript
        )

    def __hash__(self):
        # Must stay consistent with __eq__, which only looks at ``sid``.
        return hash(self.sid)

    def __eq__(self, other):
        # Returning NotImplemented (instead of touching ``other.sid``) lets
        # comparisons against unrelated objects evaluate to False rather than
        # raising AttributeError.
        if not isinstance(other, Transcript):
            return NotImplemented
        return self.sid == other.sid

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``; define it explicitly
        # so both operators agree on every interpreter.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
def create_transcript_dict_from_listfile(listpath):
    """
    Read a list file and return a dict mapping sample ID -> ``Transcript``.

    Each non-blank line is split on whitespace and interpreted as
    ``<sid> <path> <duration> [<transcript word> ...]``. Everything after the
    third field is re-joined with single spaces to form the transcript.
    The duration is stored exactly as read (a string); no numeric conversion
    is performed here.

    Blank / whitespace-only lines are skipped. If the same sample ID appears
    more than once, the last occurrence wins.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
    """
    res = {}
    with open(listpath, "r") as f:
        for line in f:
            els = line.split()
            if not els:
                # A blank line (e.g. a trailing newline at end of file) has no
                # fields; skip it instead of failing on els[0].
                continue
            sid = els[0]
            res[sid] = Transcript(sid, els[1], els[2], " ".join(els[3:]))
    return res
# Template for one line of a list file: four space-separated fields followed
# by a newline. Filled in via ``str.format`` with keyword arguments.
listfileformat = "{sid} {path} {duration} {transcript}\n"
def write_transcript_list_to_file(transcript_dict, outfilename):
    """
    Write every entry of ``transcript_dict`` to ``outfilename``, one per line.

    Each value is rendered with ``listfileformat`` using its ``sid``, ``path``,
    ``duration`` and ``transcript`` attributes; trailing whitespace is stripped
    from the transcript before formatting. The output file is opened in "w"
    mode, so any existing content is overwritten.
    """
    with open(outfilename, "w") as out:
        for entry in transcript_dict.values():
            rendered = listfileformat.format(
                sid=entry.sid,
                path=entry.path,
                duration=entry.duration,
                transcript=entry.transcript.rstrip(),
            )
            out.write(rendered)
def zip_datasets(first, second):
    """
    Merge the samples of ``second`` into ``first`` and return ``first``.

    Both arguments are dicts whose values expose a ``sid`` attribute. Every
    sample from ``second`` is stored in ``first`` under its own ``sid`` (the
    key it had in ``second`` is not used).

    Note that ``first`` is updated in place and is the object returned. All
    samples are validated before anything is written, so if a duplicate is
    found ``first`` is left exactly as it was passed in.

    Raises:
        Exception: if a sample ID from ``second`` already exists in ``first``
            or occurs more than once within ``second``.
    """
    # Stage the additions separately so a failure cannot leave ``first``
    # half-merged.
    additions = {}
    for sample in second.values():
        sid = sample.sid
        # Direct dict membership is O(1); ``in d.keys()`` is a list scan on
        # Python 2.
        if sid in first or sid in additions:
            raise Exception(
                "Attempted to write duplicate sample ID: {}".format(sid)
            )
        additions[sid] = sample
    first.update(additions)
    return first