recipes/self_training/pseudo_labeling/dataset_utils.py (43 lines of code) (raw):

from __future__ import absolute_import, division, print_function, unicode_literals


class Transcript(object):
    """A single sample entry of a list file.

    Identity is defined by the sample ID alone: two ``Transcript`` objects
    with the same ``sid`` compare equal and hash identically, regardless of
    their path, duration or transcript text.

    Attributes:
        sid: sample ID.
        path: path field of the sample.
        duration: duration field, stored exactly as given (a string when the
            object is built by ``create_transcript_dict_from_listfile``).
        transcript: transcript text; defaults to the empty string.
    """

    def __init__(self, sid, path, duration, transcript=""):
        self.sid = sid
        self.path = path
        self.duration = duration
        self.transcript = transcript

    def __hash__(self):
        return hash(self.sid)

    def __eq__(self, other):
        # Defer to the other operand instead of raising AttributeError when
        # compared against something that is not a Transcript (e.g. None).
        if not isinstance(other, Transcript):
            return NotImplemented
        return self.sid == other.sid

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``; keep them in sync.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result


def create_transcript_dict_from_listfile(listpath):
    """Parse a list file into a dict mapping sample ID -> ``Transcript``.

    Each non-blank line is split on whitespace into
    ``sid path duration [transcript words...]``; the transcript words are
    re-joined with single spaces. Blank and whitespace-only lines are
    skipped. If a sample ID occurs more than once, the last occurrence wins.

    Args:
        listpath: path of the list file to read.

    Returns:
        A dict keyed by sample ID.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
    """
    res = {}
    with open(listpath, "r") as f:
        for line in f:
            els = line.split()
            if not els:
                # A stray empty line must not abort the whole load.
                continue
            sid = els[0]
            res[sid] = Transcript(sid, els[1], els[2], " ".join(els[3:]))
    return res


# One output line per sample; the fields are separated by single spaces.
listfileformat = """{sid} {path} {duration} {transcript}\n"""


def write_transcript_list_to_file(transcript_dict, outfilename):
    """Write every ``Transcript`` in ``transcript_dict`` to a list file.

    One line is written per entry using ``listfileformat``, in the
    iteration order of the dict. Trailing whitespace is stripped from the
    transcript text before formatting.

    Args:
        transcript_dict: dict of sample ID -> ``Transcript``.
        outfilename: path of the file to write; it is opened in "w" mode.
    """
    with open(outfilename, "w") as f:
        for transcript in transcript_dict.values():
            f.write(
                listfileformat.format(
                    sid=transcript.sid,
                    path=transcript.path,
                    duration=transcript.duration,
                    transcript=transcript.transcript.rstrip(),
                )
            )


def zip_datasets(first, second):
    """Merge two list dataset representations into one dict.

    Every sample of ``second`` is added to a copy of ``first``, keyed by the
    sample's own ``sid``. Neither input dict is modified, so a failed merge
    does not leave ``first`` half-updated.

    Args:
        first: dict of sample ID -> ``Transcript``.
        second: dict of sample ID -> ``Transcript``.

    Returns:
        A new dict holding the samples of both inputs.

    Raises:
        Exception: if a sample ID from ``second`` is already present in the
            merged result.
    """
    # Shallow copy: merge without mutating the caller's dict.
    output = first.copy()
    for sample in second.values():
        if sample.sid in output:
            raise Exception("Attempted to write duplicate sample ID: " + sample.sid)
        output[sample.sid] = sample
    return output