in benchmarks/rnnt/ootb/train/common/data/dataset.py [0:0]
def _load_json_manifest(self, fpath):
    j = json.load(open(fpath, "r", encoding="utf-8"))
    for i, s in enumerate(j):
        # Lightweight progress indicator, refreshed every 1000 entries
        if i % 1000 == 0:
            print(f'{i:>10}/{len(j):<10}', end='\r')

        # Drop samples whose duration falls outside [min_duration, max_duration]
        s_max_duration = s['original_duration']
        s['duration'] = s.pop('original_duration')
        if not (self.min_duration <= s_max_duration <= self.max_duration):
            self.duration_filtered += s['duration']
            continue

        # Prune and normalize according to transcript
        tr = (s.get('transcript', None) or
              self.load_transcript(s['text_filepath']))

        if not isinstance(tr, str):
            print(f'WARNING: Skipped sample (transcript not a str): {tr}.')
            self.duration_filtered += s['duration']
            continue

        if self.normalize_transcripts:
            tr = normalize_string(tr, self.tokenizer.charset, self.punctuation_map)

        s["transcript"] = self.tokenizer.tokenize(tr)

        # Optionally discard audio files produced by offline speed perturbation
        files = s.pop('files')
        if self.ignore_offline_speed_perturbation:
            files = [f for f in files if f['speed'] == 1.0]

        s['audio_duration'] = [f['duration'] for f in files]
        s['audio_filepath'] = [str(Path(self.data_dir, f['fname']))
                               for f in files]
        self.samples.append(s)
        self.duration += s['duration']

        # Stop early once the configured utterance cap is reached
        if self.max_utts > 0 and len(self.samples) >= self.max_utts:
            print(f'Reached max_utts={self.max_utts}. Finished parsing {fpath}.')
            break
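
For reference, here is a minimal sketch of the manifest entry shape this loader consumes, inferred from the keys accessed above (`original_duration`, `transcript` or `text_filepath`, and per-file `fname`, `duration`, `speed`). The concrete paths and durations are illustrative assumptions, not values taken from the repository.

# Hypothetical manifest entry matching the keys read by _load_json_manifest;
# file names and durations below are made up for illustration.
example_entry = {
    "original_duration": 3.42,          # seconds; renamed to 'duration' by the loader
    "transcript": "hello world",        # if absent, 'text_filepath' is read instead
    "text_filepath": "dev-clean/84/121123/84-121123-0001.txt",
    "files": [
        {"fname": "84-121123-0001.wav", "duration": 3.42, "speed": 1.0},
        {"fname": "84-121123-0001-sp0.9.wav", "duration": 3.80, "speed": 0.9},
    ],
}

With ignore_offline_speed_perturbation set, only the speed == 1.0 file would be kept, and its path would be resolved relative to self.data_dir before the entry is appended to self.samples.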