in avhubert/preparation/lrs3_prepare.py [0:0]
def trim_audio(csv_fn, raw_dir, output_dir, ffmpeg, rank, nshard):
    """Extract audio from raw videos and cut it into per-segment WAV files.

    The CSV is loaded with ``read_csv`` and its ``id``, ``start`` and ``end``
    columns are read. Segments are grouped by the raw video they come from
    (the segment id with its last ``_``-separated component removed), the
    raw videos are split into ``nshard`` contiguous shards, and only shard
    number ``rank`` is processed by this call.

    Args:
        csv_fn: Path of the CSV file describing the segments.
        raw_dir: Directory holding the raw ``<raw_fid>.mp4`` files.
        output_dir: Directory under which ``<fid>.wav`` files are written;
            intermediate directories are created as needed.
        ffmpeg: Path of the ffmpeg executable. It is passed as a single
            argv entry, so it must not contain extra command-line flags.
        rank: Zero-based index of the shard handled by this call.
        nshard: Total number of shards.

    Returns:
        None. The results are the WAV files written below ``output_dir``.
    """
    df = read_csv(csv_fn)

    # Group the segments by raw video, keeping first-seen order so that the
    # shard boundaries are the same for every rank.
    raw2fid = OrderedDict()
    for fid, start, end in zip(df['id'], df['start'], df['end']):
        # Everything before the last '_' names the raw video; an id without
        # '_' is its own raw video.
        raw_fid = fid.rsplit('_', 1)[0]
        raw2fid.setdefault(raw_fid, []).append([fid, start, end])

    num_raw = len(raw2fid)
    num_per_shard = math.ceil(num_raw / nshard)
    start_id, end_id = num_per_shard * rank, num_per_shard * (rank + 1)
    fid_info_shard = list(raw2fid.items())[start_id:end_id]
    print(f"Total audios in current shard: {len(fid_info_shard)}/{num_raw}")

    for raw_fid, fid_info in tqdm(fid_info_shard):
        # The context manager removes the scratch directory even when the
        # decoding or the export below raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = os.path.join(tmp_dir, 'tmp.wav')
            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through verbatim.
            cmd = [
                ffmpeg,
                "-i", os.path.join(raw_dir, raw_fid + '.mp4'),
                "-f", "wav", "-vn", "-y", wav_path,
                "-loglevel", "quiet",
            ]
            # The exit status is not checked here; a failed extraction
            # surfaces when the WAV file is loaded on the next line.
            subprocess.call(cmd)
            raw_audio = AudioSegment.from_wav(wav_path)

            for fid, start_sec, end_sec in fid_info:
                start_sec, end_sec = float(start_sec), float(end_sec)
                if end_sec == -1:
                    # -1 is replaced by a 24-hour upper bound; presumably
                    # this means "until the end of the audio" -- confirm
                    # against the CSV producer.
                    end_sec = 24 * 3600
                # The segment is sliced with integer millisecond offsets.
                t1, t2 = int(start_sec * 1000), int(end_sec * 1000)
                new_audio = raw_audio[t1:t2]
                output_path = os.path.join(output_dir, fid + '.wav')
                os.makedirs(os.path.dirname(output_path), exist_ok=True)
                new_audio.export(output_path, format="wav")
    return