in datasets/spd_datasets.py [0:0]
def get_ami_files(path_to_ami, setup="only_words", hm_type="ihm"):
    """Collect aligned audio and RTTM file paths for the AMI corpus.

    For every subset the RTTM files are globbed from
    ``<path_to_ami>/AMI-diarization-setup/<setup>/rttms/<split>/*.rttm``,
    where the "validation" subset reads the ``dev`` split directory. The
    meeting name is the RTTM file name up to its first dot, and the matching
    audio file is looked up at
    ``<path_to_ami>/AMI-diarization-setup/pyannote/amicorpus/<meeting>/audio/``.
    RTTM files whose audio file does not exist are dropped, so both returned
    lists of a subset have the same length and correspond index by index.

    Args:
        path_to_ami (str): Directory containing the ``AMI-diarization-setup``
            folder. It is treated literally (glob metacharacters are escaped).
        setup (str): Either "only_words" or "mini"; selects the RTTM folder.
        hm_type (str): "ihm" selects ``<meeting>.Mix-Headset.wav``, "sdm"
            selects ``<meeting>.Array1-01.wav``.

    Returns:
        tuple[dict, dict]: ``(audio_files, rttm_files)``. Both dicts have the
        keys "train", "validation" and "test", each mapping to a list of
        paths sorted by RTTM path.

    Raises:
        AssertionError: If ``setup`` or ``hm_type`` is not a supported value.
    """
    audio_suffix_by_type = {"ihm": "Mix-Headset", "sdm": "Array1-01"}
    split_dir_by_subset = {"train": "train", "validation": "dev", "test": "test"}

    # Raised explicitly (rather than via `assert`) so the validation survives
    # `python -O`, while keeping the exception type callers already see.
    if setup not in ("only_words", "mini"):
        raise AssertionError(
            "setup must be 'only_words' or 'mini', got {!r}".format(setup)
        )
    if hm_type not in audio_suffix_by_type:
        raise AssertionError(
            "hm_type must be 'ihm' or 'sdm', got {!r}".format(hm_type)
        )
    audio_suffix = audio_suffix_by_type[hm_type]

    setup_root = path_to_ami + "/AMI-diarization-setup"
    # Only the user-supplied root is escaped; the trailing "*.rttm" must stay
    # a live wildcard.
    escaped_root = glob.escape(path_to_ami) + "/AMI-diarization-setup"

    audio_files = {}
    rttm_files = {}
    for subset, split_dir in split_dir_by_subset.items():
        pattern = "{}/{}/rttms/{}/*.rttm".format(escaped_root, setup, split_dir)
        kept_audio = []
        kept_rttm = []
        # glob returns matches in arbitrary order; sort for reproducibility.
        for rttm in sorted(glob.glob(pattern)):
            # basename handles both "/" and the platform separator that glob
            # uses when joining the directory and the matched file name.
            meeting = os.path.basename(rttm).split(".")[0]
            path = "{}/pyannote/amicorpus/{}/audio/{}.{}.wav".format(
                setup_root, meeting, meeting, audio_suffix
            )
            # Keep the pair only when the audio exists, so lists stay aligned.
            if os.path.exists(path):
                kept_audio.append(path)
                kept_rttm.append(rttm)
        audio_files[subset] = kept_audio
        rttm_files[subset] = kept_rttm
    return audio_files, rttm_files