in torchaudio/datasets/cmudict.py [0:0]
def _parse_dictionary(lines: Iterable[str], exclude_punctuations: bool) -> List[Tuple[str, List[str]]]:
    """Parse CMUDict-style text lines into ``(word, phonemes)`` entries.

    Every data line is expected to contain a word, followed by whitespace,
    followed by its whitespace-separated phonemes. Blank lines and comment
    lines (starting with ``;;;``) are skipped.

    Args:
        lines: Lines of the dictionary text, with or without trailing newlines.
        exclude_punctuations: If ``True``, entries whose word is listed in the
            module-level ``_PUNCTUATIONS`` are dropped. If ``False``, they are
            kept and the word is reduced to the punctuation symbol itself
            (e.g. ``!EXCLAMATION-POINT`` becomes ``!``).

    Returns:
        A list of ``(word, phonemes)`` tuples in input order. Alternative
        pronunciations (``WORD(1)``, ``WORD(2)``, ...) produce separate entries
        that all carry the plain ``WORD``.

    Raises:
        ValueError: If a data line does not contain both a word and phonemes.
    """
    _alt_re = re.compile(r"\([0-9]+\)")
    cmudict: List[Tuple[str, List[str]]] = []
    for line in lines:
        entry = line.strip()
        # Skip blank / whitespace-only lines and comments.
        if not entry or entry.startswith(";;;"):
            continue

        # Split only once, on any run of whitespace: the first field is the
        # word, the remainder is the phoneme sequence. This works regardless
        # of how many spaces separate the word from its pronunciation.
        fields = entry.split(maxsplit=1)
        if len(fields) != 2:
            raise ValueError(f"Malformed dictionary entry (expected '<word> <phonemes>'): {line!r}")
        word, phones = fields

        if word in _PUNCTUATIONS:
            if exclude_punctuations:
                continue
            # !EXCLAMATION-POINT -> !
            # --DASH -> --
            # ...ELLIPSIS -> ...
            if word.startswith("..."):
                word = "..."
            elif word.startswith("--"):
                word = "--"
            else:
                word = word[0]

        # If a word has multiple pronunciations, "(number)" is appended to it,
        # for example DATAPOINTS and DATAPOINTS(1). `_alt_re` removes the
        # "(1)" so that DATAPOINTS(1) becomes DATAPOINTS.
        word = _alt_re.sub("", word)
        cmudict.append((word, phones.split()))

    return cmudict