in recipes/joint_training_vox_populi/prepare_data/make_lexicon.py [0:0]
def load_words_from_lst(
    path_lst: Path, n_best: int, min_occ: int, is_raw_text: bool
) -> set:
    """
    Build a vocabulary of the most frequent words found in an input file.

    The input can be in w2l list format or a file with lines of sentences.

    path_lst: input file (read as UTF-8 text)
    n_best: top n frequent words to keep
    min_occ: minimum number of occurrences of each word
    is_raw_text: the input file only contains lines of text (True);
        the input file is in w2l list format, including utterance ids and
        audio path, so the first three whitespace-separated fields of each
        line are ignored (False)

    Returns the set of words that occur at least `min_occ` times, limited to
    the `n_best` most frequent ones. Words with equal counts are ranked in
    the order they are first seen in the file.
    """
    # Local import: keeps this function self-contained with respect to the
    # module's import block.
    from collections import Counter

    # w2l list lines look like: id_ path duration normalized_text
    n_skipped_fields = 0 if is_raw_text else 3

    counts = Counter()
    # Explicit encoding so the result does not depend on the platform locale.
    with path_lst.open("r", encoding="utf-8") as file_lst:
        log.info("Building the lexicon")
        # Stream the file; split() already discards surrounding whitespace
        # and yields nothing for blank or too-short lines.
        for line in file_lst:
            counts.update(line.split()[n_skipped_fields:])

    # most_common() sorts by decreasing count and keeps first-seen order
    # among ties, so filtering afterwards preserves the ranking.
    frequent = [word for word, occ in counts.most_common() if occ >= min_occ]
    return set(frequent[:n_best])