def load_words_from_lst()

in recipes/joint_training_vox_populi/prepare_data/make_lexicon.py [0:0]


def load_words_from_lst(path_lst: Path, n_best: int, min_occ: int, is_raw_text: bool):
    """
    Load the most frequent words from an input file, which can be in w2l
    list format or a file with lines of sentences.

    path_lst: input file
    n_best: top n frequent words to keep
    min_occ: minimum number of occurrences of each word
    is_raw_text: the input file only contains lines of text (True);
                the input file is in w2l list format, including utterance ids and audio path (False)

    Returns the set of kept words. Words are ranked by decreasing number of
    occurrences; words with the same count are ranked by the order in which
    they first appear in the file. Only words seen at least min_occ times
    are eligible, and at most the first n_best of the ranking are returned.
    """
    # Imported locally so the function brings its own dependency into scope.
    from collections import Counter

    counts = Counter()

    # Explicit UTF-8 so decoding does not depend on the platform locale
    # (assumes the list / text files are UTF-8 encoded).
    with path_lst.open("r", encoding="utf-8") as file_lst:
        log.info("Building the lexicon")

        # Stream the file instead of materialising every line up front.
        for line in file_lst:
            # split() without arguments already ignores leading/trailing
            # whitespace and yields [] for blank lines.
            fields = line.split()
            if not is_raw_text:
                # w2l list layout: id_ path duration normalized_text
                # -> drop the three metadata columns. Lines with fewer than
                # four fields contribute no words.
                fields = fields[3:]
            counts.update(fields)

    # Counter keeps first-seen order and list.sort is stable (also with
    # reverse=True), so ties are resolved by first appearance in the file.
    ranked = [(word, count) for word, count in counts.items() if count >= min_occ]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return {word for word, _ in ranked[:n_best]}