def count()

in recipes/sota/2019/lm_analysis/filter_segmentations.py [0:0]


def _bump(counter, index):
    """Increment ``counter[index]``, zero-padding the list if it is too short."""
    shortfall = index + 1 - len(counter)
    if shortfall > 0:
        counter.extend([0] * shortfall)
    counter[index] += 1


def _chunk_word_counts(alignments, min_sil_length):
    """Return the number of words in each chunk of one utterance.

    Each alignment entry is whitespace-separated; field 3 is parsed as the
    token length and field 4 is the token itself.  A ``"$"`` token whose
    length exceeds ``min_sil_length`` closes the current chunk; shorter
    ``"$"`` tokens are ignored.  Every other token counts as one word.
    """
    chunk_sizes = []
    n_pending = 0

    # The first alignment entry is always ignored.
    for alignment in alignments[1:]:
        fields = alignment.split()
        length = float(fields[3])
        word = fields[4]

        if word != "$":
            n_pending += 1
        elif length > min_sil_length:
            # NOTE: a chunk is recorded even when no word precedes the
            # silence, so zero-word chunks are possible.
            chunk_sizes.append(n_pending)
            n_pending = 0

    # Words after the last long "$" token form the final chunk.
    if n_pending:
        chunk_sizes.append(n_pending)
    return chunk_sizes


def count(MIN_SIL_LENGTH, align_file):
    """Collect chunk statistics from an alignment file and filter its lines.

    Each non-blank line of ``align_file`` is expected to look like
    ``<name>\\t<alignments>``, where the alignments are separated by the
    two-character sequence backslash + ``n``.  Every line is split into chunks
    at ``"$"`` tokens longer than ``MIN_SIL_LENGTH``.

    Args:
        MIN_SIL_LENGTH: a ``"$"`` token must be strictly longer than this
            value to end a chunk.
        align_file: path of the alignment file to read.

    Returns:
        A tuple ``(res, stat, good_samples)``:

        * ``res["word_counter"][k]`` is the number of chunks with ``k`` words
          and ``res["chunk_counter"][k]`` the number of lines with ``k``
          chunks.  Both lists have at least 100 entries and grow when a
          larger index is needed.
        * ``stat`` maps a chunk count to the word counts of all chunks found
          in lines having that many chunks.
        * ``good_samples`` holds the raw lines that have at least two chunks
          and no chunk with more than six words.

    The number of good samples is also printed.
    """
    res = {
        "word_counter": [0] * 100,  # number of words in each small chunk
        "chunk_counter": [0] * 100,  # number of small chunks per audio
    }
    stat = defaultdict(list)
    good_samples = []

    with open(align_file) as fin:
        for line in fin:
            # Blank lines (e.g. a trailing newline) carry no alignment.
            if not line.strip():
                continue

            columns = line.split("\t")
            alignments = columns[1].strip().split("\\n")
            chunk_sizes = _chunk_word_counts(alignments, MIN_SIL_LENGTH)

            n_chunk = len(chunk_sizes)
            _bump(res["chunk_counter"], n_chunk)
            for n_word in chunk_sizes:
                _bump(res["word_counter"], n_word)
                stat[n_chunk].append(n_word)

            # Keep lines with at least two chunks, each of at most six words.
            if n_chunk >= 2 and all(n_word <= 6 for n_word in chunk_sizes):
                good_samples.append(line)

    print(len(good_samples))
    return res, stat, good_samples