def read_counts_from_arpa()

in scripts/load_arpa.py [0:0]


def read_counts_from_arpa(arpa_file):
    """Parse an ARPA-style n-gram file into per-order lookup tables.

    The expected layout is: arbitrary preamble lines, a ``\\data\\`` line,
    one ``ngram N=<count>`` line per order (terminated by a blank line),
    then one ``\\N-grams`` section per order. Each entry line is
    ``<prob> <w1> ... <wN> [<backoff>]`` split on whitespace; a section
    ends at a blank line, at ``\\end\\``, or at end of file.

    Args:
        arpa_file: Path of the file to read.

    Returns:
        A ``(counts, vocab)`` tuple.

        ``counts`` is a list with one dict per order (index ``order - 1``).
        Despite the name, each dict maps a tuple of word ids to
        ``(prob, backoff)`` floats as read from the file. When an entry
        carries no backoff field, ``backoff`` is ``0.0`` for every order
        below the highest and ``None`` for the highest order.

        ``vocab`` maps each unigram word to its 0-based position within
        the ``\\1-grams`` section.

    Raises:
        ValueError: If the ``\\data\\`` marker or a declared ``\\N-grams``
            section is never found before end of file.
        AssertionError: If the ``ngram N=`` header lines are not
            consecutive starting at 1, or the number of distinct unigram
            words differs from the declared ``ngram 1`` count.
        KeyError: If a higher-order entry uses a word that did not appear
            in the unigram section.
    """
    # The context manager guarantees the handle is closed on every exit
    # path, including the assertion and lookup failures below.
    with open(arpa_file, "r") as fid:
        # Skip the preamble. iter(readline, "") stops at EOF, so a file
        # without the marker raises instead of spinning forever.
        for line in iter(fid.readline, ""):
            if line.strip() == "\\data\\":
                break
        else:
            raise ValueError(f"no \\data\\ marker found in {arpa_file!r}")

        # First header line must declare the unigram count.
        line = fid.readline()
        assert "ngram 1" in line
        num_words = int(line.strip().split("=")[1])

        # Each further non-blank header line declares the next order.
        lm_order = 1
        while True:
            line = fid.readline().strip()
            if not line:
                break
            lm_order += 1
            assert f"ngram {lm_order}" in line

        counts = []
        vocab = {}
        for cur_order in range(1, lm_order + 1):
            table = {}
            counts.append(table)

            # Advance to this order's section header; fail loudly if the
            # file ends first (e.g. truncated or mis-declared order).
            section = f"\\{cur_order}-grams"
            for line in iter(fid.readline, ""):
                if section in line:
                    break
            else:
                raise ValueError(
                    f"section {section} not found in {arpa_file!r}"
                )

            idx = 0
            while True:
                fields = fid.readline().split()
                # Blank line, EOF (empty read) or the end marker closes
                # the current section.
                if not fields or fields[0] == "\\end\\":
                    break
                if cur_order == 1:
                    # Unigram position defines the word id.
                    vocab[fields[1]] = idx
                key = tuple(vocab[word] for word in fields[1 : cur_order + 1])
                prob = float(fields[0])
                if len(fields) > cur_order + 1:
                    bckoff = float(fields[cur_order + 1])
                else:
                    # Missing backoff: 0.0 below the top order, None at it.
                    bckoff = 0.0 if cur_order < lm_order else None
                table[key] = (prob, bckoff)
                idx += 1

    assert len(vocab) == num_words
    return counts, vocab