in scripts/load_arpa.py [0:0]
def read_counts_from_arpa(arpa_file):
    r"""Parse an ARPA-style n-gram language model file.

    The file is expected to contain a ``\data\`` marker followed by one
    ``ngram N=...`` line per order (terminated by a blank line), and then
    one ``\N-grams`` section per declared order. Each entry line is
    whitespace-separated: a float score, ``N`` words, and an optional
    trailing float back-off weight.

    Args:
        arpa_file: Path of the file to read.

    Returns:
        A ``(counts, vocab)`` tuple.

        ``counts`` is a list with one dict per order (index ``N - 1`` holds
        the ``N``-grams). Each dict maps a tuple of vocabulary indices to a
        ``(prob, bckoff)`` pair. Despite the name, the values are the float
        scores read from the file, not raw counts. When an entry has no
        back-off field, ``bckoff`` is ``0.0`` for orders below the highest
        one and ``None`` for the highest order.

        ``vocab`` maps each word of the 1-grams section to its position in
        that section.

    Raises:
        ValueError: If the ``\data\`` marker or a declared ``\N-grams``
            section is never found before end of file.
        AssertionError: If the ``ngram N`` header lines are not consecutive
            starting at 1, or the number of distinct unigrams differs from
            the count declared in the header.
        KeyError: If a higher-order entry uses a word that is absent from
            the 1-grams section.
    """
    # The context manager guarantees the handle is released even when one of
    # the checks below fails.
    with open(arpa_file, "r") as fid:
        # Skip any preamble up to the "\data\" marker. readline() returns ""
        # only at end of file, so that case must be detected explicitly or
        # the scan would never terminate.
        while True:
            line = fid.readline()
            if not line:
                raise ValueError(f"no \\data\\ marker found in {arpa_file!r}")
            if line.strip() == "\\data\\":
                break

        # First header line carries the declared vocabulary size.
        line = fid.readline()
        assert "ngram 1" in line
        num_words = int(line.strip().split("=")[1])

        # Remaining header lines, one per additional order, end at the first
        # blank line (or end of file).
        lm_order = 1
        while True:
            line = fid.readline().strip()
            if not line:
                break
            lm_order += 1
            assert f"ngram {lm_order}" in line

        counts = []
        vocab = {}
        for cur_order in range(1, lm_order + 1):
            ngrams = {}
            counts.append(ngrams)

            # Advance to this order's section marker, failing cleanly if the
            # file ends before the section shows up.
            marker = f"\\{cur_order}-grams"
            while True:
                line = fid.readline()
                if not line:
                    raise ValueError(
                        f"no {marker} section found in {arpa_file!r}"
                    )
                if marker in line:
                    break

            idx = 0
            while True:
                fields = fid.readline().split()
                # A blank line (or EOF) or the "\end\" marker closes the
                # section.
                if not fields or fields[0] == "\\end\\":
                    break
                if cur_order == 1:
                    # Unigrams define the vocabulary, indexed by position.
                    vocab[fields[1]] = idx
                key = tuple(vocab[word] for word in fields[1 : cur_order + 1])
                prob = float(fields[0])
                if len(fields) > cur_order + 1:
                    bckoff = float(fields[cur_order + 1])
                else:
                    # Missing back-off: 0.0 below the top order, None at the
                    # top order.
                    bckoff = 0.0 if cur_order < lm_order else None
                ngrams[key] = (prob, bckoff)
                idx += 1

    assert len(vocab) == num_words
    return counts, vocab