in scripts/build_transitions.py [0:0]
def add_blank_grams(pruned_ngrams, num_tokens, blank):
    """Add blank-containing n-grams to a per-order list of pruned n-grams.

    The index ``num_tokens`` is used as the blank token. For every input gram,
    blanks are inserted around its tokens, and every window of up to
    ``len(pruned_ngrams)`` consecutive tokens that contains a blank is added
    (once) to the list for its order.

    Args:
        pruned_ngrams: List indexed by ``order - 1``; entry ``n`` is a list of
            grams of length ``n + 1``. Grams added by this function are
            tuples; input grams only need to support ``len()`` and indexing.
        num_tokens: Index used for the blank token.
        blank: ``"optional"`` to enumerate every combination of blank /
            no-blank at the ``len(gram) + 1`` positions around a gram, or
            ``"forced"`` to put a blank at every position. In ``"forced"``
            mode the input grams of order two and above are dropped from the
            result and only the unigrams are kept.

    Returns:
        The per-order list of grams. In ``"optional"`` mode this is the same
        list object that was passed in, extended in place. In ``"forced"``
        mode it is a new outer list whose first entry is the caller's unigram
        list (also extended in place).

    Raises:
        ValueError: If ``blank`` is neither ``"optional"`` nor ``"forced"``.
            Note that ``"none"`` is rejected here even though the message
            lists it; callers have to skip this function in that case.
    """
    # Validate before touching pruned_ngrams so that a bad mode never leaves
    # the caller's lists half-modified, and is rejected even with no grams.
    if blank not in ("optional", "forced"):
        raise ValueError(
            "Invalid value specificed for blank. Must be in |optional|forced|none|"
        )

    # Snapshot the input grams first: the per-order lists grow while we work.
    all_grams = [gram for grams in pruned_ngrams for gram in grams]
    maxorder = len(pruned_ngrams)

    if blank == "forced":
        # Keep the unigram list, start every higher order from scratch.
        pruned_ngrams = [pruned_ngrams[0] if i == 0 else [] for i in range(maxorder)]

    blank_unigram = (num_tokens,)
    pruned_ngrams[0].append(blank_unigram)
    # Blank-containing grams already emitted, to avoid duplicates.
    blank_grams = {blank_unigram}

    for gram in all_grams:
        # Iterate over all possibilities by using a vector of 0s, 1s to
        # denote whether a blank is being used at each position.
        if blank == "optional":
            # Given a gram ab.. of order n, we have n+1 positions
            # available at which a blank may or may not be used.
            onehot_vectors = itertools.product([0, 1], repeat=len(gram) + 1)
        else:
            # "forced": must include a blank token at every position.
            onehot_vectors = [[1] * (len(gram) + 1)]

        for mask in onehot_vectors:
            new_array = []
            for idx, use_blank in enumerate(mask[:-1]):
                # No blank is placed in front of the start token.
                if use_blank == 1 and gram[idx] != START_IDX:
                    new_array.append(num_tokens)
                new_array.append(gram[idx])
            # No blank is placed after the end token.
            if mask[-1] == 1 and gram[-1] != END_IDX:
                new_array.append(num_tokens)

            # Slide a window of every order over the blank-augmented sequence
            # and keep each new window that contains a blank.
            sequence = tuple(new_array)
            for n in range(maxorder):
                for end in range(n, len(sequence)):
                    cur_gram = sequence[end - n : end + 1]
                    if num_tokens in cur_gram and cur_gram not in blank_grams:
                        pruned_ngrams[n].append(cur_gram)
                        blank_grams.add(cur_gram)

    return pruned_ngrams