in ocr/utils/denoiser_utils.py [0:0]
def generate_sequences(self, inputs, begin_states, sentence):
    samples, scores, valid_lengths = self.sampler(inputs, begin_states)
    samples = samples[0].asnumpy()
    scores = scores[0].asnumpy()
    valid_lengths = valid_lengths[0].asnumpy()
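    # The sampler returns candidate token-id sequences, their scores and the
    # valid length of each candidate; only the first batch element is used below.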
    max_score = -10e20
    # Heuristic #1
    # If the sentence is correct, let's not try to change it
    sentence_tokenized = [i.replace("&quot;", '"').replace("&apos;", "'").replace("&amp;", "&") for i in self.tokenizer(sentence)]
    sentence_correct = True
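    # Vocabulary indices are assumed to be frequency-ranked, so an index above
    # 400000 marks a rare word that is likely an OCR artefact; common contractions
    # are whitelisted because the tokenizer may emit them in a form the
    # vocabulary does not contain.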
    for token in sentence_tokenized:
        if (token not in self.vocab or self.vocab[token] > 400000) and token.lower() not in ["don't", "doesn't", "can't", "won't", "ain't", "couldn't", "i'd", "you'd", "he's", "she's", "it's", "i've", "you've", "she'd"]:
            sentence_correct = False
            break
    if sentence_correct:
        return sentence
    # Heuristic #2
    # We want the sentences that have the most in-vocabulary words,
    # and we penalize sentences that have out-of-vocabulary words
    # that do not start with a capital letter
    for i, sample in enumerate(samples):
        tokens = decode_char(sample[:valid_lengths[i]])
        tokens = [i.replace("&quot;", '"').replace("&apos;", "'").replace("&amp;", "&") for i in self.tokenizer(tokens)]
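        # A candidate loses one point for every token that is neither a frequent
        # in-vocabulary word nor title-cased (title case is taken as a sign of a
        # proper noun, hence the "boosting names" note below).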
        score = 0
        for t in tokens:
            # Boosting names
            if (t in self.vocab and self.vocab[t] < 450000) or (len(t) > 0 and t.istitle()):
                score += 0
            else:
                score -= 1
        score -= 0
        if score == max_score:
            max_score = score
            best_tokens.append(tokens)
        elif score > max_score:
            max_score = score
            best_tokens = [tokens]
    # Heuristic #3
    # Smallest edit distance:
    # we then take the sentence with the lowest edit distance
    # from the predicted original sentence
    best_dist = 1000
    output_tokens = best_tokens[0]
    best_tokens_ = []
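    # Ties on the in-vocabulary score are broken by string distance to the raw
    # OCR output, which keeps the correction as close as possible to what was read.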
    for tokens in best_tokens:
        dist = leven.levenshtein(sentence, ''.join(self.detokenizer(tokens)))
        if dist < best_dist:
            best_dist = dist
            best_tokens_ = [tokens]
        elif dist == best_dist:
            best_tokens_.append(tokens)
    # Heuristic #4
    # We take the sentence with the smallest number of tokens
    # to avoid splitting up compound words
    min_len = 10e20
    for tokens in best_tokens_:
        if len(tokens) < min_len:
            min_len = len(tokens)
            best_tokens__ = [tokens]
        elif len(tokens) == min_len:
            best_tokens__.append(tokens)
    # Heuristic #5
    # Lowest perplexity:
    # if we still have ties, we take the sentence with the lowest
    # perplexity score according to the language model
    best_ppl = 10e20
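    # Perplexity here is the exponential of the average negative log-probability
    # the language model assigns to each token given the preceding ones.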
    for tokens in best_tokens__:
        if len(tokens) > 1:
            inputs = self.vocab[tokens]
            hidden = self.language_model.begin_state(batch_size=1, func=mx.nd.zeros, ctx=self.ctx_nlp)
            output, _ = self.language_model(mx.nd.array(inputs).expand_dims(axis=1).as_in_context(self.ctx_nlp), hidden)
            output = output.softmax()
            l = 0
            for i in range(1, len(inputs)):
                l += -output[i-1][0][inputs[i]].log()
            ppl = (l/len(inputs)).exp()
            if ppl < best_ppl:
                output_tokens = tokens
                best_ppl = ppl
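    # If no candidate had more than one token, output_tokens keeps the fallback
    # best_tokens[0] assigned under heuristic #3.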
    output = ''.join(self.detokenizer(output_tokens))
    # Heuristic #6
    # Sometimes there are artefacts at the end of the corrected sentence;
    # we cut off the end of the sentence
    if len(output) > len(sentence) + 10:
        output = output[:len(sentence) + 2]
    return output
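
# Usage sketch (hedged): the enclosing class is assumed to hold `sampler`,
# `language_model`, `vocab`, `ctx_nlp`, `tokenizer` and `detokenizer`; the class
# name and the `encode` helper below are illustrative, not taken from this file.
#
#     generator = SequenceGenerator(...)
#     inputs, begin_states = encode(noisy_sentence)
#     denoised = generator.generate_sequences(inputs, begin_states, noisy_sentence)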