in scripts/distant/generate.py [0:0]
def find_answer(paragraph, q_tokens, answer, opts):
    """Return the best matching answer offsets from a paragraph.

    The paragraph is skipped (implicitly returns None) if:
    * It is too long or short.
    * It doesn't contain the answer at all.
    * It doesn't contain named entities found in the question.
    * The answer context match score is too low.
      - This is the unigram + bigram overlap within +/- window_sz.

    Args:
        paragraph: Raw paragraph text to search.
        q_tokens: Tuple of (tokenized question object, iterable of NLTK
            NER strings for the question).
        answer: List of acceptable answer strings; when opts['regex'] is
            truthy, answer[0] is treated as a regex pattern instead.
        opts: Dict with keys 'char_max', 'char_min', 'regex',
            'window_sz', and 'match_threshold'.

    Returns:
        Tuple of (best_score, example_dict) when the best window overlap
        score is >= opts['match_threshold']; otherwise None.
    """
    # Length check
    if len(paragraph) > opts['char_max'] or len(paragraph) < opts['char_min']:
        return
    # Answer check
    if opts['regex']:
        # Add group around the whole answer so findall yields the full match
        answer = '(%s)' % answer[0]
        # Regex flags are bitmasks: combine with `|`, not `+`
        ans_regex = re.compile(answer, flags=re.IGNORECASE | re.UNICODE)
        answers = ans_regex.findall(paragraph)
        # findall returns tuples when the pattern has multiple groups;
        # keep only the whole-answer group in that case.
        answers = {a[0] if isinstance(a, tuple) else a for a in answers}
        answers = {a.strip() for a in answers if len(a.strip()) > 0}
    else:
        answers = {a for a in answer if a in paragraph}
    if not answers:
        return
    # Entity check. Default tokenizer + NLTK to minimize falling through cracks
    q_tokens, q_nltk_ner = q_tokens
    for ne in q_tokens.entity_groups():
        if ne[0] not in paragraph:
            return
    for ne in q_nltk_ner:
        if ne not in paragraph:
            return
    # Search: score every occurrence of every candidate answer by its
    # unigram + bigram overlap with the question, within a local window.
    p_tokens = tokenize_text(paragraph)
    p_words = p_tokens.words(uncased=True)
    q_grams = Counter(q_tokens.ngrams(
        n=2, uncased=True, filter_fn=utils.filter_ngram
    ))
    best_score = 0
    best_ex = None
    for ans in answers:
        try:
            a_words = tokenize_text(ans).words(uncased=True)
        except RuntimeError:
            # logger.warn is deprecated; use warning() with lazy %-args
            logger.warning('Failed to tokenize answer: %s', ans)
            continue
        n_ans = len(a_words)  # loop-invariant: hoist out of the scan below
        for idx in range(len(p_words)):
            if p_words[idx:idx + n_ans] == a_words:
                # Overlap check within +/- window_sz tokens of the match
                w_s = max(idx - opts['window_sz'], 0)
                w_e = min(idx + opts['window_sz'] + n_ans, len(p_words))
                w_tokens = p_tokens.slice(w_s, w_e)
                w_grams = Counter(w_tokens.ngrams(
                    n=2, uncased=True, filter_fn=utils.filter_ngram
                ))
                # Multiset intersection counts shared (bi)grams
                score = sum((w_grams & q_grams).values())
                if score > best_score:
                    # Success! Set new score + formatted example
                    best_score = score
                    best_ex = {
                        'id': uuid.uuid4().hex,
                        'question': q_tokens.words(),
                        'document': p_tokens.words(),
                        'offsets': p_tokens.offsets(),
                        # Inclusive token span of the matched answer
                        'answers': [(idx, idx + n_ans - 1)],
                        'qlemma': q_tokens.lemmas(),
                        'lemma': p_tokens.lemmas(),
                        'pos': p_tokens.pos(),
                        'ner': p_tokens.entities(),
                    }
    # NOTE(review): with match_threshold <= 0 this can return (0, None)
    # even when no window ever matched — presumably the threshold is
    # always positive in practice; verify against callers.
    if best_score >= opts['match_threshold']:
        return best_score, best_ex