in scripts/distant/generate.py [0:0]
def find_answer(paragraph, q_tokens, answer, opts):
    """Return the best matching answer offsets from a paragraph.

    The paragraph is skipped (implicitly returns None) if:
    * It is too long or short.
    * It doesn't contain the answer at all.
    * It doesn't contain named entities found in the question.
    * The answer context match score is too low.
      - This is the unigram + bigram overlap within +/- window_sz.

    Args:
        paragraph: Raw paragraph text to search.
        q_tokens: Tuple of (tokenized question object, iterable of NLTK
            NER strings for the question).
        answer: List of acceptable answer strings; when opts['regex'] is
            truthy, answer[0] is treated as a regex pattern instead.
        opts: Dict with keys 'char_max', 'char_min', 'regex',
            'window_sz', and 'match_threshold'.

    Returns:
        Tuple of (best_score, example_dict) when the best window overlap
        score is >= opts['match_threshold']; otherwise None.
    """
    # Length check
    if len(paragraph) > opts['char_max'] or len(paragraph) < opts['char_min']:
        return
    # Answer check
    if opts['regex']:
        # Add group around the whole answer so findall yields the full match
        answer = '(%s)' % answer[0]
        # Regex flags are bitmasks: combine with `|`, not `+`
        ans_regex = re.compile(answer, flags=re.IGNORECASE | re.UNICODE)
        answers = ans_regex.findall(paragraph)
        # findall returns tuples when the pattern has multiple groups;
        # keep only the whole-answer group in that case.
        answers = {a[0] if isinstance(a, tuple) else a for a in answers}
        answers = {a.strip() for a in answers if len(a.strip()) > 0}
    else:
        answers = {a for a in answer if a in paragraph}
    if not answers:
        return
    # Entity check. Default tokenizer + NLTK to minimize falling through cracks
    q_tokens, q_nltk_ner = q_tokens
    for ne in q_tokens.entity_groups():
        if ne[0] not in paragraph:
            return
    for ne in q_nltk_ner:
        if ne not in paragraph:
            return
    # Search: score every occurrence of every candidate answer by its
    # unigram + bigram overlap with the question, within a local window.
    p_tokens = tokenize_text(paragraph)
    p_words = p_tokens.words(uncased=True)
    q_grams = Counter(q_tokens.ngrams(
        n=2, uncased=True, filter_fn=utils.filter_ngram
    ))
    best_score = 0
    best_ex = None
    for ans in answers:
        try:
            a_words = tokenize_text(ans).words(uncased=True)
        except RuntimeError:
            # logger.warn is deprecated; use warning() with lazy %-args
            logger.warning('Failed to tokenize answer: %s', ans)
            continue
        n_ans = len(a_words)  # loop-invariant: hoist out of the scan below
        for idx in range(len(p_words)):
            if p_words[idx:idx + n_ans] == a_words:
                # Overlap check within +/- window_sz tokens of the match
                w_s = max(idx - opts['window_sz'], 0)
                w_e = min(idx + opts['window_sz'] + n_ans, len(p_words))
                w_tokens = p_tokens.slice(w_s, w_e)
                w_grams = Counter(w_tokens.ngrams(
                    n=2, uncased=True, filter_fn=utils.filter_ngram
                ))
                # Multiset intersection counts shared (bi)grams
                score = sum((w_grams & q_grams).values())
                if score > best_score:
                    # Success! Set new score + formatted example
                    best_score = score
                    best_ex = {
                        'id': uuid.uuid4().hex,
                        'question': q_tokens.words(),
                        'document': p_tokens.words(),
                        'offsets': p_tokens.offsets(),
                        # Inclusive token span of the matched answer
                        'answers': [(idx, idx + n_ans - 1)],
                        'qlemma': q_tokens.lemmas(),
                        'lemma': p_tokens.lemmas(),
                        'pos': p_tokens.pos(),
                        'ner': p_tokens.entities(),
                    }
    # NOTE(review): with match_threshold <= 0 this can return (0, None)
    # even when no window ever matched — presumably the threshold is
    # always positive in practice; verify against callers.
    if best_score >= opts['match_threshold']:
        return best_score, best_ex