def _get_entity2qpa_list()

in distant_supervision/entity_to_queries_mapper.py [0:0]


    def _get_entity2qpa_list(self, article, ner_broadcast):
        """Build per-entity ``QueriesPerArticleObj`` records for one article.

        Only the sentences returned by ``_get_valid_context_sentences`` are
        used, both as the article context and as the pool of query sentences.
        Candidate ``(phrase_str, phrase_category)`` pairs are the phrases found
        in those sentences that are also present in ``ner_broadcast.value``.
        The candidates are shuffled and truncated to
        ``NUM_ENTITIES_PER_ARTICLE_TO_CONSIDER``; each one is then paired with
        up to ``NUM_OF_SENTENCES_ULIM`` randomly chosen sentences that contain
        it and have at most ``NUM_WORDS_IN_QUERY_SENTENCE_ULIM``
        whitespace-separated words. Candidates with no such sentence are
        skipped, and processing stops once
        ``NUM_ENTITIES_PER_ARTICLE_TO_KEEP`` records have been produced.

        Args:
            article: Object exposing ``id`` and ``title``; it is also passed
                to ``_get_valid_context_sentences``.
            ner_broadcast: Object whose ``value`` is intersected (``&``) with
                the set of candidate pairs, so it must be set-like.
                Presumably a Spark broadcast variable -- confirm with caller.

        Returns:
            A list of ``((phrase_str, phrase_category), [qpa])`` tuples, where
            ``qpa`` is the ``QueriesPerArticleObj`` built for that phrase.
        """
        rng = utils.RandomNumberGenerator()

        # good_sents is contiguous
        good_sents = self._get_valid_context_sentences(article, rng)

        # only use the first N sentences, instead of using article.text
        article_raw = ' '.join(sent.text for sent in good_sents)

        article_phrases = self._get_all_phrases_from_sentence_list(good_sents)

        # Extract each sentence's phrases exactly once and keep them as a set.
        # The same result is needed both to collect the candidates and, later,
        # to test every candidate against every sentence; re-extracting inside
        # the candidate loop would repeat identical work per candidate.
        # Assumes get_phrases() has no side effects (the repeated calls it
        # replaces relied on the same property).
        phrase_mode = self.phrase_mode
        phrases_by_sent = [
            (sent, set(sent.get_phrases(phrase_mode))) for sent in good_sents
        ]

        candidate_phrase_pairs = set().union(
            *(sent_phrases for _, sent_phrases in phrases_by_sent))

        # only keep ones that are in NER list
        # NOTE(review): the order of list(<set>) depends on element hashing,
        # so the shuffle below is only reproducible across processes if string
        # hashing is fixed (e.g. PYTHONHASHSEED) -- confirm if that matters.
        candidate_phrase_pairs = list(candidate_phrase_pairs & ner_broadcast.value)

        rng.vanilla.shuffle(candidate_phrase_pairs)
        candidate_phrase_pairs = candidate_phrase_pairs[:NUM_ENTITIES_PER_ARTICLE_TO_CONSIDER]

        # Keep only sentences that have at most X words (others are likely an
        # error). This does not depend on the candidate, so filter once here.
        # Order of good_sents is preserved.
        query_sents = [
            (sent, sent_phrases)
            for sent, sent_phrases in phrases_by_sent
            if len(sent.text.split()) <= NUM_WORDS_IN_QUERY_SENTENCE_ULIM
        ]

        result_lst = []
        for phrase_str, phrase_category in candidate_phrase_pairs:
            phrase_pair = (phrase_str, phrase_category)

            # filtered sentences where the "answer" string is in there.
            filtered_sents = [
                sent for sent, sent_phrases in query_sents
                if phrase_pair in sent_phrases
            ]

            if not filtered_sents:
                continue

            phrase = PhraseObj(phrase_str, phrase_category)

            rng.vanilla.shuffle(filtered_sents)
            filtered_sents = filtered_sents[:NUM_OF_SENTENCES_ULIM]  # only randomly take this many sentences

            qpa = QueriesPerArticleObj(
                article_id=article.id,
                article_title=article.title,
                article_raw=article_raw,  # do not use article.text here. We only use first/some N sentences
                article_phrases=article_phrases,
                filtered_sents=filtered_sents,  # filtered sentences where the "answer" string is in there
                phrase=phrase)
            result_lst.append(((phrase.phrase_str, phrase.phrase_category), [qpa]))

            if len(result_lst) >= NUM_ENTITIES_PER_ARTICLE_TO_KEEP:
                break
        return result_lst