in distant_supervision/entity_to_queries_mapper.py [0:0]
def _get_entity2qpa_list(self, article, ner_broadcast):
    """Build per-entity query candidates for a single article.

    Collects the phrases found in the article's valid context sentences,
    keeps those that also appear in ``ner_broadcast.value``, randomly
    samples up to ``NUM_ENTITIES_PER_ARTICLE_TO_CONSIDER`` of them and, for
    each sampled phrase, wraps the sentences that mention it into a
    ``QueriesPerArticleObj``.

    Args:
        article: Article object. Its ``id`` and ``title`` are read here and
            it is passed on to ``_get_valid_context_sentences``.
        ner_broadcast: Object whose ``value`` is intersected (``&``) with the
            set of candidate ``(phrase_str, phrase_category)`` pairs
            (presumably a set/frozenset of such pairs -- verify against
            the caller).

    Returns:
        A list of ``((phrase_str, phrase_category), [qpa])`` tuples holding
        at most ``NUM_ENTITIES_PER_ARTICLE_TO_KEEP`` entries.
    """
    rng = utils.RandomNumberGenerator()

    # good_sents is contiguous
    good_sents = self._get_valid_context_sentences(article, rng)

    # only use the first N sentences, instead of using article.text
    article_raw = ' '.join([sent.text for sent in good_sents])
    article_phrases = self._get_all_phrases_from_sentence_list(good_sents)

    candidate_phrase_pairs = set()
    for sent in good_sents:
        candidate_phrase_pairs.update(sent.get_phrases(self.phrase_mode))

    # only keep ones that are in NER list
    # NOTE(review): the order of list(set) depends on element hashes, so the
    # shuffle below is only reproducible across processes if hashing is
    # (e.g. a fixed PYTHONHASHSEED) -- confirm if determinism matters.
    candidate_phrase_pairs = list(candidate_phrase_pairs & ner_broadcast.value)
    rng.vanilla.shuffle(candidate_phrase_pairs)
    candidate_phrase_pairs = candidate_phrase_pairs[:NUM_ENTITIES_PER_ARTICLE_TO_CONSIDER]

    # A sentence's phrases and its word count do not depend on the candidate
    # phrase, so compute them once here instead of once per candidate.
    # Keep only sentences with at most X words (others are likely an error).
    # Order of good_sents is preserved so the later shuffle sees the same
    # input ordering as before.
    eligible_sents = [
        (sent, frozenset(sent.get_phrases(self.phrase_mode)))
        for sent in good_sents
        if len(sent.text.split()) <= NUM_WORDS_IN_QUERY_SENTENCE_ULIM
    ]

    result_lst = []
    for phrase_str, phrase_category in candidate_phrase_pairs:
        # filtered sentences where the "answer" string is in there.
        filtered_sents = [
            sent for sent, sent_phrases in eligible_sents
            if (phrase_str, phrase_category) in sent_phrases
        ]
        if not filtered_sents:
            continue

        phrase = PhraseObj(phrase_str, phrase_category)

        # only randomly take this many sentences
        rng.vanilla.shuffle(filtered_sents)
        filtered_sents = filtered_sents[:NUM_OF_SENTENCES_ULIM]

        qpa = QueriesPerArticleObj(
            article_id=article.id,
            article_title=article.title,
            article_raw=article_raw,  # do not use article.text here. We only use first/some N sentences
            article_phrases=article_phrases,
            filtered_sents=filtered_sents,  # filtered sentences where the "answer" string is in there
            phrase=phrase)
        result_lst.append(((phrase.phrase_str, phrase.phrase_category), [qpa]))

        # Stop as soon as enough entities have been collected for this article.
        if len(result_lst) >= NUM_ENTITIES_PER_ARTICLE_TO_KEEP:
            break

    return result_lst