in distant_supervision/synthetic_data_creator.py [0:0]
def _obtain_retrieved_sentences_for_single_article(self, *, qpa, es, rng, backfill_article):
    """
    Build at most one dataset sample for a single article by querying Elasticsearch.

    Each filtered sentence of the article (visited in shuffled order) is sent to ES as a query.
    The hits are screened by a series of filters, and the first hit that passes all of them is
    turned into a dataset sample via ``self._construct_dataset_sample``.

    Note: the list obtained from ``qpa.filtered_sents`` is shuffled in place.

    :param qpa: object providing ``filtered_sents``, ``article_phrases``, ``article_id`` and
        ``phrase`` (with ``phrase_str`` and ``phrase_category``)
    :param es: client object exposing ``search(...)``; presumably an Elasticsearch client
    :param rng: object whose ``vanilla`` attribute provides ``shuffle``; it is also forwarded
        to ``self._construct_dataset_sample``
    :param backfill_article: for debugging purpose. The index of backfill
    :return: a list with zero or one element: ``[dataset_sample]`` for the first usable hit,
        or an empty list when no sentence produces a usable hit
    """
    filtered_sentences = qpa.filtered_sents
    rng.vanilla.shuffle(filtered_sentences)

    context_phrase_set = set(qpa.article_phrases)

    for backfill_sent, sent in enumerate(filtered_sentences):
        # Consider adding title to es_query
        es_query = sent.text
        es_query_phrases = set(sent.get_phrases(self.phrase_mode))
        phrase_str = qpa.phrase.phrase_str

        # "should" ranks hits by overlap with the query sentence, "must" requires the phrase
        # string to match, and "must_not" excludes documents from the query's own article.
        request_body = {
            "query": {
                "bool": {
                    "should": {
                        "match": {"body_with_title": es_query},
                    },
                    "must": {
                        "match": {"body_with_title": phrase_str},
                    },
                    "must_not": {
                        "match": {"article_id": str(qpa.article_id)}
                    }
                }
            }
        }

        try:
            # maybe convert to using msearch (multi-search)
            results = es.search(
                index=self.es_conf.index_name,
                doc_type=self.es_conf.doc_type,
                size=NUM_OF_HITS_FROM_ES,  # top-k
                request_timeout=60,
                body=request_body)
        except RequestError as ex:
            # Best effort: report the failed query and move on to the next sentence.
            print('ES RequestError found: {}'.format(str(ex)))
            continue

        for hit_idx, hit in enumerate(results['hits']['hits']):
            retrieved_str = hit['_source']['body']
            hit_article_id = int(hit['_source']['article_id'])

            # TODO: create real hit_phrases
            # hit_phrases is used below with `&` and `in`, so it is presumably a set of
            # (phrase_str, phrase_category) tuples -- confirm against _get_hit_phrases.
            hit_phrases = self._get_hit_phrases(hit)

            if hit_article_id == qpa.article_id:
                # if the hit is from the same article as the query, skip
                # we already check this in ES, but just to be safe
                continue

            if len(hit_phrases) < 2:
                # if number of entities in hit (question) is less than 2, it's likely that it's unanswerable
                # Should have at least 2 entities
                continue

            if (phrase_str, qpa.phrase.phrase_category) not in hit_phrases:
                continue

            if len(hit_phrases & es_query_phrases) < self.nb_aux_qs_matches + 1:
                # needs to +1 to nb_aux_qs_matches because there is already a match for phrase_str
                continue

            if len(hit_phrases & context_phrase_set) < self.nb_aux_awc_matches + self.nb_aux_qs_matches + 1:
                # needs to add self.nb_aux_qs_matches + 1 for both the phrase_str and aux_qs matches
                continue

            if len(retrieved_str.split()) > QUESTION_NUM_WORDS_ULIM:
                # used naive splitting based on spaces
                continue

            nb_entity_occurrences = len(self.text_preprocessor.findall_substr(phrase_str, retrieved_str))
            if nb_entity_occurrences != 1:
                # We already check existence in ES query, but we want only occurrence of *once* here.
                # This is to simplify conversion to question-style
                continue

            if self.text_preprocessor.is_similar(es_query, retrieved_str, 0.95, discard_stopwords=False):
                # recall that retrieved_str (from ES hit) is actually a sentence
                # If the two sentences are too similar, then it's likely a plagiarized sentence
                continue

            if not self.text_preprocessor.is_similar(retrieved_str, es_query, 0.30, discard_stopwords=True):
                # if there is almost no overlap, skip
                continue

            # the "hit" is used to generate the question; stop at the first usable one
            dataset_sample = self._construct_dataset_sample(
                rng=rng,
                qpa=qpa,
                hit=hit,
                hit_phrases=list(hit_phrases),
                es_query=es_query,
                es_rank=hit_idx,
                backfill_article=backfill_article,
                backfill_sent=backfill_sent)

            return [dataset_sample]

    # No sentence produced a usable hit: honour the "zero or one element" contract
    # instead of implicitly returning None.
    return []