# def _obtain_retrieved_sentences_for_single_article()
#
# in distant_supervision/synthetic_data_creator.py [0:0]

    def _obtain_retrieved_sentences_for_single_article(self, *, qpa, es, rng, backfill_article):
        """
        Retrieve, via Elasticsearch, a sentence similar to one of this article's
        filtered sentences and build a dataset sample from the first acceptable hit.

        Iterates over the article's filtered sentences in random order, issues one
        ES query per sentence, and applies a cascade of quality filters to every
        hit. The first hit that survives all filters is converted into a dataset
        sample and returned immediately.

        :param qpa: question/phrase/article bundle; provides the candidate
            sentences (``filtered_sents``), the target phrase, the article id,
            and the context phrases
        :param es: Elasticsearch client used for the retrieval query
        :param rng: RNG bundle; ``rng.vanilla`` is used to shuffle the
            candidate sentences
        :param backfill_article: for debugging purpose. The index of backfill
        :return: a list with zero or one element — ``[dataset_sample]`` for the
            first acceptable hit, or ``[]`` if no hit passed all filters
        """

        # Copy before shuffling: the original shuffled qpa.filtered_sents in
        # place, silently reordering the caller's list as a side effect.
        filtered_sentences = list(qpa.filtered_sents)
        rng.vanilla.shuffle(filtered_sentences)

        context_phrase_set = set(qpa.article_phrases)

        for backfill_sent, sent in enumerate(filtered_sentences):
            # Consider adding title to es_query
            es_query = sent.text

            es_query_phrases = set(sent.get_phrases(self.phrase_mode))

            # bool query:
            #   should   — boost lexical overlap with the query sentence
            #   must     — the hit must contain the target phrase
            #   must_not — exclude hits from the query article itself
            request_body = {
                "query": {
                    "bool": {
                        "should": {
                            "match": {"body_with_title": es_query},
                        },
                        "must": {
                            "match": {"body_with_title": qpa.phrase.phrase_str},
                        },
                        "must_not": {
                            "match": {"article_id": str(qpa.article_id)}
                        }
                    }
                }
            }

            try:
                # maybe convert to using msearch (multi-search)
                results = es.search(
                    index=self.es_conf.index_name,
                    doc_type=self.es_conf.doc_type,
                    size=NUM_OF_HITS_FROM_ES,  # top-k
                    request_timeout=60,
                    body=request_body)
            except RequestError as ex:
                # Best-effort: a malformed/failed query skips this sentence
                # rather than aborting the whole article.
                print('ES RequestError found: {}'.format(str(ex)))
                continue

            for hit_idx, hit in enumerate(results['hits']['hits']):
                retrieved_str = hit['_source']['body']
                hit_article_id = int(hit['_source']['article_id'])

                # TODO add create real hit_phrases
                hit_phrases = self._get_hit_phrases(hit)

                if hit_article_id == qpa.article_id:
                    # if the hit is from the same article as the query, skip
                    # we already check this in ES, but just to be safe
                    continue

                if len(hit_phrases) < 2:
                    # if number of entities in hit (question) is less than 2, it's likely that it's unanswerable
                    # Should have at least 2 entities
                    continue

                if (qpa.phrase.phrase_str, qpa.phrase.phrase_category) not in hit_phrases:
                    # the target phrase (with its category) must appear in the hit
                    continue

                if len(hit_phrases & es_query_phrases) < self.nb_aux_qs_matches + 1:
                    # needs to +1 to nb_aux_qs_matches because there is already a match for phrase_str
                    continue

                if len(hit_phrases & context_phrase_set) < self.nb_aux_awc_matches + self.nb_aux_qs_matches + 1:
                    # needs to add self.nb_aux_qs_matches + 1 for both the phrase_str and aux_qs matches
                    continue

                if len(retrieved_str.split()) > QUESTION_NUM_WORDS_ULIM:
                    # used naive splitting based on spaces
                    continue

                nb_entity_occurrences = len(self.text_preprocessor.findall_substr(qpa.phrase.phrase_str, retrieved_str))
                if nb_entity_occurrences != 1:
                    # We already check existence in ES query, but we want only occurrence of *once* here.
                    # This is to simplify conversion to question-style
                    continue

                if self.text_preprocessor.is_similar(es_query, retrieved_str, 0.95, discard_stopwords=False):
                    # recall that retrieved_str (from ES hit) is actually a sentence
                    # If the two sentence are too similar, then it's likely a plagiarized sentence
                    continue

                if not self.text_preprocessor.is_similar(retrieved_str, es_query, 0.30, discard_stopwords=True):
                    # if there is almost no overlap, skip
                    continue

                # the "hit" is used to generate the question
                dataset_sample = self._construct_dataset_sample(
                    rng=rng,
                    qpa=qpa,
                    hit=hit,
                    hit_phrases=list(hit_phrases),
                    es_query=es_query,
                    es_rank=hit_idx,
                    backfill_article=backfill_article,
                    backfill_sent=backfill_sent)

                return [dataset_sample]

        # No candidate sentence produced an acceptable hit. The original fell
        # through and implicitly returned None, contradicting the documented
        # "list with zero or one element" contract; return [] instead.
        return []