def _get_valid_context_sentences()

in distant_supervision/entity_to_queries_mapper.py [0:0]


    def _get_valid_context_sentences(self, article, rng):
        """
        Select a run of consecutive sentences from ``article`` to use as context.

        A start index is drawn with ``rng.vanilla.randint(0, k)``, where ``k``
        is the value returned by ``self._largest_index_exceeding_ulim_context``
        for the per-sentence word counts. Sentences are then collected from
        that start index onward until the accumulated word count reaches
        ``CONTEXT_NUM_WORDS_ULIM`` or the article runs out of sentences.

        Worked example (sentence index -> word count)::

            0  10
            1  5
            2  6
            3  7

        With ``CONTEXT_NUM_WORDS_ULIM = 10`` the helper is expected to return
        2 (starting at index 3 would leave only 7 words), so the start index
        is drawn via ``randint(0, 2)``.

        :param article: object exposing ``sents``, an indexable, sized
            collection of sentences that each have a ``text`` string
        :param rng: object whose ``vanilla`` attribute provides ``randint``
            (presumably a ``random.Random``, i.e. both bounds inclusive --
            confirm against the caller)
        :return: list of the selected sentence objects, in article order
        """
        # Whitespace splitting only approximates the real word count, since
        # punctuation stays glued to the neighbouring token.
        words_per_sentence = [len(sentence.text.split()) for sentence in article.sents]

        assert len(words_per_sentence) == len(article.sents)

        last_allowed_start = self._largest_index_exceeding_ulim_context(words_per_sentence)
        start = rng.vanilla.randint(0, last_allowed_start)

        selected = []
        words_so_far = 0
        # Walk the counts from the chosen start; the index is kept so the
        # matching sentence object can be fetched from the article.
        for position, n_words in enumerate(words_per_sentence[start:], start=start):
            selected.append(article.sents[position])
            words_so_far += n_words
            if words_so_far >= CONTEXT_NUM_WORDS_ULIM:
                # The sentence that crosses the limit is still included.
                break

        return selected