in distant_supervision/entity_to_queries_mapper.py [0:0]
def _get_valid_context_sentences(self, article, rng):
    """
    Pick a randomly positioned run of consecutive sentences from ``article``.

    A start index is drawn with ``rng.vanilla.randint(0, upper)``, where
    ``upper`` is whatever ``self._largest_index_exceeding_ulim_context``
    returns for the per-sentence word counts. Sentences are then collected
    from that index onward; collection stops right after the sentence that
    brings the running word total to ``CONTEXT_NUM_WORDS_ULIM`` or more, or
    when the article runs out of sentences.

    Worked example (sentence index -> word count)::

        0 -> 10
        1 -> 5
        2 -> 6
        3 -> 7

    With ``CONTEXT_NUM_WORDS_ULIM = 10`` the largest inclusive start index
    should be 2, so the start is drawn via ``randint(0, 2)``.
    NOTE(review): this presumes the helper returns the last start index
    from which the remaining sentences still reach the limit, and that
    ``randint`` includes both bounds -- confirm against their definitions.

    :param article: object exposing ``sents``, a sized, indexable sequence
        of items that each carry a ``text`` string.
    :param rng: object whose ``vanilla`` attribute provides ``randint``.
    :return: list of the selected sentence objects, in article order.
    """
    sentences = article.sents

    # Whitespace splitting only approximates the real word count, because
    # punctuation stays attached to the neighbouring tokens.
    words_per_sentence = [len(sentence.text.split()) for sentence in sentences]
    assert len(words_per_sentence) == len(sentences)

    upper_start = self._largest_index_exceeding_ulim_context(words_per_sentence)
    cursor = rng.vanilla.randint(0, upper_start)

    selected = []
    running_total = 0
    sentence_count = len(sentences)
    while cursor < sentence_count:
        selected.append(sentences[cursor])
        running_total += words_per_sentence[cursor]
        if running_total >= CONTEXT_NUM_WORDS_ULIM:
            # The sentence that crosses the limit is kept; nothing after it.
            break
        cursor += 1
    return selected