in filtering/deduplication/dedup_oscar.py [0:0]
def get_segmentation(text, passage_tokens, overlap_tokens):
    """Split ``text`` into overlapping passages and return their offsets.

    Passage boundaries come from ``find_whitespace(text)``.
    NOTE(review): that helper is defined elsewhere; it presumably yields the
    indices of whitespace characters in ``text`` in increasing order, so that
    each "token" is a whitespace-delimited run -- confirm against the helper.

    Args:
        text: The text to segment; only ``len(text)`` and the boundary
            indices are used here.
        passage_tokens: Number of tokens in each passage; must be positive.
        overlap_tokens: Number of tokens shared between consecutive
            passages; must be smaller than ``passage_tokens`` so that every
            passage advances by at least one token.

    Returns:
        A list of ``(start, end)`` offset tuples into ``text``. The final
        passage always ends at ``len(text)`` and may hold fewer than
        ``passage_tokens`` tokens.

    Raises:
        ValueError: If ``passage_tokens`` is not positive, or if
            ``overlap_tokens`` is not smaller than ``passage_tokens``.
    """
    if passage_tokens <= 0:
        raise ValueError(
            f"passage_tokens must be positive, got {passage_tokens}"
        )
    # Tokens by which each passage advances past the previous one. A zero
    # stride makes range() fail with an unhelpful message, and a negative one
    # yields an empty range (silently returning no passages), so reject both.
    stride = passage_tokens - overlap_tokens
    if stride <= 0:
        raise ValueError(
            "overlap_tokens must be smaller than passage_tokens, got "
            f"overlap_tokens={overlap_tokens}, passage_tokens={passage_tokens}"
        )

    # The leading -1 acts as a boundary just before the text, so that
    # ``boundaries[0] + 1`` is offset 0 (the start of the first token).
    boundaries = [-1]
    boundaries.extend(find_whitespace(text))
    boundary_count = len(boundaries)

    passages = []
    for first in range(0, boundary_count, stride):
        start = boundaries[first] + 1
        last = first + passage_tokens
        if last >= boundary_count:
            # Not enough boundaries left for a full passage: take everything
            # up to the end of the text and stop.
            passages.append((start, len(text)))
            break
        # The end offset is one past the closing boundary, so the delimiter at
        # that boundary is included in the passage.
        passages.append((start, boundaries[last] + 1))
    return passages