in muss/mining/nn_search.py
def get_sentences_from_ids(sentence_ids, sentences_paths):
    def get_sentences_from_ids_single_file(sentence_ids, sentences_path):
        sentences = read_lines(sentences_path)
        try:
            return [sentences[sentence_id] for sentence_id in sentence_ids]
        except IndexError:
            print(
                f'len(sentences)={len(sentences)}, max(sentence_ids)={max(sentence_ids)}, sentences_path={sentences_path}'
            )
            raise
    # Sort the requested ids so they can be bucketed per file in a single pass
    sorted_idx = np.argsort(sentence_ids)
    sentence_ids = sentence_ids[sorted_idx]
    ids_per_file = defaultdict(list)
    # Each file covers the global id range [offset, next_offset) given by its line count
    n_sentences_list = [cached_count_lines(sentences_path) for sentences_path in sentences_paths]
    offsets = np.cumsum([0] + n_sentences_list[:-1])
    next_offsets = np.cumsum(n_sentences_list)
    sentence_ids = np.sort(sentence_ids)
    for offset, next_offset, sentences_path in zip(offsets, next_offsets, sentences_paths):
        # Keep the ids that fall into this file and convert them to file-local indices
        selected_sentence_ids = sentence_ids[(offset <= sentence_ids) & (sentence_ids < next_offset)]
        if len(selected_sentence_ids) > 0:
            selected_sentence_ids -= offset
            ids_per_file[sentences_path].extend(selected_sentence_ids.tolist())
    # ids_per_file was filled in ascending id order and Python dicts are insertion ordered,
    # so concatenating the per-file results yields sentences sorted by global id
    sentences_list = Parallel(n_jobs=10)(
        delayed(get_sentences_from_ids_single_file)(sentence_ids, sentences_path)
        for sentences_path, sentence_ids in tqdm(ids_per_file.items(), desc='Load sentences')
    )
    sentences = [sentence for sentences in sentences_list for sentence in sentences]
    # Restore the order in which the ids were originally requested (inverse permutation of the sort above)
    return [sentences[idx] for idx in np.argsort(sorted_idx)]
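
The sketch below is not part of the original file; it is a minimal, self-contained illustration of the index bookkeeping used above: global ids over several concatenated files are mapped to per-file local ids via cumulative offsets, and the original query order is restored with the inverse permutation np.argsort(sorted_idx). All names and sizes in it are hypothetical.

# --- Illustration only, assuming three shard files with hypothetical line counts ---
import numpy as np

def split_ids_demo():
    line_counts = [3, 5, 4]                        # hypothetical sizes of three shard files
    offsets = np.cumsum([0] + line_counts[:-1])    # [0, 3, 8]
    next_offsets = np.cumsum(line_counts)          # [3, 8, 12]

    query_ids = np.array([9, 1, 4])                # global ids, in arbitrary order
    sorted_idx = np.argsort(query_ids)             # sort once so bucketing is a single pass
    sorted_ids = query_ids[sorted_idx]             # [1, 4, 9]

    for file_index, (offset, next_offset) in enumerate(zip(offsets, next_offsets)):
        in_file = sorted_ids[(offset <= sorted_ids) & (sorted_ids < next_offset)]
        print(f'file {file_index}: local ids {(in_file - offset).tolist()}')

    # Pretend each sorted id was resolved to a sentence, then undo the sort
    resolved_in_sorted_order = [f'sentence_{i}' for i in sorted_ids]
    restored = [resolved_in_sorted_order[i] for i in np.argsort(sorted_idx)]
    print(restored)  # back in query order: sentence_9, sentence_1, sentence_4

split_ids_demo()

The same inverse-permutation trick is what lets the function above parallelize file reads per shard while still returning sentences aligned with the caller's original sentence_ids order.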