# get_sentences_from_ids()
#
# From muss/mining/nn_search.py


def get_sentences_from_ids(sentence_ids, sentences_paths, n_jobs=10):
    """Fetch the sentences corresponding to global sentence ids spread over multiple files.

    Args:
        sentence_ids: 1-D array-like of int ids indexing into the virtual
            concatenation (one sentence per line) of all files in ``sentences_paths``.
        sentences_paths: list of text file paths, one sentence per line each.
        n_jobs: number of parallel workers used to load the files (default 10,
            matching the previous hard-coded value).

    Returns:
        list of str: the sentences, in the same order as ``sentence_ids``.
    """

    def get_sentences_from_ids_single_file(sentence_ids, sentences_path):
        # Read the whole file once, then pick out the requested (file-local) ids.
        sentences = read_lines(sentences_path)
        try:
            return [sentences[sentence_id] for sentence_id in sentence_ids]
        except IndexError:
            # Print enough context to diagnose an out-of-range id before re-raising.
            print(
                f'len(sentences)={len(sentences)}, max(sentence_ids)={max(sentence_ids)}, sentences_path={sentences_path}'
            )
            raise

    # Sort the ids so that each file's ids form one contiguous run; keep the
    # permutation so the results can be put back in the caller's order at the end.
    # np.asarray accepts plain lists as well as numpy arrays.
    sorted_idx = np.argsort(sentence_ids)
    sentence_ids = np.asarray(sentence_ids)[sorted_idx]
    ids_per_file = defaultdict(list)
    # File i holds the global id range [offsets[i], next_offsets[i]).
    n_sentences_list = [cached_count_lines(sentences_path) for sentences_path in sentences_paths]
    offsets = np.cumsum([0] + n_sentences_list[:-1])
    next_offsets = np.cumsum(n_sentences_list)
    for offset, next_offset, sentences_path in zip(offsets, next_offsets, sentences_paths):
        selected_sentence_ids = sentence_ids[(offset <= sentence_ids) & (sentence_ids < next_offset)]
        if len(selected_sentence_ids) > 0:
            # Convert global ids to file-local line indexes (boolean indexing
            # returned a copy, so the in-place subtraction is safe).
            selected_sentence_ids -= offset
            ids_per_file[sentences_path].extend(selected_sentence_ids.tolist())
    # The per-file results come back in ascending global-id order because
    # python dicts are insertion ordered and the ids were sorted above.
    sentences_list = Parallel(n_jobs=n_jobs)(
        delayed(get_sentences_from_ids_single_file)(sentence_ids, sentences_path)
        for sentences_path, sentence_ids in tqdm(ids_per_file.items(), desc='Load sentences')
    )
    sentences = [sentence for sentences in sentences_list for sentence in sentences]
    # argsort of the permutation is its inverse: restore the caller's original order.
    return [sentences[idx] for idx in np.argsort(sorted_idx)]