def select_pars()

in data_creation/select_sentences_tfidf.py [0:0]


def select_pars(qa_dct, docs_list, word_freqs, n_sents=100, n_context=3):
    """Extract the passages of ``docs_list`` most similar to a question.

    Every document is split into sentences, each distinct sentence of at
    least four whitespace-separated tokens is scored against the question
    with ``tf_idf_dist``, and the ``n_sents`` highest-scoring sentences are
    kept.  Each kept sentence is expanded with up to ``n_context``
    neighbouring sentences on either side; windows of the same document
    that overlap or touch are merged into one paragraph.

    Args:
        qa_dct: mapping whose ``['title'][0]`` entry is the question text.
        docs_list: sequence of mappings whose ``['text'][0]`` entry is the
            document text handed to ``sentence_split``.
        word_freqs: mapping with ``'title'`` and ``'doc'`` entries; elements
            ``[0]`` and ``[1]`` of each are forwarded to ``tf_idf_vec``
            (presumably per-word counts and a total -- confirm against
            ``tf_idf_vec``).
        n_sents: maximum number of top-scoring sentences to keep.
        n_context: number of neighbouring sentences added on each side of
            a kept sentence.

    Returns:
        The selected paragraphs as a single string, each paragraph
        introduced by a ``<P>`` marker.  When no sentence is selected the
        result is the bare marker ``'<P>'``.
    """
    question = qa_dct['title'][0]
    split_docs = [sentence_split(doc['text'][0], max_len=64) for doc in docs_list]
    q_ti_dct = tf_idf_vec(question,
                          word_freqs['title'][0],
                          word_freqs['title'][1])

    # Score each distinct sentence once.  Only the first occurrence of a
    # sentence (in document order) is a candidate; a set makes the duplicate
    # check O(1) instead of rescanning all earlier sentences.  Vectors are
    # only built for sentences that survive both filters.
    seen_sentences = set()
    scored = []
    for i, doc in enumerate(split_docs):
        for j, sen in enumerate(doc):
            if sen in seen_sentences:
                continue
            seen_sentences.add(sen)
            if len(sen.split()) < 4:
                continue
            sen_vec = tf_idf_vec(sen,
                                 word_freqs['doc'][0],
                                 word_freqs['doc'][1])
            scored.append((i, j, tf_idf_dist(q_ti_dct, sen_vec)))

    # sorted() is stable, so ties keep document order before the cut-off.
    top_scored = sorted(scored, key=lambda item: item[2], reverse=True)[:n_sents]
    select_ids = sorted((i, j) for i, j, _ in top_scored)

    # Walk the selected sentences in (document, sentence) order and grow
    # paragraphs out of their context windows.
    paragraphs = []
    current = []
    last_doc, last_sent = -1, -1
    for i, j in select_ids:
        new_doc = i > last_doc
        # Close the running paragraph when moving to another document or
        # when this window leaves a gap after the last emitted sentence.
        if new_doc or j - n_context > last_sent + 1:
            paragraphs.append(current)
            current = []
        first = j - n_context
        if not new_doc:
            # Never re-emit sentences already covered by the previous window.
            first = max(first, last_sent + 1)
        start = max(first, 0)
        stop = min(j + n_context + 1, len(split_docs[i]))
        for idx in range(start, stop):
            current.append((i, idx))
            last_doc, last_sent = i, idx

    # The first flush always stores the initial empty paragraph; drop it and
    # add the paragraph still being built.
    paragraphs = paragraphs[1:] + [current]
    par_texts = [' '.join(split_docs[i][j] for i, j in par) for par in paragraphs]
    # The leading empty element makes join() emit a marker before every
    # paragraph; strip() removes the resulting leading space.
    return ' <P> '.join([''] + par_texts).strip()