in data_creation/select_sentences_tfidf.py [0:0]
def select_pars(qa_dct, docs_list, word_freqs, n_sents=100, n_context=3):
    """Select the sentences most similar to a question and merge them into paragraphs.

    Each document in *docs_list* is split into sentences, every sentence is
    scored by TF-IDF similarity to the question, and the top *n_sents*
    sentences are kept. Selected sentences are expanded with *n_context*
    neighboring sentences on each side; overlapping or adjacent windows
    within the same document are merged into a single paragraph.

    Args:
        qa_dct: question record; the question text is read from
            ``qa_dct['title'][0]`` (schema assumed from usage — confirm
            against caller).
        docs_list: iterable of document records; text is read from
            ``doc['text'][0]``.
        word_freqs: dict with ``'title'`` and ``'doc'`` entries, each a pair
            of frequency tables passed to ``tf_idf_vec`` (presumably
            (doc-freq, total) — verify against ``tf_idf_vec``).
        n_sents: number of top-scoring sentences to keep.
        n_context: number of neighboring sentences included on each side of
            a selected sentence.

    Returns:
        A single string of the merged paragraphs, each prefixed by ' <P> '.
    """
    question = qa_dct['title'][0]
    split_docs = [sentence_split(doc['text'][0], max_len=64) for doc in docs_list]
    q_ti_dct = tf_idf_vec(question,
                          word_freqs['title'][0],
                          word_freqs['title'][1])
    # Flatten to (doc_index, sent_index, sentence, tf-idf vector) tuples.
    split_docs_pre = [(i, j, sen, tf_idf_vec(sen,
                                             word_freqs['doc'][0],
                                             word_freqs['doc'][1]))
                      for i, doc in enumerate(split_docs) for j, sen in enumerate(doc)]
    # Score each sentence against the question, skipping very short sentences
    # (< 4 words) and exact duplicates of any EARLIER sentence. A seen-set
    # replaces the original O(n^2) "not in list-of-prior-sentences" scan;
    # note `seen` is updated unconditionally so that even sentences filtered
    # out by the length check still suppress later duplicates, matching the
    # original semantics exactly.
    split_docs_sc = []
    seen = set()
    for i, j, sen, dct in split_docs_pre:
        if len(sen.split()) >= 4 and sen not in seen:
            split_docs_sc.append((i, j, tf_idf_dist(q_ti_dct, dct)))
        seen.add(sen)
    # Keep the n_sents highest-scoring sentences, then re-sort by position.
    split_docs_sort = sorted(split_docs_sc, key=lambda x: x[-1], reverse=True)[:n_sents]
    select_ids = sorted([(i, j) for i, j, _ in split_docs_sort])
    # Group the selected sentence positions into paragraphs: each selected
    # sentence pulls in n_context neighbors on each side, and windows that
    # touch or overlap within the same document are merged. `last_seen`
    # tracks the last (doc, sentence) index already emitted.
    par_ids = []
    this_par = []
    last_seen = (-1, -1)
    for i, j in select_ids:
        if i > last_seen[0]:
            # New document: close the current paragraph and start fresh.
            par_ids += [this_par]
            this_par = []
            for k in range(-n_context, n_context + 1):
                if j + k >= 0 and j + k < len(split_docs[i]):
                    this_par += [(i, j + k)]
                    last_seen = (i, j + k)
        else:
            # Same document: start a new paragraph only if this window does
            # not touch the previous one; otherwise extend it, skipping
            # indices already emitted.
            if j - n_context > last_seen[1] + 1:
                par_ids += [this_par]
                this_par = []
            for k in range(-n_context, n_context + 1):
                if j + k > last_seen[1] and j + k >= 0 and j + k < len(split_docs[i]):
                    this_par += [(i, j + k)]
                    last_seen = (i, j + k)
    # Drop the empty paragraph seeded before the first document and flush
    # the final in-progress paragraph.
    par_ids = par_ids[1:] + [this_par]
    extract_doc = ' <P> '.join([''] + [' '.join([split_docs[i][j] for i, j in par]) for par in par_ids]).strip()
    return extract_doc