in mdr/qa/qa_dataset.py [0:0]
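# The function below relies on _is_whitespace(), which this excerpt does not show;
# qa_dataset.py defines it elsewhere. A minimal sketch of the conventional
# SQuAD-style check (an assumption here; the module's own definition may differ):
def _is_whitespace(c):
    # treat spaces, tabs, newlines, carriage returns and the narrow
    # no-break space (U+202F) as whitespace
    return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F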
def prepare(item, tokenizer, special_toks=["[SEP]", "[unused1]", "[unused2]"]):
    """
    Tokenize the passage chain and add sentence-start markers ([unused1])
    for supporting-sentence (SP) identification.
    """
    def _process_p(para):
        """
        Process a single passage: prefix each sentence with [unused1]
        and prepend the passage title.
        """
        title, sents = para["title"].strip(), para["sents"]
        # return "[unused1] " + title + " [unused1] " + text # mark title
        # return title + " " + text
        pre_sents = []
        for sent in sents:
            pre_sents.append("[unused1] " + sent.strip())
        return title + " " + " ".join(pre_sents)
        # return " ".join(pre_sents)

    # mark passage boundaries with [SEP]
    contexts = []
    for para in item["passages"]:
        contexts.append(_process_p(para))
    context = " [SEP] ".join(contexts)

    # split the context into whitespace-delimited words and record, for every
    # character, which word it belongs to (char_to_word_offset)
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    # prepend "yes no" so yes/no answers can be extracted as spans
    context = "yes no [SEP] " + context
    for c in context:
        if _is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    # subword-tokenize each word while keeping word <-> subword alignments;
    # special tokens are kept intact, and each [unused1] position is recorded
    # as a sentence start
    sent_starts = []
    orig_to_tok_index = []
    tok_to_orig_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        if token in special_toks:
            if token == "[unused1]":
                sent_starts.append(len(all_doc_tokens))
            sub_tokens = [token]
        else:
            sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    item["context_processed"] = {
        "doc_tokens": doc_tokens,
        "char_to_word_offset": char_to_word_offset,
        "orig_to_tok_index": orig_to_tok_index,
        "tok_to_orig_index": tok_to_orig_index,
        "all_doc_tokens": all_doc_tokens,
        "context": context,
        "sent_starts": sent_starts,
    }
    return item
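# A minimal usage sketch (an illustration, not part of qa_dataset.py): it assumes
# the mdr package is importable and that a BERT-style vocabulary supplies the
# [SEP]/[unused1] special tokens; the example item below is made up.
from transformers import AutoTokenizer

from mdr.qa.qa_dataset import prepare

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
item = {
    "passages": [
        {"title": "Passage A", "sents": ["First sentence.", "Second sentence."]},
        {"title": "Passage B", "sents": ["Another sentence."]},
    ]
}
processed = prepare(item, tokenizer)["context_processed"]
print(processed["sent_starts"])          # subword positions of each [unused1] marker
print(processed["all_doc_tokens"][:12])  # "yes", "no", "[SEP]", title subwords, ...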