def prepare()

in mdr/qa/qa_dataset.py


def prepare(item, tokenizer, special_toks=["[SEP]", "[unused1]", "[unused2]"]):
    """
    tokenize the passages chains, add sentence start markers for SP sentence identification
    """
    def _process_p(para):
        """
        handle each para
        """
        title, sents = para["title"].strip(), para["sents"]
        # return "[unused1] " + title + " [unused1] " + text # mark title
        # return title + " " + text
        pre_sents = []
        for idx, sent in enumerate(sents):
            pre_sents.append("[unused1] " + sent.strip())
        return title + " " + " ".join(pre_sents)
        # return " ".join(pre_sents)
    # join the processed passages with [SEP] to mark passage boundaries
    contexts = [_process_p(para) for para in item["passages"]]
    context = " [SEP] ".join(contexts)

    # prepend "yes no" so that yes/no answers can be predicted as extractive spans
    context = "yes no [SEP] " + context

    # whitespace-tokenize the context, mapping every character to its word index
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True

    for c in context:
        if _is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)
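    # _is_whitespace is the SQuAD-style whitespace helper defined elsewhere in
    # this module. For a context like "yes no [SEP] ..." the loop above yields
    #   doc_tokens          = ["yes", "no", "[SEP]", ...]
    #   char_to_word_offset = [0, 0, 0, 0, 1, 1, 1, 2, ...]
    # (trailing whitespace maps to the index of the preceding word)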

    # wordpiece-tokenize each word, keeping index maps in both directions and
    # recording the subtoken position of every [unused1] sentence-start marker
    sent_starts = []
    orig_to_tok_index = []
    tok_to_orig_index = []
    all_doc_tokens = []
    for i, token in enumerate(doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))

        if token in special_toks:
            # keep special tokens intact; each [unused1] opens a new sentence
            if token == "[unused1]":
                sent_starts.append(len(all_doc_tokens))

            sub_tokens = [token]
        else:
            sub_tokens = tokenizer.tokenize(token)

        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)
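    # e.g. with a BERT wordpiece tokenizer, doc_tokens ["yes", "no", "[SEP]", "Derrickson"]
    # might become all_doc_tokens ["yes", "no", "[SEP]", "der", "##rick", "##son"], with
    # tok_to_orig_index [0, 1, 2, 3, 3, 3] and orig_to_tok_index [0, 1, 2, 3]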

    item["context_processed"] = {
        "doc_tokens": doc_tokens,
        "char_to_word_offset": char_to_word_offset,
        "orig_to_tok_index": orig_to_tok_index,
        "tok_to_orig_index": tok_to_orig_index,
        "all_doc_tokens": all_doc_tokens,
        "context": context,
        "sent_starts": sent_starts
    }

    return item
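
A minimal usage sketch (the input item below is hypothetical; it assumes the
mdr package is importable and a HuggingFace BertTokenizer, whose
bert-base-uncased vocabulary already contains [SEP], [unused1], and [unused2]):

from transformers import BertTokenizer

from mdr.qa.qa_dataset import prepare

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

item = {
    "passages": [
        {"title": "Alan Turing", "sents": ["Alan Turing was a mathematician.",
                                           "He worked on cryptanalysis."]},
        {"title": "Enigma", "sents": ["Enigma was a cipher machine."]},
    ]
}

item = prepare(item, tokenizer)
proc = item["context_processed"]
print(proc["context"])      # "yes no [SEP] Alan Turing [unused1] Alan Turing was ..."
print(proc["sent_starts"])  # wordpiece indices of the three [unused1] markers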