in data.py
def prepare_for_finetuning(self):
"""
Prepare data for BERT based finetuning
https://github.com/huggingface/pytorch-transformers/tree/master/examples/lm_finetuning
Format:
"The scripts in this folder expect a single file as input, consisting of
untokenized text, with one sentence per line, and one blank line between
documents. The reason for the sentence splitting is that part of BERT's
training involves a next sentence objective in which the model must predict
whether two sequences of text are contiguous text from the same document
or not, and to avoid making the task too easy, the split point between
the sequences is always at the end of a sentence. The linebreaks
in the file are therefore necessary to mark the points where the
text can be split."
:return:
"""
    self.logbook.write_message_logs("Preparing data for finetuning")
    indices = self.train_indices
    if self.args.mode == "test":
        indices = self.test_indices
    # NOTE: `indices` is selected here but is not used below.
    df = self.dialogs["true_response"]
    out_path = "fine_tune_{}_{}.txt".format(self.args.data_name, self.args.mode)
    with open(out_path, "w") as fp:
        for dialog_id in df["dialog_id"].unique():
            # The row with the highest context_id holds the longest
            # context for this dialog, i.e. the full conversation so far.
            last_turn = (
                df[df.dialog_id == dialog_id]
                .sort_values(by=["context_id"], ascending=False)
                .iloc[0]
            )
            context = last_turn["context"].split("\n")
            response = last_turn["true_response"]
            dialog = context + [response]
            for utt in dialog:
                # Strip BERT special tokens before sentence splitting.
                utt = utt.replace("[CLS] ", "").replace(" [SEP]", "")
                # `sent_tokenize` is NLTK's sentence splitter
                # (from nltk.tokenize import sent_tokenize).
                for sent in sent_tokenize(utt):
                    fp.write(sent + "\n")
            # Blank line marks the end of a document (one dialog = one doc).
            fp.write("\n")
    self.logbook.write_message_logs("Done")
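
# --- Sketch, not part of the original file: reading the format back. ---
# A minimal, hedged example of how the file written above can be consumed:
# documents are recovered by splitting on blank lines, mirroring the format
# quoted in the docstring. `_read_finetuning_file` and its `path` parameter
# are hypothetical names introduced here for illustration only.
def _read_finetuning_file(self, path):
    """Yield one document (a list of sentences) per blank-line block."""
    doc = []
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if line:
                doc.append(line)
            elif doc:
                # Blank line: the current document is complete.
                yield doc
                doc = []
    if doc:  # defensive: file may not end with a blank line
        yield doc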