in data.py
def prepare_for_finetuning(self):
"""
Prepare data for BERT based finetuning
https://github.com/huggingface/pytorch-transformers/tree/master/examples/lm_finetuning
Format:
"The scripts in this folder expect a single file as input, consisting of
untokenized text, with one sentence per line, and one blank line between
documents. The reason for the sentence splitting is that part of BERT's
training involves a next sentence objective in which the model must predict
whether two sequences of text are contiguous text from the same document
or not, and to avoid making the task too easy, the split point between
the sequences is always at the end of a sentence. The linebreaks
in the file are therefore necessary to mark the points where the
text can be split."
:return:
"""
    self.logbook.write_message_logs("Preparing data for finetuning")
    indices = self.train_indices
    if self.args.mode == "test":
        indices = self.test_indices
    # NOTE: `indices` is selected here but is not used below.
    df = self.dialogs["true_response"]
    out_path = "fine_tune_{}_{}.txt".format(self.args.data_name, self.args.mode)
    with open(out_path, "w") as fp:
        for dialog_id in df["dialog_id"].unique():
            # The row with the highest context_id holds the longest
            # context for this dialog, i.e. the full conversation so far.
            last_turn = (
                df[df.dialog_id == dialog_id]
                .sort_values(by=["context_id"], ascending=False)
                .iloc[0]
            )
            context = last_turn["context"].split("\n")
            response = last_turn["true_response"]
            dialog = context + [response]
            for utt in dialog:
                # Strip BERT special tokens before sentence splitting.
                utt = utt.replace("[CLS] ", "").replace(" [SEP]", "")
                # `sent_tokenize` is NLTK's sentence splitter
                # (from nltk.tokenize import sent_tokenize).
                for sent in sent_tokenize(utt):
                    fp.write(sent + "\n")
            # Blank line marks the end of a document (one dialog = one doc).
            fp.write("\n")
    self.logbook.write_message_logs("Done")
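
# --- Sketch, not part of the original file: reading the format back. ---
# A minimal, hedged example of how the file written above can be consumed:
# documents are recovered by splitting on blank lines, mirroring the format
# quoted in the docstring. `_read_finetuning_file` and its `path` parameter
# are hypothetical names introduced here for illustration only.
def _read_finetuning_file(self, path):
    """Yield one document (a list of sentences) per blank-line block."""
    doc = []
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if line:
                doc.append(line)
            elif doc:
                # Blank line: the current document is complete.
                yield doc
                doc = []
    if doc:  # defensive: file may not end with a blank line
        yield doc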