in curiosity/reader.py [0:0]
def text_to_instance(self, row: List):
    """Build an ``Instance`` from one ``(fact, paraphrased message)`` row.

    ``row[0]`` is used as the fact text and ``row[1]`` as the paraphrased
    message. The fact is cut to its first 300 characters and, if the result
    is the empty string, replaced by the ``@@NOFACT@@`` placeholder. Both
    texts are tokenized with ``self._tokenizer``, capped at 150 tokens, and
    wrapped in start/end symbol tokens.

    The ``dialog_acts``, ``sender`` and ``metadata`` fields are filled with
    fixed placeholder values; none of them is read from ``row``.
    """

    def wrap_tokens(text):
        # Start symbol, at most 150 tokenizer tokens, then end symbol.
        opening = [Token(START_SYMBOL)]
        clipped = self._tokenizer.tokenize(text)[:150]
        closing = [Token(END_SYMBOL)]
        return opening + clipped + closing

    # Source side: the fact, limited to 300 characters before tokenizing.
    truncated_fact = row[0][:300]
    # A row without any fact text gets a default symbol instead.
    fact_text = "@@NOFACT@@" if truncated_fact == "" else truncated_fact
    source_tokens = wrap_tokens(fact_text)

    # Target side: the paraphrased message.
    target_tokens = wrap_tokens(row[1])

    fields = {}
    fields["source_tokens"] = TextField(source_tokens, self._token_indexers)
    fields["target_tokens"] = TextField(target_tokens, self._token_indexers)
    # Constant dialog-act label for these rows.
    fields["dialog_acts"] = MultiLabelField(
        ["@@NODA@@"], label_namespace="dialog_acts"
    )
    # Constant sender label for these rows.
    fields["sender"] = LabelField("teacher", label_namespace="sender")
    # Both metadata ids are hard-coded to -1.
    fields["metadata"] = MetadataField({"dialog_id": -1, "n_message": -1})
    return Instance(fields)