in curiosity/reader.py [0:0]
def text_to_instance(self, msg: Dict, d: Dict):
    """Build a single ``Instance`` from one message of a dialog.

    The facts marked as used become the source sequence and the message
    text becomes the target sequence; dialog acts, the sender and a small
    metadata record are attached as additional fields.

    Args:
        msg: One message record. The keys read here are ``"facts"`` (an
            iterable of records with ``"fid"`` and ``"used"``),
            ``"message"``, ``"dialog_acts"`` and ``"sender"``.
        d: The dialog the message belongs to. The keys read here are
            ``"dialog_id"`` and ``"messages"``.

    Returns:
        An ``Instance`` with the fields ``source_tokens``,
        ``target_tokens``, ``dialog_acts``, ``sender`` and ``metadata``.
    """

    def _wrap_and_tokenize(text):
        # Surround the text with the start/end symbols, tokenize it, and
        # keep at most the first 150 tokens. The cap is applied after
        # wrapping, so a long text may lose its end symbol.
        wrapped = f"{START_SYMBOL} {text} {END_SYMBOL}"
        return self._tokenizer.tokenize(wrapped)[:150]

    # --- Source side: the facts that were used for this message ---
    used_fact_texts = []
    for fact in msg["facts"]:
        if fact["used"]:
            # Each fact text is capped at 300 characters.
            used_fact_texts.append(self._fact_lookup[fact["fid"]].text[:300])
    # Fall back to a placeholder when the joined fact text is empty.
    source_text = " ".join(used_fact_texts) or "@@NOFACT@@"
    source_tokens = _wrap_and_tokenize(source_text)

    # --- Target side: the message itself ---
    target_text = msg["message"]
    if target_text == "":
        # Only the empty string is replaced by the placeholder.
        target_text = "@@NOMESSAGE@@"
    target_tokens = _wrap_and_tokenize(target_text)

    # --- Dialog acts: a missing (None) annotation gets a placeholder label ---
    acts = msg["dialog_acts"]
    if acts is None:
        acts = ["@@NODA@@"]

    # --- Sender: anything that is not USER is labelled "teacher" ---
    if msg["sender"] == USER:
        sender_label = "user"
    else:
        sender_label = "teacher"

    fields = {
        "source_tokens": TextField(source_tokens, self._token_indexers),
        "target_tokens": TextField(target_tokens, self._token_indexers),
        "dialog_acts": MultiLabelField(acts, label_namespace=DIALOG_ACT_LABELS),
        "sender": LabelField(sender_label, label_namespace="sender"),
        "metadata": MetadataField(
            {
                "dialog_id": d["dialog_id"],
                "n_message": len(d["messages"]),
            }
        ),
    }
    return Instance(fields)