in metaicl/data.py
def tensorize(self, _train_data, _test_data, options=None,
              add_newlines=True):
    # When a shared option set is given, each test example is expected to be
    # a raw input string; wrap it into a dict carrying those options.
    if options is not None:
        assert np.all([dp["output"] in options for dp in _train_data])
        for i, dp in enumerate(_test_data):
            assert type(dp)==str, ("Raw test inputs should be strings", dp)
            _test_data[i] = {"input": dp, "options": options}
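            # e.g. the raw input "great movie" with options
            # ["positive", "negative"] becomes
            # {"input": "great movie", "options": ["positive", "negative"]}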
    train_data, test_data = [], []
    if self.use_demonstrations:
        for dp in _train_data:
            assert type(dp)==dict, ("Each example should be a dictionary", dp)
            assert "input" in dp and "output" in dp, \
                ("Training example should contain input and output", dp)
            train_data.append(dp.copy())
    for dp in _test_data:
        assert type(dp)==dict, ("Each example should be a dictionary", dp)
        assert "input" in dp and "options" in dp and type(dp["options"])==list, \
            ("Test example should contain input and options in a list format", dp)
        if "output" not in dp:
            # arbitrary placeholder (the first option); the gold label is not
            # needed when we only score the options
            dp["output"] = dp["options"][0]
        test_data.append(dp.copy())
    # each datapoint: input (e.g. passage + question), options, output
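    # e.g. {"input": "great movie", "options": ["positive", "negative"],
    #       "output": "positive"}   (illustrative values)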
    bos_token_id = self.tokenizer.bos_token_id
    eos_token_id = self.tokenizer.eos_token_id

    # flat accumulators over all (test example x option) sequences
    input_ids, attention_mask, token_type_ids = [], [], []
    metadata = []
    if self.use_demonstrations:
        assert len(train_data)==self.k
        # tokenize the k demonstrations once, into a single flat list of
        # token ids that is prepended to every test sequence below
        demonstrations = []
        for i, dp in enumerate(train_data):
            input_, output_ = self._prepro_each_datapoint(
                dp, is_first=i==0, for_demonstrations=True,
                add_newlines=add_newlines)
            demonstrations += input_ + output_
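    # with add_newlines=True the demonstrations decode roughly to
    # "input1\noutput1\n\n\ninput2\noutput2..."; the exact formatting is
    # handled by _prepro_each_datapoint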
    for dp_idx, dp in enumerate(test_data):
        # one (input, output) candidate pair per answer option
        inputs, outputs, answer = self._prepro_each_datapoint(
            dp, is_first=not self.use_demonstrations, add_newlines=add_newlines)

        # remember which rows of the final tensors belong to this test example
        indices = [[i] for i in range(len(input_ids), len(input_ids)+len(inputs))]
        metadata.append({"indices": indices, "answer": answer,
                         "options": dp["options"]})

        for inputs_, outputs_ in zip(inputs, outputs):
            if self.use_demonstrations:
                inputs_ = demonstrations + inputs_

            # returns (input_ids, attention_mask, token_type_ids), each padded
            # to self.max_length; truncation is only allowed when prepended
            # demonstrations can overflow the context window
            encoded = prepro_sentence_pair_single(
                inputs_, outputs_, self.max_length, bos_token_id, eos_token_id,
                allow_truncation=self.use_demonstrations)

            input_ids.append(encoded[0])
            attention_mask.append(encoded[1])
            token_type_ids.append(encoded[2])
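    # every row is padded to self.max_length, so the lists stack into
    # rectangular LongTensors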
    self.tensorized_inputs = dict(input_ids=torch.LongTensor(input_ids),
                                  attention_mask=torch.LongTensor(attention_mask),
                                  token_type_ids=torch.LongTensor(token_type_ids))
    self.metadata = metadata
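
A minimal usage sketch for classification-style inference follows. MetaICLData is the class this method belongs to in metaicl/data.py; the constructor arguments, tokenizer choice, and example data below are illustrative assumptions, not the repository's exact runner.

# Hedged sketch: assumes a MetaICLData-like object exposing tokenizer,
# use_demonstrations, k, and max_length (constructor args are assumptions).
from transformers import AutoTokenizer
from metaicl.data import MetaICLData

tokenizer = AutoTokenizer.from_pretrained("gpt2")
data = MetaICLData(tokenizer=tokenizer, use_demonstrations=True,
                   k=2, max_length=1024)

train_data = [{"input": "A gripping, well-acted thriller.", "output": "positive"},
              {"input": "Two hours I will never get back.", "output": "negative"}]
test_inputs = ["great movie"]  # raw strings, since shared options are passed

data.tensorize(train_data, test_inputs, options=["positive", "negative"])

# One row per (test example x option): 1 example x 2 options = 2 rows.
# metadata[j]["indices"] maps test example j back to its rows, so a scorer
# can compute a per-option loss and pick the argmin as the prediction.
print(data.tensorized_inputs["input_ids"].shape)  # torch.Size([2, 1024])
print(data.metadata[0]["indices"])                # [[0], [1]]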