in code/train_deploy.py [0:0]
def _get_train_data_loader(batch_size, training_dir, is_distributed):
    """Build the training DataLoader from ``train.csv`` in *training_dir*.

    Args:
        batch_size (int): mini-batch size for the returned DataLoader.
        training_dir (str): directory containing ``train.csv`` with
            ``sentence`` and ``label`` columns.
        is_distributed (bool): when True, shard samples across workers with
            a ``DistributedSampler``; otherwise use ``RandomSampler``.

    Returns:
        torch.utils.data.DataLoader yielding
        ``(input_ids, attention_mask, labels)`` batches.
    """
    logger.info("Get train data loader")
    dataset = pd.read_csv(os.path.join(training_dir, "train.csv"))
    sentences = dataset.sentence.values
    labels = dataset.label.values

    # Tokenize each sentence with BERT special tokens, then truncate to
    # MAX_LEN. The original code only padded, so any sentence longer than
    # MAX_LEN produced a ragged list and torch.tensor() below would raise.
    input_ids = [
        tokenizer.encode(sent, add_special_tokens=True)[:MAX_LEN]
        for sent in sentences
    ]

    # Pad shorter sequences with 0 up to MAX_LEN so all rows are equal width.
    input_ids = [ids + [0] * (MAX_LEN - len(ids)) for ids in input_ids]

    # Attention mask: 1 for real tokens, 0 for padding.
    attention_masks = [
        [int(token_id > 0) for token_id in ids] for ids in input_ids
    ]

    # Convert to PyTorch tensors and bundle into a dataset.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(labels)
    train_masks = torch.tensor(attention_masks)
    train_data = TensorDataset(train_inputs, train_masks, train_labels)

    if is_distributed:
        # BUG FIX: sample from the TensorDataset, not the pandas DataFrame.
        # The original passed `dataset` (the raw DataFrame), so the sampler's
        # length/indices did not refer to the tensors actually being loaded.
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    else:
        train_sampler = RandomSampler(train_data)

    return DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)