in lmgvp/datasets.py [0:0]
def _preprocess(self):
    """Tokenize the stored sequences and cache the resulting encodings.

    Every entry of ``self.sequences`` is replaced by ``prep_seq(entry)``.
    The updated list is then handed to ``self.tokenizer`` with
    ``return_tensors="pt"`` and ``padding=True``, and only the
    ``input_ids`` and ``attention_mask`` entries of the tokenizer output
    are stored on ``self.encodings``.

    Args:
        None (operates on ``self.sequences`` and ``self.tokenizer``).
    Return:
        None
    """
    print("Preprocessing seqeuence data...")

    # Overwrite the stored sequences with their preprocessed form before
    # tokenizing, so the tokenizer sees the prepared inputs.
    self.sequences = list(map(prep_seq, self.sequences))

    tokenized = self.tokenizer(
        self.sequences, return_tensors="pt", padding=True
    )

    # Keep only the fields of interest, in the order the tokenizer
    # output yields them.
    wanted = ("input_ids", "attention_mask")
    kept = {}
    for name, value in tokenized.items():
        if name in wanted:
            kept[name] = value
    self.encodings = kept