in jamba1.5-retriever/scripts/train.py [0:0]
def preprocess_function(examples, tokenizer, max_lenght, *, threshold=2.5):
    """Tokenize a batch of sentence pairs and derive binary similarity labels.

    Each sentence column is tokenized in its own call, so the two sides of a
    pair get independent ``input_ids`` / ``attention_mask`` entries.

    Args:
        examples: Mapping providing ``'sentence1'``, ``'sentence2'`` and
            ``'similarity_score'``. The scores are iterated one by one, so
            that entry must be an iterable of values comparable with
            ``threshold``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``
            whose result supports ``['input_ids']`` and ``['attention_mask']``.
            NOTE(review): presumably a Hugging Face tokenizer, in which case
            ``padding=True`` pads only to the longest sequence of *this* call,
            so widths may differ between batches -- confirm against the caller.
        max_lenght: Forwarded to the tokenizer as ``max_length``. The
            misspelled name is kept on purpose so that callers passing it by
            keyword keep working.
        threshold: Scores greater than or equal to this value are labelled
            ``1``; all others are labelled ``-1``. Defaults to ``2.5``, the
            value that was previously hard-coded.

    Returns:
        A dict with the keys ``'input_ids1'``, ``'attention_mask1'``,
        ``'input_ids2'``, ``'attention_mask2'`` and ``'labels'``.
    """
    # Diagnostic trace on stdout; message text is kept unchanged.
    print("insde the dataset preprocess_function")

    # Both columns are tokenized with identical settings.
    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_lenght,
    }
    sentence1_inputs = tokenizer(examples['sentence1'], **tokenizer_options)
    sentence2_inputs = tokenizer(examples['sentence2'], **tokenizer_options)

    # Binary labels: +1 at or above the threshold, -1 below it.
    labels = [
        1 if score >= threshold else -1
        for score in examples['similarity_score']
    ]

    print("before retuning the dictionary for both sentences")
    return {
        'input_ids1': sentence1_inputs['input_ids'],
        'attention_mask1': sentence1_inputs['attention_mask'],
        'input_ids2': sentence2_inputs['input_ids'],
        'attention_mask2': sentence2_inputs['attention_mask'],
        'labels': labels,
    }