def preprocess_function()

in jamba1.5-retriever/scripts/train.py [0:0]


def preprocess_function(examples, tokenizer, max_lenght):
    """Tokenize a sentence pair and turn similarity scores into +1/-1 labels.

    Args:
        examples: Mapping indexed by 'sentence1', 'sentence2' and
            'similarity_score'. Presumably a batch produced by a dataset
            ``map`` call -- confirm against the caller.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``
            whose result is indexed by 'input_ids' and 'attention_mask'.
        max_lenght: Forwarded to the tokenizer as ``max_length``. The
            misspelled name is kept so existing keyword callers keep working.

    Returns:
        A dict with keys 'input_ids1', 'attention_mask1', 'input_ids2',
        'attention_mask2' and 'labels'. 'labels' holds 1 for every score
        that satisfies ``score >= 2.5`` and -1 otherwise.
    """
    print("insde the dataset preprocess_function")

    # Both sentence columns are encoded independently with identical options.
    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_lenght,
    }
    first_encoding = tokenizer(examples['sentence1'], **tokenizer_options)
    second_encoding = tokenizer(examples['sentence2'], **tokenizer_options)

    # Threshold the similarity score: 2.5 and above is a positive pair (1),
    # everything else is a negative pair (-1).
    labels = []
    for score in examples['similarity_score']:
        if score >= 2.5:
            labels.append(1)
        else:
            labels.append(-1)

    print("before retuning the dictionary for both sentences")

    # Assemble the output: token ids and attention mask per sentence, then
    # the labels.
    features = {}
    features['input_ids1'] = first_encoding['input_ids']
    features['attention_mask1'] = first_encoding['attention_mask']
    features['input_ids2'] = second_encoding['input_ids']
    features['attention_mask2'] = second_encoding['attention_mask']
    features['labels'] = labels
    return features