# jamba1.5-retriever/scripts/train.py
import argparse
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
def preprocess_function(examples, tokenizer, max_length):
    # Tokenize and format a batch of dataset examples
    print("inside the dataset preprocess_function")
    # Tokenize sentence1 and sentence2 separately; padding is deferred to the
    # data collator, which pads each batch dynamically
    sentence1_inputs = tokenizer(examples['sentence1'], truncation=True, max_length=max_length)
    sentence2_inputs = tokenizer(examples['sentence2'], truncation=True, max_length=max_length)
    # Binary labels based on similarity score
    labels = [1 if score >= 2.5 else -1 for score in examples['similarity_score']]
    print("before returning the dictionary for both sentences")
    # Return a dictionary with input_ids and attention_mask for both sentences, plus labels
    return {
        'input_ids1': sentence1_inputs['input_ids'],
        'attention_mask1': sentence1_inputs['attention_mask'],
        'input_ids2': sentence2_inputs['input_ids'],
        'attention_mask2': sentence2_inputs['attention_mask'],
        'labels': labels
    }
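
# Illustrative sketch (not called during training): STS-B similarity scores
# range from 0 to 5, so the 2.5 threshold in preprocess_function splits pairs
# into dissimilar (-1) and similar (+1) contrastive labels.
def _label_threshold_example():
    scores = [0.8, 2.5, 3.9]
    labels = [1 if score >= 2.5 else -1 for score in scores]
    assert labels == [-1, 1, 1]
    return labels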
class CustomDataCollatorWithPadding(DataCollatorWithPadding):
# Custom data collator that handles both input_ids and labels
def __call__(self, features):
# Separate input_ids and attention_mask for sentence1 and sentence2
sentence1_features = [{'input_ids': f['input_ids1'], 'attention_mask': f['attention_mask1']} for f in features]
sentence2_features = [{'input_ids': f['input_ids2'], 'attention_mask': f['attention_mask2']} for f in features]
# Call the parent method to handle padding of sentence1 and sentence2
batch_sentence1 = super().__call__(sentence1_features)
batch_sentence2 = super().__call__(sentence2_features)
# Combine sentence1 and sentence2 into a single batch dictionary
batch = {
'input_ids1': batch_sentence1['input_ids'],
'attention_mask1': batch_sentence1['attention_mask'],
'input_ids2': batch_sentence2['input_ids'],
'attention_mask2': batch_sentence2['attention_mask']
}
batch['labels'] = torch.stack([f['labels'] for f in features])
return batch
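
# Hedged usage sketch (hypothetical token ids, not executed during training):
# each feature carries two tokenized sentences plus a label, and the collator
# pads the sentence1 and sentence2 sides independently to the longest sequence
# in the batch:
#
#   collator = CustomDataCollatorWithPadding(tokenizer=tokenizer)
#   features = [
#       {'input_ids1': torch.tensor([101, 7592, 102]), 'attention_mask1': torch.tensor([1, 1, 1]),
#        'input_ids2': torch.tensor([101, 102]), 'attention_mask2': torch.tensor([1, 1]),
#        'labels': torch.tensor(1)},
#       {'input_ids1': torch.tensor([101, 102]), 'attention_mask1': torch.tensor([1, 1]),
#        'input_ids2': torch.tensor([101, 2088, 102]), 'attention_mask2': torch.tensor([1, 1, 1]),
#        'labels': torch.tensor(-1)},
#   ]
#   batch = collator(features)
#   # batch['input_ids1'].shape == (2, 3), batch['labels'] == tensor([1, -1])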
class CustomTrainer(Trainer):
# Custom loss function based on cosine similarity
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra arguments (e.g. num_items_in_batch) passed by
        # newer transformers versions
        print("Inside compute_loss")
        print("inputs: ", inputs)
        print("return_outputs: ", return_outputs)
# Extract labels
labels = inputs["labels"]
# Extract input_ids and attention_mask for sentence1 and sentence2
sentence1_input_ids, sentence2_input_ids = inputs['input_ids1'], inputs['input_ids2']
sentence1_attention_mask, sentence2_attention_mask = inputs['attention_mask1'], inputs['attention_mask2']
# Pass sentence1 and sentence2 through the model to get embeddings
outputs1 = model(input_ids=sentence1_input_ids, attention_mask=sentence1_attention_mask)
print("Outputs1: ", outputs1)
outputs2 = model(input_ids=sentence2_input_ids, attention_mask=sentence2_attention_mask)
print("Outputs2: ", outputs2)
# Pool embeddings
print("Before mean_pooling call for Embeddings1")
embeddings1 = mean_pooling(outputs1.last_hidden_state, sentence1_attention_mask)
print("embeddings1: ", embeddings1)
print("Before mean_pooling call for Embeddings2")
embeddings2 = mean_pooling(outputs2.last_hidden_state, sentence2_attention_mask)
print("embeddings2: ", embeddings2)
print("Before constrastive_loss call")
# Calculate contrastive loss
loss = contrastive_loss(embeddings1, embeddings2, labels)
print("After constrastive_loss call")
print("Return for compute_loss: ", (loss, (embeddings1, embeddings2)))
if return_outputs:
print("Return loss and embeddings")
return (loss, (embeddings1, embeddings2))
else:
print("Return only loss")
return loss
    # Custom prediction step used during evaluation. Needed to pool embeddings
    # and calculate loss based on the contrastive objective
    def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
        print("inside prediction_step")
        print(f"inputs: {inputs}, prediction_loss_only: {prediction_loss_only}, ignore_keys: {ignore_keys}")
        # Extract labels
        labels = inputs["labels"]
        print("will call compute_loss to return loss and embeddings")
        # Compute loss and embeddings without tracking gradients
        with torch.no_grad():
            loss, (embeddings1, embeddings2) = self.compute_loss(model, inputs, return_outputs=True)
        print("after compute_loss")
        print(f"loss, embeddings1, embeddings2: {loss}, {embeddings1}, {embeddings2}")
        # Return loss, embeddings, and labels for evaluation. The Trainer
        # expects a (loss, logits, labels) tuple even when only the loss is
        # requested
        if prediction_loss_only:
            print("returning loss only inside prediction_step")
            return (loss, None, None)
        else:
            print("returning loss, embeddings, and labels inside prediction_step")
            return (loss, (embeddings1, embeddings2), labels)
def mean_pooling(token_embeddings, attention_mask):
    # Mean pooling over token embeddings to get sentence-level embeddings,
    # ignoring padded positions via the attention mask
    print("inside mean_pooling")
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
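
# Illustrative sanity check for mean_pooling (toy shapes, not called during
# training): batch of 2 sequences, 2 tokens each, hidden size 2, with the
# second sequence padded at position 1.
def _mean_pooling_example():
    token_embeddings = torch.tensor([[[1.0, 2.0], [3.0, 4.0]],
                                     [[5.0, 6.0], [9.0, 9.0]]])
    attention_mask = torch.tensor([[1, 1], [1, 0]])
    pooled = mean_pooling(token_embeddings, attention_mask)
    # First row averages both tokens -> [2.0, 3.0]; second row ignores the
    # masked token -> [5.0, 6.0]
    assert torch.allclose(pooled, torch.tensor([[2.0, 3.0], [5.0, 6.0]]))
    return pooled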
def contrastive_loss(embeddings1, embeddings2, labels, margin=0.5):
# Compute cosine similarity between embeddings1 (sentence 1) and embeddings2 (sentence 2)
print("Inside Contrastive_loss")
print("Labels:", labels)
print(f"embeddings1: {embeddings1}")
print(f"embeddings2: {embeddings2}")
cosine_sim = F.cosine_similarity(embeddings1, embeddings2)
# Positive pairs (similar): Want similarity to be close to 1
positive_loss = (1 - cosine_sim) * (labels == 1).float()
# Negative pairs (dissimilar): Want similarity to be low, at least less than the margin
negative_loss = (cosine_sim - margin).clamp(min=0) * (labels == -1).float()
# Total loss is the sum of positive and negative losses
loss = positive_loss + negative_loss
# Return the mean loss over the batch
print(f"loss.mean: {loss.mean}")
print("done with Contrastive_loss")
return loss.mean()
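
# Hedged numerical sketch for contrastive_loss (toy embeddings, not called
# during training): identical vectors labeled +1 incur zero loss, while the
# same vectors labeled -1 exceed the 0.5 margin by 0.5.
def _contrastive_loss_example():
    embeddings1 = torch.tensor([[1.0, 0.0], [1.0, 0.0]])
    embeddings2 = torch.tensor([[1.0, 0.0], [1.0, 0.0]])
    labels = torch.tensor([1, -1])
    loss = contrastive_loss(embeddings1, embeddings2, labels)
    # positive pair: 1 - 1.0 = 0.0; negative pair: 1.0 - 0.5 = 0.5
    # mean over the batch = 0.25
    assert torch.isclose(loss, torch.tensor(0.25))
    return loss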
def compute_metrics(eval_pred, compute_result=False):
    # Compute per-batch evaluation metrics. With batch_eval_metrics=True the
    # Trainer calls this once per eval batch (compute_result is True on the
    # final batch), so the reported metrics reflect the last batch
    print("Inside compute_metrics")
    print(f"eval_pred: {eval_pred}")
    print(f"compute_result: {compute_result}")
    (embeddings1, embeddings2), labels = eval_pred
    # Move tensors to CPU; sklearn expects numpy arrays
    embeddings1 = embeddings1.detach().cpu()
    embeddings2 = embeddings2.detach().cpu()
    labels = labels.detach().cpu().numpy()
    print("Calculating Cosine Similarity")
    # Calculate cosine similarity between pairs
    cosine_sim = F.cosine_similarity(embeddings1, embeddings2).numpy()
    print("After Cosine Similarity")
    print(f"cosine_sim: {cosine_sim}")
    print("Calculating Predictions")
    # Convert cosine similarity to binary predictions (1 for similar, -1 for dissimilar)
    predictions = [1 if sim >= 0.5 else -1 for sim in cosine_sim]
    print("After Predictions")
    print(f"predictions: {predictions}")
    # Calculate accuracy, precision, recall, and F1
    print("Calculating Accuracy")
    accuracy = accuracy_score(labels, predictions)
    print("After Accuracy")
    print(f"accuracy: {accuracy}")
    print("Calculating Precision, Recall, F1")
    # With average='binary', support is returned as None, so it is not reported
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    print("After Precision, Recall, F1")
    print(f"precision: {precision}, recall: {recall}, f1: {f1}")
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    print(f"Calculated Metrics: {metrics}")
    return metrics
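
# Hedged sketch for compute_metrics (toy embeddings, not called during
# training): identical vectors have cosine 1.0 (>= 0.5 -> predicted +1) and
# orthogonal vectors have cosine 0.0 (< 0.5 -> predicted -1).
def _compute_metrics_example():
    embeddings1 = torch.tensor([[1.0, 0.0], [1.0, 0.0]])
    embeddings2 = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
    labels = torch.tensor([1, -1])
    metrics = compute_metrics(((embeddings1, embeddings2), labels))
    assert metrics['accuracy'] == 1.0
    return metrics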
def main():
# Command-line arguments for hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=3)
parser.add_argument("--train_batch_size", type=int, default=32)
parser.add_argument("--eval_batch_size", type=int, default=32)
parser.add_argument("--learning_rate", type=float, default=2e-5)
parser.add_argument("--model_name", type=str, default="ai21labs/AI21-Jamba-1.5-Mini")
parser.add_argument("--output_dir", type=str, default="/opt/ml/model")
parser.add_argument("--log_dir", type=str, default="/opt/ml/output")
parser.add_argument("--cache_dir_ds", type=str, default="/opt/ml/dataset_cache")
parser.add_argument("--cache_dir_model", type=str, default="/opt/ml/model_cache")
parser.add_argument("--huggingface_token", type=str, default="<myToken>")
parser.add_argument("--dataset_name", type=str, default="stsb_multi_mt")
args = parser.parse_args()
print("Processing Datasets and building Training Configurations" )
print("load Tokenizer")
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(args.model_name, cache_dir=args.cache_dir_model, token=args.huggingface_token)
print("After tokenizer / Before model load")
print("load Model")
    model = AutoModel.from_pretrained(args.model_name, cache_dir=args.cache_dir_model, token=args.huggingface_token)
print("After model load / Before dataset load")
# Load and preprocess dataset
train_ds, test_ds, dev_ds = load_dataset(args.dataset_name, 'en', split=['train[:20%]','test[:20%]','dev[:20%]'], cache_dir=args.cache_dir_ds)
train_ds_size = train_ds.num_rows
test_ds_size = test_ds.num_rows
dev_ds_size = dev_ds.num_rows
print(f"After dataset load. # of rows: train {train_ds_size}, test {test_ds_size}, dev {dev_ds_size}")
print("Tokenizing and formatting Datasets")
    tokenized_train_ds = train_ds.map(lambda examples: preprocess_function(examples, tokenizer, max_length=400), batched=True)
    tokenized_test_ds = test_ds.map(lambda examples: preprocess_function(examples, tokenizer, max_length=400), batched=True)
    tokenized_dev_ds = dev_ds.map(lambda examples: preprocess_function(examples, tokenizer, max_length=400), batched=True)
print("Here are the first row for each dataset split after processing: Train, Test and Dev")
print(tokenized_train_ds[0])
print(tokenized_test_ds[0])
print(tokenized_dev_ds[0])
# Define Hugging Face Datasets compatible format
tokenized_train_ds.set_format(type='torch', columns=['input_ids1', 'attention_mask1', 'input_ids2', 'attention_mask2', 'labels'])
tokenized_test_ds.set_format(type='torch', columns=['input_ids1', 'attention_mask1', 'input_ids2', 'attention_mask2', 'labels'])
tokenized_dev_ds.set_format(type='torch', columns=['input_ids1', 'attention_mask1', 'input_ids2', 'attention_mask2', 'labels'])
print("First rows for each dataset tensors: ")
print(tokenized_train_ds[0])
print(tokenized_test_ds[0])
print(tokenized_dev_ds[0])
# Initialize the custom data collator
data_collator = CustomDataCollatorWithPadding(tokenizer=tokenizer)
# Define Step related metrics to drive training loop
steps_per_epoch = train_ds_size // args.train_batch_size
num_saves_per_epoch = 2
total_steps = steps_per_epoch * args.epochs
warmup_steps = int(0.1 * total_steps)
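    # Example with hypothetical numbers: 1,000 training rows at batch size 32
    # give steps_per_epoch = 31; over 3 epochs total_steps = 93 and
    # warmup_steps = int(0.1 * 93) = 9, so checkpoints and evaluations land
    # every 31 // 2 = 15 steps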
# Define the training arguments
training_args = TrainingArguments(
# Output and Checkpointing
output_dir=args.output_dir,
save_strategy="steps",
save_steps=steps_per_epoch // num_saves_per_epoch,
save_total_limit=2,
load_best_model_at_end=True,
# Training Control
do_train=True,
do_eval=True,
do_predict=False,
per_device_train_batch_size=args.train_batch_size,
per_device_eval_batch_size=args.eval_batch_size,
num_train_epochs=args.epochs,
max_steps=-1,
fp16=True,
gradient_checkpointing=False,
gradient_accumulation_steps=1,
# Logging and Reporting
logging_dir=args.log_dir,
logging_steps=1,
logging_first_step=True,
report_to="tensorboard",
# Evaluation Control
evaluation_strategy="steps",
eval_steps=steps_per_epoch // num_saves_per_epoch,
eval_accumulation_steps=None,
batch_eval_metrics=True,
# Optimization
learning_rate=args.learning_rate,
warmup_steps=warmup_steps,
lr_scheduler_type="linear",
weight_decay=0.01,
# Model Evaluation
metric_for_best_model='eval_loss',
greater_is_better=False,
# Other
remove_unused_columns=False,
label_smoothing_factor=0.0
)
    # Initialize the Trainer from the custom class. Needed for the
    # compute_loss and prediction_step overrides
trainer = CustomTrainer(
model=model,
args=training_args,
train_dataset=tokenized_train_ds,
eval_dataset=tokenized_dev_ds,
compute_metrics=compute_metrics,
data_collator=data_collator
)
print("Starting Training")
# Train the model
trainer.train()
print("Training is done")
print("Start Final Evaluation")
trainer.evaluate(eval_dataset=tokenized_test_ds)
print("Final Evaluation done")
    # Save the model and tokenizer
print("Saving the Model and Tokenizer")
tokenizer.save_pretrained(args.output_dir)
trainer.save_model(args.output_dir)
print("Model is ready for deployment")
if __name__ == "__main__":
main()