In [None]:
# Import required modules, initialize SageMaker Session and define key parameters for Training and Deployment
# Replace huggingface_token with your token below

import os

import matplotlib.pyplot as plt
import numpy as np
import sagemaker
from datasets import load_dataset
from sagemaker.huggingface import HuggingFace
from sagemaker.huggingface.model import HuggingFaceModel
from sklearn.metrics.pairwise import cosine_similarity

# Resolve the execution role and open the SageMaker session used by the rest of the notebook
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

# Get the default bucket created by SageMaker
default_s3_bucket = sess.default_bucket()

print(f"Default S3 bucket: {default_s3_bucket}")

# Model, dataset and container paths handed to the training job as hyperparameters
model_name = "ai21labs/Jamba-tiny-dev"
cache_dir_ds = "/opt/ml/dataset_cache"
cache_dir_model = "/opt/ml/model_cache"
output_dir = "/opt/ml/model"
log_dir = "/opt/ml/output"
dataset_name = "stsb_multi_mt"

# Jamba1.5 models are gated models in Huggingface
# You need to generate a HuggingFace User Access Token
# and enable model access via your HuggingFace Account
# https://huggingface.co/docs/hub/en/models-gated
#
# Prefer exporting the token as the HF_TOKEN environment variable so the secret
# is never saved inside the notebook. The placeholder is only a fallback; an
# unset or empty HF_TOKEN falls through to it and trips the check below.
huggingface_token = os.environ.get("HF_TOKEN") or "<Replace_with_your_token>"
assert huggingface_token != "<Replace_with_your_token>", "Replace with your HuggingFace Token to gain access to gated Jamba1.5 models"

In [None]:
# Configure the Hugging Face training job for fine-tuning
huggingface_estimator = HuggingFace(
    role=role,
    instance_count=1,
    instance_type="ml.p3.8xlarge",
    volume_size=1000,
    source_dir="./scripts",  # Path to the script directory
    entry_point="train.py",  # train script provided under /scripts folder
    dependencies=["./scripts/requirements.txt"],  # needed to install requirements for JAMBA not included in DLC
    # Transformer support for JAMBA came after 4.39 while the training DLC is
    # still at 4.36, so the version below gets updated via requirements.txt.
    transformers_version="4.36.0",
    pytorch_version="2.1.0",
    py_version="py310",
    # NOTE(review): the token travels as a plain hyperparameter, so it is
    # presumably visible in the training job description - confirm this is acceptable.
    hyperparameters=dict(
        epochs=1,
        train_batch_size=64,
        eval_batch_size=128,
        learning_rate=2e-5,
        model_name=model_name,
        output_dir=output_dir,
        log_dir=log_dir,
        cache_dir_ds=cache_dir_ds,
        cache_dir_model=cache_dir_model,
        huggingface_token=huggingface_token,
        dataset_name=dataset_name,
    ),
    enable_sagemaker_metrics=True,
    container_log_level=10,
)

# Launch the fine-tuning job
huggingface_estimator.fit()

# Remember where the fine-tuned model artifact was written in S3
model_data = huggingface_estimator.model_data

In [4]:
# Define HuggingFace Model based on the fine tuned model data
huggingface_model = HuggingFaceModel(
    model_data=model_data,
    source_dir='./scripts',
    entry_point='inference.py',  # Inference script provided under /scripts folder
    role=role,
    transformers_version="4.37.0",
    pytorch_version="2.1.0",
    py_version='py310',
    dependencies=['./scripts/requirements.txt'],
    # DEBUG logging is requested through the SDK argument, matching the training
    # estimator; the SDK derives SAGEMAKER_CONTAINER_LOG_LEVEL from this value.
    container_log_level=10,
    env={                                           # Define environment variables
        'HF_TASK': 'feature-extraction',            # Set task to feature-extraction
        'SAGEMAKER_PRELOAD_MODELS': 'true',         # Preload model on container startup
        'SAGEMAKER_MODEL_SERVER_TIMEOUT': '180',    # Set a high timeout if needed
        'SAGEMAKER_CONTAINER_LOG_LEVEL': '10'       # Env values must be strings; DEBUG (most verbose)
    }
)

In [None]:
# Deploy the fine-tuned model and keep the returned Predictor for inference calls
predictor = huggingface_model.deploy(
    instance_type="ml.p3.2xlarge",
    initial_instance_count=1,
)

In [6]:
# Build the sample payloads and fetch one embedding response per sentence
# from the fine-tuned model

Sentence0 = {"inputs": "ja esta muito tarde"}
Sentence1 = {"inputs": "I love you"}
Sentence2 = {"inputs": "can you do my homework tomorrow"}
Sentence3 = {"inputs": "I'm in love with you"}

# Responses are stored in the same order as the payloads above
SentenceEmbeddings = [
    predictor.predict(data=payload)
    for payload in (Sentence0, Sentence1, Sentence2, Sentence3)
]

In [None]:
# Helper that scores how similar two sets of embeddings are
def compute_cosine_similarity(embeddings1, embeddings2):
    """Return the cosine similarity between two sets of embeddings.

    Both arguments are forwarded unchanged to ``cosine_similarity`` and its
    result is returned as-is.
    """
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)
    return similarity_matrix


# Plot four pairwise similarity scores as a colour-coded 2x2 grid
def plot_similarity_matrix(
    similarities,
    labels,
    threshold=0.5,
    pair_names=('Sentence 0 vs 1', 'Sentence 2 vs 3', 'Sentence 0 vs 2', 'Sentence 1 vs 3'),
):
    """Show four similarity scores in a 2x2 grid, flagged good or bad.

    The grid is filled row by row: entry ``k`` of ``similarities`` lands in
    cell ``(k // 2, k % 2)``. Every cell is annotated with its own pair name
    instead of relying on row/column tick labels, because the four pairs are
    independent and do not form a row-by-column cross product.

    Args:
        similarities: At least four scores. Each may be a plain number or a
            size-1 array (such as the 1x1 matrix produced for a single pair).
        labels: Expected outcome per score, 1 for similar and 0 for dissimilar.
        threshold: Scores at or above this value count as "similar".
        pair_names: Caption for each of the four cells, in fill order. The
            default matches the pair order used by the caller in this notebook.

    Raises:
        ValueError: If fewer than four scores, labels or names are supplied,
            or if a score holds more than one value.
    """
    if len(similarities) < 4 or len(labels) < 4 or len(pair_names) < 4:
        raise ValueError('plot_similarity_matrix needs four similarities, labels and pair names')

    # Collapse size-1 arrays to Python floats; writing a (1, 1) array into a
    # scalar matrix cell relies on a deprecated NumPy conversion.
    scores = [np.asarray(value).item() for value in similarities[:4]]
    matrix = np.array(scores, dtype=float).reshape(2, 2)

    fig, ax = plt.subplots()

    for index, score in enumerate(scores):
        row, col = divmod(index, 2)
        is_similar = labels[index]  # Expected similarity (1 if similar, 0 if dissimilar)
        is_good = (score >= threshold and is_similar == 1) or (score < threshold and is_similar == 0)

        # Set color: Green for good results, Red for bad results
        color = 'green' if is_good else 'red'
        ax.text(col, row, f'{pair_names[index]}\n{score:.2f}', ha='center', va='center', color='white', fontsize=12, bbox=dict(facecolor=color, alpha=0.7))

    ax.imshow(matrix, cmap='RdYlGn', vmin=0, vmax=1)
    # The pair name inside each cell replaces the axis tick labels
    ax.set_xticks([])
    ax.set_yticks([])

    plt.title('Cosine Similarity Matrix (Green=Good, Red=Bad)')
    plt.show()


# Index pairs (into SentenceEmbeddings) to compare, in the order the 2x2 plot is filled
sentence_pairs = [(0, 1), (2, 3), (0, 2), (1, 3)]

# Expected similarity labels for the test pairs (1 for similar, 0 for dissimilar).
# Only the last pair ("I love you" / "I'm in love with you") says the same thing;
# every other pair combines unrelated sentences.
expected_labels = [0, 0, 0, 1]

# Calculate cosine similarity between embeddings.
# .item() reduces each result to a plain float and fails loudly if a result
# ever holds more than one value.
similarities = [
    np.asarray(
        compute_cosine_similarity(
            SentenceEmbeddings[first].get('embeddings'),
            SentenceEmbeddings[second].get('embeddings'),
        )
    ).item()
    for first, second in sentence_pairs
]

# Plot the similarity matrix
plot_similarity_matrix(similarities, expected_labels)
print('Similarity Scores: ', similarities)

In [None]:
# Delete the model and endpoint when done.
# The model is removed before the endpoint; keep this order
# (NOTE(review): presumably the model lookup relies on the endpoint still existing - confirm against the SDK docs).
predictor.delete_model()
predictor.delete_endpoint()