In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

### Install Vertex AI SDK and other required packages

In [87]:
# Install or upgrade the Vertex AI SDK for the current user, with pip output suppressed.
%pip install --upgrade --user --quiet google-cloud-aiplatform

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [88]:
import IPython

# Fetch the running IPython application and ask its kernel to shut down with
# restart=True, so the freshly installed packages can be imported afterwards.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [1]:
import sys

# The presence of "google.colab" among the loaded modules is used here as the
# signal that the notebook runs on Colab; otherwise this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Authenticate the Colab session.
    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [2]:
import vertexai

# Project and region handed to vertexai.init below; edit these for your own
# Google Cloud project.
PROJECT_ID = "dlai-test"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Lesson 5: Developing Use Cases with Videos

In this lesson, you'll explore Gemini's multimodal capabilities by passing videos and text as input.

- Import the [Vertex AI](https://cloud.google.com/vertex-ai?hl=en) SDK.

In [3]:
import vertexai

In [4]:
# Initialise the SDK with the project and region set earlier in the notebook.
vertexai.init(project=PROJECT_ID, location=LOCATION)

**Note:** In the latest version, `from vertexai.preview.generative_models` has been changed to `from vertexai.generative_models`.

`from vertexai.preview.generative_models` can still be used.

In [5]:
from vertexai.generative_models import GenerativeModel

- Load the `gemini-1.5-flash-001` model (the cell below uses it in place of `gemini-pro-vision`).
- For reference, when specifying `gemini-pro-vision`, the [gemini-1.0-pro-vision](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemini-pro-vision) model is used.

In [6]:
# Model instance reused by the generate_content calls in the following sections.
multimodal_model = GenerativeModel(model_name="gemini-1.5-flash-001")

## Digital Marketer

In [7]:
# Object path inside Cloud Storage; the gs:// form is sent to the model and the
# https:// form is used to preview the video in the notebook.
file_path_1 = "github-repo/img/gemini/multimodality_usecases_overview/vertex-ai-langchain.mp4"
video_uri_1 = "gs://" + file_path_1
video_url_1 = "https://storage.googleapis.com/" + file_path_1

In [8]:
import IPython

In [9]:
# Last expression of the cell: shows a player for the video URL as the cell output.
IPython.display.Video(video_url_1, width=450)

In [10]:
from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    Part,
)

In [12]:
# Wrap the Cloud Storage URI as a prompt part, declaring it as MP4 video.
video_1 = Part.from_uri(uri=video_uri_1, mime_type="video/mp4")

- Structure your prompt(s).
- Be specific with what you want the model to do for you.
- You can even specify the output format of the response from the model.
- In this case, you are asking for the response to be in JSON format.

In [13]:
# Persona text that is passed to the model alongside the video.
role = (
    "\n"
    "You are a great digital marketer working on a new video.\n"
)

In [14]:
# Task list for the model. The last bullet is split over two source lines with
# a backslash continuation; the space before the backslash is required,
# otherwise the prompt would read "includes:Title".
tasks = """
You will add the video to your website and to do this you
need to complete some tasks. Please make sure your answer
is structured.

Tasks:
- What is the title of the video?
- Write a summary of what is in the video.
- Generate metadata for the video in JSON that includes: \
Title, short description, language, and company.
"""

# Alternative without the JSON requirement in the task text (the JSON
# instruction can then be supplied separately via `format_json`):
# tasks = """
# You will add the video to your website and to do this you
# need to complete some tasks. Please make sure your answer
# is structured.

# Tasks:
# - What is the title of the video?
# - Write a summary of what is in the video.
# - Generate metadata for the video that includes: \
# Title, short description, language, and company.
# """

- You can choose the number of variables you want for your prompt.
- More variables means you have more flexibility in making specific changes to your prompts while keeping everything else the same.

In [None]:
# format_json = "Please output the metadata in JSON"

In [15]:
# Prompt parts for the first request: the video, then the role and task texts.
contents_1 = [video_1, role, tasks]

# Variant that adds the JSON formatting instruction as its own prompt part:
# contents_1 = [video_1, role, tasks, format_json]

- Feel free to change the `temperature`

In [16]:
# Generation settings for the first request; only the temperature is set.
generation_config_1 = GenerationConfig(temperature=0.1)

In [24]:
# Non-streaming request: the result is read later through `responses.text`.
responses = multimodal_model.generate_content(
    contents_1, generation_config=generation_config_1, stream=False
)

**Note**: If you set `stream=True`, you'll print your responses as:
```Python
for response in responses:
    print(response.text, end="")
```

**Note**: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.

In [25]:
# The request used stream=False, so the full answer is read from `responses.text`.
print(responses.text, end="")

Here are the tasks you requested:

- **Title of the video:** Build AI-powered apps on Vertex AI with LangChain
- **Summary of the video:** This video is about how to use Vertex AI and LangChain to build AI-powered applications. The video starts by explaining the challenges of using large language models (LLMs) and how LangChain can help to overcome these challenges. The video then goes on to show how to use LangChain to build a basic application that summarizes large documents. Finally, the video discusses some of the use cases for Vertex AI and LangChain.
- **Metadata for the video in JSON:**
```json
{
  "Title": "Build AI-powered apps on Vertex AI with LangChain",
  "short description": "Learn how to use Vertex AI and LangChain to build AI-powered applications. This video covers the challenges of using LLMs, how LangChain can help, and how to build a basic application that summarizes large documents.",
  "language": "English",
  "company": "Google Cloud"
}
``` 


## Explaining the Educational Concepts

In [32]:
# Same layout as the first example: one object path, exposed both as a gs://
# URI (for the model) and as an https:// URL (for the preview).
file_path_2 = "github-repo/img/gemini/multimodality_usecases_overview/descending-into-ml.mp4"
video_uri_2 = "gs://" + file_path_2
video_url_2 = "https://storage.googleapis.com/" + file_path_2

# Last expression of the cell, so the player is shown as the cell output.
IPython.display.Video(video_url_2, width=450)

In [33]:
# Wrap the Cloud Storage URI as a prompt part, declaring it as MP4 video.
video_2 = Part.from_uri(uri=video_uri_2, mime_type="video/mp4")

- You can even ask the model to answer based on answers of previous questions.
- And to generate programming code based on previous answers.

In [34]:
# Three chained questions: the second builds on the first answer, and the third
# asks for a code example.
prompt = (
    "\n"
    "Please have a look at the video and answer the following\n"
    "questions.\n"
    "\n"
    "Questions:\n"
    "- Question 1: Which concept is explained in the video?\n"
    "- Question 2: Based on your answer to Question 1,\n"
    "can you explain the basic math of this concept?\n"
    "- Question 3: Can you provide a simple scikit code example\n"
    "explaining the concept?\n"
)

In [35]:
# Prompt parts for the second request: the video followed by the questions.
contents_2 = [video_2, prompt]

In [36]:
# Non-streaming request with default generation settings.
responses = multimodal_model.generate_content(contents_2, stream=False)

**Note**: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.

In [37]:
# The request used stream=False, so the full answer is read from `responses.text`.
print(responses.text)

Of course! I can help with that. 

Here are the answers to your questions based on the video: 

**Question 1: Which concept is explained in the video?**
The concept explained in the video is **Linear Regression** with a focus on how to minimize loss in linear regression models. 

**Question 2: Based on your answer to Question 1, can you explain the basic math of this concept?**

Linear regression is a statistical method that aims to establish a linear relationship between a dependent variable (y) and one or more independent variables (x).  The goal is to find the best-fitting line through the data points that minimizes the difference between the predicted and actual values.  

The fundamental equation for a linear regression model is:

* **y = wX + b**

Where:

* **y** is the predicted value of the dependent variable.
* **w** is the weight vector (slope of the line) which determines the relationship between the independent variable (X) and the dependent variable (y).
* **X** is the ind

- You can copy/paste and run your generated code in the cell below.

**Note:** LLMs are known to generate code that is incomplete or has bugs.

In [None]:
### you can copy/paste your generated code here:



- The cell below contains the code that was generated in the lecture video.

In [None]:
# # Import the necessary libraries
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.linear_model import LinearRegression

# # Create some data
# X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# y = np.dot(X, np.array([1, 2])) + 3

# # Fit the linear regression model
# model = LinearRegression()
# model.fit(X, y)

# # Make predictions
# y_pred = model.predict(X)

# # Plot the data and the fitted line
# plt.scatter(X[:, 1], y)
# plt.plot(X[:, 1], y_pred, color='red')
# plt.show()

## Extracting Information

In [38]:
# One object path, exposed both as a gs:// URI (for the model) and as an
# https:// URL (for the preview).
file_path_4 = "github-repo/img/gemini/multimodality_usecases_overview/google-search.mp4"
video_uri_4 = "gs://" + file_path_4
video_url_4 = "https://storage.googleapis.com/" + file_path_4

# Last expression of the cell, so the player is shown as the cell output.
IPython.display.Video(video_url_4, width=450)

In [39]:
# Wrap the Cloud Storage URI as a prompt part, declaring it as MP4 video.
video_4 = Part.from_uri(uri=video_uri_4, mime_type="video/mp4")

**Note:** In the lecture video, everything was put in a single prompt (`prompt_4`):

```Python
prompt_4 = """
Answer the following questions using the video only.
Present the results in a table with a row for each question
and its answer.
Make sure the table is in markdown format.

Questions:
- What is the most searched sport?
- Who is the most searched scientist?

"""

contents_4 = [video_4, prompt_4]
```
But as also mentioned in the lecture, you can break it into separate variables (`questions` and `format_html`), as done in the notebook below. Feel free to pause the video and compare your notebook with the video to see the differences.

- Here, you have your questions.

In [40]:
# Questions the model must answer from the video alone.
questions = (
    "\n"
    "Answer the following questions using the video only.\n"
    "\n"
    "Questions:\n"
    "- What is the most searched sport?\n"
    "- Who is the most searched scientist?\n"
)

# Variant that tells the model what to say when the video has no answer:
# questions = """
# Answer the following questions using the video only.
# If the answer is not found in the video,
# say "Not found in video".

# Questions:
# - What is the most searched sport?
# - Who is the most searched scientist?
# """

- Here, you specify the output format.
- In this case, it is table format.

In [41]:
# Output-format instruction. Despite the variable name, the text asks for a
# Markdown table, not HTML.
format_html = (
    "\n"
    "Format:\n"
    "Present the results in a table with a row for each question\n"
    "and its answer.\n"
    "Make sure the table is in markdown format.\n"
)

In [42]:
# The questions and the output-format instruction are passed as separate prompt parts.
contents_4 = [video_4, questions, format_html]

- Set the `temperature`. For now, it is `temperature=0.9`

In [43]:
# Rebinds generation_config_1 (temperature=0.1 in the first example) to a
# configuration with temperature=0.9.
generation_config_1 = GenerationConfig(temperature=0.9)

In [44]:
# Streaming request: `responses` is consumed chunk by chunk in the next cell.
responses = multimodal_model.generate_content(
    contents_4,
    generation_config=generation_config_1,
    stream=True,
)

**Note**: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.

In [45]:
# The request was made with stream=True, so iterate over the chunks and print
# each chunk's text without inserting newlines between them.
for response in responses:
    print(response.text, end="")

| Question | Answer |
|---|---|
| What is the most searched sport? | Soccer |
| Who is the most searched scientist? | Albert Einstein |

```
You can copy/paste your generation in this Markdown cell (double click here)
```
(Paste here)

## Finding a Needle in a Haystack

In [None]:
from utils import gemini_vision

- Load the [gemini-1.5-pro-001](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemini-pro-preview-0409) model.

In [None]:
# Rebinds multimodal_model to the 1.5 Pro model for the multi-video example.
multimodal_model = GenerativeModel(model_name="gemini-1.5-pro-001")

- Just like with images, you can send more than 1 video to the model.
- The following videos are from the **[LLMOps](https://learn.deeplearning.ai/courses/llmops/lesson/1/introduction)** short course, which you can enroll in on **[DeepLearning.AI's Short Courses Platform](https://learn.deeplearning.ai)**.

In [46]:
# The three LLMOps lesson videos. Note that video_1 and video_2 are rebound
# here and no longer refer to the videos from the earlier sections.
video_1 = Part.from_uri(
    uri="gs://github-repo/img/gemini/multimodality_usecases_overview/sc-gc-c3-LLMOps_L1_v3.mp4",
    mime_type="video/mp4",
)
video_2 = Part.from_uri(
    uri="gs://github-repo/img/gemini/multimodality_usecases_overview/sc-gc-c3-LLMOps_L2_v4.mp4",
    mime_type="video/mp4",
)
video_3 = Part.from_uri(
    uri="gs://github-repo/img/gemini/multimodality_usecases_overview/sc-gc-c3-LLMOps_L3_v4.mp4",
    mime_type="video/mp4",
)

In [None]:
from IPython.display import IFrame

- This displays only one of the three videos.
- To view others, feel free to change the `file_path`

In [None]:
# Preview the same Cloud Storage object that is sent to the model as video_2
# (gs://github-repo/...), instead of a copy in a different bucket. Change the
# file name (L1_v3 / L2_v4 / L3_v4) to preview one of the other lessons.
file_path = "github-repo/img/gemini/multimodality_usecases_overview/sc-gc-c3-LLMOps_L2_v4.mp4"
video_url = f"https://storage.googleapis.com/{file_path}"

In [None]:
# Last expression of the cell: embeds the video URL in an iframe as the cell output.
IFrame(video_url, width=560, height=315)  # Adjust width and height as needed

In [None]:
# Persona for the multi-video example (rebinds the earlier `role`).
role = (
    "\n"
    "You are specialized in analyzing videos and finding "
    "a needle in a haystack.\n"
)

In [None]:
# Context for the model: what the three videos are and what it may rely on.
instruction = (
    "\n"
    "Here are three videos. Each is a lesson from the "
    "LLMOps course from Deep Learning AI.\n"
    "Your answers are only based on the videos.\n"
)

- You are asking the model (question 2) to find something very specific from across these 3 videos.

In [None]:
# Questions for the multi-video example (rebinds the earlier `questions`).
# Lines ending in a backslash are joined with the next line, so each of them
# must end with a space before the backslash; without it the first item would
# read "the video.Limit the summary".
questions = """
Answer the following questions:
1. Create a summary of each video and what is discussed in \
the video. \
Limit the summary to a max of 100 words.
2. In which of the three videos does the instructor run \
and explains this Python code: bq_client.query(). \
Where do you see this code in the video?
"""

In [None]:
# Prompt parts in order: persona, instruction, the three videos, then the questions.
contents_5 = [role, instruction, video_1, video_2, video_3, questions]

# Alternative ordering with the role placed last:
# contents_5 = [
#     instruction,
#     video_1,
#     video_2,
#     video_3,
#     questions,
#     role,
# ]

<span style="color:red; font-weight:bold;">IMPORTANT ⚠️ : PROMPTING THIS NEEDLE IN A HAYSTACK EXAMPLE COSTS ABOUT $4 PER EXECUTION</span>

```Python
responses = multimodal_model.generate_content(
    contents_5,
    stream=True
)
```

**Note**: LLMs do not always produce the same results, especially because they are frequently updated. So the output you see in the video might be different from what you get.

```Python
### this will take some time to run

for response in responses:
    print(response.text, end="")
```