In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemini: An Overview of Multimodal Use Cases

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fintro_multimodal_use_cases.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/intro_multimodal_use_cases.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://goo.gle/3DUssjz">
      <img width="32px" src="https://cdn.qwiklabs.com/assets/gcp_cloud-e3a77215f0b8bfa9b3f611c0d2208c7e8708ed31.svg" alt="Google Cloud logo"><br> Open in  Cloud Skills Boost
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            


| Authors |
| --- |
| [Katie Nguyen](https://github.com/katiemn) |
| [Saeed Aghabozorgi](https://github.com/saeedaghabozorgi) |

## Overview

**YouTube Video: Multimodal AI in action**

<a href="https://www.youtube.com/watch?v=pEmCgIGpIoo&list=PLIivdWyY5sqJio2yeg1dlfILOUO2FoFRx" target="_blank">
  <img src="https://img.youtube.com/vi/pEmCgIGpIoo/maxresdefault.jpg" alt="Multimodal AI in action" width="500">
</a>

In this notebook, you will explore a variety of different use cases enabled by multimodality with Gemini.

Gemini is a family of generative AI models developed by [Google DeepMind](https://deepmind.google/) that is designed for multimodal use cases. [Gemini 2.0](https://cloud.google.com/vertex-ai/generative-ai/docs/gemini-v2) is the latest model version.

### Gemini 2.0 Flash

This smaller Gemini model is optimized for high-frequency tasks to prioritize the model's response time. This model has superior speed and efficiency with a context window of up to 1 million tokens for all modalities.

For more information, see the [Generative AI on Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) documentation.

### Objectives

This notebook demonstrates a variety of multimodal use cases with Gemini.

In this tutorial, you will learn how to use Gemini with the Gen AI SDK for Python to:

  - Process and generate text
  - Parse and summarize PDF documents
  - Reason across multiple images
  - Generate a video description
  - Combine video data with external knowledge
  - Understand audio
  - Analyze a code base
  - Combine modalities
  - Recommend products based on user preferences for e-commerce
  - Understand charts and diagrams
  - Compare images for similarities, anomalies, or differences

### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Getting Started


### Install Google Gen AI SDK for Python

In [None]:
# Install (or upgrade) the Google Gen AI SDK and gitingest; gitingest is imported
# further down to ingest a Git repository.
%pip install --upgrade --quiet google-genai gitingest

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).


In [None]:
import sys

# Additional authentication is required only on Google Colab. Colab is detected
# by checking whether the `google.colab` module is already loaded in this kernel.
if "google.colab" in sys.modules:
    # Imported inside the branch so the import is only attempted on Colab.
    from google.colab import auth

    # Authenticate the user to Google Cloud.
    auth.authenticate_user()

### Set Google Cloud project information and create client

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
import os

from google import genai

# Google Cloud project used for Vertex AI requests. Leave the placeholder
# untouched to fall back to the GOOGLE_CLOUD_PROJECT environment variable.
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Read the variable without str(): str(None) would turn a missing variable
    # into the literal string "None", which would then be passed to the client
    # as if it were a real project ID.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

# Fail fast with an actionable message instead of creating a misconfigured client.
if not PROJECT_ID:
    raise ValueError(
        "No Google Cloud project configured: set PROJECT_ID above or define "
        "the GOOGLE_CLOUD_PROJECT environment variable."
    )

# Region for Vertex AI requests; defaults to us-central1 when
# GOOGLE_CLOUD_REGION is not set.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Gen AI SDK client configured to use Vertex AI with the project and region above.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

### Import libraries


In [None]:
from IPython.display import Audio, Image, Markdown, Video, display
from gitingest import ingest
from google.genai.types import CreateCachedContentConfig, GenerateContentConfig, Part
import nest_asyncio

# Patch asyncio to allow nested event loops. Presumably needed so gitingest's
# `ingest` can run inside the notebook's already-running loop — TODO confirm.
nest_asyncio.apply()

### Load Gemini 2.0 Flash model

Learn more about all [Gemini models on Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-models).

In [None]:
# Model passed as `model=MODEL_ID` to the generate_content calls in this notebook.
# The "-001" suffix is a version postfix.
MODEL_ID = "gemini-2.0-flash-001"  # @param {type: "string"}

## Individual Modalities

### Textual understanding

Gemini can parse textual questions and retain that context across following prompts.

In [None]:
# Two text parts sent in a single request: a factual question, followed by a
# request that refers back to it ("Considering the weather...").
question = "What is the average weather in Mountain View, CA in the middle of May?"
prompt = """
Considering the weather, please provide some outfit suggestions.

Give examples for the daytime and the evening.
"""

contents = [question, prompt]
response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)

# Render the reply as Markdown in the cell output.
display(Markdown(response.text))

### Document Summarization

You can use Gemini to process PDF documents: it can analyze their content, retain information, and answer queries about the documents.

The PDF document example used here is the Gemini 1.5 paper (https://arxiv.org/pdf/2403.05530.pdf).

![image.png](https://storage.googleapis.com/cloud-samples-data/generative-ai/image/gemini1.5-paper-2403.05530.png)

In [None]:
# Reference the PDF by its Cloud Storage URI and ask a question about it.
# `pdf_file` is reused by the summarization cell that follows.
pdf_file_uri = "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
pdf_file = Part.from_uri(mime_type="application/pdf", file_uri=pdf_file_uri)

prompt = "How many tokens can the model process?"
contents = [pdf_file, prompt]

response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

In [None]:
# Summarize the PDF part created in the previous cell (`pdf_file`).
prompt = """
  You are a professional document summarization specialist.
  Please summarize the given document.
"""

# The document comes first, then the instruction.
contents = [pdf_file, prompt]

response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

### Image understanding across multiple images

One of the capabilities of Gemini is being able to reason across multiple images.

This is an example of using Gemini to reason which glasses would be more suitable for an oval face shape.

In [None]:
image_glasses1_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/glasses1.jpg"
image_glasses2_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/glasses2.jpg"
glasses_urls = [image_glasses1_url, image_glasses2_url]

# Preview both candidates before asking the model to compare them.
for glasses_url in glasses_urls:
    display(Image(glasses_url, width=150))

prompt = """
I have an oval face. Given my face shape, which glasses would be more suitable?

Explain how you reached this decision.
Provide your recommendation based on my face shape, and please give an explanation for each.
"""

# The text prompt goes first, followed by one JPEG part per pair of glasses.
glasses_parts = [
    Part.from_uri(file_uri=glasses_url, mime_type="image/jpeg")
    for glasses_url in glasses_urls
]
contents = [prompt, *glasses_parts]

response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

### Generating a video description

Gemini can also describe a video and extract tags for it:

In [None]:
# Questions about the clip: its content, where to see it, similar places, and tags.
prompt = """
What is shown in this video?
Where should I go to see it?
What are the top 5 places in the world that look like this?
Provide the 10 best tags for this video?
"""

# Show the clip in the notebook, then wrap the same URL as a video part for the model.
video_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/mediterraneansea.mp4"
display(Video(video_url, width=350))

video = Part.from_uri(mime_type="video/mp4", file_uri=video_url)
contents = [prompt, video]

response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

> You can confirm that the location is indeed Antalya, Turkey by visiting the Wikipedia page: https://en.wikipedia.org/wiki/Antalya

You can also use Gemini to retrieve extra information beyond the video contents.

In [None]:
# Questions whose answers go beyond what is visible in the footage itself.
prompt = """
Which train line is this?
Where does it go?
What are the stations/stops?
Which river is being crossed?
"""

# Show the clip in the notebook, then wrap the same URL as a video part for the model.
video_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/ottawatrain3.mp4"
display(Video(video_url, width=350))

video = Part.from_uri(mime_type="video/mp4", file_uri=video_url)
contents = [prompt, video]

# This request sets the sampling temperature to 0.
zero_temperature_config = GenerateContentConfig(temperature=0)
response = client.models.generate_content(
    model=MODEL_ID,
    contents=contents,
    config=zero_temperature_config,
)
display(Markdown(response.text))

> You can confirm that this is indeed the Confederation Line on Wikipedia here: https://en.wikipedia.org/wiki/Confederation_Line

### Audio understanding

Gemini can directly process audio for long-context understanding.

In [None]:
# Sample MP3 hosted on Cloud Storage; the summarization and transcription
# cells below reuse this URL.
audio_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/audio/pixel.mp3"

# Show the clip in the notebook output.
display(Audio(audio_url))

#### Summarization

In [None]:
# Wrap the audio URL from the previous cell as an MP3 part.
audio_file = Part.from_uri(mime_type="audio/mpeg", file_uri=audio_url)

# Ask for a title, a short summary, and a numbered list of chapter titles.
prompt = """
  Please provide a short summary and title for the audio.
  Provide chapter titles, be concise and short, no need to provide chapter summaries.
  Provide each of the chapter titles in a numbered list.
  Do not make up any information that is not part of the audio and do not be verbose.
"""

contents = [audio_file, prompt]

response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

#### Transcription

In [None]:
# Rebuild the audio part here so this cell only needs `audio_url` from above.
audio_file = Part.from_uri(mime_type="audio/mpeg", file_uri=audio_url)

# Ask for a transcript with timecodes and speaker labels.
prompt = """
    Transcribe this interview, in the format of timecode, speaker, caption.
    Use speaker A, speaker B, etc. to identify the speakers.
    Provide each piece of information on a separate bullet point.
"""

contents = [audio_file, prompt]

# Allow up to 8192 output tokens for the transcript.
transcription_config = GenerateContentConfig(max_output_tokens=8192)
response = client.models.generate_content(
    config=transcription_config,
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

### Reason across a codebase

You will use the [Online Boutique repository](https://github.com/GoogleCloudPlatform/microservices-demo) as an example in this notebook. Online Boutique is a cloud-first microservices demo application. The application is a web-based e-commerce app where users can browse items, add them to the cart, and purchase them. This application consists of 11 microservices across multiple languages.

In [None]:
# URL of the GitHub repository that is ingested and analyzed in the cells below.
# The `@param` annotation exposes it as an editable form field in Colab.
repo_url = "https://github.com/GoogleCloudPlatform/microservices-demo"  # @param {type:"string"}

#### Create an index and extract the contents of a codebase

Clone the repository, create an index of its files, and extract the contents of the code and text files.

In [None]:
# Patterns for files to leave out of the ingested text: image formats first,
# then JAR archives, the Git directory, and .gitkeep placeholders.
image_patterns = {
    "*.png",
    "*.jpg",
    "*.jpeg",
    "*.gif",
    "*.svg",
    "*.ico",
    "*.webp",
}
other_patterns = {
    "*.jar",
    ".git/",
    "*.gitkeep",
}
exclude_patterns = image_patterns | other_patterns

# `ingest` returns three values; only the file index and the concatenated file
# contents are used by the following cells.
_, code_index, code_text = ingest(
    repo_url,
    exclude_patterns=exclude_patterns,
)

#### Create a content cache for the codebase

The codebase prompt is going to be quite large with all of the included data.
Gemini supports [Context caching](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-overview), which lets you store frequently used input tokens in a dedicated cache and reference them for subsequent requests, eliminating the need to repeatedly pass the same set of tokens to a model.

**Note**: Context caching is only available for stable models with fixed versions (for example, `gemini-2.0-flash-001`). You must include the version postfix (for example, the `-001`).

In [None]:
# Pack the file index and the concatenated file contents into a single prompt;
# this text is what gets stored in the cache.
prompt = f"""
Context:
- The entire codebase is provided below.
- Here is an index of all of the files in the codebase:
    \n\n{code_index}\n\n.
- Then each of the files is concatenated together. You will find all of the code you need:
    \n\n{code_text}\n\n
"""

# Cache entry settings: the prompt above with a time-to-live of 3600 seconds.
cache_config = CreateCachedContentConfig(contents=prompt, ttl="3600s")

# The model ID is written out with its version postfix, as the note above requires.
cached_content = client.caches.create(
    config=cache_config,
    model="gemini-2.0-flash-001",
)

#### Create a developer getting started guide

In [None]:
question = """
  Provide a getting started guide to onboard new developers to the codebase.
"""

# Point the request at the cached codebase instead of resending it.
cache_backed_config = GenerateContentConfig(cached_content=cached_content.name)

response = client.models.generate_content(
    config=cache_backed_config,
    contents=question,
    model="gemini-2.0-flash-001",
)
display(Markdown(response.text))

#### Finding bugs in the code

In [None]:
question = """
    Find the top 3 most severe issues in the codebase.
"""

# Point the request at the cached codebase instead of resending it.
cache_backed_config = GenerateContentConfig(cached_content=cached_content.name)

response = client.models.generate_content(
    config=cache_backed_config,
    contents=question,
    model="gemini-2.0-flash-001",
)
display(Markdown(response.text))

#### Summarizing the codebase

In [None]:
question = """
  Give me a summary of this codebase, and tell me the top 3 things that I can learn from it.
"""

# Point the request at the cached codebase instead of resending it.
cache_backed_config = GenerateContentConfig(cached_content=cached_content.name)

response = client.models.generate_content(
    config=cache_backed_config,
    contents=question,
    model="gemini-2.0-flash-001",
)
display(Markdown(response.text))

## Combining multiple modalities

### Video and audio understanding

Try out Gemini's native multimodal and long-context capabilities on video interleaved with audio inputs.

In [None]:
# Sample video hosted on Cloud Storage; the next cell sends this URL to the model.
video_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/video/pixel8.mp4"

# Show the clip in the notebook output.
display(Video(video_url, width=350))

In [None]:
# Wrap the video URL from the previous cell as an MP4 part.
video = Part.from_uri(mime_type="video/mp4", file_uri=video_url)

# Ask for a description that covers both the visuals and the spoken dialogue.
prompt = """
  Provide a detailed description of the video.
  The description should also contain any important dialogue from the video and key features of the phone.
"""

contents = [prompt, video]

response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

### All modalities (images, video, audio, text) at once

Gemini is natively multimodal and supports interleaving of data from different modalities. It can support a mix of audio, visual, text, and code inputs in the same input sequence.

In [None]:
# `video_url` keeps the gs:// form, which a later cell passes to the model.
# The notebook player is given the HTTPS address of the same object.
video_url = "gs://cloud-samples-data/generative-ai/video/behind_the_scenes_pixel.mp4"
video_https_url = video_url.replace("gs://", "https://storage.googleapis.com/")
display(Video(video_https_url, width=350))

In [None]:
# Still image that a later cell asks the model to locate within the video.
image_url = (
    "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/a-man-and-a-dog.png"
)

# Preview the image in the notebook output.
display(Image(image_url, width=350))

In [None]:
# Media parts built from the URLs defined in the two previous cells.
video_part = Part.from_uri(mime_type="video/mp4", file_uri=video_url)
image_part = Part.from_uri(mime_type="image/png", file_uri=image_url)

# Ask the model to find the moment in the video that matches the still image.
prompt = """
  Look through each frame in the video carefully and answer the questions.
  Only base your answers strictly on what information is available in the video attached.
  Do not make up any information that is not part of the video and do not be too
  verbose, be straightforward.

  Questions:
  - When is the moment in the image happening in the video? Provide a timestamp.
  - What is the context of the moment and what does the narrator say about it?
"""

# Order sent to the model: instructions, then the video, then the image.
contents = [prompt, video_part, image_part]

response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

## Use Case: retail / e-commerce

Suppose a customer shows you their living room and wants to find appropriate furniture and choose between four wall art options for the room.

How can you use Gemini to help the customer choose the best option?

### Generating open recommendations

Using an image of the customer's living room, you can ask the model to recommend a piece of furniture that would make sense in the space.

Note that the model can choose any furniture in this case, and can do so only from its built-in knowledge.

In [None]:
# Customer's living-room photo. `room_image_url` and `room_image` are reused by
# the recommendation cells that follow.
room_image_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/living-room.png"
display(Image(room_image_url, width=350))

# Pass both arguments by keyword: a positional argument may not follow a
# keyword argument in a Python call.
room_image = Part.from_uri(file_uri=room_image_url, mime_type="image/png")

# Start with a plain description of the room.
prompt = "Describe this room"
contents = [prompt, room_image]

response = client.models.generate_content(model=MODEL_ID, contents=contents)
display(Markdown(response.text))

In [None]:
# The room photo (from the previous cell) sits between the request and the
# follow-up instruction.
prompt1 = "Recommend a new piece of furniture for this room"
prompt2 = "Explain the reason in detail"
contents = [
    prompt1,
    room_image,
    prompt2,
]

response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

### Generating recommendations based on provided images

Instead of keeping the recommendation open, you can also provide a list of items for the model to choose from. Here, you will load a few art images that the Gemini model can recommend. This is particularly useful for retail companies who want to provide product recommendations to users based on their current setup.

In [None]:
# Four wall-art candidates the model will rank for the customer's room.
art_image_urls = [
    "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/room-art-1.png",
    "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/room-art-2.png",
    "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/room-art-3.png",
    "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/room-art-4.png",
]

# Markdown tables that show the room photo and the four candidates in the notebook.
md_content = f"""
|Customer photo |
|:-----:|
| <img src="{room_image_url}" width="50%"> |

|Art 1| Art 2 | Art 3 | Art 4 |
|:-----:|:----:|:-----:|:----:|
| <img src="{art_image_urls[0]}" width="60%">|<img src="{art_image_urls[1]}" width="100%">|<img src="{art_image_urls[2]}" width="60%">|<img src="{art_image_urls[3]}" width="60%">|
"""

display(Markdown(md_content))

# Load wall art images as Part objects, in the same order as the URLs.
art_images = []
for art_url in art_image_urls:
    art_images.append(Part.from_uri(file_uri=art_url, mime_type="image/png"))

# To recommend an item from a selection, each item is labeled with its number in the prompt.
# The labels give the model a way to reference each image when answering the question.
# Labeling images within the prompt also helps reduce hallucinations and produce better results.
prompt = """
  You are an interior designer.
  For each piece of wall art, explain whether it would be appropriate for the style of the room.
  Rank each piece according to how well it would be compatible in the room.
"""

# Interleave each text label with its image, then add the room photo and the instructions.
art_labels = ("art 1:", "art 2:", "art 3:", "art 4:")
contents = ["Consider the following art pieces:"]
for art_label, art_image in zip(art_labels, art_images):
    contents.extend((art_label, art_image))
contents.extend(("room:", room_image, prompt))

response = client.models.generate_content(
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

## Use Case: Entity relationships in technical diagrams

Gemini has multimodal capabilities that enable it to understand diagrams and take actionable steps, such as optimization or code generation. This example demonstrates how Gemini can decipher an entity relationship (ER) diagram, understand the relationships between tables, identify requirements for optimization in a specific environment like BigQuery, and even generate corresponding code.

In [None]:
# Use a more deterministic configuration with a low temperature.
config = GenerateContentConfig(
    max_output_tokens=8192,
    candidate_count=1,
    top_k=40,
    top_p=0.8,
    temperature=0.1,
)

# Preview the ER diagram, then send it to the model along with the instruction.
image_er_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/er.png"
display(Image(image_er_url, width=350))

prompt = "Document the entities and relationships in this ER diagram."
er_diagram = Part.from_uri(mime_type="image/png", file_uri=image_er_url)
contents = [prompt, er_diagram]

response = client.models.generate_content(
    config=config,
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))

## Use Case: Similarity/Differences

Gemini can compare images and identify similarities or differences between objects.

The following example shows two scenes from [Marienplatz in Munich, Germany](https://en.wikipedia.org/wiki/Marienplatz) that are slightly different.

In [None]:
image_landmark1_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/landmark1.jpg"
image_landmark2_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/landmark2.jpg"

# Show the two scenes side by side in the notebook.
md_content = f"""
| Image 1 | Image 2 |
|:-----:|:----:|
| <img src="{image_landmark1_url}" width="350"> | <img src="{image_landmark2_url}" width="350"> |
"""

display(Markdown(md_content))

# Text segments that label each image and then pose the questions.
prompt1 = """
Consider the following two images:
Image 1:
"""
prompt2 = """
Image 2:
"""
prompt3 = """
1. What is shown in Image 1? Where is it?
2. What is similar between the two images?
3. What is difference between Image 1 and Image 2 in terms of the contents or people shown?
"""

# One JPEG part per scene, in the same order as the labels above.
landmark_parts = [
    Part.from_uri(file_uri=landmark_url, mime_type="image/jpeg")
    for landmark_url in (image_landmark1_url, image_landmark2_url)
]
contents = [prompt1, landmark_parts[0], prompt2, landmark_parts[1], prompt3]

# Generation settings for the comparison: zero temperature and a 2048-token output limit.
config = GenerateContentConfig(
    max_output_tokens=2048,
    candidate_count=1,
    top_k=40,
    top_p=0.8,
    temperature=0.0,
)

response = client.models.generate_content(
    config=config,
    contents=contents,
    model=MODEL_ID,
)
display(Markdown(response.text))