In [1]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# YouTube Video Analysis with Gemini

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/youtube_video_analysis.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fvideo-analysis%2Fyoutube_video_analysis.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/video-analysis/youtube_video_analysis.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/youtube_video_analysis.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/youtube_video_analysis.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/youtube_video_analysis.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/youtube_video_analysis.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/youtube_video_analysis.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/youtube_video_analysis.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| | |
|-|-|
| Author(s) | [Alok Pattani](https://github.com/alokpattani/) |

## Overview

In this notebook, you'll explore how to do direct analysis of publicly available [YouTube](https://www.youtube.com/) videos with Gemini.

You will complete the following tasks:

- Summarizing a single YouTube video using Gemini 2.0 Flash
- Extracting a specific set of structured outputs from a longer YouTube video using Gemini 2.0 Pro and controlled generation
- Creating insights from analyzing multiple YouTube videos together using asynchronous generation with Gemini

## Get started

### Install Google Gen AI SDK and other required packages


In [None]:
%pip install --upgrade --quiet google-genai itables

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [3]:
import IPython

# Look up the kernel behind this notebook session and ask it to shut down
# with restart enabled, so the newly installed packages become importable.
kernel = IPython.Application.instance().kernel
kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [1]:
import sys

# The google.colab module being loaded is used as the signal that this
# notebook is running on Colab; other runtimes skip authentication here.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and create client

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [26]:
# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Keep the value as None when the variable is unset. Wrapping the lookup in
    # str() would turn a missing variable into the literal project ID "None",
    # which only fails later with a confusing API error.
    # NOTE(review): with project=None the Gen AI SDK is expected to resolve the
    # project from the environment / Application Default Credentials - confirm.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Region used for Vertex AI requests; defaults to us-central1 when not configured
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

from google import genai

# Client used by every Gemini call in this notebook (Vertex AI backend)
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

## Set up libraries, options, and models

### Import libraries

In [3]:
import json

from IPython.display import HTML, Markdown, display
from google.genai.types import GenerateContentConfig, Part
from itables import show
import itables.options as itable_opts
import pandas as pd
from tenacity import retry, stop_after_attempt, wait_random_exponential

### Configure some notebook options

In [4]:
# Interactive-table (itables) settings applied for the whole notebook
interactive_table_settings = {
    # Size limits for the tables rendered by itables
    "maxBytes": 1e9,
    "maxColumns": 50,
    # Empty ordering list: no initial sort is requested
    "order": [],
    # Place the per-column filters in the table header
    "column_filters": "header",
}

for setting_name, setting_value in interactive_table_settings.items():
    setattr(itable_opts, setting_name, setting_value)

### Create a helper function

In [9]:
def display_youtube_video(url: str) -> None:
    """Embed a YouTube video player for ``url`` in the notebook output.

    The video ID is extracted from the common YouTube URL shapes
    (``youtube.com/watch?v=ID`` with or without extra query parameters,
    ``youtu.be/ID``, and ``youtube.com/embed|shorts|live/ID``) and used to build
    the ``/embed/`` URL. Any other URL falls back to the plain
    ``/watch?v=`` -> ``/embed/`` text substitution.

    Args:
        url: Link to a public YouTube video.

    Returns:
        None. The player is rendered through ``display(HTML(...))``.
    """
    # Function-scope imports keep this helper self-contained within its cell
    from html import escape
    from urllib.parse import parse_qs, urlparse

    parsed_url = urlparse(url)
    host = (parsed_url.hostname or "").lower()
    path_segments = [segment for segment in parsed_url.path.split("/") if segment]

    video_id = None
    if host == "youtu.be":
        # Short links carry the ID as the first path segment
        video_id = path_segments[0] if path_segments else None
    elif host == "youtube.com" or host.endswith(".youtube.com"):
        if path_segments == ["watch"]:
            # Read only the "v" parameter so extras such as "&t=30s" are dropped
            video_id = parse_qs(parsed_url.query).get("v", [None])[0]
        elif len(path_segments) >= 2 and path_segments[0] in (
            "embed",
            "shorts",
            "live",
        ):
            video_id = path_segments[1]

    if video_id:
        youtube_video_embed_url = f"https://www.youtube.com/embed/{video_id}"
    else:
        # Unrecognized URL shape: keep the simple text substitution
        youtube_video_embed_url = url.replace("/watch?v=", "/embed/")

    # Escape the URL because it is placed inside an HTML attribute
    embed_src = escape(youtube_video_embed_url, quote=True)

    # Create HTML code to directly embed video
    youtube_video_embed_html_code = f"""
    <iframe width="560" height="315" src="{embed_src}"
    title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; 
    clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen>
    </iframe>
    """

    # Display embedded YouTube video
    display(HTML(youtube_video_embed_html_code))

### Load models

In [5]:
# Set Gemini Flash and Pro models to be used in this notebook
GEMINI_FLASH_MODEL_ID = "gemini-2.0-flash-001"
# NOTE(review): despite the "PRO" name, this constant currently holds a Flash
# model ID, so every cell that uses it runs on Gemini 2.0 Flash - confirm
# whether a Pro model ID is intended here.
GEMINI_PRO_MODEL_ID = "gemini-2.0-flash"

## Summarize a YouTube video

Provide a link to a public YouTube video that you'd like to summarize. Ensure that the video is less than an hour long (if using Gemini 2.0 Flash, as is shown below; you can try up to a 2-hour video with Gemini 2.0 Pro) to make sure it fits in the context window.

The default content to be summarized is [this 6.5-minute video showing how Major League Baseball (MLB) analyzes data using Google Cloud](https://www.youtube.com/watch?v=O_W_VGUeHVI).

In [None]:
# Public YouTube video to summarize (editable through the form field)
YOUTUBE_VIDEO_URL = "https://www.youtube.com/watch?v=O_W_VGUeHVI"  # @param {type:"string"}

# Show the selected video inline before analyzing it
display_youtube_video(YOUTUBE_VIDEO_URL)

In [None]:
# Ask Gemini for a summary of the selected video
video_summary_prompt = "Give a detailed summary of this video."

# Reference the video by its URL instead of uploading any bytes
summary_video_part = Part.from_uri(
    file_uri=YOUTUBE_VIDEO_URL,
    mime_type="video/webm",
)

# The video part goes first, followed by the text instruction
video_summary_response = client.models.generate_content(
    model=GEMINI_FLASH_MODEL_ID,
    contents=[summary_video_part, video_summary_prompt],
)

# Render the model's answer as Markdown
display(Markdown(video_summary_response.text))

## Extract structured output from a YouTube video

Next, we'll show how to extract structured outputs using [controlled generation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/control-generated-output), in this case from a video that covers multiple topics.

We're going to see how Gemini's industry-leading 2 million token context window can help analyze [the full opening keynote](https://www.youtube.com/watch?v=V6DJYGn2SFk) from the Google Cloud Next '24 conference (held in April 2024) - all 1 hour and 41 minutes of it!

In [None]:
# Link to full Cloud Next '24 Opening Keynote video (uncomment to analyze the full keynote)
# cloud_next_keynote_video_url = "https://www.youtube.com/watch?v=V6DJYGn2SFk"

# 14-min keynote summary video, active by default because it runs faster;
# comment out the line below when switching to the full keynote above
cloud_next_keynote_video_url = "https://www.youtube.com/watch?v=M-CzbTUVykg"

# Show the selected keynote video inline
display_youtube_video(cloud_next_keynote_video_url)

Below is a prompt to extract the biggest product announcements that were made during this keynote. We use the response schema to show that we want valid JSON output in a particular form, including a constraint specifying that the "product status" field should be either GA, Preview, or Coming Soon.

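The following cell may take several minutes to run when the full keynote is selected, as the model set in `GEMINI_PRO_MODEL_ID` must analyze all 101 minutes of the video and audio to produce comprehensive results. The 14-minute summary video selected by default in the cell above finishes much faster.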

In [None]:
# Build the prompt, response schema, and generation config, then run the extraction

# Instruction describing which details to pull out for each announcement
video_extraction_prompt = (
    "Provide a summary of the biggest product announcements "
    "that were made in this Google Cloud Next keynote video including:\n"
    "  - name\n"
    '  - product status: "GA" (Generally Available), "Preview", or "Coming Soon"\n'
    "  - key quote from the presenter about the product, 20 words or fewer per product\n\n"
    "Make sure to look through and listen to the whole video, start to finish, to find "
    "the top product announcements. Only reference information in the video itself in "
    "your response."
)

# Values the "product_status" field is restricted to
product_status_choices = ["GA", "Preview", "Coming Soon"]

# Shape of one announcement object in the JSON output
product_announcement_schema = {
    "type": "OBJECT",
    "properties": {
        "name": {"type": "STRING"},
        "product_status": {
            "type": "STRING",
            "enum": product_status_choices,
        },
        "quote_from_presenter": {"type": "STRING"},
    },
}

# The full response is an array of announcement objects
video_extraction_response_schema = {
    "type": "ARRAY",
    "items": product_announcement_schema,
}

# Request JSON output that conforms to the schema above
video_extraction_json_generation_config = GenerateContentConfig(
    response_mime_type="application/json",
    response_schema=video_extraction_response_schema,
    max_output_tokens=8192,
    temperature=0.0,
)

# Reference the keynote video by its URL
keynote_video_part = Part.from_uri(
    file_uri=cloud_next_keynote_video_url,
    mime_type="video/webm",
)

# Here the text prompt comes first, followed by the video part
video_extraction_response = client.models.generate_content(
    model=GEMINI_PRO_MODEL_ID,
    contents=[video_extraction_prompt, keynote_video_part],
    config=video_extraction_json_generation_config,
)

# Print the raw JSON text returned by the model
print(video_extraction_response.text)

In [None]:
# Turn the structured output into a data frame for display and/or further analysis
keynote_announcements = video_extraction_response.parsed
video_extraction_response_df = pd.DataFrame(keynote_announcements)

# Render the announcements as an interactive table
show(video_extraction_response_df)

## Creating insights from analyzing multiple YouTube videos together

### Google "Year in Search" videos
Now, consider expanding the problem to a more common enterprise use case: extracting information from _multiple_ YouTube videos at once.

This time, we'll use [Google's "Year in Search" videos](https://about.google/intl/ALL_us/stories/year-in-search/), which summarize the questions, people, and moments that captured the world's attention in each year. As of fall 2024, there are 14 of these videos, each 2-4 minutes in length, from [2010](https://www.youtube.com/watch?v=F0QXB5pw2qE) through [2023](https://www.youtube.com/watch?v=3KtWfp0UopM).

We start by reading in a CSV file that has links to all the videos.

In [None]:
# Read in table of Year in Search video links from public CSV file
# NOTE(review): pandas reads gs:// paths through the optional gcsfs package,
# which the install cell at the top of this notebook does not list - confirm it
# is available in the runtime being used.
GOOGLE_YEAR_IN_SEARCH_VIDEO_LINKS_CSV_GCS_URI = (
    "gs://github-repo/video/google_year_in_search_video_links.csv"
)

# Later cells read the "year" and "yt_link" columns from this table
year_in_search_yt_links = pd.read_csv(GOOGLE_YEAR_IN_SEARCH_VIDEO_LINKS_CSV_GCS_URI)

# Display the table of links as the cell output
year_in_search_yt_links

### Set up for analyzing multiple video files

Let's say we are a sports agency who wants to see which athletes or teams appear most often in these videos as a measure of cultural relevance. Instead of watching and manually counting, we can use Gemini's multimodal capabilities and world knowledge to extract each appearance of an athlete or team into a structured output that we can use for further analysis.

The system instructions, prompt, and response schema that will apply to all 14 videos are each created in the cell below.

In [18]:
# Set up pieces (system instruction, prompt, response schema, config) for
# Google Year in Search videos

# Role description for the model; passed to the generation config below so that
# it applies to every request that uses that config
multiple_video_extraction_system_instruction_text = (
    "You are a video analyst that "
    "carefully looks through all frames of provided videos, extracting out the "
    "pieces necessary to respond to user prompts."
)

# Per-video question: which athletes/teams appear, and when
multiple_video_extraction_prompt = (
    "Which sports athletes or teams are mentioned or "
    "shown in this video? Please look through each frame carefully, and respond "
    "with a complete list that includes the athlete or team's name (1 row per "
    "athlete or team), whether they are an athlete or team, the sport they play, "
    "and the timestamp into the video at which they appear (in mm:ss format, "
    "do not give extra precision) for each one."
)

# Each response is an array of objects, one per athlete/team appearance
multiple_video_extraction_response_schema = {
    "type": "ARRAY",
    "items": {
        "type": "OBJECT",
        "properties": {
            "name": {"type": "STRING"},
            "athlete_or_team": {"type": "STRING", "enum": ["athlete", "team"]},
            "sport": {"type": "STRING"},
            "video_timestamp": {"type": "STRING"},
        },
    },
}

# The system instruction was previously defined but never handed to the model;
# include it here so it actually takes effect.
multiple_video_extraction_json_generation_config = GenerateContentConfig(
    system_instruction=multiple_video_extraction_system_instruction_text,
    temperature=0.0,
    max_output_tokens=8192,
    response_mime_type="application/json",
    response_schema=multiple_video_extraction_response_schema,
)

Next, we'll set up to run each of these prompt/video pairs through the Gemini API _asynchronously_. This allows us to send all the requests to Gemini at once, then wait for all the answers to come back - a more efficient process than sending them synchronously (one-by-one). See more details in [this Google Cloud Community Medium blog post](https://medium.com/google-cloud/how-to-prompt-gemini-asynchronously-using-python-on-google-cloud-986ca45d9f1b).


In [27]:
# Function for asynchronous generation


@retry(wait=wait_random_exponential(multiplier=1, max=120), stop=stop_after_attempt(2))
async def async_generate(prompt: str, yt_link: str) -> dict:
    """Send one prompt/video pair to Gemini asynchronously.

    The tenacity decorator above retries the call with a randomized exponential
    wait and stops after 2 attempts.

    Args:
        prompt: Text instruction sent together with the video.
        yt_link: URL of the YouTube video to analyze.

    Returns:
        The model response converted with ``to_json_dict()``.

    Raises:
        Any exception raised by the request is logged and re-raised so the
        retry decorator can decide whether to try again.
    """
    try:
        response = await client.aio.models.generate_content(
            model=GEMINI_PRO_MODEL_ID,
            contents=[prompt, Part.from_uri(file_uri=yt_link, mime_type="video/webm")],
            config=multiple_video_extraction_json_generation_config,
        )

        return response.to_json_dict()
    except Exception as e:
        print("Something failed, retrying")
        print(e)
        # Attempt counting is handled by the decorator. The previous in-handler
        # `retry.stop_after_attempt(...)` lookup does not exist on the tenacity
        # `retry` function and raised AttributeError, hiding the real error.
        raise  # Re-raise the exception for tenacity to handle

### Run asynchronous Gemini calls to do video extraction

In [None]:
# Perform asynchronous calls across all videos, gather responses
import asyncio

start_time = asyncio.get_event_loop().time()

get_responses = [
    async_generate(multiple_video_extraction_prompt, yt_link)
    for yt_link in year_in_search_yt_links["yt_link"]
]

multiple_video_extraction_responses = await asyncio.gather(*get_responses)

end_time = asyncio.get_event_loop().time()

elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time:.2f} seconds")

### Extract and analyze video results across years

Once we have the results from Gemini, we can process them and get a table of every athlete or team appearance across all 14 "Year in Search" videos.

In [None]:
# Attach each video's response (serialized to a JSON string) to a copy of the
# original links table; the full extraction results are shown further below
year_in_search_responses = year_in_search_yt_links.assign(
    gemini_response=list(map(json.dumps, multiple_video_extraction_responses))
)


def extract_result_df_from_gemini_response(year, gemini_response):
    """Build a data frame from one serialized Gemini response.

    Args:
        year: Value written to the "year" column of every row.
        gemini_response: JSON string of a response dict; the text of the first
            part of the first candidate is itself parsed as JSON.

    Returns:
        A DataFrame built from the parsed text, with a "year" column set.
    """
    response_payload = json.loads(gemini_response)

    # Walk down to the text of the first part of the first candidate
    first_candidate = response_payload["candidates"][0]
    first_part_text = first_candidate["content"]["parts"][0]["text"]

    # That text is the structured JSON output produced by the model
    extracted_rows = json.loads(first_part_text)

    return pd.DataFrame(extracted_rows).assign(year=year)


def _extract_row_result_df(row):
    """Build the extraction data frame for one row of the responses table."""
    return extract_result_df_from_gemini_response(row["year"], row["gemini_response"])


# One data frame of extracted appearances per video/year
year_in_search_responses["extract_result_df"] = year_in_search_responses.apply(
    _extract_row_result_df, axis=1
)

# Stack the per-year frames into one table with a fixed column order
per_year_result_dfs = year_in_search_responses["extract_result_df"].tolist()
extraction_columns = ["year", "name", "athlete_or_team", "sport", "video_timestamp"]

all_year_in_search_extractions = pd.concat(per_year_result_dfs, ignore_index=True)[
    extraction_columns
]

show(all_year_in_search_extractions)

Finally, we can count the number of years in which each athlete or team appeared in these videos, and return results for those who appeared more than once.

In [None]:
# Analyze results to show athletes/teams showing up most often in Year in Search videos
multiple_year_in_search_app = (
    all_year_in_search_extractions.assign(
        # Convert 'name' to uppercase to handle e.g. "LeBron" vs "Lebron"
        name=all_year_in_search_extractions["name"].str.upper(),
        # Convert 'athlete_or_team' to lowercase for consistency
        athlete_or_team=all_year_in_search_extractions["athlete_or_team"].str.lower(),
    )
    .groupby(["name", "athlete_or_team"])
    # Named aggregation keeps 'num_years' numeric, unlike building a mixed
    # str/int pd.Series for every group inside groupby.apply
    .agg(
        # Aggregate 'sport' across type and name (handling different cases);
        # missing values are dropped so they cannot break the sort
        sport=(
            "sport",
            lambda sports: ", ".join(
                sorted(sports.dropna().astype(str).str.lower().unique())
            ),
        ),
        # Count # of diff years in which each athlete/team appears in video
        num_years=("year", "nunique"),
    )
    .reset_index()
    # Filter to only those appearing multiple times
    .query("num_years >= 2")
    .sort_values(["num_years", "name"], ascending=[False, True])
    .reset_index(drop=True)
)

# Display results (the heading needs a closing </b> tag to end the bold text)
display(Markdown("<b>Athletes/Teams Appearing in Multiple Year in Search Videos</b>"))
display(multiple_year_in_search_app)