In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generate descriptions from videos

<table align="left">
<td>
<a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/content_generation/description_from_video.ipynb">
<img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
View on GitHub
</a>
</td>
<td>
<a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/generative_ai/content_generation/description_from_video.ipynb">
<img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
</a>
</td>
<td style="text-align: center">
<a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fai-ml-recipes%2Fmain%2Fnotebooks%2Fgenerative_ai%2Fcontent_generation%2Fdescription_from_video.ipynb">
<img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"> <br> Open in Colab Enterprise
</a>
</td>
</table>

## Overview

This notebook shows how to generate descriptions of videos in a GCS bucket.  
It uses the [YouTube UGC dataset](https://media.withyoutube.com/) and [Gemini](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini) to generate a description for each video.

#### **Steps**
Using Spark,
1) It reads the video files of the [YouTube UGC dataset](https://media.withyoutube.com/) from `gs://dataproc-metastore-public-binaries/youtube_ucg/`
2) It calls the Vertex AI Gemini API (`gemini-1.5-pro`) to generate a description for each video.

### Setup

Make sure the service account running this notebook has the required permissions:

- **Run the notebook**
  - AI Platform Notebooks Service Agent
  - Notebooks Admin
  - Vertex AI Administrator
- **Read files from bucket**
  - Storage Object Viewer
- **Run Dataproc jobs**
  - Dataproc Service Agent
  - Dataproc Worker
- **Call Google APIs**
  - Service Usage Consumer

#### Imports

In [None]:
import time

from pyspark.sql.functions import regexp_replace, concat
from pyspark.sql.functions import udf, col, lit

import google.auth
import google.auth.transport.requests
import requests

import pandas as pd
# Do not truncate long cell contents when frames are displayed (the generated descriptions are long strings).
pd.set_option('display.max_colwidth', None)
# Number of rows to show when a displayed frame is truncated.
pd.set_option('display.min_rows', 20)

In [None]:
# When using Dataproc Serverless, installed packages are automatically available on all nodes.
# `--upgrade` installs the latest available release (no version is pinned); `-q` keeps pip's output quiet.
!pip install --upgrade google-cloud-aiplatform -q
# When using a Dataproc cluster, you will need to install these packages during cluster creation: https://cloud.google.com/dataproc/docs/tutorials/python-configuration

#### Authentication

In [None]:
# Load the default Google credentials available in this environment, together with their project id.
credentials, project_id = google.auth.default()
# Refresh the credentials using a `requests`-based transport object.
auth_req = google.auth.transport.requests.Request()
credentials.refresh(auth_req)

#### Setup Spark Session

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Hive-enabled Spark session used by the rest of the notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Video descriptions generation")
    .enableHiveSupport()
    .getOrCreate()
)

#### Read dataset

In [None]:
# GCS location of the dataset's video files.
BINARIES_BUCKET_PATH = "gs://dataproc-metastore-public-binaries/youtube_ucg/"

# Load the files under that location with the `binaryFile` source, descending into sub-folders.
binaries_df = (
    spark.read
    .format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(BINARIES_BUCKET_PATH)
)

In [None]:
# Let's keep only the `path` column and select the paths of 5 YouTube videos.
# NOTE(review): `limit(5)` is applied without an explicit ordering, so which 5 rows come back
# is presumably not guaranteed to be the same on every run — confirm if reproducibility matters.
paths_df = binaries_df.select("path").limit(5)

#### Define UDF and call Gemini API to generate video descriptions

In [None]:
# JSON schema the model's answer must follow: one attribute per question asked in `prompt`.
response_schema = {
    "type": "object",
    "properties": {
        "where": {"type": "string"},
        "how_many_people": {"type": "integer"},
        "task": {"type": "string"},
        "proposition": {"type": "string"},
        "description": {"type": "string"}
    },
    "required": ["where", "how_many_people", "task", "proposition", "description"],
}

# System instructions sent to the model alongside the prompt.
system_instructions = [
    """Format the 5 items as attributes of a JSON object: where, how_many_people, task, proposition and description.""",
    """The response should be a single valid formatted JSON object only."""
]

# Plain string: nothing is interpolated, so no f-prefix is needed.
# Questions 1-5 correspond, in order, to where / how_many_people / task / proposition / description.
prompt = """
    Create a short description for this video with the following questions:
     1) Where is the video recorded?
     2) How many people are shown in the video?
     3) What are the people doing in the video?
     4) What's the proposition for the video, i.e. what is it about?
     5) A summary description from the items 1, 2, 3 and 4
    """

In [None]:
from vertexai.generative_models import GenerativeModel, GenerationConfig, Part, Image, Content, HarmCategory, HarmBlockThreshold

def predict(uri, prompt, system_instructions=system_instructions, response_schema=response_schema, content_type="video/mp4", temperature=1, model_name="gemini-1.5-pro"):
    """Send one media file plus a text prompt to a Gemini model and return the response text.

    Args:
        uri: Location of the media file, passed to `Part.from_uri`.
        prompt: Text instructions sent together with the media.
        system_instructions: System instructions given to the model.
        response_schema: Schema passed to the generation config for the response.
        content_type: MIME type passed with `uri` (defaults to "video/mp4").
        temperature: Sampling temperature for generation.
        model_name: Name of the generative model to instantiate.

    Returns:
        The `text` attribute of the model response; the request asks for the
        "application/json" MIME type, so this is expected to be a JSON string.
    """
    gemini = GenerativeModel(model_name=model_name, system_instruction=system_instructions)

    # A single user turn: the media part first, then the text instructions.
    media_part = Part.from_uri(uri, content_type)
    text_part = Part.from_text(prompt)
    user_turn = Content(role="user", parts=[media_part, text_part])

    # Ask for JSON output that follows `response_schema`.
    json_config = GenerationConfig(
        temperature=temperature,
        response_mime_type="application/json",
        response_schema=response_schema,
    )

    # NOTE(review): only HARM_CATEGORY_UNSPECIFIED is configured here; confirm which
    # harm categories this threshold actually applies to.
    safety = {HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_ONLY_HIGH}

    reply = gemini.generate_content(
        user_turn,
        generation_config=json_config,
        safety_settings=safety,
    )
    return reply.text

In [None]:
def generate_descriptions(gcs_uri):
    """Return the text produced by `predict` for the file at `gcs_uri`, using the notebook-level `prompt`."""
    return predict(gcs_uri, prompt)


# Wrap the function as a Spark UDF so it can be applied to a DataFrame column.
generate_descriptions_udf = udf(generate_descriptions)

In [None]:
# Order the selected paths ascending and add a "data" column holding the UDF output for each path.
df_descriptions = (
    paths_df
    .orderBy(col("path").asc())
    .withColumn("data", generate_descriptions_udf(col("path")))
)

In [None]:
# Mark the DataFrame for caching so that later actions can reuse the generated descriptions;
# presumably this avoids calling the model again for each action — caching is lazy, so verify.
df_descriptions.cache()

In [41]:
# Collect the rows into a pandas DataFrame for display; as a Spark action, this is what runs the UDF.
df_descriptions.toPandas()

                                                                                

Unnamed: 0,path,data
0,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-09f8.mp4,"{""description"": ""The video was recorded on a mobile device. It shows how to navigate to a folder and lists the videos inside it. There are no people visible in the video. "", ""how_many_people"": 0, ""proposition"": ""The video shows how to navigate to a folder on a mobile device."", ""task"": ""The user navigates to a folder in his mobile and shows the videos it contains."", ""where"": ""On a mobile device.""}"
1,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-1dba.mp4,"{""description"": ""The video was recorded inside a room by one person. This person shows how to make a bracelet with colored rubber bands using their fingers."", ""how_many_people"": 1, ""proposition"": ""Video tutorial showing how to make a bracelet with colored rubber bands using their fingers."", ""task"": ""Make a bracelet with colored rubber bands"", ""where"": ""Room""}"
2,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-2fd5.mp4,"{""description"": ""The video shows a person writing something on a piece of paper with a pen. The background suggests the video was recorded indoors."", ""how_many_people"": 1, ""proposition"": ""Someone writing on a piece of paper"", ""task"": ""Writing"", ""where"": ""Indoors""}"
3,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-5da7.mp4,"{""description"": ""A person is using a hot glue gun to create a design on a silicone mat. They are working on a wooden table. The video demonstrates how to use a hot glue gun to make crafts."", ""how_many_people"": 1, ""proposition"": ""This video demonstrates how to use a hot glue gun to make crafts."", ""task"": ""The person is using a hot glue gun to create a design on a silicone mat."", ""where"": ""The video is recorded indoors on a wooden table.""}"
4,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-7fb1.mp4,"{""description"": ""The video was recorded at home by one person showing a manga style drawing of a female character and comparing it to a comic book. The video is about drawing manga characters."", ""how_many_people"": 1, ""proposition"": ""How to Draw Manga Characters"", ""task"": ""Drawing"", ""where"": ""Home""}"


#### Extract features from the generated text

In [42]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType
# Struct the JSON text in the "data" column is parsed into: one field per attribute in the model output.
# NOTE: `how_many_people` is kept as a string column here, as before, even though the model emits an integer.
schema = StructType(
    [
        StructField('where', StringType(), True),
        StructField('how_many_people', StringType(), True),
        StructField('proposition', StringType(), True),
        StructField('description', StringType(), True),
        StructField('task', StringType(), True)
    ]
)

# Matches only a Markdown code fence wrapping the payload: a leading ``` or ```json, and a trailing ```.
# Both alternatives are anchored to the start/end of the string, so the word "json" or a run of
# backticks occurring inside the generated text is left intact (an unanchored replace would delete it).
_code_fence_pattern = r"^\s*```(?:json)?\s*|\s*```\s*$"

# Remove the wrapping fence (if any), parse the JSON into a struct, then flatten the struct
# into top-level columns next to `path`.
df_final = (
    df_descriptions
    .withColumn("exploded_data", from_json(regexp_replace(col("data"), _code_fence_pattern, ""), schema))
    .select(col('path'), col('exploded_data.*'))
)

In [43]:
# Display the parsed result: `path` plus one column per extracted attribute.
df_final.toPandas()

Unnamed: 0,path,where,how_many_people,proposition,description,task
0,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-09f8.mp4,On a mobile device.,0,The video shows how to navigate to a folder on a mobile device.,The video was recorded on a mobile device. It shows how to navigate to a folder and lists the videos inside it. There are no people visible in the video.,The user navigates to a folder in his mobile and shows the videos it contains.
1,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-1dba.mp4,Room,1,Video tutorial showing how to make a bracelet with colored rubber bands using their fingers.,The video was recorded inside a room by one person. This person shows how to make a bracelet with colored rubber bands using their fingers.,Make a bracelet with colored rubber bands
2,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-2fd5.mp4,Indoors,1,Someone writing on a piece of paper,The video shows a person writing something on a piece of paper with a pen. The background suggests the video was recorded indoors.,Writing
3,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-5da7.mp4,The video is recorded indoors on a wooden table.,1,This video demonstrates how to use a hot glue gun to make crafts.,A person is using a hot glue gun to create a design on a silicone mat. They are working on a wooden table. The video demonstrates how to use a hot glue gun to make crafts.,The person is using a hot glue gun to create a design on a silicone mat.
4,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-7fb1.mp4,Home,1,How to Draw Manga Characters,The video was recorded at home by one person showing a manga style drawing of a female character and comparing it to a comic book. The video is about drawing manga characters.,Drawing
