movie_search/final_ver/backend/scene_search.py (79 lines of code) (raw):
# backend/scene_search.py
import json
from typing import List
import vertexai
import vertexai.preview.generative_models as generative_models
from vertexai.preview.generative_models import GenerativeModel, GenerationConfig
from google.cloud import storage
from utils import PROJECT_ID, generate_download_signed_url_v4, metadata_url_to_movie_blob_name, running_on_cloudrun
from search_document import search_documents_by_query
from prompt_content_search import PROMPT_CONTENT_SEARCH
# --- Global variables ---
# Initialize the Vertex AI SDK at import time; the model handles below are
# created after this call, so the statement order must be kept.
vertexai.init(project=PROJECT_ID, location='us-central1')
# Shared Gemini model handles. They are used as the default `model` arguments
# of generate_text (pro) and search_scene (flash).
model_pro = GenerativeModel('gemini-1.5-pro')
model_flash = GenerativeModel('gemini-1.5-flash')
def generate_text(
    prompt: str,
    model: GenerativeModel = model_pro,
    temperature: float = 0.4,
    top_p: float = 0.4,
) -> List[dict]:
    """Generate structured scene descriptions with Gemini.

    The request forces a JSON response constrained by a schema describing an
    array of objects, each with the required string fields ``Timestamp`` and
    ``Description``. The parsed result is therefore a list of dicts, not a
    single dict.

    Args:
        prompt: Input prompt sent to the model.
        model: Gemini model used for generation.
        temperature: Sampling temperature passed to the generation config.
        top_p: Top-p value passed to the generation config.

    Returns:
        The response text parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If the response text is not valid JSON.
        Exception: Anything raised by the model call or by reading the
            response text propagates unchanged to the caller.
    """
    response_schema = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "Timestamp": {
                    "type": "string",
                },
                "Description": {
                    "type": "string",
                },
            },
            "required": ["Timestamp", "Description"],
        },
    }

    # Every harm category uses the same blocking threshold.
    harm_category = generative_models.HarmCategory
    block_threshold = generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
    safety_settings = {
        category: block_threshold
        for category in (
            harm_category.HARM_CATEGORY_HATE_SPEECH,
            harm_category.HARM_CATEGORY_DANGEROUS_CONTENT,
            harm_category.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            harm_category.HARM_CATEGORY_HARASSMENT,
        )
    }

    # Non-streaming call (stream=True is no longer needed), so the whole
    # answer is available on a single response object.
    response = model.generate_content(
        prompt,
        generation_config=GenerationConfig(
            max_output_tokens=8192,
            temperature=temperature,
            top_p=top_p,
            response_mime_type="application/json",
            response_schema=response_schema,
        ),
        safety_settings=safety_settings,
    )

    # Parse the JSON text and return the resulting Python object.
    return json.loads(response.text)
def search_scene(query: str, top_n: int = 1, model: GenerativeModel = model_flash) -> List[dict]:
    """Run a scene search for the given query.

    For each of the first ``top_n`` document search hits, the video's metadata
    text is downloaded from Cloud Storage, inserted into the content-search
    prompt together with the query, and sent to Gemini. Generation is retried
    with the temperature raised by 0.05 per failed attempt, starting at 0.4
    and stopping once it reaches 1.0.

    Args:
        query: Search query.
        top_n: Number of videos (search hits) to process.
        model: Gemini model used for generation.

    Returns:
        A flat list of the dicts returned by ``generate_text`` for all
        processed videos, each extended with ``signed_url`` (download URL of
        the video) and ``title``. Videos whose signed URL cannot be created,
        or for which every generation attempt fails, contribute no entries.
    """
    response = search_documents_by_query(query, show_summary=False)
    storage_client = storage.Client()
    results = []
    for doc_id in range(min(top_n, len(response.results))):
        # Read the metadata URI and title from the Discovery Engine hit.
        doc_data = response.results[doc_id].document.derived_struct_data
        meta_uri = doc_data['link']
        title = doc_data['title']
        print(f'meta_uri: {meta_uri}')

        # Split "<scheme>//<bucket>/<blob name>" once into bucket and blob.
        uri_parts = meta_uri.split("//")[1].split("/", 1)
        bucket_name = uri_parts[0]
        metadata_blob_name = uri_parts[1]
        movie_blob_name = metadata_url_to_movie_blob_name(meta_uri)

        # Read the metadata text straight from the blob (no temporary file).
        bucket = storage_client.bucket(bucket_name)
        metatext = bucket.blob(metadata_blob_name).download_as_text()
        prompt = PROMPT_CONTENT_SEARCH.format(query=query, metatext=metatext)

        # The signed URL does not depend on the sampling temperature, so it is
        # created once here rather than on every retry. A failure cannot be
        # cured by retrying generation, so the video is skipped.
        print(f'movie_blob_name: {movie_blob_name}')
        try:
            signed_url = generate_download_signed_url_v4(bucket_name, movie_blob_name)
        except Exception as e:
            print(e)
            continue

        temperature = 0.4
        while temperature < 1.0:
            try:
                result = generate_text(prompt, model=model, temperature=temperature)
                # Attach the video's signed URL and title to every scene.
                for r in result:
                    r['signed_url'] = signed_url
                    r['title'] = title
            except Exception as e:
                # Best effort: log the failure and retry with more randomness.
                print(e)
                temperature += 0.05
            else:
                results.extend(result)
                break
        # Print a separator only when generation succeeded for this video.
        if temperature < 1.0:
            print('\n=====')
    return results
# For manual testing
# result = search_scene('AI')
# print(result)