movie_search_metadata/Notebooks/preprocess_movie_files.ipynb (236 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"id": "6fa92035-c0f4-4b7f-9502-0640e3bb4515",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## 事前準備"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b87c385e-17db-4224-bddc-e65734bf7a82",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Refresh the apt package index, then install ffmpeg without prompting (-y).\n",
"# ffmpeg is invoked by the shell commands inside process_file further down.\n",
"!sudo apt update\n",
"!sudo apt -y install ffmpeg"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "dc8ae445-351c-4bbf-b081-0ba597fe7631",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Ask the running kernel to shut down; the True argument is the restart flag,\n",
"# so the cells below start from a fresh kernel.\n",
"# NOTE(review): presumably done so the newly installed ffmpeg is picked up -- confirm.\n",
"import IPython\n",
"app = IPython.Application.instance()\n",
"_ = app.kernel.do_shutdown(True)"
]
},
{
"cell_type": "markdown",
"id": "31288a23-851f-4984-af94-71a55fe327ae",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## 無音動画 & 静止画像作成"
]
},
{
"cell_type": "markdown",
"id": "9daced0a-95a4-4654-a307-4dbbde06964c",
"metadata": {},
"source": [
"**[注意]** `gs://[PROJECT_ID]-handson/mp4_original/` 以下に、オリジナルの動画ファイル(mp4)を事前にアップロードしておきます。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "de61229e-a95c-4746-870f-4a8c908c7545",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# `!cmd` output is captured as a list of lines; the one-element unpacking\n",
"# raises ValueError unless gcloud prints exactly one line (the project ID).\n",
"[PROJECT_ID] = !gcloud config get-value project\n",
"# Bucket that holds the original videos and receives every derived file.\n",
"BUCKET = f'gs://{PROJECT_ID}-handson'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c6b68655-508c-4c3e-9e39-5e77a7580b28",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import glob\n",
"import os\n",
"from PIL import Image\n",
"from tqdm.notebook import tqdm\n",
"\n",
"def process_file(target):\n",
"    \"\"\"Create the compressed video, silent video and still images for one movie.\n",
"\n",
"    Args:\n",
"        target: gs:// URL of an original mp4 file.\n",
"\n",
"    Side effects:\n",
"        - Uploads {BUCKET}/mp4/s_<filename>, re-encoded with -crf 31.\n",
"        - Uploads {BUCKET}/mp4/n_<filename>, the compressed video with the\n",
"          audio track removed.\n",
"        - Extracts frames with `-r 1`, converts each png to a jpg at most\n",
"          1280 px wide, and uploads the folder under {BUCKET}/image/.\n",
"        - Uses tmpfile1-3.mp4 and tmp.png in the working directory as scratch\n",
"          files and deletes them at the end; the local <basename>/ folder stays.\n",
"    \"\"\"\n",
"    gsutil_opt = '-o GSUtil:parallel_composite_upload_threshold=150M'\n",
"    ffmpeg_opt = '-loglevel error -v error -stats'\n",
"    filename = target.split('/')[-1]\n",
"    # splitext drops only the extension. str.rstrip('.mp4') would strip any\n",
"    # trailing run of the characters '.', 'm', 'p', '4' ('jump.mp4' -> 'ju').\n",
"    basename = os.path.splitext(filename)[0]\n",
"    filename2 = f'{BUCKET}/mp4/s_' + filename\n",
"    filename3 = f'{BUCKET}/mp4/n_' + filename\n",
"    # No trailing slash here: the upload command below appends it, and a\n",
"    # doubled slash would yield an 'image//' destination.\n",
"    image_dir = f'{BUCKET}/image'\n",
"\n",
"    !rm -f tmpfile1.mp4 tmpfile2.mp4 tmpfile3.mp4\n",
"    !gsutil {gsutil_opt} cp \"{target}\" tmpfile1.mp4\n",
"\n",
"    # Size-compressed video\n",
"    !ffmpeg {ffmpeg_opt} -i tmpfile1.mp4 -crf 31 tmpfile2.mp4\n",
"    !gsutil {gsutil_opt} cp tmpfile2.mp4 \"{filename2}\"\n",
"\n",
"    # Silent video (video stream copied as-is, audio dropped)\n",
"    !ffmpeg {ffmpeg_opt} -i tmpfile2.mp4 -vcodec copy -an tmpfile3.mp4\n",
"    !gsutil {gsutil_opt} cp tmpfile3.mp4 \"{filename3}\"\n",
"\n",
"    # Still images\n",
"    !mkdir -p \"{basename}\"\n",
"    !ffmpeg {ffmpeg_opt} -i tmpfile2.mp4 -r 1 \"{basename}/capture%04d.png\"\n",
"\n",
"    # Convert png to jpg\n",
"    # Read the frame width once; the context manager closes the file handle.\n",
"    with Image.open(f'{basename}/capture0001.png') as first_frame:\n",
"        width, _ = first_frame.size\n",
"    width = min(width, 1280)\n",
"    file_list = glob.glob(f'{basename}/capture*.png')\n",
"    for img_path in tqdm(file_list):\n",
"        img_path2 = os.path.splitext(img_path)[0] + '.jpg'\n",
"        # Move the png out of the folder so only jpg files get uploaded.\n",
"        !mv -f \"{img_path}\" tmp.png\n",
"        # -y overwrites a jpg left by an earlier run instead of waiting on\n",
"        # ffmpeg's interactive overwrite prompt.\n",
"        !ffmpeg -y {ffmpeg_opt} -i tmp.png -vf scale=\"{width}:-1\" -q 2 \"{img_path2}\" 1>/dev/null 2>&1\n",
"    !gsutil -m {gsutil_opt} cp -r \"{basename}\" \"{image_dir}/\"\n",
"    !rm -f tmpfile1.mp4 tmpfile2.mp4 tmpfile3.mp4 tmp.png"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9922bc9b-4fcc-4c03-8750-e8264a237780",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['gs://your-project-handson/mp4_original/Google Cloud Next Tokyo ’24 - Innovators Hive の Day 0 に潜入! -.mp4',\n",
" 'gs://your-project-handson/mp4_original/千葉県印西市にデータセンターを開設.mp4',\n",
" 'gs://your-project-handson/mp4_original/大規模言語モデルを支える分散学習インフラ Pathways.mp4']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Collect the gs:// URLs of the original mp4 files (one list entry per output line).\n",
"targets = !gsutil ls {BUCKET}/mp4_original/*.mp4\n",
"targets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ec46d72-8cb2-4971-9442-c0cf2339e46d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Run process_file on every original video listed in `targets`, one at a time.\n",
"for target in targets:\n",
"    process_file(target)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1a002a89-5db0-46ce-935f-c4f08c24bd61",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gs://your-project-handson/mp4/n_Google Cloud Next Tokyo ’24 - Innovators Hive の Day 0 に潜入! -.mp4\n",
"gs://your-project-handson/mp4/n_千葉県印西市にデータセンターを開設.mp4\n",
"gs://your-project-handson/mp4/n_大規模言語モデルを支える分散学習インフラ Pathways.mp4\n",
"gs://your-project-handson/mp4/s_Google Cloud Next Tokyo ’24 - Innovators Hive の Day 0 に潜入! -.mp4\n",
"gs://your-project-handson/mp4/s_千葉県印西市にデータセンターを開設.mp4\n",
"gs://your-project-handson/mp4/s_大規模言語モデルを支える分散学習インフラ Pathways.mp4\n"
]
}
],
"source": [
"# Check the uploads: s_ files are the compressed videos, n_ files the silent ones.\n",
"!gsutil ls {BUCKET}/mp4/"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "937c41ef-aa0b-4047-bd29-6f25ef742b82",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gs://your-project-handson/image/Google Cloud Next Tokyo ’24 - Innovators Hive の Day 0 に潜入! -/\n",
"gs://your-project-handson/image/千葉県印西市にデータセンターを開設/\n",
"gs://your-project-handson/image/大規模言語モデルを支える分散学習インフラ Pathways/\n"
]
}
],
"source": [
"# Check the uploads: one image folder per processed video.\n",
"!gsutil ls {BUCKET}/image/"
]
}
],
"metadata": {
"environment": {
"kernel": "conda-base-py",
"name": "workbench-notebooks.m119",
"type": "gcloud",
"uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m119"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel) (Local)",
"language": "python",
"name": "conda-base-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}