movie_search_metadata/Notebooks/preprocess_movie_files.ipynb (236 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6fa92035-c0f4-4b7f-9502-0640e3bb4515",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 事前準備"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b87c385e-17db-4224-bddc-e65734bf7a82",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!sudo apt update\n",
    "!sudo apt -y install ffmpeg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dc8ae445-351c-4bbf-b081-0ba597fe7631",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import IPython\n",
    "app = IPython.Application.instance()\n",
    "_ = app.kernel.do_shutdown(True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31288a23-851f-4984-af94-71a55fe327ae",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 無音動画 & 静止画像作成"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9daced0a-95a4-4654-a307-4dbbde06964c",
   "metadata": {},
   "source": [
    "**[注意]** `gs://[PROJECT_ID]-handson/mp4_original/` 以下に、オリジナルの動画ファイル(mp4)を事前にアップロードしておきます。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "de61229e-a95c-4746-870f-4a8c908c7545",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "[PROJECT_ID] = !gcloud config get-value project\n",
    "BUCKET = f'gs://{PROJECT_ID}-handson'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c6b68655-508c-4c3e-9e39-5e77a7580b28",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "import subprocess\n",
    "import tempfile\n",
    "\n",
    "from PIL import Image\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "# Global options for gsutil / ffmpeg, kept as argument lists so that no shell\n",
    "# (and therefore no quoting of file names) is involved.\n",
    "GSUTIL_OPTS = ['-o', 'GSUtil:parallel_composite_upload_threshold=150M']\n",
    "FFMPEG_OPTS = ['-loglevel', 'error', '-stats']\n",
    "\n",
    "\n",
    "def _run(cmd, cwd=None, quiet=False):\n",
    "    \"\"\"Run `cmd` (an argument list) without a shell; raise if it exits non-zero.\"\"\"\n",
    "    sink = subprocess.DEVNULL if quiet else None\n",
    "    subprocess.run(cmd, cwd=cwd, check=True, stdout=sink, stderr=sink)\n",
    "\n",
    "\n",
    "def process_file(target, crf=31, max_width=1280):\n",
    "    \"\"\"Create the derived assets for one original video stored in GCS.\n",
    "\n",
    "    Downloads `target`, then uploads:\n",
    "      * `{BUCKET}/mp4/s_<filename>`: the video re-encoded with `-crf <crf>`\n",
    "      * `{BUCKET}/mp4/n_<filename>`: the re-encoded video with the audio removed\n",
    "      * `{BUCKET}/image/<basename>/captureNNNN.jpg`: one frame per second,\n",
    "        at most `max_width` pixels wide\n",
    "\n",
    "    Args:\n",
    "        target: gs:// URL of the original .mp4 object.\n",
    "        crf: value passed to ffmpeg's `-crf` option.\n",
    "        max_width: upper bound for the width of the extracted JPEG frames.\n",
    "\n",
    "    Raises:\n",
    "        subprocess.CalledProcessError: if a gsutil or ffmpeg command fails.\n",
    "        FileNotFoundError: if ffmpeg extracted no frames from the video.\n",
    "    \"\"\"\n",
    "    filename = target.rsplit('/', 1)[-1]\n",
    "    # splitext removes only the extension. str.rstrip('.mp4') strips any run of\n",
    "    # the characters '.', 'm', 'p', '4', so 'temp.mp4' would become 'te'.\n",
    "    basename = os.path.splitext(filename)[0]\n",
    "    small_uri = f'{BUCKET}/mp4/s_{filename}'\n",
    "    silent_uri = f'{BUCKET}/mp4/n_{filename}'\n",
    "    image_dir = f'{BUCKET}/image/'\n",
    "\n",
    "    # Scratch space lives under the current directory (same disk as before) and\n",
    "    # is removed even when a step fails, so re-runs never see stale files.\n",
    "    with tempfile.TemporaryDirectory(dir=os.getcwd()) as workdir:\n",
    "        original_mp4 = os.path.join(workdir, 'original.mp4')\n",
    "        small_mp4 = os.path.join(workdir, 'small.mp4')\n",
    "        silent_mp4 = os.path.join(workdir, 'silent.mp4')\n",
    "        frame_dir = os.path.join(workdir, basename)\n",
    "\n",
    "        _run(['gsutil', *GSUTIL_OPTS, 'cp', target, original_mp4])\n",
    "\n",
    "        # Size-reduced video\n",
    "        _run(['ffmpeg', *FFMPEG_OPTS, '-i', original_mp4, '-crf', str(crf), small_mp4])\n",
    "        _run(['gsutil', *GSUTIL_OPTS, 'cp', small_mp4, small_uri])\n",
    "\n",
    "        # Silent video (video stream copied as-is, audio dropped)\n",
    "        _run(['ffmpeg', *FFMPEG_OPTS, '-i', small_mp4, '-vcodec', 'copy', '-an', silent_mp4])\n",
    "        _run(['gsutil', *GSUTIL_OPTS, 'cp', silent_mp4, silent_uri])\n",
    "\n",
    "        # Still frames, one per second. ffmpeg runs inside frame_dir so the\n",
    "        # '%04d' pattern never sees characters from the video's own name.\n",
    "        os.makedirs(frame_dir)\n",
    "        _run(['ffmpeg', *FFMPEG_OPTS, '-i', small_mp4, '-r', '1', 'capture%04d.png'],\n",
    "             cwd=frame_dir)\n",
    "\n",
    "        png_paths = sorted(glob.glob(os.path.join(glob.escape(frame_dir), 'capture*.png')))\n",
    "        if not png_paths:\n",
    "            raise FileNotFoundError(f'ffmpeg extracted no frames from {target}')\n",
    "\n",
    "        with Image.open(png_paths[0]) as first_frame:\n",
    "            width = min(first_frame.size[0], max_width)\n",
    "\n",
    "        # Convert PNG to JPEG; each PNG is deleted so only JPEGs get uploaded.\n",
    "        for png_path in tqdm(png_paths):\n",
    "            png_name = os.path.basename(png_path)\n",
    "            jpg_name = os.path.splitext(png_name)[0] + '.jpg'\n",
    "            _run(['ffmpeg', *FFMPEG_OPTS, '-i', png_name,\n",
    "                  '-vf', f'scale={width}:-1', '-q:v', '2', jpg_name],\n",
    "                 cwd=frame_dir, quiet=True)\n",
    "            os.remove(png_path)\n",
    "\n",
    "        _run(['gsutil', '-m', *GSUTIL_OPTS, 'cp', '-r', frame_dir, image_dir])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9922bc9b-4fcc-4c03-8750-e8264a237780",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['gs://your-project-handson/mp4_original/Google Cloud Next Tokyo ’24 - Innovators Hive の Day 0 に潜入! -.mp4',\n",
       " 'gs://your-project-handson/mp4_original/千葉県印西市にデータセンターを開設.mp4',\n",
       " 'gs://your-project-handson/mp4_original/大規模言語モデルを支える分散学習インフラ Pathways.mp4']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "targets = !gsutil ls {BUCKET}/mp4_original/*.mp4\n",
    "targets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ec46d72-8cb2-4971-9442-c0cf2339e46d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "for target in targets:\n",
    "    process_file(target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1a002a89-5db0-46ce-935f-c4f08c24bd61",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gs://your-project-handson/mp4/n_Google Cloud Next Tokyo ’24 - Innovators Hive の Day 0 に潜入! -.mp4\n",
      "gs://your-project-handson/mp4/n_千葉県印西市にデータセンターを開設.mp4\n",
      "gs://your-project-handson/mp4/n_大規模言語モデルを支える分散学習インフラ Pathways.mp4\n",
      "gs://your-project-handson/mp4/s_Google Cloud Next Tokyo ’24 - Innovators Hive の Day 0 に潜入! -.mp4\n",
      "gs://your-project-handson/mp4/s_千葉県印西市にデータセンターを開設.mp4\n",
      "gs://your-project-handson/mp4/s_大規模言語モデルを支える分散学習インフラ Pathways.mp4\n"
     ]
    }
   ],
   "source": [
    "!gsutil ls {BUCKET}/mp4/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "937c41ef-aa0b-4047-bd29-6f25ef742b82",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gs://your-project-handson/image/Google Cloud Next Tokyo ’24 - Innovators Hive の Day 0 に潜入! -/\n",
      "gs://your-project-handson/image/千葉県印西市にデータセンターを開設/\n",
      "gs://your-project-handson/image/大規模言語モデルを支える分散学習インフラ Pathways/\n"
     ]
    }
   ],
   "source": [
    "!gsutil ls {BUCKET}/image/"
   ]
  }
 ],
 "metadata": {
  "environment": {
   "kernel": "conda-base-py",
   "name": "workbench-notebooks.m119",
   "type": "gcloud",
   "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m119"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel) (Local)",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}