seed/merge_training_dataset_json.ipynb (114 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge multiple JSONL files for OpenAI fine-tuning\n",
    "\n",
    "Collects every `*-oai.jsonl` file in `dataset/` and concatenates their records into a single file, `final_dataset/training-data-v1.jsonl`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "import shutil\n",
    "\n",
    "import pandas as pd\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "from util.preprocess import convert_html_to_md\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "dataset_dir = \"dataset\"\n",
    "output_dir = \"final_dataset\"\n",
    "\n",
    "# glob returns paths in arbitrary order; sort so the merged file is reproducible.\n",
    "all_files = sorted(glob.glob(os.path.join(dataset_dir, \"*-oai.jsonl\")))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Input files\n",
    "\n",
    "The `*-oai.jsonl` files found in `dataset/`, listed in the order they will be merged.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_files"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read records\n",
    "\n",
    "Each non-blank line of every input file is parsed as one JSON record. Lines that fail to parse are reported (file, line number, parser message) and skipped so that one bad record does not abort the merge.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = []\n",
    "\n",
    "for f in all_files:\n",
    "    # utf-8-sig drops a leading BOM if the input file has one, and reads plain UTF-8 unchanged.\n",
    "    with open(f, \"r\", encoding=\"utf-8-sig\") as infile:\n",
    "        for line_no, line in enumerate(infile, start=1):\n",
    "            if not line.strip():\n",
    "                continue  # blank lines (e.g. an empty last line) are not records\n",
    "            try:\n",
    "                result.append(json.loads(line))\n",
    "            except ValueError as err:\n",
    "                print(f\"{f}:{line_no}: invalid JSON skipped ({err})\")\n",
    "\n",
    "len(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the merged file\n",
    "\n",
    "One JSON object per line, encoded as plain UTF-8 without a byte-order mark.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# Plain utf-8, not utf-8-sig: a BOM would be glued to the first record and\n",
    "# make that line invalid JSON for strict parsers.\n",
    "with open(f\"{output_dir}/training-data-v1.jsonl\", \"w\", encoding=\"utf-8\") as outfile:\n",
    "    for entry in result:\n",
    "        outfile.write(json.dumps(entry, ensure_ascii=False) + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !rm -rf {output_dir}"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}