videocategorization/create_prompts.py:

import os
import json
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Given a pandas DataFrame with a list of videos, this script generates custom
# prompts for your videos and by default stores them in a subfolder 'prompts'.
# Expected DataFrame columns: 'video_id', 'title', 'description', 'channel',
# and 'text' (the closed captions).

### CONFIG ###
df_path = 'current_videos.pkl'
###

# Earlier template variant, kept for reference:
# prompt_template = """
# Given those categories: {leaves}
# Classify a youtube video given its closed captioning and some metadata details. RETURN ONLY the selected category and nothing else!
# Title: {title}
# Description: {description}
# Categories: {categories}
# Tags: {tags}
# Channel: {channel}
# Closed Caption: {closed_caption}
# """

prompt_template = """
Given those categories: {leaves}
Classify a youtube video given its closed captioning and some metadata details. RETURN ONLY the selected category and nothing else!
Title: {title}
Description: {description}
Channel: {channel}
Closed Caption: {closed_caption}
"""


def get_leaves(taxonomy):
    """Recursively collect the leaf category names of a nested taxonomy dict."""
    leaves = []
    for key, value in taxonomy.items():
        if isinstance(value, dict) and value:  # Non-empty dictionary: recurse
            leaves.extend(get_leaves(value))
        elif not value:  # Empty dictionary: consider the key a leaf
            leaves.append(key)
    return leaves


def generate_prompt(row, leaves):
    """Fill the prompt template with the leaf categories and one video's metadata."""
    return prompt_template.format(
        leaves=json.dumps(leaves, indent=2),
        title=row['title'],
        description=row['description'],
        channel=row['channel'],
        closed_caption=row['text'][:5000]  # Trim closed captions to bound prompt size
    )


def save_prompts_to_file(prompts, output_file):
    """Save prompts to the output JSON file, overwriting it."""
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(prompts, file, indent=4, ensure_ascii=False)


def process_row(row, leaves):
    """Build the prompt record for a single DataFrame row."""
    video_id = row['video_id']
    prompt = generate_prompt(row, leaves)
    return {"video_id": video_id, "prompt": prompt}


def generate_prompts_and_save(df_path, output_dir='prompts', max_workers=None, chunksize=1000):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load the taxonomy and flatten it to its leaf categories
    with open('content_taxonomy.json', 'r') as file:
        taxonomy_content = json.load(file)
    leaves = get_leaves(taxonomy_content)

    # Load the entire DataFrame first (ensure this fits in memory)
    df = pd.read_pickle(df_path)

    # Process in chunks so each output file stays a manageable size
    chunk_index = 0
    for start in range(0, len(df), chunksize):
        chunk = df.iloc[start:start + chunksize]

        # Parallelize prompt generation across rows. Note: this work is mostly
        # CPU-bound string formatting, so threads give limited speedup under the GIL.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = executor.map(
                process_row,
                (row for _, row in chunk.iterrows()),
                [leaves] * len(chunk)
            )

        # Collect results and filter out None results
        results = [result for result in results if result is not None]

        # Save this chunk's prompts to its own file
        if results:
            chunk_file = os.path.join(output_dir, f'prompts_{chunk_index}.json')
            save_prompts_to_file(results, chunk_file)
            print(f"Saved chunk {chunk_index} to {chunk_file}")
            chunk_index += 1

    print("Completed processing.")


if __name__ == '__main__':
    # Specify the number of workers, for example, 8
    generate_prompts_and_save(df_path, max_workers=8)
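
# A minimal sketch of the taxonomy shape this script expects (assumption: leaf
# categories are keys mapping to empty dicts, and non-empty dicts are recursed
# into; the concrete category names below are hypothetical, chosen only to
# illustrate get_leaves):
#
#   example_taxonomy = {
#       "Arts & Entertainment": {
#           "Music": {},
#           "Film": {"Documentary": {}, "Animation": {}},
#       },
#       "Sports": {},
#   }
#   get_leaves(example_taxonomy)
#   # -> ["Music", "Documentary", "Animation", "Sports"]
#
# Usage sketch for the output: each chunk file is a JSON list of
# {"video_id": ..., "prompt": ...} records, so one way to inspect a chunk is:
#
#   with open('prompts/prompts_0.json', 'r', encoding='utf-8') as f:
#       prompts = json.load(f)
#   print(prompts[0]['video_id'])
#   print(prompts[0]['prompt'][:200])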