def generate_prompts_and_save()

in videocategorization/create_prompts.py [0:0]


def generate_prompts_and_save(df_path, output_dir='prompts', max_workers=None, chunksize=1000):
    """Build prompts for every row of a pickled DataFrame and save them in chunks.

    The DataFrame at ``df_path`` is loaded whole, split into consecutive
    slices of ``chunksize`` rows, and each row is passed to ``process_row``
    together with the taxonomy leaves. Rows for which ``process_row`` returns
    ``None`` are dropped. Every slice that yields at least one result is
    written to ``<output_dir>/prompts_<k>.json`` via ``save_prompts_to_file``.

    Args:
        df_path: Path to a pickled pandas DataFrame. The whole frame is
            loaded into memory at once.
        output_dir: Directory receiving the chunk files; created if missing.
        max_workers: Forwarded to ``ThreadPoolExecutor``; ``None`` selects
            the executor's default.
        chunksize: Number of DataFrame rows handled per output file. Must be
            a positive integer.

    Raises:
        ValueError: If ``chunksize`` is less than 1.

    Any exception raised by ``process_row`` propagates to the caller.
    """
    if chunksize < 1:
        # range() already rejects a step of 0; a negative step would silently
        # process nothing, so reject both up front with a clear message.
        raise ValueError(f"chunksize must be a positive integer, got {chunksize!r}")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load the taxonomy content (path is relative to the current working
    # directory). JSON is UTF-8 by specification, so do not depend on the
    # platform's locale encoding.
    with open('content_taxonomy.json', 'r', encoding='utf-8') as file:
        taxonomy_content = json.load(file)

    leaves = get_leaves(taxonomy_content)

    # Load the entire DataFrame first (ensure this fits in memory).
    # NOTE(security): unpickling can execute arbitrary code; only pass a
    # df_path that comes from a trusted source.
    df = pd.read_pickle(df_path)

    # chunk_index counts *saved* chunks only, so output files are numbered
    # consecutively even when some slices produce no prompts.
    chunk_index = 0
    for start in range(0, len(df), chunksize):
        chunk = df.iloc[start:start + chunksize]

        # Fan the rows out over a thread pool (presumably process_row is
        # I/O-bound -- confirm against its implementation). The lazy map
        # iterator is drained inside the `with` block so that a worker
        # exception surfaces as soon as its result is reached; None results
        # are filtered out while collecting.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = [
                result
                for result in executor.map(
                    process_row,
                    (row for _, row in chunk.iterrows()),
                    [leaves] * len(chunk),
                )
                if result is not None
            ]

        # Save results to file in chunks
        if results:
            chunk_file = os.path.join(output_dir, f'prompts_{chunk_index}.json')
            save_prompts_to_file(results, chunk_file)
            print(f"Saved chunk {chunk_index} to {chunk_file}")
            chunk_index += 1

    print("Completed processing.")