in videocategorization/create_prompts.py [0:0]
def generate_prompts_and_save(df_path, output_dir='prompts', max_workers=None, chunksize=1000):
    """Build prompts for every row of a pickled DataFrame and save them in chunks.

    The taxonomy is read from ``content_taxonomy.json`` (resolved against the
    current working directory) and reduced with ``get_leaves``. The DataFrame
    is then walked in slices of ``chunksize`` rows; each row is passed to
    ``process_row(row, leaves)`` on a thread pool, ``None`` results are
    dropped, and the remaining results of a slice are handed to
    ``save_prompts_to_file`` as ``<output_dir>/prompts_<index>.json``.

    Args:
        df_path: Path to the pickled DataFrame, loaded with ``pd.read_pickle``.
        output_dir: Directory receiving the chunk files; created if missing.
        max_workers: Forwarded to ``ThreadPoolExecutor``; ``None`` lets the
            executor pick its default.
        chunksize: Number of DataFrame rows processed per chunk (must be >= 1).

    Raises:
        ValueError: If ``chunksize`` is smaller than 1.
    """
    # A zero step already makes range() raise ValueError; a negative one would
    # silently process nothing, so reject both up front with a clear message.
    if chunksize < 1:
        raise ValueError(f"chunksize must be a positive integer, got {chunksize!r}")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load the taxonomy content (JSON is UTF-8 by specification, so do not
    # depend on the platform's default locale encoding).
    with open('content_taxonomy.json', 'r', encoding='utf-8') as file:
        taxonomy_content = json.load(file)
    leaves = get_leaves(taxonomy_content)

    # Load the entire DataFrame first (ensure this fits in memory).
    # SECURITY: unpickling can execute arbitrary code; only pass files from a
    # trusted source as df_path.
    df = pd.read_pickle(df_path)

    # Process in chunks
    chunk_index = 0
    for start in range(0, len(df), chunksize):
        chunk = df.iloc[start:start + chunksize]

        # Use ThreadPoolExecutor for file I/O-bound operations
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            mapped = executor.map(
                process_row,
                (row for _, row in chunk.iterrows()),
                [leaves] * len(chunk),
            )
            # Collect results (in row order) and filter out None results
            results = [result for result in mapped if result is not None]

        # Save results to file in chunks; a chunk with no results writes nothing
        if results:
            chunk_file = os.path.join(output_dir, f'prompts_{chunk_index}.json')
            save_prompts_to_file(results, chunk_file)
            print(f"Saved chunk {chunk_index} to {chunk_file}")
            # NOTE(review): the file's indentation was lost, so it is unclear
            # whether the counter should also advance for chunks that produced
            # no results. Here it advances only when a file is written, which
            # keeps the file numbering contiguous — confirm against the original.
            chunk_index += 1

    print("Completed processing.")