analysis/plot.py (55 lines of code) (raw):

"""Plot per-prompt Q/A statistics and write a plain-text summary report.

Inputs and outputs (all relative to the current working directory):

* reads ``prompt_analysis_results.json`` as JSON Lines (one record per line);
* saves a 3x2 grid of bar charts to ``prompt_analysis_plots_enhanced.png``;
* writes ``prompt_analysis_report.txt``.
"""
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def _label_in_millions(value):
    """Format a count as millions with two decimals, e.g. 1234567 -> '1.23e6'."""
    return f"{value/1e6:.2f}e6"


def _label_two_decimals(value):
    """Format a value with two decimal places."""
    return f"{value:.2f}"


def _label_plain(value):
    """Format a value with its default string representation."""
    return f"{value}"


# One entry per subplot, in grid order (left-to-right, top-to-bottom):
# (DataFrame column, plot title, y-axis label, bar-label formatter).
PANELS = [
    ('Number of Q/A pairs', 'Number of Q/A pairs per Prompt ID',
     'Number of Q/A pairs', _label_in_millions),
    ('Average answer length', 'Average Answer Length per Prompt ID',
     'Average Answer Length', _label_two_decimals),
    ('Diversity within documents', 'Diversity within Documents per Prompt ID',
     'Diversity within Documents', _label_two_decimals),
    ('Total empty questions', 'Total Empty Questions per Prompt ID',
     'Total Empty Questions', _label_plain),
    ('Average Q/A pairs per page', 'Average Q/A pairs per Page per Prompt ID',
     'Average Q/A pairs per Page', _label_two_decimals),
    ('Number of unique questions', 'Number of unique questions per Prompt ID',
     'Number of unique questions', _label_in_millions),
]

# One entry per report section: (section heading, DataFrame column).
REPORT_SECTIONS = [
    ('Number of Q/A pairs per Prompt ID:', 'Number of Q/A pairs'),
    ('Average answer length per Prompt ID:', 'Average answer length'),
    ('Unique questions per Prompt ID:', 'Number of unique questions'),
    ('Total pages per Prompt ID:', 'Total pages'),
    ('Average Q/A pairs per page per Prompt ID:', 'Average Q/A pairs per page'),
    ('Average answer length per page per Prompt ID:', 'Average answer length per page'),
    ('Diversity within documents per Prompt ID:', 'Diversity within documents'),
    ('Total empty questions per Prompt ID:', 'Total empty questions'),
]

analysis_df = pd.read_json('prompt_analysis_results.json', orient='records', lines=True)

# Plot configurations
sns.set(style="whitegrid")
plt.figure(figsize=(16, 12))

for panel_number, (column, title, y_label, format_label) in enumerate(PANELS, start=1):
    plt.subplot(3, 2, panel_number)
    sns.barplot(x='Prompt ID', y=column, data=analysis_df, palette='viridis')
    plt.title(title)
    plt.xlabel('Prompt ID')
    plt.ylabel(y_label)
    # Annotate each bar with its value. Iterate the column itself rather than
    # DataFrame.iterrows(): iterrows() builds one Series per row, which can
    # upcast integer counts to float (labelling a bar "12.0" instead of "12"),
    # and its index label is not necessarily the bar's position.
    # NOTE(review): like the original, this assumes one row per Prompt ID, with
    # rows in the same order seaborn draws the bars -- confirm against the data.
    for bar_position, value in enumerate(analysis_df[column]):
        plt.text(bar_position, value, format_label(value), ha='center', va='bottom')

plt.tight_layout()
plt.savefig('prompt_analysis_plots_enhanced.png')
plt.show()

# Summary Report: a title followed by one "heading / two-column table" section
# per entry in REPORT_SECTIONS, separated by blank lines.
report_header = "\nPrompt Analysis Report\n=======================\n\n"
report_body = "\n".join(
    f"{heading}\n{analysis_df[['Prompt ID', column]]}\n"
    for heading, column in REPORT_SECTIONS
)
report = report_header + report_body

# Explicit encoding so the report does not depend on the platform default.
with open('prompt_analysis_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)

print("Report and plots generated successfully.")