in parlai/crowdsourcing/tasks/model_chat/analysis/compile_results.py [0:0]
def compile_results(self) -> pd.DataFrame:
    """
    Compile per-utterance results from all dated model-chat result folders.

    Walks every ``YYYY_MM_DD`` subfolder of each folder in
    ``self.results_folders`` (optionally filtered by ``self.start_date``),
    reads each completed conversation JSON, and filters out blocked HITs,
    over-quota workers, and conversations whose human turns are too short.
    Per-model and per-worker statistics are accumulated and printed to stdout,
    and worker statistics are also written to a timestamped CSV inside
    ``self.output_folder``.

    :return: one DataFrame containing every utterance of every accepted
        conversation (one persona/context sentinel row with ``turn_idx == -1``
        per conversation, followed by one row per dialogue turn).
    :raises ValueError: if problem buckets are enabled but missing from a
        conversation's problem data.
    """
    # -------- Collect the dated result folders to read --------
    read_folders = []
    all_date_strings = []
    for folder in self.results_folders:
        # Subfolders are named YYYY_MM_DD; keep only real directories
        date_strings = sorted(
            obj
            for obj in os.listdir(folder)
            if os.path.isdir(os.path.join(folder, obj))
            and re.fullmatch(r'\d\d\d\d_\d\d_\d\d', obj)
        )
        if self.start_date != '':
            # Lexicographic comparison is valid given the fixed date format
            date_strings = [
                str_ for str_ in date_strings if str_ >= self.start_date
            ]
        read_folders.extend(os.path.join(folder, str_) for str_ in date_strings)
        all_date_strings.extend(date_strings)
    # NOTE: previously only the last folder's date strings were printed here
    print('Date folders: ' + ', '.join(all_date_strings))

    now = datetime.now()
    worker_results_file = os.path.join(
        self.output_folder, f'worker_results_{now.strftime("%Y%m%d_%H%M%S")}.csv'
    )

    # -------- Read in and process each conversation file --------
    num_incomplete_convos = 0
    num_complete_convos = 0
    complete_convos_per_model = {}
    bad_conversations = []
    stat_counts = {}  # per-model aggregate counters
    worker_stats = {}  # per-worker conversation/problem counters
    worker_conversation_counts = {}  # used to cap conversations per worker
    total_utterances = 0

    conversation_idx = 0
    conversation_dfs = []
    for read_folder in read_folders:
        read_folder_name = os.path.split(read_folder)[-1]
        for file_name in sorted(os.listdir(read_folder)):
            if file_name in self.hit_block_list:
                continue

            if 'incomplete' in file_name:
                num_incomplete_convos += 1
                continue
            else:
                num_complete_convos += 1

            # Read in file
            with open(os.path.join(read_folder, file_name), 'rb') as f:
                data = json.load(f)

            # Only include the first max_convos_per_worker conversations from a
            # worker to avoid biasing
            worker_id = data['workers'][0]
            assignment_id = data['assignment_ids'][0]
            conversations_so_far = worker_conversation_counts.get(worker_id, 0)
            worker_conversation_counts[worker_id] = conversations_so_far + 1
            if (
                self.max_convos_per_worker != -1
                and conversations_so_far >= self.max_convos_per_worker
            ):
                print(
                    f'Had {conversations_so_far} conversation(s) already from this worker {worker_id}. Skipping {assignment_id}.'
                )
                continue

            # Check if we need to block the turker: reject conversations whose
            # human turns are too short on average
            human_turns = [d for d in data['dialog'] if d['agent_idx'] == 0]
            word_counts = [len(d['text'].split(' ')) for d in human_turns]
            utterances = [d['text'] for d in human_turns]
            if np.average(word_counts) < self.min_word_count:
                bad_conversations.append(data)
                print(
                    f'Bad complete conversation, words from human: {utterances}. Skipping.'
                )
                continue

            if self.use_problem_buckets:
                if not all(
                    bucket in data['dialog'][1]['problem_data']
                    for bucket in self.problem_buckets
                ):
                    raise ValueError('Bucket(s) are missing from the problem data!')

            model_nickname = data['task_description']['model_nickname']
            stat_counts.setdefault(model_nickname, {})
            complete_convos_per_model[model_nickname] = (
                complete_convos_per_model.get(model_nickname, 0) + 1
            )

            # Extract non-message info
            info_dict = {
                'read_folder_name': read_folder_name,
                'file_name': file_name,
                'worker': worker_id,
                'model_nickname': model_nickname,
                'bad_workers': ','.join(data['bad_workers']),
                'hit_id': data['hit_ids'][0],
                'assignment_id': assignment_id,
                'is_incomplete': 'incomplete' in file_name,
                'context_dataset': data['context_dataset'],
                'additional_context': data['additional_context'],
            }

            # Check that the conversation consists of pairs of comments between
            # agents 0 and 1, with 0 speaking first
            assert all(
                utterance_data['agent_idx'] == utterance_idx % 2
                for utterance_idx, utterance_data in enumerate(data['dialog'])
            )

            # Determine whether the HIT contains unacceptable messages.
            # (We do this for every HIT, even if acceptability violation info
            # was already saved, because the violation criteria may have
            # changed since the HIT was collected.)
            messages_0 = [utt for utt in data['dialog'] if utt['agent_idx'] == 0]
            messages_1 = [utt for utt in data['dialog'] if utt['agent_idx'] == 1]
            assert len(messages_0) + len(messages_1) == len(data['dialog'])

            # Check the human utterances for safety
            utterances_0 = [m['text'] for m in messages_0]
            info_dict[
                'acceptability_violations_0'
            ] = self.acceptability_checker.check_messages(
                messages=utterances_0,
                is_worker_0=True,
                violation_types=self.acceptability_checker.ALL_VIOLATION_TYPES,
            )

            # Compile personas and previous utterances
            base_columns = [
                'folder',
                'worker_id',
                'hit_id',
                'model_nickname',
                'conversation_idx',
                'turn_idx',
                'agent_idx',
                'text',
            ] + self.problem_buckets
            text_parts = []
            if data['personas'] is not None and len(data['personas']) > 0:
                text_parts += [
                    'your persona: ' + data['personas'][1][0],
                    'your persona: ' + data['personas'][1][1],
                ]
            if (
                data['additional_context'] is not None
                and len(data['additional_context']) > 0
            ):
                text_parts.append(data['additional_context'])
            # Rows are accumulated in a list and turned into a DataFrame at the
            # end, because DataFrame.append was removed in pandas 2.0. The
            # first row is a sentinel (turn_idx == -1) holding persona/context.
            rows = [
                {
                    'folder': info_dict['read_folder_name'],
                    'worker_id': info_dict['worker'],
                    'hit_id': info_dict['hit_id'],
                    'model_nickname': model_nickname,
                    'conversation_idx': conversation_idx,
                    'turn_idx': -1,
                    'agent_idx': 1,
                    'text': '\n'.join(text_parts),
                    **{bucket: '' for bucket in self.problem_buckets},
                }
            ]

            total_utterances += len(
                [d for d in data["dialog"] if d["agent_idx"] == 1]
            )
            if len(data['dialog']) > 20:
                print(
                    f'Got long dialogue of {len(data["dialog"])} utterances, hit id: {info_dict["hit_id"]}, model_nickname: {model_nickname}.'
                )

            if self.use_problem_buckets:
                dialog_has_problems = False
            for utterance_idx, utt in enumerate(data['dialog']):

                d = {
                    'folder': info_dict['read_folder_name'],
                    'worker_id': info_dict['worker'],
                    'hit_id': info_dict['hit_id'],
                    'model_nickname': model_nickname,
                    'conversation_idx': conversation_idx,
                    'turn_idx': utterance_idx,
                    'agent_idx': utt['agent_idx'],
                    'text': utt['text'],
                    **{bucket: '' for bucket in self.problem_buckets},
                }

                if utt['agent_idx'] == 1:
                    # Bot turn: record rating and problem-bucket annotations

                    d['final_rating'] = utt.get('final_rating')

                    if self.use_problem_buckets:
                        if 'problem_data' not in utt:
                            for bucket in self.problem_buckets:
                                d[bucket] = 'MALFORMED'
                            print(
                                f'Warning got MALFORMED utterance problem data inside complete convo: {utt}. Skipping.'
                            )
                            continue
                        else:
                            for bucket in self.regular_buckets + ['none_all_good']:
                                d[bucket] = utt['problem_data'][bucket]
                        for k in self.regular_buckets + ['none_all_good']:
                            if k not in stat_counts[model_nickname]:
                                stat_counts[model_nickname][k] = 0
                            stat_counts[model_nickname][k] += d[k]
                            if k != 'none_all_good' and d[k]:
                                dialog_has_problems = True

                    # 'total' counts bot utterances only; this branch already
                    # guarantees agent_idx == 1
                    if 'total' not in stat_counts[model_nickname]:
                        stat_counts[model_nickname]['total'] = 0
                    stat_counts[model_nickname]['total'] += 1

                    if d['final_rating'] is not None:
                        # Only set on the last utterance (agent idx == 1)
                        if 'count_ratings' not in stat_counts[model_nickname]:
                            stat_counts[model_nickname]['count_ratings'] = 0
                        stat_counts[model_nickname]['count_ratings'] += 1
                        if 'ratings' not in stat_counts[model_nickname]:
                            stat_counts[model_nickname]['ratings'] = []
                        stat_counts[model_nickname]['ratings'].append(
                            int(d['final_rating'])
                        )

                else:

                    # Counting some aspects of the human's utterances
                    if 'human_utterance_count' not in stat_counts[model_nickname]:
                        stat_counts[model_nickname]['human_utterance_count'] = 0
                    stat_counts[model_nickname]['human_utterance_count'] += 1

                    if 'human_word_count' not in stat_counts[model_nickname]:
                        stat_counts[model_nickname]['human_word_count'] = 0
                    stat_counts[model_nickname]['human_word_count'] += len(
                        d['text'].strip().split(' ')
                    )

                    if 'human_question_count' not in stat_counts[model_nickname]:
                        stat_counts[model_nickname]['human_question_count'] = 0
                    stat_counts[model_nickname]['human_question_count'] += d[
                        'text'
                    ].count('?')

                d = self._add_additional_per_turn_stats(d=d, utt=utt)

                rows.append(d)

            # Build the conversation DataFrame. Keys added per turn beyond the
            # base columns (e.g. 'final_rating') become trailing columns,
            # matching the old DataFrame.append(ignore_index=True) behavior.
            df = pd.DataFrame(rows)
            extra_columns = [c for c in df.columns if c not in base_columns]
            df = df.reindex(columns=base_columns + extra_columns)

            if info_dict['worker'] not in worker_stats:
                worker_stats[info_dict['worker']] = {'conversations': 0}
                if self.use_problem_buckets:
                    worker_stats[info_dict['worker']]['problems_found'] = 0
            worker_stats[info_dict['worker']]['conversations'] += 1

            if self.use_problem_buckets:
                # Count the number of problems the worker got
                is_problem = ~df['none_all_good'].replace('', True)
                # Only want to count bot utterances but human ones, while
                # included, won't be False
                count = is_problem.sum()
                worker_stats[info_dict['worker']]['problems_found'] += count

            # Logic for calculating percent of conversations that are clean
            if 'count_convos' not in stat_counts[model_nickname]:
                stat_counts[model_nickname]['count_convos'] = 0
            stat_counts[model_nickname]['count_convos'] += 1

            if self.use_problem_buckets and not dialog_has_problems:
                if 'convo_clean' not in stat_counts[model_nickname]:
                    stat_counts[model_nickname]['convo_clean'] = 0
                stat_counts[model_nickname]['convo_clean'] += 1

            # Adding the full conversation to the list of conversations
            conversation_dfs.append(df)
            conversation_idx += 1

    # -------- Print summary statistics --------
    for m, conversation_count in complete_convos_per_model.items():
        print(f'Got {conversation_count} complete conversation(s) for model: {m}')
    print(f'{num_complete_convos:d} complete conversation(s) collected.')
    print(f'{len(bad_conversations):d} bad conversation(s).')
    num_approved_convos = num_complete_convos - len(bad_conversations)
    print(f'{num_approved_convos:d} approved conversation(s).')
    print(f'({num_incomplete_convos:d} incomplete conversation(s) collected.)')
    for model_nickname, model_stats_dict in stat_counts.items():
        print(f'---{model_nickname}---')
        for p, v in model_stats_dict.items():
            if p == 'count_ratings':
                continue  # reported together with 'ratings' below
            if p == 'ratings':
                print(
                    f'Average Engaging-ness Rating: {np.average(model_stats_dict["ratings"])} ({model_stats_dict["count_ratings"]} ratings)'
                )
                continue
            if p == 'human_word_count' or p == 'human_question_count':
                # Also report the per-human-utterance average
                print(
                    f'{p}: {v} ({v/model_stats_dict["human_utterance_count"]:.3})'
                )
            elif p == 'human_utterance_count':
                print(f'{p}: {v}')
            elif p == 'count_convos':
                print(f'{p}: {v}')
            elif self.use_problem_buckets and p == 'convo_clean':
                print(f'{p}: {v} ({v/model_stats_dict["count_convos"]:.2%})')
            else:
                print(f'{p}: {v} ({v/model_stats_dict["total"]:.2%})')

    # -------- Print worker IDs to consider adding to the block list --------
    print('Printing worker IDs not already in block list to add...')
    for b in bad_conversations:
        worker_id = b['workers'][0]
        if worker_id not in self.worker_block_list:
            print(f"""'{worker_id}',""")
    print('Done printing bad workers.')

    # -------- Compile and save worker statistics --------
    print('Worker stats:')
    worker_columns = ['worker_id', 'conversations']
    if self.use_problem_buckets:
        worker_columns += ['problems_found', 'avg_problems_per_convo']
    worker_rows = []
    for worker_id, stats in worker_stats.items():
        print(worker_id)
        stat = {'worker_id': worker_id, 'conversations': stats['conversations']}
        if self.use_problem_buckets:
            stat.update(
                {
                    'problems_found': stats['problems_found'],
                    'avg_problems_per_convo': stats['problems_found']
                    / stats['conversations'],
                }
            )
        worker_rows.append(stat)
    worker_df = pd.DataFrame(worker_rows, columns=worker_columns)
    if self.use_problem_buckets:
        # ascending must be a bool (was `ascending=0`)
        worker_df = worker_df.sort_values('avg_problems_per_convo', ascending=False)
    worker_df.to_csv(worker_results_file, index=False)
    print(worker_df)
    print(f'Wrote worker statistical results to: {worker_results_file}')

    # -------- Concatenate all conversations into the return value --------
    if conversation_dfs:
        all_conversations_df = pd.concat(conversation_dfs)
    else:
        all_conversations_df = pd.DataFrame()
    print(f'\nWorker conversation counts: {worker_conversation_counts}')

    return all_conversations_df