def compile_results()

in parlai/crowdsourcing/projects/multisession_chat/human_eval/compile_results.py

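Reads every saved conversation file under the dated results folders, skips incomplete, sandbox, and low-quality HITs, accumulates per-model and per-worker statistics, and returns all conversations concatenated into a single pandas DataFrame. The method relies on the file's module-level imports (os, re, json, numpy as np, pandas as pd).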

    def compile_results(self) -> pd.DataFrame:
        # TODO modularize the shared components to dedup the code
        read_folders = []
        all_date_strings = []
        for folder in self.results_folders:
            # Load paths
            # TODO load this data in using DataBrowser
            date_strings = sorted(
                [
                    obj
                    for obj in os.listdir(folder)
                    if os.path.isdir(os.path.join(folder, obj))
                    and re.fullmatch(r'\d\d\d\d_\d\d_\d\d', obj)
                ]
            )
            if self.start_date != '':
                date_strings = [
                    str_ for str_ in date_strings if str_ >= self.start_date
                ]
            folders = [os.path.join(folder, str_) for str_ in date_strings]
            read_folders.extend(folders)
            all_date_strings.extend(date_strings)
        print('Date folders: ' + ', '.join(all_date_strings))

        # Initialize accumulators for conversation and worker statistics
        num_incomplete_convos = 0
        num_complete_convos = 0
        complete_convos_per_model = {}
        bad_conversations = []
        worker_stats = {}
        worker_conversation_counts = {}

        conversation_idx = 0
        conversation_dfs = []
        stat_counts = {}
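
        # Read in each conversation file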
        for read_folder in read_folders:
            read_folder_name = os.path.split(read_folder)[-1]
            for file_name in sorted(os.listdir(read_folder)):
                if file_name in self.hit_block_list or 'sandbox' in file_name:
                    continue

                if 'incomplete' in file_name:
                    num_incomplete_convos += 1
                    continue
                else:
                    num_complete_convos += 1

                # Read in file
                with open(os.path.join(read_folder, file_name), 'rb') as f:
                    data = json.load(f)

                # Only include the first max_convos_per_worker conversations from a
                # worker, to avoid biasing the results toward prolific workers
                worker_id = data['workers'][0]
                worker_id = worker_id.split('-')[-1]
                assignment_id = data['assignment_ids'][0]
                if worker_id in worker_conversation_counts:
                    conversations_so_far = worker_conversation_counts[worker_id]
                else:
                    conversations_so_far = 0
                worker_conversation_counts[worker_id] = conversations_so_far + 1
                if (
                    self.max_convos_per_worker != -1
                    and conversations_so_far >= self.max_convos_per_worker
                ):
                    print(
                        f'Had {conversations_so_far} conversation(s) already from this worker {worker_id}. Skipping {assignment_id}.'
                    )
                    continue

                # Check whether the worker should be blocked for low-quality messages
                word_counts = [
                    len(d['text'].split(' '))
                    for d in data['dialog']
                    if d['agent_idx'] == 0
                ]
                utterances = [d['text'] for d in data['dialog'] if d['agent_idx'] == 0]
                if np.average(word_counts) < self.min_word_count:
                    bad_conversations.append(data)
                    print(
                        f'Bad complete conversation; human utterances: {utterances}. Skipping.'
                    )
                    continue

                if not all(
                    bucket in data['dialog'][0]['problem_data']
                    for bucket in self.problem_buckets
                ):
                    raise ValueError('Bucket(s) are missing from the problem data!')

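                # All results folders should contain conversations from a single model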
                model_nickname = data['model_name']
                assert self.model_nickname == model_nickname
                initial_data_id = data['context_info']['observation_for_bot'][
                    'initial_data_id'
                ]
                if model_nickname not in stat_counts:
                    stat_counts[model_nickname] = {}
                if model_nickname in complete_convos_per_model:
                    complete_convos_per_model[model_nickname].append(initial_data_id)
                else:
                    complete_convos_per_model[model_nickname] = [initial_data_id]

                # Extract non-message info
                info_dict = {
                    'read_folder_name': read_folder_name,
                    'file_name': file_name,
                    'worker': worker_id,
                    'model_nickname': model_nickname,
                    'bad_workers': ','.join(data['bad_workers']),
                    'hit_id': data['hit_ids'][0],
                    'assignment_id': assignment_id,
                    'is_incomplete': 'incomplete' in file_name,
                    'context_info': data['context_info'],
                    'bot_persona_strings': data['bot_persona_strings'],
                    'human_persona_strings': data['human_persona_strings'],
                    'initial_task_data': data['initial_task_data'],
                    'initial_data_id': initial_data_id,
                }

                # Check that the conversation consists of alternating turns between
                # agents 0 and 1, with agent 1 (the bot) speaking first
                assert all(
                    [
                        utterance_data['agent_idx'] == (utterance_idx + 1) % 2
                        for utterance_idx, utterance_data in enumerate(data['dialog'])
                    ]
                )

                # Determine whether the HIT contains unacceptable messages.
                # (We do this for every HIT, even if acceptability violation info
                # was already saved, because the violation criteria may have
                # changed since the HIT was collected.)
                messages_0 = [utt for utt in data['dialog'] if utt['agent_idx'] == 0]
                messages_1 = [utt for utt in data['dialog'] if utt['agent_idx'] == 1]
                assert len(messages_0) + len(messages_1) == len(data['dialog'])

                # Check the human utterances for acceptability violations
                utterances_0 = [m['text'] for m in messages_0]
                info_dict[
                    'acceptability_violations_0'
                ] = self.acceptability_checker.check_messages(
                    messages=utterances_0,
                    is_worker_0=True,
                    violation_types=self.acceptability_checker.ALL_VIOLATION_TYPES,
                )

                # Compile personas and previous utterances
                df = pd.DataFrame(
                    [],
                    columns=[
                        'folder',
                        'file_name',
                        'worker_id',
                        'hit_id',
                        'is_incomplete',
                        'context_info',
                        'initial_data_id',
                        'acceptability_violations_0',
                        'model_nickname',
                        'conversation_idx',
                        'turn_idx',
                        'agent_idx',
                        'text',
                    ]
                    + self.problem_buckets,
                )
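                # NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in
                # pandas 2.0; this and the later .append calls require pandas < 2.0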
                df = df.append(
                    {
                        'folder': info_dict['read_folder_name'],
                        'file_name': info_dict['file_name'],
                        'worker_id': info_dict['worker'],
                        'hit_id': info_dict['hit_id'],
                        'is_incomplete': info_dict['is_incomplete'],
                        'context_info': info_dict['context_info'],
                        'initial_data_id': info_dict['initial_data_id'],
                        'acceptability_violations_0': info_dict[
                            'acceptability_violations_0'
                        ],
                        'model_nickname': model_nickname,
                        'conversation_idx': conversation_idx,
                        'turn_idx': -1,
                        'agent_idx': 0,
                        'text': info_dict['context_info']['observation_for_bot'][
                            'text'
                        ],
                        **{bucket: '' for bucket in self.problem_buckets},
                    },
                    ignore_index=True,
                )

                for utterance_idx, utt in enumerate(data['dialog']):

                    d = {
                        'folder': info_dict['read_folder_name'],
                        'file_name': info_dict['file_name'],
                        'worker_id': info_dict['worker'],
                        'hit_id': info_dict['hit_id'],
                        'is_incomplete': info_dict['is_incomplete'],
                        'context_info': info_dict['context_info'],
                        'initial_data_id': info_dict['initial_data_id'],
                        'acceptability_violations_0': info_dict[
                            'acceptability_violations_0'
                        ],
                        'model_nickname': model_nickname,
                        'conversation_idx': conversation_idx,
                        'turn_idx': utterance_idx,
                        'agent_idx': utt['agent_idx'],
                        'text': utt['text'],
                        **{bucket: '' for bucket in self.problem_buckets},
                    }

                    if utt['agent_idx'] == 1:
                        if 'problem_data' not in utt:
                            for bucket in self.problem_buckets:
                                d[bucket] = 'MALFORMED'
                            print(
                                f'Warning: got MALFORMED problem data inside a complete convo: {utt}. Skipping.'
                            )
                            continue
                        else:
                            for bucket in self.regular_buckets:
                                d[bucket] = utt['problem_data'][bucket]
                            d['final_rating'] = utt.get('final_rating')
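                        # Problem-bucket values are booleans, so summing them counts
                        # how many bot utterances were flagged for each bucket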
                        for k in self.regular_buckets:
                            if k not in stat_counts[model_nickname]:
                                stat_counts[model_nickname][k] = 0
                            stat_counts[model_nickname][k] += d[k]

                        if 'total' not in stat_counts[model_nickname]:
                            stat_counts[model_nickname]['total'] = 0
                        stat_counts[model_nickname]['total'] += 1
                        if d['final_rating'] is not None:
                            # The rating is only given on the last utterance (agent_idx == 1)
                            if 'count_ratings' not in stat_counts[model_nickname]:
                                stat_counts[model_nickname]['count_ratings'] = 0
                            stat_counts[model_nickname]['count_ratings'] += 1
                            if 'ratings' not in stat_counts[model_nickname]:
                                stat_counts[model_nickname]['ratings'] = []
                            if 'pairwise_ratings' not in stat_counts[model_nickname]:
                                stat_counts[model_nickname]['pairwise_ratings'] = {}
                            stat_counts[model_nickname]['ratings'].append(
                                int(d['final_rating'])
                            )
                            stat_counts[model_nickname]['pairwise_ratings'][
                                info_dict['initial_data_id']
                            ] = int(d['final_rating'])

                        if 'bot_word_count' not in stat_counts[model_nickname]:
                            stat_counts[model_nickname]['bot_word_count'] = 0
                        stat_counts[model_nickname]['bot_word_count'] += len(
                            d['text'].strip().split(' ')
                        )
                    else:

                        # Counting some aspects of the human's utterances
                        if 'human_utterance_count' not in stat_counts[model_nickname]:
                            stat_counts[model_nickname]['human_utterance_count'] = 0
                        stat_counts[model_nickname]['human_utterance_count'] += 1

                        if 'human_word_count' not in stat_counts[model_nickname]:
                            stat_counts[model_nickname]['human_word_count'] = 0
                        stat_counts[model_nickname]['human_word_count'] += len(
                            d['text'].strip().split(' ')
                        )

                        if 'human_question_count' not in stat_counts[model_nickname]:
                            stat_counts[model_nickname]['human_question_count'] = 0
                        stat_counts[model_nickname]['human_question_count'] += d[
                            'text'
                        ].count('?')

                    # Append this utterance's row to the conversation's DataFrame
                    df = df.append(d, ignore_index=True)

                # Track the number of conversations completed by each worker
                if info_dict['worker'] not in worker_stats:
                    worker_stats[info_dict['worker']] = {'conversations': 0}
                worker_stats[info_dict['worker']]['conversations'] += 1

                # Count the number of complete conversations per model
                if 'count_convos' not in stat_counts[model_nickname]:
                    stat_counts[model_nickname]['count_convos'] = 0
                stat_counts[model_nickname]['count_convos'] += 1

                # Adding the full conversation to the list of conversations
                conversation_dfs.append(df)
                conversation_idx += 1

        for m, conversations_completed in complete_convos_per_model.items():
            print(
                f'Got {len(conversations_completed)} complete conversations for model: {m}'
            )
            print(f"{m} completed: {conversations_completed}")

        print(f'{num_complete_convos:d} complete conversation(s) collected.')
        print(f'{len(bad_conversations):d} bad conversation(s).')
        num_approved_convos = num_complete_convos - len(bad_conversations)
        print(f'{num_approved_convos:d} approved conversation(s).')
        print(f'({num_incomplete_convos:d} incomplete conversation(s) collected.)')
        for model_nickname, model_stats_dict in stat_counts.items():
            print(f'---{model_nickname}---')
            for p, v in model_stats_dict.items():
                if p == 'count_ratings' or p == 'pairwise_ratings':
                    continue
                if p == 'ratings':
                    print(
                        f'Average Engaging-ness Rating: {np.average(model_stats_dict["ratings"])} ({model_stats_dict["count_ratings"]} ratings)'
                    )
                    continue
                if p == 'human_word_count' or p == 'human_question_count':
                    print(
                        f'{p}: {v} ({v/model_stats_dict["human_utterance_count"]:.3})'
                    )
                elif p == 'bot_word_count':
                    print(f'{p}: {v} ({v/model_stats_dict["total"]:.3})')
                elif p == 'human_utterance_count':
                    print(f'{p}: {v}')
                elif p == 'count_convos':
                    print(f'{p}: {v}')
                else:
                    print(f'{p}: {v} ({v/model_stats_dict["total"]:.2%})')

        print('Printing worker IDs not already in block list to add...')
        for b in bad_conversations:
            worker_id = b['workers'][0]
            if worker_id not in self.worker_block_list:
                print(f"""'{worker_id}',""")
        print('Done printing bad workers.')

        worker_df = pd.DataFrame([], columns=['worker_id', 'conversations'])

        for worker_id, data in worker_stats.items():
            stat = {'worker_id': worker_id, 'conversations': data['conversations']}
            worker_df = worker_df.append(stat, ignore_index=True)

        with open(self.completed_run_stats_path, 'r') as f:
            completed_run_stats = json.load(f)
        assert completed_run_stats['bot_model_name'] == self.model_nickname
        completed_run_stats['context_done_statistics'][
            self.model_nickname
        ] = complete_convos_per_model[self.model_nickname]
        completed_run_stats['context_done_counts'] = len(
            complete_convos_per_model[self.model_nickname]
        )
        with open(self.completed_run_stats_path, 'w') as fw:
            json.dump(completed_run_stats, fw)
        print(f'Wrote completed run stats to: {self.completed_run_stats_path}')

        rating_path = os.path.join(self.output_folder, 'pairwise_ratings.json')
        with open(rating_path, 'w') as fw:
            json.dump(stat_counts[self.model_nickname]['pairwise_ratings'], fw)
        print(f'Wrote pairwise ratings to: {rating_path}')

        # Save full results
        all_conversations_df = pd.DataFrame()
        for df in conversation_dfs:
            all_conversations_df = all_conversations_df.append(df)

        return all_conversations_df