def compute_training_data_for_tests()

in src/tab_title_tuning_data.py [0:0]


    def compute_training_data_for_tests(self, ai_dataset, test_ids, max_docs):
        """Build fine-tuning records for every task in the given test sets.

        For each id in ``test_ids`` the rows of ``ai_dataset`` whose
        ``test_set_id`` equals that id are selected, and every distinct
        ``task_id`` among them is handed to ``self.get_meta_info_for_task``.
        Tasks for which that call returns ``None`` are skipped with a console
        message. For the remaining tasks a topic is requested from an
        ``OpenAITopicGenerator`` (prepared with the hint CSV) and one record
        is appended to the result.

        Args:
            ai_dataset: Table used as a pandas DataFrame; it must expose
                ``test_set_id`` and ``task_id`` columns.
            test_ids: Iterable of test-set ids to process, in order.
            max_docs: Forwarded unchanged to ``get_meta_info_for_task``.

        Returns:
            A list of dicts with the keys ``input_titles``,
            ``input_keywords``, ``input_description`` and ``output``. Each
            input field is built from at most the first three entries of the
            corresponding metadata list.
        """
        generator = OpenAITopicGenerator(support_keywords=True)
        hints = pd.read_csv(
            "./data/topic_model_fine_tune/topic_fine_tuning_data__01_05__grouped_with_hints.csv"
        )
        generator.prepare_hint_data(hints)

        def iter_tasks():
            # Lazily yield (rows of one test set, task id) pairs so the main
            # loop below stays flat; each test set is sliced only when reached.
            for test_id in test_ids:
                matches = ai_dataset.test_set_id == test_id
                test_rows = ai_dataset[matches].reset_index(drop=True)
                for task_id in test_rows["task_id"].unique().tolist():
                    yield test_rows, task_id

        records = []
        for test_rows, task_id in iter_tasks():
            cluster = self.get_meta_info_for_task(test_rows, task_id, max_docs)
            if cluster is None:
                print("Skipping invalid / missing metadata for item")
                continue
            print(cluster)

            topic = generator.get_topic(cluster)
            print(f"AI topic is: {topic}")

            # NOTE(review): the "2023" keyword is dropped only after the topic
            # has been generated, so the generator still sees it; only the
            # stored training input is filtered. Confirm this is intended.
            kept_keywords = [kw for kw in cluster["keywords"] if kw != "2023"]
            cluster["keywords"] = kept_keywords

            keywords_text = ",".join(kept_keywords[:3])
            titles_text = "\n".join(cluster["documents"][:3])
            descriptions_text = "\n".join(cluster["descriptions"][:3])

            records.append(
                dict(
                    input_titles=titles_text,
                    input_keywords=keywords_text,
                    input_description=descriptions_text,
                    output=topic,
                )
            )
        return records