in src/tab_title_tuning_data.py [0:0]
def compute_training_data_for_tests(self, ai_dataset, test_ids, max_docs):
    """Build fine-tuning records for every task in the given test sets.

    For each id in ``test_ids`` the rows of ``ai_dataset`` whose
    ``test_set_id`` matches are selected, and each unique ``task_id`` in
    that selection is turned into one record: the task's metadata is
    fetched with ``self.get_meta_info_for_task``, a topic is requested
    from an ``OpenAITopicGenerator``, and the first three keywords,
    documents and descriptions are joined into input strings.

    Args:
        ai_dataset: Table-like object exposing a ``test_set_id``
            attribute and a ``"task_id"`` column (presumably a pandas
            DataFrame; confirm against the caller).
        test_ids: Iterable of test-set ids to process.
        max_docs: Forwarded unchanged to ``self.get_meta_info_for_task``.

    Returns:
        A list of dicts with the keys ``"input_titles"``,
        ``"input_keywords"``, ``"input_description"`` and ``"output"``.
        Tasks whose metadata lookup returns ``None`` are skipped.
    """
    generator = OpenAITopicGenerator(support_keywords=True)
    hints_frame = pd.read_csv("./data/topic_model_fine_tune/topic_fine_tuning_data__01_05__grouped_with_hints.csv")
    generator.prepare_hint_data(hints_frame)

    # Only the leading three entries of each metadata list go into a record.
    leading_three = slice(None, 3)

    records = []
    for test_id in test_ids:
        belongs_to_test = ai_dataset.test_set_id == test_id
        test_rows = ai_dataset[belongs_to_test].reset_index(drop=True)
        task_ids = test_rows["task_id"].unique().tolist()

        for task_id in task_ids:
            meta = self.get_meta_info_for_task(test_rows, task_id, max_docs)
            if meta is None:
                print("Skipping invalid / missing metadata for item")
                continue

            print(meta)
            generated_topic = generator.get_topic(meta)
            print(f"AI topic is: {generated_topic}")

            # The "2023" keyword is dropped only after the topic has been
            # requested, so the generator is called with the unfiltered
            # metadata.
            meta["keywords"] = [kw for kw in meta["keywords"] if kw != "2023"]

            keywords_text = ",".join(meta["keywords"][leading_three])
            titles_text = "\n".join(meta["documents"][leading_three])
            descriptions_text = "\n".join(meta["descriptions"][leading_three])

            record = {
                "input_titles": titles_text,
                "input_keywords": keywords_text,
                "input_description": descriptions_text,
                "output": generated_topic,
            }
            records.append(record)

    return records