in src/util/topic_utils.py [0:0]
def create_topic_training_dataset(
    df: pd.DataFrame,
    user_label_key: str,
    topic_generator,
    predicted_id_topics,
) -> pd.DataFrame:
    """Build a side-by-side table of generated topics for every user label.

    For each distinct value found in ``df[user_label_key]`` the topic is
    generated four times through ``compute_topic_using_digest``, with
    ``num_keywords`` set to 1, ``LEGACY_DOC_SELECTION``, 0 and 2 (in that
    order), and the results are collected into a single row.

    Args:
        df: Frame holding the documents; must contain ``user_label_key``.
            It is also handed to ``KeyDocumentFinder`` together with the
            string ``"title"`` (presumably the title column -- confirm
            against ``KeyDocumentFinder``).
        user_label_key: Name of the column holding the user-assigned label.
        topic_generator: Passed through unchanged to
            ``compute_topic_using_digest``.
        predicted_id_topics: Object exposing ``.get(key, default)``. It is
            queried with ``str(label)``; when there is no entry the label
            itself is used as the row's ``"label"`` value.

    Returns:
        A DataFrame with one row per processed label and the columns
        ``label``, ``three_titles`` (picked documents joined by newlines),
        ``keywords`` (joined by commas), ``ai_pred_topic``,
        ``ai_pred_topic_2kw``, ``ai_pred_topic_0kw`` and
        ``ai_pred_topic_legacy``. Float labels that are NaN are skipped.
    """
    user_topics = df[user_label_key].unique().tolist()
    key_doc_finder_user = KeyDocumentFinder(df, user_label_key, "title")
    key_doc_finder_user.compute_all()

    def _topic_for(label, num_keywords):
        # All four variants differ only in the number of keywords requested.
        return compute_topic_using_digest(
            topic_generator, key_doc_finder_user, df, label, num_keywords=num_keywords
        )

    topic_trainers = []
    for user_topic in user_topics:
        if isinstance(user_topic, float):
            # ``unique()`` keeps NaN, and ``int(nan)`` raises ValueError;
            # a missing label has no topic to train on, so skip it.
            if pd.isna(user_topic):
                continue
            # Integer ids that were upcast to float (e.g. 3.0) are restored so
            # that ``str(user_topic)`` yields "3" for the lookup below.
            # Non-integral values are left alone instead of being truncated.
            if user_topic.is_integer():
                user_topic = int(user_topic)

        header_title = predicted_id_topics.get(str(user_topic), user_topic)

        pred_topic, picked_documents, keywords = _topic_for(user_topic, 1)
        legacy_topic, _, _ = _topic_for(user_topic, LEGACY_DOC_SELECTION)
        zero_keyword_topic, _, _ = _topic_for(user_topic, 0)
        two_keyword_topic, _, _ = _topic_for(user_topic, 2)

        topic_trainers.append(
            {
                "label": header_title,
                "three_titles": "\n".join(picked_documents),
                "keywords": ",".join(keywords),
                "ai_pred_topic": pred_topic,
                "ai_pred_topic_2kw": two_keyword_topic,
                "ai_pred_topic_0kw": zero_keyword_topic,
                "ai_pred_topic_legacy": legacy_topic,
            }
        )
    return pd.DataFrame(topic_trainers)