def create_topic_training_dataset()

in src/jobs/util/topic_utils.py [0:0]


def create_topic_training_dataset(df: pd.DataFrame, user_label_key: str, topic_generator,
                                  predicted_id_topics) -> pd.DataFrame:
    """Build a comparison table of generated topics for every user label in ``df``.

    For each distinct value in ``df[user_label_key]`` the topic is generated
    four times through ``compute_topic_using_digest``, varying only
    ``num_keywords`` (1, ``LEGACY_DOC_SELECTION``, 0 and 2), so the variants
    can be compared side by side.

    Args:
        df: Documents to process; must contain the ``user_label_key`` column.
            The ``"title"`` column name is passed to ``KeyDocumentFinder``.
        user_label_key: Name of the column holding the user-assigned label.
        topic_generator: Forwarded unchanged to ``compute_topic_using_digest``.
        predicted_id_topics: Mapping queried with
            ``.get(str(label), label)`` to obtain the row's ``"label"`` value;
            labels without an entry are used as-is.

    Returns:
        A DataFrame with one row per processed label and the columns
        ``label``, ``three_titles`` (picked documents joined by newlines),
        ``keywords`` (comma-joined), ``ai_pred_topic``, ``ai_pred_topic_2kw``,
        ``ai_pred_topic_0kw`` and ``ai_pred_topic_legacy``. If no label is
        processed the frame is empty and has no columns.
    """
    user_topics = df[user_label_key].unique().tolist()
    key_doc_finder_user = KeyDocumentFinder(df, user_label_key, "title")
    key_doc_finder_user.compute_all()

    def _digest(topic, num_keywords):
        # All variants share every argument except the keyword count.
        return compute_topic_using_digest(topic_generator, key_doc_finder_user, df,
                                          topic, num_keywords=num_keywords)

    topic_trainers = []
    for user_topic in user_topics:
        if isinstance(user_topic, float):
            # ``Series.unique()`` keeps NaN for rows without a label, and
            # ``int(nan)`` raises ValueError; such rows have no topic to
            # train on, so skip them instead of aborting the whole build.
            if pd.isna(user_topic):
                continue
            user_topic = int(user_topic)
        header_title = predicted_id_topics.get(str(user_topic), user_topic)

        # Only the single-keyword run supplies the documents and keywords
        # stored in the row; the other runs contribute just their topic.
        pred_topic, picked_documents, keywords = _digest(user_topic, 1)
        legacy_topic, _, _ = _digest(user_topic, LEGACY_DOC_SELECTION)
        zero_keyword_topic, _, _ = _digest(user_topic, 0)
        two_keyword_topic, _, _ = _digest(user_topic, 2)

        topic_trainers.append({
            "label": header_title,
            "three_titles": "\n".join(picked_documents),
            "keywords": ",".join(keywords),
            "ai_pred_topic": pred_topic,
            "ai_pred_topic_2kw": two_keyword_topic,
            "ai_pred_topic_0kw": zero_keyword_topic,
            "ai_pred_topic_legacy": legacy_topic,
        })

    return pd.DataFrame(topic_trainers)