def generate_topics()

in src/kg_builder.py [0:0]


def generate_topics(df, generate_topic):
    """
    Add a ``topics`` column to ``df``, one list of topic labels per entry.

    Topics are the distinct labels GLiNER zero-shot prediction returns for
    ``TOPIC_LABELS`` on the lower-cased ``title`` + ``description`` text.

    Usage: pass ``generate_topic=True`` on the first run to compute the
    topics and cache them in ``{DATA_PATH}/input_data_df.parquet``; pass
    ``generate_topic=False`` on subsequent runs to reuse that cache and skip
    the model entirely (speedup).

    Args:
        df: DataFrame of entries. Needs ``title`` and ``description`` when
            generating, and ``url_hash`` when reading from the cache.
        generate_topic: True to run the model and (re)write the cache,
            False to look topics up in the cache by ``url_hash``.

    Returns:
        The same ``df`` object, with the ``topics`` column set in place.
        On the cached path, rows whose ``url_hash`` is not in the cache get
        a missing value (NaN) in ``topics``.
    """
    cache_path = f"{DATA_PATH}/input_data_df.parquet"

    if generate_topic:
        # Only build the texts when the model is actually going to run.
        texts = (
            df['title'].fillna('').str.lower()
            + " "
            + df['description'].fillna('').str.lower()
        ).str.strip().tolist()

        topics = []
        for text in tqdm(texts):
            if not text:
                # Nothing to label in an empty string; skip the model call.
                topics.append([])
                continue
            entities = gliner_model.predict_entities(text, TOPIC_LABELS, threshold=0.3)
            # De-duplicate labels; sort so the cached output is reproducible
            # (plain set iteration order is arbitrary for strings).
            topics.append(sorted({entity["label"] for entity in entities}))

        df['topics'] = topics
        df.to_parquet(cache_path, index=False)
        return df

    def _as_list(value):
        # Parquet readers may hand list columns back as arrays (pyarrow
        # does); convert so both branches yield plain Python lists.
        # Missing values (NaN/None) and strings pass through untouched.
        if hasattr(value, 'tolist'):
            return value.tolist()
        if isinstance(value, (list, tuple)):
            return list(value)
        return value

    # Only the lookup key and the topics are needed from the cache.
    df_bkp = pd.read_parquet(cache_path, columns=['url_hash', 'topics'])
    topics_lkp = df_bkp.set_index('url_hash')['topics'].to_dict()
    df['topics'] = df['url_hash'].map(topics_lkp).map(_as_list)
    return df