in src/kg_builder.py [0:0]
def generate_topics(df, generate_topic):
    """
    Add a ``topics`` column to ``df`` and return the same DataFrame.

    Topics are the distinct GLiNER labels (from ``TOPIC_LABELS``, threshold
    0.3) predicted on the lowercased ``title + " " + description`` text of
    each row. Missing titles/descriptions are treated as empty strings.

    Args:
        df: DataFrame to annotate. It is modified in place. Needs the
            ``title`` and ``description`` columns when ``generate_topic`` is
            true, and the ``url_hash`` column when it is false.
        generate_topic: When true, run the model on every row and write the
            whole annotated DataFrame to
            ``{DATA_PATH}/input_data_df.parquet``. When false, skip the model
            and copy the topics from that parquet file, matching rows on
            ``url_hash``. Use true on the first run and false afterwards to
            avoid recomputing the predictions.

    Returns:
        ``df`` with the ``topics`` column set.

    Note:
        In cache mode, rows whose ``url_hash`` is not present in the parquet
        file end up with a missing value (NaN) in ``topics``.
    """
    # Single source of truth for the cache location used by both branches.
    cache_path = f"{DATA_PATH}/input_data_df.parquet"

    if not generate_topic:
        # Cache mode: no text preparation or model call is needed at all.
        cached_df = pd.read_parquet(cache_path)
        topics_by_hash = cached_df.set_index('url_hash')['topics'].to_dict()
        df['topics'] = df['url_hash'].map(topics_by_hash)
        return df

    # Only the generation path needs the combined text, so build it here.
    titles = df['title'].fillna('').str.lower()
    descriptions = df['description'].fillna('').str.lower()
    texts = (titles + " " + descriptions).str.strip().values.tolist()

    # The set removes repeated labels; sorting gives a stable order, since
    # set iteration order is arbitrary and would otherwise leak into the
    # persisted file.
    df['topics'] = [
        sorted({
            entity["label"]
            for entity in gliner_model.predict_entities(
                text, TOPIC_LABELS, threshold=0.3
            )
        })
        for text in tqdm(texts)
    ]
    df.to_parquet(cache_path, index=False)
    return df