in src/kg_builder.py [0:0]
def extract_tags_batch(df):
    """Extract keyword tags for every row of ``df`` in one batched pass.

    The ``title`` and ``description`` columns are joined with a single space
    (missing values are treated as empty strings), surrounding whitespace is
    stripped, and the resulting texts are streamed through the module-level
    ``nlp`` pipeline with the ``ner`` component disabled.

    A token becomes a tag when its coarse part-of-speech is ``ADJ``,
    ``PROPN`` or ``NOUN`` and it is not flagged as a stop word. Tags are the
    stripped, lower-cased token text, de-duplicated per document.

    Args:
        df: DataFrame providing ``title`` and ``description`` columns.
            NOTE(review): both columns are assumed to hold strings (or
            missing values) -- confirm against callers.

    Returns:
        A list with one entry per input row, in row order. Each entry is an
        alphabetically sorted list of that row's unique tags; sorting keeps
        the output reproducible across runs, which plain set iteration order
        does not guarantee.
    """
    # Chosen from prior analysis as the POS classes that yield useful tags.
    useful_tags = frozenset(("ADJ", "PROPN", "NOUN"))

    texts = (df['title'].fillna('') + " " + df['description'].fillna('')).str.strip()
    docs = nlp.pipe(texts, disable=["ner"])

    tags_list = []
    for doc in docs:
        unique_tags = {
            token.text.strip().lower()
            for token in doc
            if token.pos_ in useful_tags and not token.is_stop
        }
        # Sort so the result does not depend on per-process hash ordering.
        tags_list.append(sorted(unique_tags))
    return tags_list