def extract_tags_batch()

in src/kg_builder.py [0:0]


def extract_tags_batch(df):
    """
    Extract keyword tags for every row of ``df`` using the spaCy pipeline.

    The ``title`` and ``description`` columns are joined with a space (missing
    values are treated as empty strings), stripped, and streamed through the
    ``nlp`` pipeline with the ``ner`` component disabled.
    Based on analysis, the useful part-of-speech tags are ADJ, PROPN and NOUN;
    stop words are skipped.

    Args:
        df: DataFrame providing ``title`` and ``description`` columns.
            NOTE(review): both columns are assumed to hold text; confirm
            against the caller.

    Returns:
        A list with one entry per row, in row order. Each entry is a list of
        tags: the lower-cased, whitespace-stripped token texts, unique within
        the row and ordered by first appearance in the combined text.
    """
    texts = (df['title'].fillna('') + " " + df['description'].fillna('')).str.strip()
    # Membership is tested once per token, so use a hashed container.
    useful_tags = frozenset(("ADJ", "PROPN", "NOUN"))

    tags_list = []
    for doc in nlp.pipe(texts, disable=["ner"]):
        # A dict de-duplicates like a set but keeps insertion order. Iterating
        # a set of strings depends on per-process hash randomization, which
        # made the tag order differ from run to run.
        tags = {}
        for token in doc:
            if token.pos_ in useful_tags and not token.is_stop:
                tag = token.text.strip().lower()
                if tag:  # never emit an empty tag
                    tags[tag] = None
        tags_list.append(list(tags))
    return tags_list