in modules/python/src/datapreprocessing/datacleaner.py [0:0]
def prep_product_desc(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the product description by performing NLP preprocessing using spaCy.

    Each non-missing description is lower-cased and run through the
    ``en_core_web_sm`` pipeline, then replaced by a space-separated string of
    its unique lemmas (first-occurrence order), keeping only alphabetic,
    non-stop-word tokens.

    Args:
        df (pd.DataFrame): The input DataFrame containing the 'description' column.

    Returns:
        pd.DataFrame: The same DataFrame object, with its 'description' column
        overwritten in place. Missing descriptions, and descriptions that
        could not be processed, become ``None``.
    """
    model_name = "en_core_web_sm"
    try:
        model = spacy.load(model_name)
    except OSError:
        # spaCy raises OSError when the model package cannot be found.
        # Download only in that case instead of on every call.
        spacy.cli.download(model_name)
        model = spacy.load(model_name)

    def parse_nlp_description(description: object) -> "str | None":
        """Return the de-duplicated lemma string for one description, or None."""
        if pd.isna(description):
            return None
        try:
            doc = model(description.lower())
            # The set gives O(1) duplicate checks; the list keeps the
            # first-occurrence order used for the joined output.
            seen = set()
            lemmas = []
            for token in doc:
                lemma = token.lemma_
                if lemma in seen or token.is_stop or not token.is_alpha:
                    continue
                seen.add(lemma)
                lemmas.append(lemma)
            return " ".join(lemmas)
        except Exception as exc:
            # Best-effort per row: log the real cause and leave the value
            # empty rather than aborting the whole column.
            self.logger.error(f"Unable to preprocess product description: {exc}")
            return None

    df["description"] = df["description"].apply(parse_nlp_description)
    return df