in src/jobs/util/grouping_pipeline.py [0:0]
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder

# ItemSelector, EmbeddingScaler, ModelProvider, get_title_embedding_transformer,
# and EMBEDDING_TEXT_COLUMN are project-local helpers assumed to be imported
# from sibling modules in this package.


def generate_pipeline(config, model_provider: ModelProvider):
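    """Build the combined feature-extraction pipeline used for grouping.

    Four feature blocks (title embeddings, title TF-IDF, one-hot domain,
    one-hot browse-group history) are concatenated with a FeatureUnion,
    each weighted by its configured scale factor, then row-normalized.
    """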
history_scale = config["history_scale"]
domain_scale = config["domain_scale"]
title_embedding_scale = config["title_embedding_scale"]
tf_idf_scale = config["tf_idf_scale"]
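    # One-hot encode the item's domain; each block's EmbeddingScaler weights
    # its contribution relative to the other feature blocks.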
pipeline_domain = Pipeline(
[
("selector", ItemSelector(column=["domain"])),
("domain_features", OneHotEncoder(handle_unknown="ignore")),
            ("scaler", EmbeddingScaler(scale_factor=domain_scale)),
]
)
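    # One-hot encode the browse-group history signal.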
pipeline_history = Pipeline(
[
("selector", ItemSelector(column=["browse_group"])),
("domain_features", OneHotEncoder(handle_unknown="ignore")),
('scaler', EmbeddingScaler(scale_factor=history_scale))
]
)
# pipeline_domain_category = Pipeline(
# [
# ("selector", ItemSelector(column=["domain_category_info"])),
# ("domain_cat_features", MultiLabelBinarizerWrapper()),
# ('scaler', EmbeddingScaler(scale_factor=domain_category_scale))
# ]
# )
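    # Dense title embeddings produced by the configured embedding model.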
title_embedding_transformer = get_title_embedding_transformer(config["embedding_model"], model_provider=model_provider)
    pipeline_title_embeddings = Pipeline(
        [
            ("title_embedding_features", title_embedding_transformer),
            ("scaler", EmbeddingScaler(scale_factor=title_embedding_scale)),
        ]
    )
    # Optional stemming preprocessor for the TF-IDF step; kept here so it can
    # be enabled via TfidfVectorizer(preprocessor=stem_preprocess) below.
    stemmer = PorterStemmer()

    def stem_preprocess(raw_text):
        tokens = word_tokenize(raw_text)
        return " ".join(stemmer.stem(token) for token in tokens)
stop_words = list(text.ENGLISH_STOP_WORDS)
# stop_words.extend(CUSTOM_STOP_WORDS)
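    # Sparse TF-IDF features over the title text: unigrams and bigrams, with
    # English stop words plus a few product-specific terms filtered out.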
pipeline_tfidf = Pipeline(
[
("selector", ItemSelector(column=EMBEDDING_TEXT_COLUMN)),
(
"tfidf_title",
TfidfVectorizer(
                    # preprocessor=stem_preprocess,  # enable to stem tokens before vectorizing
ngram_range=(1, 2),
                    stop_words=stop_words + ["google", "search", "sheets", "docs"],
max_df=0.95,
min_df=3,
max_features=1000,
)
),
            ("scaler", EmbeddingScaler(scale_factor=tf_idf_scale)),
]
)
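    # Concatenate the four feature blocks side by side; the per-block scale
    # factors control their relative influence in the combined space.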
combined_features = FeatureUnion([
("pipeline_title_embeddings", pipeline_title_embeddings),
("pipeline_tfidf", pipeline_tfidf),
("pipeline_domain", pipeline_domain),
("pipeline_history", pipeline_history)
])
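    # L2-normalize each row of the combined feature matrix.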
final_pipeline = Pipeline(
[
("features", combined_features),
            ("normalizer", Normalizer()),
]
)
return final_pipeline
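
# Example usage (a minimal sketch: the config keys below are the ones read by
# generate_pipeline; the scale values, embedding model name, ModelProvider
# construction, and input DataFrame are placeholders, not a documented contract):
#
#     provider = ModelProvider(...)  # hypothetical construction
#     config = {
#         "history_scale": 1.0,
#         "domain_scale": 1.0,
#         "title_embedding_scale": 1.0,
#         "tf_idf_scale": 1.0,
#         "embedding_model": "<embedding-model-name>",
#     }
#     pipeline = generate_pipeline(config, provider)
#     features = pipeline.fit_transform(items_df)  # rows with domain, browse_group, and title text columns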