def generate_pipeline()

in src/jobs/util/grouping_pipeline.py


from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder

# ItemSelector, EmbeddingScaler, MultiLabelBinarizerWrapper,
# get_title_embedding_transformer, ModelProvider, and
# EMBEDDING_TEXT_COLUMN are project-local helpers defined elsewhere.


def generate_pipeline(config, model_provider: ModelProvider):
    """Build the grouping feature pipeline: scaled title embeddings,
    title TF-IDF, one-hot domain, and one-hot browse-group history,
    concatenated and L2-normalized."""
    history_scale = config["history_scale"]
    domain_scale = config["domain_scale"]
    title_embedding_scale = config["title_embedding_scale"]
    tf_idf_scale = config["tf_idf_scale"]

    # One-hot encode the page's domain, weighted by domain_scale.
    pipeline_domain = Pipeline(
        [
            ("selector", ItemSelector(column=["domain"])),
            ("domain_features", OneHotEncoder(handle_unknown="ignore")),
            ("scaler", EmbeddingScaler(scale_factor=domain_scale)),
        ]
    )

    # One-hot encode the browse-group history signal, weighted by history_scale.
    pipeline_history = Pipeline(
        [
            ("selector", ItemSelector(column=["browse_group"])),
            ("history_features", OneHotEncoder(handle_unknown="ignore")),
            ("scaler", EmbeddingScaler(scale_factor=history_scale)),
        ]
    )

    # pipeline_domain_category = Pipeline(
    #     [
    #         ("selector", ItemSelector(column=["domain_category_info"])),
    #         ("domain_cat_features", MultiLabelBinarizerWrapper()),
    #         ("scaler", EmbeddingScaler(scale_factor=domain_category_scale)),
    #     ]
    # )
    # Dense title embeddings from the configured embedding model, weighted.
    title_embedding_transformer = get_title_embedding_transformer(
        config["embedding_model"], model_provider=model_provider
    )
    pipeline_title_embeddings = Pipeline(
        [
            ("title_embedding_features", title_embedding_transformer),
            ("scaler", EmbeddingScaler(scale_factor=title_embedding_scale)),
        ]
    )

    # Stemming preprocessor for the TF-IDF step; currently unused because
    # the preprocessor argument below is commented out.
    stemmer = PorterStemmer()

    def stem_preprocess(raw_text):
        tokens = word_tokenize(raw_text)
        return " ".join(stemmer.stem(token) for token in tokens)

    stop_words = list(text.ENGLISH_STOP_WORDS)
    # stop_words.extend(CUSTOM_STOP_WORDS)
    # Sparse TF-IDF features over the title text (frequent unigrams and
    # bigrams only), weighted by tf_idf_scale.
    pipeline_tfidf = Pipeline(
        [
            ("selector", ItemSelector(column=EMBEDDING_TEXT_COLUMN)),
            (
                "tfidf_title",
                TfidfVectorizer(
                    # preprocessor=stem_preprocess,  # stemming disabled
                    ngram_range=(1, 2),
                    stop_words=stop_words + ["google", "search", "sheets", "docs"],
                    max_df=0.95,
                    min_df=3,
                    max_features=1000,
                ),
            ),
            ("scaler", EmbeddingScaler(scale_factor=tf_idf_scale)),
        ]
    )
    # Concatenate the weighted feature blocks, then L2-normalize each row
    # so the scale factors act as relative block weights.
    combined_features = FeatureUnion(
        [
            ("pipeline_title_embeddings", pipeline_title_embeddings),
            ("pipeline_tfidf", pipeline_tfidf),
            ("pipeline_domain", pipeline_domain),
            ("pipeline_history", pipeline_history),
        ]
    )
    final_pipeline = Pipeline(
        [
            ("features", combined_features),
            ("normalizer", Normalizer()),
        ]
    )
    return final_pipeline
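
EmbeddingScaler is a project-local helper whose definition is not shown here. As a minimal sketch, assuming it simply multiplies its input by a constant weight (so that the final Normalizer step turns those weights into relative block weights), such a transformer could look like this:

from sklearn.base import BaseEstimator, TransformerMixin

class EmbeddingScaler(BaseEstimator, TransformerMixin):
    """Hypothetical sketch: multiply a feature block by a fixed weight.

    After FeatureUnion concatenation and L2 normalization, these weights
    control each block's relative influence on the combined vector.
    """

    def __init__(self, scale_factor=1.0):
        self.scale_factor = scale_factor

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        # Scalar multiplication works for both dense arrays and
        # scipy sparse matrices (as produced by TfidfVectorizer).
        return X * self.scale_factor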
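
For context, a hypothetical usage sketch (not from the source): the config keys mirror the ones read above, while the embedding model name, the model_provider instance, and the pages_df DataFrame are illustrative assumptions about the surrounding project.

# Hypothetical usage sketch: config keys mirror those read above; the
# embedding model name, model_provider, and pages_df are assumptions.
config = {
    "history_scale": 1.0,
    "domain_scale": 0.5,
    "title_embedding_scale": 2.0,
    "tf_idf_scale": 1.0,
    "embedding_model": "all-MiniLM-L6-v2",  # assumed model identifier
}
pipeline = generate_pipeline(config, model_provider=model_provider)
# pages_df must contain the "domain", "browse_group", and title-text
# columns that the ItemSelector steps pull out.
feature_matrix = pipeline.fit_transform(pages_df)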