def _fit_vectorizer()

in src/sagemaker_sklearn_extension/feature_extraction/text.py [0:0]
38 lines of code
8 McCabe index (conditional complexity)

    def _fit_vectorizer(self, col_idx, X):
        """Build a ``TfidfVectorizer`` and fit it on one column of ``X``.

        Parameters
        ----------
        col_idx : index of the column to fit on; also used to look up the
            column's entry in ``self.vocabulary_sizes`` when that is set.
        X : two-dimensional indexable; the column is selected with
            ``X[:, col_idx]``.

        Returns
        -------
        The fitted ``TfidfVectorizer``, or ``None`` when fitting raised one of
        the zero-vocabulary ``ValueError`` messages listed below and
        ``self.ignore_columns_with_zero_vocabulary_size`` is truthy.

        Raises
        ------
        ValueError
            Re-raised unchanged when the message is not one of the
            zero-vocabulary messages, or when such columns are not ignored.
        """
        # Enforce the per-column vocabulary size: when both limits are set the
        # smaller one wins; when only the per-column size is set it is used.
        feature_cap = self.max_features
        if self.vocabulary_sizes:
            column_cap = self.vocabulary_sizes[col_idx]
            feature_cap = min(feature_cap, column_cap) if feature_cap else column_cap

        # Hyper-parameters forwarded verbatim from this estimator.
        passthrough = (
            "strip_accents",
            "lowercase",
            "preprocessor",
            "tokenizer",
            "stop_words",
            "token_pattern",
            "ngram_range",
            "analyzer",
            "max_df",
            "min_df",
            "vocabulary",
            "dtype",
            "norm",
            "use_idf",
            "smooth_idf",
            "sublinear_tf",
        )
        params = {name: getattr(self, name) for name in passthrough}
        params["max_features"] = feature_cap

        try:
            fitted = TfidfVectorizer(**params)
            fitted.fit(X[:, col_idx])
        except ValueError as exc:
            # Exact messages treated as "this column produced no vocabulary".
            empty_vocab_messages = (
                "After pruning, no terms remain. Try a lower min_df or a higher max_df.",
                "max_df corresponds to < documents than min_df",
                "empty vocabulary; perhaps the documents only contain stop words",
            )
            if str(exc) not in empty_vocab_messages or not self.ignore_columns_with_zero_vocabulary_size:
                raise
            # The column has no usable vocabulary and the caller opted to skip it.
            return None
        return fitted