in src/sagemaker_sklearn_extension/feature_extraction/text.py [0:0]
def _fit_vectorizer(self, col_idx, X):
    """Fit a ``TfidfVectorizer`` on one column of ``X``.

    The vectorizer is configured from this object's attributes, except that
    ``max_features`` may be tightened per column: when ``vocabulary_sizes``
    is set, the entry at ``col_idx`` is used as the cap, combined with
    ``self.max_features`` via ``min`` when that is set as well.

    Parameters
    ----------
    col_idx : int
        Position of the column to fit; used to index both ``X[:, col_idx]``
        and ``self.vocabulary_sizes``.
    X : array-like supporting ``X[:, col_idx]`` indexing
        Input data holding the text column.

    Returns
    -------
    TfidfVectorizer or None
        The fitted vectorizer, or ``None`` when fitting failed with one of
        the recognised zero-vocabulary messages and
        ``self.ignore_columns_with_zero_vocabulary_size`` is truthy.

    Raises
    ------
    ValueError
        Re-raised unchanged when the error message is not one of the
        recognised zero-vocabulary messages, or when such columns are not
        being ignored.
    """
    # Start from the global cap and tighten it with the per-column
    # vocabulary size, if one was supplied.
    feature_cap = self.max_features
    if self.vocabulary_sizes:
        column_cap = self.vocabulary_sizes[col_idx]
        feature_cap = min(feature_cap, column_cap) if feature_cap else column_cap

    try:
        tfidf_params = {
            "strip_accents": self.strip_accents,
            "lowercase": self.lowercase,
            "preprocessor": self.preprocessor,
            "tokenizer": self.tokenizer,
            "stop_words": self.stop_words,
            "token_pattern": self.token_pattern,
            "ngram_range": self.ngram_range,
            "analyzer": self.analyzer,
            "max_df": self.max_df,
            "min_df": self.min_df,
            "max_features": feature_cap,
            "vocabulary": self.vocabulary,
            "dtype": self.dtype,
            "norm": self.norm,
            "use_idf": self.use_idf,
            "smooth_idf": self.smooth_idf,
            "sublinear_tf": self.sublinear_tf,
        }
        fitted = TfidfVectorizer(**tfidf_params)
        fitted.fit(X[:, col_idx])
    except ValueError as err:
        # Only these exact messages are treated as "column produced no
        # vocabulary"; any other ValueError always propagates.
        tolerated_messages = (
            "After pruning, no terms remain. Try a lower min_df or a higher max_df.",
            "max_df corresponds to < documents than min_df",
            "empty vocabulary; perhaps the documents only contain stop words",
        )
        if str(err) not in tolerated_messages or not self.ignore_columns_with_zero_vocabulary_size:
            raise
        return None
    return fitted