flink-ml-python/pyflink/ml/feature/stopwordsremover.py (75 lines of code) (raw):
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import typing
from typing import Tuple
from pyflink.java_gateway import get_gateway
from pyflink.ml.param import Param, StringArrayParam, BooleanParam, StringParam
from pyflink.ml.wrapper import JavaWithParams
from pyflink.ml.feature.common import JavaFeatureTransformer
from pyflink.ml.common.param import HasInputCols, HasOutputCols
def _load_default_stop_words(language: str) -> Tuple[str, ...]:
return tuple(*[get_gateway().jvm.org.apache.flink.ml.feature.
stopwordsremover.StopWordsRemover.loadDefaultStopWords(language)])
def _get_default_or_us() -> str:
return get_gateway().jvm.org.apache.flink.ml.feature. \
stopwordsremover.StopWordsRemover.getDefaultOrUS()
def _get_available_locales() -> set:
return {*get_gateway().jvm.org.apache.flink.ml.feature.
stopwordsremover.StopWordsRemover.getAvailableLocales()}
class _StopWordsRemoverParams(
JavaWithParams,
HasInputCols,
HasOutputCols
):
"""
Params for :class:`StopWordsRemover`.
"""
STOP_WORDS: Param[Tuple[str, ...]] = StringArrayParam(
"stop_words",
"The words to be filtered out.",
_load_default_stop_words('english'))
CASE_SENSITIVE: Param[bool] = BooleanParam(
"case_sensitive",
"Whether to do a case-sensitive comparison over the stop words.",
False
)
LOCALE: Param[str] = StringParam(
"locale",
"Locale of the input for case insensitive matching. Ignored when caseSensitive is true.",
_get_default_or_us())
def __init__(self, java_params):
super(_StopWordsRemoverParams, self).__init__(java_params)
def set_stop_words(self, *value: str):
return typing.cast(_StopWordsRemoverParams, self.set(self.STOP_WORDS, value))
def set_case_sensitive(self, value: bool):
return typing.cast(_StopWordsRemoverParams, self.set(self.CASE_SENSITIVE, value))
def set_locale(self, value: str):
return typing.cast(_StopWordsRemoverParams, self.set(self.LOCALE, value))
def get_stop_words(self) -> Tuple[str, ...]:
return self.get(self.STOP_WORDS)
def get_case_sensitive(self) -> bool:
return self.get(self.CASE_SENSITIVE)
def get_locale(self) -> str:
return self.get(self.LOCALE)
@property
def stop_words(self):
return self.get_stop_words()
@property
def case_sensitive(self):
return self.get_case_sensitive()
@property
def locale(self):
return self.get_locale()
class StopWordsRemover(JavaFeatureTransformer, _StopWordsRemoverParams):
"""
A feature transformer that filters out stop words from input.
Note: null values from input array are preserved unless adding null to stopWords explicitly.
See Also: http://en.wikipedia.org/wiki/Stop_words
"""
def __init__(self, java_model=None):
super(StopWordsRemover, self).__init__(java_model)
@classmethod
def _java_transformer_package_name(cls) -> str:
return "stopwordsremover"
@classmethod
def _java_transformer_class_name(cls) -> str:
return "StopWordsRemover"
@classmethod
def load_default_stop_words(cls, language: str):
"""
Loads the default stop words for the given language.
Supported languages: danish, dutch, english, finnish, french, german, hungarian, italian,
norwegian, portuguese, russian, spanish, swedish, turkish
See Also: http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/
"""
return _load_default_stop_words(language)
@classmethod
def get_default_or_us(cls):
"""
Returns system default locale, or "en_US" if the default locale is not available. The locale
is returned as a String.
"""
return _get_default_or_us()
@classmethod
def get_available_locales(cls):
"""
Returns a set of all installed locales. It must contain at least a Locale
instance equal to "en_US". The locales are returned as Strings.
"""
return _get_available_locales()