in core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/text/TextFeaturizer.scala [31:199]
final def getUseTokenizer: Boolean = getOrDefault(useTokenizer)
/** Selects how the tokenizer regex is applied: `true` splits the text on gaps,
  * `false` treats each regex match as a token.
  *
  * @group param
  */
val tokenizerGaps: BooleanParam = new BooleanParam(this, "tokenizerGaps",
  "Indicates whether regex splits on gaps (true) or matches tokens (false).")

/** Value of the `tokenizerGaps` param.
  *
  * @group getParam
  */
final def getTokenizerGaps: Boolean = getOrDefault(tokenizerGaps)
/** Minimum token length; must be 0 or greater.
  *
  * The lower bound is enforced by the param's validator, so a negative value is
  * rejected when it is set.
  *
  * @group param
  */
val minTokenLength = new IntParam(
  this,
  "minTokenLength",
  "Minimum token length, >= 0.",
  // Fully qualified so the validator does not depend on the file's import list.
  org.apache.spark.ml.param.ParamValidators.gtEq(0))

/** Value of the `minTokenLength` param.
  *
  * @group getParam
  */
final def getMinTokenLength: Int = $(minTokenLength)
/** Regex used by the tokenizer: it matches delimiters when gaps is true and
  * matches the tokens themselves when gaps is false.
  *
  * @group param
  */
val tokenizerPattern: Param[String] = new Param[String](this, "tokenizerPattern",
  "Regex pattern used to match delimiters if gaps is true or tokens if gaps is false.")

/** Value of the `tokenizerPattern` param.
  *
  * @group getParam
  */
final def getTokenizerPattern: String = getOrDefault(tokenizerPattern)
/** Whether all characters are converted to lowercase before tokenizing.
  *
  * @group param
  */
val toLowercase: BooleanParam = new BooleanParam(this, "toLowercase",
  "Indicates whether to convert all characters to lowercase before tokenizing.")

/** Value of the `toLowercase` param.
  *
  * @group getParam
  */
final def getToLowercase: Boolean = getOrDefault(toLowercase)
/** Whether stop words are removed from the tokenized data.
  *
  * @group param
  */
val useStopWordsRemover: BooleanParam = new BooleanParam(
  this,
  "useStopWordsRemover",
  "Whether to remove stop words from tokenized data"
)

/** Value of the `useStopWordsRemover` param.
  *
  * @group getParam
  */
final def getUseStopWordsRemover: Boolean = getOrDefault(useStopWordsRemover)
/** Whether stop words are compared case-sensitively.
  *
  * @group param
  */
val caseSensitiveStopWords: BooleanParam = new BooleanParam(this, "caseSensitiveStopWords",
  " Whether to do a case sensitive comparison over the stop words")

/** Value of the `caseSensitiveStopWords` param.
  *
  * @group getParam
  */
final def getCaseSensitiveStopWords: Boolean = getOrDefault(caseSensitiveStopWords)
/** Language used for stop word removal. Set this to custom in order to use the
  * words supplied through the stopWords input instead.
  *
  * @group param
  */
val defaultStopWordLanguage: Param[String] = new Param[String](
  this,
  "defaultStopWordLanguage",
  "Which language to use for the stop word remover," +
    " set this to custom to use the stopWords input"
)

/** Value of the `defaultStopWordLanguage` param.
  *
  * @group getParam
  */
final def getDefaultStopWordLanguage: String = getOrDefault(defaultStopWordLanguage)
/** Words to filter out, given as one string holding a comma separated list,
  * for example "a, the, and".
  *
  * @group param
  */
val stopWords: Param[String] =
  new Param[String](this, "stopWords", "The words to be filtered out.")

/** Value of the `stopWords` param.
  *
  * @group getParam
  */
final def getStopWords: String = getOrDefault(stopWords)
/** Whether N grams are enumerated.
  *
  * @group param
  */
val useNGram: BooleanParam =
  new BooleanParam(this, "useNGram", "Whether to enumerate N grams")

/** Value of the `useNGram` param.
  *
  * @group getParam
  */
final def getUseNGram: Boolean = getOrDefault(useNGram)
/** Size of the N grams.
  *
  * @group param
  */
val nGramLength: IntParam =
  new IntParam(this, "nGramLength", "The size of the Ngrams")

/** Value of the `nGramLength` param.
  *
  * @group getParam
  */
final def getNGramLength: Int = getOrDefault(nGramLength)
/** When set to true, every nonzero word count is set to 1.
  *
  * @group param
  */
val binary = new BooleanParam(
  this,
  "binary",
  "If true, all nonzero word counts are set to 1")

/** Value of the `binary` param.
  *
  * @group getParam
  */
final def getBinary: Boolean = $(binary)
/** Number of features each document is hashed to.
  *
  * @group param
  */
val numFeatures: IntParam = new IntParam(this, "numFeatures",
  "Set the number of features to hash each document to")

/** Value of the `numFeatures` param.
  *
  * @group getParam
  */
final def getNumFeatures: Int = getOrDefault(numFeatures)
/** Whether the term frequencies are scaled by IDF.
  *
  * @group param
  */
val useIDF: BooleanParam =
  new BooleanParam(this, "useIDF", "Whether to scale the Term Frequencies by IDF")

/** Value of the `useIDF` param.
  *
  * @group getParam
  */
final def getUseIDF: Boolean = getOrDefault(useIDF)
/** Smallest number of documents a term should appear in.
  *
  * @group param
  */
val minDocFreq: IntParam = new IntParam(this, "minDocFreq",
  "The minimum number of documents in which a term should appear.")

/** Value of the `minDocFreq` param.
  *
  * @group getParam
  */
final def getMinDocFreq: Int = getOrDefault(minDocFreq)
}
/** Companion object of [[TextFeaturizer]].
  *
  * Mixes in Spark's `DefaultParamsReadable`, which provides the default `read`
  * (and therefore `load`) implementation for instances saved with their params.
  */
object TextFeaturizer extends DefaultParamsReadable[TextFeaturizer]
/** Featurize text.
*
* @param uid The id of the module
*/
class TextFeaturizer(override val uid: String)
extends Estimator[PipelineModel]
with TextFeaturizerParams with HasInputCol with HasOutputCol with BasicLogging {