in vw/src/main/scala/com/microsoft/azure/synapse/ml/vw/VowpalWabbitFeaturizer.scala [29:88]
/** Creates a featurizer whose uid is generated with the "VowpalWabbitFeaturizer" prefix. */
def this() = {
  this(Identifiable.randomUID("VowpalWabbitFeaturizer"))
}

// Defaults: no input columns selected, output written to the "features" column.
setDefault(
  inputCols -> Array(),
  outputCol -> "features")
/** Hash seed parameter; defaults to 0. */
val seed = new IntParam(
  this,
  "seed",
  "Hash seed")
setDefault(seed, 0)

/** Returns the hash seed, falling back to the default when it was never set. */
def getSeed: Int = getOrDefault(seed)

/** Stores the hash seed and returns this instance so calls can be chained. */
def setSeed(value: Int): this.type = {
  set(seed, value)
}
/** Input columns that should be split at word boundaries; empty by default. */
val stringSplitInputCols = new StringArrayParam(
  this,
  "stringSplitInputCols",
  "Input cols that should be split at word boundaries")
setDefault(stringSplitInputCols, Array.empty[String])

/** Returns the columns to split at word boundaries (the default is an empty array). */
def getStringSplitInputCols: Array[String] = getOrDefault(stringSplitInputCols)

/** Stores the columns to split at word boundaries and returns this instance for chaining. */
def setStringSplitInputCols(value: Array[String]): this.type = {
  set(stringSplitInputCols, value)
}
/**
  * Number of bits used to preserve the feature order.
  *
  * The validator accepts values from 0 to 28 inclusive; the default is 0.
  * As the parameter description states, using these bits reduces the hash size,
  * and the value must be large enough to count the maximum number of words.
  */
val preserveOrderNumBits = new IntParam(this, "preserveOrderNumBits",
  "Number of bits used to preserve the feature order. This will reduce the hash size. " +
    "Needs to be large enough to count the maximum number of words",
  (value: Int) => value >= 0 && value < 29)
setDefault(preserveOrderNumBits -> 0)

/** Returns the number of order-preserving bits (0 unless explicitly set). */
def getPreserveOrderNumBits: Int = $(preserveOrderNumBits)

/** Sets the number of order-preserving bits and returns this instance for chaining. */
def setPreserveOrderNumBits(value: Int): this.type = set(preserveOrderNumBits, value)
/** Whether string features are prefixed with their column name; enabled by default. */
val prefixStringsWithColumnName = new BooleanParam(
  this,
  "prefixStringsWithColumnName",
  "Prefix string features with column name")
setDefault(prefixStringsWithColumnName, true)

/** Returns whether string features are prefixed with the column name. */
def getPrefixStringsWithColumnName: Boolean = getOrDefault(prefixStringsWithColumnName)

/** Enables or disables column-name prefixing and returns this instance for chaining. */
def setPrefixStringsWithColumnName(value: Boolean): this.type = {
  set(prefixStringsWithColumnName, value)
}
/** Regular input columns followed by the columns that are split at word boundaries. */
private def getAllInputCols = {
  val regularCols = getInputCols
  val splitCols = getStringSplitInputCols
  regularCols ++ splitCols
}
/**
  * Selects the featurizer matching the Spark data type of a column.
  *
  * @param name          column name; used as the feature prefix when
  *                      prefixStringsWithColumnName is enabled and passed as-is to the
  *                      string, generic array and struct featurizers
  * @param dataType      Spark data type that decides which featurizer is built
  * @param nullable      forwarded to the numeric, array and struct featurizers
  * @param idx           forwarded to every featurizer (presumably the column position
  *                      in the input row; confirm against the featurizer implementations)
  * @param namespaceHash forwarded to the boolean, string, numeric, string-array and map
  *                      featurizers; the generic array, struct and fallback featurizers
  *                      are called without it
  * @return the featurizer built for the column
  */
private def getFeaturizer(name: String,
                          dataType: DataType,
                          nullable: Boolean,
                          idx: Int,
                          namespaceHash: Int): Featurizer = {
  // An empty prefix is used when column-name prefixing is switched off.
  val featurePrefix = if (!getPrefixStringsWithColumnName) "" else name

  dataType match {
    // Non-numeric scalar columns
    case StringType => getStringFeaturizer(name, featurePrefix, idx, namespaceHash)
    case BooleanType => new BooleanFeaturizer(idx, featurePrefix, namespaceHash, getMask)

    // Numeric columns all go through the same generic helper, one type argument each
    case ByteType => getNumericFeaturizer[Byte](featurePrefix, nullable, idx, namespaceHash, 0)
    case ShortType => getNumericFeaturizer[Short](featurePrefix, nullable, idx, namespaceHash, 0)
    case IntegerType => getNumericFeaturizer[Int](featurePrefix, nullable, idx, namespaceHash, 0)
    case LongType => getNumericFeaturizer[Long](featurePrefix, nullable, idx, namespaceHash, 0)
    case FloatType => getNumericFeaturizer[Float](featurePrefix, nullable, idx, namespaceHash, 0)
    case DoubleType => getNumericFeaturizer[Double](featurePrefix, nullable, idx, namespaceHash, 0)

    // Arrays of strings never use a prefix and use the column name namespace hash.
    // This case has to stay ahead of the generic array case, which would also match.
    case ArrayType(elementType: StringType, _) =>
      getArrayFeaturizer("", ArrayType(elementType), nullable, idx, namespaceHash)
    case arrayType: ArrayType => getArrayFeaturizer(name, arrayType, nullable, idx)

    // Remaining nested types
    case structType: StructType => getStructFeaturizer(structType, name, nullable, idx)
    case mapType: MapType => getMapFeaturizer(featurePrefix, mapType, idx, namespaceHash)

    // Any other data type is delegated to the fallback featurizer
    case other: Any => getOtherFeaturizer(other, featurePrefix, idx)
  }
}