in core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/ValueIndexer.scala [111:197]
/** Constructs a ValueIndexerModel with a randomly generated UID. */
def this() = this(Identifiable.randomUID("ValueIndexerModel"))
/** Levels in categorical array.
*
* The raw category values to index against; may contain a null entry when a
* null level was observed (see `transform`, which filters it out and tracks
* it separately).
*
* @group param
*/
val levels = new UntypedArrayParam(this, "levels", "Levels in categorical array")
// Shared empty fallback returned when the levels param has not been set.
val emptyLevels: Array[Any] = Array()
/** Gets the categorical levels, or an empty array when the param is unset.
* @group getParam */
def getLevels: Array[Any] = if (isDefined(levels)) $(levels) else emptyLevels
/** Sets the categorical levels (widened to Array[Any] for the untyped param).
* @group setParam */
def setLevels(value: Array[_]): this.type = set(levels, value.asInstanceOf[Array[Any]])
/** The datatype of the levels as a JSON string.
*
* Defaults to the literal "string", which is treated throughout as shorthand
* for `DataTypes.StringType` rather than parsed as JSON.
*
* @group param
*/
val dataType = new Param[String](this, "dataType", "The datatype of the levels as a Json string")
setDefault(dataType -> "string")
/** Gets the level datatype as a JSON string; the "string" shorthand is
* expanded to StringType's JSON representation.
* @group getParam */
def getDataTypeStr: String = if ($(dataType) == "string") DataTypes.StringType.json else $(dataType)
/** Sets the level datatype from a JSON string.
* @group setParam */
def setDataTypeStr(value: String): this.type = set(dataType, value)
/** Gets the level datatype as a Spark DataType, honoring the "string" shorthand.
* @group getParam */
def getDataType: DataType = if ($(dataType) == "string") DataTypes.StringType else DataType.fromJson($(dataType))
/** Sets the level datatype; stored internally in its JSON form.
* @group setParam */
def setDataType(value: DataType): this.type = set(dataType, value.json)
// Defaults: read from column "input"; write to a UID-derived output column name.
setDefault(inputCol -> "input", outputCol -> (uid + "_output"))
/** Creates a copy of this model with the same UID and parameter values.
*
* @param extra additional parameters to overlay on the copy
* @return a new ValueIndexerModel carrying this model's params plus `extra`
*/
override def copy(extra: ParamMap): ValueIndexerModel = {
  val copied = new ValueIndexerModel(uid)
    .setLevels(getLevels)
    .setDataType(getDataType)
    .setInputCol(getInputCol)
    .setOutputCol(getOutputCol)
  // The previous implementation silently dropped `extra`, violating the
  // Params.copy contract; overlay it so caller-supplied overrides take effect.
  copyValues(copied, extra)
}
/** Transform the input column to categorical.
*
* Maps each value of the input column to its integer index among the
* configured levels. Nulls and NaNs map to `numLevels`; values not found
* among the levels map to a sentinel just past the known levels (see
* `unknownIndex` below).
*
* @param dataset input dataset; must contain column `getInputCol`
* @return the dataset with an added index column `getOutputCol` carrying
*         categorical metadata
*/
override def transform(dataset: Dataset[_]): DataFrame = {
logTransform[DataFrame]({
// Null is never stored in the CategoricalMap itself; its presence is
// recorded separately via hasNullLevel below.
val nonNullLevels = getLevels.filter(_ != null)
// Levels round-tripped through JSON/param serialization may arrive as
// BigInt/BigDecimal; normalize each to the JVM type matching getDataType.
val castLevels = nonNullLevels.map { l =>
(getDataType, l) match {
case (_: IntegerType, v: scala.math.BigInt) => v.toInt
case (_: IntegerType, v: scala.math.BigDecimal) => v.toInt
case (_: IntegerType, v) => v.asInstanceOf[Int]
case (_: LongType, v: scala.math.BigDecimal) => v.toLong
case (_: LongType, v) => v.asInstanceOf[Long]
case (_: DoubleType, v: scala.math.BigDecimal) => v.toDouble
case (_: DoubleType, v) => v.asInstanceOf[Double]
case (_: StringType, v: String) => v
case (_: StringType, v) => v.asInstanceOf[String]
case (_: BooleanType, v: Boolean) => v
case (_: BooleanType, v) => v.asInstanceOf[Boolean]
case _ => throw new UnsupportedOperationException(s"Unsupported type ${l.getClass} for type ${getDataType} ")
}
}
// A length difference means at least one null level was filtered out above.
val hasNullLevel = getLevels.length != nonNullLevels.length
val map = new CategoricalMap(castLevels, false, hasNullLevel)
// Index assigned to values absent from the levels: when a null level exists,
// numLevels is reserved for null/NaN, so unknowns go one slot further.
val unknownIndex =
if (!map.hasNullLevel) {
map.numLevels
} else {
map.numLevels + 1
}
// Closure over `map`/`unknownIndex`; captured state is serialized with the udf.
val getIndex = udf((level: Any) => {
// Treat nulls and NaNs specially
if (level == null || (level.isInstanceOf[Double] && level.asInstanceOf[Double].isNaN)) {
map.numLevels
} else {
map.getIndexOption(level).getOrElse(unknownIndex)
}
})
// Add the MML style and sparkML style metadata for categoricals
// (the two toMetadata calls layer both encodings onto the input's metadata).
val metadata = map.toMetadata(map.toMetadata(dataset.schema(getInputCol).metadata, true), false)
val inputColIndex = getIndex(dataset(getInputCol))
dataset.withColumn(getOutputCol, inputColIndex.as(getOutputCol, metadata))
})
}