def this()

in core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/ValueIndexer.scala [111:197]


  /** Zero-argument constructor required for ML persistence; assigns a random UID. */
  def this() = this(Identifiable.randomUID("ValueIndexerModel"))

  /** Levels in categorical array
    *
    * @group param
    */
  val levels = new UntypedArrayParam(this, "levels", "Levels in categorical array")
  // Fallback returned by getLevels when the levels param has not been set.
  val emptyLevels: Array[Any] = Array()

  /** @group getParam */
  def getLevels: Array[Any] = if (isDefined(levels)) $(levels) else emptyLevels

  /** @group setParam */
  // Accepts an array of any element type; widened to Array[Any] to match the untyped param.
  def setLevels(value: Array[_]): this.type = set(levels, value.asInstanceOf[Array[Any]])

  /** The datatype of the levels as a JSON string
    *
    * @group param
    */
  val dataType = new Param[String](this, "dataType", "The datatype of the levels as a Json string")
  setDefault(dataType -> "string")

  /** @group getParam */
  // "string" is a stored shorthand default; expand it to the full JSON form of StringType.
  def getDataTypeStr: String = if ($(dataType) == "string") DataTypes.StringType.json else $(dataType)

  /** @group setParam */
  def setDataTypeStr(value: String): this.type = set(dataType, value)

  /** @group getParam */
  // Parses the stored JSON into a Spark DataType; the "string" shorthand maps to StringType.
  def getDataType: DataType = if ($(dataType) == "string") DataTypes.StringType else DataType.fromJson($(dataType))

  /** @group setParam */
  // Stores the DataType in its JSON representation so the param stays serializable.
  def setDataType(value: DataType): this.type = set(dataType, value.json)

  // Default column names: read from "input", write to "<uid>_output".
  setDefault(inputCol -> "input", outputCol -> (uid + "_output"))

  /** Creates a copy of this model with the same UID, with extra params applied.
    *
    * Fix: the previous implementation ignored the `extra` ParamMap entirely
    * (violating the `Params.copy` contract) and re-copied only four params by
    * hand. `copyValues` transfers every explicitly-set param from this
    * instance to the new one and then overlays the values in `extra`, which
    * is the standard Spark ML model-copy pattern.
    *
    * @param extra additional param values to overlay on the copy
    * @return a new ValueIndexerModel with identical UID and merged params
    */
  override def copy(extra: ParamMap): ValueIndexerModel =
    copyValues(new ValueIndexerModel(uid), extra)

  /** Transform the input column to categorical.
    *
    * Builds a CategoricalMap from the stored levels and replaces the input
    * column with integer category indices in the output column, attaching
    * categorical metadata. Null/NaN inputs and unknown values receive
    * reserved indices past the known levels (see notes below).
    */
  override def transform(dataset: Dataset[_]): DataFrame = {
    logTransform[DataFrame]({
      // A null entry in the levels array indicates the fit data contained nulls;
      // nulls are excluded from the map itself and tracked via hasNullLevel.
      val nonNullLevels = getLevels.filter(_ != null)

      // Levels may arrive as boxed wide types (BigInt/BigDecimal), e.g. after JSON
      // deserialization; coerce each to the JVM type matching the declared DataType.
      // The asInstanceOf fallbacks assume the level is already the right boxed type.
      val castLevels = nonNullLevels.map { l =>
        (getDataType, l) match {
          case (_: IntegerType, v: scala.math.BigInt) => v.toInt
          case (_: IntegerType, v: scala.math.BigDecimal) => v.toInt
          case (_: IntegerType, v) => v.asInstanceOf[Int]
          case (_: LongType, v: scala.math.BigDecimal) => v.toLong
          case (_: LongType, v) => v.asInstanceOf[Long]
          case (_: DoubleType, v: scala.math.BigDecimal) => v.toDouble
          case (_: DoubleType, v) => v.asInstanceOf[Double]
          case (_: StringType, v: String) => v
          case (_: StringType, v) => v.asInstanceOf[String]
          case (_: BooleanType, v: Boolean) => v
          case (_: BooleanType, v) => v.asInstanceOf[Boolean]
          case _ => throw new UnsupportedOperationException(s"Unsupported type ${l.getClass} for type ${getDataType} ")
        }
      }
      // Any length difference means at least one null level was filtered out above.
      val hasNullLevel = getLevels.length != nonNullLevels.length
      val map = new CategoricalMap(castLevels, false, hasNullLevel)
      // Index assigned to non-null values absent from the map. When a null level
      // exists, index numLevels is reserved for null/NaN, so unknowns shift up by
      // one. NOTE(review): when there is NO null level, unknowns and null/NaN both
      // map to numLevels — confirm this collision is intended.
      val unknownIndex =
        if (!map.hasNullLevel) {
          map.numLevels
        } else {
          map.numLevels + 1
        }
      val getIndex = udf((level: Any) => {
        // Treat nulls and NaNs specially: both always map to index numLevels,
        // regardless of whether a null level was seen at fit time.
        if (level == null || (level.isInstanceOf[Double] && level.asInstanceOf[Double].isNaN)) {
          map.numLevels
        } else {
          map.getIndexOption(level).getOrElse(unknownIndex)
        }
      })
      // Add the MML style and sparkML style metadata for categoricals
      // (two nested toMetadata calls: one per metadata flavor).
      val metadata = map.toMetadata(map.toMetadata(dataset.schema(getInputCol).metadata, true), false)
      val inputColIndex = getIndex(dataset(getInputCol))
      dataset.withColumn(getOutputCol, inputColIndex.as(getOutputCol, metadata))
    })

  }