in spark/src/main/scala/org/apache/comet/testing/ParquetGenerator.scala [122:235]
/**
 * Generates a column of `numRows` random values for the given Spark SQL `dataType`.
 *
 * Nulls are injected when `options.allowNull` is set (roughly 1-in-10 for
 * strings/arrays, 1-in-50 for integral types, 1-in-20 for floating point).
 * Integral and boolean columns are derived from the Long generator so that
 * boundary values (Byte/Short/Int/Long min/max, zero) appear frequently.
 *
 * Fix vs. previous version: the Byte/Short/Int/Boolean cases used
 * `_.asInstanceOf[Long]` directly on the Long column, and in Scala
 * `null.asInstanceOf[Long]` silently unboxes to `0L` — so nulls produced by the
 * Long generator were converted to `0`/`true` and those column types never
 * contained nulls even with `allowNull = true`. Nulls are now propagated.
 *
 * @param r        source of randomness
 * @param dataType Spark SQL type to generate values for
 * @param numRows  number of values to produce
 * @param options  generation knobs (null injection, negative zero, base date)
 * @return a sequence of `numRows` values (may contain nulls)
 * @throws IllegalStateException if `dataType` is not supported
 */
def generateColumn(
    r: Random,
    dataType: DataType,
    numRows: Int,
    options: DataGenOptions): Seq[Any] = {
  dataType match {
    case ArrayType(elementType, _) =>
      // Pre-generate an element pool, then draw arrays of length 0-4 from it.
      val values = generateColumn(r, elementType, numRows, options)
      val list = ListBuffer[Any]()
      for (i <- 0 until numRows) {
        if (i % 10 == 0 && options.allowNull) {
          list += null
        } else {
          list += Range(0, r.nextInt(5)).map(j => values((i + j) % values.length)).toArray
        }
      }
      list.toSeq
    case StructType(fields) =>
      // Generate each field as a full column, then transpose into Rows.
      val values = fields.map(f => generateColumn(r, f.dataType, numRows, options))
      Range(0, numRows).map(i => Row(values.indices.map(j => values(j)(i)): _*))
    case MapType(keyType, valueType, _) =>
      // Map keys must not be null, so null injection is disabled for both
      // keys and values (values could legally be null; kept as-is to match
      // existing behavior — NOTE(review): consider allowing null values).
      val mapOptions = options.copy(allowNull = false)
      val k = generateColumn(r, keyType, numRows, mapOptions)
      val v = generateColumn(r, valueType, numRows, mapOptions)
      k.zip(v).map(x => Map(x._1 -> x._2))
    case DataTypes.BooleanType =>
      // Parity of the Long value; truncation to Short (previous code) does not
      // change bit 0, so `% 2 == 0` on the Long is equivalent.
      generateColumn(r, DataTypes.LongType, numRows, options).map {
        case null => null
        case v => v.asInstanceOf[Long] % 2 == 0
      }
    case DataTypes.ByteType =>
      // Propagate nulls explicitly; `null.asInstanceOf[Long]` would unbox to 0.
      generateColumn(r, DataTypes.LongType, numRows, options).map {
        case null => null
        case v => v.asInstanceOf[Long].toByte
      }
    case DataTypes.ShortType =>
      generateColumn(r, DataTypes.LongType, numRows, options).map {
        case null => null
        case v => v.asInstanceOf[Long].toShort
      }
    case DataTypes.IntegerType =>
      generateColumn(r, DataTypes.LongType, numRows, options).map {
        case null => null
        case v => v.asInstanceOf[Long].toInt
      }
    case DataTypes.LongType =>
      // Bias toward interesting boundary values; ~2% nulls when allowed.
      Range(0, numRows).map(_ => {
        r.nextInt(50) match {
          case 0 if options.allowNull => null
          case 1 => 0L
          case 2 => Byte.MinValue.toLong
          case 3 => Byte.MaxValue.toLong
          case 4 => Short.MinValue.toLong
          case 5 => Short.MaxValue.toLong
          case 6 => Int.MinValue.toLong
          case 7 => Int.MaxValue.toLong
          case 8 => Long.MinValue
          case 9 => Long.MaxValue
          case _ => r.nextLong()
        }
      })
    case DataTypes.FloatType =>
      // Bias toward infinities, extremes, and (optionally) negative zero.
      Range(0, numRows).map(_ => {
        r.nextInt(20) match {
          case 0 if options.allowNull => null
          case 1 => Float.NegativeInfinity
          case 2 => Float.PositiveInfinity
          case 3 => Float.MinValue
          case 4 => Float.MaxValue
          case 5 => 0.0f
          case 6 if options.generateNegativeZero => -0.0f
          case _ => r.nextFloat()
        }
      })
    case DataTypes.DoubleType =>
      Range(0, numRows).map(_ => {
        r.nextInt(20) match {
          case 0 if options.allowNull => null
          case 1 => Double.NegativeInfinity
          case 2 => Double.PositiveInfinity
          case 3 => Double.MinValue
          case 4 => Double.MaxValue
          case 5 => 0.0
          case 6 if options.generateNegativeZero => -0.0
          case _ => r.nextDouble()
        }
      })
    case dt: DecimalType =>
      // NOTE(review): decimals never include nulls or boundary values,
      // unlike the other numeric types — confirm whether this is intended.
      Range(0, numRows).map(_ =>
        new BigDecimal(r.nextDouble()).setScale(dt.scale, RoundingMode.HALF_UP))
    case DataTypes.StringType =>
      // Mix of numeric-looking strings, alphabetic, and arbitrary unicode.
      Range(0, numRows).map(_ => {
        r.nextInt(10) match {
          case 0 if options.allowNull => null
          case 1 => r.nextInt().toByte.toString
          case 2 => r.nextLong().toString
          case 3 => r.nextDouble().toString
          case 4 => RandomStringUtils.randomAlphabetic(8)
          case _ => r.nextString(8)
        }
      })
    case DataTypes.BinaryType =>
      // Reuse the string generator; nulls pass through as nulls.
      generateColumn(r, DataTypes.StringType, numRows, options)
        .map {
          case x: String =>
            x.getBytes(Charset.defaultCharset())
          case _ =>
            null
        }
    case DataTypes.DateType =>
      // NOTE(review): date/timestamp columns never contain nulls — confirm intent.
      Range(0, numRows).map(_ => new java.sql.Date(options.baseDate + r.nextInt()))
    case DataTypes.TimestampType =>
      Range(0, numRows).map(_ => new Timestamp(options.baseDate + r.nextInt()))
    case DataTypes.TimestampNTZType =>
      Range(0, numRows).map(_ =>
        LocalDateTime.ofInstant(
          Instant.ofEpochMilli(options.baseDate + r.nextInt()),
          ZoneId.systemDefault()))
    case _ => throw new IllegalStateException(s"Cannot generate data for $dataType yet")
  }
}