in data/rand-many-types/generate.py [0:0]
def generate_random_data(data_type, num_rows, random_generator):
    """Build a PyArrow array holding ``num_rows`` random values.

    Args:
        data_type: PyArrow data type selecting which kind of values to
            generate. It is inspected with the ``pa.types.is_*`` predicates.
        num_rows: Number of elements in the returned array.
        random_generator: Source of randomness. It must provide the
            ``integers``, ``random``, ``choice`` and ``bytes`` methods
            (the ``numpy.random.Generator`` interface).

    Returns:
        A PyArrow array of length ``num_rows``.

        * Integers are uniform over the full range of the matching NumPy
          dtype, both ends included.
        * Floats come from ``random_generator.random``.
        * Strings are 8 characters drawn from ASCII letters and digits.
        * Binary values are 8 random bytes.
        * Dates fall within 10000 days after 1970-01-01.
        * Timestamps fall within 10000 seconds after 2016-01-01 and use the
          requested timestamp type.
        * Decimal, list, struct and dictionary requests always produce the
          fixed types ``decimal128(10, 2)``, ``list<int32>`` (3 items per
          row), ``struct<field1: int32, field2: float64>`` and
          ``dictionary<int32, string>`` (values ``key_0`` ... ``key_{n-1}``,
          not random), regardless of the parameters of ``data_type``.
        * Any other type yields an all-null array of ``data_type``.
    """
    rng = random_generator

    # Fixed-width integers: one table instead of eight near-identical
    # branches. The predicates are mutually exclusive, so order is irrelevant.
    integer_kinds = (
        (pa.types.is_int8, np.int8),
        (pa.types.is_int16, np.int16),
        (pa.types.is_int32, np.int32),
        (pa.types.is_int64, np.int64),
        (pa.types.is_uint8, np.uint8),
        (pa.types.is_uint16, np.uint16),
        (pa.types.is_uint32, np.uint32),
        (pa.types.is_uint64, np.uint64),
    )
    for is_kind, np_dtype in integer_kinds:
        if is_kind(data_type):
            limits = np.iinfo(np_dtype)
            # ``high`` is exclusive by default; endpoint=True makes the
            # dtype's maximum value reachable.
            return pa.array(
                rng.integers(
                    limits.min,
                    limits.max,
                    num_rows,
                    dtype=np_dtype,
                    endpoint=True,
                )
            )

    if pa.types.is_float32(data_type):
        return pa.array(rng.random(num_rows, np.float32))

    if pa.types.is_float64(data_type):
        return pa.array(rng.random(num_rows, np.float64))

    if pa.types.is_string(data_type):
        charset = list(
            string.ascii_lowercase + string.ascii_uppercase + string.digits
        )
        return pa.array(
            ["".join(rng.choice(charset, 8)) for _ in range(num_rows)]
        )

    if pa.types.is_binary(data_type):
        return pa.array([rng.bytes(8) for _ in range(num_rows)])

    if pa.types.is_boolean(data_type):
        return pa.array(rng.choice([True, False], num_rows))

    if pa.types.is_date32(data_type):
        epoch = datetime(1970, 1, 1)
        return pa.array(
            [
                (epoch + timedelta(days=int(rng.integers(0, 10000)))).date()
                for _ in range(num_rows)
            ],
            type=pa.date32(),
        )

    if pa.types.is_date64(data_type):
        epoch = datetime(1970, 1, 1)
        span_ms = 10000 * 24 * 60 * 60 * 1000  # 10000 days in milliseconds
        return pa.array(
            [
                (
                    epoch + timedelta(milliseconds=int(rng.integers(0, span_ms)))
                ).date()
                for _ in range(num_rows)
            ],
            type=pa.date64(),
        )

    if pa.types.is_timestamp(data_type):
        base_time = datetime(2016, 1, 1, 0, 0, 0, 0)
        return pa.array(
            [
                base_time + timedelta(seconds=int(rng.integers(0, 10000)))
                for _ in range(num_rows)
            ],
            # Honour the requested unit/timezone instead of forcing "ns";
            # the values are whole seconds, so every unit represents them
            # exactly.
            type=data_type,
        )

    if pa.types.is_decimal(data_type):

        def random_decimal():
            # Both bounds are meant to be inclusive (8-digit whole part,
            # 00-99 hundredths).
            whole = int(rng.integers(10**7, 10**8 - 1, endpoint=True))
            cents = int(rng.integers(0, 10**2 - 1, endpoint=True))
            # Zero-pad so that e.g. 5 means ".05" rather than ".5" (= .50).
            return Decimal(f"{whole}.{cents:02d}")

        return pa.array(
            [random_decimal() for _ in range(num_rows)],
            type=pa.decimal128(10, 2),
        )

    if pa.types.is_list(data_type):
        return pa.array(
            [[rng.integers(0, 100) for _ in range(3)] for _ in range(num_rows)],
            type=pa.list_(pa.int32()),
        )

    if pa.types.is_struct(data_type):
        struct_type = pa.struct(
            [("field1", pa.int32()), ("field2", pa.float64())]
        )
        return pa.array(
            [
                {"field1": rng.integers(0, 100), "field2": rng.random()}
                for _ in range(num_rows)
            ],
            type=struct_type,
        )

    if pa.types.is_dictionary(data_type):
        return pa.array(
            [f"key_{i}" for i in range(num_rows)],
            type=pa.dictionary(pa.int32(), pa.string()),
        )

    # Unsupported type: fall back to an all-null column of that type.
    return pa.nulls(num_rows, type=data_type)