in de/synthetic.py [0:0]
def generate_data(self, dtype, num_samples):
    """Generate ``num_samples`` random values matching the ``dtype`` spec.

    Supported specs:

    * ``"int"`` / ``int``: integers drawn from ``[0, 1_000_000)``.
    * ``"float"`` / ``float``: floats drawn from ``[0, 1_000_000)``,
      rounded to 3 decimals.
    * ``"str"`` / ``str``: text from ``self.fake.text`` with a
      ``max_nb_chars`` drawn from ``[10, 200)``.
    * ``"largestr"``: same, with ``max_nb_chars`` drawn from ``[100, 1000)``.
    * ``"bool"`` / ``bool``: booleans.
    * ``dict`` mapping field name -> nested spec: one dict per sample.
    * non-empty ``list`` whose first element is the item spec: one list of
      0 to 4 items per sample.

    Args:
        dtype: The type specification described above.
        num_samples: Number of top-level values to generate.

    Returns:
        A list of ``num_samples`` generated values.

    Raises:
        ValueError: If ``dtype`` is not one of the supported specs
            (including an empty list).
    """
    # NOTE(review): ``self.fake`` is presumably a Faker instance; only its
    # ``text(max_nb_chars=...)`` method is used here -- confirm in the class.
    if dtype in ("int", int):
        return np.random.randint(0, 1_000_000, size=num_samples).tolist()
    elif dtype in ("float", float):
        return np.random.uniform(0, 1_000_000, size=num_samples).round(3).tolist()
    elif dtype in ("str", str):
        num_chars = np.random.randint(10, 200, size=num_samples)
        return [self.fake.text(max_nb_chars=n_chars) for n_chars in num_chars]
    elif dtype in ("largestr",):
        num_chars = np.random.randint(100, 1000, size=num_samples)
        return [self.fake.text(max_nb_chars=n_chars) for n_chars in num_chars]
    elif dtype in ("bool", bool):
        # Membership test, not equality with the tuple: ``==`` could never
        # match, so bool specs used to fall through to the ValueError.
        return np.random.choice([True, False], size=num_samples).tolist()
    elif isinstance(dtype, dict):
        if not dtype:
            # zip(*[]) yields nothing, so build the empty rows explicitly to
            # keep the "one row per sample" contract.
            return [{} for _ in range(num_samples)]
        columns = [
            self.generate_data(field_type, num_samples)
            for field_type in dtype.values()
        ]
        # Transpose the per-field columns into per-sample rows.
        return [dict(zip(dtype.keys(), row)) for row in zip(*columns)]
    elif isinstance(dtype, list) and dtype:
        lengths = np.random.randint(0, 5, size=num_samples)
        # Generate every item in one flat batch, then cut it into rows.
        values = self.generate_data(dtype[0], int(lengths.sum()))
        # ``[0] + lengths`` on an ndarray broadcasts instead of prepending,
        # so derive each row's start offset from its end offset instead.
        ends = np.cumsum(lengths)
        starts = ends - lengths
        return [
            values[start:end]
            for start, end in zip(starts.tolist(), ends.tolist())
        ]
    else:
        raise ValueError(f"Unsupported data type: {dtype}")