benchmarks/benchmark_array_xd.py (108 lines of code) (raw):
import json
import os
import tempfile
import datasets
from datasets.arrow_writer import ArrowWriter
from datasets.features import Array2D
from utils import generate_examples, get_duration
SHAPE_TEST_1 = (30, 487)
SHAPE_TEST_2 = (36, 1024)
SPEED_TEST_SHAPE = (100, 100)
SPEED_TEST_N_EXAMPLES = 100
DEFAULT_FEATURES = datasets.Features(
{"text": Array2D(SHAPE_TEST_1, dtype="float32"), "image": Array2D(SHAPE_TEST_2, dtype="float32")}
)
RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json"))
@get_duration
def write(my_features, dummy_data, tmp_dir):
with ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow")) as writer:
for key, record in dummy_data:
example = my_features.encode_example(record)
writer.write(example)
num_examples, num_bytes = writer.finalize()
@get_duration
def read_unformated(feats, tmp_dir):
dataset = datasets.Dataset.from_file(
filename=os.path.join(tmp_dir, "beta.arrow"), info=datasets.DatasetInfo(features=feats)
)
for _ in dataset:
pass
@get_duration
def read_formatted_as_numpy(feats, tmp_dir):
dataset = datasets.Dataset.from_file(
filename=os.path.join(tmp_dir, "beta.arrow"), info=datasets.DatasetInfo(features=feats)
)
dataset.set_format("numpy")
for _ in dataset:
pass
@get_duration
def read_batch_unformated(feats, tmp_dir):
batch_size = 10
dataset = datasets.Dataset.from_file(
filename=os.path.join(tmp_dir, "beta.arrow"), info=datasets.DatasetInfo(features=feats)
)
for i in range(0, len(dataset), batch_size):
_ = dataset[i : i + batch_size]
@get_duration
def read_batch_formatted_as_numpy(feats, tmp_dir):
batch_size = 10
dataset = datasets.Dataset.from_file(
filename=os.path.join(tmp_dir, "beta.arrow"), info=datasets.DatasetInfo(features=feats)
)
dataset.set_format("numpy")
for i in range(0, len(dataset), batch_size):
_ = dataset[i : i + batch_size]
@get_duration
def read_col_unformated(feats, tmp_dir):
dataset = datasets.Dataset.from_file(
filename=os.path.join(tmp_dir, "beta.arrow"), info=datasets.DatasetInfo(features=feats)
)
for col in feats:
_ = dataset[col]
@get_duration
def read_col_formatted_as_numpy(feats, tmp_dir):
dataset = datasets.Dataset.from_file(
filename=os.path.join(tmp_dir, "beta.arrow"), info=datasets.DatasetInfo(features=feats)
)
dataset.set_format("numpy")
for col in feats:
_ = dataset[col]
def benchmark_array_xd():
times = {}
read_functions = (
read_unformated,
read_formatted_as_numpy,
read_batch_unformated,
read_batch_formatted_as_numpy,
read_col_unformated,
read_col_formatted_as_numpy,
)
with tempfile.TemporaryDirectory() as tmp_dir:
feats = datasets.Features({"image": Array2D(SPEED_TEST_SHAPE, dtype="float32")})
data = generate_examples(features=feats, num_examples=SPEED_TEST_N_EXAMPLES)
times["write_array2d"] = write(feats, data, tmp_dir)
for read_func in read_functions:
times[read_func.__name__ + " after write_array2d"] = read_func(feats, tmp_dir)
with tempfile.TemporaryDirectory() as tmp_dir:
# don't use fixed length for fair comparison
# feats = datasets.Features(
# {"image": datasets.Sequence(datasets.Sequence(datasets.Value("float32"), SPEED_TEST_SHAPE[1]), SPEED_TEST_SHAPE[0])}
# )
feats = datasets.Features({"image": datasets.Sequence(datasets.Sequence(datasets.Value("float32")))})
data = generate_examples(
features=feats, num_examples=SPEED_TEST_N_EXAMPLES, seq_shapes={"image": SPEED_TEST_SHAPE}
)
times["write_nested_sequence"] = write(feats, data, tmp_dir)
for read_func in read_functions:
times[read_func.__name__ + " after write_nested_sequence"] = read_func(feats, tmp_dir)
with tempfile.TemporaryDirectory() as tmp_dir:
# don't use fixed length for fair comparison
# feats = datasets.Features(
# {"image": datasets.Sequence(datasets.Value("float32"), SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1])}
# )
feats = datasets.Features({"image": datasets.Sequence(datasets.Value("float32"))})
data = generate_examples(
features=feats,
num_examples=SPEED_TEST_N_EXAMPLES,
seq_shapes={"image": [SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1]]},
)
times["write_flattened_sequence"] = write(feats, data, tmp_dir)
for read_func in read_functions:
times[read_func.__name__ + " after write_flattened_sequence"] = read_func(feats, tmp_dir)
with open(RESULTS_FILE_PATH, "wb") as f:
f.write(json.dumps(times).encode("utf-8"))
if __name__ == "__main__": # useful to run the profiler
benchmark_array_xd()