benchmarks/benchmark_map_filter.py (42 lines of code) (raw):
import json
import os
import tempfile
import transformers
import datasets
from utils import generate_example_dataset, get_duration
SPEED_TEST_N_EXAMPLES = 500_000
RESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)
RESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, "results", RESULTS_FILENAME.replace(".py", ".json"))
@get_duration
def map(dataset: datasets.Dataset, **kwargs):
_ = dataset.map(**kwargs)
@get_duration
def filter(dataset: datasets.Dataset, **kwargs):
_ = dataset.filter(**kwargs)
def benchmark_map_filter():
times = {"num examples": SPEED_TEST_N_EXAMPLES}
with tempfile.TemporaryDirectory() as tmp_dir:
features = datasets.Features({"text": datasets.Value("string"), "numbers": datasets.Value("float32")})
dataset = generate_example_dataset(
os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES
)
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
def tokenize(examples):
return tokenizer(examples["text"])
times["map identity"] = map(dataset)
times["map identity batched"] = map(dataset, batched=True)
times["map no-op batched"] = map(dataset, function=lambda x: None, batched=True)
with dataset.formatted_as(type="numpy"):
times["map no-op batched numpy"] = map(dataset, function=lambda x: None, batched=True)
with dataset.formatted_as(type="pandas"):
times["map no-op batched pandas"] = map(dataset, function=lambda x: None, batched=True)
with dataset.formatted_as(type="torch", columns="numbers"):
times["map no-op batched pytorch"] = map(dataset, function=lambda x: None, batched=True)
with dataset.formatted_as(type="tensorflow", columns="numbers"):
times["map no-op batched tensorflow"] = map(dataset, function=lambda x: None, batched=True)
times["map fast-tokenizer batched"] = map(dataset, function=tokenize, batched=True)
times["filter"] = filter(dataset)
# Activate later when tokenizer support batched inputs
# with dataset.formatted_as(type='numpy'):
# times[func.__name__ + " fast-tokenizer batched numpy"] = func(dataset, function=tokenize, batched=True)
with open(RESULTS_FILE_PATH, "wb") as f:
f.write(json.dumps(times).encode("utf-8"))
if __name__ == "__main__": # useful to run the profiler
benchmark_map_filter()