in benchmark/muse_perf.py [0:0]
def sd_benchmark(batch_size, timesteps, use_xformers):
    """Benchmark Stable Diffusion v1-5 inference and report peak CUDA memory.

    Loads the SD components in fp16 on CUDA, runs one short warm-up pass,
    then times ``batch_size`` images for ``timesteps`` inference steps with
    ``Timer.blocked_autorange``, wrapped by ``measure_max_memory_allocated``.

    Args:
        batch_size: Images generated per prompt in the timed call.
        timesteps: Number of denoising steps for the timed call.
        use_xformers: Enable xformers memory-efficient attention when True.

    Returns:
        Whatever ``measure_max_memory_allocated`` yields for the timing fn
        (timing measurement plus peak-memory bookkeeping).
    """
    checkpoint = "runwayml/stable-diffusion-v1-5"
    device = "cuda"
    dtype = torch.float16

    # Load each component explicitly so all of them land on CUDA in fp16.
    # nn.Module.to(...) returns self, so chaining keeps the same objects.
    tokenizer = CLIPTokenizer.from_pretrained(checkpoint, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(checkpoint, subfolder="text_encoder").to(
        device=device, dtype=dtype
    )
    vae = AutoencoderKL.from_pretrained(checkpoint, subfolder="vae").to(device=device, dtype=dtype)
    unet = UNet2DConditionModel.from_pretrained(checkpoint, subfolder="unet").to(
        device=device, dtype=dtype
    )

    # Assemble the pipeline from the pre-moved components; the safety checker
    # is disabled so it does not contribute to timing or memory.
    pipe = StableDiffusionPipeline.from_pretrained(
        checkpoint,
        vae=vae,
        unet=unet,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        safety_checker=None,
    )

    if use_xformers:
        pipe.enable_xformers_memory_efficient_attention()

    def benchmark_fn():
        # The closure under measurement: full pipeline call at the requested size.
        pipe(
            prompt,
            num_images_per_prompt=batch_size,
            num_inference_steps=timesteps,
        )

    # Cheap warm-up pass (2 steps) so one-time CUDA/initialization costs are
    # excluded from the timed runs.
    pipe(prompt, num_images_per_prompt=batch_size, num_inference_steps=2)

    def fn():
        return Timer(
            stmt="benchmark_fn()",
            globals={"benchmark_fn": benchmark_fn},
            num_threads=num_threads,
            label=f"batch_size: {batch_size}, dtype: {dtype}, timesteps {timesteps}, use_xformers: {use_xformers}",
            description=checkpoint,
        ).blocked_autorange(min_run_time=1)

    return measure_max_memory_allocated(fn)