def sd_benchmark()

in benchmark/muse_perf.py [0:0]


def sd_benchmark(batch_size, timesteps, use_xformers):
    """Time Stable Diffusion v1.5 image generation and track its memory use.

    The tokenizer, text encoder, VAE and UNet are loaded individually from
    the ``runwayml/stable-diffusion-v1-5`` checkpoint; the three models are
    moved to ``cuda`` in ``float16`` and handed to a
    ``StableDiffusionPipeline`` built without a safety checker.

    Args:
        batch_size: Forwarded to the pipeline as ``num_images_per_prompt``.
        timesteps: Forwarded to the pipeline as ``num_inference_steps`` for
            the timed calls.
        use_xformers: When truthy, xformers memory-efficient attention is
            enabled on the pipeline before any call is made.

    Returns:
        The result of ``measure_max_memory_allocated`` applied to a closure
        that runs ``Timer(...).blocked_autorange(min_run_time=1)`` over the
        pipeline call.

    Note:
        ``prompt`` and ``num_threads`` are not defined in this function;
        presumably they are module-level names -- confirm against the rest
        of the file.
    """
    model = "runwayml/stable-diffusion-v1-5"
    device = "cuda"
    dtype = torch.float16

    def load_on_device(component_cls, subfolder):
        # Load one sub-model from the checkpoint and place it on the GPU in
        # half precision.
        module = component_cls.from_pretrained(model, subfolder=subfolder)
        return module.to(device=device, dtype=dtype)

    # The tokenizer is not a module, so it is loaded without a device move.
    clip_tokenizer = CLIPTokenizer.from_pretrained(model, subfolder="tokenizer")
    clip_text_encoder = load_on_device(CLIPTextModel, "text_encoder")
    autoencoder = load_on_device(AutoencoderKL, "vae")
    denoiser = load_on_device(UNet2DConditionModel, "unet")

    pipe = StableDiffusionPipeline.from_pretrained(
        model,
        vae=autoencoder,
        unet=denoiser,
        text_encoder=clip_text_encoder,
        tokenizer=clip_tokenizer,
        safety_checker=None,
    )

    if use_xformers:
        pipe.enable_xformers_memory_efficient_attention()

    def generate(num_inference_steps):
        pipe(
            prompt,
            num_images_per_prompt=batch_size,
            num_inference_steps=num_inference_steps,
        )

    def run_full_generation():
        generate(timesteps)

    # Short 2-step call made before timing starts (presumably a warm-up so
    # one-time setup cost is kept out of the measurement).
    generate(2)

    def timed_measurement():
        timer = Timer(
            stmt="benchmark_fn()",
            # The key must match the name used in ``stmt`` above.
            globals={"benchmark_fn": run_full_generation},
            num_threads=num_threads,
            label=f"batch_size: {batch_size}, dtype: {dtype}, timesteps {timesteps}, use_xformers: {use_xformers}",
            description=model,
        )
        return timer.blocked_autorange(min_run_time=1)

    return measure_max_memory_allocated(timed_measurement)