optimum/habana/diffusers/pipelines/flux/pipeline_flux.py [447:515]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
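        # Transformers trained with guidance embedding (config.guidance_embeds)
        # expect one guidance-scale value per latent sample, so the scalar is
        # broadcast across the batch; otherwise no guidance tensor is passed.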
        if self.transformer.config.guidance_embeds:
            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
            guidance = guidance.expand(latents.shape[0])
        else:
            guidance = None

        logger.info(
            f"{num_prompts} prompt(s) received, {num_images_per_prompt} generation(s) per prompt,"
            f" {batch_size} sample(s) per batch, {num_batches} total batch(es)."
        )
        if num_batches < 3:
            logger.warning("The first two iterations are slower so it is recommended to feed more batches.")

        throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3)
        use_warmup_inference_steps = (
            num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps
        )
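        # With fewer batches than warmup iterations (but more inference steps),
        # warmup time is accounted per inference step inside a batch instead of
        # per whole batch, hence `use_warmup_inference_steps`.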

        ht.hpu.synchronize()
        t0 = time.time()
        t1 = t0

        hb_profiler = HabanaProfile(
            warmup=profiling_warmup_steps,
            active=profiling_steps,
            record_shapes=False,
        )
        hb_profiler.start()

        # 5.1. Split input data into batches (HPU-specific step)
        (
            latents_batches,
            text_embeddings_batches,
            pooled_prompt_embeddings_batches,
            guidance_batches,
            num_dummy_samples,
        ) = self._split_inputs_into_batches(batch_size, latents, prompt_embeds, pooled_prompt_embeds, guidance)
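        # num_dummy_samples counts padding appended so the last batch is
        # full-sized; the padded outputs are presumably dropped after generation.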

        outputs = {
            "images": [],
        }

        # 6. Denoising loop
        for j in range(num_batches):
            # Throughput is measured starting at iteration `throughput_warmup_steps`
            # (3 by default) because graph compilation dominates the first iterations
            if j == throughput_warmup_steps:
                ht.hpu.synchronize()
                t1 = time.time()

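            # Read slot 0, then rotate each stack so the next batch moves to the
            # front; shapes stay static, which avoids recompilation on HPU.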
            latents_batch = latents_batches[0]
            latents_batches = torch.roll(latents_batches, shifts=-1, dims=0)
            text_embeddings_batch = text_embeddings_batches[0]
            text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0)
            pooled_prompt_embeddings_batch = pooled_prompt_embeddings_batches[0]
            pooled_prompt_embeddings_batches = torch.roll(pooled_prompt_embeddings_batches, shifts=-1, dims=0)
            guidance_batch = None if guidance_batches is None else guidance_batches[0]
            guidance_batches = None if guidance_batches is None else torch.roll(guidance_batches, shifts=-1, dims=0)

            if hasattr(self.scheduler, "_init_step_index"):
                # Reset scheduler step index for next batch
                self.scheduler.timesteps = timesteps
                self.scheduler._init_step_index(timesteps[0])

            # Mixed quantization
            quant_mixed_step = len(timesteps)
            if quant_mode == "quantize-mixed":
                # The last ~10% of steps run at higher precision in mixed quant mode
                quant_mixed_step = quant_mixed_step - (quant_mixed_step // 10)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
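
Note: a minimal standalone sketch of the roll-based batch rotation used in the
denoising loop above. Names are illustrative; the point is that reading slot 0
and rotating with torch.roll keeps tensor shapes static across iterations,
which avoids recompilation on HPU.

    import torch

    # Three batches of four values each; shapes never change across iterations.
    batches = torch.arange(12, dtype=torch.float32).reshape(3, 4)
    for _ in range(batches.shape[0]):
        current = batches[0]                              # consume the front batch
        batches = torch.roll(batches, shifts=-1, dims=0)  # rotate the queue
        print(current)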



optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py [479:547]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if self.transformer.config.guidance_embeds:
            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
            guidance = guidance.expand(latents.shape[0])
        else:
            guidance = None

        logger.info(
            f"{num_prompts} prompt(s) received, {num_images_per_prompt} generation(s) per prompt,"
            f" {batch_size} sample(s) per batch, {num_batches} total batch(es)."
        )
        if num_batches < 3:
            logger.warning("The first two iterations are slower so it is recommended to feed more batches.")

        throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3)
        use_warmup_inference_steps = (
            num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps
        )

        ht.hpu.synchronize()
        t0 = time.time()
        t1 = t0

        hb_profiler = HabanaProfile(
            warmup=profiling_warmup_steps,
            active=profiling_steps,
            record_shapes=False,
        )
        hb_profiler.start()

        # 5.1. Split input data into batches (HPU-specific step)
        (
            latents_batches,
            text_embeddings_batches,
            pooled_prompt_embeddings_batches,
            guidance_batches,
            num_dummy_samples,
        ) = self._split_inputs_into_batches(batch_size, latents, prompt_embeds, pooled_prompt_embeds, guidance)

        outputs = {
            "images": [],
        }

        # 6. Denoising loop
        for j in range(num_batches):
            # Throughput is measured starting at iteration `throughput_warmup_steps`
            # (3 by default) because graph compilation dominates the first iterations
            if j == throughput_warmup_steps:
                ht.hpu.synchronize()
                t1 = time.time()

            latents_batch = latents_batches[0]
            latents_batches = torch.roll(latents_batches, shifts=-1, dims=0)
            text_embeddings_batch = text_embeddings_batches[0]
            text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0)
            pooled_prompt_embeddings_batch = pooled_prompt_embeddings_batches[0]
            pooled_prompt_embeddings_batches = torch.roll(pooled_prompt_embeddings_batches, shifts=-1, dims=0)
            guidance_batch = None if guidance_batches is None else guidance_batches[0]
            guidance_batches = None if guidance_batches is None else torch.roll(guidance_batches, shifts=-1, dims=0)

            if hasattr(self.scheduler, "_init_step_index"):
                # Reset scheduler step index for next batch
                self.scheduler.timesteps = timesteps
                self.scheduler._init_step_index(timesteps[0])

            # Mixed quantization
            quant_mixed_step = len(timesteps)
            if quant_mode == "quantize-mixed":
                # The last ~10% of steps run at higher precision in mixed quant mode
                quant_mixed_step = quant_mixed_step - (quant_mixed_step // 10)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
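
Note: a quick check of the mixed-quantization cutoff computed above.
quant_mixed_step = n - n // 10 leaves roughly the last 10% of denoising steps
at higher precision; the sketch below assumes steps with index >=
quant_mixed_step are the higher-precision ones.

    num_inference_steps = 30                                              # illustrative value
    quant_mixed_step = num_inference_steps - (num_inference_steps // 10)  # 30 - 3 = 27
    for i in range(num_inference_steps):
        higher_precision = i >= quant_mixed_step  # True only for the final 3 steps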



