def _attach_figures_to_chunks()

in chunking/chunkers/multimodal_chunker.py [0:0]


    def _attach_figures_to_chunks(self, document, chunks):
        """
        Associates figures from the document with their corresponding text chunks
        by scanning each chunk for <figureX.Y> placeholders (or <figureX> when the
        ID is a single integer). Chunks are modified in place.

        For each distinct figure reference in a chunk:
        1) Retrieve the figure from document["figures"] by ID
        2) Skip it when its area percentage (self._figure_area) is not above
           self.minimum_figure_area_percentage
        3) Fetch the image via self.docint_client.get_figure
        4) Upload the image via self._upload_figure_blob
        5) Generate a description (caption) via self._generate_caption_for_figure
        6) Rewrite the placeholder to <figure>{image_container}/{blob_name}</figure>

        A placeholder whose figure cannot be attached (unknown ID, area too small,
        no image data, or an error while processing) is removed from the content.

        Once all references of a chunk are processed, the captions are joined into
        one newline-separated string, embedded via self.aoai_client.get_embeddings,
        and passed together with the figure URLs to self._append_figures_to_chunk.
        Chunks for which no figure could be attached get their content updated but
        are neither embedded nor passed to _append_figures_to_chunk.

        Args:
            document: Analysis result dict. Reads "figures" (list of dicts with an
                "id" key), "pages", "result_id" and "model_id".
            chunks: Iterable of chunk dicts; "content" is read and rewritten.

        Returns:
            None. Returns early, without touching the chunks, when the document
            has no figures or lacks "result_id"/"model_id".
        """

        if "figures" not in document or not document["figures"]:
            logging.info(f"[multimodal_chunker][{self.filename}] No figures to attach.")
            return

        result_id = document.get("result_id")
        model_id = document.get("model_id")
        if not result_id or not model_id:
            logging.warning(
                f"[multimodal_chunker][{self.filename}] Missing 'result_id' or 'model_id' in document analysis results."
            )
            return

        logging.info(
            f"[multimodal_chunker][{self.filename}] Attaching figures to chunks using "
            f"result_id: {result_id} and model_id: {model_id}."
        )

        # Quick-access dictionary for the figures by their ID
        figures_dict = {fig["id"]: fig for fig in document["figures"] if "id" in fig}

        # Regex to find all <figureX.Y> (or <figureX> if single integer)
        figure_tag_pattern = re.compile(r"<figure(\d+(?:\.\d+)*)>")

        for chunk in chunks:
            chunk_content = chunk.get("content", "")

            # De-duplicate while keeping first-seen order: str.replace below rewrites
            # every occurrence of a placeholder at once, so handling the same ID
            # again would only repeat the fetch/upload/caption work and duplicate
            # the figure in the chunk's URL and caption lists.
            figure_ids = list(dict.fromkeys(figure_tag_pattern.findall(chunk_content)))
            if not figure_ids:
                # No figure references in this chunk; move to the next
                continue

            figure_urls = []
            figure_descriptions = []

            for figure_id in figure_ids:
                placeholder = f"<figure{figure_id}>"

                figure = figures_dict.get(figure_id)
                if figure is None:
                    logging.warning(
                        f"[multimodal_chunker][{self.filename}] Figure with id={figure_id} not found in document['figures']."
                    )
                    chunk_content = chunk_content.replace(placeholder, "")
                    continue

                try:
                    # 1) Check dimensions
                    figure_area_percentage = round(self._figure_area(figure, document['pages']), 2)
                    if figure_area_percentage <= self.minimum_figure_area_percentage:
                        logging.warning(
                            f"[multimodal_chunker][{self.filename}] Image for figure {figure_id} "
                            f"has insufficient percentual area ({figure_area_percentage}). Skipping."
                        )
                        chunk_content = chunk_content.replace(placeholder, "")
                        continue

                    # 2) Fetch the figure image
                    image_binary = self.docint_client.get_figure(model_id, result_id, figure_id)
                    if not image_binary:
                        logging.warning(
                            f"[multimodal_chunker][{self.filename}] No image data retrieved for figure {figure_id}."
                        )
                        chunk_content = chunk_content.replace(placeholder, "")
                        continue

                    # 3) Upload to blob
                    blob_name_prefix = self.filepath.replace('/', '-')
                    blob_name = f"{blob_name_prefix}-figure-{figure_id}.png"
                    url = self._upload_figure_blob(image_binary, blob_name)

                    # 4) Generate caption
                    logging.info(f"[multimodal_chunker][{self.filename}] Generating caption for figure {figure_id}. Percent area: {figure_area_percentage}")                    
                    figure_caption = self._generate_caption_for_figure(
                        {
                            "id": figure_id,
                            "image": base64.b64encode(image_binary).decode("utf-8"),
                            "blob_name": blob_name
                        }
                    )
                except Exception as e:
                    # Best effort: one broken figure must not abort the remaining
                    # figures or chunks. Drop the placeholder so no raw
                    # <figureX.Y> tag is left behind in the chunk content.
                    logging.error(
                        f"[multimodal_chunker][{self.filename}] Error processing figure {figure_id}: {e}"
                    )
                    chunk_content = chunk_content.replace(placeholder, "")
                else:
                    # Only reached when the figure was fully processed (the
                    # `continue` statements above bypass this branch).
                    figure_urls.append(url)
                    figure_descriptions.append(f"[{self.image_container}/{blob_name}]: {figure_caption}")

                    # Replace <figureX.Y> with a marker pointing at the uploaded blob
                    chunk_content = chunk_content.replace(
                        placeholder, f"<figure>{self.image_container}/{blob_name}</figure>"
                    )

            # Update the chunk content with placeholders updated
            chunk["content"] = chunk_content

            if not figure_descriptions:
                # Every referenced figure was skipped: there is nothing to embed
                # or attach, so avoid an embeddings request for an empty string.
                continue

            # 5) Build the combined caption string
            #    Example:
            #    [myfile-figure-1.1.png]: figure (myfile-figure-1.1.png) description: ...
            #    [myfile-figure-1.2.png]: figure (myfile-figure-1.2.png) description: ...
            combined_caption = "\n".join(figure_descriptions)
            caption_vector = self.aoai_client.get_embeddings(combined_caption)

            # 6) Attach everything to the chunk
            self._append_figures_to_chunk(
                chunk,
                figure_urls,
                combined_caption,
                caption_vector
            )
            # .get(): a chunk without 'chunk_id' must not fail after the figures
            # have already been attached.
            logging.info(
                f"[multimodal_chunker][{self.filename}] Attached {len(figure_urls)} figures to chunk {chunk.get('chunk_id')}."
            )