in chunking/chunkers/multimodal_chunker.py [0:0]
def _attach_figures_to_chunks(self, document, chunks):
    """
    Attach the document's figures to the text chunks that reference them.

    Each chunk's ``content`` is scanned for ``<figureX.Y>`` (or ``<figureX>``)
    placeholders. For every distinct figure referenced in a chunk:

    1) Look the figure up in ``document["figures"]`` by its ID.
    2) Skip it when its area percentage (``self._figure_area``) is not above
       ``self.minimum_figure_area_percentage``.
    3) Fetch the image through ``self.docint_client.get_figure``.
    4) Upload the image with ``self._upload_figure_blob``.
    5) Generate a caption with ``self._generate_caption_for_figure``.
    6) Replace the placeholder with
       ``<figure>{image_container}/{blob_name}</figure>``.

    A placeholder whose figure is missing, too small, has no image data, or
    fails during processing is removed from the chunk content.

    When at least one figure was processed for a chunk, the captions are
    joined into one string (one figure per line), embedded with
    ``self.aoai_client.get_embeddings`` and handed, together with the blob
    URLs, to ``self._append_figures_to_chunk``.

    Args:
        document: Analysis result dict. Keys read here: ``"figures"``,
            ``"result_id"``, ``"model_id"`` and ``"pages"``.
        chunks: Iterable of chunk dicts; each is updated in place
            (``"content"`` is rewritten when it holds figure placeholders).

    Returns:
        None. Returns early, leaving the chunks untouched, when the document
        has no figures or lacks ``result_id`` / ``model_id``.
    """
    if not document.get("figures"):
        logging.info(f"[multimodal_chunker][{self.filename}] No figures to attach.")
        return

    result_id = document.get("result_id")
    model_id = document.get("model_id")
    if not result_id or not model_id:
        logging.warning(
            f"[multimodal_chunker][{self.filename}] Missing 'result_id' or 'model_id' in document analysis results."
        )
        return

    logging.info(
        f"[multimodal_chunker][{self.filename}] Attaching figures to chunks using "
        f"result_id: {result_id} and model_id: {model_id}."
    )

    # Quick-access lookup of the figures by their ID.
    figures_dict = {fig["id"]: fig for fig in document["figures"] if "id" in fig}

    # Matches <figureX.Y> (or <figureX> for a single integer) and captures the ID.
    figure_tag_pattern = re.compile(r"<figure(\d+(?:\.\d+)*)>")

    for chunk in chunks:
        chunk_content = chunk.get("content", "")

        # A figure may be referenced several times in one chunk. str.replace
        # below rewrites every occurrence at once, so each ID is processed only
        # once (first-seen order kept) to avoid duplicate fetches, uploads,
        # captions and attachments.
        figure_refs = list(dict.fromkeys(figure_tag_pattern.findall(chunk_content)))
        if not figure_refs:
            # No figure references in this chunk; move to the next.
            continue

        figure_urls = []
        figure_descriptions = []

        for figure_id in figure_refs:
            placeholder = f"<figure{figure_id}>"

            figure = figures_dict.get(figure_id)
            if not figure:
                logging.warning(
                    f"[multimodal_chunker][{self.filename}] Figure with id={figure_id} not found in document['figures']."
                )
                chunk_content = chunk_content.replace(placeholder, "")
                continue

            try:
                # 1) Check the figure's share of the page area.
                figure_area_percentage = round(self._figure_area(figure, document['pages']), 2)
                if figure_area_percentage <= self.minimum_figure_area_percentage:
                    logging.warning(
                        f"[multimodal_chunker][{self.filename}] Image for figure {figure_id} "
                        f"has insufficient percentual area ({figure_area_percentage}). Skipping."
                    )
                    chunk_content = chunk_content.replace(placeholder, "")
                    continue

                # 2) Fetch the figure image.
                image_binary = self.docint_client.get_figure(model_id, result_id, figure_id)
                if not image_binary:
                    logging.warning(
                        f"[multimodal_chunker][{self.filename}] No image data retrieved for figure {figure_id}."
                    )
                    chunk_content = chunk_content.replace(placeholder, "")
                    continue

                # 3) Upload to blob storage.
                blob_name_prefix = self.filepath.replace('/', '-')
                blob_name = f"{blob_name_prefix}-figure-{figure_id}.png"
                url = self._upload_figure_blob(image_binary, blob_name)

                # 4) Generate the caption.
                logging.info(f"[multimodal_chunker][{self.filename}] Generating caption for figure {figure_id}. Percent area: {figure_area_percentage}")
                figure_caption = self._generate_caption_for_figure(
                    {
                        "id": figure_id,
                        "image": base64.b64encode(image_binary).decode("utf-8"),
                        "blob_name": blob_name
                    }
                )

                # Store references.
                figure_urls.append(url)
                figure_descriptions.append(f"[{self.image_container}/{blob_name}]: {figure_caption}")

                # Point the placeholder at the uploaded image.
                chunk_content = chunk_content.replace(
                    placeholder, f"<figure>{self.image_container}/{blob_name}</figure>"
                )
            except Exception as e:
                # Best effort: one failing figure must not abort the others.
                logging.error(
                    f"[multimodal_chunker][{self.filename}] Error processing figure {figure_id}: {str(e)}",
                    exc_info=True,
                )
                # Same clean-up as the other failure paths, so no raw
                # <figureX.Y> tag is left behind in the chunk content.
                chunk_content = chunk_content.replace(placeholder, "")

        # Store the content with its placeholders rewritten or removed.
        chunk["content"] = chunk_content

        if not figure_descriptions:
            # Every referenced figure was skipped or failed: there is nothing
            # to embed or attach, so do not call the embedding service with an
            # empty string.
            continue

        # 5) Build the combined caption string, one figure per line:
        #    [container/myfile-figure-1.1.png]: <caption>
        #    [container/myfile-figure-1.2.png]: <caption>
        combined_caption = "\n".join(figure_descriptions)
        caption_vector = self.aoai_client.get_embeddings(combined_caption)

        # 6) Attach everything to the chunk.
        self._append_figures_to_chunk(
            chunk,
            figure_urls,
            combined_caption,
            caption_vector
        )
        logging.info(f"[multimodal_chunker][{self.filename}] Attached {len(figure_urls)} figures to chunk {chunk.get('chunk_id')}.")