in chunking/chunkers/multimodal_chunker.py [0:0]
def _figure_area(self, figure: Dict, pages: List[Dict]) -> float:
    """
    Calculate the total figure area by summing the areas of all bounding regions across pages.

    Every bounding region is evaluated independently: a region that cannot be
    processed is logged and skipped, so one malformed entry never aborts the
    whole calculation. A region is skipped when:

    * it lacks 'pageNumber' or 'polygon',
    * no entry in ``pages`` has a matching 'pageNumber',
    * the matching page has no 'width' or 'height',
    * ``self._calculate_polygon_area`` raises ``ValueError`` for its polygon.

    A polygon whose area is larger than its page area (width * height) is
    reported with a warning but is still added to the total.

    Args:
        figure (Dict): A dictionary representing the figure with 'boundingRegions',
            where each bounding region contains 'pageNumber' and 'polygon'.
        pages (List[Dict]): A list of page dictionaries each containing 'pageNumber',
            'width', and 'height'. Entries without 'pageNumber' are ignored
            (with a warning) instead of raising; ``None`` is treated as an empty list.

    Returns:
        float: The total area of all valid bounding regions across pages.
            Returns 0.0 if no valid bounding regions are found.
    """
    total_area = 0.0

    # Ensure 'boundingRegions' exists in the figure
    bounding_regions = figure.get('boundingRegions', [])
    if not bounding_regions:
        logging.warning(f"[multimodal_chunker][{self.filename}] No boundingRegions found in figure.")
        return total_area  # Returns 0.0

    # Index pages by page number once so each region lookup is O(1).
    # Built defensively: a page entry without 'pageNumber' can never match a
    # region, so it is skipped rather than raising KeyError and losing the
    # areas of every other, well-formed region.
    page_lookup = {}
    for page in pages or []:
        known_page_number = page.get('pageNumber')
        if known_page_number is None:
            logging.warning(f"[multimodal_chunker][{self.filename}] Ignoring page entry without 'pageNumber'.")
            continue
        page_lookup[known_page_number] = page

    for idx, bounding_region in enumerate(bounding_regions, start=1):
        try:
            # Extract bounding region details
            page_number = bounding_region['pageNumber']
            polygon = bounding_region['polygon']
        except KeyError as e:
            logging.error(f"[multimodal_chunker][{self.filename}] Bounding region {idx} is missing key: {e}")
            continue  # Skip this bounding region

        # Find the corresponding page using the lookup dictionary
        page = page_lookup.get(page_number)
        if page is None:
            logging.info(f"[multimodal_chunker][{self.filename}] No matching page found for pageNumber: {page_number} in bounding region {idx}.")
            continue  # Skip this bounding region

        page_width = page.get('width')
        page_height = page.get('height')

        # Validate page dimensions
        if page_width is None or page_height is None:
            logging.error(f"[multimodal_chunker][{self.filename}] Page {page_number} is missing 'width' or 'height'.")
            continue  # Skip this bounding region

        try:
            # Calculate polygon area using a helper method
            polygon_area = self._calculate_polygon_area(polygon)
        except ValueError as ve:
            logging.error(f"[multimodal_chunker][{self.filename}] Error calculating area for figure on page {page_number}, bounding region {idx}: {ve}")
            continue  # Skip this bounding region

        # Sanity check only: an oversized polygon is reported but still counted,
        # it is neither dropped nor capped to the page area.
        page_area = page_width * page_height
        if polygon_area > page_area:
            logging.warning(
                f"[multimodal_chunker][{self.filename}] Polygon area {polygon_area:.2f} exceeds page area {page_area:.2f} on page {page_number}, bounding region {idx}."
            )

        # Accumulate the total area
        total_area += polygon_area
        logging.debug(
            f"[multimodal_chunker][{self.filename}] Figure on Page {page_number}, Bounding Region {idx}: "
            f"Polygon Coordinates: {polygon}, "
            f"Polygon Area: {polygon_area:.2f}, "
            f"Accumulated Total Area: {total_area:.2f}"
        )

    if total_area == 0.0:
        logging.warning(f"[multimodal_chunker][{self.filename}] No valid bounding regions found to calculate total area.")

    return total_area