python/datafusion/html_formatter.py (278 lines of code) (raw):
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""HTML formatting utilities for DataFusion DataFrames."""
from __future__ import annotations
from typing import (
Any,
Callable,
Optional,
Protocol,
runtime_checkable,
)
def _validate_positive_int(value: Any, param_name: str) -> None:
"""Validate that a parameter is a positive integer.
Args:
value: The value to validate
param_name: Name of the parameter (used in error message)
Raises:
ValueError: If the value is not a positive integer
"""
if not isinstance(value, int) or value <= 0:
msg = f"{param_name} must be a positive integer"
raise ValueError(msg)
def _validate_bool(value: Any, param_name: str) -> None:
"""Validate that a parameter is a boolean.
Args:
value: The value to validate
param_name: Name of the parameter (used in error message)
Raises:
TypeError: If the value is not a boolean
"""
if not isinstance(value, bool):
msg = f"{param_name} must be a boolean"
raise TypeError(msg)
@runtime_checkable
class CellFormatter(Protocol):
"""Protocol for cell value formatters."""
def __call__(self, value: Any) -> str:
"""Format a cell value to string representation."""
...
@runtime_checkable
class StyleProvider(Protocol):
"""Protocol for HTML style providers."""
def get_cell_style(self) -> str:
"""Get the CSS style for table cells."""
...
def get_header_style(self) -> str:
"""Get the CSS style for header cells."""
...
class DefaultStyleProvider:
"""Default implementation of StyleProvider."""
def get_cell_style(self) -> str:
"""Get the CSS style for table cells.
Returns:
CSS style string
"""
return (
"border: 1px solid black; padding: 8px; text-align: left; "
"white-space: nowrap;"
)
def get_header_style(self) -> str:
"""Get the CSS style for header cells.
Returns:
CSS style string
"""
return (
"border: 1px solid black; padding: 8px; text-align: left; "
"background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; "
"max-width: fit-content;"
)
class DataFrameHtmlFormatter:
"""Configurable HTML formatter for DataFusion DataFrames.
This class handles the HTML rendering of DataFrames for display in
Jupyter notebooks and other rich display contexts.
This class supports extension through composition. Key extension points:
- Provide a custom StyleProvider for styling cells and headers
- Register custom formatters for specific types
- Provide custom cell builders for specialized cell rendering
Args:
max_cell_length: Maximum characters to display in a cell before truncation
max_width: Maximum width of the HTML table in pixels
max_height: Maximum height of the HTML table in pixels
max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB)
min_rows_display: Minimum number of rows to display
repr_rows: Default number of rows to display in repr output
enable_cell_expansion: Whether to add expand/collapse buttons for long cell
values
custom_css: Additional CSS to include in the HTML output
show_truncation_message: Whether to display a message when data is truncated
style_provider: Custom provider for cell and header styles
use_shared_styles: Whether to load styles and scripts only once per notebook
session
"""
# Class variable to track if styles have been loaded in the notebook
_styles_loaded = False
def __init__(
self,
max_cell_length: int = 25,
max_width: int = 1000,
max_height: int = 300,
max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB
min_rows_display: int = 20,
repr_rows: int = 10,
enable_cell_expansion: bool = True,
custom_css: Optional[str] = None,
show_truncation_message: bool = True,
style_provider: Optional[StyleProvider] = None,
use_shared_styles: bool = True,
) -> None:
"""Initialize the HTML formatter.
Parameters
----------
max_cell_length : int, default 25
Maximum length of cell content before truncation.
max_width : int, default 1000
Maximum width of the displayed table in pixels.
max_height : int, default 300
Maximum height of the displayed table in pixels.
max_memory_bytes : int, default 2097152 (2MB)
Maximum memory in bytes for rendered data.
min_rows_display : int, default 20
Minimum number of rows to display.
repr_rows : int, default 10
Default number of rows to display in repr output.
enable_cell_expansion : bool, default True
Whether to allow cells to expand when clicked.
custom_css : str, optional
Custom CSS to apply to the HTML table.
show_truncation_message : bool, default True
Whether to show a message indicating that content has been truncated.
style_provider : StyleProvider, optional
Provider of CSS styles for the HTML table. If None, DefaultStyleProvider
is used.
use_shared_styles : bool, default True
Whether to use shared styles across multiple tables.
Raises:
------
ValueError
If max_cell_length, max_width, max_height, max_memory_bytes,
min_rows_display, or repr_rows is not a positive integer.
TypeError
If enable_cell_expansion, show_truncation_message, or use_shared_styles is
not a boolean,
or if custom_css is provided but is not a string,
or if style_provider is provided but does not implement the StyleProvider
protocol.
"""
# Validate numeric parameters
_validate_positive_int(max_cell_length, "max_cell_length")
_validate_positive_int(max_width, "max_width")
_validate_positive_int(max_height, "max_height")
_validate_positive_int(max_memory_bytes, "max_memory_bytes")
_validate_positive_int(min_rows_display, "min_rows_display")
_validate_positive_int(repr_rows, "repr_rows")
# Validate boolean parameters
_validate_bool(enable_cell_expansion, "enable_cell_expansion")
_validate_bool(show_truncation_message, "show_truncation_message")
_validate_bool(use_shared_styles, "use_shared_styles")
# Validate custom_css
if custom_css is not None and not isinstance(custom_css, str):
msg = "custom_css must be None or a string"
raise TypeError(msg)
# Validate style_provider
if style_provider is not None and not isinstance(style_provider, StyleProvider):
msg = "style_provider must implement the StyleProvider protocol"
raise TypeError(msg)
self.max_cell_length = max_cell_length
self.max_width = max_width
self.max_height = max_height
self.max_memory_bytes = max_memory_bytes
self.min_rows_display = min_rows_display
self.repr_rows = repr_rows
self.enable_cell_expansion = enable_cell_expansion
self.custom_css = custom_css
self.show_truncation_message = show_truncation_message
self.style_provider = style_provider or DefaultStyleProvider()
self.use_shared_styles = use_shared_styles
# Registry for custom type formatters
self._type_formatters: dict[type, CellFormatter] = {}
# Custom cell builders
self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None
self._custom_header_builder: Optional[Callable[[Any], str]] = None
def register_formatter(self, type_class: type, formatter: CellFormatter) -> None:
"""Register a custom formatter for a specific data type.
Args:
type_class: The type to register a formatter for
formatter: Function that takes a value of the given type and returns
a formatted string
"""
self._type_formatters[type_class] = formatter
def set_custom_cell_builder(
self, builder: Callable[[Any, int, int, str], str]
) -> None:
"""Set a custom cell builder function.
Args:
builder: Function that takes (value, row, col, table_id) and returns HTML
"""
self._custom_cell_builder = builder
def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None:
"""Set a custom header builder function.
Args:
builder: Function that takes a field and returns HTML
"""
self._custom_header_builder = builder
@classmethod
def is_styles_loaded(cls) -> bool:
"""Check if HTML styles have been loaded in the current session.
This method is primarily intended for debugging UI rendering issues
related to style loading.
Returns:
True if styles have been loaded, False otherwise
Example:
>>> from datafusion.html_formatter import DataFrameHtmlFormatter
>>> DataFrameHtmlFormatter.is_styles_loaded()
False
"""
return cls._styles_loaded
def format_html(
self,
batches: list,
schema: Any,
has_more: bool = False,
table_uuid: str | None = None,
) -> str:
"""Format record batches as HTML.
This method is used by DataFrame's _repr_html_ implementation and can be
called directly when custom HTML rendering is needed.
Args:
batches: List of Arrow RecordBatch objects
schema: Arrow Schema object
has_more: Whether there are more batches not shown
table_uuid: Unique ID for the table, used for JavaScript interactions
Returns:
HTML string representation of the data
Raises:
TypeError: If schema is invalid and no batches are provided
"""
if not batches:
return "No data to display"
# Validate schema
if schema is None or not hasattr(schema, "__iter__"):
msg = "Schema must be provided"
raise TypeError(msg)
# Generate a unique ID if none provided
table_uuid = table_uuid or f"df-{id(batches)}"
# Build HTML components
html = []
# Only include styles and scripts if:
# 1. Not using shared styles, OR
# 2. Using shared styles but they haven't been loaded yet
include_styles = (
not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded
)
if include_styles:
html.extend(self._build_html_header())
# If we're using shared styles, mark them as loaded
if self.use_shared_styles:
DataFrameHtmlFormatter._styles_loaded = True
html.extend(self._build_table_container_start())
# Add table header and body
html.extend(self._build_table_header(schema))
html.extend(self._build_table_body(batches, table_uuid))
html.append("</table>")
html.append("</div>")
# Add footer (JavaScript and messages)
if include_styles and self.enable_cell_expansion:
html.append(self._get_javascript())
# Always add truncation message if needed (independent of styles)
if has_more and self.show_truncation_message:
html.append("<div>Data truncated due to size.</div>")
return "\n".join(html)
def _build_html_header(self) -> list[str]:
"""Build the HTML header with CSS styles."""
html = []
html.append("<style>")
# Only include expandable CSS if cell expansion is enabled
if self.enable_cell_expansion:
html.append(self._get_default_css())
if self.custom_css:
html.append(self.custom_css)
html.append("</style>")
return html
def _build_table_container_start(self) -> list[str]:
"""Build the opening tags for the table container."""
html = []
html.append(
f'<div style="width: 100%; max-width: {self.max_width}px; '
f"max-height: {self.max_height}px; overflow: auto; border: "
'1px solid #ccc;">'
)
html.append('<table style="border-collapse: collapse; min-width: 100%">')
return html
def _build_table_header(self, schema: Any) -> list[str]:
"""Build the HTML table header with column names."""
html = []
html.append("<thead>")
html.append("<tr>")
for field in schema:
if self._custom_header_builder:
html.append(self._custom_header_builder(field))
else:
html.append(
f"<th style='{self.style_provider.get_header_style()}'>"
f"{field.name}</th>"
)
html.append("</tr>")
html.append("</thead>")
return html
def _build_table_body(self, batches: list, table_uuid: str) -> list[str]:
"""Build the HTML table body with data rows."""
html = []
html.append("<tbody>")
row_count = 0
for batch in batches:
for row_idx in range(batch.num_rows):
row_count += 1
html.append("<tr>")
for col_idx, column in enumerate(batch.columns):
# Get the raw value from the column
raw_value = self._get_cell_value(column, row_idx)
# Always check for type formatters first to format the value
formatted_value = self._format_cell_value(raw_value)
# Then apply either custom cell builder or standard cell formatting
if self._custom_cell_builder:
# Pass both the raw value and formatted value to let the
# builder decide
cell_html = self._custom_cell_builder(
raw_value, row_count, col_idx, table_uuid
)
html.append(cell_html)
else:
# Standard cell formatting with formatted value
if (
len(str(raw_value)) > self.max_cell_length
and self.enable_cell_expansion
):
cell_html = self._build_expandable_cell(
formatted_value, row_count, col_idx, table_uuid
)
else:
cell_html = self._build_regular_cell(formatted_value)
html.append(cell_html)
html.append("</tr>")
html.append("</tbody>")
return html
def _get_cell_value(self, column: Any, row_idx: int) -> Any:
"""Extract a cell value from a column.
Args:
column: Arrow array
row_idx: Row index
Returns:
The raw cell value
"""
try:
value = column[row_idx]
if hasattr(value, "as_py"):
return value.as_py()
except (AttributeError, TypeError):
pass
else:
return value
def _format_cell_value(self, value: Any) -> str:
"""Format a cell value for display.
Uses registered type formatters if available.
Args:
value: The cell value to format
Returns:
Formatted cell value as string
"""
# Check for custom type formatters
for type_cls, formatter in self._type_formatters.items():
if isinstance(value, type_cls):
return formatter(value)
# If no formatter matched, return string representation
return str(value)
def _build_expandable_cell(
self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str
) -> str:
"""Build an expandable cell for long content."""
short_value = str(formatted_value)[: self.max_cell_length]
return (
f"<td style='{self.style_provider.get_cell_style()}'>"
f"<div class='expandable-container'>"
"<span class='expandable' "
f"id='{table_uuid}-min-text-{row_count}-{col_idx}'>"
f"{short_value}</span>"
"<span class='full-text' "
f"id='{table_uuid}-full-text-{row_count}-{col_idx}'>"
f"{formatted_value}</span>"
f"<button class='expand-btn' "
f"onclick=\"toggleDataFrameCellText('{table_uuid}',{row_count},{col_idx})\">"
f"...</button>"
f"</div>"
f"</td>"
)
def _build_regular_cell(self, formatted_value: str) -> str:
"""Build a regular table cell."""
return (
f"<td style='{self.style_provider.get_cell_style()}'>{formatted_value}</td>"
)
def _build_html_footer(self, has_more: bool) -> list[str]:
"""Build the HTML footer with JavaScript and messages."""
html = []
# Add JavaScript for interactivity only if cell expansion is enabled
# and we're not using the shared styles approach
if self.enable_cell_expansion and not self.use_shared_styles:
html.append(self._get_javascript())
# Add truncation message if needed
if has_more and self.show_truncation_message:
html.append("<div>Data truncated due to size.</div>")
return html
def _get_default_css(self) -> str:
"""Get default CSS styles for the HTML table."""
return """
.expandable-container {
display: inline-block;
max-width: 200px;
}
.expandable {
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
display: block;
}
.full-text {
display: none;
white-space: normal;
}
.expand-btn {
cursor: pointer;
color: blue;
text-decoration: underline;
border: none;
background: none;
font-size: inherit;
display: block;
margin-top: 5px;
}
"""
def _get_javascript(self) -> str:
"""Get JavaScript code for interactive elements."""
return """
<script>
function toggleDataFrameCellText(table_uuid, row, col) {
var shortText = document.getElementById(
table_uuid + "-min-text-" + row + "-" + col
);
var fullText = document.getElementById(
table_uuid + "-full-text-" + row + "-" + col
);
var button = event.target;
if (fullText.style.display === "none") {
shortText.style.display = "none";
fullText.style.display = "inline";
button.textContent = "(less)";
} else {
shortText.style.display = "inline";
fullText.style.display = "none";
button.textContent = "...";
}
}
</script>
"""
class FormatterManager:
"""Manager class for the global DataFrame HTML formatter instance."""
_default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter()
@classmethod
def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None:
"""Set the global DataFrame HTML formatter.
Args:
formatter: The formatter instance to use globally
"""
cls._default_formatter = formatter
_refresh_formatter_reference()
@classmethod
def get_formatter(cls) -> DataFrameHtmlFormatter:
"""Get the current global DataFrame HTML formatter.
Returns:
The global HTML formatter instance
"""
return cls._default_formatter
def get_formatter() -> DataFrameHtmlFormatter:
"""Get the current global DataFrame HTML formatter.
This function is used by the DataFrame._repr_html_ implementation to access
the shared formatter instance. It can also be used directly when custom
HTML rendering is needed.
Returns:
The global HTML formatter instance
Example:
>>> from datafusion.html_formatter import get_formatter
>>> formatter = get_formatter()
>>> formatter.max_cell_length = 50 # Increase cell length
"""
return FormatterManager.get_formatter()
def set_formatter(formatter: DataFrameHtmlFormatter) -> None:
"""Set the global DataFrame HTML formatter.
Args:
formatter: The formatter instance to use globally
Example:
>>> from datafusion.html_formatter import get_formatter, set_formatter
>>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100)
>>> set_formatter(custom_formatter)
"""
FormatterManager.set_formatter(formatter)
def configure_formatter(**kwargs: Any) -> None:
"""Configure the global DataFrame HTML formatter.
This function creates a new formatter with the provided configuration
and sets it as the global formatter for all DataFrames.
Args:
**kwargs: Formatter configuration parameters like max_cell_length,
max_width, max_height, enable_cell_expansion, etc.
Raises:
ValueError: If any invalid parameters are provided
Example:
>>> from datafusion.html_formatter import configure_formatter
>>> configure_formatter(
... max_cell_length=50,
... max_height=500,
... enable_cell_expansion=True,
... use_shared_styles=True
... )
"""
# Valid parameters accepted by DataFrameHtmlFormatter
valid_params = {
"max_cell_length",
"max_width",
"max_height",
"max_memory_bytes",
"min_rows_display",
"repr_rows",
"enable_cell_expansion",
"custom_css",
"show_truncation_message",
"style_provider",
"use_shared_styles",
}
# Check for invalid parameters
invalid_params = set(kwargs) - valid_params
if invalid_params:
msg = (
f"Invalid formatter parameters: {', '.join(invalid_params)}. "
f"Valid parameters are: {', '.join(valid_params)}"
)
raise ValueError(msg)
# Create and set formatter with validated parameters
set_formatter(DataFrameHtmlFormatter(**kwargs))
def reset_formatter() -> None:
"""Reset the global DataFrame HTML formatter to default settings.
This function creates a new formatter with default configuration
and sets it as the global formatter for all DataFrames.
Example:
>>> from datafusion.html_formatter import reset_formatter
>>> reset_formatter() # Reset formatter to default settings
"""
formatter = DataFrameHtmlFormatter()
# Reset the styles_loaded flag to ensure styles will be reloaded
DataFrameHtmlFormatter._styles_loaded = False
set_formatter(formatter)
def reset_styles_loaded_state() -> None:
"""Reset the styles loaded state to force reloading of styles.
This can be useful when switching between notebook sessions or
when styles need to be refreshed.
Example:
>>> from datafusion.html_formatter import reset_styles_loaded_state
>>> reset_styles_loaded_state() # Force styles to reload in next render
"""
DataFrameHtmlFormatter._styles_loaded = False
def _refresh_formatter_reference() -> None:
"""Refresh formatter reference in any modules using it.
This helps ensure that changes to the formatter are reflected in existing
DataFrames that might be caching the formatter reference.
"""
# This is a no-op but signals modules to refresh their reference