in src/pixparse/data/preprocess.py [0:0]
def get_next_valid_page_index(current_index: int, num_pages: int, anno: dict, retries: int = 10) -> int:
    """
    Get the index of the next page after 'current_index' which contains text.

    Pages are scanned in order starting at 'current_index + 1', wrapping around
    to page 0 after the last page. The page at 'current_index' itself is only
    examined if the scan wraps all the way around to it.

    Parameters:
    current_index (int): Current page index; the search starts at the page after it.
    num_pages (int): Total number of pages.
    anno (dict): The annotation dictionary which contains the 'pages'. Each entry of
        anno['pages'] is indexed with 'text' and counts as non empty when that value
        is truthy.
    retries (int): Number of maximum retries for a given document.

    Returns:
    int: The index of the next non empty page.

    Raises:
    RuntimeError: If the document has no pages, or if no non empty page is found
        within the allowed number of attempts.
    """
    # Guard the modulo below: an empty document would otherwise surface as a
    # ZeroDivisionError instead of the RuntimeError this function documents.
    if num_pages <= 0:
        raise RuntimeError(f"No non-empty page found: document has {num_pages} pages")

    # After num_pages steps every page has been visited exactly once, so any
    # further attempt would only re-check pages already known to be empty.
    attempts = min(retries, num_pages)
    for _ in range(attempts):
        # Advance to the next page, wrapping to 0 past the end (the starting
        # index may be anywhere in the document, e.g. randomly initialised).
        current_index = (current_index + 1) % num_pages
        if anno['pages'][current_index]['text']:
            return current_index
    raise RuntimeError(f"No non-empty page found after {retries} attempts")