in generation/llm_swarm_script.py [0:0]
def extract_chunks(pages: List[Any], max_tokens_per_group: int, max_pages_per_group: int, n_overlap: int) -> List[str]:
    """
    Split a list of pages into chunks bounded by a token budget and a page
    budget, with a fixed number of pages overlapping between consecutive
    chunks.

    Args:
        pages (List[Any]): The pages (text strings) to be chunked.
        max_tokens_per_group (int): Maximum number of tokens allowed per chunk.
        max_pages_per_group (int): Maximum number of pages allowed per chunk.
        n_overlap (int): Number of pages shared between consecutive chunks.

    Returns:
        List[str]: Chunks of text, each joining one or more pages with a
        '\\nNEW PAGE\\n' separator. Returns [] when any single page alone
        exceeds max_tokens_per_group (the whole document is skipped).
    """
    chunks: List[str] = []
    current_chunk: List[Any] = []
    current_chunk_tokens = 0
    current_chunk_pages = 0
    # Tokenize every page once up front so overlap bookkeeping can reuse the
    # counts. NOTE(review): relies on a module-level `tokenizer` — confirm it
    # is initialized before this is called.
    page_token_counts = [len(tokenizer.encode(page, add_special_tokens=False)) for page in pages]
    for i, page in enumerate(pages):
        page_tokens = page_token_counts[i]
        if page_tokens > max_tokens_per_group:
            # A single page that cannot fit in any chunk makes the whole
            # document unchunkable under this budget; skip it entirely.
            print(f"Skipping document where page nr {i} has {page_tokens} tokens.")
            return []
        # Flush the current chunk if adding this page would break either budget.
        if (current_chunk_tokens + page_tokens > max_tokens_per_group) or (current_chunk_pages + 1 > max_pages_per_group):
            if current_chunk:
                chunks.append('\nNEW PAGE\n'.join(current_chunk))
                # Keep the last n_overlap pages as the seed of the next chunk.
                current_chunk = current_chunk[-n_overlap:] if n_overlap > 0 else []
                # BUGFIX: count tokens only for the pages actually retained.
                # The previous slice `page_token_counts[max(0, i - n_overlap):i]`
                # overcounted whenever the flushed chunk held fewer than
                # n_overlap pages, inflating the running token total and
                # flushing subsequent chunks too early. Since the chunk always
                # ends at page i-1 contiguously, the kept pages are exactly
                # indices [i - len(current_chunk), i).
                current_chunk_tokens = sum(page_token_counts[i - len(current_chunk):i])
                current_chunk_pages = len(current_chunk)
        current_chunk.append(page)
        current_chunk_tokens += page_tokens
        current_chunk_pages += 1
    # Emit the trailing partial chunk, if any.
    if current_chunk:
        chunks.append('\nNEW PAGE\n'.join(current_chunk))
    return chunks