def extract_chunks()

in generation/llm_swarm_script.py


from typing import Any, List


def extract_chunks(pages: List[Any], max_tokens_per_group: int, max_pages_per_group: int, n_overlap: int) -> List[str]:
    """
    Splits a list of pages into chunks with a specified maximum number of tokens per chunk,
    a maximum number of pages per chunk, and overlap between chunks.

    Args:
        pages (List[Any]): The list of pages to be chunked.
        max_tokens_per_group (int): The maximum number of tokens allowed per chunk.
        max_pages_per_group (int): The maximum number of pages allowed per chunk.
        n_overlap (int): The number of overlapping pages between consecutive chunks.

    Returns:
        List[str]: A list of chunked text, each chunk containing text from multiple pages.
    """
    chunks = []
    current_chunk = []
    current_chunk_tokens = 0
    current_chunk_pages = 0
    # NOTE: `tokenizer` is read as a module-level global defined elsewhere in the script.
    page_token_counts = [len(tokenizer.encode(page, add_special_tokens=False)) for page in pages]
    
    for i, page in enumerate(pages):
        page_tokens = page_token_counts[i]
        if page_tokens > max_tokens_per_group:
            # A single page that exceeds the token budget cannot be chunked, so skip the whole document.
            print(f"Skipping document: page {i} has {page_tokens} tokens, which exceeds max_tokens_per_group.")
            return []
        
        if (current_chunk_tokens + page_tokens > max_tokens_per_group) or (current_chunk_pages + 1 > max_pages_per_group):
            # Flush the current chunk, then seed the next one with the last n_overlap pages.
            # Note: the retained overlap plus the incoming page is appended without
            # re-checking the limits, so a chunk can exceed the budgets when n_overlap
            # is large relative to them.
            if current_chunk:
                chunks.append('\nNEW PAGE\n'.join(current_chunk))
            current_chunk = current_chunk[-n_overlap:] if n_overlap > 0 else []
            # Recompute the token count from the pages actually retained as overlap.
            current_chunk_tokens = sum(page_token_counts[i - len(current_chunk):i])
            current_chunk_pages = len(current_chunk)
        
        current_chunk.append(page)
        current_chunk_tokens += page_tokens
        current_chunk_pages += 1
    
    if current_chunk:
        chunks.append('\nNEW PAGE\n'.join(current_chunk))
    
    return chunks
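

A minimal usage sketch for reference. The checkpoint name and page strings below are illustrative assumptions; the function itself only requires that a `tokenizer` exposing `encode(..., add_special_tokens=False)` be defined at module level before it is called.

# Usage sketch -- the model checkpoint is an illustrative assumption,
# not necessarily the one used in generation/llm_swarm_script.py.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

pages = [
    "Page one text ...",
    "Page two text ...",
    "Page three text ...",
]

# At most 512 tokens and 2 pages per chunk, with 1 page of overlap
# carried from each chunk into the next.
chunks = extract_chunks(pages, max_tokens_per_group=512, max_pages_per_group=2, n_overlap=1)
for n, chunk in enumerate(chunks):
    print(f"chunk {n}: {len(tokenizer.encode(chunk, add_special_tokens=False))} tokens")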