# create_tasks()
# from generation/llm_swarm_script.py [0:0]


def create_tasks(dataset: IterableDataset, prompt_id: Optional[int] = None, n_overlap: int = 2) -> List[Dict[str, Any]]:
    """
    Build LLM generation tasks (rendered chat prompts) for each sample in the dataset.

    Each sample's text is extracted page by page, split into overlapping chunks,
    and every chunk is rendered into a chat-template prompt using one of the
    templates in ``PROMPTS``.

    Args:
        dataset (IterableDataset): The dataset containing samples. NOTE(review): it is
            iterated with ``iterrows()`` (a pandas-style API), so the type hint may be
            wrong — confirm the actual type against the caller.
        prompt_id (Optional[int]): The ID of the prompt template to use for generating
            questions. If None, a template is chosen uniformly at random for each chunk.
        n_overlap (int): The number of overlapping pages between consecutive chunks.

    Returns:
        List[Dict[str, Any]]: One dict per chunk with the sample key, page count,
        rendered prompt string (under "messages"), and the prompt ID used.
    """
    tasks = []

    # `index` from iterrows() is unused, so discard it.
    for _, sample in dataset.iterrows():
        text_per_page = extract_text_per_page_from_sample(sample)
        # Skip documents that are too long to process.
        if len(text_per_page) > MAX_PAGES_PER_PDF:
            continue
        page_chunks = extract_chunks(
            text_per_page,
            max_tokens_per_group=5000,
            max_pages_per_group=5,
            n_overlap=n_overlap,
        )

        for chunk in page_chunks:
            if prompt_id is None:
                # Derive the bound from PROMPTS instead of hard-coding 4, so adding
                # templates automatically extends the random choice.
                selected_id_prompt = random.randrange(len(PROMPTS))
            else:
                selected_id_prompt = prompt_id
            prompt_template = PROMPTS[selected_id_prompt]
            messages = create_llm_prompt(prompt_template, chunk)
            # Render the chat messages into a single prompt string for generation.
            rendered_prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )

            tasks.append({
                "__key__": sample['__key__'],
                "Page count": len(text_per_page),
                "messages": rendered_prompt,
                "Prompt ID": selected_id_prompt,
            })

    return tasks