in generation/llm_swarm_script.py [0:0]
def create_tasks(dataset: IterableDataset, prompt_id: Optional[int] = None, n_overlap: int = 2) -> List[Dict[str, Any]]:
"""
Processes a dataset to generate question and answer pairs for each sample.
Args:
dataset (IterableDataset): The dataset containing samples.
prompt_id (Optional[int]): The ID of the prompt template to use for generating questions. If set to None, prompt_id is random.
n_overlap (int): The number of overlapping pages between consecutive chunks.
num_samples (int): The number of samples to process.
Returns:
List[Dict[str, Any]]: A list of dictionaries containing the sample key, page count, generated Q/A pairs, and prompt ID.
"""
    # Use the explicitly requested prompt template for every chunk, if one was given.
    if prompt_id is not None:
        selected_id_prompt = prompt_id

    tasks = []

    for index, sample in dataset.iterrows():
        text_per_page = extract_text_per_page_from_sample(sample)

        # Skip documents that exceed the maximum page budget.
        if len(text_per_page) > MAX_PAGES_PER_PDF:
            continue

        # Split the document into overlapping page groups that fit the token and page budgets.
        page_chunks = extract_chunks(text_per_page, max_tokens_per_group=5000, max_pages_per_group=5, n_overlap=n_overlap)

        for chunk in page_chunks:
            # Pick a random prompt template per chunk when no fixed prompt_id was provided.
            if prompt_id is None:
                selected_id_prompt = random.randint(0, 4)

            prompt_template = PROMPTS[selected_id_prompt]
            messages = create_llm_prompt(prompt_template, chunk)

            # Render the chat messages into a single generation-ready prompt string.
            prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )

            tasks_dict = {
                "__key__": sample['__key__'],
                "Page count": len(text_per_page),
                "messages": prompt,
                "Prompt ID": selected_id_prompt,
            }

            tasks.append(tasks_dict)

    return tasks
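
A minimal usage sketch. It assumes the surrounding script already defines PROMPTS, tokenizer, MAX_PAGES_PER_PDF, and the helper functions used above; the parquet path, sample limit, and output file below are hypothetical illustrations, not part of the original script. Note that although the signature annotates `dataset` as an IterableDataset, the function iterates with `.iterrows()`, so this sketch feeds it a pandas DataFrame.

import json
import pandas as pd

# Hypothetical input: a DataFrame of PDF samples with a '__key__' column,
# iterated via .iterrows() as create_tasks expects.
dataset = pd.read_parquet("pdf_samples.parquet").head(100)

# Build one task per chunk, choosing a random prompt template for each chunk.
tasks = create_tasks(dataset, prompt_id=None, n_overlap=2)

# Each task carries a rendered prompt string ready to be sent to the LLM swarm.
print(f"Created {len(tasks)} tasks")
with open("tasks.json", "w") as f:
    json.dump(tasks, f)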