in generation/llm_swarm_script.py [0:0]
def extract_text_per_page_from_sample(sample: Dict[str, Any]) -> List[str]:
    """
    Build one text string per page of a sample.

    For every page found under ``sample['json']['pages']``, the entries of
    ``page['lines']['text']`` are joined with the separator ``' \\n '``
    (space, newline, space). Page order is preserved, and a page with no
    lines yields an empty string.

    Args:
        sample (Dict[str, Any]): Mapping that holds the page data under
            ``sample['json']['pages']``; each page must expose its lines as
            an iterable of strings under ``page['lines']['text']``.

    Returns:
        List[str]: One joined string per page, in the order the pages
        appear in the sample.

    Raises:
        KeyError: If ``'json'``, ``'pages'``, ``'lines'`` or ``'text'`` is
            missing from the corresponding mapping.
        TypeError: If a page's line entries are not all strings
            (raised by ``str.join``).
    """
    # The separator is kept exactly as ' \n ' so that downstream consumers
    # see the same per-page text layout as before.
    return [
        ' \n '.join(page['lines']['text'])
        for page in sample['json']['pages']
    ]