def process_group()

in create_only_with_pdfs/load_data.py [0:0]


def process_group(key_group):
    """Build one question/answer record from a ``(key, group)`` pair.

    Each row of ``group`` contributes a pair when both its question and its
    answer pass ``is_valid_question_or_answer`` after a leading
    ``Q<digits>: `` / ``A<digits>: `` label has been stripped.

    Args:
        key_group: A two-item iterable ``(key, group)``. ``key`` is only used
            in the error message. ``group`` must provide ``iterrows()`` and
            ``group['pdf'].iloc[0]`` -- presumably a pandas DataFrame coming
            from a ``groupby``; confirm against the caller. Every row is read
            for the ``'question'``, ``'answer'`` and ``'__key__'`` fields.

    Returns:
        A dict ``{"texts": [...], "pdf": ...}`` where ``"texts"`` is the list
        of ``{"user", "assistant", "source"}`` dicts and ``"pdf"`` is the
        first value of the group's ``'pdf'`` column. Returns ``None`` when no
        pair passes validation, or when any exception is raised while
        processing (the error is printed and swallowed, not re-raised).
    """
    # Bind ``key`` before the try block: if unpacking ``key_group`` itself
    # fails, the handler below still needs a value to format its message
    # (otherwise it would raise UnboundLocalError instead of returning None).
    key = None
    try:
        key, group = key_group
        qa_pairs = []
        for _, row in group.iterrows():
            # Drop a leading "Q<n>: " / "A<n>: " label when present.
            question = re.sub(r'^Q\d+: ', '', row['question'])
            answer = re.sub(r'^A\d+: ', '', row['answer'])
            if is_valid_question_or_answer(question) and is_valid_question_or_answer(answer):
                qa_pairs.append({
                    "user": question,
                    "assistant": answer,
                    "source": "PDFA key: " + str(row['__key__'])
                })
        if qa_pairs:
            return {
                "texts": qa_pairs,
                "pdf": group['pdf'].iloc[0]
            }
        # No row survived validation: nothing to emit for this group.
        return None
    except Exception as e:
        # Best-effort boundary: report the failing group and skip it rather
        # than aborting the caller's whole run.
        print(f"Error processing group {key}: {e}")
        return None