in clean_and_create/load_data.py [0:0]
def process_group(key_group):
    """Build one dataset record from a ``(key, group)`` pair.

    Every row of ``group`` contributes a candidate question/answer pair. A
    leading ``Q<n>: `` label is stripped from the question and a leading
    ``A<n>: `` label from the answer; the pair is kept only when both
    cleaned texts pass ``is_valid_question_or_answer``.

    Args:
        key_group: Two-item iterable ``(key, group)``. ``group`` must offer
            ``iterrows()`` yielding rows indexable by ``'question'``,
            ``'answer'`` and ``'__key__'``, and a ``'pdf'`` column that
            supports ``.iloc[0]`` (presumably a pandas DataFrame produced
            by a groupby -- confirm against the caller).

    Returns:
        A dict ``{"texts": [...], "images": <first 'pdf' value>}`` when at
        least one pair is kept. ``None`` when no pair survives validation
        or when any exception is raised while processing the group (the
        error is printed, not propagated).
    """
    # Bind `key` before the try block: if unpacking `key_group` itself
    # fails, the handler below still needs a value to report. Without this
    # the handler raised UnboundLocalError and masked the real error.
    key = None
    try:
        key, group = key_group
        qa_pairs = []
        for _, row in group.iterrows():
            question = re.sub(r'^Q\d+: ', '', row['question'])
            answer = re.sub(r'^A\d+: ', '', row['answer'])
            if is_valid_question_or_answer(question) and is_valid_question_or_answer(answer):
                qa_pairs.append({
                    "user": question,
                    "assistant": answer,
                    "source": "PDFA key: " + str(row['__key__']),
                })
        if qa_pairs:
            return {
                "texts": qa_pairs,
                # All rows of a group share one document; take the first.
                "images": group['pdf'].iloc[0],
            }
    except Exception as e:
        # Best-effort boundary: one bad group is reported and skipped
        # instead of aborting the whole run.
        print(f"Error processing group {key}: {e}")
    return None