in evolve-instruct/convert.py [0:0]
def transform_jsonl(input_file_path, output_file_path):
    """Convert a JSONL file of chat conversations into instruction records.

    Each non-blank input line is parsed as a JSON object with a ``messages``
    list whose items carry ``role`` and ``content`` keys. The skill name is
    taken from the first ``system`` message, as the run of word/whitespace
    characters following ``SME (Subject Matter Expert) in``. For every
    ``user`` message in a conversation that yielded a skill, one record
    ``{"idx", "Skill", "Difficulty", "Instruction"}`` is produced; ``idx``
    counts emitted records starting at 1 and ``Difficulty`` is always 5.
    Conversations without a recognizable skill contribute nothing.

    The records are written to ``output_file_path`` as UTF-8 JSONL (one JSON
    object per line, non-ASCII characters kept verbatim), and a summary line
    is printed.

    Args:
        input_file_path: Path of the source JSONL file (read as utf-8-sig).
        output_file_path: Path of the JSONL file to create or overwrite.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a conversation or message lacks an expected key.
    """
    # Compile once; the same pattern is applied to every conversation.
    skill_pattern = re.compile(r'SME \(Subject Matter Expert\) in ([\w\s]+)')

    output_data = []
    idx = 1

    # 'utf-8-sig' transparently drops a leading byte-order mark if present.
    with open(input_file_path, 'r', encoding='utf-8-sig') as infile:
        for line in infile:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # JSON documents; skip them instead of failing in json.loads.
            if not line.strip():
                continue

            conversation = json.loads(line)
            messages = conversation['messages']

            # Extract skill string in "system" message
            skill = None
            system_message = next(
                (msg for msg in messages if msg['role'] == 'system'), None
            )
            if system_message:
                match = skill_pattern.search(system_message['content'])
                if match:
                    skill = match.group(1).strip()

            # Without a skill no user message of this conversation is emitted.
            if not skill:
                continue

            # Extract "user" message
            for message in messages:
                if message['role'] == 'user':
                    output_data.append({
                        "idx": idx,
                        "Skill": skill,
                        "Difficulty": 5,
                        "Instruction": message['content']
                    })
                    idx += 1

    # Output is written only after the input has been fully read and closed.
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        for entry in output_data:
            outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print(f"Transformed {len(output_data)} entries. Please run evolve.py to generate the augmented dataset.")