evolve-instruct/convert.py (34 lines of code) (raw):
import json
import re
import argparse
def transform_jsonl(input_file_path, output_file_path):
    """Convert a chat-style JSONL file into seed instruction records.

    Each non-blank input line is parsed as a JSON object with a
    ``messages`` list whose items carry ``role`` and ``content`` keys.
    The skill name is taken from the first ``system`` message by matching
    the phrase ``SME (Subject Matter Expert) in <skill>``.  For every
    ``user`` message in a conversation whose skill could be extracted, one
    record is emitted with the keys ``idx`` (1-based running counter),
    ``Skill``, ``Difficulty`` (always 5) and ``Instruction``.

    Conversations without a system message, or whose system message does
    not contain the expected phrase, contribute no records.

    Args:
        input_file_path: Path of the JSONL file to read (a UTF-8 BOM is
            tolerated).
        output_file_path: Path of the JSONL file to write (UTF-8, one
            record per line, non-ASCII characters kept as-is).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a conversation lacks ``messages`` or a message lacks
            ``role``/``content``.
    """
    # Compiled once up front instead of being looked up for every line.
    skill_pattern = re.compile(r'SME \(Subject Matter Expert\) in ([\w\s]+)')

    output_data = []
    idx = 1
    with open(input_file_path, 'r', encoding='utf-8-sig') as infile:
        for line in infile:
            # Blank or whitespace-only lines are not JSON documents and
            # would make json.loads raise; skip them rather than abort the
            # whole conversion.
            if not line.strip():
                continue

            conversation = json.loads(line)
            messages = conversation['messages']

            # Extract skill string in "system" message
            skill = None
            system_message = next(
                (msg for msg in messages if msg['role'] == 'system'), None
            )
            if system_message:
                match = skill_pattern.search(system_message['content'])
                if match:
                    skill = match.group(1).strip()

            # Without a (non-empty) skill nothing can be emitted for this
            # conversation, so there is no need to scan its messages.
            if not skill:
                continue

            # Extract "user" message
            for message in messages:
                if message['role'] == 'user':
                    output_data.append({
                        "idx": idx,
                        "Skill": skill,
                        "Difficulty": 5,
                        "Instruction": message['content'],
                    })
                    idx += 1

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        for entry in output_data:
            outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print(f"Transformed {len(output_data)} entries. Please run evolve.py to generate the augmented dataset.")
if __name__ == "__main__":
    # Command-line entry point: read the input/output locations from the
    # arguments (both have defaults) and run the conversion once.
    cli = argparse.ArgumentParser(description='Options')
    cli.add_argument(
        "--input_file_path",
        type=str,
        default="../seed/dataset/advertising-multiple-oai.jsonl",
        help="Path to the input JSONL file",
    )
    cli.add_argument(
        "--output_file_path",
        type=str,
        default="seed.jsonl",
        help="Path to the output JSONL file",
    )
    options = cli.parse_args()
    transform_jsonl(options.input_file_path, options.output_file_path)