seed/util/common_utils.py (68 lines of code) (raw):
import os
import shutil
import json
import random
def get_language_code(language_name):
languages = {
"English": "en",
"Korean": "ko",
"Spanish": "es",
"French": "fr",
"German": "de",
"Chinese": "zh",
"Japanese": "ja",
"Russian": "ru",
"Portuguese": "pt",
"Italian": "it",
"Arabic": "ar",
"Hindi": "hi",
"Bengali": "bn",
"Punjabi": "pa",
"Javanese": "jv",
"Turkish": "tr",
"Vietnamese": "vi",
"Persian": "fa",
"Polish": "pl",
"Dutch": "nl",
}
return languages.get(language_name, "Unknown")
def delete_folder_and_make_folder(output_dir):
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
print(f"The folder '{output_dir}' and its contents have been deleted.")
os.makedirs(output_dir, exist_ok=True)
def convert_to_oai_format(qa_pair, system_prompt_msg="You're an AI assistant that guides a user to the location of your CS center"):
"""
Convert the QA pair to the jsonl format required by the OpenAI API.
Args:
qa_pair: list of dictionaries or list of lists containing the QA pairs
system_prompt_msg: message to be displayed as the system prompt
Returns:
formatted_data: jsonl format data for OpenAI API
"""
if isinstance(qa_pair, list):
formatted_data = []
for qa in qa_pair:
sample = [{"role": "system", "content": system_prompt_msg}]
if isinstance(qa, list): # multi-turn
for qa_ in qa:
if isinstance(qa_, dict):
user_message = {"role": "user", "content": qa_["QUESTION"]}
assistant_message = {"role": "assistant", "content": qa_["ANSWER"]}
else:
user_message = {"role": "user", "content": qa_[0]}
assistant_message = {"role": "assistant", "content": qa_[1]}
sample.append(user_message)
sample.append(assistant_message)
else: # single turn
if isinstance(qa, dict):
user_message = {"role": "user", "content": qa["QUESTION"]}
assistant_message = {"role": "assistant", "content": qa["ANSWER"]}
else:
user_message = {"role": "user", "content": qa[0]}
assistant_message = {"role": "assistant", "content": qa[1]}
sample.append(user_message)
sample.append(assistant_message)
msg = {"messages": sample}
formatted_data.append(msg)
random.shuffle(formatted_data)
return formatted_data
else:
print("Argument is not a list")
return None
def save_jsonl(dictionary_data, file_path):
with open(file_path, 'w', encoding='UTF-8-sig') as f:
for entry in dictionary_data:
f.write(json.dumps(entry, ensure_ascii=False) + "\n")