in glan-instruct/glan.py [0:0]
def glan_instruction_generation(args):
    """Run the end-to-end GLAN synthetic-instruction pipeline.

    Steps:
      1. Obtain a list of disciplines — either generate a taxonomy of human
         knowledge with an LLM, or load a pre-defined list from a text file.
      2. For each discipline, generate subjects; for each subject, generate a
         syllabus (class sessions + key concepts).
      3. Generate questions from each syllabus and write them all to a JSONL
         file under ``args.output_dir``.
      4. Unless ``args.generate_question_only`` is set, generate answers and
         write the surviving question/answer pairs ("instructions") to a
         second JSONL file.

    Args:
        args: Parsed CLI namespace. The attributes read below (model names,
            iteration counts, batch sizes, output directory, logfile name,
            etc.) document the expected fields.
    """
    GENERATE_DISCIPLINES = args.generate_disciplines
    GENERATE_QUESTION_ONLY = args.generate_question_only
    DISCIPLINES_FILEPATH = args.disciplines_filepath
    LANGUAGE = args.language
    MODEL_NAME = args.model_name
    MODEL_NAME_FOR_ANSWER = args.model_name_for_answer
    MAX_NUMBER_OF_FIELDS = args.max_number_of_fields
    MAX_NUMBER_OF_SUBJECTS = args.max_number_of_subjects
    MAX_NUMBER_OF_SUBTOPICS = args.max_number_of_subtopics
    MAX_NUMBER_OF_SESSION_NAME = args.max_number_of_session_name
    NUM_ITERATIONS = args.num_iterations
    NUM_QUESTIONS_PER_ITERATION = args.num_questions_per_iteration
    QUESTION_MAX_TOKENS = args.question_max_tokens
    QUESTION_BATCH_SIZE = args.question_batch_size  # local rename: fixed "BACTH" typo
    ANSWER_MAX_TOKENS = args.answer_max_tokens
    ANSWER_BATCH_SIZE = args.answer_batch_size  # local rename: fixed "BACTH" typo
    OUTPUT_DIR = args.output_dir
    UUID = str(uuid.uuid4())[:4]  # short run id embedded in the output filenames

    set_logger(args.logfile_name)

    # Echo the full configuration so every run is reproducible from its log.
    # (Same "NAME = value" line format as before, emitted from one loop.)
    for cfg_name, cfg_value in [
        ("GENERATE_DISCIPLINES", GENERATE_DISCIPLINES),
        ("GENERATE_QUESTION_ONLY", GENERATE_QUESTION_ONLY),
        ("DISCIPLINES_FILEPATH", DISCIPLINES_FILEPATH),
        ("LANGUAGE", LANGUAGE),
        ("MODEL_NAME", MODEL_NAME),
        ("MODEL_NAME_FOR_ANSWER", MODEL_NAME_FOR_ANSWER),
        ("MAX_NUMBER_OF_FIELDS", MAX_NUMBER_OF_FIELDS),
        ("MAX_NUMBER_OF_SUBJECTS", MAX_NUMBER_OF_SUBJECTS),
        ("MAX_NUMBER_OF_SUBTOPICS", MAX_NUMBER_OF_SUBTOPICS),
        ("MAX_NUMBER_OF_SESSION_NAME", MAX_NUMBER_OF_SESSION_NAME),
        ("NUM_ITERATIONS", NUM_ITERATIONS),
        ("NUM_QUESTIONS_PER_ITERATION", NUM_QUESTIONS_PER_ITERATION),
        ("QUESTION_MAX_TOKENS", QUESTION_MAX_TOKENS),
        ("QUESTION_BACTH_SIZE", QUESTION_BATCH_SIZE),
        ("QUESTION_MAX_TOKENS", QUESTION_MAX_TOKENS) if False else ("ANSWER_MAX_TOKENS", ANSWER_MAX_TOKENS),
        ("ANSWER_BACTH_SIZE", ANSWER_BATCH_SIZE),
        ("OUTPUT_DIR", OUTPUT_DIR),
    ]:
        logger.info(f"{cfg_name} = {cfg_value}")

    # Overall timer for the whole question-generation phase. The original code
    # clobbered this inside the taxonomy branch, restarting the total timer;
    # taxonomy timing now uses its own local below.
    t0 = time.time()
    all_questions = []

    if GENERATE_DISCIPLINES:
        logger.info("===== Generate a Taxonomy of human knowledge and capabilities")
        # NOTE(review): taxonomy generation is pinned to "gpt-4o" rather than
        # MODEL_NAME — presumably intentional (a stronger model for the root
        # taxonomy); confirm before changing.
        taxonomy_start = time.time()
        taxonomy_json, disciplines = generate_taxonomy(
            max_number_of_fields=MAX_NUMBER_OF_FIELDS,
            model_name="gpt-4o",
            temperature=0.5,
        )
        logger.info(f"Generating taxonomy took {time.time() - taxonomy_start:.4f} seconds.")
    else:
        logger.info("===== Load pre-defined disciplines")
        disciplines = read_text_to_list(DISCIPLINES_FILEPATH)

    for idx1, discipline in enumerate(disciplines):
        logger.info("====================================================================================================")
        logger.info(f"===== [Discipline {idx1}] Generating Subjects for discipline: {discipline}")
        logger.info("====================================================================================================")
        subjects_json = generate_subjects(
            discipline,
            max_number_of_subjects=MAX_NUMBER_OF_SUBJECTS,
            max_number_of_subtopics=MAX_NUMBER_OF_SUBTOPICS,
            model_name=MODEL_NAME,
            temperature=1.0,
            top_p=0.95,
        )
        logger.info(f"Number of subjects is {len(subjects_json['subjects'])}")

        for idx2, s in enumerate(subjects_json["subjects"]):
            subject = s['subject']
            level = s['level']
            subtopics = ", ".join(s['subtopics'])
            logger.info("\t====================================================================================================")
            logger.info(f"\t===== [Subject {idx2}] Generating Syllabus: Discipline: {discipline} - Subject: {subject} - Level: {level}")
            logger.info("\t====================================================================================================")
            class_sessions, key_concepts = generate_syllabus(
                subject,
                level,
                subtopics,
                max_number_of_session_name=MAX_NUMBER_OF_SESSION_NAME,
                model_name=MODEL_NAME,
                temperature=1.0,
                top_p=0.95,
            )
            logger.info(f"\tNumber of class sessions is {len(class_sessions)}")
            questions = generate_questions(
                class_sessions,
                key_concepts,
                subject,
                level,
                subtopics,
                model_name=MODEL_NAME,
                num_iterations=NUM_ITERATIONS,
                num_questions_per_iteration=NUM_QUESTIONS_PER_ITERATION,
                max_tokens=QUESTION_MAX_TOKENS,
                batch_size=QUESTION_BATCH_SIZE,
                language=LANGUAGE,
            )
            all_questions.extend(questions)

    timespan = format_timespan(time.time() - t0)
    logger.info(f"Generating Question dataset took {timespan}")

    # Persist all questions as JSONL; the filename encodes language, sample
    # count and the short run id.
    num_questions = len(all_questions)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    filename = f"{OUTPUT_DIR}/GLAN_Questions_{LANGUAGE}_{num_questions}_Samples_{UUID}.jsonl"
    with jsonlines.open(filename, mode='w') as writer:
        for question in all_questions:
            writer.write(question)

    if not GENERATE_QUESTION_ONLY:
        all_answers = generate_answers(
            all_questions,
            model_name=MODEL_NAME_FOR_ANSWER,
            max_tokens=ANSWER_MAX_TOKENS,
            batch_size=ANSWER_BATCH_SIZE,
        )
        instructions = []
        for q, a in zip(all_questions, all_answers):
            # Drop answers where the model declined.
            # BUG FIX: the original tested `a not in "DO NOT KNOW"`, i.e.
            # whether the *answer* is a substring of the refusal phrase —
            # which discarded short legitimate answers (e.g. "DO NOT") and
            # kept refusals like "I DO NOT KNOW the answer". Containment is
            # now tested the right way round.
            if "DO NOT KNOW" not in a:
                q.update({"answer": a})
                instructions.append(q)
        num_instructions = len(instructions)
        new_filename = filename.replace("Questions", "Instructions")
        with jsonlines.open(new_filename, mode='w') as writer:
            for instruction in instructions:
                writer.write(instruction)