def glan_instruction_generation()

in glan-instruct/glan.py [0:0]


def glan_instruction_generation(args):
    """
    GLAN Pipeline
    """
    GENERATE_DISCIPLINES = args.generate_disciplines
    GENERATE_QUESTION_ONLY = args.generate_question_only
    DISCIPLINES_FILEPATH = args.disciplines_filepath
    LANGUAGE = args.language
    MODEL_NAME = args.model_name
    MODEL_NAME_FOR_ANSWER = args.model_name_for_answer
    MAX_NUMBER_OF_FIELDS = args.max_number_of_fields
    MAX_NUMBER_OF_SUBJECTS = args.max_number_of_subjects
    MAX_NUMBER_OF_SUBTOPICS = args.max_number_of_subtopics
    MAX_NUMBER_OF_SESSION_NAME = args.max_number_of_session_name
    NUM_ITERATIONS = args.num_iterations
    NUM_QUESTIONS_PER_ITERATION = args.num_questions_per_iteration
    QUESTION_MAX_TOKENS = args.question_max_tokens
    QUESTION_BACTH_SIZE = args.question_batch_size
    ANSWER_MAX_TOKENS = args.answer_max_tokens
    ANSWER_BACTH_SIZE = args.answer_batch_size
    OUTPUT_DIR = args.output_dir
    UUID = str(uuid.uuid4())[:4]

    set_logger(args.logfile_name)

    logger.info(f"GENERATE_DISCIPLINES = {GENERATE_DISCIPLINES}")
    logger.info(f"GENERATE_QUESTION_ONLY = {GENERATE_QUESTION_ONLY}")    
    logger.info(f"DISCIPLINES_FILEPATH = {DISCIPLINES_FILEPATH}")
    logger.info(f"LANGUAGE = {LANGUAGE}")
    logger.info(f"MODEL_NAME = {MODEL_NAME}")
    logger.info(f"MODEL_NAME_FOR_ANSWER = {MODEL_NAME_FOR_ANSWER}")
    logger.info(f"MAX_NUMBER_OF_FIELDS = {MAX_NUMBER_OF_FIELDS}")
    logger.info(f"MAX_NUMBER_OF_SUBJECTS = {MAX_NUMBER_OF_SUBJECTS}")
    logger.info(f"MAX_NUMBER_OF_SUBTOPICS = {MAX_NUMBER_OF_SUBTOPICS}")
    logger.info(f"MAX_NUMBER_OF_SESSION_NAME = {MAX_NUMBER_OF_SESSION_NAME}")
    logger.info(f"NUM_ITERATIONS = {NUM_ITERATIONS}")
    logger.info(f"NUM_QUESTIONS_PER_ITERATION = {NUM_QUESTIONS_PER_ITERATION}")
    logger.info(f"QUESTION_MAX_TOKENS = {QUESTION_MAX_TOKENS}")
    logger.info(f"QUESTION_BACTH_SIZE = {QUESTION_BACTH_SIZE}")
    logger.info(f"ANSWER_MAX_TOKENS = {ANSWER_MAX_TOKENS}")
    logger.info(f"ANSWER_BACTH_SIZE = {ANSWER_BACTH_SIZE}")
    logger.info(f"OUTPUT_DIR = {OUTPUT_DIR}") 
        
    t0 = time.time()
    all_questions = []

    if GENERATE_DISCIPLINES:
        logger.info(f"===== Generate a Taxonomy of human knowledge and capabilities")
        t0 = time.time()
        taxonomy_json, disciplines = generate_taxonomy(max_number_of_fields=MAX_NUMBER_OF_FIELDS, model_name="gpt-4o", temperature=0.5)
        t1 = time.time()
        logger.info(f"Generating taxonomy took {t1 - t0:.4f} seconds.")
    else:
        logger.info(f"===== Load pre-defined disciplines")
        disciplines = read_text_to_list(DISCIPLINES_FILEPATH)

    for idx1, discipline in enumerate(disciplines):
        logger.info("====================================================================================================")        
        logger.info(f"===== [Discipline {idx1}] Generating Subjects for discipline: {discipline}") 
        logger.info("====================================================================================================")        
        subjects_json = generate_subjects(
            discipline, 
            max_number_of_subjects=MAX_NUMBER_OF_SUBJECTS, 
            max_number_of_subtopics=MAX_NUMBER_OF_SUBTOPICS, 
            model_name=MODEL_NAME, 
            temperature=1.0, 
            top_p=0.95
        )
        
        logger.info(f"Number of subjects is {len(subjects_json['subjects'])}") 
        for idx2, s in enumerate(subjects_json["subjects"]):
            subject = s['subject']
            level = s['level']
            subtopics = ", ".join(s['subtopics'])
            
            logger.info("\t====================================================================================================")        
            logger.info(f"\t===== [Subject {idx2}] Generating Syllabus: Discipline: {discipline} - Subject: {subject} - Level: {level}") 
            logger.info("\t====================================================================================================")        
            class_sessions, key_concepts = generate_syllabus(
                subject, 
                level, 
                subtopics,
                max_number_of_session_name=MAX_NUMBER_OF_SESSION_NAME, 
                model_name=MODEL_NAME, 
                temperature=1.0, 
                top_p=0.95
            )
            logger.info(f"\tNumber of class sessions is {len(class_sessions)}")

            questions = generate_questions(
                class_sessions, 
                key_concepts, 
                subject, 
                level, 
                subtopics,
                model_name=MODEL_NAME, 
                num_iterations=NUM_ITERATIONS,
                num_questions_per_iteration=NUM_QUESTIONS_PER_ITERATION, 
                max_tokens=QUESTION_MAX_TOKENS, 
                batch_size=QUESTION_BACTH_SIZE,
                language=LANGUAGE
            )
            # logger.info(f"\t===== Waiting for 30 seconds to avoid rate limit error.") 
            # time.sleep(30)
            all_questions.extend(questions)

    t1 = time.time()
    timespan = format_timespan(t1 - t0)
    logger.info(f"Generating Question dataset took {timespan}")

    num_questions = len(all_questions)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    filename = f"{OUTPUT_DIR}/GLAN_Questions_{LANGUAGE}_{num_questions}_Samples_{UUID}.jsonl"

    with jsonlines.open(filename, mode='w') as writer:
        for question in all_questions:
            writer.write(question)

    if not GENERATE_QUESTION_ONLY:
        all_answers = generate_answers(
            all_questions, 
            model_name=MODEL_NAME_FOR_ANSWER, 
            max_tokens=ANSWER_MAX_TOKENS, 
            batch_size=ANSWER_BACTH_SIZE
        )

        instructions = []
        for q, a in zip(all_questions, all_answers):
            if a not in "DO NOT KNOW":
                q.update({"answer": a})
                instructions.append(q)

        num_instructions = len(instructions)
        new_filename = filename.replace("Questions", "Instructions")

        with jsonlines.open(new_filename, mode='w') as writer:
            for instruction in instructions:
                writer.write(instruction)