def process()

in preprocess/cram_vul_dataset/src2asm.py [0:0]


def _extract_func_name(declaration):
    """Return the function identifier found on the first line of a C definition.

    The line is expected to look like ``FUNC_TYPE FUNC_NAME(Parameters)``.
    Everything before the first ``(`` is split on any whitespace and the last
    token is taken; leading ``*``/``&`` are stripped so that ``char *foo(``
    yields ``foo`` rather than ``*foo``. Returns ``""`` when no token precedes
    the parenthesis.
    """
    tokens = declaration.split('(')[0].split()
    return tokens[-1].lstrip('*&') if tokens else ""


def _find_symbol_range(nm_lines, func_name):
    """Locate ``func_name`` in ``nm -f sysv`` output lines.

    Only rows with exactly seven ``|``-separated fields are considered. Field 1
    and field 4 are parsed as hexadecimal and treated as the symbol's start
    address and size. Returns ``(start, end)`` as hex strings, or
    ``(None, None)`` when no row's first field equals ``func_name``. If several
    rows match, the last one wins.
    """
    start, end = None, None
    for line in nm_lines:
        fields = line.split('|')
        if len(fields) != 7:
            continue
        if fields[0].strip() == func_name:
            address = int(fields[1].strip(), 16)
            size = int(fields[4].strip(), 16)
            start, end = hex(address), hex(address + size)
    return start, end


def process(d, repo):
    """Build one data point's parent commit and extract the target function's assembly.

    Args:
        d: Mapping with the keys ``"idx"``, ``"commit_id"``, ``"target"`` (the
            label, compared against the string ``"0"``) and ``"func"`` (the
            function's source code).
        repo: Repository handle exposing ``git.checkout``, ``git.rev_list`` and
            ``git.diff`` (presumably a GitPython ``Repo`` -- confirm at the
            call site).

    Reads the module-level names ``finished``, ``args`` (``vulnerable``,
    ``optimization_level``, ``parallel_num``), ``repo_dir``, ``fail_dir``,
    ``output_object_dir``, ``output_meta_dir``, ``output_source_dir`` and
    ``output_assembly_dir``.

    Returns:
        None. On success the object file, a JSON metadata file, the source and
        the assembly are written to the output directories. The function
        returns early (after printing a message) when the commit is already
        finished, is filtered out by ``args.vulnerable``, no function name can
        be extracted, the file containing the function is not found, or the
        symbol is not found in the object file.

    Raises:
        subprocess.CalledProcessError: if the rebuild, the second ``nm``
            lookup, or ``objdump`` fails.
    """
    # Function-scope import so this block does not depend on the file header.
    import shlex

    idx = d["idx"]
    commit, label = d["commit_id"], d["target"]

    # Skip data points that have already been processed.
    if commit in finished:
        print(commit, "has finished. Omitting")
        return

    if args.vulnerable and label == "0":
        return

    print("Start processing", idx, commit)

    # Keep the whole function body to pair with the assembly at the end.
    all_code = d["func"]

    # The first line is expected to be the declaration:
    # FUNC_TYPE FUNC_NAME(Parameters)
    code = all_code.split("\n")[0]
    func_name = _extract_func_name(code)

    # Just in case the first line is not the declaration.
    if len(code) < 10:
        print(f"Warning code length {idx} {commit} {code}\n")

    # Without a name the symbol lookup below cannot succeed; bail out before
    # spending time on a checkout and a build.
    if not func_name:
        print(f"Error extracting function name {idx} {commit} {code}\n")
        return

    # List the files changed by the current commit; only one of them should
    # contain the target function.
    repo.git.checkout(commit, force=True)
    previous_commit = repo.git.rev_list(
        '--parents', '-n', '1', commit).split()[1]
    diffs = repo.git.diff('HEAD~1', name_only=True).split('\n')

    # Restore the working tree to the parent commit.
    repo.git.checkout(previous_commit, force=True)
    correct = None

    # Find the file holding the target function by exactly matching the
    # function's first line against each changed file.
    for changed_file in diffs:
        source_path = os.path.join(repo_dir, changed_file)
        try:
            with open(source_path) as source_file:
                contains_decl = code in source_file.read()
        except (OSError, UnicodeDecodeError):
            # Occasionally a file cannot be read or decoded; report and move on.
            traceback.print_exc()
            print("Error reading", source_path, repo_dir, changed_file, "\n\n")
            continue
        if contains_decl:
            correct = changed_file
            break

    # No changed file contains the declaration line.
    if not correct:
        print(f"Error find file {commit} {previous_commit}\n")
        return

    print(f"Correct {commit} {previous_commit} {correct}")

    # Every build lives in a folder named after the commit (not the parent).
    build_dir = os.path.join(repo_dir, "build", commit)
    quoted_build_dir = shlex.quote(build_dir)

    # Rewrite the optimization level in the configure script.
    configure_file = os.path.join(repo_dir, "configure")
    with open(configure_file) as configure_in:
        configure_text = re.sub(
            r"-O[0-5sz]", f"-O{args.optimization_level}", configure_in.read())
    with open(configure_file, "w") as configure_out:
        configure_out.write(configure_text)

    print("Start building")

    try:
        os.mkdir(build_dir)
        subprocess.check_output(
            f"cd {quoted_build_dir} && ../../configure --disable-werror"
            f" && make -j{args.parallel_num}",
            stderr=subprocess.STDOUT, shell=True)
    except OSError:
        # mkdir failed, which normally means this commit was built before.
        print("Omit building")
    except subprocess.CalledProcessError:
        # The nm lookup below will fail and trigger a clean rebuild.
        print("Build failed")

    # Locate the object file. For ".c" sources swap only the extension so a
    # ".c" elsewhere in the path is left alone; other extensions keep the
    # original substring replacement.
    stem, extension = os.path.splitext(correct)
    if extension == ".c":
        obj_path = os.path.join(build_dir, stem + ".o")
    else:
        obj_path = os.path.join(build_dir, correct.replace(".c", ".o"))

    nm_command = (
        f"nm -f sysv {shlex.quote(obj_path)} | grep {shlex.quote(func_name)}")

    try:
        nm_lines = subprocess.check_output(
            nm_command,
            stderr=subprocess.STDOUT, shell=True).decode("utf-8").split('\n')
    except subprocess.CalledProcessError:
        # The pipeline fails when the object file is missing or grep finds no
        # match. Rebuild from scratch once; if that fails too, the exception
        # propagates to the caller.
        print("Rebuild")
        build_log = subprocess.check_output(
            f"rm -rf {quoted_build_dir} && mkdir {quoted_build_dir}"
            f" && cd {quoted_build_dir} && ../../configure"
            f" && make -j{args.parallel_num}",
            stderr=subprocess.STDOUT, shell=True)

        with open(os.path.join(fail_dir, f"{commit}.log"), "wb") as log_file:
            log_file.write(build_log)

        nm_lines = subprocess.check_output(
            nm_command,
            stderr=subprocess.STDOUT, shell=True).decode("utf-8").split('\n')

    # Exact-match the function name in the symbol table to get its range.
    start, end = _find_symbol_range(nm_lines, func_name)

    # The symbol was not found in the object file.
    if start is None:
        print(f"Error finding start {commit} {previous_commit} {nm_lines}")
        return

    # Disassemble only the function's address range.
    objdump_lines = subprocess.check_output(
        f"objdump -w -d --start-address={start} --stop-address={end}"
        f" {shlex.quote(obj_path)} --section=.text",
        stderr=subprocess.STDOUT, shell=True).decode("utf-8").split('\n')
    # Drop the first six lines of objdump output before the listing.
    assembly = '\n'.join(objdump_lines[6:])

    copy2(obj_path, os.path.join(output_object_dir, f"{idx}_{commit}.o"))

    data = {
        "idx": idx,
        "commit_hash": commit,
        "previsous_hash": previous_commit,
        "source": all_code,
        "assembly": assembly,
        "label": label,
        "filename": correct
    }
    with open(os.path.join(output_meta_dir, f"{idx}_{commit}.json"), "w") as meta_file:
        json.dump(data, meta_file)
    with open(os.path.join(output_source_dir, f"{idx}_{commit}.cpp"), "w") as source_out:
        source_out.write(all_code)
    with open(os.path.join(output_assembly_dir, f"{idx}_{commit}.s"), "w") as assembly_out:
        assembly_out.write(assembly)
    print("Success", commit, '\n\n')