in preprocess/cram_vul_dataset/src2asm.py [0:0]
def _run_shell(command):
    """Run *command* through the shell and return its combined stdout/stderr bytes."""
    return subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True)


def _function_name(declaration):
    """Extract FUNC_NAME from a ``FUNC_TYPE FUNC_NAME(Parameters)`` line.

    Returns an empty string when no name can be extracted.
    """
    tokens = declaration.split("(", 1)[0].split()
    # ``char *foo(`` yields the token ``*foo``; the symbol table lists ``foo``.
    return tokens[-1].lstrip("*&") if tokens else ""


def _symbol_range(nm_lines, func_name):
    """Find *func_name* in ``nm -f sysv`` output lines.

    Returns ``(start, end)`` as hex strings, where ``end`` is the symbol value
    plus its size, or ``None`` when no row matches exactly.
    """
    found = None
    for line in nm_lines:
        fields = line.split('|')
        if len(fields) != 7:
            continue
        if fields[0].strip() != func_name:
            continue
        try:
            value = int(fields[1].strip(), 16)
            size = int(fields[4].strip(), 16)
        except ValueError:
            # Rows with a blank value or size column carry no usable range.
            continue
        # No early exit: as before, the last matching row wins.
        found = (hex(value), hex(value + size))
    return found


def process(d, repo):
    """Build one data point's parent commit and extract the target function's assembly.

    Args:
        d: mapping with the keys ``"idx"``, ``"commit_id"``, ``"target"`` and
            ``"func"`` (the function's source text).
        repo: object exposing ``git.checkout``, ``git.rev_list`` and
            ``git.diff`` (presumably a GitPython ``Repo`` rooted at
            ``repo_dir`` -- confirm against the caller).

    Returns:
        None. On success the object file, a JSON metadata file, the source and
        the assembly are written to the module-level ``output_*_dir``
        directories. The data point is skipped (with a printed message) when
        it is already in ``finished``, filtered out by ``args.vulnerable``,
        or when the function name, parent commit, source file or symbol
        cannot be determined.

    Raises:
        subprocess.CalledProcessError: if the clean rebuild, the symbol lookup
            after it, or ``objdump`` fails.
    """
    # Function-scope import: the file's import block is outside this view.
    import shlex

    idx = d["idx"]
    commit, label = d["commit_id"], d["target"]
    # Skip data points that have already been processed.
    if commit in finished:
        print(commit, "has finished. Omitting")
        return
    if args.vulnerable and label == "0":
        return
    print("Start processing", idx, commit)

    # Keep the whole code to pair with the assembly when finished.
    all_code = d["func"]
    # The first line is expected to be the declaration:
    # FUNC_TYPE FUNC_NAME(Parameters)
    code = all_code.split("\n")[0]
    func_name = _function_name(code)
    # Just in case the first line is not the declaration.
    if len(code) < 10:
        print(f"Warning code length {idx} {commit} {code}\n")
    if not func_name:
        # Without a name the exact symbol match below can never succeed.
        print(f"Error extracting function name {idx} {commit} {code}\n")
        return

    # Get the files changed by the current commit; only one of them contains
    # the target function.
    repo.git.checkout(commit, force=True)
    revisions = repo.git.rev_list('--parents', '-n', '1', commit).split()
    if len(revisions) < 2:
        # A root commit has no parent to diff against or to build.
        print(f"Error no parent commit {commit}\n")
        return
    previous_commit = revisions[1]
    diffs = repo.git.diff('HEAD~1', name_only=True).split('\n')
    # Restore to the parent commit.
    repo.git.checkout(previous_commit, force=True)

    # Find the file holding the target function by exactly matching its
    # first line.
    correct = None
    for changed_file in diffs:
        if not changed_file:
            continue
        try:
            with open(os.path.join(repo_dir, changed_file)) as src:
                found = code in src.read()
        except (OSError, UnicodeDecodeError):
            # The file may be missing in the parent commit, or (rarely) fail
            # to decode; report it and try the next candidate.
            traceback.print_exc()
            print("Error reading", repo_dir, changed_file, "\n\n")
            continue
        if found:
            correct = changed_file
            break
    if not correct:
        print(f"Error find file {commit} {previous_commit}\n")
        return
    print(f"Correct {commit} {previous_commit} {correct}")

    # Every build is stored in a folder named after the commit (not the
    # previous commit).
    build_dir = os.path.join(repo_dir, "build", commit)
    quoted_build_dir = shlex.quote(build_dir)

    # Change the optimization level in the configure script.
    configure_file = os.path.join(repo_dir, "configure")
    with open(configure_file) as cfg:
        configure_text = re.sub(r"-O[0-5sz]", f"-O{args.optimization_level}", cfg.read())
    with open(configure_file, "w") as cfg:
        cfg.write(configure_text)

    print("Start building")
    try:
        os.mkdir(build_dir)
        _run_shell(
            f"cd {quoted_build_dir} && ../../configure --disable-werror && make -j{args.parallel_num}")
    except FileExistsError:
        # The build directory already exists, so this commit has been built.
        print("Omit building")
    except (OSError, subprocess.CalledProcessError) as err:
        # Best effort: a broken build is detected by the symbol lookup below,
        # which triggers a clean rebuild.
        print(f"Build failed {commit}: {err}")

    # Find the object file.
    obj_path = os.path.join(build_dir, correct.replace(".c", ".o"))
    nm_command = f"nm -f sysv {shlex.quote(obj_path)} | grep {shlex.quote(func_name)}"
    try:
        nm_output = _run_shell(nm_command)
    except subprocess.CalledProcessError:
        # The lookup fails when make failed (or grep matched nothing).
        # Simply rebuild; if that still fails, record the output and abort.
        print("Rebuild")
        # NOTE(review): unlike the first build, this configure call does not
        # pass --disable-werror -- confirm whether that is intentional.
        rebuild_command = (
            f"rm -rf {quoted_build_dir} && mkdir {quoted_build_dir} && cd {quoted_build_dir}"
            f" && ../../configure && make -j{args.parallel_num}")
        log_path = os.path.join(fail_dir, f"{commit}.log")
        try:
            build_log = _run_shell(rebuild_command)
        except subprocess.CalledProcessError as err:
            with open(log_path, "wb") as log:
                log.write(err.output or b"")
            raise
        with open(log_path, "wb") as log:
            log.write(build_log)
        nm_output = _run_shell(nm_command)
    results = nm_output.decode("utf-8").split('\n')

    # Find the function by exact name match, then extract its address range.
    symbol = _symbol_range(results, func_name)
    if symbol is None:
        print(f"Error finding start {commit} {previous_commit} {results}")
        return
    start, end = symbol

    # Get the assembly. No shell is needed here, so pass an argument list.
    listing = subprocess.check_output(
        ["objdump", "-w", "-d", f"--start-address={start}", f"--stop-address={end}",
         obj_path, "--section=.text"],
        stderr=subprocess.STDOUT).decode("utf-8").split('\n')
    # Drop the first six lines of objdump output before the instructions.
    assembly = '\n'.join(listing[6:])

    copy2(obj_path, os.path.join(output_object_dir, f"{idx}_{commit}.o"))
    data = {
        "idx": idx,
        "commit_hash": commit,
        # Key spelling kept as-is: existing output files already use it.
        "previsous_hash": previous_commit,
        "source": all_code,
        "assembly": assembly,
        "label": label,
        "filename": correct
    }
    with open(os.path.join(output_meta_dir, f"{idx}_{commit}.json"), "w") as meta:
        json.dump(data, meta)
    with open(os.path.join(output_source_dir, f"{idx}_{commit}.cpp"), "w") as out:
        out.write(all_code)
    with open(os.path.join(output_assembly_dir, f"{idx}_{commit}.s"), "w") as out:
        out.write(assembly)
    print("Success", commit, '\n\n')