in src/protein_structure/structure_from_esm_v1.py [0:0]
def predict_for_structure(args, filepath_list, sequence_list, reverse=False):
    '''
    Predict the 3d-structure of proteins read from files or passed in directly.

    Sequences are collected from every path in ``filepath_list`` (or taken from
    ``sequence_list`` when no paths are given), entries already recorded as done
    or listed in ``args.exists_file`` are dropped, and the remainder is handed to
    ``prediction``. A summary is printed at the end.

    :param args: running parameters; this function reads ``save_dir``,
        ``try_failure`` and ``exists_file`` and forwards ``args`` to ``prediction``
    :param filepath_list: sequence filepath list; a path containing '.csv' is read
        as a csv file with a header row, a path containing '.fa' is read with
        ``fasta_reader``. Takes precedence over ``sequence_list`` when non-empty
    :param sequence_list: sequence list, used only when ``filepath_list`` is
        empty or None; each item is indexed as item[0] (presumably the sequence
        id - confirm against callers)
    :param reverse: whether to reverse the list of sequences before predicting
    :return: None
    :raises Exception: if there is no input at all, or a filepath is neither a
        '.csv' nor a '.fa*' file
    '''
    # Validate before touching the filesystem: an empty filepath_list falls back
    # to sequence_list, so that one must be present in that case.
    if not filepath_list and sequence_list is None:
        raise Exception("input empty error!")
    all_sequences = []
    if filepath_list:
        for filepath in filepath_list:
            print("doing filepath: %s ..." % filepath)
            filename = os.path.basename(filepath)
            if ".csv" in filepath:
                is_csv = True
                save_name = filename.replace(".csv", "")
            elif ".fa" in filepath:
                is_csv = False
                # strip the longest suffix first so ".fasta" does not leave "sta"
                save_name = filename.replace(".fasta", "").replace(".fas", "").replace(".fa", "")
            else:
                raise Exception("not support the type file: %s, must endswith '.csv' or '.fa*" % filepath)
            # Mirror the input's (relative) directory layout under save_dir.
            # NOTE(review): cur_save_dir is overwritten on every iteration, so with
            # several input files only the last file's directory is used below.
            cur_save_dir = os.path.join(
                args.save_dir,
                filepath.replace("../", "").replace(filename, ""),
                "pdb",
                save_name
            )
            os.makedirs(cur_save_dir, exist_ok=True)
            print("save dir: %s." % cur_save_dir)
            if is_csv:
                # newline="" lets the csv module handle line endings itself
                with open(filepath, "r", newline="") as rfp:
                    reader = csv.reader(rfp)
                    # the first row is the header
                    next(reader, None)
                    for row in reader:
                        # csv.reader yields [] for blank lines; skip them
                        if not row:
                            continue
                        all_sequences.append([row[0], row[1]])
            else:
                all_sequences.extend(
                    [v[0].strip(), v[1].strip()] for v in fasta_reader(filepath)
                )
    else:
        cur_save_dir = os.path.join(args.save_dir, "pdb")
        os.makedirs(cur_save_dir, exist_ok=True)
        all_sequences = sequence_list
    # When retrying failures, the uncompleted file is not passed to load_done_set
    # (presumably so previously failed items are attempted again - TODO confirm).
    if args.try_failure:
        uncompleted_path = None
    else:
        uncompleted_path = os.path.join(cur_save_dir, "uncompleted.txt")
    done_set, begin_uuid_index = load_done_set(
        os.path.join(cur_save_dir, "result_info.csv"),
        uncompleted_path
    )
    print("all number: %d" % len(all_sequences))
    print("done number: %d" % len(done_set))
    # exists
    exists_set = exists(args.exists_file) if args.exists_file else set()
    print("exists number: %d" % len(exists_set))
    # remove the done list (builds a new list, the caller's list is not modified)
    all_sequences = [
        item for item in all_sequences
        if item[0] not in done_set and item[0] not in exists_set
    ]
    print("wanted number: %d" % len(all_sequences))
    if reverse:
        print("reverse=True")
        all_sequences.reverse()
    num_sequences, num_completed, avg_use_time, avg_total_seq_len = prediction(
        args,
        all_sequences,
        save_dir=cur_save_dir,
        begin_uuid_index=begin_uuid_index
    )
    print("total protein num: %d, completed num: %d, use time per seq: %f, avg seq len: %f" % (
        num_sequences,
        num_completed,
        avg_use_time,
        avg_total_seq_len
    ))
    if filepath_list:
        print("filepath: %s done." % filepath_list)
    else:
        print("done")
    print("#"*100)