in src/data_preprocess/data_preprocess_for_rdrp_v1.py [0:0]
def read_fasta(filepath, exclude):
    '''
    Read the sequences of one or more FASTA files, optionally skipping excluded ids.

    A record's id is the second "|"-separated field of its header line
    (for example "P12345" in ">sp|P12345|NAME"). Headers without a second
    field can never be excluded.

    :param filepath: FASTA filepath, or an iterable of FASTA filepaths
    :param exclude: filepath (or iterable of filepaths) whose "|"-separated
        lines name the ids to exclude; any falsy value disables exclusion
    :return: dataset, a list of [header, sequence] pairs in file order; the
        header keeps its leading ">" and the sequence is the concatenation of
        the record's stripped lines. Records with an empty sequence, and
        sequence text that precedes the first header, are dropped.
    '''
    exclude_ids = set()
    if exclude:
        if isinstance(exclude, str):
            exclude = [exclude]
        for exclude_path in exclude:
            with open(exclude_path, "r") as rfp:
                for line in rfp:
                    fields = line.strip().split("|")
                    # Sequence lines and blank lines of a FASTA exclude file
                    # carry no second "|" field; skip them instead of
                    # failing with IndexError.
                    if len(fields) > 1:
                        exclude_ids.add(fields[1])

    def keep_record(header, sequence):
        # A record needs both a header and a non-empty sequence.
        if not header or not sequence:
            return False
        fields = header.strip().split("|")
        # With an empty exclude set the membership test is always False,
        # so no separate "nothing to exclude" branch is needed.
        return len(fields) <= 1 or fields[1] not in exclude_ids

    if isinstance(filepath, str):
        filepath = [filepath]

    dataset = []
    for cur_filepath in filepath:
        total = 0
        header = ""
        seq_parts = []
        with open(cur_filepath, "r") as rfp:
            for line in rfp:
                line = line.strip()
                if line.startswith(">"):
                    # A new header closes the record collected so far.
                    sequence = "".join(seq_parts)
                    if keep_record(header, sequence):
                        dataset.append([header, sequence])
                        total += 1
                    header = line
                    seq_parts = []
                else:
                    # Collect the pieces and join once per record rather
                    # than growing a string line by line.
                    seq_parts.append(line)
        # The last record of the file has no following header to close it.
        sequence = "".join(seq_parts)
        if keep_record(header, sequence):
            dataset.append([header, sequence])
            total += 1
        print("%s: %d" % (cur_filepath, total))
    return dataset