in src/data_preprocess/subword.py [0:0]
def read_fasta(filepath, exclude):
    """
    Read one or more FASTA files into a list of ``[header, sequence]`` pairs.

    Prints ``"<path>: <kept record count>"`` once per input file.

    :param filepath: a FASTA file path, or an iterable of FASTA file paths
    :param exclude: falsy for no filtering; otherwise a path (or an iterable
        of paths) to files whose lines carry the ids to drop in their second
        ``|``-separated field (e.g. ``>sp|P12345|NAME`` excludes ``P12345``).
        Lines without a second field (sequence lines, blank lines) are ignored.
    :return: list of ``[header, sequence]``; ``header`` is the stripped header
        line including the leading ``>``, and ``sequence`` is the concatenation
        of the stripped lines that follow it. Records with an empty sequence
        are skipped. Headers without a second ``|`` field are never excluded.
    """
    exclude_ids = set()
    if exclude:
        if isinstance(exclude, str):
            exclude = [exclude]
        for exclude_path in exclude:
            with open(exclude_path, "r") as rfp:
                for line in rfp:
                    fields = line.strip().split("|")
                    # Only lines shaped like "x|id|..." name an id. Indexing
                    # [1] unconditionally raised IndexError on the sequence
                    # and blank lines of an ordinary FASTA file.
                    if len(fields) > 1:
                        exclude_ids.add(fields[1])

    if isinstance(filepath, str):
        filepath = [filepath]

    dataset = []
    for cur_filepath in filepath:
        total = 0
        with open(cur_filepath, "r") as rfp:
            uuid = ""
            # Collect sequence lines and join once per record instead of
            # growing a string with += for every line.
            seq_parts = []
            for line in rfp:
                line = line.strip()
                if line.startswith(">"):
                    # A new header closes the record collected so far.
                    # NOTE: text found before the first header is flushed here
                    # with an empty id (long-standing behaviour, kept as is).
                    seq = "".join(seq_parts)
                    if seq and _passes_exclusion(uuid, exclude_ids):
                        dataset.append([uuid, seq])
                        total += 1
                    uuid = line
                    seq_parts = []
                else:
                    seq_parts.append(line)
            # Flush the last record of the file; this one requires a header.
            seq = "".join(seq_parts)
            if seq and uuid and _passes_exclusion(uuid, exclude_ids):
                dataset.append([uuid, seq])
                total += 1
        print("%s: %d" % (cur_filepath, total))
    return dataset


def _passes_exclusion(header, exclude_ids):
    """Return True when the record with this header line should be kept."""
    if not exclude_ids:
        return True
    fields = header.strip().split("|")
    # Headers without a second "|" field have no id to match, so keep them.
    return len(fields) <= 1 or fields[1] not in exclude_ids