in src/data_preprocess/ncbi_id_2_uniprot.py [0:0]
def extract_id_for_rdrp(s, type="rdrp"):
    '''
    Pull the id out of a protein sequence name.

    For type "rdrp" the name is scanned from the right for a ".<digit>"
    version suffix; the id is the text in front of that suffix, starting
    right after the nearest "_" that has at least one ASCII letter between
    it and the suffix (or at the start of the name when no such "_" exists).
    When no version suffix is present the whole name is treated as the
    candidate region.

    For type "other_virus" / "non_virus" the name is split on "|" and the
    second field is the id.

    :param s: sequence name
    :param type: protein type ("rdrp", "other_virus" or "non_virus")
    :return: the extracted id; None when a "|"-separated name has fewer
        than two fields or when the type is not one of the above
    '''
    if type != "rdrp":
        if type in ("other_virus", "non_virus"):
            fields = s.split("|")
            return fields[1] if len(fields) >= 2 else None
        return None

    # Right-most position of a "." that is immediately followed by a digit;
    # falls back to the end of the name when there is no such suffix.
    stop = next(
        (
            pos - 1
            for pos in range(len(s) - 1, 0, -1)
            if s[pos - 1] == "." and "0" <= s[pos] <= "9"
        ),
        len(s),
    )

    # Walk left from the second-to-last character of the id region. An
    # underscore only counts as the delimiter once a letter has been seen,
    # so ids such as "YP_009337190" keep their internal underscore.
    start = 0
    seen_letter = False
    for pos in reversed(range(stop - 1)):
        ch = s[pos]
        if ch == "_":
            if seen_letter:
                start = pos + 1
                break
        elif "a" <= ch <= "z" or "A" <= ch <= "Z":
            seen_letter = True

    return s[start:stop]