def extract_id_for_rdrp()

in src/data_preprocess/ncbi_id_2_uniprot.py [0:0]


def extract_id_for_rdrp(s, type="rdrp"):
    '''
    Pull the identifier out of a protein sequence name.

    For ``type == "rdrp"`` the identifier is a slice of ``s``:

    * it ends just before the right-most ``.`` that is directly followed by
      an ASCII digit (or at the end of ``s`` when no such ``.`` exists);
    * it starts right after the nearest ``_`` to the left that has at least
      one ASCII letter between it and that end boundary; if there is no such
      ``_`` the slice starts at the beginning of ``s``.

    For ``type`` equal to ``"other_virus"`` or ``"non_virus"`` the name is
    split on ``|`` and the second field is the identifier.

    :param s: sequence name
    :param type: protein type ("rdrp", "other_virus" or "non_virus")
    :return: the extracted identifier; None when a ``|``-separated name has
             fewer than two fields or when ``type`` is not recognised
    '''
    if type in ("other_virus", "non_virus"):
        fields = s.split("|")
        return fields[1] if len(fields) >= 2 else None
    if type != "rdrp":
        return None

    # Right boundary: walk the dots from right to left and stop at the first
    # one that is immediately followed by a digit.
    name_len = len(s)
    cut = name_len
    dot = s.rfind(".")
    while dot >= 0:
        after = dot + 1
        if after < name_len and "0" <= s[after] <= "9":
            cut = dot
            break
        dot = s.rfind(".", 0, dot)

    # Left boundary: scan leftwards for an underscore, accepting it only once
    # a letter has been passed. The scan begins two positions left of `cut`,
    # so the character directly before `cut` is never inspected.
    start = 0
    seen_letter = False
    pos = cut - 2
    while pos >= 0:
        ch = s[pos]
        if ch == "_":
            if seen_letter:
                start = pos + 1
                break
        elif "a" <= ch <= "z" or "A" <= ch <= "Z":
            seen_letter = True
        pos -= 1
    return s[start:cut]