def read_fasta()

in src/data_preprocess/data_preprocess_for_rdrp_v1.py [0:0]


def _second_pipe_field(text):
    '''
    return the second "|"-separated field of a line
    :param text: a header/id line such as ">sp|P12345|NAME"
    :return: the second field, or None when the line has no "|"
    '''
    fields = text.strip().split("|")
    return fields[1] if len(fields) > 1 else None


def _load_exclude_ids(exclude):
    '''
    collect the ids to exclude from one or more files
    :param exclude: a filepath, an iterable of filepaths, or a falsy value
    :return: set of ids (second "|" field of every line that has one)
    '''
    exclude_ids = set()
    if not exclude:
        return exclude_ids
    if isinstance(exclude, str):
        exclude = [exclude]
    for path in exclude:
        with open(path, "r") as rfp:
            for line in rfp:
                # Blank lines and sequence lines of a FASTA file carry no "|";
                # skip them instead of failing with IndexError.
                protein_id = _second_pipe_field(line)
                if protein_id is not None:
                    exclude_ids.add(protein_id)
    return exclude_ids


def _append_record(dataset, header, chunks, exclude_ids):
    '''
    append [header, sequence] to dataset when the record should be kept
    :param dataset: list that receives the record
    :param header: header line of the record (including the leading ">")
    :param chunks: sequence lines collected for the record
    :param exclude_ids: ids whose records must be dropped
    :return: 1 if the record was appended, otherwise 0
    '''
    sequence = "".join(chunks)
    # A record needs both a header and a non-empty sequence; this also drops
    # stray text that precedes the first ">" line of a file.
    if not header or not sequence:
        return 0
    # Headers without a "|" yield None, which is never in exclude_ids, so
    # such records are always kept.
    if _second_pipe_field(header) in exclude_ids:
        return 0
    dataset.append([header, sequence])
    return 1


def read_fasta(filepath, exclude):
    '''
    read sequences of fasta file(s)

    A record is dropped when the second "|"-separated field of its header
    matches the second "|"-separated field of any line in the exclude file(s).
    Headers without a "|" are never excluded. Records with an empty sequence,
    and sequence text that appears before the first header, are skipped.
    For every input file, "<filepath>: <number of kept records>" is printed.

    :param filepath: fasta filepath, or a list of fasta filepaths
    :param exclude: filepath (or list of filepaths) of the sequences to
        exclude; a falsy value disables exclusion
    :return: dataset, a list of [header, sequence] pairs where header is the
        stripped header line (leading ">" included) and sequence is the
        concatenation of the record's stripped sequence lines
    '''
    exclude_ids = _load_exclude_ids(exclude)

    if isinstance(filepath, str):
        filepath = [filepath]
    dataset = []
    for cur_filepath in filepath:
        total = 0
        header = ""
        chunks = []
        with open(cur_filepath, "r") as rfp:
            for line in rfp:
                line = line.strip()
                if line.startswith(">"):
                    # A new header closes the record collected so far.
                    total += _append_record(dataset, header, chunks, exclude_ids)
                    header = line
                    chunks = []
                else:
                    chunks.append(line)
        # The last record of the file has no following header to close it.
        total += _append_record(dataset, header, chunks, exclude_ids)
        print("%s: %d" % (cur_filepath, total))

    return dataset