def load_GO_annot()

in lmgvp/deepfrier_utils.py [0:0]
36 lines of code
9 McCabe index (conditional complexity)

def load_GO_annot(filename):
    """
    Loads the GO annotations.

    The file is read as tab-separated values with the following layout:
    for each ontology, in the order 'mf', 'bp', 'cc', a header line, a line
    holding the GO terms, another header line, and a line holding the GO
    names. One more header line follows, then one row per protein:
    ``<protein>\\t<mf terms>\\t<bp terms>\\t<cc terms>``, where each terms
    field is a comma-separated (possibly empty) list of GO terms.

    Args:
        filename: String representing the path to the GO annotations file.
    Returns:
        Quadruple where elements are
            1/ a dict of dict with protein annotations: {protein: {'cc': np.array([...])}}
               (multi-hot float vectors aligned with the GO terms lists)
            2/ a dict with metadata of GO terms: {'cc': [goterm1, ...]}
            3/ a dict with metadata of GO names: {'cc': [goname1, ...]}
            4/ a dict with protein counts of GO terms: {'cc': np.array(...)}
    Raises:
        ValueError: if a protein row references a GO term that is not listed
            in the corresponding ontology's GO terms line.
        IndexError: if a non-blank protein row has fewer than four columns.
    """
    onts = ["mf", "bp", "cc"]
    prot2annot = {}
    goterms = {ont: [] for ont in onts}
    gonames = {ont: [] for ont in onts}

    # newline="" lets the csv module do its own line-ending handling, as
    # recommended by the csv documentation.
    with open(filename, mode="r", newline="") as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")

        # Each ontology contributes four lines: header, terms, header, names.
        for ont in onts:
            next(reader, None)  # skip the headers
            goterms[ont] = next(reader)
            next(reader, None)  # skip the headers
            gonames[ont] = next(reader)

        next(reader, None)  # skip the headers of the protein table

        # Map every GO term to its column once, instead of scanning the
        # term list for each annotation. setdefault keeps the first
        # occurrence, matching what list.index() would return.
        term_index = {}
        for ont in onts:
            lookup = {}
            for position, term in enumerate(goterms[ont]):
                lookup.setdefault(term, position)
            term_index[ont] = lookup

        counts = {
            ont: np.zeros(len(goterms[ont]), dtype=float) for ont in onts
        }
        for row in reader:
            if not row:
                # Blank line (e.g. trailing newline at end of file).
                continue
            prot = row[0]
            annot = {}
            for column, ont in enumerate(onts, start=1):
                lookup = term_index[ont]
                field = row[column]
                try:
                    goterm_indices = [
                        lookup[goterm]
                        for goterm in field.split(",")
                        if goterm != ""
                    ]
                except KeyError as err:
                    raise ValueError(
                        "Unknown GO term {!r} for protein {!r} in ontology "
                        "{!r}".format(err.args[0], prot, ont)
                    ) from err
                labels = np.zeros(len(goterms[ont]))
                labels[goterm_indices] = 1.0
                # Fancy-index increment: each term counts at most once per
                # protein even if it is repeated in the field.
                counts[ont][goterm_indices] += 1.0
                annot[ont] = labels
            prot2annot[prot] = annot
    return prot2annot, goterms, gonames, counts