def parse_dense_format()

in mmcif_utils.py [0:0]


def parse_dense_format(node_embed):
    """
    Regroup a dense, per-atom protein encoding into per-amino-acid arrays.

    Dense input
    -----------
    ``node_embed`` holds one row per atom. In protein-ebm each row is a
    6-dimensional vector laid out as:
        [0]: identity of the amino acid the atom is part of (residue_idx)
        [1]: element identity of the atom (atom_idx)
        [2]: positional location of the atom in the amino acid (atom_num)
        [3..5]: x, y, z coordinates
    Only column 0 and the last three columns are read by this function.
    Atoms are consumed in consecutive runs, one run per amino acid, whose
    length is the number of entries ``res_parents`` lists for that residue.

    Sparse output
    -------------
    On success a 6-tuple ``(pars, childs, pos, pos_exists, residues,
    chis_valid)`` is returned, with N the number of amino acids recorded:
        pars:       array of parent offsets per atom, zero-padded to 20 per
                    amino acid (taken from ``res_parents``).
        childs:     array of child offsets per atom, zero-padded to 20 per
                    amino acid (taken from ``res_children``).
        pos:        N x 20 x 3 array of (x, y, z) coordinates, zero-padded.
        pos_exists: N x 20 mask, 1 where an atom slot is real, 0 where padded.
        residues:   list of N lower-cased residue names.
        chis_valid: mask with one leading 1 per entry of ``res_chis`` for the
                    residue, zero-padded to 20 per amino acid.
    This layout is easier to manipulate than the dense one, e.g. when
    changing rotamers during negative sampling.

    A 6-tuple of ``None`` is returned instead when a run of atoms does not
    share a single residue index, or when fewer than two amino acids were
    recorded. A trailing amino acid whose atoms are cut off by the end of the
    input is not recorded.
    """

    # Width every per-amino-acid record is padded to.
    slots = 20
    # Value handed back whenever the input cannot be converted.
    failure = (None, None, None, None, None, None)

    total_atoms = node_embed.shape[0]

    # Per-amino-acid accumulators, one entry appended per recorded residue.
    parent_rows = []  # parent offsets, padded
    child_rows = []  # child offsets, padded
    coord_blocks = []  # (20, 3) coordinate blocks
    mask_rows = []  # which atom slots are real
    names = []  # lower-cased residue names
    chi_rows = []  # chi mask, padded

    # Walk the atom list one amino acid at a time.
    cursor = 0
    while cursor < total_atoms:
        idx = int(node_embed[cursor, 0])
        residue = residue_names[idx]

        # Topology templates for this residue (see amino_acid_config.py).
        parents = res_parents[residue].copy()
        children = res_children[residue].copy()
        n = len(parents)
        stop = cursor + n
        pad = slots - n

        # 1 for each real atom, 0 for each padding slot.
        mask = [1] * n + [0] * pad

        # Zero-pad both templates out to the common width.
        parents = parents + [0] * pad
        children = children + [0] * (slots - len(children))

        # Coordinates of this amino acid's atoms followed by zero rows.
        coords = np.concatenate(
            [node_embed[cursor:stop, -3:], np.zeros((pad, 3))], axis=0
        )

        # Only keep the amino acid if all n of its atoms are present.
        if stop <= total_atoms:
            parent_rows.append(parents)
            child_rows.append(children)
            coord_blocks.append(coords)
            mask_rows.append(mask)
            n_chi = len(res_chis[residue])
            chi_rows.append([1] * n_chi + [0] * (slots - n_chi))
            names.append(residue.lower())

        # Every atom in the run must carry the same residue index.
        same_residue = (node_embed[cursor:stop, 0] == idx).all()
        if not same_residue:
            return failure

        # Advance past the atoms just consumed.
        cursor = stop

    # Proteins with a single amino acid (or none) are not processed.
    if len(coord_blocks) < 2:
        return failure

    # Convert the accumulated lists to numpy arrays.
    pars = np.array(parent_rows)
    childs = np.array(child_rows)
    pos = np.stack(coord_blocks, axis=0)
    pos_exists = np.array(mask_rows)
    chis_valid = np.array(chi_rows)

    # The templates assume each nitrogen is bonded to the previous residue's
    # carbon and each carbon to the next residue's nitrogen. That does not
    # hold at the N-terminus and C-terminus, so clear those two links.
    pars[0, 0] = 0
    childs[-1, 2] = 0

    # Hand back the per-amino-acid encoding.
    return pars, childs, pos, pos_exists, names, chis_valid