in mmcif_utils.py [0:0]
def parse_dense_format(node_embed):
    """
    Convert a protein from the dense (per-atom) format to the sparse
    (per-amino-acid) format.

    Dense format
    ------------
    ``node_embed`` is a 2-D array with one row per atom. According to the
    project notes each row holds six values: the residue index, the element
    index, the position of the atom inside its amino acid, and the x, y, z
    coordinates. This function only reads column 0 (the residue index, used
    to look up ``residue_names``) and the last three columns (coordinates).
    The dense format is convenient as neural-network input.

    Sparse format
    -------------
    Atoms are grouped by amino acid and every per-atom list is padded with
    zeros to 20 slots. With N amino acids parsed, the function returns:

    pars:       N x 20 relative offsets of each atom's parent, taken from
                ``res_parents`` (project notes give glycine as
                [-18 -1 -1 -1 0 ...]).
    childs:     N x 20 relative offsets of each atom's child, taken from
                ``res_children`` (glycine: [1 1 18 0 0 0 ...]).
    pos:        N x 20 x 3 array of (x, y, z) coordinates per atom.
    pos_exists: N x 20 mask, 1 for real atoms and 0 for padding
                (glycine: [1 1 1 1 0 0 ...]).
    residues:   list of N lower-cased amino acid names.
    chis_valid: N x 20 mask with one leading 1 per entry of
                ``res_chis[residue]`` followed by zeros (all zeros for
                glycine according to the project notes).

    Other sparse-format fields mentioned elsewhere in the project
    (amino_name, i, sequence_map, rotate_matrix) are not produced here.
    The sparse format is easier to manipulate, e.g. when changing rotamers
    during negative sampling.

    Returns a 6-tuple ``(pars, childs, pos, pos_exists, residues,
    chis_valid)``. All six entries are ``None`` when the atoms consumed for
    an amino acid do not all share the same residue index, or when fewer
    than two amino acids were parsed.
    """
    # Every per-atom list is padded to this many slots.
    max_atoms = 20
    # Value handed back whenever the input cannot be parsed.
    failure = (None, None, None, None, None, None)

    total_atoms = node_embed.shape[0]

    # Per-amino-acid accumulators, one entry per fully parsed amino acid.
    parent_rows = []  # relative offsets of parent atoms
    child_rows = []  # relative offsets of child atoms
    coord_rows = []  # padded (20, 3) coordinate blocks
    mask_rows = []  # 1 where an atom exists, 0 for padding
    names = []  # lower-cased amino acid names
    chi_rows = []  # padded chi masks

    # Walk through the atom rows, consuming one amino acid per iteration.
    cursor = 0
    while cursor < total_atoms:
        res_id = int(node_embed[cursor, 0])
        name = residue_names[res_id]

        # Topology templates for this amino acid (see amino_acid_config.py).
        parent_offsets = res_parents[name].copy()
        child_offsets = res_children[name].copy()
        atom_count = len(parent_offsets)
        stop = cursor + atom_count
        padding = max_atoms - atom_count

        # Mask marking which of the 20 slots hold real atoms.
        mask = [1] * atom_count + [0] * padding

        # Zero-pad the topology lists up to 20 slots.
        parent_offsets = parent_offsets + [0] * padding
        child_offsets = child_offsets + [0] * (max_atoms - len(child_offsets))

        # Coordinates of this amino acid's atoms followed by zero rows.
        coords = np.concatenate(
            [node_embed[cursor:stop, -3:], np.zeros((padding, 3))], axis=0
        )

        # Only record the amino acid when all of its atoms are present in the
        # input; a truncated trailing amino acid is dropped.
        if stop <= total_atoms:
            parent_rows.append(parent_offsets)
            child_rows.append(child_offsets)
            coord_rows.append(coords)
            mask_rows.append(mask)
            chi_entries = res_chis[name]
            chi_count = len(chi_entries)
            chi_rows.append([1] * chi_count + [0] * (max_atoms - chi_count))
            names.append(name.lower())

        # Every atom row consumed here must carry the same residue index;
        # otherwise the input is inconsistent and parsing is abandoned.
        same_residue = (node_embed[cursor:stop, 0] == res_id).all()
        if not same_residue:
            return failure

        # Advance past the atoms of this amino acid.
        cursor = stop

    # Proteins with fewer than two parsed amino acids are not processed.
    if len(coord_rows) < 2:
        return failure

    # Convert the accumulated lists into numpy arrays.
    pars = np.array(parent_rows)
    childs = np.array(child_rows)
    pos = np.stack(coord_rows, axis=0)
    pos_exists = np.array(mask_rows)
    chis_valid = np.array(chi_rows)

    # The templates assume each nitrogen is linked to the previous carbon and
    # each carbon to the next nitrogen. That does not hold at the N-terminus
    # and C-terminus, so those two links are cleared.
    pars[0, 0] = 0
    childs[-1, 2] = 0

    # Hand back the amino-acid-level encoding.
    return pars, childs, pos, pos_exists, names, chis_valid