in mmcif_utils.py [0:0]
def cif_to_embed(cif_file, ix=None, parse_skip=False):
    """
    Parse a CIF file into one array of node embeddings per chain.

    Every chain of every model in the structure is visited. Residues whose
    name is not in ``residue_names`` are ignored; for the remaining residues
    the rows produced by ``parse_residue_embed`` are concatenated, in residue
    order, into a single array for the chain.

    Embedding format for nodes, as described by the original author (the
    actual layout is whatever ``parse_residue_embed`` produces):
        'one hot amino acid'                    amino type of molecule
        'x, y, z'                               positional encoding
        'one hot representation of atom type'   either C, CA, N, O, ...

    Args:
        cif_file: Structure file handed to ``gemmi.read_structure``.
        ix: If not None, only the chain at this position inside each model
            is parsed; all other chains are skipped without being inspected.
        parse_skip: If True, the list of skipped residue indices is returned
            as a third element.

    Returns:
        ``(st, results)``, or ``(st, results, skips)`` when ``parse_skip`` is
        true, where

        * ``st`` is the structure object returned by gemmi,
        * ``results`` holds one 1-tuple ``(node_embeddings,)`` per parsed
          chain, ``node_embeddings`` being a NumPy array,
        * ``skips`` holds the within-chain index of every residue for which
          ``parse_residue_embed`` returned nothing. Indices from all parsed
          chains and models share this one flat list.

        If a parsed chain contains a DNA residue (DA, DC, DG or DT) the file
        is rejected and every element of the returned tuple is None; the
        tuple has the same length as the successful return value.
    """
    # Residue names that mark DNA; meeting one rejects the whole structure.
    dna_residue_names = ("DA", "DC", "DG", "DT")

    st = gemmi.read_structure(cif_file)

    results = []
    skips = []
    for model in st:
        for i, chain in enumerate(model):
            if ix is not None and ix != i:
                continue

            node_embeddings = []
            for j, residue in enumerate(chain):
                if residue.name not in residue_names:
                    if residue.name in dna_residue_names:
                        # Keep the arity of the failure value in step with
                        # the success value so callers that unpack three
                        # names with parse_skip=True do not crash here.
                        if parse_skip:
                            return None, None, None
                        return None, None
                    # Any other unknown residue is silently ignored.
                    continue

                # The residue object carries the residue identity and the
                # spatial coordinates of its atoms; parse_residue_embed turns
                # that into a dense encoding for feeding into a neural network.
                node_embed = parse_residue_embed(residue)

                # len() instead of truthiness, in case node_embed is a NumPy
                # array (whose truth value is ambiguous).
                if len(node_embed) == 0:
                    skips.append(j)

                node_embeddings.extend(node_embed)

            results.append((np.array(node_embeddings),))

    if parse_skip:
        return st, results, skips
    return st, results