in src/data_preprocess/data_preprocess_for_rdrp_v2.py [0:0]
def _convert_numpy_folder(self, idx):
    """Serialize one shard of the protein list into a TFRecord file.

    The shard covers ``self.prot_list[self.indices[idx][0]:self.indices[idx][1]]``
    and is written to ``<save_path>/<idx>-of-<num_shards>.tfrecords``.
    For every protein the sequence and label are read from ``self.dataset``;
    when ``self.protein_2_embedding_idx`` / ``self.protein_2_pdb_idx`` are
    non-empty, the matching ``.pt`` embedding file and ``.npz`` distance-matrix
    file are loaded and passed to ``self._serialize_example`` as well.
    Proteins for which ``_serialize_example`` returns ``None`` are skipped.

    Args:
        idx: Shard index; selects the slice bounds in ``self.indices`` and is
            used in the output file name.

    Raises:
        AssertionError: If the sequence (or label) stored in an embedding or
            pdb file disagrees with the one in ``self.dataset``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(self.save_path, exist_ok=True)
    tfrecord_fn = os.path.join(self.save_path, '%0.2d-of-%0.2d.tfrecords' % (idx, self.num_shards))

    shard_start, shard_end = self.indices[idx][0], self.indices[idx][1]
    tmp_prot_list = self.prot_list[shard_start:shard_end]
    # Report the size of this shard, not of the whole protein list.
    print("### Serializing %d examples into %s" % (len(tmp_prot_list), tfrecord_fn))

    # The context manager guarantees the writer is flushed and closed, even
    # when an assertion or a load error interrupts the loop.
    with tf.io.TFRecordWriter(tfrecord_fn) as writer:
        for i, protein_id in enumerate(tmp_prot_list):
            if i % 500 == 0:
                print("### Iter = %d/%d" % (i, len(tmp_prot_list)))
            item = self.dataset[protein_id]
            protein_seq = item[0]
            protein_label = item[1]

            embedding_obj = None
            if self.protein_2_embedding_idx:
                embedding_idx = self.protein_2_embedding_idx[protein_id]
                embedding_file = os.path.join(self.embedding_dir, embedding_idx + '.pt')
                # NOTE(security): torch.load unpickles the file content; only
                # use embedding files from a trusted source.
                loaded_embedding = torch.load(embedding_file)
                # Validate against the *loaded* object: the dict built below
                # has no "seq" key, so checking it there always raised KeyError.
                # Assumes the saved embedding stores the sequence under "seq"
                # (as the original assertion intended) - TODO confirm.
                assert protein_seq == loaded_embedding["seq"], \
                    "sequence mismatch between dataset and embedding file for %s" % protein_id
                # Entry 36 is presumably the model layer whose outputs were
                # saved - verify against the embedding-generation script.
                # embedding_size
                bos_representations = loaded_embedding["bos_representations"][36].numpy()
                # L * embedding_size
                representations = loaded_embedding["representations"][36].numpy()
                # L * L
                contacts = loaded_embedding["contacts"].numpy()
                embedding_obj = {
                    "L": ["int", representations.shape[0]],
                    "d": ["int", representations.shape[1]],
                    "bos_representations": ["float", bos_representations],
                    "representations": ["float", representations],
                    "contacts": ["float", contacts]
                }

            pdb_obj = None
            if self.protein_2_pdb_idx:
                pdb_idx = self.protein_2_pdb_idx[protein_id]
                pdb_file = os.path.join(self.pdb_dir, pdb_idx + '.npz')
                # NOTE(security): allow_pickle=True unpickles object arrays;
                # only use pdb files from a trusted source.
                # The with-block closes the archive so file handles do not
                # accumulate over the shard.
                with np.load(pdb_file, allow_pickle=True) as cmap:
                    ca_dist_matrix = cmap['C_alpha']
                    cb_dist_matrix = cmap['C_beta']
                    pdb_seq = str(cmap['seqres'].item())
                    pdb_label = cmap['label'].item()
                assert protein_seq == pdb_seq, \
                    "sequence mismatch between dataset and pdb file for %s" % protein_id
                assert protein_label == pdb_label, \
                    "label mismatch between dataset and pdb file for %s" % protein_id
                pdb_obj = {
                    "L": ["int", ca_dist_matrix.shape[0]],
                    "C_alpha_dist_matrix": ["float", ca_dist_matrix],
                    "C_beta_dist_matrix": ["float", cb_dist_matrix]
                }

            example = self._serialize_example(protein_id, protein_seq, embedding_obj, pdb_obj, protein_label)
            if example is None:
                continue
            writer.write(example)

    print("label size: %d" % len(self.label_2_id))
    print("Writing {} done!".format(tfrecord_fn))