def _convert_numpy_folder()

in src/data_preprocess/data_preprocess_for_rdrp_v2.py [0:0]


    def _convert_numpy_folder(self, idx):
        """Serialize one shard of proteins into a single TFRecord file.

        The shard is the slice ``self.prot_list[self.indices[idx][0]:self.indices[idx][1]]``.
        For every protein id in it, the sequence/label/source triple is read
        from ``self.dataset``; optional embedding features (when
        ``self.protein_2_embedding_idx`` is truthy) and optional structure
        features (when ``self.protein_2_pdb_idx`` is truthy) are loaded from
        disk, and the result of ``self._serialize_example`` is appended to
        ``<save_path>/<idx>-of-<num_shards>.tfrecords``. Proteins for which
        ``_serialize_example`` returns ``None`` are skipped.

        Args:
            idx: Shard index; used both to look up the slice bounds in
                ``self.indices`` and to build the output file name.

        Raises:
            AssertionError: If the sequence stored in an embedding file or a
                structure file, or the label stored in a structure file, does
                not match the dataset entry for that protein.
        """
        # exist_ok tolerates the directory appearing between a check and the
        # create (the previous exists()+makedirs() pair could raise there).
        os.makedirs(self.save_path, exist_ok=True)
        tfrecord_fn = os.path.join(self.save_path, '%0.2d-of-%0.2d.tfrecords' % (idx, self.num_shards))
        tmp_prot_list = self.prot_list[self.indices[idx][0]:self.indices[idx][1]]
        # Report the size of this shard, not of the whole protein list.
        print("### Serializing %d examples into %s" % (len(tmp_prot_list), tfrecord_fn))

        # The context manager closes (and thereby flushes) the writer even
        # when a protein fails to load or validate part-way through the shard.
        with tf.io.TFRecordWriter(tfrecord_fn) as writer:
            for i, protein_id in enumerate(tmp_prot_list):
                if i % 500 == 0:
                    print("### Iter = %d/%d" % (i, len(tmp_prot_list)))
                item = self.dataset[protein_id]
                protein_seq = item[0]
                protein_label = item[1]
                # item[2] (the protein source) is not used by the serializer.

                embedding_obj = None
                if self.protein_2_embedding_idx:
                    embedding_idx = self.protein_2_embedding_idx[protein_id]
                    embedding_file = os.path.join(self.embedding_dir, embedding_idx + '.pt')
                    # NOTE(review): torch.load unpickles the file; only use
                    # embedding files from a trusted source.
                    loaded_embedding = torch.load(embedding_file)
                    # Validate against the *loaded* object. The feature dict
                    # built below has no "seq" key, so checking it afterwards
                    # (as the code used to) always raised KeyError.
                    assert protein_seq == loaded_embedding["seq"], \
                        "sequence mismatch between dataset and embedding file for %r" % (protein_id,)
                    # 36 is the key selecting one entry of the per-layer
                    # containers — presumably the model's final layer; TODO confirm.
                    # embedding_size
                    bos_representations = loaded_embedding["bos_representations"][36].numpy()
                    # L * embedding_size
                    representations = loaded_embedding["representations"][36].numpy()
                    # L * L
                    contacts = loaded_embedding["contacts"].numpy()
                    embedding_obj = {
                        "L": ["int", representations.shape[0]],
                        "d": ["int", representations.shape[1]],
                        "bos_representations": ["float", bos_representations],
                        "representations": ["float", representations],
                        "contacts": ["float", contacts]
                    }

                pdb_obj = None
                if self.protein_2_pdb_idx:
                    pdb_idx = self.protein_2_pdb_idx[protein_id]
                    pdb_file = os.path.join(self.pdb_dir, pdb_idx + '.npz')
                    # NOTE(review): allow_pickle=True unpickles object arrays;
                    # only use structure files from a trusted source.
                    cmap = np.load(pdb_file, allow_pickle=True)
                    ca_dist_matrix = cmap['C_alpha']
                    cb_dist_matrix = cmap['C_beta']
                    assert protein_seq == str(cmap['seqres'].item()), \
                        "sequence mismatch between dataset and structure file for %r" % (protein_id,)
                    assert protein_label == cmap['label'].item(), \
                        "label mismatch between dataset and structure file for %r" % (protein_id,)
                    pdb_obj = {
                        "L": ["int", ca_dist_matrix.shape[0]],
                        "C_alpha_dist_matrix": ["float", ca_dist_matrix],
                        "C_beta_dist_matrix": ["float", cb_dist_matrix]
                    }

                example = self._serialize_example(protein_id, protein_seq, embedding_obj, pdb_obj, protein_label)
                if example is None:
                    # The serializer declined this protein; nothing to write.
                    continue
                writer.write(example)

        print("label size: %d" % len(self.label_2_id))
        print("Writing {} done!".format(tfrecord_fn))