in src/data_preprocess/data_preprocess_into_tfrecords_for_rdrp.py [0:0]
def _serialize_example(self, obj_id, sequence, pdb_obj, embedding_obj, label):
    """Build one ``tf.train.Example`` for a sequence and return it serialized.

    Args:
        obj_id: Identifier of the record; ``.encode()`` is called on it.
        sequence: The sequence string; stored encoded under ``'seq'`` and
            its length under ``'L'``.
        pdb_obj: Optional mapping ``name -> (dtype, value)``. When truthy it
            must contain an ``"L"`` entry, whose value is stored as
            ``'pdb_l'``; every entry is also stored under its own name.
        embedding_obj: Optional mapping ``name -> (dtype, value)``. When
            truthy it must contain ``"L"`` and ``"d"`` entries, stored as
            ``'emb_l'`` and ``'emb_size'``; every entry is also stored under
            its own name.
        label: A single label, or a ``list``/``set`` of labels. Each label
            is translated through ``self.label_2_id``.

    Returns:
        The result of ``Example.SerializeToString()``, or ``None`` when
        ``label`` is ``None`` or an empty sized object (``[]``, ``set()``,
        ``""`` ...).

    Raises:
        Whatever the ``self.label_2_id`` lookup raises for an unknown label
        (``KeyError`` for a plain dict), and ``KeyError`` when a truthy
        ``embedding_obj``/``pdb_obj`` lacks its required entries.
    """
    d_feature = {
        'id': self._bytes_feature(obj_id.encode()),
        'seq': self._bytes_feature(sequence.encode()),
        'L': self._int_feature(len(sequence)),
    }

    # Examples without a label are skipped by returning None.
    if label is None:
        return None
    # Only sized labels can be "empty". Scalar labels such as an int class
    # id have no len(); they must fall through to the lookup below instead
    # of failing here with a TypeError.
    if hasattr(label, '__len__') and len(label) == 0:
        return None

    if isinstance(label, (list, set)):
        # Multi-label example: one id per label.
        label_ids = [self.label_2_id[v] for v in label]
        d_feature['label'] = self._int_feature(label_ids)
    else:
        d_feature['label'] = self._int_feature(self.label_2_id[label])

    if embedding_obj:
        d_feature['emb_l'] = self._int_feature(embedding_obj["L"][1])
        d_feature['emb_size'] = self._int_feature(embedding_obj["d"][1])
        # NOTE(review): entries are copied under their own names, so the
        # "L" entry replaces the sequence-length 'L' feature set above.
        # This matches the previous behaviour; confirm it is intended.
        self._add_typed_features(d_feature, embedding_obj)

    if pdb_obj:
        d_feature['pdb_l'] = self._int_feature(pdb_obj["L"][1])
        # Same copy rule as for the embedding: "L" is overwritten again.
        self._add_typed_features(d_feature, pdb_obj)

    example = tf.train.Example(features=tf.train.Features(feature=d_feature))
    return example.SerializeToString()


def _add_typed_features(self, d_feature, typed_obj):
    """Copy every ``name -> (dtype, value)`` entry of *typed_obj* into *d_feature*.

    Arrays are flattened to a list and bare ``int``/``float``/``str`` values
    are wrapped in a one-element list before conversion. ``dtype`` selects
    the converter: ``"str"`` -> bytes feature, ``"int"`` -> int feature,
    anything else -> float feature. *d_feature* is modified in place.
    """
    for name, spec in typed_obj.items():
        dtype = spec[0]
        value = spec[1]
        if isinstance(value, np.ndarray):
            # Features are flat lists; the array's shape is dropped here.
            value = list(value.reshape(-1))
        elif isinstance(value, (int, float, str)):
            value = [value]

        if dtype == "str":
            d_feature[name] = self._bytes_feature(value)
        elif dtype == "int":
            d_feature[name] = self._int_feature(value)
        else:
            d_feature[name] = self._float_feature(value)