in tzrec/tools/convert_easyrec_config_to_tzrec_config.py [0:0]
def _create_feature_config(self, pipeline_config):
    """Create tzrec feature config.

    Iterates over ``self.easyrec_config.feature_configs`` and appends one
    converted tzrec ``FeatureConfig`` per convertible entry to
    ``pipeline_config.feature_configs``. Sequence sub-features listed in
    ``self.sub_sequence_to_group`` are collected per sequence group and
    emitted afterwards as one grouped ``SequenceFeature`` each.

    Features that cannot be converted (unsupported type, or no usable entry
    in ``self.feature_to_fg``) are logged with ``logger.error`` and skipped.

    Args:
        pipeline_config: config object whose ``feature_configs`` field is
            appended to in place (presumably the tzrec pipeline config
            message -- confirm against the caller).

    Returns:
        The same ``pipeline_config`` object that was passed in.

    Raises:
        ValueError: if an input name of a ComboFeature or LookupFeature has
            no entry in ``self.feature_to_fg``.
    """
    # An instance is only used as a handle to reach the feature type enum values.
    easyrec_feature_config = easyrec_feature_config_pb2.FeatureConfig()  # NOQA
    # Feature types whose conversion reads the feature's own fg.json entry.
    fg_required_types = (
        easyrec_feature_config.IdFeature,
        easyrec_feature_config.TagFeature,
        easyrec_feature_config.RawFeature,
    )
    seq_group_cfg = OrderedDict()
    for cfg in self.easyrec_config.feature_configs:
        input_names = cfg.input_names
        feature_name = cfg.feature_name if cfg.feature_name else input_names[0]
        feature_type = cfg.feature_type

        # Reset on every iteration so a feature without an fg.json entry can
        # never pick up the entry resolved for the previous feature.
        fg_json = None
        if feature_name in self.feature_to_fg:
            fg_json = self.feature_to_fg[feature_name]
        elif feature_name in self.sub_sequence_to_group:
            # Grouped sequence sub-features are resolved in the second pass.
            pass
        elif len(input_names) > 0 and input_names[0] in self.feature_to_fg:
            fg_json = self.feature_to_fg[input_names[0]]
        else:
            logger.error(f"in easyrec config {feature_name} not in fg.json")

        if feature_type in fg_required_types and fg_json is None:
            # Without an fg.json entry there is no expression to convert.
            logger.error(f"{feature_name} can't converted: no fg.json entry")
            continue

        feature_config = None
        if feature_type == easyrec_feature_config.IdFeature:
            feature_config = tzrec_feature_pb2.FeatureConfig()
            feature = tzrec_feature_pb2.IdFeature()
            feature.feature_name = feature_name
            feature.expression = fg_json["expression"]
            feature.embedding_dim = cfg.embedding_dim
            feature.hash_bucket_size = cfg.hash_bucket_size
            feature_config.ClearField("feature")
            feature_config.id_feature.CopyFrom(feature)
        elif feature_type == easyrec_feature_config.TagFeature:
            # Tag features are converted to id features; a kv_separator on
            # the easyrec side turns on the weighted flag.
            feature_config = tzrec_feature_pb2.FeatureConfig()
            feature = tzrec_feature_pb2.IdFeature()
            feature.feature_name = feature_name
            feature.expression = fg_json["expression"]
            feature.embedding_dim = cfg.embedding_dim
            feature.hash_bucket_size = cfg.hash_bucket_size
            if cfg.HasField("kv_separator"):
                feature.weighted = True
            feature_config.ClearField("feature")
            feature_config.id_feature.CopyFrom(feature)
        elif feature_type == easyrec_feature_config.SequenceFeature:
            if feature_name in self.sub_sequence_to_group:
                # Defer: grouped sub-features are emitted together below.
                sequence_name = self.sub_sequence_to_group[feature_name]
                seq_group_cfg.setdefault(sequence_name, []).append(cfg)
            elif feature_name in self.feature_to_fg:
                expression = self.feature_to_fg[feature_name]["expression"]
                feature_config = tzrec_feature_pb2.FeatureConfig()
                if cfg.sub_feature_type == easyrec_feature_config.IdFeature:
                    feature = tzrec_feature_pb2.SequenceIdFeature()
                    feature.feature_name = feature_name
                    feature.expression = expression
                    feature.embedding_dim = cfg.embedding_dim
                    feature.hash_bucket_size = cfg.hash_bucket_size
                    feature_config.ClearField("feature")
                    feature_config.sequence_id_feature.CopyFrom(feature)
                else:
                    feature = tzrec_feature_pb2.SequenceRawFeature()
                    feature.feature_name = feature_name
                    feature.expression = expression
                    feature.embedding_dim = cfg.embedding_dim
                    boundaries = list(cfg.boundaries)
                    if boundaries:
                        feature.boundaries.extend(boundaries)
                    feature_config.ClearField("feature")
                    feature_config.sequence_raw_feature.CopyFrom(feature)
            else:
                logger.error(f"sequences feature: {feature_name} can't converted")
        elif feature_type == easyrec_feature_config.RawFeature:
            feature_config = tzrec_feature_pb2.FeatureConfig()
            boundaries = list(cfg.boundaries)
            if fg_json["feature_type"] == "lookup_feature":
                # A raw feature backed by a lookup fg entry becomes a
                # tzrec lookup feature using the entry's map/key.
                feature = tzrec_feature_pb2.LookupFeature()
                feature.feature_name = feature_name
                feature.map = fg_json["map"]
                feature.key = fg_json["key"]
                feature.embedding_dim = cfg.embedding_dim
                if boundaries:
                    feature.boundaries.extend(boundaries)
                feature_config.ClearField("feature")
                feature_config.lookup_feature.CopyFrom(feature)
            else:
                feature = tzrec_feature_pb2.RawFeature()
                feature.feature_name = feature_name
                feature.expression = fg_json["expression"]
                feature.embedding_dim = cfg.embedding_dim
                if boundaries:
                    feature.boundaries.extend(boundaries)
                feature_config.ClearField("feature")
                feature_config.raw_feature.CopyFrom(feature)
        elif feature_type == easyrec_feature_config.ComboFeature:
            feature_config = tzrec_feature_pb2.FeatureConfig()
            feature = tzrec_feature_pb2.ComboFeature()
            feature.feature_name = feature_name
            # Every combined input must have its own fg.json expression.
            for input_name in input_names:
                if input_name not in self.feature_to_fg:
                    raise ValueError(
                        f"{cfg} input_names:{input_name} not in fg json"
                    )
                feature.expression.append(
                    self.feature_to_fg[input_name]["expression"]
                )
            feature.embedding_dim = cfg.embedding_dim
            feature.hash_bucket_size = cfg.hash_bucket_size
            feature_config.ClearField("feature")
            feature_config.combo_feature.CopyFrom(feature)
        elif feature_type == easyrec_feature_config.LookupFeature:
            feature_config = tzrec_feature_pb2.FeatureConfig()
            feature = tzrec_feature_pb2.LookupFeature()
            feature.feature_name = feature_name
            # The first input is the map side, the second the key side.
            map_f = input_names[0]
            key_f = input_names[1]
            if map_f not in self.feature_to_fg:
                raise ValueError(f"{cfg} input names: {map_f} not in fg.json")
            if key_f not in self.feature_to_fg:
                raise ValueError(f"{cfg} input names: {key_f} not in fg.json")
            feature.map = self.feature_to_fg[map_f]["expression"]
            feature.key = self.feature_to_fg[key_f]["expression"]
            feature.embedding_dim = cfg.embedding_dim
            boundaries = list(cfg.boundaries)
            if boundaries:
                feature.boundaries.extend(boundaries)
            feature_config.ClearField("feature")
            feature_config.lookup_feature.CopyFrom(feature)
        else:
            logger.error(f"{feature_name} can't converted")

        if feature_config is not None:
            pipeline_config.feature_configs.append(feature_config)

    # Second pass: emit one grouped SequenceFeature per sequence group.
    for seq_name, sub_cfgs in seq_group_cfg.items():
        sequence_fg = self.sequence_feature_to_fg[seq_name]
        feature_config = tzrec_feature_pb2.FeatureConfig()
        sequence_feature_config = tzrec_feature_pb2.SequenceFeature()
        sequence_feature_config.sequence_name = sequence_fg["sequence_name"]
        sequence_feature_config.sequence_length = sequence_fg["sequence_length"]
        sequence_feature_config.sequence_delim = sequence_fg["sequence_delim"]
        # Sub-features are looked up by "<sequence name>__<sub-feature name>".
        seq_feature_to_fg = {
            f"{seq_name}__{fg_feature['feature_name']}": fg_feature
            for fg_feature in sequence_fg["features"]
        }
        for cfg in sub_cfgs:
            feature_name = (
                cfg.feature_name if cfg.feature_name else cfg.input_names[0]
            )
            if feature_name not in seq_feature_to_fg:
                logger.error(
                    f"sequence feature: {feature_name} not config in fg.json"
                )
                continue
            seq_feature_fg = seq_feature_to_fg[feature_name]
            sub_feature_cfg = tzrec_feature_pb2.SeqFeatureConfig()
            if cfg.sub_feature_type == easyrec_feature_config.IdFeature:
                feature = tzrec_feature_pb2.IdFeature()
                feature.feature_name = seq_feature_fg["feature_name"]
                feature.expression = seq_feature_fg["expression"]
                feature.embedding_dim = cfg.embedding_dim
                feature.hash_bucket_size = cfg.hash_bucket_size
                sub_feature_cfg.ClearField("feature")
                sub_feature_cfg.id_feature.CopyFrom(feature)
            else:
                feature = tzrec_feature_pb2.RawFeature()
                feature.feature_name = seq_feature_fg["feature_name"]
                feature.expression = seq_feature_fg["expression"]
                feature.embedding_dim = cfg.embedding_dim
                boundaries = list(cfg.boundaries)
                if boundaries:
                    feature.boundaries.extend(boundaries)
                sub_feature_cfg.ClearField("feature")
                sub_feature_cfg.raw_feature.CopyFrom(feature)
            sequence_feature_config.features.append(sub_feature_cfg)
        feature_config.sequence_feature.CopyFrom(sequence_feature_config)
        pipeline_config.feature_configs.append(feature_config)
    return pipeline_config