def _create_feature_config()

in tzrec/tools/convert_easyrec_config_to_tzrec_config.py [0:0]


    def _create_feature_config(self, pipeline_config):
        """Create tzrec feature config.

        Iterates over ``self.easyrec_config.feature_configs`` and appends one
        converted tzrec ``FeatureConfig`` per convertible easyrec feature to
        ``pipeline_config.feature_configs``. Sequence features that belong to a
        group in ``self.sub_sequence_to_group`` are collected first and emitted
        afterwards as one grouped ``SequenceFeature`` per group, using the
        group description found in ``self.sequence_feature_to_fg``.

        Features that cannot be converted (unknown type, or no matching
        fg.json entry) are reported with ``logger.error`` and skipped.

        Args:
            pipeline_config: tzrec pipeline config message; its
                ``feature_configs`` field is appended to in place.

        Returns:
            The same ``pipeline_config`` object that was passed in.

        Raises:
            ValueError: if an input of a ComboFeature or LookupFeature has no
                entry in ``self.feature_to_fg``.
        """
        # An instance is only used to reach the FeatureType enum constants.
        easyrec_feature_config = easyrec_feature_config_pb2.FeatureConfig()  # NOQA
        # Feature types whose conversion reads the feature's own fg.json entry.
        fg_required_types = (
            easyrec_feature_config.IdFeature,
            easyrec_feature_config.TagFeature,
            easyrec_feature_config.RawFeature,
        )
        seq_group_cfg = OrderedDict()
        for cfg in self.easyrec_config.feature_configs:
            if cfg.feature_name:
                feature_name = cfg.feature_name
            else:
                feature_name = list(cfg.input_names)[0]
            input_names = cfg.input_names
            feature_type = cfg.feature_type

            # Reset on every iteration so a feature without an fg.json entry
            # can never pick up the entry of the previously processed feature.
            fg_json = None
            if feature_name in self.feature_to_fg:
                fg_json = self.feature_to_fg[feature_name]
            elif feature_name in self.sub_sequence_to_group:
                # Grouped sequence members are resolved via the group below.
                pass
            elif input_names and input_names[0] in self.feature_to_fg:
                fg_json = self.feature_to_fg[input_names[0]]
            else:
                logger.error(f"in easyrec config {feature_name} not in fg.json")

            if fg_json is None and feature_type in fg_required_types:
                # Without an fg.json entry there is no expression to emit.
                logger.error(
                    f"{feature_name} has no fg.json config, skip converting"
                )
                continue

            feature_config = None
            if feature_type in (
                easyrec_feature_config.IdFeature,
                easyrec_feature_config.TagFeature,
            ):
                # Both easyrec Id and Tag features map onto a tzrec IdFeature.
                feature_config = tzrec_feature_pb2.FeatureConfig()
                feature = tzrec_feature_pb2.IdFeature()
                feature.feature_name = feature_name
                feature.expression = fg_json["expression"]
                feature.embedding_dim = cfg.embedding_dim
                feature.hash_bucket_size = cfg.hash_bucket_size
                if (
                    feature_type == easyrec_feature_config.TagFeature
                    and cfg.HasField("kv_separator")
                ):
                    # A tag feature with a kv separator is marked as weighted.
                    feature.weighted = True
                feature_config.ClearField("feature")
                feature_config.id_feature.CopyFrom(feature)
            elif feature_type == easyrec_feature_config.SequenceFeature:
                if feature_name in self.sub_sequence_to_group:
                    # Defer: grouped sequences are emitted after this loop.
                    sequence_name = self.sub_sequence_to_group[feature_name]
                    seq_group_cfg.setdefault(sequence_name, []).append(cfg)
                elif feature_name in self.feature_to_fg:
                    feature_config = tzrec_feature_pb2.FeatureConfig()
                    expression = self.feature_to_fg[feature_name]["expression"]
                    if cfg.sub_feature_type == easyrec_feature_config.IdFeature:
                        feature = tzrec_feature_pb2.SequenceIdFeature()
                        feature.feature_name = feature_name
                        feature.expression = expression
                        feature.embedding_dim = cfg.embedding_dim
                        feature.hash_bucket_size = cfg.hash_bucket_size
                        feature_config.ClearField("feature")
                        feature_config.sequence_id_feature.CopyFrom(feature)
                    else:
                        feature = tzrec_feature_pb2.SequenceRawFeature()
                        feature.feature_name = feature_name
                        feature.expression = expression
                        feature.embedding_dim = cfg.embedding_dim
                        boundaries = list(cfg.boundaries)
                        if boundaries:
                            feature.boundaries.extend(boundaries)
                        feature_config.ClearField("feature")
                        feature_config.sequence_raw_feature.CopyFrom(feature)
                else:
                    logger.error(f"sequences feature: {feature_name} can't converted")
            elif feature_type == easyrec_feature_config.RawFeature:
                feature_config = tzrec_feature_pb2.FeatureConfig()
                boundaries = list(cfg.boundaries)
                if fg_json["feature_type"] == "lookup_feature":
                    # A raw feature backed by an fg lookup becomes a
                    # LookupFeature, taking map/key straight from fg.json.
                    feature = tzrec_feature_pb2.LookupFeature()
                    feature.feature_name = feature_name
                    feature.map = fg_json["map"]
                    feature.key = fg_json["key"]
                    feature.embedding_dim = cfg.embedding_dim
                    if boundaries:
                        feature.boundaries.extend(boundaries)
                    feature_config.ClearField("feature")
                    feature_config.lookup_feature.CopyFrom(feature)
                else:
                    feature = tzrec_feature_pb2.RawFeature()
                    feature.feature_name = feature_name
                    feature.expression = fg_json["expression"]
                    feature.embedding_dim = cfg.embedding_dim
                    if boundaries:
                        feature.boundaries.extend(boundaries)
                    feature_config.ClearField("feature")
                    feature_config.raw_feature.CopyFrom(feature)
            elif feature_type == easyrec_feature_config.ComboFeature:
                feature_config = tzrec_feature_pb2.FeatureConfig()
                feature = tzrec_feature_pb2.ComboFeature()
                feature.feature_name = feature_name
                # One expression per combined input, each taken from fg.json.
                for input_name in cfg.input_names:
                    if input_name not in self.feature_to_fg:
                        raise ValueError(
                            f"{cfg} input_names:{input_name} not in fg json"
                        )
                    feature.expression.append(
                        self.feature_to_fg[input_name]["expression"]
                    )
                feature.embedding_dim = cfg.embedding_dim
                feature.hash_bucket_size = cfg.hash_bucket_size
                feature_config.ClearField("feature")
                feature_config.combo_feature.CopyFrom(feature)
            elif feature_type == easyrec_feature_config.LookupFeature:
                feature_config = tzrec_feature_pb2.FeatureConfig()
                feature = tzrec_feature_pb2.LookupFeature()
                feature.feature_name = feature_name
                # The first input is used as the map, the second as the key.
                map_f = cfg.input_names[0]
                key_f = cfg.input_names[1]
                if map_f not in self.feature_to_fg:
                    raise ValueError(f"{cfg} input names: {map_f} not in fg.json")
                if key_f not in self.feature_to_fg:
                    raise ValueError(f"{cfg} input names: {key_f} not in fg.json")
                feature.map = self.feature_to_fg[map_f]["expression"]
                feature.key = self.feature_to_fg[key_f]["expression"]
                feature.embedding_dim = cfg.embedding_dim
                boundaries = list(cfg.boundaries)
                if boundaries:
                    feature.boundaries.extend(boundaries)
                feature_config.ClearField("feature")
                feature_config.lookup_feature.CopyFrom(feature)
            else:
                logger.error(f"{feature_name} can't converted")
            if feature_config is not None:
                pipeline_config.feature_configs.append(feature_config)

        # Emit one grouped SequenceFeature per collected sequence group.
        for seq_name, sub_cfgs in seq_group_cfg.items():
            sequence_fg = self.sequence_feature_to_fg[seq_name]
            feature_config = tzrec_feature_pb2.FeatureConfig()
            sequence_feature_config = tzrec_feature_pb2.SequenceFeature()
            sequence_feature_config.sequence_name = sequence_fg["sequence_name"]
            sequence_feature_config.sequence_length = sequence_fg["sequence_length"]
            sequence_feature_config.sequence_delim = sequence_fg["sequence_delim"]
            # Index the group's fg entries by "<seq_name>__<feature_name>",
            # which is the key the easyrec sub feature names are matched on.
            seq_feature_to_fg = {
                f"{seq_name}__{sub_fg['feature_name']}": sub_fg
                for sub_fg in sequence_fg["features"]
            }
            for cfg in sub_cfgs:
                feature_name = (
                    cfg.feature_name if cfg.feature_name else cfg.input_names[0]
                )
                if feature_name not in seq_feature_to_fg:
                    logger.error(
                        f"sequence feature: {feature_name} not config in fg.json"
                    )
                    continue
                seq_feature_fg = seq_feature_to_fg[feature_name]
                sub_feature_cfg = tzrec_feature_pb2.SeqFeatureConfig()
                if cfg.sub_feature_type == easyrec_feature_config.IdFeature:
                    feature = tzrec_feature_pb2.IdFeature()
                    feature.feature_name = seq_feature_fg["feature_name"]
                    feature.expression = seq_feature_fg["expression"]
                    feature.embedding_dim = cfg.embedding_dim
                    feature.hash_bucket_size = cfg.hash_bucket_size
                    sub_feature_cfg.ClearField("feature")
                    sub_feature_cfg.id_feature.CopyFrom(feature)
                else:
                    feature = tzrec_feature_pb2.RawFeature()
                    feature.feature_name = seq_feature_fg["feature_name"]
                    feature.expression = seq_feature_fg["expression"]
                    feature.embedding_dim = cfg.embedding_dim
                    boundaries = list(cfg.boundaries)
                    if boundaries:
                        feature.boundaries.extend(boundaries)
                    sub_feature_cfg.ClearField("feature")
                    sub_feature_cfg.raw_feature.CopyFrom(feature)
                sequence_feature_config.features.append(sub_feature_cfg)

            feature_config.sequence_feature.CopyFrom(sequence_feature_config)
            pipeline_config.feature_configs.append(feature_config)

        return pipeline_config