easy_rec/python/tools/feature_selection.py

"""Feature selection tool based on variational dropout.

Reads the learned `logit_p` of the VariationalDropout layer from a trained
checkpoint, converts it into per-feature (or per-embedding-dimension) dropout
probabilities, keeps the topk most important features of each feature group,
and rewrites the pipeline config (and optional fg config) accordingly.
"""
from __future__ import division
from __future__ import print_function

import json
import os
import sys
from collections import OrderedDict

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.framework.meta_graph import read_meta_graph_file

from easy_rec.python.utils import config_util
from easy_rec.python.utils import io_util

if tf.__version__ >= '2.0':
  tf = tf.compat.v1

import matplotlib  # NOQA
matplotlib.use('Agg')  # NOQA
import matplotlib.pyplot as plt  # NOQA

tf.app.flags.DEFINE_string('model_type', 'variational_dropout',
                           'feature selection model type')
tf.app.flags.DEFINE_string('config_path', '',
                           'feature selection model config path')
tf.app.flags.DEFINE_string('checkpoint_path', None,
                           'feature selection model checkpoint path')
tf.app.flags.DEFINE_string('output_dir', '',
                           'feature selection result directory')
tf.app.flags.DEFINE_integer(
    'topk', 100, 'select topk importance features for each feature group')
tf.app.flags.DEFINE_string('fg_path', '', 'fg config path')
tf.app.flags.DEFINE_bool('visualize', False,
                         'visualize feature selection result or not')

FLAGS = tf.app.flags.FLAGS


class VariationalDropoutFS:

  def __init__(self,
               config_path,
               output_dir,
               topk,
               checkpoint_path=None,
               fg_path=None,
               visualize=False):
    self._config_path = config_path
    self._output_dir = output_dir
    self._topk = topk
    if not tf.gfile.Exists(self._output_dir):
      tf.gfile.MakeDirs(self._output_dir)
    self._checkpoint_path = checkpoint_path
    self._fg_path = fg_path
    self._visualize = visualize

  def process(self):
    tf.logging.info('Loading logit_p of VariationalDropout layer ...')
    feature_dim_dropout_p_map, embedding_wise_variational_dropout = \
        self._feature_dim_dropout_ratio()

    feature_importance_map = {}
    for group_name, feature_dim_dropout_p in feature_dim_dropout_p_map.items():
      tf.logging.info('Calculating %s feature importance ...' % group_name)
      feature_importance = self._get_feature_importance(
          feature_dim_dropout_p, embedding_wise_variational_dropout)
      feature_importance_map[group_name] = feature_importance

      tf.logging.info('Dump %s feature importance to csv ...' % group_name)
      self._dump_to_csv(feature_importance, group_name)

      if self._visualize:
        tf.logging.info('Visualizing %s feature importance ...' % group_name)
        if embedding_wise_variational_dropout:
          self._visualize_embedding_dim_importance(feature_dim_dropout_p)
        self._visualize_feature_importance(feature_importance, group_name)

    tf.logging.info('Processing model config ...')
    self._process_config(feature_importance_map)

  def _feature_dim_dropout_ratio(self):
    """Get dropout ratios, either embedding-wise or feature-wise."""
    config = config_util.get_configs_from_pipeline_file(self._config_path)
    assert config.model_config.HasField(
        'variational_dropout'), 'variational_dropout must be in model_config'
    embedding_wise_variational_dropout = \
        config.model_config.variational_dropout.embedding_wise_variational_dropout

    if self._checkpoint_path is None or len(self._checkpoint_path) == 0:
      checkpoint_path = tf.train.latest_checkpoint(config.model_dir)
    else:
      checkpoint_path = self._checkpoint_path

    # the feature dimensions of each feature group are stored in the
    # `variational_dropout` collection of the checkpoint meta graph
    meta_graph_def = read_meta_graph_file(checkpoint_path + '.meta')
    features_dimension_map = dict()
    for col_def in meta_graph_def.collection_def[
        'variational_dropout'].bytes_list.value:
      name, features_dimension = json.loads(col_def)
      name = 'all' if name == '' else name
      features_dimension_map[name] = OrderedDict(features_dimension)

    tf.logging.info('Reading checkpoint from %s ...' % checkpoint_path)
    reader = tf.train.NewCheckpointReader(checkpoint_path)

    feature_dim_dropout_p_map = {}
    for feature_group in config.model_config.feature_groups:
      group_name = feature_group.group_name
      logit_p_name = 'logit_p' if group_name == 'all' else 'logit_p_%s' % group_name
      try:
        logit_p = reader.get_tensor(logit_p_name)
      except Exception:
        print('get `logit_p` failed, try to get `backbone/logit_p`')
        logit_p = reader.get_tensor('backbone/' + logit_p_name)

      # dropout probability p = sigmoid(logit_p); the lower the probability,
      # the more important the feature (or embedding dimension)
      feature_dims_importance = tf.sigmoid(logit_p)
      with tf.Session() as sess:
        feature_dims_importance = feature_dims_importance.eval(session=sess)

      feature_dim_dropout_p = {}
      if embedding_wise_variational_dropout:
        # slice out the dropout probabilities of each feature's embedding dims
        index_end = 0
        for feature_name, feature_dim in features_dimension_map[
            group_name].items():
          index_start = index_end
          index_end = index_start + feature_dim
          feature_dim_dropout_p[feature_name] = feature_dims_importance[
              index_start:index_end]
      else:
        index = 0
        for feature_name in features_dimension_map[group_name].keys():
          feature_dim_dropout_p[feature_name] = feature_dims_importance[index]
          index += 1
      feature_dim_dropout_p_map[group_name] = feature_dim_dropout_p

    return feature_dim_dropout_p_map, embedding_wise_variational_dropout

  def _get_feature_importance(self, feature_dim_dropout_p,
                              embedding_wise_variational_dropout):
    """Calculate feature importance, sorted by ascending dropout probability."""
    if embedding_wise_variational_dropout:
      # average the dropout probabilities over all embedding dimensions
      feature_importance = {}
      for item in feature_dim_dropout_p.items():
        dropout_rate_mean = np.mean(item[1])
        feature_importance[item[0]] = dropout_rate_mean
      feature_importance = OrderedDict(
          sorted(feature_importance.items(), key=lambda e: e[1]))
    else:
      feature_importance = OrderedDict(
          sorted(feature_dim_dropout_p.items(), key=lambda e: e[1]))
    return feature_importance

  def _process_config(self, feature_importance_map):
    """Process model config and fg config with feature selection."""
    # features beyond the topk most important ones are candidates for removal
    excluded_features = set()
    for group_name, feature_importance in feature_importance_map.items():
      for i, (feature_name, _) in enumerate(feature_importance.items()):
        if i >= self._topk:
          excluded_features.add(feature_name)

    config = config_util.get_configs_from_pipeline_file(self._config_path)

    # keep sequence features and side-infos
    sequence_features = set()
    for feature_group in config.model_config.feature_groups:
      for sequence_feature in feature_group.sequence_features:
        for seq_att_map in sequence_feature.seq_att_map:
          for key in seq_att_map.key:
            sequence_features.add(key)
          for hist_seq in seq_att_map.hist_seq:
            sequence_features.add(hist_seq)
    # compat with din
    for sequence_feature in config.model_config.seq_att_groups:
      for seq_att_map in sequence_feature.seq_att_map:
        for key in seq_att_map.key:
          sequence_features.add(key)
        for hist_seq in seq_att_map.hist_seq:
          sequence_features.add(hist_seq)
    excluded_features = excluded_features - sequence_features

    feature_configs = []
    for feature_config in config_util.get_compatible_feature_configs(config):
      feature_name = feature_config.feature_name if feature_config.HasField(
          'feature_name') else feature_config.input_names[0]
      if feature_name not in excluded_features:
        feature_configs.append(feature_config)
    if config.feature_configs:
      config.ClearField('feature_configs')
      config.feature_configs.extend(feature_configs)
    else:
      config.feature_config.ClearField('features')
      config.feature_config.features.extend(feature_configs)

    for feature_group in config.model_config.feature_groups:
      feature_names = []
      for feature_name in feature_group.feature_names:
        if feature_name not in excluded_features:
          feature_names.append(feature_name)
      feature_group.ClearField('feature_names')
      feature_group.feature_names.extend(feature_names)

    config_util.save_message(
        config, os.path.join(self._output_dir,
                             os.path.basename(self._config_path)))

    if self._fg_path is not None and len(self._fg_path) > 0:
      with tf.gfile.Open(self._fg_path) as f:
        fg_json = json.load(f, object_pairs_hook=OrderedDict)
      features = []
      for feature in fg_json['features']:
        if 'feature_name' in feature:
          if feature['feature_name'] not in excluded_features:
            features.append(feature)
        else:
          features.append(feature)
      fg_json['features'] = features
      with tf.gfile.Open(
          os.path.join(self._output_dir, os.path.basename(self._fg_path)),
          'w') as f:
        json.dump(fg_json, f, indent=4)

  def _dump_to_csv(self, feature_importance, group_name):
    """Dump feature importance data to a csv file."""
    with tf.gfile.Open(
        os.path.join(self._output_dir,
                     'feature_dropout_ratio_%s.csv' % group_name), 'w') as f:
      df = pd.DataFrame(
          columns=['feature_name', 'mean_drop_p'],
          data=[list(kv) for kv in feature_importance.items()])
      df.to_csv(f, encoding='gbk')

  def _visualize_embedding_dim_importance(self, feature_dim_dropout_p):
    """Visualize embedding-wise importance for every feature."""
    output_dir = os.path.join(self._output_dir, 'feature_dims_importance_pics')
    if not tf.gfile.Exists(output_dir):
      tf.gfile.MakeDirs(output_dir)
    plt.rcdefaults()
    for feature_name, feature_dropout_p in feature_dim_dropout_p.items():
      embedding_len = len(feature_dropout_p)
      embedding_dims = []
      for i in range(embedding_len):
        embedding_dims.append('dim_' + str(i + 1))
      y_pos = np.arange(len(embedding_dims))
      performance_list = []
      for i in range(0, embedding_len):
        performance_list.append(feature_dropout_p[i])
      fig, ax = plt.subplots()
      b = ax.barh(
          y_pos,
          performance_list,
          align='center',
          alpha=0.4,
          label='dropout_rate',
          lw=1)
      # annotate each bar with its dropout probability
      for rect in b:
        w = rect.get_width()
        ax.text(
            w,
            rect.get_y() + rect.get_height() / 2,
            '%.4f' % w,
            ha='left',
            va='center')
      plt.yticks(y_pos, embedding_dims)
      plt.xlabel(feature_name)
      plt.title('Dropout ratio')
      img_path = os.path.join(output_dir, feature_name + '.png')
      with tf.gfile.GFile(img_path, 'wb') as f:
        plt.savefig(f, format='png')

  def _visualize_feature_importance(self, feature_importance, group_name):
    """Draw feature importance histogram."""
    df = pd.DataFrame(
        columns=['feature_name', 'mean_drop_p'],
        data=[list(kv) for kv in feature_importance.items()])
    df['color'] = ['red' if x < 0.5 else 'green' for x in df['mean_drop_p']]
    df.sort_values('mean_drop_p', inplace=True, ascending=False)
    df.reset_index(inplace=True)

    # Draw plot
    plt.figure(figsize=(90, 200), dpi=100)
    plt.hlines(y=df.index, xmin=0, xmax=df.mean_drop_p)
    for x, y, tex in zip(df.mean_drop_p, df.index, df.mean_drop_p):
      plt.text(
          x,
          y,
          round(tex, 2),
          horizontalalignment='right' if x < 0 else 'left',
          verticalalignment='center',
          fontdict={
              'color': 'red' if x < 0 else 'green',
              'size': 14
          })

    # Decorations
    plt.yticks(df.index, df.feature_name, fontsize=20)
    plt.title('Dropout Ratio', fontdict={'size': 30})
    plt.grid(linestyle='--', alpha=0.5)
    plt.xlim(0, 1)
    with tf.gfile.GFile(
        os.path.join(self._output_dir,
                     'feature_dropout_pic_%s.png' % group_name), 'wb') as f:
      plt.savefig(f, format='png')


if __name__ == '__main__':
  sys.argv = io_util.filter_unknown_args(FLAGS, sys.argv)
  if FLAGS.model_type == 'variational_dropout':
    fs = VariationalDropoutFS(
        FLAGS.config_path,
        FLAGS.output_dir,
        FLAGS.topk,
        checkpoint_path=FLAGS.checkpoint_path,
        fg_path=FLAGS.fg_path,
        visualize=FLAGS.visualize)
    fs.process()
  else:
    raise ValueError('Unknown feature selection model type %s' %
                     FLAGS.model_type)
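
# Example usage (a sketch, not part of the tool itself; all paths below are
# hypothetical placeholders):
#
#   python -m easy_rec.python.tools.feature_selection \
#     --model_type variational_dropout \
#     --config_path path/to/pipeline.config \
#     --checkpoint_path path/to/model.ckpt-1000 \
#     --output_dir path/to/fs_output \
#     --topk 100 \
#     --fg_path path/to/fg.json \
#     --visualize
#
# If --checkpoint_path is omitted, the latest checkpoint under the config's
# model_dir is used. The tool writes a rewritten pipeline config (and fg json,
# if --fg_path is set) to --output_dir, keeping only the topk lowest-dropout
# features per feature group, plus feature_dropout_ratio_<group>.csv with the
# mean dropout probability of each feature (lower means more important).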