in tfx_addons/feature_selection/component.py [0:0]
@component
def FeatureSelection(  # pylint: disable=C0103
    module_file: Parameter[str], orig_examples: InputArtifact[Examples],
    feature_selection: OutputArtifact[FeatureSelectionArtifact],
    updated_data: OutputArtifact[Examples]):
"""Feature Selection component
Args (from the module file):
- SELECTOR_PARAMS: Parameters for SelectorFunc in the form of
a kwargs dictionary
- TARGET_FEATURE: Name of the feature containing target data
- SelectorFunc: Selector function for univariate feature selection
example: SelectKBest, SelectPercentile from sklearn.feature_selection
"""
  # import the user-provided selector configuration from the module file
  module = importlib.import_module(module_file)
  selector_params, target_feature, selector_func = [
      getattr(module, name)
      for name in ("SELECTOR_PARAMS", "TARGET_FEATURE", "SelectorFunc")
  ]
  # the selector is fitted on the train split only
  train_uri = artifact_utils.get_split_uri([orig_examples], 'train')
  np_dataset = _get_data_from_tfrecords(train_uri)
  feature_keys, target_data, input_data = _data_preprocessing(
      np_dataset, target_feature)
  # fit the selector; the support mask, scores, and p-values are read off
  # the fitted selector below, so the transformed matrix is not needed
  selector = selector_func(**selector_params)
  selector.fit(input_data, target_data)
  # copy split names and span from the input Examples artifact to the output
  updated_data.split_names = orig_examples.split_names
  updated_data.span = orig_examples.span
  # map the selector's support mask back to feature names; compute the
  # selected indices once rather than on every iteration
  selected_indices = set(selector.get_support(indices=True))
  selected_features = [
      val for (idx, val) in enumerate(feature_keys) if idx in selected_indices
  ]
  # split_names is stored as a JSON-encoded list, e.g. '["train", "eval"]'
  split_arr = json.loads(orig_examples.split_names)
  # rewrite every split, keeping only the selected features in each Example
  for split in split_arr:
    split_uri = artifact_utils.get_split_uri([orig_examples], split)
    new_split_uri = artifact_utils.get_split_uri([updated_data], split)
    os.makedirs(new_split_uri, exist_ok=True)
    for file_name in _get_file_list(split_uri):
      split_dataset = tf.data.TFRecordDataset(os.path.join(
          split_uri, file_name), compression_type='GZIP')
      # re-serialize each record with only the selected features kept
      with tf.io.TFRecordWriter(path=os.path.join(new_split_uri, file_name),
                                options="GZIP") as writer:
        for split_record in split_dataset.as_numpy_iterator():
          example = tf.train.Example()
          example.ParseFromString(split_record)
          updated_example = _update_example(selected_features, example)
          writer.write(updated_example.SerializeToString())
  # merge the selector's per-feature scores and p-values with feature names
  # note: `pvalues_` is None for score functions that return only scores
  # (e.g. mutual_info_classif); such selectors would need a guard here
  selector_scores_dict = dict(zip(feature_keys, selector.scores_))
  selector_pvalues_dict = dict(zip(feature_keys, selector.pvalues_))
  # populate the output artifact with scores, p-values, and the names of
  # the selected features
  feature_selection.scores = selector_scores_dict
  feature_selection.p_values = selector_pvalues_dict
  feature_selection.selected_features = selected_features
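
# a sketch of wiring this component into a pipeline (assumes an upstream
# ExampleGen component named `example_gen` and a hypothetical module path
# "models.feature_selection_module"; with the decorator-based component API,
# outputs are keyed by the parameter names):
#
#   feature_selector = FeatureSelection(
#       module_file="models.feature_selection_module",
#       orig_examples=example_gen.outputs["examples"])
#   # feature_selector.outputs["feature_selection"] carries scores/p-values;
#   # feature_selector.outputs["updated_data"] is the filtered Examples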