in tfx_addons/feature_selection/component.py [0:0]
@component
def FeatureSelection(  # pylint: disable=C0103
    module_file: Parameter[str], orig_examples: InputArtifact[Examples],
    feature_selection: OutputArtifact[FeatureSelectionArtifact],
    updated_data: OutputArtifact[Examples]):
"""Feature Selection component
Args (from the module file):
- SELECTOR_PARAMS: Parameters for SelectorFunc in the form of
a kwargs dictionary
- TARGET_FEATURE: Name of the feature containing target data
- SelectorFunc: Selector function for univariate feature selection
example: SelectKBest, SelectPercentile from sklearn.feature_selection
"""
  # import the user-provided selector configuration from the module file
  module = importlib.import_module(module_file)
  selector_params, target_feature, selector_func = [
      getattr(module, name)
      for name in ("SELECTOR_PARAMS", "TARGET_FEATURE", "SelectorFunc")
  ]
  # the selector is fitted on the train split only
  train_uri = artifact_utils.get_split_uri([orig_examples], 'train')
  np_dataset = _get_data_from_tfrecords(train_uri)
  feature_keys, target_data, input_data = _data_preprocessing(
      np_dataset, target_feature)
  # fit the selector; the support mask, scores, and p-values are read off
  # the fitted selector below, so the transformed matrix is not needed
  selector = selector_func(**selector_params)
  selector.fit(input_data, target_data)
  # copy split names and span from the input Examples artifact to the output
  updated_data.split_names = orig_examples.split_names
  updated_data.span = orig_examples.span
  # map the selector's support mask back to feature names; compute the
  # selected indices once rather than on every iteration
  selected_indices = set(selector.get_support(indices=True))
  selected_features = [
      val for (idx, val) in enumerate(feature_keys) if idx in selected_indices
  ]
  # split_names is stored as a JSON-encoded list, e.g. '["train", "eval"]'
  split_arr = json.loads(orig_examples.split_names)
  # rewrite every split, keeping only the selected features in each Example
  for split in split_arr:
    split_uri = artifact_utils.get_split_uri([orig_examples], split)
    new_split_uri = artifact_utils.get_split_uri([updated_data], split)
    os.makedirs(new_split_uri, exist_ok=True)
    for file_name in _get_file_list(split_uri):
      split_dataset = tf.data.TFRecordDataset(os.path.join(
          split_uri, file_name), compression_type='GZIP')
      # re-serialize each record with only the selected features kept
      with tf.io.TFRecordWriter(path=os.path.join(new_split_uri, file_name),
                                options="GZIP") as writer:
        for split_record in split_dataset.as_numpy_iterator():
          example = tf.train.Example()
          example.ParseFromString(split_record)
          updated_example = _update_example(selected_features, example)
          writer.write(updated_example.SerializeToString())
  # merge the selector's per-feature scores and p-values with feature names
  # note: `pvalues_` is None for score functions that return only scores
  # (e.g. mutual_info_classif); such selectors would need a guard here
  selector_scores_dict = dict(zip(feature_keys, selector.scores_))
  selector_pvalues_dict = dict(zip(feature_keys, selector.pvalues_))
  # populate the output artifact with scores, p-values, and the names of
  # the selected features
  feature_selection.scores = selector_scores_dict
  feature_selection.p_values = selector_pvalues_dict
  feature_selection.selected_features = selected_features
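
# a sketch of wiring this component into a pipeline (assumes an upstream
# ExampleGen component named `example_gen` and a hypothetical module path
# "models.feature_selection_module"; with the decorator-based component API,
# outputs are keyed by the parameter names):
#
#   feature_selector = FeatureSelection(
#       module_file="models.feature_selection_module",
#       orig_examples=example_gen.outputs["examples"])
#   # feature_selector.outputs["feature_selection"] carries scores/p-values;
#   # feature_selector.outputs["updated_data"] is the filtered Examples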