def _convert_comments_data_csv()

in fairness_indicators/tutorial_utils/util.py [0:0]


def _convert_comments_data_csv(input_filename, output_filename=None):
  """Convert the public civil comments data, for csv data.

  Reads the CSV at `input_filename`, drops rows whose TEXT_FEATURE value is
  empty or missing, reduces LABEL to 0 or 1 by comparing it against
  _THRESHOLD, and adds one column per key of IDENTITY_COLUMNS. Each of those
  columns holds, for every row, the list of that category's identity column
  names whose value in the row is >= _THRESHOLD.

  Args:
    input_filename: Path of the CSV file to read.
    output_filename: Path of the CSV file to write. If None, the data is
      written to 'output.csv' inside a newly created temporary directory.

  Returns:
    The path of the CSV file that was written.
  """
  if output_filename is None:
    # DataFrame.to_csv(None) returns the CSV text instead of writing a file,
    # so without a real path the converted data would be silently discarded.
    import os
    import tempfile
    output_filename = os.path.join(tempfile.mkdtemp(), 'output.csv')

  df = pd.read_csv(input_filename)

  # Filter out rows with empty or missing comment text values.
  text = df[TEXT_FEATURE]
  df = df[text.ne('') & text.notnull()]

  new_df = df[[TEXT_FEATURE]].copy()

  # Reduce the label to value 0 or 1.
  new_df[LABEL] = df[LABEL].ge(_THRESHOLD).astype(int)

  # Extract the list of all identity terms that meet the threshold. The
  # comparison is done once per category on the whole frame rather than row by
  # row; missing values compare as False, so they are never listed. Building
  # the lists from the boolean matrix also works when no rows are left.
  for identity_category, identity_list in IDENTITY_COLUMNS.items():
    identities = list(identity_list)
    meets_threshold = df[identities].ge(_THRESHOLD).to_numpy()
    groups = [
        [identity for identity, hit in zip(identities, row) if hit]
        for row in meets_threshold
    ]
    # Wrap in an object Series so each cell keeps its own list.
    new_df[identity_category] = pd.Series(
        groups, index=df.index, dtype=object)

  new_df.to_csv(
      output_filename,
      header=[TEXT_FEATURE, LABEL, *IDENTITY_COLUMNS.keys()],
      index=False)

  return output_filename