def convert_comments_data()

in fairness_indicators/tutorial_utils/util.py [0:0]


def convert_comments_data(input_filename, output_filename=None):
  """Convert the public civil comments data.

  In the original dataset
  https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data
  for each identity annotation column, the value comes from the percent of
  raters who thought the comment referenced the identity. When processing the
  raw data, the threshold 0.5 is chosen and the identity terms are grouped
  together by their categories. For example if one comment has {
  male: 0.3, female: 1.0, transgender: 0.0, heterosexual: 0.8,
  homosexual_gay_or_lesbian: 1.0 }. After the processing, the data will be {
  gender: [female], sexual_orientation: [heterosexual,
  homosexual_gay_or_lesbian] }.

  Args:
    input_filename: The path to the raw civil comments data, with extension
      'tfrecord' or 'csv'. The extension is matched exactly (case-sensitive).
    output_filename: The path to write the processed civil comments data. If
      omitted (or otherwise falsy), the data is written to a file named
      'output.<extension>' inside a newly created temporary directory.

  Returns:
    The file path to the converted dataset.

  Raises:
    ValueError: If the input_filename does not have a supported extension.
  """
  # splitext() keeps the leading dot on the suffix; drop it for comparison.
  extension = os.path.splitext(input_filename)[1][1:]

  # Validate before touching the filesystem so that a bad input does not leave
  # an orphaned temporary directory behind.
  if extension not in ('tfrecord', 'csv'):
    raise ValueError(
        'input_filename must have supported file extension csv or tfrecord, '
        'given: {}'.format(input_filename))

  if not output_filename:
    output_filename = os.path.join(tempfile.mkdtemp(), 'output.' + extension)

  if extension == 'tfrecord':
    return _convert_comments_data_tfrecord(input_filename, output_filename)
  return _convert_comments_data_csv(input_filename, output_filename)