in fairness_indicators/tutorial_utils/util.py [0:0]
def _convert_comments_data_csv(input_filename, output_filename=None):
  """Convert the public civil comments data, for csv data.

  Reads `input_filename` as CSV, drops rows whose `TEXT_FEATURE` value is
  missing or empty, and writes a new CSV containing:
    * `TEXT_FEATURE`: the comment text, unchanged.
    * `LABEL`: 1 if the original label is >= `_THRESHOLD`, otherwise 0.
    * one column per key of `IDENTITY_COLUMNS`: the list of that category's
      identity column names whose value in the row is >= `_THRESHOLD`.

  Args:
    input_filename: Path of the CSV file to read.
    output_filename: Path of the CSV file to write. It is passed straight to
      `DataFrame.to_csv`; with the default of None pandas renders the CSV to
      a string instead of writing a file, and that string is discarded here.

  Returns:
    `output_filename`, exactly as it was passed in.
  """
  df = pd.read_csv(input_filename)

  # Filter out rows with empty or missing comment text values.
  text = df[TEXT_FEATURE]
  df = df[text.notnull() & text.ne('')]

  new_df = pd.DataFrame()
  new_df[TEXT_FEATURE] = df[TEXT_FEATURE]
  # Reduce the label to value 0 or 1.
  new_df[LABEL] = df[LABEL].ge(_THRESHOLD).astype(int)

  # For every identity category, collect the identity terms whose score
  # reaches the threshold. The comparison is done once per category on the
  # relevant columns only (NaN compares as False), and the per-row lists are
  # built explicitly so this also works when no rows survive the filtering.
  for identity_category, identity_list in IDENTITY_COLUMNS.items():
    identity_names = list(identity_list)
    exceeds = df[identity_names].ge(_THRESHOLD).to_numpy()
    groups = [
        [name for name, hit in zip(identity_names, row_flags) if hit]
        for row_flags in exceeds
    ]
    # dtype=object keeps each list as a single cell; reusing df.index keeps
    # the rows aligned with the (filtered, non-reset) index of new_df.
    new_df[identity_category] = pd.Series(
        groups, index=df.index, dtype=object)

  new_df.to_csv(
      output_filename,
      header=[TEXT_FEATURE, LABEL, *IDENTITY_COLUMNS.keys()],
      index=False)
  return output_filename