scripts/metadata_to_text.py
import numpy as np
import pandas as pd


def speaker_level_relative_to_gender(dataset, text_bins, speaker_column_name, gender_column_name, column_name, output_column_name, batch_size=4, num_workers=1, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=None):
    '''
    Computes per-speaker mean values of `column_name`, then computes bin edges separately for each gender,
    and finally associates a text bin from `text_bins` to each row in `output_column_name`.
    This variant doesn't use `leading_split_for_bins`; bins are computed over all splits. Could probably be optimized.
    '''
    # Gather only the needed columns from every split of every dataset, then average per speaker.
    list_data = []
    for df in dataset:
        for split in df:
            panda_data = df[split].remove_columns([col for col in df[split].column_names if col not in {speaker_column_name, column_name, gender_column_name}]).to_pandas()
            list_data.append(panda_data)

    dataframe = pd.concat(list_data, ignore_index=True)
    dataframe = dataframe.groupby(speaker_column_name).agg({column_name: "mean", gender_column_name: "first"})
    if bin_edges is None:
        bin_edges = {}
        if save_dir is not None:
            save_dict = {}
            save_dict_after_filtering = {}
        for category in ["male", "female"]:
            values = dataframe[dataframe[gender_column_name] == category][column_name]
            values = np.array(values)
            if save_dir is not None:
                save_dict[category] = values
            if std_tolerance is not None:
                # filter out outliers beyond `std_tolerance` standard deviations from the mean
                values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]
                if save_dir is not None:
                    save_dict_after_filtering[category] = values
            # compute `len(text_bins)` equal-width bins separately for each gender
            bin_edges[category] = np.histogram(values, len(text_bins))[1]

        if save_dir is not None:
            visualize_bins_to_text(save_dict["male"], save_dict["female"], "Male distribution", "Female distribution", text_bins, save_dir, output_column_name)
            if std_tolerance is not None:
                visualize_bins_to_text(save_dict_after_filtering["male"], save_dict_after_filtering["female"], "Male distribution", "Female distribution", text_bins, save_dir, f"{output_column_name}_after_filtering")

        if only_save_plot:
            return dataset, bin_edges
    else:
        print(f"Already computed bin edges have been passed for {output_column_name}. Will use: {bin_edges}.")
    # map each speaker's mean value to a bin index using the gender-specific edges
    speaker_id_to_bins = dataframe.apply(lambda x: np.searchsorted(bin_edges[x[gender_column_name]], x[column_name]), axis=1).to_dict()

    def batch_association(batch):
        index_bins = [speaker_id_to_bins[speaker] for speaker in batch]
        # clamp with min(max(...)) when a value falls outside the main bins,
        # which happens when the value equals the min or max, or was filtered out of the bin computation
        batch_bins = [text_bins[min(max(i - 1, 0), len(text_bins) - 1)] for i in index_bins]
        return {
            output_column_name: batch_bins
        }

    dataset = [df.map(batch_association, batched=True, input_columns=[speaker_column_name], batch_size=batch_size, num_proc=num_workers) for df in dataset]
    return dataset, bin_edges
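
A minimal usage sketch, not part of the script: it builds a toy `DatasetDict` in memory and runs the function end to end. The column names (`speaker_id`, `gender`, `speaking_rate`), the bin labels, and the output column name are illustrative assumptions, not values taken from the repository.

# --- illustrative usage sketch: toy data, names below are assumptions ---
from datasets import Dataset, DatasetDict

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    toy = Dataset.from_dict({
        "speaker_id": [f"spk{i % 4}" for i in range(40)],
        "gender": ["male" if i % 4 < 2 else "female" for i in range(40)],
        "speaking_rate": rng.normal(loc=4.0, scale=1.0, size=40).tolist(),
    })
    dataset = [DatasetDict({"train": toy})]
    text_bins = ["very slowly", "slowly", "moderately", "fast", "very fast"]

    dataset, bin_edges = speaker_level_relative_to_gender(
        dataset,
        text_bins,
        speaker_column_name="speaker_id",
        gender_column_name="gender",
        column_name="speaking_rate",
        output_column_name="speaking_rate_text",
        std_tolerance=None,  # keep all values; no outlier filtering for this toy example
        save_dir=None,       # skip plotting, so `visualize_bins_to_text` is never called
    )
    # each split now carries a new "speaking_rate_text" column holding one of the text bins
    print(dataset[0]["train"]["speaking_rate_text"][:5])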