scripts/metadata_to_text.py
import numpy as np
import pandas as pd


def speaker_level_relative_to_gender(dataset, text_bins, speaker_column_name, gender_column_name, column_name, output_column_name, batch_size=4, num_workers=1, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=None):
    '''
    Computes per-speaker mean values of `column_name`, then computes bin edges separately for each gender,
    and finally associates a text bin from `text_bins` to each row in `output_column_name`.
    This variant doesn't use `leading_split_for_bins`; bins are computed over all splits. Could probably be optimized.
    '''
    # Gather only the needed columns from every split of every dataset, then average per speaker.
    list_data = []
    for df in dataset:
        for split in df:
            panda_data = df[split].remove_columns([col for col in df[split].column_names if col not in {speaker_column_name, column_name, gender_column_name}]).to_pandas()
            list_data.append(panda_data)

    dataframe = pd.concat(list_data, ignore_index=True)
    dataframe = dataframe.groupby(speaker_column_name).agg({column_name: "mean", gender_column_name: "first"})
    if bin_edges is None:
        bin_edges = {}
        if save_dir is not None:
            save_dict = {}
            save_dict_after_filtering = {}
        for category in ["male", "female"]:
            values = dataframe[dataframe[gender_column_name] == category][column_name]
            values = np.array(values)
            if save_dir is not None:
                save_dict[category] = values
            if std_tolerance is not None:
                # filter out outliers beyond `std_tolerance` standard deviations from the mean
                values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]
                if save_dir is not None:
                    save_dict_after_filtering[category] = values
            # compute `len(text_bins)` equal-width bins separately for each gender
            bin_edges[category] = np.histogram(values, len(text_bins))[1]

        if save_dir is not None:
            visualize_bins_to_text(save_dict["male"], save_dict["female"], "Male distribution", "Female distribution", text_bins, save_dir, output_column_name)
            if std_tolerance is not None:
                visualize_bins_to_text(save_dict_after_filtering["male"], save_dict_after_filtering["female"], "Male distribution", "Female distribution", text_bins, save_dir, f"{output_column_name}_after_filtering")

        if only_save_plot:
            return dataset, bin_edges
    else:
        print(f"Already computed bin edges have been passed for {output_column_name}. Will use: {bin_edges}.")
    # map each speaker's mean value to a bin index using the gender-specific edges
    speaker_id_to_bins = dataframe.apply(lambda x: np.searchsorted(bin_edges[x[gender_column_name]], x[column_name]), axis=1).to_dict()

    def batch_association(batch):
        index_bins = [speaker_id_to_bins[speaker] for speaker in batch]
        # clamp with min(max(...)) when a value falls outside the main bins,
        # which happens when the value equals the min or max, or was filtered out of the bin computation
        batch_bins = [text_bins[min(max(i - 1, 0), len(text_bins) - 1)] for i in index_bins]
        return {
            output_column_name: batch_bins
        }

    dataset = [df.map(batch_association, batched=True, input_columns=[speaker_column_name], batch_size=batch_size, num_proc=num_workers) for df in dataset]
    return dataset, bin_edges
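
A minimal usage sketch, not part of the script: it builds a toy `DatasetDict` in memory and runs the function end to end. The column names (`speaker_id`, `gender`, `speaking_rate`), the bin labels, and the output column name are illustrative assumptions, not values taken from the repository.

# --- illustrative usage sketch: toy data, names below are assumptions ---
from datasets import Dataset, DatasetDict

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    toy = Dataset.from_dict({
        "speaker_id": [f"spk{i % 4}" for i in range(40)],
        "gender": ["male" if i % 4 < 2 else "female" for i in range(40)],
        "speaking_rate": rng.normal(loc=4.0, scale=1.0, size=40).tolist(),
    })
    dataset = [DatasetDict({"train": toy})]
    text_bins = ["very slowly", "slowly", "moderately", "fast", "very fast"]

    dataset, bin_edges = speaker_level_relative_to_gender(
        dataset,
        text_bins,
        speaker_column_name="speaker_id",
        gender_column_name="gender",
        column_name="speaking_rate",
        output_column_name="speaking_rate_text",
        std_tolerance=None,  # keep all values; no outlier filtering for this toy example
        save_dir=None,       # skip plotting, so `visualize_bins_to_text` is never called
    )
    # each split now carries a new "speaking_rate_text" column holding one of the text bins
    print(dataset[0]["train"]["speaking_rate_text"][:5])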