data_extraction_transformation/scripts/aggregate.py (36 lines of code) (raw):

import os import pandas as pd import shutil import argparse def parse_args(): parser = argparse.ArgumentParser(description="Run KCPA algorithm on a time series dataset.") parser.add_argument('-i', '--input-folder', help="Path to the input CSV timeseries folder") parser.add_argument('-o', '--output-folder', help="Path to the output CSV timeseries folder") parser.add_argument('-a', '--aggregation-method', help="Aggregation method (mean, min, max)", choices=['mean', 'min', 'max'], default='mean') return parser.parse_args() def select_favorable_status(status_series): # Return the status with the highest priority (lowest number in `status_priority`) return sorted(status_series, key=lambda x: status_priority.get(x, float('inf')))[0] def process_folder(input_folder, output_folder, folder, aggregation_method): for signature_file in os.listdir(input_folder + '/' + folder): print(folder + '/' + signature_file) df = pd.read_csv(input_folder + '/' + folder + '/' + signature_file, index_col=False) df_grouped = df.groupby('revision').agg({ 'value': aggregation_method, **{col: 'first' for col in df.columns if col not in ['revision', 'value']} }).reset_index() # Sort by push_timestamp if it exists, otherwise omit this step if 'push_timestamp' in df_grouped.columns: df_grouped = df_grouped.sort_values(by='push_timestamp') # Save the processed DataFrame output_path = f'{output_folder}/{folder}/{signature_file}' df_grouped.to_csv(output_path, index=False) def main(): args = parse_args() input_folder = args.input_folder output_folder = args.output_folder aggregation_method = args.aggregation_method # The following usage projects_folders_mapping in case the names of the subfolders does not reflect the names of the projects. The code is designed to handle this change. #projects_folders_mapping = {"autoland": ["autoland1", "autoland2", "autoland3", "autoland4"], "firefox-android": ["firefox-android"], "mozilla-beta": ["mozilla-beta"], "mozilla-release": ["mozilla-release"], "mozilla-central": ["mozilla-central"]} projects_folders_mapping = {name: [name] for name in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, name))} # Process each project and folder for project in projects_folders_mapping: for folder in projects_folders_mapping[project]: os.makedirs(f'{output_folder}/{folder}', exist_ok=True) process_folder(input_folder, output_folder, folder, aggregation_method) if __name__ == "__main__": main()