def find_category_counts(parent_folder, column_name)

in data_extraction_transformation/scripts/one_time_use_scripts/get_distinct_cat_2.py [0:0]


import os
import re
from collections import defaultdict

import pandas as pd


def find_category_counts(parent_folder, column_name):
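    """Count, for each category, how many CSV files contain it.

    Recursively scans `parent_folder` for files named `<ID>_timeseries_data.csv`,
    reads each one, and increments a counter for every distinct value found in
    `column_name`. Returns a mapping of category -> number of files in which
    that category appears at least once.
    """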
    # Dictionary to store the count of files for each category
    category_counts = defaultdict(int)
    
    # Regular expression to match files with the pattern "<ID>_timeseries_data.csv"
    file_pattern = re.compile(r'.+_timeseries_data\.csv$')
    
    # Walk through all directories and files within the parent folder
    for root, dirs, files in os.walk(parent_folder):
        for filename in files:
            # Only process files matching the pattern
            if file_pattern.match(filename):
                file_path = os.path.join(root, filename)
                
                # Read the CSV file
                try:
                    df = pd.read_csv(file_path)
                    
                    # Check if the column exists in the file
                    if column_name in df.columns:
                        # Find unique categories in the column for this file
                        unique_categories_in_file = df[column_name].dropna().unique()
                        
                        # Update the count for each category
                        for category in unique_categories_in_file:
                            category_counts[category] += 1
                    else:
                        print(f"Column '{column_name}' not found in {filename}")
                
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
    
    return category_counts
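
A minimal usage sketch, assuming a hypothetical data folder and column name (neither value appears in the original script); substitute the real parent directory and column before running:

if __name__ == "__main__":
    # Hypothetical example inputs -- replace with the actual data folder and column name.
    counts = find_category_counts("data/timeseries_exports", "category")

    # Report each category and the number of files it appeared in,
    # from most common to least common.
    for category, n_files in sorted(counts.items(), key=lambda item: item[1], reverse=True):
        print(f"{category}: {n_files} file(s)")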