# mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py

import re
import shutil
from pathlib import Path

import pandas as pd
from tqdm import tqdm


def prepare(raw: Path, public: Path, private: Path) -> None:
    """
    Split the raw competition data into new public and private datasets.

    The original competition ships its train data in ``train2.zip`` with files
    named:

        Train: "YYYYMMDD_HHMMSS_{seconds}_TRAIN{idx}_{label:0,1}.aif"
        Test:  "YYYYMMDD_HHMMSS_{seconds}_Test{idx}.aif"

    The original train set spans 4 recording days (the original test spans 3).
    We repartition the original train set: the first 2 days become the new
    train split and the last 2 days become the new test split.

    Args:
        raw: Directory containing the raw competition download (``train2.zip``).
        public: Output directory for participant-visible data
            (``train2.zip``, ``test2.zip``, ``sampleSubmission.csv``).
        private: Output directory for the held-out answers (``test.csv``).
    """
    # Data is in train2.zip - we need to unzip it
    shutil.unpack_archive(raw / "train2.zip", raw)

    # Group the original train samples by recording date (the filename's
    # leading "YYYYMMDD" component), counting total files as we go.
    samples_by_date: dict[str, list[Path]] = {}
    n_train_old = 0
    for sample in (raw / "train2").iterdir():
        date = sample.name.split("_")[0]
        samples_by_date.setdefault(date, []).append(sample)
        n_train_old += 1

    assert len(samples_by_date) == 4, "Expected 4 days in Train_old"

    # First two (chronological) days -> new train; last two days -> new test.
    dates = sorted(samples_by_date)
    new_train = samples_by_date[dates[0]] + samples_by_date[dates[1]]
    new_test = samples_by_date[dates[2]] + samples_by_date[dates[3]]

    # Sort files - filenames start with timestamps so we want new idxs to be
    # time-ordered.
    new_train = sorted(new_train)
    new_test = sorted(new_test)

    # Copy train files to the public directory, re-indexing from 0.
    (public / "train2").mkdir(exist_ok=True, parents=True)
    for idx, sample in enumerate(tqdm(new_train)):
        # Replace index part of filename with new index
        new_sample_name = re.sub(r"TRAIN\d+", f"TRAIN{idx}", sample.name)
        shutil.copy(sample, public / "train2" / new_sample_name)

    answer_rows = []
    # While we're at it, collect answers for the new test set
    (public / "test2").mkdir(exist_ok=True, parents=True)
    for idx, sample in enumerate(tqdm(new_test)):
        # Replace everything after the TRAIN{idx} part of the filename
        # (replaces index as well as label part of filename) so the new test
        # files do not leak labels.
        new_sample_name = sample.name.split("TRAIN")[0] + f"Test{idx}.aif"
        shutil.copy(sample, public / "test2" / new_sample_name)
        answer_rows.append(
            {
                "clip": new_sample_name,
                # Label is encoded as a trailing "_1"/"_0" in the original name.
                "probability": 1 if sample.stem.endswith("_1") else 0,
            }
        )

    # Sanity-check the split: counts must match and no sample may be lost.
    # (Counts are computed once and reused in the messages; the original
    # recomputed the glob inside each message and left the parenthesis
    # unbalanced.)
    n_train_copied = len(list((public / "train2").glob("*.aif")))
    n_test_copied = len(list((public / "test2").glob("*.aif")))
    assert len(new_train) == n_train_copied, (
        f"Expected {len(new_train)} samples in new_train ({n_train_copied})"
    )
    assert len(new_test) == n_test_copied, (
        f"Expected {len(new_test)} samples in new_test ({n_test_copied})"
    )
    assert len(new_train) + len(new_test) == n_train_old, (
        f"Expected {n_train_old} total samples in new_train ({len(new_train)}) "
        f"and new_test ({len(new_test)})"
    )

    # Make zipped versions
    shutil.make_archive(public / "train2", "zip", public, "train2")
    shutil.make_archive(public / "test2", "zip", public, "test2")

    # Remove unzipped directories (original comp doesn't have these)
    shutil.rmtree(public / "train2")
    shutil.rmtree(public / "test2")
    # we also don't need the raw dirs anymore
    shutil.rmtree(raw / "train2")

    # Create answers
    answers_df = pd.DataFrame(answer_rows)
    answers_df.to_csv(private / "test.csv", index=False)

    # Create sample submission: same clips, all-zero probabilities.
    sample_submission = answers_df.copy()
    sample_submission["probability"] = 0
    sample_submission.to_csv(public / "sampleSubmission.csv", index=False)

    assert set(answers_df.columns) == {
        "clip",
        "probability",
    }, "Answers must have 'clip' and 'probability' columns"
    assert set(sample_submission.columns) == {
        "clip",
        "probability",
    }, "Sample submission must have 'clip' and 'probability' columns"