mlebench/competitions/random-acts-of-pizza/prepare.py (57 lines of code) (raw):

import json
import shutil
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split


def prepare(raw: Path, public: Path, private: Path) -> None:
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    The original ``train.json`` is the only labelled data, so it is re-split into a
    new train set and a new test set. The original ``test.json`` is used only to
    (a) derive the test fraction and (b) determine which fields a test sample exposes.

    Args:
        raw: Directory containing the original ``train.json`` and ``test.json``.
        public: Directory that receives ``train.json``, ``test.json``, their zipped
            copies (``train.zip`` / ``test.zip``) and ``sampleSubmission.csv``.
        private: Directory that receives the ground-truth answers as ``test.csv``.

    Raises:
        AssertionError: If samples within a raw file do not share the same keys, or
            if any of the post-split sanity checks fail.
    """

    # Load data. JSON is UTF-8 by specification, so do not rely on the locale default.
    with open(raw / "train.json", encoding="utf-8") as f:
        old_train = json.load(f)
    with open(raw / "test.json", encoding="utf-8") as f:
        old_test = json.load(f)

    # Keep the same test proportion as the original competition split.
    test_ratio = len(old_test) / (len(old_train) + len(old_test))

    # Every sample within a file must expose exactly the same set of fields.
    all_fields = list(old_train[0])
    train_field_set = set(all_fields)
    assert all(
        set(sample) == train_field_set for sample in old_train
    ), "All samples in train.json must have the same fields"

    test_fields = list(old_test[0])
    test_field_set = set(test_fields)
    assert all(
        set(sample) == test_field_set for sample in old_test
    ), "All samples in test.json must have the same fields"

    # Create train, test from train split
    new_train, new_test = train_test_split(old_train, test_size=test_ratio, random_state=0)

    # Keep only the fields present in the original test set, in the original field
    # order, so that fields absent from the original test data are not exposed.
    new_test_without_labels = [
        {key: sample[key] for key in test_fields} for sample in new_test
    ]

    # Save the new train and test splits
    with open(public / "train.json", "w", encoding="utf-8") as f:
        json.dump(new_train, f, indent=4)
    with open(public / "test.json", "w", encoding="utf-8") as f:
        json.dump(new_test_without_labels, f, indent=4)

    # Also save zipped versions "train.json.zip" and "test.json.zip"
    # (root_dir=public, base_dir=<file name>: each archive holds just that one file.)
    shutil.make_archive(public / "train", "zip", public, "train.json")
    shutil.make_archive(public / "test", "zip", public, "test.json")

    # Create answers
    answers_rows = [
        {
            "request_id": sample["request_id"],
            "requester_received_pizza": int(sample["requester_received_pizza"]),
        }
        for sample in new_test
    ]
    answers = pd.DataFrame(answers_rows)
    answers.to_csv(private / "test.csv", index=False)

    # Create sample submission: same ids as the answers, with a constant 0 prediction.
    sample_submission = answers.copy()
    sample_submission["requester_received_pizza"] = 0
    sample_submission.to_csv(public / "sampleSubmission.csv", index=False)

    # Checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), f"Expected {len(old_train)} total samples in new_train ({len(new_train)}) and new_test ({len(new_test)})"
    assert len(new_test) == len(
        new_test_without_labels
    ), f"Expected new_test ({len(new_test)}) to have the same length as new_test_without_labels ({len(new_test_without_labels)})"
    assert len(answers) == len(
        new_test
    ), f"Expected answers ({len(answers)}) to have the same length as new_test ({len(new_test)})"
    assert len(sample_submission) == len(
        new_test
    ), f"Expected sample_submission ({len(sample_submission)}) to have the same length as new_test ({len(new_test)})"
    assert set(answers.columns) == {
        "request_id",
        "requester_received_pizza",
    }, "Answers must have 'request_id' and 'requester_received_pizza' columns"
    assert set(sample_submission.columns) == {
        "request_id",
        "requester_received_pizza",
    }, "Sample submission must have 'request_id' and 'requester_received_pizza' columns"