# mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py
import json
import shutil
from pathlib import Path
import pandas as pd
import py7zr
from sklearn.model_selection import train_test_split
from mlebench.utils import extract
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Produces:
      public/train.json.7z            -- new train split (labels included)
      public/test.json.7z             -- new test split (labels stripped)
      public/sample_submission.csv.7z -- compressed sample submission
      private/test.csv                -- answer key for the new test split
      private/sample_submission.csv   -- placeholder predictions (0.5)

    Raises AssertionError if the raw data has inconsistent fields or the
    derived splits fail the sanity checks. All checks run BEFORE any output
    is written, so a failure does not leave partial artifacts on disk.
    """
    extract(raw / "train.json.7z", raw)
    extract(raw / "test.json.7z", raw)

    # Use context managers so the file handles are closed promptly
    # (the original .open() calls leaked them until GC).
    with (raw / "data/processed/train.json").open() as f:
        old_train = json.load(f)
    with (raw / "data/processed/test.json").open() as f:
        old_test = json.load(f)

    # Every sample must expose the same set of fields as the first one.
    all_fields = list(old_train[0].keys())
    assert all(
        set(all_fields) == set(sample.keys()) for sample in old_train
    ), "Inconsistent fields in train set"
    test_fields = list(old_test[0].keys())
    assert all(
        set(test_fields) == set(sample.keys()) for sample in old_test
    ), "Inconsistent fields in test set"

    # Old ratio is Train set - 1,604 samples; Test set - 8,424 samples (~84% ratio)
    # We do a 20% ratio to avoid removing too many samples from train
    new_train, new_test = train_test_split(old_train, test_size=0.2, random_state=0)

    # Keep only the fields present in the original test set (drops the label).
    new_test_without_labels = [
        {key: sample[key] for key in test_fields} for sample in new_test
    ]

    # Make answers as csv from json
    answers = pd.DataFrame(
        [{"id": sample["id"], "is_iceberg": int(sample["is_iceberg"])} for sample in new_test]
    )

    # Make sample submission: same ids, constant 0.5 placeholder prediction.
    sample_submission = answers.copy()
    sample_submission["is_iceberg"] = 0.5

    # Checks -- run before writing anything so a failed split leaves no output.
    assert len(new_train) + len(new_test) == len(
        old_train
    ), f"Expected {len(old_train)} total samples in new_train ({len(new_train)}) and new_test ({len(new_test)})"
    assert len(new_test) == len(
        new_test_without_labels
    ), f"Expected new_test ({len(new_test)}) to have the same length as new_test_without_labels ({len(new_test_without_labels)})"
    assert len(answers) == len(
        new_test
    ), f"Expected answers ({len(answers)}) to have the same length as new_test ({len(new_test)})"
    assert len(sample_submission) == len(
        new_test
    ), f"Expected sample_submission ({len(sample_submission)}) to have the same length as new_test ({len(new_test)})"
    assert set(answers.columns) == set(
        ["id", "is_iceberg"]
    ), "Answers must have 'id' and 'is_iceberg' columns"
    assert set(sample_submission.columns) == set(
        ["id", "is_iceberg"]
    ), "Sample submission must have 'id' and 'is_iceberg' columns"
    new_train_ids = {sample["id"] for sample in new_train}
    new_test_ids = {sample["id"] for sample in new_test}
    assert new_train_ids.isdisjoint(new_test_ids), "Train and test ids should not overlap"

    # Write new train and test splits, compress, then remove the uncompressed files
    tmp_dir = private / "tmp_data"
    tmp_dir.mkdir(parents=True, exist_ok=True)
    with open(tmp_dir / "train.json", "w") as f:
        json.dump(new_train, f)
    with open(tmp_dir / "test.json", "w") as f:
        json.dump(new_test_without_labels, f)
    # arcname is just the bare filename so the archive extracts flat.
    with py7zr.SevenZipFile(public / "train.json.7z", "w") as archive:
        archive.write(tmp_dir / "train.json", arcname="train.json")
    with py7zr.SevenZipFile(public / "test.json.7z", "w") as archive:
        archive.write(tmp_dir / "test.json", arcname="test.json")

    answers.to_csv(private / "test.csv", index=False)
    sample_submission.to_csv(private / "sample_submission.csv", index=False)
    with py7zr.SevenZipFile(public / "sample_submission.csv.7z", "w") as archive:
        archive.write(private / "sample_submission.csv", arcname="sample_submission.csv")

    # Remove uncompressed files
    shutil.rmtree(tmp_dir)