mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py
import glob
import shutil
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# `extract` and `read_csv` are assumed to be the shared competition-prep helpers
# (e.g. mlebench.utils); adjust this import if they live elsewhere in the project.
from mlebench.utils import extract, read_csv


def prepare(raw: Path, public: Path, private: Path):
"""
Splits the data in raw into public and private datasets with appropriate test/train splits.
"""
# Extract only what we need
extract(raw / "train.zip", raw / "train")
extract(raw / "train.csv.zip", raw / "train.csv")
extract(raw / "test.zip", raw / "test")
extract(raw / "test.csv.zip", raw / "test.csv")
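    # train.zip unpacks to raw/train/train/<id>/geometry.xyz and train.csv.zip to
    # raw/train.csv/train.csv; the test archives are extracted alongside them.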
    # Create the new train/test sets by splitting the original train set
old_train = read_csv(raw / "train.csv/train.csv")
new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
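    # The fixed random_state keeps the 90/10 public/private split reproducible.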
    # Renumber ids as 1, 2, ... in both splits. Keep the old -> new mapping so the
    # per-id geometry directories can be relabelled to match.
old_train_id_to_new = {
old_id: new_id for new_id, old_id in enumerate(new_train["id"], start=1)
} # id starts from 1
new_train["id"] = new_train["id"].map(old_train_id_to_new)
old_test_id_to_new = {
old_id: new_id for new_id, old_id in enumerate(new_test["id"], start=1)
} # id starts from 1
new_test["id"] = new_test["id"].map(old_test_id_to_new)
new_test_without_labels = new_test.drop(
columns=["formation_energy_ev_natom", "bandgap_energy_ev"]
)
    # Write out the new CSVs: the labelled test split goes to private,
    # the label-free test copy and the train split go to public
new_train.to_csv(public / "train.csv", index=False)
new_test.to_csv(private / "test.csv", index=False)
new_test_without_labels.to_csv(public / "test.csv", index=False)
    # Each structure file sits at raw/train/train/<id>/geometry.xyz; recursive=True
    # makes the ** in the pattern actually recurse.
    train_paths = sorted(glob.glob(str(raw / "train/train/**/*.xyz"), recursive=True))
    for src in train_paths:
        old_id = int(Path(src).parts[-2])  # the parent directory name is the original sample id
        if old_id not in old_train_id_to_new:  # keep only ids assigned to the new train split
            continue
        new_id = old_train_id_to_new[old_id]
        (public / "train" / str(new_id)).mkdir(parents=True, exist_ok=True)
        shutil.copy(src=src, dst=public / "train" / str(new_id) / "geometry.xyz")
assert len(list(public.glob("train/**/*.xyz"))) == len(
new_train
), f"Expected {len(new_train)} train geometry files, found {len(list(public.glob('train/**/*.xyz')))}"
    # The new test rows were carved out of the original train set, so their geometry
    # files are also found under raw/train.
    for src in train_paths:
        old_id = int(Path(src).parts[-2])
        if old_id not in old_test_id_to_new:  # keep only ids assigned to the new test split
            continue
        new_id = old_test_id_to_new[old_id]
        (public / "test" / str(new_id)).mkdir(parents=True, exist_ok=True)
        shutil.copy(src=src, dst=public / "test" / str(new_id) / "geometry.xyz")
assert len(list(public.glob("test/**/*.xyz"))) == len(
new_test
), f"Expected {len(new_test)} test geometry files, found {len(list(public.glob('test/**/*.xyz')))}"
# Create mock submission
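    # Constant placeholder predictions; presumably these mirror Kaggle's original sample_submission.csv.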
sample_submission = pd.DataFrame(
{"id": new_test["id"], "formation_energy_ev_natom": 0.1779, "bandgap_energy_ev": 1.8892}
)
sample_submission.to_csv(public / "sample_submission.csv", index=False)
assert len(sample_submission) == len(
new_test
), "Sample submission should have the same number of rows as the test set"