mlebench/competitions/champs-scalar-coupling/prepare.py (84 lines of code) (raw):
import shutil
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from mlebench.utils import read_csv
def prepare(raw: Path, public: Path, private: Path):
# Create train, test from train split
old_train = read_csv(raw / "train.csv")
grouped_by_molecule = list(old_train.groupby("molecule_name"))
train_groups, test_groups = train_test_split(grouped_by_molecule, test_size=0.1, random_state=0)
new_train = pd.concat([group for _, group in train_groups])
answers = pd.concat([group for _, group in test_groups])
new_test = answers.drop(columns=["scalar_coupling_constant"])
# Create sample submission
sample_submission = new_test[["id"]].copy()
sample_submission["scalar_coupling_constant"] = 0
# Molecule structure data in CSV format
structures = read_csv(raw / "structures.csv")
structures = structures[structures["molecule_name"].isin(new_train["molecule_name"])]
# Additional data CSVs
dipole_moments = read_csv(raw / "dipole_moments.csv")
dipole_moments = dipole_moments[
dipole_moments["molecule_name"].isin(new_train["molecule_name"])
]
magnetic_shielding_tensors = read_csv(raw / "magnetic_shielding_tensors.csv")
magnetic_shielding_tensors = magnetic_shielding_tensors[
magnetic_shielding_tensors["molecule_name"].isin(new_train["molecule_name"])
]
mulliken_charges = read_csv(raw / "mulliken_charges.csv")
mulliken_charges = mulliken_charges[
mulliken_charges["molecule_name"].isin(new_train["molecule_name"])
]
potential_energy = read_csv(raw / "potential_energy.csv")
potential_energy = potential_energy[
potential_energy["molecule_name"].isin(new_train["molecule_name"])
]
scalar_coupling_contributions = read_csv(raw / "scalar_coupling_contributions.csv")
scalar_coupling_contributions = scalar_coupling_contributions[
scalar_coupling_contributions["molecule_name"].isin(new_train["molecule_name"])
]
# Checks before writing
data_csvs = {
"structures": structures,
"dipole_moments": dipole_moments,
"magnetic_shielding_tensors": magnetic_shielding_tensors,
"mulliken_charges": mulliken_charges,
"potential_energy": potential_energy,
"scalar_coupling_contributions": scalar_coupling_contributions,
}
for name, dataset in data_csvs.items():
assert set(dataset["molecule_name"]) == set(
new_train["molecule_name"]
), f"Filtered {name} should exactly match the molecule names present in the new_train set."
assert set(new_train["molecule_name"]).isdisjoint(
set(new_test["molecule_name"])
), "Train and test sets should not share any samples with the same molecule name."
assert set(new_train["id"]).isdisjoint(
set(new_test["id"])
), "Train and test sets should not share any samples with the same id."
assert len(sample_submission) == len(
new_test
), "Sample submission length does not match test length."
assert (
sample_submission.shape[1] == 2
), f"Sample submission should have 2 columns, but has {sample_submission.shape[1]}"
assert new_test.shape[1] == 5, f"new_test should have 5 columns, but has {new_test.shape[1]}"
assert answers.shape[1] == 6, f"answers should have 6 columns, but has {answers.shape[1]}"
assert new_train.shape[1] == 6, f"new_train should have 6 columns, but has {new_train.shape[1]}"
# Copy over molecule structure data individual files
for molecule_name in tqdm(
new_train["molecule_name"].unique(), desc="Copying molecule structure files"
):
src_file = raw / "structures" / f"{molecule_name}.xyz"
dst_file = public / "structures" / f"{molecule_name}.xyz"
dst_file.parent.mkdir(parents=True, exist_ok=True)
shutil.copyfile(src=src_file, dst=dst_file)
# Write CSVs
answers.to_csv(private / "answers.csv", index=False)
new_train.to_csv(public / "train.csv", index=False)
new_test.to_csv(public / "test.csv", index=False)
sample_submission.to_csv(public / "sample_submission.csv", index=False)
structures.to_csv(public / "structures.csv", index=False)
dipole_moments.to_csv(public / "dipole_moments.csv", index=False)
magnetic_shielding_tensors.to_csv(public / "magnetic_shielding_tensors.csv", index=False)
mulliken_charges.to_csv(public / "mulliken_charges.csv", index=False)
potential_energy.to_csv(public / "potential_energy.csv", index=False)
scalar_coupling_contributions.to_csv(public / "scalar_coupling_contributions.csv", index=False)
# Checks after writing
assert len(list((public / "structures").glob("*.xyz"))) == len(
new_train["molecule_name"].unique()
), "The number of files in public/structures should match the number of unique molecule names in the train set."