in mlebench/competitions/cassava-leaf-disease-classification/prepare.py [0:0]
def prepare(raw: Path, public: Path, private: Path):
# need to split based on the TFRecord files, since not mentioned in the CSVs
tfrecord_files = [
path
for path in sorted((raw / "train_tfrecords").iterdir())
if path.is_file() and path.suffix == ".tfrec"
]
# In the original there are 21397 train samples and they say test has ~15000 test samples, which is ~ 0.4/0.6 test/train split
# We use 0.1 ratio to avoid removing too many samples from train
new_train_tfrecords, new_test_tfrecords = train_test_split(
tfrecord_files, test_size=0.1, random_state=0
)
# parse the IDs from the test tf records
test_ids = []
for path in new_test_tfrecords:
test_ids.extend(get_ids_from_tf_records(path))
old_train = read_csv(raw / "train.csv")
old_train["split"] = "train"
old_train.loc[old_train["image_id"].isin(test_ids), "split"] = "test"
new_train = old_train[old_train["split"] == "train"].drop(columns=["split"])
new_test = old_train[old_train["split"] == "test"].drop(columns=["split"])
sample_submission = new_test.copy()
sample_submission["label"] = 4
new_train.to_csv(public / "train.csv", index=False)
new_test.to_csv(private / "test.csv", index=False)
sample_submission.to_csv(public / "sample_submission.csv", index=False)
(public / "train_tfrecords").mkdir(parents=True, exist_ok=True)
for i, path in tqdm(
enumerate(new_train_tfrecords),
desc="Copying Train TFRecords",
total=len(new_train_tfrecords),
):
length = path.stem.split("-")[1]
new_name = f"ld_train{i:02d}-{length}.tfrec"
shutil.copy(path, public / "train_tfrecords" / new_name)
(public / "test_tfrecords").mkdir(parents=True, exist_ok=True)
for i, path in tqdm(
enumerate(new_test_tfrecords), desc="Copying Test TFRecords", total=len(new_test_tfrecords)
):
length = path.stem.split("-")[1]
new_name = f"ld_test{i:02d}-{length}.tfrec"
shutil.copy(path, public / "test_tfrecords" / new_name)
(public / "train_images").mkdir(parents=True, exist_ok=True)
for image_id in tqdm(new_train["image_id"], desc="Copying Train Images", total=len(new_train)):
shutil.copy(raw / "train_images" / image_id, public / "train_images")
(public / "test_images").mkdir(parents=True, exist_ok=True)
for image_id in tqdm(new_test["image_id"], desc="Copying Test Images", total=len(new_test)):
shutil.copy(raw / "train_images" / image_id, public / "test_images")
shutil.copy(raw / "label_num_to_disease_map.json", public / "label_num_to_disease_map.json")
# checks
assert len(new_train) + len(new_test) == len(
old_train
), "Expected new train and new test lengths to sum to old train length"
assert len(sample_submission) == len(
new_test
), "Expected sample submission length to be equal to new test length"
assert len(new_train) == sum(
1 for _ in (public / "train_images").iterdir()
), "Mismatch in number of expected train images copied"
assert len(new_test) == sum(
1 for _ in (public / "test_images").iterdir()
), "Mismatch in number of expected test images copied"
assert len(new_train_tfrecords) == sum(
1 for _ in (public / "train_tfrecords").iterdir()
), "Mismatch in number of expected train TFRecords copied"
assert len(new_test_tfrecords) == sum(
1 for _ in (public / "test_tfrecords").iterdir()
), "Mismatch in number of expected test TFRecords copied"
assert new_train.columns.tolist() == [
"image_id",
"label",
], "Expected new train columns to be ['image_id', 'label']"
assert new_test.columns.tolist() == [
"image_id",
"label",
], "Expected new test columns to be ['image_id', 'label']"
assert sample_submission.columns.tolist() == [
"image_id",
"label",
], "Expected sample submission columns to be ['image_id', 'label']"
assert set(new_train["image_id"]).isdisjoint(
new_test["image_id"]
), "Expected train and test image IDs to be disjoint"