def prepare()

in src/autotrain/preprocessor/vision.py [0:0]


    def prepare(self):
        random_uuid = uuid.uuid4()
        cache_dir = os.environ.get("HF_HOME")
        if not cache_dir:
            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
        data_dir = os.path.join(cache_dir, "autotrain", str(random_uuid))

        if self.valid_data:
            shutil.copytree(self.train_data, os.path.join(data_dir, "train"))
            shutil.copytree(self.valid_data, os.path.join(data_dir, "validation"))

            dataset = load_dataset("imagefolder", data_dir=data_dir)
            dataset = dataset.rename_columns({"image": "autotrain_image", "label": "autotrain_label"})
            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )

        else:
            subfolders = [f.path for f in os.scandir(self.train_data) if f.is_dir()]

            image_filenames = []
            subfolder_names = []

            for subfolder in subfolders:
                for filename in os.listdir(subfolder):
                    if filename.endswith(("jpeg", "png", "jpg")):
                        image_filenames.append(filename)
                        subfolder_names.append(os.path.basename(subfolder))

            df = pd.DataFrame({"image_filename": image_filenames, "subfolder": subfolder_names})
            train_df, valid_df = self.split(df)

            for row in train_df.itertuples():
                os.makedirs(os.path.join(data_dir, "train", row.subfolder), exist_ok=True)
                shutil.copy(
                    os.path.join(self.train_data, row.subfolder, row.image_filename),
                    os.path.join(data_dir, "train", row.subfolder, row.image_filename),
                )

            for row in valid_df.itertuples():
                os.makedirs(os.path.join(data_dir, "validation", row.subfolder), exist_ok=True)
                shutil.copy(
                    os.path.join(self.train_data, row.subfolder, row.image_filename),
                    os.path.join(data_dir, "validation", row.subfolder, row.image_filename),
                )

            dataset = load_dataset("imagefolder", data_dir=data_dir)
            dataset = dataset.rename_columns({"image": "autotrain_image", "label": "autotrain_label"})
            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )

        if self.local:
            return f"{self.project_name}/autotrain-data"
        return f"{self.username}/autotrain-data-{self.project_name}"