def prepare()

in src/autotrain/preprocessor/vision.py

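prepare() stages the training data (and, if given, the validation data) in a unique directory under the Hugging Face cache, writes a metadata.jsonl per split, loads the result with the datasets imagefolder builder, renames the columns to autotrain_image/autotrain_objects, and then either saves the dataset to disk (local) or pushes it to the Hub as a private dataset. The body assumes uuid, os, shutil, pandas (as pd), and the datasets utilities (Features, Image, Sequence, Value, ClassLabel, load_dataset) are imported at module level.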

    def prepare(self):
        random_uuid = uuid.uuid4()
        cache_dir = os.environ.get("HF_HOME")
        if not cache_dir:
            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
        data_dir = os.path.join(cache_dir, "autotrain", str(random_uuid))

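        # Either a separate validation folder was supplied (copy both splits as-is),
        # or the training folder is split into train/validation in the else branch.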
        if self.valid_data:
            shutil.copytree(self.train_data, os.path.join(data_dir, "train"))
            shutil.copytree(self.valid_data, os.path.join(data_dir, "validation"))

            train_metadata, train_categories = self._process_metadata(os.path.join(data_dir, "train"))
            valid_metadata, valid_categories = self._process_metadata(os.path.join(data_dir, "validation"))

            train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
            valid_metadata.to_json(
                os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True
            )

            all_categories = train_categories.union(valid_categories)

            features = Features(
                {
                    "image": Image(),
                    "objects": Sequence(
                        {
                            "bbox": Sequence(Value("float32"), length=4),
                            "category": ClassLabel(names=list(all_categories)),
                        }
                    ),
                }
            )
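            # The explicit Features make "category" a ClassLabel over the union of
            # labels from both splits, giving a consistent integer encoding; the
            # imagefolder builder below picks up each split's metadata.jsonl.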

            dataset = load_dataset("imagefolder", data_dir=data_dir, features=features)
            dataset = dataset.rename_columns(
                {
                    "image": "autotrain_image",
                    "objects": "autotrain_objects",
                }
            )

            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )
        else:
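            # No validation folder: split the train metadata into train and
            # validation sets and lay the two folders out on disk before loading.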
            metadata = pd.read_json(os.path.join(self.train_data, "metadata.jsonl"), lines=True)
            train_df, valid_df = self.split(metadata)

            # create train and validation folders
            os.makedirs(os.path.join(data_dir, "train"), exist_ok=True)
            os.makedirs(os.path.join(data_dir, "validation"), exist_ok=True)

            # copy images into the train and validation folders
            for _, row in train_df.iterrows():
                shutil.copy(
                    os.path.join(self.train_data, row["file_name"]),
                    os.path.join(data_dir, "train", row["file_name"]),
                )

            for _, row in valid_df.iterrows():
                shutil.copy(
                    os.path.join(self.train_data, row["file_name"]),
                    os.path.join(data_dir, "validation", row["file_name"]),
                )

            # save metadata.jsonl file to train and validation folders
            train_df.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
            valid_df.to_json(os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True)

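            # _process_metadata returns the per-split metadata plus the set of
            # category names for each split; the returned metadata is written back
            # over the jsonl files created above.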
            train_metadata, train_categories = self._process_metadata(os.path.join(data_dir, "train"))
            valid_metadata, valid_categories = self._process_metadata(os.path.join(data_dir, "validation"))

            train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
            valid_metadata.to_json(
                os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True
            )

            all_categories = train_categories.union(valid_categories)

            features = Features(
                {
                    "image": Image(),
                    "objects": Sequence(
                        {
                            "bbox": Sequence(Value("float32"), length=4),
                            "category": ClassLabel(names=list(all_categories)),
                        }
                    ),
                }
            )

            dataset = load_dataset("imagefolder", data_dir=data_dir, features=features)
            dataset = dataset.rename_columns(
                {
                    "image": "autotrain_image",
                    "objects": "autotrain_objects",
                }
            )

            if self.local:
                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
            else:
                dataset.push_to_hub(
                    f"{self.username}/autotrain-data-{self.project_name}",
                    private=True,
                    token=self.token,
                )

        if self.local:
            return f"{self.project_name}/autotrain-data"
        return f"{self.username}/autotrain-data-{self.project_name}"