in src/autotrain/preprocessor/vision.py [0:0]
def prepare(self):
    """Stage the image folders in a fresh cache directory and publish the dataset.

    A new directory ``<cache>/autotrain/<uuid4>`` is created, where ``<cache>``
    is ``$HF_HOME`` when set and ``~/.cache/huggingface`` otherwise. It is
    populated with ``train`` and ``validation`` sub-trees:

    * if ``self.valid_data`` is truthy, ``self.train_data`` and
      ``self.valid_data`` are copied as-is;
    * otherwise the images found in the immediate subfolders of
      ``self.train_data`` are listed in a DataFrame (columns
      ``image_filename`` and ``subfolder``), divided with ``self.split`` and
      copied file by file, keeping the subfolder name as the class directory.

    The staged directory is loaded with the ``imagefolder`` builder, the
    ``image``/``label`` columns are renamed to ``autotrain_image``/
    ``autotrain_label``, and the result is either saved to disk
    (``self.local``) or pushed to the Hub as a private repository.

    Returns:
        The local path ``"{project_name}/autotrain-data"`` when ``self.local``
        is truthy, otherwise the Hub repo id
        ``"{username}/autotrain-data-{project_name}"``.

    Raises:
        ValueError: if no validation folder was given and no image file is
            found in the subfolders of ``self.train_data``.
    """
    # Dotted and compared against the lower-cased name so that "a.JPG" is
    # accepted and an extension-less file such as "notapng" is not.
    image_suffixes = (".jpeg", ".jpg", ".png")

    cache_dir = os.environ.get("HF_HOME")
    if not cache_dir:
        cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
    # A random UUID gives every call its own staging directory.
    data_dir = os.path.join(cache_dir, "autotrain", str(uuid.uuid4()))

    if self.valid_data:
        shutil.copytree(self.train_data, os.path.join(data_dir, "train"))
        shutil.copytree(self.valid_data, os.path.join(data_dir, "validation"))
    else:
        # Sorted so the frame handed to self.split has a stable row order
        # regardless of the order in which the filesystem lists entries.
        class_dirs = sorted(entry.path for entry in os.scandir(self.train_data) if entry.is_dir())
        image_filenames = []
        subfolder_names = []
        for class_dir in class_dirs:
            class_name = os.path.basename(class_dir)
            for filename in sorted(os.listdir(class_dir)):
                if filename.lower().endswith(image_suffixes):
                    image_filenames.append(filename)
                    subfolder_names.append(class_name)

        if not image_filenames:
            # Fail here with a clear message instead of later, inside the
            # split or the dataset loader, with an unrelated one.
            raise ValueError(
                f"No image files ({', '.join(image_suffixes)}) found in the subfolders of {self.train_data}"
            )

        df = pd.DataFrame({"image_filename": image_filenames, "subfolder": subfolder_names})
        # NOTE(review): self.split is defined elsewhere; it is expected to
        # return a (train, validation) pair of DataFrames with the same columns.
        train_df, valid_df = self.split(df)

        for split_name, split_df in (("train", train_df), ("validation", valid_df)):
            for row in split_df.itertuples():
                target_dir = os.path.join(data_dir, split_name, row.subfolder)
                os.makedirs(target_dir, exist_ok=True)
                shutil.copy(
                    os.path.join(self.train_data, row.subfolder, row.image_filename),
                    os.path.join(target_dir, row.image_filename),
                )

    # Both branches leave the same train/validation layout behind, so loading
    # and publishing are done once here.
    dataset = load_dataset("imagefolder", data_dir=data_dir)
    dataset = dataset.rename_columns({"image": "autotrain_image", "label": "autotrain_label"})

    if self.local:
        local_path = f"{self.project_name}/autotrain-data"
        dataset.save_to_disk(local_path)
        return local_path

    repo_id = f"{self.username}/autotrain-data-{self.project_name}"
    dataset.push_to_hub(
        repo_id,
        private=True,
        token=self.token,
    )
    return repo_id