in src/autotrain/preprocessor/vision.py [0:0]
def prepare(self):
    """Stage train/validation image folders, build an ``imagefolder`` dataset,
    and either save it locally or push it to the Hugging Face Hub.

    Two input layouts are supported:
      * ``self.valid_data`` set: both folders are copied verbatim.
      * only ``self.train_data``: its ``metadata.jsonl`` is split via
        ``self.split`` and the referenced images are copied into per-split
        folders.

    Returns:
        str: ``"{project_name}/autotrain-data"`` when ``self.local`` is
        truthy, otherwise the Hub repo id
        ``"{username}/autotrain-data-{project_name}"``.
    """
    # Stage everything under a unique directory inside the HF cache so
    # concurrent runs never collide.
    cache_dir = os.environ.get("HF_HOME")
    if not cache_dir:
        cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
    data_dir = os.path.join(cache_dir, "autotrain", str(uuid.uuid4()))
    train_dir = os.path.join(data_dir, "train")
    valid_dir = os.path.join(data_dir, "validation")

    if self.valid_data:
        # User supplied both splits: copy each folder as-is (copytree also
        # creates the destination directories).
        shutil.copytree(self.train_data, train_dir)
        shutil.copytree(self.valid_data, valid_dir)
    else:
        # Only a train folder: split its metadata and copy the images
        # referenced by each half into the corresponding split folder.
        metadata = pd.read_json(os.path.join(self.train_data, "metadata.jsonl"), lines=True)
        train_df, valid_df = self.split(metadata)
        for split_dir, split_df in ((train_dir, train_df), (valid_dir, valid_df)):
            os.makedirs(split_dir, exist_ok=True)
            for file_name in split_df["file_name"]:
                shutil.copy(
                    os.path.join(self.train_data, file_name),
                    os.path.join(split_dir, file_name),
                )
            # Write the raw split metadata first; _process_metadata below
            # reads it back and rewrites a normalized version.
            split_df.to_json(os.path.join(split_dir, "metadata.jsonl"), orient="records", lines=True)

    # Shared post-processing for both input layouts (previously duplicated
    # verbatim in each branch).
    for split_dir in (train_dir, valid_dir):
        split_metadata = self._process_metadata(split_dir)
        split_metadata.to_json(os.path.join(split_dir, "metadata.jsonl"), orient="records", lines=True)

    dataset = load_dataset("imagefolder", data_dir=data_dir)
    dataset = dataset.rename_columns(
        {
            "image": "autotrain_image",
            "target": "autotrain_label",
        }
    )
    if self.local:
        dataset.save_to_disk(f"{self.project_name}/autotrain-data")
        return f"{self.project_name}/autotrain-data"
    dataset.push_to_hub(
        f"{self.username}/autotrain-data-{self.project_name}",
        private=True,
        token=self.token,
    )
    return f"{self.username}/autotrain-data-{self.project_name}"