in src/preprocess.py [0:0]
def _download_file(self, index, bucket, key):
    """Download one CSV object from S3 and load it into a DataFrame.

    The object is written to ``<base_dir>/data/<index>.csv``, parsed with
    pandas, and the local copy is removed again before returning.

    Args:
        index: Value used to build the local file name (``<index>.csv``).
        bucket: Name of the S3 bucket to download from.
        key: Key of the object inside ``bucket``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the downloaded file.

    Raises:
        Any exception raised by the S3 download or by ``pd.read_csv`` is
        propagated unchanged; the local file is still deleted if parsing
        fails.
    """
    pathlib.Path(f"{self._base_dir}/data").mkdir(parents=True, exist_ok=True)
    self._logger.info("Downloading data from bucket: %s, key: %s", bucket, key)
    fn = f"{self._base_dir}/data/{index}.csv"
    s3 = boto3.resource("s3")
    s3.Bucket(bucket).download_file(key, fn)
    try:
        self._logger.debug("Reading raw input data.")
        # The CSV is read without a header row; column names and dtypes come
        # from module-level definitions outside this block.
        # NOTE(review): merge_two_dicts presumably returns the union of the
        # feature and label dtype mappings - confirm against its definition.
        df = pd.read_csv(
            fn,
            header=None,
            names=feature_columns_names + [label_column],
            dtype=DataProcessor.merge_two_dicts(feature_columns_dtype, label_column_dtype),
        )
    finally:
        # Always remove the local copy, even when parsing fails, so failed
        # runs do not leave files behind in the data directory.
        os.unlink(fn)
    return df