in iopath/common/s3.py [0:0]
def _get_local_path(self, path: str, **kwargs: Any) -> str:
    """
    Get a filepath which is compatible with native Python I/O such as `open`
    and `os.path`.

    If the URI points to a remote resource, this function may download and
    cache the resource to local disk. In that case, the cache stays on the
    filesystem (under `file_io.get_cache_dir()`) and can be reused by a
    different run. Therefore this function is meant to be used with read-only
    resources.

    Args:
        path (str): A URI supported by this PathHandler

    Returns:
        local_path (str): a file path which exists on the local file system
    """
    logger = logging.getLogger(__name__)
    self._check_kwargs(kwargs)

    # Cheap check first.
    if path.endswith("/"):
        raise NotImplementedError(
            "S3PathHandler does not currently support downloading directories"
        )

    assert self._isfile(path)
    local_path = self._local_cache_path(path)
    with file_lock(local_path):
        if os.path.exists(local_path):
            # Reuse the cached copy only if its last modified time is *after* the
            # remote object's last modified time; otherwise the cache is stale
            # and the object is redownloaded.
            response = self._head_object(path)
            if response is not None:
                remote_dt = response["LastModified"]
                local_dt = dt.datetime.fromtimestamp(
                    os.path.getmtime(local_path)
                ).astimezone()
                # NOTE: may consider still avoiding the cache if the two times are
                # close, to avoid a race condition. Currently, a lengthy download of
                # a very recent but stale file would leave a late local last-modified
                # timestamp, and the stale copy would be improperly reused.
                # Better fix: set the local last modified time from the remote
                # object's last modified time, in download_file().
                if (local_dt - remote_dt) > dt.timedelta(minutes=0):
                    logger.info(
                        "URL {} was already cached in {}".format(path, local_path)
                    )
                    return local_path
logger.info("Caching {} ...".format(path))
tmp = local_path + ".tmp"
# clean-up tmp if found, because if tmp exists, it must be a dirty
# result of a previously process that didn't cleanup itself.
if os.path.isfile(tmp):
os.unlink(tmp)
        bucket, s3_path = self._parse_uri(path)
        client = self._get_client(bucket)
        try:
            # Download to tmp first, then move it into place: the move is
            # (almost?) atomic when src and dst are on the same file system,
            # which avoids a partial cache state if the process is killed.
            client.download_file(
                bucket, s3_path, tmp, Config=self.transfer_config
            )
            shutil.move(tmp, local_path)
        finally:
            try:
                os.unlink(tmp)
            except Exception:
                pass

        logger.info("URL {} cached in {}".format(path, local_path))
        return local_path
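
The NOTE above points at a simple remedy for the race between download time and the staleness check: stamp the cached file with the remote object's LastModified once the download completes. Below is a minimal sketch of that idea, assuming the head-object response dict used in the method; `_stamp_remote_mtime` is a hypothetical helper, not part of this module.

import os


def _stamp_remote_mtime(local_path: str, head_response: dict) -> None:
    # Hypothetical helper sketching the "better fix" from the NOTE above.
    # boto3 reports LastModified as a timezone-aware datetime; convert it to a
    # POSIX timestamp.
    remote_mtime = head_response["LastModified"].timestamp()
    # Set atime and mtime to the remote timestamp, so a later
    # (local_dt - remote_dt) comparison reflects the remote object's age rather
    # than when the download happened to finish.
    os.utime(local_path, (remote_mtime, remote_mtime))

If called right after shutil.move(tmp, local_path), this would keep a lengthy download of a recent but stale object from masquerading as fresh; the cache check would then also need to treat equal timestamps as fresh (>= rather than >), since local and remote times would coincide for an unchanged object.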