in azure/datalake/store/multithread.py [0:0]
def _setup(self):
    """ Create set of parameters to loop over

    Lists the remote files selected by ``self.rpath`` (glob expansion when
    the path contains glob characters, a recursive walk otherwise), pairs
    each remote entry with a local target path derived from ``self.lpath``
    and submits every pair to ``self.client``, except those whose final
    destination already exists while ``self._overwrite`` is false.

    The computed pairs are also stored on ``self._file_pairs``.

    Returns
    -------
    list
        Final local destination paths that already exist and were therefore
        not submitted. Empty when overwriting is enabled or nothing clashes.

    Raises
    ------
    ValueError
        If the remote listing is empty.
    """
    # Marker appended to local targets while a transfer is under way.
    inprogress_suffix = '.inprogress'

    def is_glob_path(path):
        # A path is a glob when trimming it to its glob-free prefix
        # changes it.
        path = AzureDLPath(path).trim()
        prefix = path.globless_prefix
        return not path == prefix

    is_rpath_glob = is_glob_path(self.rpath)
    if is_rpath_glob:
        rfiles = self.client._adlfs.glob(self.rpath, details=True, invalidate_cache=True)
    else:
        rfiles = self.client._adlfs.walk(self.rpath, details=True, invalidate_cache=True)

    if not rfiles:
        raise ValueError('No files to download')

    # If only one file is returned we are not sure whether the user specified
    # a dir or a file to download, since walk gives the same result for both,
    # i.e. walk("DirWithSingleFile") == walk("DirWithSingleFile/SingleFile").
    # If the user specified a file in rpath, we want to download the file into
    # lpath directly and not create another subdir for it.
    # If the user specified a dir that happens to contain only one file, we
    # want to create the dir as well under lpath.
    # The remote info() lookup is kept last so it only runs when needed.
    if len(rfiles) == 1 and not is_rpath_glob and self.client._adlfs.info(self.rpath)['type'] == 'FILE':
        # isdir() is already False for a path that does not exist, so no
        # separate exists() check is required.
        if os.path.isdir(self.lpath):
            file_pairs = [(os.path.join(self.lpath,
                                        os.path.basename(rfiles[0]['name'] + inprogress_suffix)),
                           rfiles[0])]
        else:
            # lpath names the target file itself; it is used as given,
            # without the in-progress marker.
            file_pairs = [(self.lpath, rfiles[0])]
    else:
        local_rel_rpath = str(AzureDLPath(self.rpath).trim().globless_prefix)
        file_pairs = [(os.path.join(self.lpath,
                                    os.path.relpath(f['name'] + inprogress_suffix, local_rel_rpath)),
                       f)
                      for f in rfiles]

    # this property is used for internal validation
    # and should not be referenced directly by public callers
    self._file_pairs = file_pairs

    existing_files = []
    for lfile, rfile in file_pairs:
        # only interested in the final destination file name for existence,
        # not the initial inprogress target. Strip the marker only when it is
        # the trailing suffix: removing every occurrence would corrupt paths
        # whose directory or file names happen to contain '.inprogress'.
        if lfile.endswith(inprogress_suffix):
            destination_file = lfile[:-len(inprogress_suffix)]
        else:
            destination_file = lfile

        if not self._overwrite and os.path.exists(destination_file):
            existing_files.append(destination_file)
        else:
            self.client.submit(rfile['name'], lfile, rfile['length'])

    return existing_files