in gslib/wildcard_iterator.py [0:0]
def _IterDir(self, directory, wildcard):
  """Iterates over files under a directory whose names match a wildcard.

  Walks `directory` top-down with os.walk(), prunes every subdirectory for
  which self._ExcludeDir() returns a true value, and yields the joined path of
  each file whose name is selected by fnmatch.filter() for `wildcard`.

  Args:
    directory (unicode): The path of the directory to iterate over.
    wildcard (str): The wildcard characters used for filename pattern
        matching.

  Yields:
    (str) A string containing the path to a file somewhere under the directory
    hierarchy of `directory`.

  Raises:
    CommandException: If this method encounters a file path that it cannot
        decode as UTF-8.
  """
  if os.path.splitdrive(directory)[0] == directory:
    # For Windows-style paths that consist of a drive letter followed by a
    # colon, os.path.join behaves in an odd manner. It intentionally will not
    # join ['c:' and 'foo'] as 'c:\\foo', but rather as 'c:foo'. The latter
    # format is not handled correctly by gsutil, so we check if the path
    # specifies the root of a volume, and if so, append a backslash so that
    # the resulting joined path looks like 'c:\\foo'.
    directory += '\\'

  # The directory is coerced to text before being handed to os.walk().
  # NOTE(review): an earlier version of this comment described UTF8-encoding
  # the directory so that os.walk() would not try to decode file names itself;
  # the call below uses six.ensure_text() instead, so confirm that file names
  # with invalid encodings still surface as UnicodeDecodeError in the handler
  # further down rather than inside os.walk().
  walk_root = six.ensure_text(directory)
  for dirpath, dirnames, filenames in os.walk(walk_root, topdown=True):
    filtered_dirnames = []
    for dirname in dirnames:
      full_dir_path = os.path.join(dirpath, dirname)
      if self._ExcludeDir(full_dir_path):
        # Excluded directories are dropped from the walk entirely. Skipping
        # the rest of the loop body also means an excluded symlink does not
        # produce a second log message below.
        continue
      filtered_dirnames.append(dirname)
      # This only prints a log message as os.walk() will not, by default,
      # walk down into symbolic links that resolve to directories.
      if self.logger and os.path.islink(full_dir_path):
        self.logger.info('Skipping symlink directory "%s"', full_dir_path)
    # Replace the contents of dirnames in place so that os.walk() (running
    # top-down) neither visits the removed directories nor their children.
    # See https://docs.python.org/3/library/os.html#os.walk
    dirnames[:] = filtered_dirnames

    for f in fnmatch.filter(filenames, wildcard):
      # Only the path construction lives inside the try block, so that an
      # exception thrown into this generator while it is suspended at the
      # yield below is never mistaken for an undecodable file name.
      try:
        file_path = os.path.join(dirpath, FixWindowsEncodingIfNeeded(f))
      except UnicodeDecodeError:
        # Note: We considered several ways to deal with this, but each had
        # problems:
        # 1. Raise an exception and try to catch in a higher layer (the
        #    gsutil cp command), so we can properly support the gsutil cp -c
        #    option. That doesn't work because raising an exception during
        #    iteration terminates the generator.
        # 2. Accumulate a list of bad filenames and skip processing each
        #    during iteration, then raise at the end, with exception text
        #    printing the bad paths. That doesn't work because iteration is
        #    wrapped in PluralityCheckableIterator, so it's possible there
        #    are not-yet-performed copy operations at the time we reach the
        #    end of the iteration and raise the exception - which would cause
        #    us to skip copying validly named files. Moreover, the gsutil
        #    cp command loops over argv, so if you run the command gsutil cp
        #    -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
        #    would cause dir2 never to be visited.
        # 3. Print the invalid pathname and skip it during iteration. That
        #    would work but would mean gsutil cp could exit with status 0
        #    even though some files weren't copied.
        # 4. Change the WildcardIterator to include an error status along with
        #    the result. That would solve the problem but would be a
        #    substantial change (WildcardIterator is used in many parts of
        #    gsutil), and we didn't feel that magnitude of change was
        #    warranted by this relatively uncommon corner case.
        # Instead we chose to abort when one such file is encountered, and
        # require the user to remove or rename the files and try again.
        raise CommandException('\n'.join(
            textwrap.wrap(_UNICODE_EXCEPTION_TEXT %
                          repr(os.path.join(dirpath, f)))))
      yield file_path