def _IterDir()

in gslib/wildcard_iterator.py [0:0]


  def _IterDir(self, directory, wildcard):
    """An iterator over the specified dir and wildcard.

    Args:
      directory (unicode): The path of the directory to iterate over.
      wildcard (str): The wildcard characters used for filename pattern
          matching.

    Yields:
      (str) A string containing the path to a file somewhere under the directory
      hierarchy of `directory`.

    Raises:
      ComandException: If this method encounters a file path that it cannot
      decode as UTF-8.
    """
    if os.path.splitdrive(directory)[0] == directory:
      # For Windows-style paths that consist of a drive letter followed by a
      # colon, os.path.join behaves in an odd manner. It intentionally will not
      # join ['c:' and 'foo'] as 'c:\\foo', but rather as 'c:foo'. The latter
      # format is not handled correctly by gsutil, so we check if the path
      # specifies the root of a volume, and if so, append a backslash so that
      # the resulting joined path looks like 'c:\\foo'.
      directory += '\\'

    # UTF8-encode directory before passing it to os.walk() so if there are
    # non-valid UTF8 chars in the file name (e.g., that can happen if the file
    # originated on Windows) os.walk() will not attempt to decode and then die
    # with a "codec can't decode byte" error, and instead we can catch the error
    # at yield time and print a more informative error message.
    for dirpath, dirnames, filenames in os.walk(six.ensure_text(directory),
                                                topdown=True):
      filtered_dirnames = []

      for dirname in dirnames:
        full_dir_path = os.path.join(dirpath, dirname)
        # Removes directories in place to prevent them and their children from
        # being iterated. See https://docs.python.org/3/library/os.html#os.walk
        if not self._ExcludeDir(full_dir_path):
          filtered_dirnames.append(dirname)
        else:
          # If a symlink is excluded above we don't want to print 2 messages.
          continue
        # This only prints a log message as os.walk() will not, by default,
        # walk down into symbolic links that resolve to directories.
        if self.logger and os.path.islink(full_dir_path):
          self.logger.info('Skipping symlink directory "%s"', full_dir_path)

      dirnames[:] = filtered_dirnames

      for f in fnmatch.filter(filenames, wildcard):
        try:
          yield os.path.join(dirpath, FixWindowsEncodingIfNeeded(f))
        except UnicodeDecodeError:
          # Note: We considered several ways to deal with this, but each had
          # problems:
          # 1. Raise an exception and try to catch in a higher layer (the
          #    gsutil cp command), so we can properly support the gsutil cp -c
          #    option. That doesn't work because raising an exception during
          #    iteration terminates the generator.
          # 2. Accumulate a list of bad filenames and skip processing each
          #    during iteration, then raise at the end, with exception text
          #    printing the bad paths. That doesn't work because iteration is
          #    wrapped in PluralityCheckableIterator, so it's possible there
          #    are not-yet-performed copy operations at the time we reach the
          #    end of the iteration and raise the exception - which would cause
          #    us to skip copying validly named files. Moreover, the gsutil
          #    cp command loops over argv, so if you run the command gsutil cp
          #    -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
          #    would cause dir2 never to be visited.
          # 3. Print the invalid pathname and skip it during iteration. That
          #    would work but would mean gsutil cp could exit with status 0
          #    even though some files weren't copied.
          # 4. Change the WildcardIterator to include an error status along with
          #    the result. That would solve the problem but would be a
          #    substantial change (WildcardIterator is used in many parts of
          #    gsutil), and we didn't feel that magnitude of change was
          #    warranted by this relatively uncommon corner case.
          # Instead we chose to abort when one such file is encountered, and
          # require the user to remove or rename the files and try again.
          raise CommandException('\n'.join(
              textwrap.wrap(_UNICODE_EXCEPTION_TEXT %
                            repr(os.path.join(dirpath, f)))))