# -*- coding: utf-8 -*-
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""File and Cloud URL representation classes."""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import os
import re
import stat
import sys
from gslib.exception import CommandException
from gslib.exception import InvalidUrlError
from gslib.utils import system_util
from gslib.utils import text_util
# Matches provider strings of the form 'gs://'
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Matches bucket strings of the form 'gs://bucket'
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Matches object strings of the form 'gs://bucket/obj'
OBJECT_REGEX = re.compile(
r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)')
# Matches versioned object strings of the form 'gs://bucket/obj#1234'
GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')
# Matches versioned object strings of the form 's3://bucket/obj#NULL'
S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')
# Matches file strings of the form 'file://dir/filename'
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)')
# Regex to determine if a string contains any wildcards.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
RELATIVE_PATH_SYMBOLS = frozenset(['.', '..'])
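# Illustrative sketch (not part of the original module) of what the regexes
# above capture, assuming standard gs/s3 URL forms:
#   PROVIDER_REGEX.match('gs://').group('provider')            # -> 'gs'
#   BUCKET_REGEX.match('gs://bucket/').group('bucket')         # -> 'bucket'
#   OBJECT_REGEX.match('gs://bucket/a/b').group('object')      # -> 'a/b'
#   GS_GENERATION_REGEX.match('obj#1234').group('generation')  # -> '1234'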
class StorageUrl(object):
"""Abstract base class for file and Cloud Storage URLs."""
def Clone(self):
raise NotImplementedError('Clone not overridden')
def IsFileUrl(self):
raise NotImplementedError('IsFileUrl not overridden')
def IsCloudUrl(self):
raise NotImplementedError('IsCloudUrl not overridden')
  def IsStream(self):
    raise NotImplementedError('IsStream not overridden')
def IsFifo(self):
raise NotImplementedError('IsFifo not overridden')
def CreatePrefixUrl(self, wildcard_suffix=None):
"""Returns a prefix of this URL that can be used for iterating.
Args:
wildcard_suffix: If supplied, this wildcard suffix will be appended to the
prefix with a trailing slash before being returned.
Returns:
A prefix of this URL that can be used for iterating.
If this URL contains a trailing slash, it will be stripped to create the
prefix. This helps avoid infinite looping when prefixes are iterated, but
preserves other slashes so that objects with '/' in the name are handled
properly.
For example, when recursively listing a bucket with the following contents:
gs://bucket// <-- object named slash
gs://bucket//one-dir-deep
a top-level expansion with '/' as a delimiter will result in the following
URL strings:
'gs://bucket//' : OBJECT
'gs://bucket//' : PREFIX
If we right-strip all slashes from the prefix entry and add a wildcard
suffix, we will get 'gs://bucket/*' which will produce identical results
(and infinitely recurse).
Example return values:
('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
('gs://bucket/', '*') becomes 'gs://bucket/*'
('gs://bucket/', None) becomes 'gs://bucket'
('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
as a BucketListingObject, so we will not recurse on it as a subdir
during listing.
"""
raise NotImplementedError('CreatePrefixUrl not overridden')
def _WarnIfUnsupportedDoubleWildcard(self):
"""Warn if ** use may lead to undefined results."""
# Accepted 'url_string' values with '**', where '^' = start, and '$' = end.
# - ^**$
# - ^**/
# - /**$
# - /**/
if not self.object_name:
return
delimiter_bounded_url = self.delim + self.object_name + self.delim
split_url = delimiter_bounded_url.split(
'{delim}**{delim}'.format(delim=self.delim))
removed_correct_double_wildcards_url_string = ''.join(split_url)
if '**' in removed_correct_double_wildcards_url_string:
# Found a center '**' not in the format '/**/'.
# Not using logger.warning b/c it's too much overhead to pass the logger
# object to every StorageUrl.
      sys.stderr.write(
          '** behavior is undefined if directly preceded or followed by'
          ' characters other than / in the cloud and {} locally.'.format(
              os.sep))
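  # Illustrative sketch (not part of the original module): with '/' as the
  # delimiter, an object name like 'a/**/b' passes silently, while 'a**/b'
  # or 'a/**b' triggers the warning above, because the '**' is not bounded
  # by delimiters on both sides.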
@property
def url_string(self):
raise NotImplementedError('url_string not overridden')
@property
def versionless_url_string(self):
raise NotImplementedError('versionless_url_string not overridden')
def __eq__(self, other):
return isinstance(other, StorageUrl) and self.url_string == other.url_string
def __hash__(self):
return hash(self.url_string)
class _FileUrl(StorageUrl):
"""File URL class providing parsing and convenience methods.
This class assists with usage and manipulation of an
(optionally wildcarded) file URL string. Depending on the string
contents, this class represents one or more directories or files.
For File URLs, scheme is always file, bucket_name is always blank,
and object_name contains the file/directory path.
"""
def __init__(self, url_string, is_stream=False, is_fifo=False):
self.scheme = 'file'
self.delim = os.sep
self.bucket_name = ''
# If given a URI that starts with "<scheme>://", the object name should not
# include that prefix.
match = FILE_OBJECT_REGEX.match(url_string)
if match and match.lastindex == 2:
self.object_name = match.group(2)
else:
self.object_name = url_string
# On Windows, the pathname component separator is "\" instead of "/". If we
# find an occurrence of "/", replace it with "\" so that other logic can
# rely on being able to split pathname components on `os.sep`.
if system_util.IS_WINDOWS:
self.object_name = self.object_name.replace('/', '\\')
self.generation = None
self.is_stream = is_stream
self.is_fifo = is_fifo
self._WarnIfUnsupportedDoubleWildcard()
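  # Illustrative sketch (not part of the original module): on a POSIX system,
  # per FILE_OBJECT_REGEX above, the scheme prefix is stripped if present:
  #   _FileUrl('file://dir/data.txt').object_name  # -> 'dir/data.txt'
  #   _FileUrl('dir/data.txt').object_name         # -> 'dir/data.txt'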
def Clone(self):
return _FileUrl(self.url_string)
def IsFileUrl(self):
return True
def IsCloudUrl(self):
return False
def IsStream(self):
return self.is_stream
def IsFifo(self):
return self.is_fifo
def IsDirectory(self):
return (not self.IsStream() and not self.IsFifo() and
os.path.isdir(self.object_name))
def CreatePrefixUrl(self, wildcard_suffix=None):
return self.url_string
@property
def url_string(self):
return '%s://%s' % (self.scheme, self.object_name)
@property
def versionless_url_string(self):
return self.url_string
def __str__(self):
return self.url_string
class _CloudUrl(StorageUrl):
"""Cloud URL class providing parsing and convenience methods.
This class assists with usage and manipulation of an
(optionally wildcarded) cloud URL string. Depending on the string
contents, this class represents a provider, bucket(s), or object(s).
This class operates only on strings. No cloud storage API calls are
made from this class.
"""
def __init__(self, url_string):
self.scheme = None
self.delim = '/'
self.bucket_name = None
self.object_name = None
self.generation = None
provider_match = PROVIDER_REGEX.match(url_string)
bucket_match = BUCKET_REGEX.match(url_string)
if provider_match:
self.scheme = provider_match.group('provider')
elif bucket_match:
self.scheme = bucket_match.group('provider')
self.bucket_name = bucket_match.group('bucket')
else:
object_match = OBJECT_REGEX.match(url_string)
if object_match:
self.scheme = object_match.group('provider')
self.bucket_name = object_match.group('bucket')
self.object_name = object_match.group('object')
        if self.object_name in RELATIVE_PATH_SYMBOLS:
          raise InvalidUrlError('%s is an invalid root-level object name' %
                                self.object_name)
if self.scheme == 'gs':
generation_match = GS_GENERATION_REGEX.match(self.object_name)
if generation_match:
self.object_name = generation_match.group('object')
self.generation = generation_match.group('generation')
elif self.scheme == 's3':
version_match = S3_VERSION_REGEX.match(self.object_name)
if version_match:
self.object_name = version_match.group('object')
self.generation = version_match.group('version_id')
else:
raise InvalidUrlError(
'CloudUrl: URL string %s did not match URL regex' % url_string)
if url_string[(len(self.scheme) + len('://')):].startswith(self.delim):
raise InvalidUrlError(
'Cloud URL scheme should be followed by colon and two slashes: "://".'
' Found: "{}"'.format(url_string))
self._WarnIfUnsupportedDoubleWildcard()
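  # Illustrative sketch (not part of the original module): parsing a
  # generation-qualified gs URL, per GS_GENERATION_REGEX above:
  #   url = _CloudUrl('gs://bucket/obj#1556693145678091')
  #   url.bucket_name  # -> 'bucket'
  #   url.object_name  # -> 'obj'
  #   url.generation   # -> '1556693145678091'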
def Clone(self):
return _CloudUrl(self.url_string)
def IsFileUrl(self):
return False
def IsCloudUrl(self):
return True
def IsStream(self):
raise NotImplementedError('IsStream not supported on CloudUrl')
def IsFifo(self):
raise NotImplementedError('IsFifo not supported on CloudUrl')
def IsBucket(self):
return bool(self.bucket_name and not self.object_name)
def IsObject(self):
return bool(self.bucket_name and self.object_name)
def HasGeneration(self):
return bool(self.generation)
def IsProvider(self):
return bool(self.scheme and not self.bucket_name)
def CreatePrefixUrl(self, wildcard_suffix=None):
prefix = StripOneSlash(self.versionless_url_string)
if wildcard_suffix:
prefix = '%s/%s' % (prefix, wildcard_suffix)
return prefix
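  # Illustrative sketch (not part of the original module), matching the
  # example return values documented on StorageUrl.CreatePrefixUrl:
  #   _CloudUrl('gs://bucket/subdir/').CreatePrefixUrl('*')  # -> 'gs://bucket/subdir/*'
  #   _CloudUrl('gs://bucket/').CreatePrefixUrl()            # -> 'gs://bucket'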
@property
def bucket_url_string(self):
return '%s://%s/' % (self.scheme, self.bucket_name)
@property
def url_string(self):
url_str = self.versionless_url_string
if self.HasGeneration():
url_str += '#%s' % self.generation
return url_str
@property
def versionless_url_string(self):
if self.IsProvider():
return '%s://' % self.scheme
elif self.IsBucket():
return self.bucket_url_string
return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)
def __str__(self):
return self.url_string
def GetSchemeFromUrlString(url_str):
"""Returns scheme component of a URL string."""
end_scheme_idx = url_str.find('://')
if end_scheme_idx == -1:
# File is the default scheme.
return 'file'
else:
return url_str[0:end_scheme_idx].lower()
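# Illustrative sketch (not part of the original module):
#   GetSchemeFromUrlString('GS://bucket')  # -> 'gs' (lowercased)
#   GetSchemeFromUrlString('/tmp/file')    # -> 'file' (no '://' present)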
def IsKnownUrlScheme(scheme_str):
return scheme_str in ('file', 's3', 'gs')
def _GetPathFromUrlString(url_str):
"""Returns path component of a URL string."""
end_scheme_idx = url_str.find('://')
if end_scheme_idx == -1:
return url_str
else:
return url_str[end_scheme_idx + 3:]
def ContainsWildcard(url_string):
"""Checks whether url_string contains a wildcard.
Args:
url_string: URL string to check.
Returns:
bool indicator.
"""
return bool(WILDCARD_REGEX.search(url_string))
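# Illustrative sketch (not part of the original module):
#   ContainsWildcard('gs://bucket/*.txt')  # -> True
#   ContainsWildcard('gs://bucket/obj')    # -> False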
def GenerationFromUrlAndString(url, generation):
"""Decodes a generation from a StorageURL and a generation string.
This is used to represent gs and s3 versioning.
Args:
url: StorageUrl representing the object.
generation: Long or string representing the object's generation or
version.
Returns:
Valid generation string for use in URLs.
"""
if url.scheme == 's3' and generation:
return text_util.DecodeLongAsString(generation)
return generation
def HaveFileUrls(args_to_check):
"""Checks whether args_to_check contain any file URLs.
Args:
args_to_check: Command-line argument subset to check.
Returns:
True if args_to_check contains any file URLs.
"""
for url_str in args_to_check:
storage_url = StorageUrlFromString(url_str)
if storage_url.IsFileUrl():
return True
return False
def HaveProviderUrls(args_to_check):
"""Checks whether args_to_check contains any provider URLs (like 'gs://').
Args:
args_to_check: Command-line argument subset to check.
Returns:
True if args_to_check contains any provider URLs.
"""
for url_str in args_to_check:
storage_url = StorageUrlFromString(url_str)
if storage_url.IsCloudUrl() and storage_url.IsProvider():
return True
return False
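# Illustrative sketch (not part of the original module):
#   HaveFileUrls(['gs://b/o', './local'])  # -> True ('./local' is a file URL)
#   HaveProviderUrls(['gs://'])            # -> True (bare provider URL)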
def IsCloudSubdirPlaceholder(url, blr=None):
"""Determines if a StorageUrl is a cloud subdir placeholder.
This function is needed because GUI tools (like the GCS cloud console) allow
users to create empty "folders" by creating a placeholder object; and parts
of gsutil need to treat those placeholder objects specially. For example,
gsutil rsync needs to avoid downloading those objects because they can cause
conflicts (see comments in rsync command for details).
We currently detect two cases:
- Cloud objects whose name ends with '_$folder$'
- Cloud objects whose name ends with '/'
Args:
url: (gslib.storage_url.StorageUrl) The URL to be checked.
blr: (gslib.BucketListingRef or None) The blr to check, or None if not
available. If `blr` is None, size won't be checked.
Returns:
(bool) True if the URL is a cloud subdir placeholder, otherwise False.
"""
if not url.IsCloudUrl():
return False
url_str = url.url_string
if url_str.endswith('_$folder$'):
return True
if blr and blr.IsObject():
size = blr.root_object.size
else:
size = 0
return size == 0 and url_str.endswith('/')
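# Illustrative sketch (not part of the original module), with blr left as None:
#   IsCloudSubdirPlaceholder(StorageUrlFromString('gs://b/dir_$folder$'))  # -> True
#   IsCloudSubdirPlaceholder(StorageUrlFromString('gs://b/dir/'))          # -> True
#   IsCloudSubdirPlaceholder(StorageUrlFromString('gs://b/obj'))           # -> False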
def IsFileUrlString(url_str):
"""Returns whether a string is a file URL."""
return GetSchemeFromUrlString(url_str) == 'file'
def StorageUrlFromString(url_str):
"""Static factory function for creating a StorageUrl from a string."""
scheme = GetSchemeFromUrlString(url_str)
if not IsKnownUrlScheme(scheme):
raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)
if scheme == 'file':
path = _GetPathFromUrlString(url_str)
is_stream = (path == '-')
is_fifo = False
try:
is_fifo = stat.S_ISFIFO(os.stat(path).st_mode)
except OSError:
pass
return _FileUrl(url_str, is_stream=is_stream, is_fifo=is_fifo)
return _CloudUrl(url_str)
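# Illustrative sketch (not part of the original module):
#   StorageUrlFromString('gs://bucket/obj').IsCloudUrl()  # -> True
#   StorageUrlFromString('-').IsStream()                  # -> True (stdin/stdout)
#   StorageUrlFromString('ftp://x')                       # raises InvalidUrlError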
def StripOneSlash(url_str):
if url_str and url_str.endswith('/'):
return url_str[:-1]
return url_str
def UrlsAreForSingleProvider(url_args):
"""Tests whether the URLs are all for a single provider.
Args:
url_args: (Iterable[str]) Collection of strings to check.
Returns:
True if all URLs are for single provider; False if `url_args` was empty (as
this would not result in a single unique provider) or URLs targeted multiple
unique providers.
"""
provider = None
url = None
for url_str in url_args:
url = StorageUrlFromString(url_str)
if not provider:
provider = url.scheme
elif url.scheme != provider:
return False
return provider is not None
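# Illustrative sketch (not part of the original module):
#   UrlsAreForSingleProvider(['gs://b1/o', 'gs://b2'])  # -> True
#   UrlsAreForSingleProvider(['gs://b/o', 's3://b/o'])  # -> False
#   UrlsAreForSingleProvider([])                        # -> False (no provider)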
def UrlsAreMixOfBucketsAndObjects(urls):
"""Tests whether the URLs are a mix of buckets and objects.
Args:
url_args: (Iterable[gslib.storage_url.StorageUrl]) Collection of URLs to
check.
Returns:
True if URLs are a mix of buckets and objects. False if URLs are all buckets
or all objects. None if invalid Cloud URLs are included.
"""
if all(url.IsCloudUrl() for url in urls):
are_buckets = list(map(lambda x: x.IsBucket(), urls))
return any(are_buckets) and not all(are_buckets)
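# Illustrative sketch (not part of the original module):
#   urls = [StorageUrlFromString('gs://b'), StorageUrlFromString('gs://b/o')]
#   UrlsAreMixOfBucketsAndObjects(urls)  # -> True (one bucket, one object)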
def RaiseErrorIfUrlsAreMixOfBucketsAndObjects(urls, recursion_requested):
"""Raises error if mix of buckets and objects adjusted for recursion."""
if UrlsAreMixOfBucketsAndObjects(urls) and not recursion_requested:
raise CommandException('Cannot operate on a mix of buckets and objects.')
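# Illustrative sketch (not part of the original module): with the mixed urls
# shown above and recursion_requested=False, this raises CommandException;
# with recursion_requested=True, it is a no-op.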