From a71c21963d7358d19fb8eb96a84c76c83a1c39fd Mon Sep 17 00:00:00 2001
From: DinoBektesevic
Date: Wed, 26 Jun 2019 13:32:55 -0700
Subject: [PATCH] Review fixes for S3Datastore

Hardcoded Butler.get from S3 storage for simplest of Datasets with no

- rebased to include the newest changes to daf_butler (namely, ButlerURI)
- removed parsePathToUriElements, S3Location and S3LocationFactory, and replaced all path handling with ButlerURI and its test cases.
- added transaction checks back into S3Datastore. Unsure on proper usage.
- added more path manipulation methods/properties to LocationFactory and Location.
  * bucketName returns the name of the bucket in the Location/LocationFactory
  * Location has a pathInBucket property that will convert POSIX-style paths to S3 protocol-style paths. The main difference is in the leading and trailing separators: S3 interprets the keys `/path/`, `/path`, `path/` and `path` differently, even though some of them are equivalent on a POSIX-compliant system. So what would be `/path/to/file.ext` on a POSIX system reads `path/to/file.ext` on S3, and the bucket is referenced separately with boto3.
- For saving Config as a file, moved all the URI handling logic into Config and out of Butler.makeRepo. The only logic left there is root directory creation.
  * The call to save the config as a file at the butler root directory is now done through dumpToUri, which then resolves the appropriate backend method to call.
- Improved(?) on the proposed scheme for checking whether we are dealing with a file or a directory in the absence of a trailing path separator.
  * Noted some differences between the generality of code requested in the review for writing to files and the inits of config classes. For inits it seems there is a 2+ year old commit limiting Config files to `yaml` type files only. However, the review implied that `dumpToFile` on the Config class should be file-format independent. So, for the `dumpTo*` methods, to check whether we have a dir or a file, I only inspect whether the path ends in a 'something.something' style string. Since in inits I can count on files having to be `yaml` type and having `yaml` extensions, I use a simplified logic there to determine whether we have a dir or a file. It is possible to generalize inits to other filetypes, as long as they have an extension added to them.
  * I assume we do not want to force users to be mindful of trailing separators.
- Now raising errors on unrecognized schemes in all `if scheme ==` patterns.
- Closed the StringIO in Config.dumpToFile.
- fixed up the Formatters to return bytes instead of strings. The fromBytes methods now expect bytes as well. JSON and YAML were the main culprits. Fixed the docs for them. At this point I am fairly confident I had previously overwritten these fixes by accident when rewinding changes on a rebase, because I have done that before, twice.
- Added a different way to check if files exist: cheaper, but can be slower. From https://github.com/boto/botocore/issues/1248 it is my understanding that this should not be an issue anymore, but the newer boto3 versions are slow to hit package managers.
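To make the pathInBucket semantics concrete, here is a minimal sketch of the POSIX-to-S3-key normalization the patch relies on (the helper name `toS3Key` is illustrative, not part of the patch):

    import posixpath

    def toS3Key(path):
        # S3 treats '/path', 'path/', '/path/' and 'path' as distinct keys,
        # so normalize to the bucket-relative 'path/to/file.ext' form used
        # throughout this patch (bucket name is carried separately).
        return posixpath.normpath(path).lstrip('/')

    assert toS3Key('/path/to/file.ext') == 'path/to/file.ext'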
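Usage-wise, the single dumpToUri entry point replaces the per-scheme branching previously done in Butler.makeRepo; a hedged example (bucket name and paths are placeholders, and the local directory is assumed to already exist so the 'butler.yaml' default kicks in):

    from lsst.daf.butler import Config

    config = Config()
    # S3: no file extension, so this is guessed to be a directory and
    # s3://somebucket/butlerRoot/butler.yaml is written.
    config.dumpToUri('s3://somebucket/butlerRoot/')
    # local: writes /tmp/butlerRoot/butler.yaml when the directory exists.
    config.dumpToUri('/tmp/butlerRoot')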
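The dir-or-file guess described above boils down to a single regular expression over the tail of the path; a short illustration of the heuristic (the variable name is illustrative):

    import re

    # Paths that end in 'something.something' are assumed to be files;
    # anything else is treated as a directory.
    fileLikeExpression = re.compile(r"\w*\.\w*$")

    assert fileLikeExpression.search('repo/butler.yaml') is not None  # file
    assert fileLikeExpression.search('repo/butlerRoot') is None       # directory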
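For the cheap-vs-fast existence check, the two boto3 calls involved look roughly like this (bucket and key are placeholders; this mirrors the s3CheckFileExistsGET/LIST implementations in the diff below):

    import boto3

    client = boto3.client('s3')

    # GET-style check: a single head_object call, billed as a GET request;
    # a missing key surfaces as a ClientError with HTTP status 404.
    try:
        obj = client.head_object(Bucket='somebucket', Key='some/key.yaml')
        exists, size = True, obj['ContentLength']
    except client.exceptions.ClientError as err:
        if err.response['ResponseMetadata']['HTTPStatusCode'] == 404:
            exists, size = False, -1
        else:
            raise

    # LIST-style check: billed as a (pricier) LIST request; the Prefix match
    # can return more than one key, which the caller must treat as a miss.
    response = client.list_objects_v2(Bucket='somebucket', Prefix='some/key.yaml')
    exists = response.get('KeyCount', 0) == 1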
--- python/lsst/daf/butler/butler.py | 26 +-- python/lsst/daf/butler/core/butlerConfig.py | 18 +- python/lsst/daf/butler/core/config.py | 49 ++++- python/lsst/daf/butler/core/location.py | 143 ++++++++----- python/lsst/daf/butler/core/s3utils.py | 197 +++++++----------- .../lsst/daf/butler/datastores/s3Datastore.py | 179 ++++++++-------- .../daf/butler/formatters/fileFormatter.py | 2 +- .../daf/butler/formatters/jsonFormatter.py | 18 +- .../daf/butler/formatters/pickleFormatter.py | 2 +- .../daf/butler/formatters/yamlFormatter.py | 23 +- tests/config/basic/s3Datastore.yaml | 2 +- tests/test_butler.py | 22 +- tests/test_butlerFits.py | 12 +- tests/test_location.py | 1 + tests/test_s3utils.py | 90 ++++++++ tests/test_utils.py | 105 ---------- 16 files changed, 448 insertions(+), 441 deletions(-) create mode 100644 tests/test_s3utils.py diff --git a/python/lsst/daf/butler/butler.py b/python/lsst/daf/butler/butler.py index 880acad67f..bb066e26bc 100644 --- a/python/lsst/daf/butler/butler.py +++ b/python/lsst/daf/butler/butler.py @@ -37,7 +37,6 @@ from lsst.utils import doImport from .core.utils import transactional -from .core.s3utils import parsePathToUriElements from .core.datasets import DatasetRef, DatasetType from .core.datastore import Datastore from .core.registry import Registry @@ -50,6 +49,7 @@ from .core.exceptions import ValidationError from .core.repoRelocation import BUTLER_ROOT_TAG from .core.safeFileIo import safeMakeDir +from .core.location import ButlerURI log = logging.getLogger(__name__) @@ -180,16 +180,18 @@ def makeRepo(root, config=None, standalone=False, createRegistry=True, searchPat if isinstance(config, (ButlerConfig, ConfigSubset)): raise ValueError("makeRepo must be passed a regular Config without defaults applied.") - scheme, rootpath, relpath = parsePathToUriElements(root) - if scheme == 'file://': + uri = ButlerURI(root) + if uri.scheme == 'file': root = os.path.abspath(root) if not os.path.isdir(root): - os.makedirs(root) - elif scheme == 's3://': + safeMakeDir(root) + elif uri.scheme == 's3': s3 = boto3.resource('s3') # implies bucket exists, if not another level of checks - bucket = s3.Bucket(rootpath) - bucket.put_object(Bucket=rootpath, Key=(relpath)) + bucket = s3.Bucket(uri.netloc) + bucket.put_object(Bucket=uri.netloc, Key=uri.path.lstrip('/')) + else: + raise ValueError(f'Unrecognized scheme: {uri.scheme}') config = Config(config) # If we are creating a new repo from scratch with relative roots, @@ -202,19 +204,17 @@ def makeRepo(root, config=None, standalone=False, createRegistry=True, searchPat datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) registryClass = doImport(full["registry", "cls"]) registryClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) + if standalone: config.merge(full) - - if scheme == 'file://': - config.dumpToFile(os.path.join(root, "butler.yaml")) - elif scheme == 's3://': - config.dumpToS3(rootpath, os.path.join(relpath, 'butler.yaml')) + config.dumpToUri(uri) # Create Registry and populate tables registryClass.fromConfig(config, create=createRegistry, butlerRoot=root) return config - def __init__(self, config=None, collection=None, run=None): + def __init__(self, config=None, butler=None, collection=None, run=None, searchPaths=None): + # save arguments for pickling self._args = (config, butler, collection, run, searchPaths) if butler is not None: if config is not None or searchPaths is not None: diff --git a/python/lsst/daf/butler/core/butlerConfig.py
b/python/lsst/daf/butler/core/butlerConfig.py index 3efde5bf5c..ae5801b494 100644 --- a/python/lsst/daf/butler/core/butlerConfig.py +++ b/python/lsst/daf/butler/core/butlerConfig.py @@ -27,7 +27,7 @@ import os.path -from .s3utils import parsePathToUriElements +from .location import ButlerURI from .config import Config from .datastore import DatastoreConfig from .schema import SchemaConfig @@ -77,12 +77,18 @@ def __init__(self, other=None, searchPaths=None): return if isinstance(other, str): - if other.startswith('s3://') and not other.endswith('.yaml'): - scheme, root, relpath = parsePathToUriElements(other) - other = scheme + os.path.join(root, relpath, "butler.yaml") - else: - if os.path.isdir(other): + uri = ButlerURI(other) + if uri.scheme == 'file': + if os.path.isdir(uri.path): other = os.path.join(other, "butler.yaml") + elif uri.scheme == 's3': + if not uri.path.endswith('yaml'): + if not uri.path.endswith('/'): + uri = ButlerURI(uri.geturl()+'/') + uri.updateFile('butler.yaml') + other = uri.geturl() + else: + raise ValueError(f'Unrecognized URI scheme: {uri.scheme}') # Create an empty config for us to populate super().__init__() diff --git a/python/lsst/daf/butler/core/config.py index abaf8c8676..9e379f94b6 100644 --- a/python/lsst/daf/butler/core/config.py +++ b/python/lsst/daf/butler/core/config.py @@ -31,9 +31,9 @@ import yaml import sys import io +import re from yaml.representer import Representer - -from .s3utils import parsePathToUriElements +from lsst.daf.butler.core.location import ButlerURI try: import boto3 @@ -244,10 +244,10 @@ def __initFromS3File(self, path): raise ModuleNotFoundError(('boto3 not found. ' 'Are you sure it is installed?')) - scheme, bucket, key = parsePathToUriElements(path) + uri = ButlerURI(path) s3 = boto3.client('s3') try: - response = s3.get_object(Bucket=bucket, Key=key) + response = s3.get_object(Bucket=uri.netloc, Key=uri.path.lstrip('/')) except (s3.exceptions.NoSuchKey, s3.exceptions.NoSuchBucket) as err: raise FileNotFoundError(f'No such file or directory: {path}') from err byteStr = response['Body'].read() @@ -773,9 +773,6 @@ def dumpToS3(self, bucket, key): key : `str` Path to the file to use for output, relative to the bucket. """ - if boto3 is None: - raise ModuleNotFoundError(("Could not find boto3. " "Are you sure it is installed?")) stream = io.StringIO() self.dump(stream) stream.seek(0) @@ -783,6 +780,44 @@ s3 = boto3.client('s3') s3.put_object(Bucket=bucket, Key=key, Body=stream.read()) + stream.close() + + def dumpToUri(self, uri): + """Write the config to the location pointed to by the given URI. + + If the URI does not specify a filename, `butler.yaml` will be used by + default. Currently supports 's3' and 'file' URI schemes. + + Parameters + ---------- + uri : `str` or `ButlerURI` + URI of the location where the Config will be written. + """ + if boto3 is None: + raise ModuleNotFoundError(("Could not find boto3. " + "Are you sure it is installed?")) + + if isinstance(uri, str): + uri = ButlerURI(uri) + + if uri.scheme == 'file': + if os.path.isdir(uri.path): + uri = ButlerURI(os.path.join(uri.path, 'butler.yaml')) + self.dumpToFile(uri.path) + elif uri.scheme == 's3': + # we guesstimate whether this is a dir or a file. Paths that *end* + # in 'something.something' style strings (`[A-Za-z0-9].[A-Za-z0-9]`) + # are assumed to be files.
+ fileLikeExpression = re.compile(r"\w*\.\w*$") + if fileLikeExpression.search(uri.path) is None: + # trailing '/' is mandatory for dirs, otherwise ButlerURI + # has a hard time updating files in URIs. + if not uri.path.endswith('/'): + uri = ButlerURI(uri.geturl()+'/') + uri.updateFile('butler.yaml') + self.dumpToS3(uri.netloc, uri.path.lstrip('/')) + else: + raise ValueError(f'Unrecognized URI scheme: {uri.scheme}') + @staticmethod def updateParameters(configType, config, full, toUpdate=None, toCopy=None, overwrite=True): """Generic helper function for updating specific config parameters. diff --git a/python/lsst/daf/butler/core/location.py index 7c7b537c90..8661444126 100644 --- a/python/lsst/daf/butler/core/location.py +++ b/python/lsst/daf/butler/core/location.py @@ -175,23 +175,11 @@ def replace(self, **kwargs): new : `ButlerURI` New `ButlerURI` object with updated values. """ - - def __init__(self, datastoreRoot): - """Constructor - Parameters - ---------- - datastoreRoot : `str` - Root location of the `Datastore` in the filesystem. - """ return self.__class__(self._uri._replace(**kwargs)) def updateFile(self, newfile): """Update in place the final component of the path with the supplied file name. - """ - - def fromUri(self, uri): - """Factory function to create a `Location` from a URI. Parameters ---------- @@ -220,6 +208,21 @@ def __str__(self): def _fixupFileUri(parsed, root=None, forceAbsolute=False): """Fix up relative paths in file URI instances. + Parameters + ---------- + parsed : `~urllib.parse.ParseResult` + The result from parsing a URI using `urllib.parse`. + root : `str`, optional + Path to use as root when converting relative to absolute. + If `None`, it will be the current working directory. This + is a local file system path, not a URI. + forceAbsolute : `bool` + If `True`, a scheme-less relative URI will be converted to an + absolute path using a ``file`` scheme. If `False`, a scheme-less URI + will remain scheme-less and will not be updated to ``file`` or an + absolute path. URIs with a defined scheme will not be affected + by this parameter. + Returns ------- modified : `~urllib.parse.ParseResult` @@ -306,8 +309,10 @@ class Location: Relative path within datastore. Assumed to be using the local path separator if a ``file`` scheme is being used for the URI, else a POSIX separator. - """ + + __slots__ = ("_datastoreRootUri", "_path") + def __init__(self, datastoreRootUri, path): if isinstance(datastoreRootUri, str): datastoreRootUri = ButlerURI(datastoreRootUri) @@ -342,15 +347,44 @@ def uri(self): @property def path(self): """Path corresponding to location. - This path includes the root of the `Datastore`. + + This path includes the root of the `Datastore`, but does not include + non-path components of the root URI. If a file URI scheme is being + used the path will be returned with the local OS path separator. """ - return os.path.join(self._datastoreRoot, self._uri.path.lstrip("/")) + if not self._datastoreRootUri.scheme: + # Entirely local file system + return os.path.normpath(os.path.join(self._datastoreRootUri.path, self.pathInStore)) + elif self._datastoreRootUri.scheme == "file": + return os.path.normpath(os.path.join(posix2os(self._datastoreRootUri.path), self.pathInStore)) + else: + return posixpath.join(self._datastoreRootUri.path, self.pathInStore) @property def pathInStore(self): - """Path corresponding to S3Location relative to `S3Datastore` root. + """Path corresponding to location relative to `Datastore` root.
+ + Uses the same path separator as supplied to the object constructor. + """ + return self._path + + @property + def bucketName(self): + """If the Location URI uses the S3 scheme, returns the bucket name.""" + if self._datastoreRootUri.scheme == 's3': + return self._datastoreRootUri.netloc + else: + raise AttributeError(f'Path {self} is not an S3 URI.') + + @property + def pathInBucket(self): + """Returns the path in an S3 Bucket, normalized so that the directory + structure in the bucket matches that of a local filesystem. + + Effectively, this is the path property with the POSIX separator + stripped from the left-hand side of the path. """ - return self._uri.path.lstrip("/") + return self.path.lstrip('/') def updateExtension(self, ext): """Update the file extension associated with this `Location`. @@ -363,68 +397,67 @@ """ if ext is None: return - path, _ = os.path.splitext(self._uri.path) + + path, _ = os.path.splitext(self.pathInStore) # Ensure that we have a leading "." on file extension (and we do not # try to modify the empty string) if ext and not ext.startswith("."): ext = "." + ext - parts = list(self._uri) - parts[2] = path + ext - self._uri = urllib.parse.urlparse(urllib.parse.urlunparse(parts)) + self._path = path + ext class LocationFactory: """Factory for `Location` instances. + + The factory is constructed from the root location of the datastore. + This location can be a path on the file system (absolute or relative) + or as a URI. + + Parameters + ---------- + datastoreRoot : `str` + Root location of the `Datastore` either as a path in the local + filesystem or as a URI. File scheme URIs can be used. If a local + filesystem path is used without URI scheme, it will be converted + to an absolute path and any home directory indicators expanded. + If a file scheme is used with a relative path, the path will + be treated as a posixpath but then converted to an absolute path. """ def __init__(self, datastoreRoot): - """Constructor - - Parameters - ---------- - bucket : `str` - Name of the Bucket that is used. - datastoreRoot : `str` - Root location of the `S3Datastore` in the Bucket. - """ - self._datastoreRoot = datastoreRoot + self._datastoreRootUri = ButlerURI(datastoreRoot, forceAbsolute=True) - def fromUri(self, uri): - """Factory function to create a `S3Location` from a URI. + def __str__(self): + return f"{self.__class__.__name__}@{self._datastoreRootUri}" - Parameters - ---------- - uri : `str` - A valid Universal Resource Identifier. + @property + def bucketName(self): + """If the datastore root URI uses the S3 scheme, returns the bucket name.""" + if self._datastoreRootUri.scheme == 's3': + return self._datastoreRootUri.netloc + else: + raise AttributeError(f'Path {self} is not an S3 URI.') - Returns - ------- - location : `S3Location` - The equivalent `S3Location`. - """ - if uri is None or not isinstance(uri, str): - raise ValueError("URI must be a string and not {}".format(uri)) - return Location(self._datastoreRoot, uri) + @property + def datastoreRootName(self): + """Returns the name of the directory used as the datastore's root.""" + return os.path.basename(self._datastoreRootUri.path) def fromPath(self, path): - """Factory function to create a `S3Location` from a POSIX-like path. + """Factory function to create a `Location` from a POSIX path. Parameters ---------- path : `str` - A POSIX-like path, relative to the `S3Datastore` root. + A standard POSIX path, relative to the `Datastore` root.
Returns ------- - location : `S3Location` - The equivalent `S3Location`. + location : `Location` + The equivalent `Location`. """ - uri = urllib.parse.urljoin("file://", path) - return self.fromUri(uri) if os.path.isabs(path): - raise ValueError(('A path whose absolute location is in an S3 bucket ' 'can not have an absolute path: {}').format(path)) - - return self.fromUri('s3://' + os.path.join(self._bucket, self._datastoreRoot, path)) + raise ValueError("LocationFactory path must be relative to datastore, not absolute.") + return Location(self._datastoreRootUri, path) diff --git a/python/lsst/daf/butler/core/s3utils.py index 318580bc1f..1214778c55 100644 --- a/python/lsst/daf/butler/core/s3utils.py +++ b/python/lsst/daf/butler/core/s3utils.py @@ -19,11 +19,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. -__all__ = ("s3CheckFileExists", "parsePathToUriElements", "bucketExists") - -import os -import urllib -from urllib.parse import urlparse +__all__ = ("s3CheckFileExists", "bucketExists") try: import boto3 @@ -31,21 +27,56 @@ boto3 = None -def s3CheckFileExists(client, bucket, filepath): - """Returns (True, filesize) if file exists in the bucket - and (False, -1) if the file is not found. +def s3CheckFileExistsGET(client, bucket, filepath): + """Returns (True, filesize) if file exists in the bucket and (False, -1) if + the file is not found. - You are getting charged for a Bucket GET request. The request - returns the list of all files matching the given filepath. - Multiple matches are considered a non-match. + Parameters + ---------- + client : `boto3.client` + S3 Client object to query. + bucket : `str` + Name of the bucket in which to look. + filepath : `str` + Path to file. + + Returns + ------- + (`bool`, `int`) : `tuple` + Tuple (exists, size). If file exists (True, filesize) + and (False, -1) when the file is not found. + + Notes + ----- + A Bucket GET request will be charged against your account. This is on + average 10x cheaper than a LIST request (see the `s3CheckFileExistsLIST` + function) but can be up to 90% slower whenever additional work is done with + the s3 client. See + ``https://github.com/boto/botocore/issues/1248`` + for details; in boto3 versions >=1.11.0 the loss of performance should no + longer be an issue. + S3 paths are sensitive to leading and trailing path separators. + """ + try: + obj = client.head_object(Bucket=bucket, Key=filepath) + return (True, obj['ContentLength']) + except client.exceptions.ClientError as err: + if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404: + return (False, -1) + raise + + +def s3CheckFileExistsLIST(client, bucket, filepath): + """Returns (True, filesize) if file exists in the bucket and (False, -1) if + the file is not found. Parameters ---------- - client : 'boto3.client' + client : `boto3.client` S3 Client object to query. - bucket : 'str' + bucket : `str` Name of the bucket in which to look. - filepath : 'str' + filepath : `str` Path to file. Returns @@ -53,12 +84,17 @@ (`bool`, `int`) : `tuple` Tuple (exists, size). If file exists (True, filesize) and (False, -1) when the file is not found. + + Notes + ----- + You are getting charged for a Bucket LIST request. This is on average 10x + more expensive than a GET request (see the `s3CheckFileExistsGET` function) but + can be up to 90% faster whenever additional work is done with the s3 + client.
For details, see + ``https://github.com/boto/botocore/issues/1248``. + In boto3 versions >=1.11.0 the loss of performance should not be an issue + anymore. """ - # this has maxkeys kwarg, limited to 1000 by default - # apparently this is the fastest way do look-ups as it avoids having - # to create and add a new HTTP connection to pool - # https://github.com/boto/botocore/issues/1248 - # https://github.com/boto/boto3/issues/1128 response = client.list_objects_v2( Bucket=bucket, Prefix=filepath ) @@ -71,111 +107,40 @@ return (False, -1) -def parsePathToUriElements(path, root=None): - """Constructs elements of an URI. +def s3CheckFileExists(client, bucket, filepath, cheap=True): + """Returns (True, filesize) if file exists in the bucket and (False, -1) if + the file is not found. Parameters ---------- - path : `str` - URI or a POSIX-like path to parse. URI can contain an absolute or an - relative path. - root : `str` (optional) - If provided path is relative, it is taken relative to given root. - If the provided path is absolute and contains given root, it is split - on the root. - If the provided path is absolute and does not contain root, then root - is ignored + client : `boto3.client` + S3 Client object to query. + bucket : `str` + Name of the bucket in which to look. + filepath : `str` + Path to file. + cheap : `bool` + If True, makes a GET request to S3 instead of a LIST request. See + `s3CheckFileExistsGET` or `s3CheckFileExistsLIST` for more details. Returns ------- - scheme : 'str' - Either 'file://' or 's3://'. - root : 'str' - Absolute path up to the top of the relative path determined by root. If - no root is given root is taken to be the entire path to the last dir. - Practically speaking, this is the bucket name or the given root. - relpath : 'str' - Posix-like path relative to root. - - Examples - -------- - When URIs contain absolute paths to files or directories: - - >>> parsePathToUriElements('s3://bucketname/root/relative/file.ext') - ('s3://', 'bucketname', 'root/relative/') - - >>> parsePathToUriElements('file:///root/relative/file.ext') - ('file://', '/root/relative', 'file.ext') - - >>> parsePathToUriElements('file:///root/relative/file.ext', '/root') - ('file://', '/root', 'relative/file.ext') - - Only `file://` URIs can contain relative paths. The same case for S3 URIs - would not make any sense as neither the bucket nor root key would be known. - - >>> parsePathToUriElements('file://relative/file.ext', '/root') - ('file://', '/root', 'relative/file.ext') - - The behaviour is the same in the case POSIX-like paths are given instead of - an URI. - - >>> parsePathToUriElements('relative/file.ext', '/root') - ('file://', '/root', 'relative/file.ext') - - >>> parsePathToUriElements('/root/relative/file.ext') - ('file://', '/root/relative', 'file.ext') - - Root is as an empty string if a relative path without root is given. - - >>> parsePathToUriElements('relative/file.ext') - ('file://', '', 'relative/file.ext') + (`bool`, `int`) : `tuple` + Tuple (exists, size). If file exists (True, filesize) + and (False, -1) when the file is not found.
""" - if path.startswith('~'): - path = os.path.expanduser(path) - - parsed = urlparse(path) - - if parsed.scheme == 'file' or not parsed.scheme: - scheme = 'file://' - # Absolute paths in URI and absolute POSIX paths - if not parsed.netloc and os.path.isabs(parsed.path): - if root is None or root not in path: - rootPath, relPath = os.path.split(parsed.path) - else: - parts = parsed.path.split(root) - rootPath = os.path.abspath(os.path.join(parts[0], root)) - relPath = parts[-1].lstrip(os.sep) - # Relative paths in URI and relative POSIX paths - else: - if root is None: - rootPath = '' if root is None else os.path.abspath(root) - relPath = os.path.join(parsed.netloc, parsed.path.lstrip('/')) - else: - tmpPath = os.path.abspath( - os.path.join( - root, parsed.netloc, parsed.path.lstrip(os.sep) - ) - ) - rootPath = os.path.commonpath((tmpPath, root)) - relPath = tmpPath[len(rootPath):].lstrip(os.sep) - # S3 URIs are always s3://bucketName/root/subdir/file.ext - elif parsed.scheme == 's3': - scheme = 's3://' - rootPath = parsed.netloc - relPath = parsed.path.lstrip('/') - else: - raise urllib.error.URLError(f'Can not parse path: {path}') - - return scheme, rootPath, relPath + if cheap: + return s3CheckFileExistsGET(client, bucket, filepath) + return s3CheckFileExistsLIST(client, bucket, filepath) -def bucketExists(uri): - """Check if the S3 bucket at a given URI actually exists. +def bucketExists(bucketName): + """Check if the S3 bucket with the given name actually exists. Parameters ---------- - uri : `str` - URI of the S3 Bucket + bucketName : `str` + Name of the S3 Bucket Returns ------- @@ -186,12 +151,10 @@ def bucketExists(uri): if boto3 is None: raise ModuleNotFoundError(("Could not find boto3. " "Are you sure it is installed?")) - client = boto3.client('s3') - scheme, root, relpath = parsePathToUriElements(uri) + s3 = boto3.client('s3') try: - client.get_bucket_location(Bucket=root) - # bucket exists, all is well + s3.get_bucket_location(Bucket=bucketName) return True - except client.exceptions.NoSuchBucket: + except s3.exceptions.NoSuchBucket: return False diff --git a/python/lsst/daf/butler/datastores/s3Datastore.py b/python/lsst/daf/butler/datastores/s3Datastore.py index 7876c919a4..4c150c0efc 100644 --- a/python/lsst/daf/butler/datastores/s3Datastore.py +++ b/python/lsst/daf/butler/datastores/s3Datastore.py @@ -26,19 +26,17 @@ import os import logging from collections import namedtuple -from urllib.parse import urlparse import tempfile import boto3 -from lsst.daf.butler import (Config, Datastore, DatastoreConfig, S3LocationFactory, +from lsst.daf.butler import (Config, Datastore, DatastoreConfig, LocationFactory, ButlerURI, Location, FileDescriptor, FormatterFactory, FileTemplates, StoredFileInfo, StorageClassFactory, DatasetTypeNotSupportedError, DatabaseDict, DatastoreValidationError, FileTemplateValidationError, Constraints) from lsst.daf.butler.core.utils import transactional, getInstanceOf -from lsst.daf.butler.core.s3utils import (s3CheckFileExists, - parsePathToUriElements) +from lsst.daf.butler.core.s3utils import s3CheckFileExists, bucketExists from lsst.daf.butler.core.repoRelocation import replaceRoot log = logging.getLogger(__name__) @@ -55,7 +53,7 @@ class S3Datastore(Datastore): `Registry` to use when recording the writing of Datasets. root : `str` Root directory of this `Datastore`. - s3locationFactory : `LocationFactory` + locationFactory : `LocationFactory` Factory for creating locations relative to S3 bucket. 
locationFactory : `LocationFactory` Factory for creating locations relative to datastore root @@ -97,7 +95,7 @@ def setConfigRoot(cls, root, config, full, overwrite=True): Parameters ---------- root : `str` - Filesystem path to the root of the data repository. + URI to the root of the data repository. config : `Config` A `Config` to update. Only the subset understood by this component will be updated. Will not expand @@ -137,21 +135,16 @@ def __init__(self, config, registry, butlerRoot=None): # Support repository relocation in config self.root = replaceRoot(self.config["root"], butlerRoot) - - parsed = urlparse(self.root) - self.s3locationFactory = S3LocationFactory(parsed.netloc, parsed.path) + self.locationFactory = LocationFactory(self.root) self.client = boto3.client('s3') - # self.bucket = self.s3locationFactory.bucket - # we check if a bucket actually exists or not. For PosixDatastore this - # would be checking if directory exists, and if not, it would create - # one. Call to client.create_bucket is possible but also requires ACL - # LocationConstraints, Permissions and other configuration parameters. - try: - # bucket exsists - all is well - self.client.get_bucket_location(Bucket=parsed.netloc) - except self.client.exceptions.NoSuchBucket: - raise IOError(f"Bucket {parsed.netloc} does not exists!") + if not bucketExists(self.locationFactory.bucketName): + # PosixDatastore creates the root directory if one does not exist. + # Calling s3 client.create_bucket is possible but also requires + # ACL LocationConstraints, Permissions and other configuration + # parameters, so for now we do not create a bucket if one is + # missing. Further discussion can make this happen though. + raise IOError(f"Bucket {self.locationFactory.bucketName} does not exist!") self.formatterFactory = FormatterFactory() self.storageClassFactory = StorageClassFactory() @@ -246,8 +239,8 @@ def exists(self, ref): except KeyError: return False - loc = self.s3locationFactory.fromPath(storedFileInfo.path) - return s3CheckFileExists(self.client, loc.bucket, loc.path)[0] + loc = self.locationFactory.fromPath(storedFileInfo.path) + return s3CheckFileExists(self.client, loc.bucketName, loc.pathInBucket)[0] def get(self, ref, parameters=None): """Load an InMemoryDataset from the store. @@ -283,19 +276,29 @@ except KeyError: raise FileNotFoundError(f"Could not retrieve Dataset {ref}.") - location = self.s3locationFactory.fromPath(storedFileInfo.path) + location = self.locationFactory.fromPath(storedFileInfo.path) - # checks for existence were done through listing request - # (s3CheckFileExists), but since we have to make a GET request to S3 - # anyhow (for download) we might as well use the HEADER metadata for - # size comparison instead + # since we have to make a GET request to S3 anyhow (for download) we + # might as well use the HEADER metadata for size comparison instead. + # s3CheckFileExists would just duplicate GET/LIST charges in this case. try: - response = self.client.get_object(Bucket=location.bucket, Key=location.path) + response = self.client.get_object(Bucket=location.bucketName, + Key=location.pathInBucket) except self.client.exceptions.ClientError as err: - if err['Error']['Code'] == '404': - errstr = ("Dataset with Id {} does not exists at expected " "location {}") - raise FileNotFoundError(errstr.format(ref.id, location.path)) from err + errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"] + # the HTTP code 403 should be reserved for authorization failures.
+ # For some reason it is 403 that is raised when paths that begin + # with '/' are requested. So `path/file` raises a 404 but + # `/path/file` raises a 403. Unit tests right now demand a + # FileNotFoundError be raised, so I do, but this should be updated. + # This has been the way boto3 does errors since at least 2015 + # https://github.com/boto/boto3/issues/167 +# if errorcode == 403: +# errmsg = "Insufficient, or invalid, permissions." +# raise PermissionError(errmsg) from err + if errorcode == 404 or errorcode == 403: + errmsg = "Dataset with Id {} does not exist at expected location {}." + raise FileNotFoundError(errmsg.format(ref.id, location)) from err # other errors are reraised also, but less descriptively raise err @@ -334,10 +337,7 @@ def get(self, ref, parameters=None): except NotImplementedError: with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile: tmpFile.file.write(serializedDataset) - # This makes me think that there is a reason to have both - # Location and S3Location factories in this class, where - # Location would track the temporary files and directories. - fileDescriptor.location = Location('/', tmpFile.name) + fileDescriptor.location = Location(*os.path.split(tmpFile.name)) result = formatter.read(fileDescriptor, component=component) except Exception as e: raise ValueError(f"Failure from formatter for Dataset {ref.id}: {e}") from e @@ -370,6 +370,14 @@ def put(self, inMemoryDataset, ref): Supplied object and storage class are inconsistent. DatasetTypeNotSupportedError The associated `DatasetType` is not handled by this datastore. + + Notes + ----- + If the datastore is configured to reject certain dataset types it + is possible that the put will fail and raise a + `DatasetTypeNotSupportedError`. The main use case for this is to + allow `ChainedDatastore` to put to multiple datastores without + requiring that every datastore accepts the dataset. """ datasetType = ref.datasetType storageClass = datasetType.storageClass @@ -392,7 +400,7 @@ except KeyError as e: raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e - location = self.s3locationFactory.fromPath(template.format(ref)) + location = self.locationFactory.fromPath(template.format(ref)) # Get the formatter based on the storage class try: @@ -404,29 +412,28 @@ # then the file is written if checks against overwriting an existing # file pass. But in S3 `Keys` instead only look like directories, but # are not. We check if an exact full key already exists before writing - instead. + instead; if it does not exist, inserting the key is equivalent + # to creating the directory and the file together.
location.updateExtension(formatter.extension) - if s3CheckFileExists(self.client, location.bucket, location.path)[0]: + if s3CheckFileExists(self.client, location.bucketName, location.pathInBucket)[0]: raise FileExistsError(f"Cannot write file for ref {ref} as " f"output file {location.uri} exists.") - # upload the file directly from bytes or by using a temporary file + # upload the file directly from bytes or by using a temporary file if + # _toBytes is not implemented for the formatter in question fileDescriptor = FileDescriptor(location, storageClass=storageClass) try: serializedDataset = formatter.toBytes(inMemoryDataset, fileDescriptor) - self.client.put_object(Bucket=location.bucket, Key=location.path, + self.client.put_object(Bucket=location.bucketName, Key=location.pathInBucket, Body=serializedDataset) log.debug("Wrote file directly to %s", location.uri) except NotImplementedError: with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile: - # same as in get - is it worthwile carrying a factory for - # tempfile overrides like these - fileDescriptor.location = Location('/', tmpFile.name) + fileDescriptor.location = Location(*os.path.split(tmpFile.name)) formatter.write(inMemoryDataset, fileDescriptor) - self.client.upload_file(Bucket=location.bucket, Key=location.path, + self.client.upload_file(Bucket=location.bucketName, Key=location.pathInBucket, Filename=tmpFile.name) - log.debug("Wrote file to %s via a temporary directory.", - location.uri) + log.debug("Wrote file to %s via a temporary directory.", location.uri) # URI is needed to resolve which ingest case we are dealing with self.ingest(location.uri, ref, formatter=formatter) @@ -474,71 +481,59 @@ def ingest(self, path, ref, formatter=None, transfer=None): formatter = self.formatterFactory.getFormatter(ref) # we cannot assume that root is the same as self.root when ingesting - scheme, root, relpath = parsePathToUriElements(path) - if (scheme != 'file://') and (scheme != 's3://'): - raise NotImplementedError(f'Scheme type {scheme} not supported.') + uri = ButlerURI(path) + if (uri.scheme != 'file') and (uri.scheme != 's3'): + raise NotImplementedError(f'Scheme type {uri.scheme} not supported.') if transfer is None: - if scheme == 'file://': - # someone wants to ingest a local file, but not transfer it to - # object storage - abspath = os.path.join(root, relpath) - errmsg = ("'{}' is not inside repository root '{}'. Ingesting " "local data to S3Datastore without upload to S3 is " "not allowed.") - raise RuntimeError(errmsg.format(abspath, self.root)) - if scheme == 's3://': - # if both transfer is None and scheme s3, file was already - # uploaded in put. There is no equivalent of os.isdir/os.isfile - # for the S3 - problem for parsing. The string comparisons will - # fail since one of them will have the '/' and the other not. - scheme, bucketname, rootDir = parsePathToUriElements(self.root) - topDir = relpath.split('/')[0] + '/' - rootDir = rootDir + '/' if rootDir[-1] != '/' else rootDir - if (bucketname != root) or (rootDir != topDir): - raise RuntimeError((f"'{path}' is not inside repository " f"root '{self.root}'")) + if uri.scheme == 'file': + # ingest a local file, but do not transfer it to object storage + errmsg = (f"'{uri}' is not inside repository root '{self.root}'.
" + "Ingesting local data to S3Datastore without upload " + "to S3 is not allowed.") + raise RuntimeError(errmsg.format(uri, self.root)) + elif uri.scheme == 's3': + # if transfer is None and scheme is s3, put already uploaded it + rooturi = ButlerURI(self.root) + if not uri.path.startswith(rooturi.path): + raise RuntimeError(f"'{uri}' is not inside repository root '{rooturi}'.") elif transfer == 'move' or transfer == 'copy': - if scheme == 'file://': - # reuploads not allowed? - if s3CheckFileExists(self.client, root, relpath)[0]: - raise FileExistsError(f"File '{path}' exists") + if uri.scheme == 'file': + # uploading file from local disk and potentially deleting it + if s3CheckFileExists(self.client, uri.netloc, uri.pathInBucket)[0]: + raise FileExistsError(f"File '{path}' exists!") template = self.templates.getTemplate(ref) - location = self.s3locationFactory.fromPath(template.format(ref)) + location = self.locationFactory.fromPath(template.format(ref)) location.updateExtension(formatter.extension) - - self.client.upload_file(Bucket=location.bucket, - Key=location.path, + self.client.upload_file(Bucket=location.bucketName, Key=location.path, Filename=path) if transfer == 'move': os.remove(path) - - if scheme == 's3://': - # ingesting is done from another bucket - not tested - if s3CheckFileExists(self.client, root, relpath)[0]: - fullpath = os.path.join(root, relpath) - raise FileExistsError(f"File '{scheme+fullpath}' exists.") - - copySrc = {'Bucket': root, 'Key': relpath} - self.client.copy(copySrc, self.s3locationFactory._bucket, + elif uri.scheme == 's3': + # copying files between buckets, potentially deleting src files + if s3CheckFileExists(self.client, uri.netloc, uri.pathInBucket)[0]: + raise FileExistsError(f"File '{uri}' exists.") + + relpath = uri.path.lstrip('/') + copySrc = {'Bucket': uri.netloc, 'Key': relpath} + self.client.copy(copySrc, self.locationFactory.bucketName, relpath) - if transfer == 'move': # https://github.com/boto/boto3/issues/507 - there is no # way of knowing if the file was actually deleted except # for checking all the keys again, reponse just ends up # being HTTP 204 OK request all the time - self.client.delete(Bucket=root, Key=relpath) + self.client.delete(Bucket=uri.netloc, Key=relpath) else: raise NotImplementedError(f"Transfer type '{transfer}' not supported.") if path.startswith(self.root): path = path[len(self.root):].lstrip('/') - location = self.s3locationFactory.fromPath(path) + location = self.locationFactory.fromPath(path) # the file should exist on the bucket by now - exists, size = s3CheckFileExists(self.client, location.bucket, relpath) + exists, size = s3CheckFileExists(self.client, location.bucketName, uri.path.lstrip('/')) self.registry.addDatasetLocation(ref, self.name) # Associate this dataset with the formatter for later read. 
@@ -625,13 +620,13 @@ def remove(self, ref): storedFileInfo = self.getStoredFileInfo(ref) except KeyError: raise FileNotFoundError(f"Requested dataset ({ref}) does not exist") - location = self.s3locationFactory.fromPath(storedFileInfo.path) - if not s3CheckFileExists(self.client, location.bucket, location.path): + location = self.locationFactory.fromPath(storedFileInfo.path) + if not s3CheckFileExists(self.client, location.bucketName, location.pathInBucket)[0]: raise FileNotFoundError("No such file: {0}".format(location.uri)) # https://github.com/boto/boto3/issues/507 - there is no way of knowing # if the file was actually deleted - self.client.delete_object(Bucket=location.bucket, Key=location.path) + self.client.delete_object(Bucket=location.bucketName, Key=location.pathInBucket) # Remove rows from registries self.removeStoredFileInfo(ref) diff --git a/python/lsst/daf/butler/formatters/fileFormatter.py index 1268cf54aa..a08a7066fa 100644 --- a/python/lsst/daf/butler/formatters/fileFormatter.py +++ b/python/lsst/daf/butler/formatters/fileFormatter.py @@ -260,7 +260,7 @@ def toBytes(self, inMemoryDataset, fileDescriptor): Returns ------- - serializedDataset : `str` + serializedDataset : `bytes` bytes representing the serialized dataset. """ if not hasattr(self, '_toBytes'): diff --git a/python/lsst/daf/butler/formatters/jsonFormatter.py index 619ecf347d..f07762ee3b 100644 --- a/python/lsst/daf/butler/formatters/jsonFormatter.py +++ b/python/lsst/daf/butler/formatters/jsonFormatter.py @@ -78,14 +78,14 @@ def _writeFile(self, inMemoryDataset, fileDescriptor): with open(fileDescriptor.location.path, "w") as fd: if hasattr(inMemoryDataset, "_asdict"): inMemoryDataset = inMemoryDataset._asdict() - fd.write(self._toBytes(inMemoryDataset)) + fd.write(self._toBytes(inMemoryDataset).decode()) - def _fromBytes(self, inMemoryDataset, pytype=None): + def _fromBytes(self, bytesObject, pytype=None): """Read the bytes object as a python object. Parameters ---------- - pickledDataset : `bytes` + bytesObject : `bytes` Bytes object to unserialize. pytype : `class`, optional Not used by this implementation. @@ -93,11 +93,11 @@ Returns ------- data : `object` - Either data as Python object read from the pickled string, or None - if the string could not be read. + Either data as Python object read from bytes, or None if the string + could not be read. """ try: - data = json.loads(inMemoryDataset) + data = json.loads(bytesObject) except json.JSONDecodeError: data = None @@ -114,14 +114,14 @@ def _toBytes(self, inMemoryDataset): Returns ------- data : `bytes` - Bytes object representing the pickled object. + Object stored as bytes. Raises ------ Exception - The object could not be pickled. + The object could not be serialized. """ - return json.dumps(inMemoryDataset) + return json.dumps(inMemoryDataset, ensure_ascii=False).encode() def _coerceType(self, inMemoryDataset, storageClass, pytype=None): """Coerce the supplied inMemoryDataset to type `pytype`.
diff --git a/python/lsst/daf/butler/formatters/pickleFormatter.py b/python/lsst/daf/butler/formatters/pickleFormatter.py index 2b4c84bfcf..92fcf3681e 100644 --- a/python/lsst/daf/butler/formatters/pickleFormatter.py +++ b/python/lsst/daf/butler/formatters/pickleFormatter.py @@ -91,7 +91,7 @@ def _fromBytes(self, inMemoryDataset, pytype=None): Returns ------- - data : `object` + inMemoryDataset : `object` Either data as Python object read from the pickled string, or None if the string could not be read. """ diff --git a/python/lsst/daf/butler/formatters/yamlFormatter.py b/python/lsst/daf/butler/formatters/yamlFormatter.py index d97baba2f2..7048ab2975 100644 --- a/python/lsst/daf/butler/formatters/yamlFormatter.py +++ b/python/lsst/daf/butler/formatters/yamlFormatter.py @@ -61,7 +61,7 @@ def _readFile(self, path, pytype=None): return data - def _fromBytes(self, inMemoryDataset, pytype=None): + def _fromBytes(self, bytesObject, pytype=None): """Read the bytes object as a python object. Parameters @@ -74,25 +74,16 @@ def _fromBytes(self, inMemoryDataset, pytype=None): Returns ------- data : `object` - Either data as Python object read from the pickled string, or None - if the string could not be read. + Either data as Python object read from bytes, or None if the string + could not be read. """ try: - data = yaml.load(inMemoryDataset, Loader=yaml.UnsafeLoader) + data = yaml.load(bytesObject, Loader=yaml.UnsafeLoader) except yaml.YAMLError: data = None try: data = data.exportAsDict() except AttributeError: - # its either my mis-use of yaml or intended behaviour, but yaml - # returns an py object, a list or an dictionary. Later, however, - # FileFormatter assembles and checks if only one of objects - # components was requested. It does so by forcing assembly of the - # full object and then coercing one of its parts to a pytype. I - # didn't want to figure out what is involved in the assembly - # process but it seems as if it only wants an dict. Pickle, - # however, always gets back the object. There is short-cutting - # potential here I believe (but in fileformatter) pass return data @@ -117,7 +108,7 @@ def _writeFile(self, inMemoryDataset, fileDescriptor): with open(fileDescriptor.location.path, "w") as fd: if hasattr(inMemoryDataset, "_asdict"): inMemoryDataset = inMemoryDataset._asdict() - fd.write(self._toBytes(inMemoryDataset)) + fd.write(self._toBytes(inMemoryDataset).decode()) def _toBytes(self, inMemoryDataset): """Write the in memory dataset to a bytestring. @@ -129,7 +120,7 @@ def _toBytes(self, inMemoryDataset): Returns ------- - data : `str` + data : `bytes` YAML string encoded to bytes. Raises @@ -137,7 +128,7 @@ def _toBytes(self, inMemoryDataset): Exception The object could not be serialized. """ - return yaml.dump(inMemoryDataset) + return yaml.dump(inMemoryDataset).encode() def _coerceType(self, inMemoryDataset, storageClass, pytype=None): """Coerce the supplied inMemoryDataset to type `pytype`. 
diff --git a/tests/config/basic/s3Datastore.yaml b/tests/config/basic/s3Datastore.yaml index 1274da844d..c15212ea0d 100644 --- a/tests/config/basic/s3Datastore.yaml +++ b/tests/config/basic/s3Datastore.yaml @@ -1,6 +1,6 @@ datastore: cls: lsst.daf.butler.datastores.s3Datastore.S3Datastore - root: s3://anybucketname + root: s3://anybucketname/butlerRoot templates: default: "{collection}/{datasetType}.{component:?}/{tract:?}/{patch:?}/{physical_filter:?}/{instrument:?}_{visit:?}" calexp: "{collection}/{datasetType}.{component:?}/{datasetType}_v{visit}_f{physical_filter:?}_{component:?}" diff --git a/tests/test_butler.py b/tests/test_butler.py index 6b0d5435ad..66ccc62c7f 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -30,10 +30,6 @@ import string import random -# I suspect botocore might exist if boto2 exists but we only -# use it to reference boto3 generic ClientError class. moto -# should not be able to exist without boto3. Are these good enough -# reasons to bundle them all together? try: import boto3 import botocore @@ -53,7 +49,8 @@ def mock_s3(cls): from lsst.daf.butler import FileTemplateValidationError, ValidationError from examplePythonTypes import MetricsExample from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG -from lsst.daf.butler.core.s3utils import parsePathToUriElements, s3CheckFileExists +from lsst.daf.butler.core.location import ButlerURI +from lsst.daf.butler.core.s3utils import s3CheckFileExists TESTDIR = os.path.abspath(os.path.dirname(__file__)) @@ -405,9 +402,10 @@ def testMakeRepo(self): self.assertEqual(butler2.registry.getAllCollections(), collections1) def testStringification(self): + # root is declared in self.configFile, but which one exactly is this? butler = Butler(self.tmpConfigFile) - butlerStr = str(butler) + butlerStr = str(butler) if self.datastoreStr is not None: for testStr in self.datastoreStr: self.assertIn(testStr, butlerStr) @@ -435,7 +433,7 @@ def checkFileExists(self, root, path): Test testPutTemplates verifies actual physical existence of the files in the requested location. For POSIXDatastore this test is equivalent - to `os.path.exist` call. + to `os.path.exists` call. """ return os.path.exists(os.path.join(root, path)) @@ -457,7 +455,7 @@ def testPutTemplates(self): # Create two almost-identical DatasetTypes (both will use default # template) - dimensions = ("instrument", "visit") + dimensions = butler.registry.dimensions.extract(["instrument", "visit"]) butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass)) butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass)) butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass)) @@ -618,8 +616,8 @@ def genRoot(self): def setUp(self): config = Config(self.configFile) - schema, bucket, root = parsePathToUriElements(config['.datastore.datastore.root']) - self.bucketName = bucket + uri = ButlerURI(config['.datastore.datastore.root']) + self.bucketName = uri.netloc if self.useTempRoot: self.root = self.genRoot() @@ -661,9 +659,9 @@ def checkFileExists(self, root, relpath): if boto3 is None: raise ModuleNotFoundError(("Could not find boto3.
" "Are you sure it is installed?")) - scheme, bucketname, relpath = parsePathToUriElements(root) + uri = ButlerURI(root) client = boto3.client('s3') - return s3CheckFileExists(client, bucketname, relpath)[0] + return s3CheckFileExists(client, uri.netloc, uri.path.lstrip('/'))[0] if __name__ == "__main__": diff --git a/tests/test_butlerFits.py b/tests/test_butlerFits.py index f216d018ac..fbb2f85a0e 100644 --- a/tests/test_butlerFits.py +++ b/tests/test_butlerFits.py @@ -38,13 +38,12 @@ def mock_s3(cls): """ return cls -from lsst.daf.butler.core.s3utils import parsePathToUriElements - import lsst.utils.tests from lsst.daf.butler import Butler, Config from lsst.daf.butler import StorageClassFactory from lsst.daf.butler import DatasetType +from lsst.daf.butler.core.location import ButlerURI from datasetsHelper import FitsCatalogDatasetsHelper, DatasetTestHelper try: @@ -192,8 +191,8 @@ def genRoot(self): def setUp(self): config = Config(self.configFile) - schema, bucket, root = parsePathToUriElements(config['.datastore.datastore.root']) - self.bucketName = bucket + uri = ButlerURI(config['.datastore.datastore.root']) + self.bucketName = uri.netloc if self.useTempRoot: self.root = self.genRoot() @@ -215,8 +214,9 @@ def tearDown(self): bucket = s3.Bucket(self.bucketName) try: bucket.objects.all().delete() - except botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == '404': + except botocore.exceptions.ClientError as err: + errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"] + if errorcode == 404: # the key was not reachable - pass pass else: diff --git a/tests/test_location.py b/tests/test_location.py index 9a78ac6618..df28a1636a 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -68,6 +68,7 @@ def testButlerUri(self): uriStrings = ( ("relative/file.ext", "newfile.fits", "relative/newfile.fits"), ("relative/", "newfile.fits", "relative/newfile.fits"), + ("isThisADirOrFile", "aFile.fits", "aFile.fits"), ("https://www.lsst.org/butler/", "butler.yaml", "/butler/butler.yaml"), ("s3://amazon/datastore/", "butler.yaml", "/datastore/butler.yaml"), ("s3://amazon/datastore/mybutler.yaml", "butler.yaml", "/datastore/butler.yaml"), diff --git a/tests/test_s3utils.py b/tests/test_s3utils.py new file mode 100644 index 0000000000..0540af3b7f --- /dev/null +++ b/tests/test_s3utils.py @@ -0,0 +1,90 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest + +try: + import boto3 + import botocore + from moto import mock_s3 +except ImportError: + boto3 = None + + def mock_s3(cls): + """A no-op decorator in case moto mock_s3 can not be imported. 
+ """ + return cls + +from lsst.daf.butler.core.s3utils import bucketExists, s3CheckFileExists + + +@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") +@mock_s3 +class S3UtilsTestCase(unittest.TestCase): + """Test for the S3 related utilities. + """ + bucketName = 'testBucketName' + fileName = 'testFileName' + + def setUp(self): + s3 = boto3.client('s3') + try: + s3.create_bucket(Bucket=self.bucketName) + s3.put_object(Bucket=self.bucketName, Key=self.fileName, + Body=b'test content') + except s3.exceptions.BucketAlreadyExists: + pass + + def tearDown(self): + s3 = boto3.resource('s3') + bucket = s3.Bucket(self.bucketName) + try: + bucket.objects.all().delete() + except botocore.exceptions.ClientError as err: + errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"] + if errorcode == 404: + # the key does not exists - pass + pass + else: + raise + + bucket = s3.Bucket(self.bucketName) + bucket.delete() + + def testBucketExists(self): + self.assertTrue(bucketExists(f'{self.bucketName}')) + self.assertFalse(bucketExists(f'{self.bucketName}_NO_EXIST')) + + def testFileExists(self): + s3 = boto3.client('s3') + self.assertTrue(s3CheckFileExists(s3, self.bucketName, self.fileName, + cheap=True)[0]) + self.assertFalse(s3CheckFileExists(s3, self.bucketName, self.fileName+'_NO_EXIST', + cheap=True)[0]) + + self.assertTrue(s3CheckFileExists(s3, self.bucketName, self.fileName, + cheap=False)[0]) + self.assertFalse(s3CheckFileExists(s3, self.bucketName, self.fileName+'_NO_EXIST', + cheap=False)[0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_utils.py b/tests/test_utils.py index ed41eb441a..4211555dd8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -20,70 +20,12 @@ # along with this program. If not, see . import unittest -import os - -try: - import boto3 - import botocore - from moto import mock_s3 -except ImportError: - boto3 = None - - def mock_s3(cls): - """A no-op decorator in case moto mock_s3 can not be imported. - """ - return cls from lsst.daf.butler.core.utils import iterable, getFullTypeName, Singleton -from lsst.daf.butler.core.s3utils import (bucketExists, parsePathToUriElements, - s3CheckFileExists) from lsst.daf.butler.core.formatter import Formatter from lsst.daf.butler import StorageClass -@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") -@mock_s3 -class S3UtilsTestCase(unittest.TestCase): - """Test for the S3 related utilities. - """ - bucketName = 'testBucketName' - fileName = 'testFileName' - - def setUp(self): - s3 = boto3.client('s3') - try: - s3.create_bucket(Bucket=self.bucketName) - s3.put_object(Bucket=self.bucketName, Key=self.fileName, - Body=b'test content') - except s3.exceptions.BucketAlreadyExists: - pass - - def tearDown(self): - s3 = boto3.resource('s3') - bucket = s3.Bucket(self.bucketName) - try: - bucket.objects.all().delete() - except botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == '404': - # the key was not reachable - pass - pass - else: - raise - - bucket = s3.Bucket(self.bucketName) - bucket.delete() - - def testBucketExists(self): - self.assertTrue(bucketExists(f's3://{self.bucketName}')) - self.assertFalse(bucketExists(f's3://{self.bucketName}_NO_EXIST')) - - def testFileExists(self): - s3 = boto3.client('s3') - self.assertTrue(s3CheckFileExists(s3, self.bucketName, self.fileName)[0]) - self.assertFalse(s3CheckFileExists(s3, self.bucketName, - self.fileName+'_NO_EXIST')[0]) - - class IterableTestCase(unittest.TestCase): """Tests for `iterable` helper. 
""" @@ -219,53 +161,6 @@ def testTypeNames(self): for item, typeName in tests: self.assertEqual(getFullTypeName(item), typeName) - def testParsePathToUriElements(self): - absPaths = [ - 'file:///rootDir/relative/file.ext', - '/rootDir/relative/file.ext' - ] - relPaths = [ - 'file://relative/file.ext', - 'relative/file.ext' - ] - s3Path = 's3://bucketname/rootDir/relative/file.ext' - globPath1 = '~/relative/file.ext' - globPath2 = '../relative/file.ext' - globPath3 = 'test/../relative/file.ext' - - # absolute paths take precedence over additionaly supplied root paths - for path in absPaths: - self.assertEqual(parsePathToUriElements(path), - ('file://', '/rootDir/relative', 'file.ext')) - self.assertEqual(parsePathToUriElements(path, '/'), - ('file://', '/rootDir/relative', 'file.ext')) - - self.assertEqual(parsePathToUriElements(globPath1, '//rootDir'), - ('file://', os.path.expanduser('~/relative'), 'file.ext')) - self.assertEqual(parsePathToUriElements(globPath1), - ('file://', os.path.expanduser('~/relative'), 'file.ext')) - - # relative paths should not expand, unless root to which they are - # relative to is also provided - for path in relPaths: - self.assertEqual(parsePathToUriElements(path, '/'), - ('file://', '/', 'relative/file.ext')) - self.assertEqual(parsePathToUriElements(path), - ('file://', '', 'relative/file.ext')) - - # basic globbing should work relative to given root or not at all - self.assertEqual(parsePathToUriElements(globPath2, '//rootDir'), - ('file://', '/', 'relative/file.ext')) - self.assertEqual(parsePathToUriElements(globPath3, '/'), - ('file://', '/', 'relative/file.ext')) - self.assertEqual(parsePathToUriElements(globPath2), - ('file://', '', globPath2)) - - self.assertEqual(parsePathToUriElements(s3Path), - ('s3://', 'bucketname', 'rootDir/relative/file.ext')) - self.assertEqual(parsePathToUriElements(s3Path, '/'), - ('s3://', 'bucketname', 'rootDir/relative/file.ext')) - if __name__ == "__main__": unittest.main()