From 4902aea6aeb69f37b299dcd5e7df025f373f9882 Mon Sep 17 00:00:00 2001 From: Binam Bajracharya <44302895+BinamB@users.noreply.github.com> Date: Thu, 13 Aug 2020 15:07:10 -0500 Subject: [PATCH] feat(bundles): Add bundle endpoints (#267) * migration * migrate * new_db_and_migration * bundle * drs_enpoint_bundle_integration * More tests * authorize * remove_unused_bundle * no_duplicates * checksum * form_type * literaleval_to_eval * json_dump_loads * query_limit * fixed_bundle_structure * fixed_checksums * get_bundles_in_drs * add_params_drs * get_ordered_list_and_tests * sort_fix * test_self_reference * better_error_msg_and_tests * black * better_post_return * more_tests * updated_swagger * add alias and description in drs * remove expand and initial fixes * resolve merge conflicts * blueprint fix * fix drs blueprint * resolve drs drivers and swagger * Add optional fields * optional fields * add drs to readme * fix typos * more typos Co-authored-by: BinamB --- README.md | 1 + docs/drs.md | 147 ++++++++ indexd/drs/blueprint.py | 202 +++++++++- indexd/index/blueprint.py | 200 +++++++++- indexd/index/drivers/alchemy.py | 237 +++++++++++- indexd/index/schema.py | 28 ++ indexd/utils.py | 4 + openapis/swagger.yaml | 285 +++++++++++++- tests/test_bundles.py | 599 ++++++++++++++++++++++++++++++ tests/test_client.py | 115 ++++++ tests/test_driver_alchemy_crud.py | 98 +++++ tests/test_drs.py | 19 +- tests/test_schema_migration.py | 12 + tests/test_setup.py | 12 + 14 files changed, 1911 insertions(+), 48 deletions(-) create mode 100644 docs/drs.md create mode 100644 tests/test_bundles.py diff --git a/README.md b/README.md index 725259db..865535ff 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,7 @@ Indexd's distributed resolution logic for a given GUID/alias is roughly as follo * [More info DOS](https://github.com/ga4gh/data-repository-service-schemas) * NOTE: Was renamed to DRS * Resolving to servers with other identifiers, like [ARK IDs](http://n2t.net/e/ark_ids.html) could be supported if a client was created (otherwise, you can use the _aliases_ in Indexd to simply map from an existing identifier to a GUID) + * We have a [GA4GH DRS Implementation](./docs/drs.md) which includes bundles. * The distributed resolution can be "smart", in that you can configure `hints` that tell a central resolver Indexd that a given input should be resolved with a specific distributed service * The `hints` are a list of regexes that will attempt to match against given input * For example: `hints: ["10\..*"]` for DOIs since they'll begin with `10.` diff --git a/docs/drs.md b/docs/drs.md new file mode 100644 index 00000000..926476a9 --- /dev/null +++ b/docs/drs.md @@ -0,0 +1,147 @@ +# Data Repository Service + +Data Repository Service (DRS) API provides a generic interface to data repositories so that data can be accessed in a single, standardized way regardless of where it's stored and how it's managed. DRS v1 supports two data types, *blob* and *bundles*. Detailed information on DRS [here](https://github.com/ga4gh/data-repository-service-schemas). +* *Blob* is like a file, a single blob of bytes, without `contents_array`. +* *Bundle* is like a folder structure that can contain `DrsObject` (either blobs or bundle) insisde `contents_array`. 
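+
+For orientation, here is a minimal sketch of fetching each form through the DRS endpoint described below. The host `example-commons.org` and the use of the `requests` library are assumptions; the GUIDs reuse the hypothetical example objects shown further down:
+
+```python
+import requests
+
+BASE = "https://example-commons.org"  # hypothetical deployment hostname
+
+# A blob: a single file-like DrsObject, so "form" is "object" and "contents" is empty.
+blob = requests.get(
+    f"{BASE}/ga4gh/drs/v1/objects/dg.xxxx/01c3e7b2-2aca-47fc-b2e2-a5d7196652a5"
+).json()
+
+# A bundle: "form" is "bundle"; expand=true recursively includes nested contents.
+bundle = requests.get(
+    f"{BASE}/ga4gh/drs/v1/objects/1a53681b-e50b-4bbc-8f99-d3af532909ec",
+    params={"expand": "true"},
+).json()
+
+print(blob["form"], bundle["form"])  # object bundle
+```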
+ +NOTE: Indexd records automatically exists as a `DrsObject` + +# Fetching DRS Objects with Indexd +**Quick Links:** +* [View DRS API Documentation](http://petstore.swagger.io/?url=https://raw.githubusercontent.com/uc-cdis/Indexd/master/openapis/swagger.yaml#/DRS) +* [GA4GH DRS API Documentation](https://ga4gh.github.io/data-repository-service-schemas/swagger-ui/#/DataRepositoryService/) +* [Definitions and details for field names](https://ga4gh.github.io/data-repository-service-schemas/docs/#_drsobject). + +The DRS API supports a variety of different content acccess policies, depending on what `AccessMethod` records they return. + +1. public content: + - server provides an `access_url` with a url and no headers + - caller fetches the object bytes without providing any auth info +2. private content that requires the caller to have out-of-band auth knowledge (e.g. service account credentials): + - server provides an `access_url` with a url and no headers + - caller fetches the object bytes, passing the auth info they obtained out-of-band +3. private content that requires the caller to pass an Authorization token: + - server provides an `access_url` with a url and headers + - caller fetches the object bytes, passing auth info via the specified header(s) +4. private content that uses an expensive-to-generate auth mechanism (e.g. a signed URL): + - server provides an `access_id` + - caller passes the access_id to the `/access` endpoint + - server provides an `access_url` with the generated mechanism (e.g. a signed URL in the url field) + - caller fetches the object bytes from the url (passing auth info from the specified headers, if any) + +**Example Blob DrsObject:** +```javascript +{ + "access_methods": [ + { + "access_id": "gs", + "access_url": { + "url": "gs://some-bucket/File-A" + }, + "region": "", + "type": "gs" + }, + { + "access_id": "s3", + "access_url": { + "url": "s3:/some-bucket/File-A" + }, + "region": "", + "type": "s3" + } + ], + "aliases": [], + "checksums": [ + { + "checksum": "c29b922795e05b819d6d27064e636468", + "type": "md5" + } + ], + "contents": [], + "created_time": "2020-06-22T20:34:06.136066", + "description": null, + "form": "object", + "id": "dg.xxxx/01c3e7b2-2aca-47fc-b2e2-a5d7196652a5", + "mime_type": "application/json", + "name": "File-A", + "self_uri": "drs://binamb.planx-pla.net/dg.xxxx/01c3e7b2-2aca-47fc-b2e2-a5d7196652a5", + "size": 90, + "updated_time": "2020-06-22T20:34:06.136078", + "version": "838ed2d4" +} +``` + +**Example Bundle DrsObject when `expand=false`:** +```javascript +{ + "aliases": [], + "checksums": [ + { + "checksum": "3de2e595340a95c0e8a388bba817d8fd", + "type": "md5" + } + ], + "contents": [], + "created_time": "2020-06-22T20:39:02.578005", + "description": "", + "form": "bundle", + "id": "1a53681b-e50b-4bbc-8f99-d3af532909ec", + "mime_type": "application/json", + "name": "Bundle-A", + "self_uri": "drs://binamb.planx-pla.net/1a53681b-e50b-4bbc-8f99-d3af532909ec", + "size": 360, + "updated_time": "2020-06-22T20:39:02.578012", + "version": "" +} +``` + +**Example Bundle DrsObject when `expand=true`:** +```javascript +{ + "aliases": [], + "checksums": [ + { + "checksum": "3de2e595340a95c0e8a388bba817d8fd", + "type": "md5" + } + ], + "contents": [ + { + "contents": [], + "drs_uri": "drs://binamb.planx-pla.net/dg.xxxx/01c3e7b2-2aca-47fc-b2e2-a5d7196652a5", + "id": "dg.xxxx/01c3e7b2-2aca-47fc-b2e2-a5d7196652a5", + "name": "File-A" + }, + { + "drs_uri": "drs://binamb.planx-pla.net/dg.xxxx/050defbd-f07a-4f74-849c-3f13509a703f", + "id": 
"dg.xxxx/050defbd-f07a-4f74-849c-3f13509a703f", + "name": "Bundle-B" + "contents": [ + { + "contents": [], + "drs_uri": "drs://binamb.planx-pla.net/dg.xxxx/0ad17b64-84b2-4c87-9691-8f6ba016b8cf", + "id": "dg.xxxx/0ad17b64-84b2-4c87-9691-8f6ba016b8cf", + "name": "File-B" + }, + { + "contents": [], + "drs_uri": "drs://binamb.planx-pla.net/dg.xxxx/0b3f06b8-b2df-49bc-a0cf-6f77fa9c6faf", + "id": "dg.xxxx/0b3f06b8-b2df-49bc-a0cf-6f77fa9c6faf", + "name": "File-C" + } + ], + }, + ], + "created_time": "2020-06-22T20:39:02.578005", + "description": "", + "form": "bundle", + "id": "1a53681b-e50b-4bbc-8f99-d3af532909ec", + "mime_type": "application/json", + "name": "Bundle-A", + "self_uri": "drs://binamb.planx-pla.net/1a53681b-e50b-4bbc-8f99-d3af532909ec", + "size": 360, + "updated_time": "2020-06-22T20:39:02.578012", + "version": "" +} + +```` diff --git a/indexd/drs/blueprint.py b/indexd/drs/blueprint.py index 91a2d8ed..f84e768a 100644 --- a/indexd/drs/blueprint.py +++ b/indexd/drs/blueprint.py @@ -3,7 +3,6 @@ from indexd.errors import UserError from indexd.index.errors import NoRecordFound as IndexNoRecordFound from indexd.errors import IndexdUnexpectedError -from indexd.index.blueprint import get_index blueprint = flask.Blueprint("drs", __name__) @@ -16,15 +15,50 @@ def get_drs_object(object_id): """ Returns a specific DRSobject with object_id """ + expand = True if flask.request.args.get("expand") == "true" else False + ret = blueprint.index_driver.get(object_id) - return flask.jsonify(indexd_to_drs(ret)), 200 + data = indexd_to_drs(ret, expand=expand, list_drs=False) + + return flask.jsonify(data), 200 @blueprint.route("/ga4gh/drs/v1/objects", methods=["GET"]) def list_drs_records(): - records = get_index()[0].json["records"] - ret = {"drs_objects": [indexd_to_drs(record, True) for record in records]} + limit = flask.request.args.get("limit") + start = flask.request.args.get("start") + page = flask.request.args.get("page") + + form = flask.request.args.get("form") + + try: + limit = 100 if limit is None else int(limit) + except ValueError as err: + raise UserError("limit must be an integer") + + if limit < 0 or limit > 1024: + raise UserError("limit must be between 0 and 1024") + + if page is not None: + try: + page = int(page) + except ValueError as err: + raise UserError("page must be an integer") + + if form == "bundle": + records = blueprint.index_driver.get_bundle_list( + start=start, limit=limit, page=page + ) + elif form == "object": + records = blueprint.index_driver.ids(start=start, limit=limit, page=page) + else: + records = blueprint.index_driver.get_bundle_and_object_list( + start=start, limit=limit, page=page + ) + ret = { + "drs_objects": [indexd_to_drs(record, True) for record in records], + } return flask.jsonify(ret), 200 @@ -49,30 +83,76 @@ def get_signed_url(object_id, access_id): return res, 200 -def indexd_to_drs(record, list_drs=False): +def indexd_to_drs(record, expand=False, list_drs=False): + bearer_token = flask.request.headers.get("AUTHORIZATION") - self_uri = "drs://" + flask.current_app.hostname + "/" + record["did"] + + did = ( + record["id"] + if "id" in record + else record["did"] + if "did" in record + else record["bundle_id"] + ) + + self_uri = "drs://" + flask.current_app.hostname + "/" + did + + name = record["file_name"] if "file_name" in record else record["name"] + + created_time = ( + record["created_date"] if "created_date" in record else record["created_time"] + ) + + version = ( + record["rev"] + if "rev" in record + else record["version"] + if "version" 
in record + else "" + ) + + updated_date = ( + record["updated_date"] if "updated_date" in record else record["updated_time"] + ) + + form = record["form"] if "form" in record else "bundle" + + description = record["description"] if "description" in record else None + + alias = ( + record["alias"] + if "alias" in record + else eval(record["aliases"]) + if "aliases" in record + else [] + ) + drs_object = { - "id": record["did"], + "id": did, "description": "", "mime_type": "application/json", - "name": record["file_name"], - "created_time": record["created_date"], - "updated_time": record["updated_date"], + "name": name, + "created_time": created_time, + "updated_time": updated_date, "size": record["size"], - "aliases": [], + "aliases": alias, "contents": [], "self_uri": self_uri, - "version": record["rev"], + "version": version, + "form": form, + "checksums": [], + "description": description, } if "description" in record: drs_object["description"] = record["description"] - if "alias" in record: - drs_object["aliases"].append(record["alias"]) - if "contents" in record: - drs_object["contents"] = record["contents"] + if expand == True and "bundle_data" in record: + bundle_data = record["bundle_data"] + for bundle in bundle_data: + drs_object["contents"].append( + bundle_to_drs(bundle, expand=expand, is_content=True) + ) # access_methods mapping if "urls" in record: @@ -90,16 +170,98 @@ def indexd_to_drs(record, list_drs=False): "region": "", } ) - print(drs_object) # parse out checksums - drs_object["checksums"] = [] - for k in record["hashes"]: - drs_object["checksums"].append({"checksum": record["hashes"][k], "type": k}) + parse_checksums(record, drs_object) return drs_object +def bundle_to_drs(record, expand=False, is_content=False): + """ + is_content: is an expanded content in a bundle + """ + + did = ( + record["id"] + if "id" in record + else record["did"] + if "did" in record + else record["bundle_id"] + ) + + drs_uri = "drs://" + flask.current_app.hostname + "/" + did + + name = record["file_name"] if "file_name" in record else record["name"] + + drs_object = { + "id": did, + "name": name, + "drs_uri": drs_uri, + "contents": [], + } + + if expand: + contents = ( + record["contents"] + if "contents" in record + else record["bundle_data"] + if "bundle_data" in record + else [] + ) + drs_object["contents"] = contents + + if not is_content: + # Show these only if its the leading bundle + description = record["description"] if "description" in record else "" + aliases = ( + record["alias"] + if "alias" in record + else eval(record["aliases"]) + if "aliases" in record + else [] + ) + version = record["version"] if "version" in record else "" + drs_object["checksums"] = [] + parse_checksums(record, drs_object) + + created_time = ( + record["created_date"] + if "created_date" in record + else record["created_time"] + ) + + updated_time = ( + record["updated_date"] + if "updated_date" in record + else record["updated_time"] + ) + drs_object["created_time"] = created_time + drs_object["updated_time"] = updated_time + drs_object["size"] = record["size"] + drs_object["aliases"] = aliases + drs_object["description"] = description + drs_object["version"] = version + + return drs_object + + +def parse_checksums(record, drs_object): + if "hashes" in record: + for k in record["hashes"]: + drs_object["checksums"].append({"checksum": record["hashes"][k], "type": k}) + else: + if "checksums" in record: + for checksum in record["checksums"]: + drs_object["checksums"].append( + {"checksum": 
checksum["checksum"], "type": checksum["type"]} + ) + else: + drs_object["checksums"].append( + {"checksum": record["checksum"], "type": "md5"} + ) + + @blueprint.errorhandler(UserError) def handle_user_error(err): ret = {"msg": str(err), "status_code": 400} diff --git a/indexd/index/blueprint.py b/indexd/index/blueprint.py index d8538a64..15b64b32 100644 --- a/indexd/index/blueprint.py +++ b/indexd/index/blueprint.py @@ -1,6 +1,7 @@ import re import json import flask +import hashlib import jsonschema import os.path import subprocess @@ -14,6 +15,7 @@ from .schema import PUT_RECORD_SCHEMA from .schema import POST_RECORD_SCHEMA from .schema import RECORD_ALIAS_SCHEMA +from .schema import BUNDLE_SCHEMA from .schema import UPDATE_ALL_VERSIONS_SCHEMA from .errors import NoRecordFound @@ -22,6 +24,7 @@ from .errors import UnhealthyCheck from cdislogging import get_logger +from indexd.drs.blueprint import indexd_to_drs, get_drs_object, bundle_to_drs logger = get_logger("indexd/index blueprint", log_level="info") @@ -52,7 +55,7 @@ def validate_hashes(**hashes): @blueprint.route("/index/", methods=["GET"]) -def get_index(): +def get_index(form=None): """ Returns a list of records. """ @@ -128,23 +131,47 @@ def get_index(): except ValueError: raise UserError("negate_params must be a valid json string") - records = blueprint.index_driver.ids( - start=start, - limit=limit, - page=page, - size=size, - file_name=file_name, - version=version, - urls=urls, - acl=acl, - authz=authz, - hashes=hashes, - uploader=uploader, - ids=ids, - metadata=metadata, - urls_metadata=urls_metadata, - negate_params=negate_params, - ) + form = flask.request.args.get("form") if not form else form + if form == "bundle": + records = blueprint.index_driver.get_bundle_list( + start=start, limit=limit, page=page + ) + elif form == "all": + records = blueprint.index_driver.get_bundle_and_object_list( + limit=limit, + page=page, + start=start, + size=size, + urls=urls, + acl=acl, + authz=authz, + hashes=hashes, + file_name=file_name, + version=version, + uploader=uploader, + metadata=metadata, + ids=ids, + urls_metadata=urls_metadata, + negate_params=negate_params, + ) + else: + records = blueprint.index_driver.ids( + start=start, + limit=limit, + page=page, + size=size, + file_name=file_name, + version=version, + urls=urls, + acl=acl, + authz=authz, + hashes=hashes, + uploader=uploader, + ids=ids, + metadata=metadata, + urls_metadata=urls_metadata, + negate_params=negate_params, + ) base = { "ids": ids, @@ -570,6 +597,143 @@ def version(): return flask.jsonify(base), 200 +def compute_checksum(checksums): + """ + Checksum created by sorting alphabetically then concatenating first layer of bundles/objects. 
+ + Args: + checksums (list): list of checksums from the first layer of bundles and objects + + Returns: + md5 checksum + """ + checksums.sort() + checksum = "".join(checksums) + return hashlib.md5(checksum.encode("utf-8")).hexdigest() + + +def get_checksum(data): + if "hashes" in data: + return data["hashes"][list(data["hashes"])[0]] + elif "checksums" in data: + return data["checksums"][0]["checksum"] + elif "checksum" in data: + return data["checksum"] + + +@blueprint.route("/bundle/", methods=["POST"]) +def post_bundle(): + """ + Create a new bundle + """ + + try: + authorize("create", ["/services/indexd/bundles"]) + except: + raise AuthError("Invalid Token.") + try: + jsonschema.validate(flask.request.json, BUNDLE_SCHEMA) + except jsonschema.ValidationError as err: + raise UserError(err) + + name = flask.request.json.get("name") + bundles = flask.request.json.get("bundles") + bundle_id = flask.request.json.get("bundle_id") + size = flask.request.json.get("size") if flask.request.json.get("size") else 0 + description = ( + flask.request.json.get("description") + if flask.request.json.get("description") + else "" + ) + version = ( + flask.request.json.get("version") if flask.request.json.get("version") else "" + ) + aliases = ( + flask.request.json.get("aliases") if flask.request.json.get("aliases") else [] + ) + + if len(bundles) == 0: + raise UserError("Bundle data required.") + + if len(bundles) != len(set(bundles)): + raise UserError("Duplicate GUID in bundles.") + + if bundle_id in bundles: + raise UserError("Bundle refers to itself.") + + bundle_data = [] + checksums = [] + + # get bundles/records that already exists and add it to bundle_data + for bundle in bundles: + data = get_index_record(bundle)[0] + data = data.json + size += data["size"] if not flask.request.json.get("size") else 0 + checksums.append(get_checksum(data)) + data = bundle_to_drs(data, expand=True, is_content=True) + bundle_data.append(data) + checksum = ( + flask.request.json.get("checksum") + if flask.request.json.get("checksum") + else compute_checksum(checksums) + ) + + ret = blueprint.index_driver.add_bundle( + bundle_id=bundle_id, + name=name, + size=size, + bundle_data=json.dumps(bundle_data), + checksum=checksum, + description=description, + version=version, + aliases=json.dumps(aliases), + ) + + return flask.jsonify({"bundle_id": ret[0], "name": ret[1], "contents": ret[2]}), 200 + + +@blueprint.route("/bundle/", methods=["GET"]) +def get_bundle_record_list(): + """ + Returns a list of bundle records. 
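+    The optional "form" query parameter ("bundle", "object", or "all") selects
+    which record types are returned; it defaults to "bundle". Pagination
+    parameters (limit, start, page) behave as in GET /index/.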
+ """ + + form = ( + flask.request.args.get("form") if flask.request.args.get("form") else "bundle" + ) + + return get_index(form=form) + + +@blueprint.route("/bundle/", methods=["GET"]) +def get_bundle_record_with_id(bundle_id): + """ + Returns a record given bundle_id + """ + + expand = True if flask.request.args.get("expand") == "true" else False + + ret = blueprint.index_driver.get(bundle_id) + + ret = bundle_to_drs(ret, expand=expand, is_content=False) + + return flask.jsonify(ret), 200 + + +@blueprint.route("/bundle/", methods=["DELETE"]) +def delete_bundle_record(bundle_id): + """ + Delete bundle record given bundle_id + """ + try: + authorize("delete", ["/services/indexd/bundles"]) + except: + raise AuthError("Invalid Token.") + blueprint.index_driver.delete_bundle(bundle_id) + + return "", 200 + + @blueprint.errorhandler(NoRecordFound) def handle_no_record_error(err): return flask.jsonify(error=str(err)), 404 diff --git a/indexd/index/drivers/alchemy.py b/indexd/index/drivers/alchemy.py index b7012b15..3c79efb9 100644 --- a/indexd/index/drivers/alchemy.py +++ b/indexd/index/drivers/alchemy.py @@ -1,7 +1,7 @@ import datetime import uuid +import json from contextlib import contextmanager - from cdislogging import get_logger from sqlalchemy import ( BigInteger, @@ -12,6 +12,7 @@ Index, Integer, String, + Text, and_, func, or_, @@ -244,6 +245,49 @@ class IndexRecordHash(Base): ) +class DrsBundleRecord(Base): + """ + DRS bundle record representaion. + """ + + __tablename__ = "drs_bundle_record" + + bundle_id = Column(String, primary_key=True) + name = Column(String) + created_time = Column(DateTime, default=datetime.datetime.utcnow) + updated_time = Column(DateTime, default=datetime.datetime.utcnow) + checksum = Column(String) + size = Column(BigInteger) + bundle_data = Column(Text) + description = Column(Text) + version = Column(String) + aliases = Column(String) + + def to_document_dict(self, expand=False): + """ + Get the full bundle document + expand: True to include bundle_data + """ + ret = { + "id": self.bundle_id, + "name": self.name, + "created_time": self.created_time.isoformat(), + "updated_time": self.updated_time.isoformat(), + "checksum": self.checksum, + "size": self.size, + "form": "bundle", + "version": self.version, + "description": self.description, + "aliases": self.aliases, + } + + if expand: + bundle_data = json.loads(self.bundle_data) + ret["bundle_data"] = bundle_data + + return ret + + def create_urls_metadata(urls_metadata, record, session): """ create url metadata record in database @@ -294,7 +338,7 @@ def __init__( def migrate_index_database(self): """ - migrate alias database to match CURRENT_SCHEMA_VERSION + migrate index database to match CURRENT_SCHEMA_VERSION """ migrate_database( driver=self, @@ -751,6 +795,26 @@ def add_blank_record(self, uploader, file_name=None, authz=None): return record.did, record.rev, record.baseid + def add_blank_bundle(self): + """ + Create a new blank record with only uploader and optionally + file_name fields filled + """ + with self.session as session: + record = DrsBundleRecord() + base_version = BaseVersion() + + bundle_id = str(uuid.uuid4()) + + record.bundle_id = bundle_id + base_version.baseid = bundle_id + + session.add(base_version) + session.add(record) + session.commit() + + return record.bundle_id + def update_blank_record(self, did, rev, size, hashes, urls, authz=None): """ Update a blank record with size, hashes, urls, authz and raise @@ -1011,7 +1075,7 @@ def delete_one_alias_for_did(self, alias, did): 
self.logger.info(f"Deleted alias {alias} for did {did}.") - def get(self, did): + def get(self, did, expand=True): """ Gets a record given the record id or baseid. If the given id is a baseid, it will return the latest version @@ -1024,7 +1088,12 @@ def get(self, did): record = query.first() if record is None: - raise NoRecordFound("no record found") + record = self.get_bundle(bundle_id=did, expand=expand) + if record: + return record + else: + raise NoRecordFound("no record found") + return record.to_document_dict() def update(self, did, rev, changing_fields): @@ -1452,6 +1521,164 @@ def len(self): select([func.count()]).select_from(IndexRecord) ).scalar() + def add_bundle( + self, + bundle_id=None, + name=None, + checksum=None, + size=None, + bundle_data=None, + description=None, + version=None, + aliases=None, + ): + """ + Add a bundle record + """ + with self.session as session: + record = DrsBundleRecord() + if not bundle_id: + bundle_id = str(uuid.uuid4()) + if self.config.get("PREPEND_PREFIX"): + bundle_id = self.config["DEFAULT_PREFIX"] + bundle_id + if not name: + name = bundle_id + + record.bundle_id = bundle_id + + record.name = name + + record.checksum = checksum + + record.size = size + + record.bundle_data = bundle_data + + record.description = description + + record.version = version + + record.aliases = aliases + + try: + session.add(record) + session.commit() + except IntegrityError: + raise UserError( + 'bundle id "{bundle_id}" already exists'.format( + bundle_id=record.bundle_id + ), + 400, + ) + + return record.bundle_id, record.name, record.bundle_data + + def get_bundle_list(self, start=None, limit=100, page=None): + """ + Returns list of all bundles + """ + with self.session as session: + query = session.query(DrsBundleRecord) + query = query.limit(limit) + + if start is not None: + query = query.filter(DrsBundleRecord.bundle_id > start) + + if page is not None: + query = query.offset(limit * page) + + return [i.to_document_dict() for i in query] + + def get_bundle(self, bundle_id, expand=False): + """ + Gets a bundle record given the bundle_id. + """ + with self.session as session: + query = session.query(DrsBundleRecord) + + query = query.filter(or_(DrsBundleRecord.bundle_id == bundle_id)).order_by( + DrsBundleRecord.created_time.desc() + ) + + record = query.first() + if record is None: + raise NoRecordFound("No bundle found") + + doc = record.to_document_dict(expand) + + return doc + + def get_bundle_and_object_list( + self, + limit=100, + page=None, + start=None, + size=None, + urls=None, + acl=None, + authz=None, + hashes=None, + file_name=None, + version=None, + uploader=None, + metadata=None, + ids=None, + urls_metadata=None, + negate_params=None, + ): + """ + Gets bundles and objects and orders them by created time. 
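+        Roughly half of `limit` is drawn from each source, and the two result
+        lists are interleaved by comparing creation times.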
+ """ + limit = int((limit / 2) + 1) + bundle = self.get_bundle_list(start=start, limit=limit, page=page) + objects = self.ids( + limit=limit, + page=page, + start=start, + size=size, + urls=urls, + acl=acl, + authz=authz, + hashes=hashes, + file_name=file_name, + version=version, + uploader=uploader, + metadata=metadata, + ids=ids, + urls_metadata=urls_metadata, + negate_params=negate_params, + ) + + ret = [] + i = 0 + j = 0 + + while i + j < len(bundle) + len(objects): + if i != len(bundle) and ( + j == len(objects) + or bundle[i]["created_time"] > objects[j]["created_date"] + ): + ret.append(bundle[i]) + i += 1 + else: + ret.append(objects[j]) + j += 1 + return ret + + def delete_bundle(self, bundle_id): + with self.session as session: + query = session.query(DrsBundleRecord) + query = query.filter(DrsBundleRecord.bundle_id == bundle_id) + + try: + record = query.one() + except NoResultFound: + raise NoRecordFound("No bundle found") + except MultipleResultsFound: + raise MultipleRecordsFound("Multiple bundles found") + + session.delete(record) + def migrate_1(session, **kwargs): session.execute( @@ -1524,7 +1751,7 @@ def migrate_3(session, **kwargs): ) session.execute( - "CREATE INDEX {tb}__file_name_idx ON {tb} ( file_name )".format( + "x INDEX {tb}__file_name_idx ON {tb} ( file_name )".format( tb=IndexRecord.__tablename__ ) ) diff --git a/indexd/index/schema.py b/indexd/index/schema.py index bad2ec00..74f2e2d8 100644 --- a/indexd/index/schema.py +++ b/indexd/index/schema.py @@ -98,6 +98,34 @@ }, } +BUNDLE_SCHEMA = { + "$schema": "http://json-schema.org/schema#", + "type": "object", + "additionalProperties": False, + "description": "Creates a new bundle", + "required": ["bundles"], + "properties": { + "bundle_id": {"type": "string",}, + "name": { + "description": "Required bundle name created my author of the bundle", + "type": "string", + }, + "bundles": {"description": "Expanded bundles and objects.", "type": "array",}, + "size": { + "description": "Sum of size of objects inside bundles.", + "type": "integer", + "minimum": 0, + }, + "checksum": {"type": "string", "pattern": "^[0-9a-f]{32}$",}, + "description": {"type": "string"}, + "version": { + "description": "optional version string of the object", + "type": "string", + }, + "aliases": {"description": "Optional", "type": "array",}, + }, +} + UPDATE_ALL_VERSIONS_SCHEMA = { "$schema": "http://json-schema.org/schema#", "type": "object", diff --git a/indexd/utils.py b/indexd/utils.py index 452b6918..d553354c 100644 --- a/indexd/utils.py +++ b/indexd/utils.py @@ -100,11 +100,15 @@ def create_tables(host, user, password, database): # pragma: no cover FOREIGN KEY(did) REFERENCES index_record (did) )" create_index_schema_version_stm = "CREATE TABLE index_schema_version (\ version INT)" + create_drs_bundle_record = "CREATE TABLE drs_bundle_record (\ + bundle_id VARCHAR NOT NULL, name VARCHAR, created_time DATETIME, updated_time DATETIME,\ + checksum VARCHAR, size BIGINT, bundle_data TEXT, description TEXT, version VARCHAR, aliases VARCHAR, PRIMARY KEY(bundle_id)" try: conn.execute(create_index_record_stm) conn.execute(create_record_hash_stm) conn.execute(create_record_url_stm) conn.execute(create_index_schema_version_stm) + conn.execute(create_drs_bundle_record) except Exception: logging.warn("Unable to create table") conn.close() diff --git a/openapis/swagger.yaml b/openapis/swagger.yaml index 4af2f4aa..0b15c1c5 100644 --- a/openapis/swagger.yaml +++ b/openapis/swagger.yaml @@ -39,6 +39,8 @@ tags: description: 'Data Object Service 
Retrieval Endpoints' - name: DRS description: 'Data Repository Service Retrieval Endpoints' + - name: bundle + description: 'Bundle endpoints.' - name: system description: System endpoints schemes: @@ -137,6 +139,12 @@ paths: description: '' operationId: listEntries parameters: + - name: form + in: query + description: specify whether you want to list bundle, objects or both Defaults to object. + required: false + type: "string" + enum: ["bundle", "object", "all"] - name: urls_metadata in: query description: | @@ -318,7 +326,7 @@ paths: get: tags: - index - summary: Get the metadata associated with the given id + summary: Get the metadata associated with the given id. Resolves bundle id. operationId: getIndexEntry produces: - application/json @@ -328,6 +336,10 @@ paths: description: entry id required: true type: string + - name: expand + type: boolean + in: query + description: 'Only shows first layer of contents when expand=false. Recursively unbundles contents when expand=true. false by default' responses: '200': description: successful operation @@ -969,6 +981,13 @@ paths: summary: List all DrsObject. description: 'Url field here contains their location and not the presigned url.' operationId: ListDrsObject + parameters: + - name: form + in: query + description: specify whether you want to list bundle, objects or both. + required: false + type: "string" + enum: ["bundle", "object", "all"] responses: '200': description: successful operation @@ -982,6 +1001,7 @@ paths: summary: Get info about a DrsObject. description: >- Returns object metadata, and a list of access methods that can be used to fetch object bytes. + Resolves for bundles. operationId: GetObject responses: '200': @@ -1114,7 +1134,117 @@ paths: tags: - DRS x-swagger-router-controller: ga4gh.drs.server - + '/bundle': + post: + tags: + - bundle + summary: Create a bundle. + description: '' + consumes: + - application/json + produces: + - application/json + parameters: + - in: body + name: body + description: Body takes in GUID, bundle_name, checksum, list of bundles/objects to bundle, and size. Only list of bundles/objects is required, rest will be created by indexd if not provided. + required: true + schema: + $ref: '#/definitions/InputBundle' + responses: + '200': + description: successful operation + schema: + $ref: '#/definitions/BundleResponse' + '400': + description: Invalid input + '404': + description: GUID in bundle doesn't exist. + security: + - authToken: [] + get: + summary: List all Bundles. + description: 'By default lists all the bundles' + operationId: ListBundles + parameters: + - name: form + in: query + description: specify whether you want to list bundle, objects or both. + required: false + type: "string" + enum: ["bundle", "object", "all"] + - name: start + in: query + description: start did + required: false + type: string + - name: limit + in: query + description: number of records to return for this page, default to 100 + required: false + type: integer + - name: page + in: query + description: pagination support without relying on dids. offsets results by limit*page + required: false + type: integer + produces: + - application/json + responses: + '200': + description: successful operation + schema: + $ref: '#/definitions/ListBundles' + security: [] + tags: + - bundle + '/bundle/{GUID}': + get: + tags: + - bundle + summary: Get bundle info for a given GUID + produces: + - application/json + parameters: + - name: GUID + in: path + description: Resolves for both bundle id and object id. 
+ required: true + type: string + - name: expand + type: boolean + in: query + description: 'Only shows first layer of contents when expand=false. Recursively unbundles contents when expand=true. false by default' + responses: + '200': + description: successful operation + schema: + $ref: '#/definitions/BundleGet' + '404': + description: bundle not found + delete: + tags: + - bundle + summary: delete bundle from bundle records + description: '' + produces: + - application/json + parameters: + - name: GUID + in: path + description: bundle id + required: true + type: string + responses: + '200': + description: Bundle is successfully deleted + '404': + description: Bundle ID not found + security: + - authToken: [] + + + '/bulk/documents': post: tags: @@ -2062,7 +2192,90 @@ definitions: type: array items: $ref: '#/definitions/DrsObject' - + BundleGet: + type: object + properties: + id: + type: string + drs_uri: + type: string + name: + type: string + size: + type: integer + created_time: + type: string + format: date-time + updated_time: + type: string + format: date-time + checksums: + type: array + items: + $ref: '#/definitions/Checksum' + contents: + type: array + items: + $ref: '#/definitions/BundleContent' + + BundleContent: + type: object + properties: + id: + type: string + drs_uri: + type: string + name: + type: string + contents: + type: array + items: + type: object + + + ListBundles: + type: object + properties: + ids: + type: array + items: + $ref: '#/definitions/DID' + records: + type: array + items: + $ref: "#/definitions/OutputBundle" + size: + type: integer + format: int64 + description: size in bytes + start: + type: integer + format: int64 + description: start index for the pagination + limit: + type: integer + format: int64 + description: number of dids to return + file_name: + type: string + urls: + type: array + items: + type: string + acl: + type: array + items: + type: string + authz: + type: array + items: + type: string + hashes: + $ref: '#/definitions/HashInfo' + metadata: + type: object + version: + type: string ListRecords: type: object properties: @@ -2106,6 +2319,72 @@ definitions: type: object version: type: string + + InputBundle: + type: object + required: + - bundles + properties: + bundle_id: + $ref: '#/definitions/DID' + name: + type: string + size: + type: integer + format: int64 + description: size in bytes + checksum: + type: string + bundles: + type: array + items: + type: string + description: list of guid to be in the bundle + + BundleResponse: + type: object + properties: + bundle_id: + $ref: '#/definitions/DID' + name: + type: string + contents: + type: array + items: + $ref: '#/definitions/OutputBundle' + + + OutputBundle: + type: object + properties: + id: + $ref: "#/definitions/DID" + form: + type: string + description: how the entry was submitted to storage + enum: + - object + - container + - multipart + - bundle + size: + type: integer + format: int64 + description: size in bytes + name: + type: string + checksum: + type: string + updated_time: + type: string + format: date-time + description: last updated time + created_time: + type: string + format: date-time + description: time created + externalDocs: description: Find out more about Swagger url: 'http://swagger.io' + \ No newline at end of file diff --git a/tests/test_bundles.py b/tests/test_bundles.py new file mode 100644 index 00000000..dfbc8060 --- /dev/null +++ b/tests/test_bundles.py @@ -0,0 +1,599 @@ +import json +import pytest +import uuid +import tests.conftest +import requests 
+import responses +from tests.default_test_settings import settings + + +def get_bundle_doc(bundles, bundle_id=None): + doc = { + "name": "test_bundle", + "bundles": bundles, + } + + if not bundle_id: + bundle_id = uuid.uuid4() + doc["bundle_id"] = bundle_id + """ + add options to do bundles and objects + """ + return doc + + +def get_index_doc(has_version=True, urls=list(), add_bundle=False): + doc = { + "form": "object", + "size": 123, + "urls": ["s3://endpointurl/bucket/key"], + "hashes": {"md5": "8b9942cf415384b27cadf1f4d2d682e5"}, + } + if has_version: + doc["version"] = "1" + if urls: + doc["urls"] = urls + + return doc + + +def create_index(client, user, add_bundle=False): + i_data = get_index_doc(add_bundle=add_bundle) + res1 = client.post("/index/", json=i_data, headers=user) + assert res1.status_code == 200 + rec1 = res1.json + did_list = [rec1["did"]] + + return did_list, rec1 + + +def test_bundle_post(client, user): + """ + Bundle 1 + +-object1 + """ + did_list, _ = create_index(client, user) + + data = get_bundle_doc(bundles=did_list) + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 200 + + +def test_bundle_get_post_with_optional_fields(client, user): + """ + Bundle 1 + +-object1 + + Bundel 2 + +-Bundle 1 + +-object1 + +-object1 + """ + did_list, _ = create_index(client, user) + + data = get_bundle_doc(bundles=did_list) + data["description"] = "This is a cool bundle." + data["version"] = "v13cde" + data["aliases"] = ["123", "456"] + + res2 = client.post("/bundle/", json=data, headers=user) + rec2 = res2.json + did = rec2["bundle_id"] + assert res2.status_code == 200 + + res3 = client.get("/ga4gh/drs/v1/objects/" + did) + rec3 = res3.json + assert res3.status_code == 200 + assert rec3["description"] == data["description"] + assert rec3["version"] == data["version"] + assert rec3["aliases"] == data["aliases"] + + res4 = client.get("/bundle/" + did) + rec4 = res4.json + assert res4.status_code == 200 + assert rec4["description"] == data["description"] + assert rec4["version"] == data["version"] + assert rec4["aliases"] == data["aliases"] + + # Nested bundle shouldn't contain optional fields + data2 = get_bundle_doc(bundles=[did, did_list[0]]) + res5 = client.post("/bundle/", json=data2, headers=user) + did2 = res5.json["bundle_id"] + assert res5.status_code == 200 + res6 = client.get("/bundle/" + did2 + "?expand=true") + rec6 = res6.json + contents = rec6["contents"] + for content in contents: + assert "description" not in content + assert "version" not in content + assert "aliases" not in content + + +def test_bundle_post_self_reference(client, user): + """ + Make sure this doesnt exist + Bundle 1 + Object 1 + Bundle 1 + . + . 
+ """ + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4) + + did_list.append(bundle_id) + data = get_bundle_doc(bundles=did_list, bundle_id=bundle_id) + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 400 + + +def test_bundle_post_defined_size_checksum(client, user): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4) + data = { + "name": "test_bundle", + "bundles": did_list, + "bundle_id": bundle_id, + "checksum": "1bab24e003ac48840123e5bbe72a5ec9", + "size": 12345, + } + res2 = client.post("/bundle/", json=data, headers=user) + print(res2.json) + assert res2.status_code == 200 + + +def test_bundle_bundle_data_not_found(client, user): + bundle_id = str(uuid.uuid4) + data = { + "name": "test_bundle", + "bundles": ["1987hgd09183hd0981hjd0h08ashjd80"], + "bundle_id": bundle_id, + "checksum": "1bab24e003ac48840123e5bbe72a5ec9", + "size": 12345, + } + res2 = client.post("/bundle/", json=data, headers=user) + print(res2.json) + assert res2.status_code == 404 + + +def test_post_drs_no_duplicate_bundles(client, user): + did_list, _ = create_index(client, user) + + data = get_bundle_doc(bundles=[did_list[0], did_list[0], did_list[0]]) + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 400 + + +def test_bundle_post_invalid_input(client, user): + data = {} + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 400 + + +def test_bundle_post_no_bundle_data(client, user): + data = { + "name": "test_bundle", + "bundles": [], + } + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 400 + assert res2.json["error"] == "Bundle data required." + + +def test_bundle_get(client, user): + """ + Post with bundle_id and get. 
+ Bundle1 + +-object1 + """ + did_list, rec = create_index(client, user) + res1 = client.get("/ga4gh/drs/v1/objects/" + rec["did"]) + rec1 = res1.json + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res1 = client.post("/bundle/", json=data, headers=user) + assert res1.status_code == 200 + + res2 = client.get("/bundle/" + bundle_id) + assert res2.status_code == 200 + rec2 = res2.json + + assert rec2["id"] == bundle_id + assert rec2["name"] == data["name"] + assert rec2["created_time"] + assert rec2["updated_time"] + assert rec2["checksums"] + assert rec2["size"] == 123 + + +def test_bundle_get_form_type(client, user): + """ + form = object when object + form = bundle when bundle + """ + did_list, rec = create_index(client, user) + res1 = client.get("/ga4gh/drs/v1/objects/" + rec["did"]) + rec1 = res1.json + rec1["form"] = "object" + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res1 = client.post("/bundle/", json=data, headers=user) + assert res1.status_code == 200 + + res2 = client.get("/ga4gh/drs/v1/objects/" + bundle_id) + assert res2.status_code == 200 + + rec2 = res2.json + print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + print(rec2) + assert rec2["form"] == "bundle" + + +def test_bundle_get_no_bundle_id(client, user): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res1 = client.post("/bundle/", json=data, headers=user) + assert res1.status_code == 200 + + res2 = client.get("/bundle/" + "hc42397hf902-37g4hf970h23479fgh9euwh") + assert res2.status_code == 404 + + +def test_bundle_get_expand_false(client, user): + + did_list, rec = create_index(client, user) + res1 = client.get("/ga4gh/drs/v1/objects/" + rec["did"]) + + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res1 = client.post("/bundle/", json=data, headers=user) + assert res1.status_code == 200 + + res2 = client.get("/bundle/" + bundle_id + "?expand=false") + rec2 = res2.json + assert res2.status_code == 200 + assert rec2["id"] == bundle_id + assert rec2["name"] == data["name"] + assert "bundle_data" not in rec2 + + +def test_redirect_to_bundle_from_index(client, user): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 200 + + res = client.get("/bundle/" + bundle_id) + assert res.status_code == 200 + + res3 = client.get("/index/" + bundle_id) + assert res3.status_code == 200 + + +def test_bundle_from_drs_endpoint(client, user): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 200 + + res = client.get("/bundle/" + bundle_id) + assert res.status_code == 200 + + res3 = client.get("/ga4gh/drs/v1/objects/" + bundle_id) + assert res3.status_code == 200 + + +def test_get_bundle_list(client, user): + """ + bundle1 + +-object1 + bundle2 + +-object2 + . + . 
+ bundlen + +-objectn + """ + n_bundles = 6 + n_records = 6 + for _ in range(n_records): + _, _ = create_index(client, user) + n_records = 6 + n_bundles + + for _ in range(n_bundles): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 200 + + res3 = client.get("/bundle/") + assert res3.status_code == 200 + rec3 = res3.json + assert len(rec3["records"]) == n_bundles + # check to see bundle_data is not included + assert "bundle_data" not in rec3["records"][0] + + res4 = client.get("/bundle/?form=object") + assert res4.status_code == 200 + rec4 = res4.json + assert len(rec4["records"]) == n_records + + res5 = client.get("/bundle/?form=all") + assert res5.status_code == 200 + rec5 = res5.json + assert len(rec5["records"]) == n_records + n_bundles + + +def test_multiple_bundle_data(client, user): + """ + bundle1 + +-object1 + +-object2 + . + . + +-objectn + """ + n_bundle_data = 5 + did_list = [] + for _ in range(n_bundle_data): + did, _ = create_index(client, user) + did_list.append(did[0]) + + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 200 + res3 = client.get("/bundle/" + bundle_id + "?expand=true") + assert res3.status_code == 200 + + rec3 = res3.json + bundle_data = rec3["contents"] + assert len(rec3["contents"]) == n_bundle_data + + for data in bundle_data: + assert data["id"] in did_list + + +def test_bundle_delete(client, user): + n_records = 6 + n_delete = 2 + bundle_ids = [] + for _ in range(n_records): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4()) + bundle_ids.append(bundle_id) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 200 + + res3 = client.get("/bundle/") + assert res3.status_code == 200 + rec3 = res3.json + assert len(rec3["records"]) == n_records + + for i in range(n_delete): + res4 = client.delete("/bundle/" + bundle_ids[i], headers=user) + assert res4.status_code == 200 + res5 = client.get("/bundle/" + bundle_ids[i]) + assert res5.status_code == 404 + + res3 = client.get("/bundle/") + assert res3.status_code == 200 + rec3 = res3.json + assert len(rec3["records"]) == n_records - n_delete + + +def test_bundle_delete_invalid_bundle_id(client, user): + bundle_id = "12938hd981h123hd18hd80h028" + res = client.delete("/bundle/" + bundle_id, headers=user) + assert res.status_code == 404 + + +def test_bundle_delete_no_bundle_id(client, user): + res = client.delete("/bundle/", headers=user) + assert res.status_code == 405 + + +def test_bundle_data_bundle_and_index(client, user): + """ + bundle_main + +-bundle1 + +-object1 + +-bundle2 + +-object2 + +-bundle3 + +-object3 + +-object1 + +-object2 + +-object3 + """ + n_records = 3 + bundle_data_ids = [] + for _ in range(n_records): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4()) + bundle_data_ids.append(bundle_id) + bundle_data_ids.append(did_list[0]) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res = client.post("/bundle/", json=data, headers=user) + assert res.status_code == 200 + + bundle_id_main = str(uuid.uuid4()) + data_main = get_bundle_doc(bundle_data_ids, bundle_id=bundle_id_main) + res1 = client.post("/bundle/", json=data_main, headers=user) + assert res1.status_code == 200 
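+    # bundle_main nests the three sub-bundles plus their three member objects
+    # directly, so the expanded contents checked below should have 2 * n_records
+    # entries and a size equal to the sum of the member sizes.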
+ + res2 = client.get("/bundle/" + bundle_id_main + "?expand=true") + assert res2.status_code == 200 + rec3 = res2.json + + assert len(rec3["contents"]) == 2 * n_records + + assert rec3["size"] == len(rec3["contents"]) * 123 + + +def test_nested_bundle_data(client, user): + """ + bundle1 + +-bundle2 + +-bundle3 + +-bundle4 + +-object1 + """ + n_nested = 6 + did_list, _ = create_index(client, user) + + base_bundle_id = str(uuid.uuid4()) + base_data = get_bundle_doc(did_list, bundle_id=base_bundle_id) + res = client.post("/bundle/", json=base_data, headers=user) + assert res.status_code == 200 + + for _ in range(n_nested): + bundle_id = str(uuid.uuid4()) + assert bundle_id != base_bundle_id + data = get_bundle_doc([base_bundle_id], bundle_id=bundle_id) + res1 = client.post("/bundle/", json=data, headers=user) + assert res1.status_code == 200 + base_bundle_id = bundle_id + + assert base_bundle_id == bundle_id + res2 = client.get("/bundle/" + bundle_id + "?expand=true") + assert res2.status_code == 200 + rec3 = res2.json + + for _ in range(n_nested): + check = "bundle_data" in rec3 or "contents" in rec3 + assert check + key = "bundle_data" if "bundle_data" in rec3 else "contents" + rec3 = rec3[key][0] + + +def test_bundle_no_bundle_name(client, user): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4()) + + data = get_bundle_doc(did_list, bundle_id=bundle_id) + del data["name"] + res = client.post("/bundle/", json=data, headers=user) + assert res.status_code == 200 + rec = res.json + assert rec["bundle_id"] == bundle_id + assert rec["name"] == bundle_id + + +def build_bundle(client, user): + """ + bundle1 + +-object1 + +-bundle2 + +-object2 + +-bundle3 + +-object3 + +-bundle4 + +-object4 + +-bundle5 + +-bundle6 + +-object5 + """ + object_list = [] + n_objects = 5 + for _ in range(n_objects): + did_list, _ = create_index(client, user) + object_list.append(did_list[0]) + + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc([object_list[0]], bundle_id=bundle_id) + res = client.post("/bundle/", json=data, headers=user) + assert res.status_code == 200 + + bundle_id1 = str(uuid.uuid4()) + data = get_bundle_doc([bundle_id], bundle_id=bundle_id1) + res = client.post("/bundle/", json=data, headers=user) + assert res.status_code == 200 + + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc([bundle_id1, object_list[1]], bundle_id=bundle_id) + res = client.post("/bundle/", json=data, headers=user) + assert res.status_code == 200 + + bundle_id1 = str(uuid.uuid4()) + data = get_bundle_doc([bundle_id, object_list[2]], bundle_id=bundle_id1) + res = client.post("/bundle/", json=data, headers=user) + assert res.status_code == 200 + + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc([object_list[3]], bundle_id=bundle_id) + res = client.post("/bundle/", json=data, headers=user) + assert res.status_code == 200 + + bundle_id2 = str(uuid.uuid4()) + data = get_bundle_doc([object_list[4], bundle_id, bundle_id1], bundle_id=bundle_id2) + res = client.post("/bundle/", json=data, headers=user) + assert res.status_code == 200 + + return bundle_id2 + + +def content_validation(contents): + for content in contents: + if len(content) != 0: + content_validation(content["contents"]) + elif "contents" not in content: + return False + return True + + +def test_get_drs_expand_contents_default(client, user): + + bundle_id = build_bundle(client, user) + res = client.get("/bundle/" + bundle_id) + assert res.status_code == 200 + + res2 = client.get("/ga4gh/drs/v1/objects/" + bundle_id) + assert 
res2.status_code == 200 + rec2 = res2.json + + contents = rec2["contents"] + assert len(contents) == 0 + + +def test_get_drs_expand_contents_false(client, user): + + bundle_id = build_bundle(client, user) + res = client.get("/bundle/" + bundle_id) + assert res.status_code == 200 + + res2 = client.get("/ga4gh/drs/v1/objects/" + bundle_id + "?expand=false") + assert res2.status_code == 200 + rec2 = res2.json + + contents = rec2["contents"] + assert len(contents) == 0 + + +def test_get_drs_expand_contents_true(client, user): + + bundle_id = build_bundle(client, user) + res = client.get("/bundle/" + bundle_id) + assert res.status_code == 200 + + res2 = client.get("/ga4gh/drs/v1/objects/" + bundle_id + "?expand=true") + assert res2.status_code == 200 + rec2 = res2.json + + contents = rec2["contents"] + + assert content_validation(contents) diff --git a/tests/test_client.py b/tests/test_client.py index 83e07203..0ba54a0a 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -5,6 +5,8 @@ from tests.util import assert_blank from indexd.index.blueprint import ACCEPTABLE_HASHES +from tests.test_bundles import create_index, get_bundle_doc +import uuid def get_doc( @@ -100,6 +102,119 @@ def test_index_list_with_params(client, user): assert data_list["records"][0]["urls_metadata"] == data1["urls_metadata"] +def test_get_list_form_param(client, user): + """ + bundle1 + +-object1 + bundle2 + +-object2 + . + . + bundlen + +-objectn + """ + n_records = 6 + for _ in range(n_records): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 200 + + res3 = client.get("/index/") + assert res3.status_code == 200 + rec3 = res3.json + assert len(rec3["records"]) == n_records + + res3 = client.get("/index/?form=bundle") + assert res3.status_code == 200 + rec3 = res3.json + assert len(rec3["records"]) == n_records + + res3 = client.get("/index/?form=all") + assert res3.status_code == 200 + rec3 = res3.json + assert len(rec3["records"]) == 2 * n_records + + +def test_get_list_form_with_params(client, user): + n_records = 6 + for _ in range(n_records): + did_list, _ = create_index(client, user) + bundle_id = str(uuid.uuid4()) + data = get_bundle_doc(did_list, bundle_id=bundle_id) + + res2 = client.post("/bundle/", json=data, headers=user) + assert res2.status_code == 200 + + data1 = get_doc() + data1["urls"] = [ + "s3://endpointurl/bucket_2/key_2", + "s3://anotherurl/bucket_2/key_2", + ] + data1["urls_metadata"] = { + "s3://endpointurl/bucket_2/key_2": {"state": "error", "other": "xxx"}, + "s3://anotherurl/bucket_2/key_2": {"state": "error", "other": "xxx"}, + } + res_1 = client.post("/index/", json=data1, headers=user) + assert res_1.status_code == 200 + rec_1 = res_1.json + + data2 = get_doc() + data2["metadata"] = {"project_id": "other-project", "state": "abc", "other": "xxx"} + data2["urls"] = ["s3://endpointurl/bucket/key_2", "s3://anotherurl/bucket/key_2"] + data2["urls_metadata"] = { + "s3://endpointurl/bucket/key_2": {"state": "error", "other": "yyy"} + } + res_2 = client.post("/index/", json=data2, headers=user) + assert res_2.status_code == 200 + rec_2 = res_2.json + + data1_by_md = client.get("/index/?metadata=project_id:bpa-UChicago¶m=all") + assert data1_by_md.status_code == 200 + data1_list = data1_by_md.json + ids = [record["did"] for record in data1_list["records"] if "did" in record] + assert rec_1["did"] in ids + + data2_by_md = 
client.get("/index/?form=all&metadata=project_id:other-project") + assert data2_by_md.status_code == 200 + data2_list = data2_by_md.json + ids = [record["did"] for record in data2_list["records"] if "did" in record] + assert rec_2["did"] in ids + + data_by_hash = client.get( + "/index/?form=all&hash=md5:8b9942cf415384b27cadf1f4d2d682e5" + ) + assert data_by_hash.status_code == 200 + data_list_all = data_by_hash.json + ids = [record["did"] for record in data_list_all["records"] if "did" in record] + assert rec_1["did"] in ids + assert rec_2["did"] in ids + + data_by_ids = client.get("/index/?form=all&ids={}".format(rec_1["did"])) + assert data_by_ids.status_code == 200 + data_list_all = data_by_ids.json + + ids = [record["did"] for record in data_list_all["records"] if "did" in record] + assert rec_1["did"] in ids + assert not rec_2["did"] in ids + + data_with_limit = client.get("/index/?form=all&limit=1") + assert data_with_limit.status_code == 200 + data_list_limit = data_with_limit.json + assert len(data_list_limit["records"]) == 2 + + param = {"bucket": {"state": "error", "other": "xxx"}} + + data_by_url_md = client.get("/index/?form=all&urls_metadata=" + json.dumps(param)) + assert data_by_url_md.status_code == 200 + data_list = data_by_url_md.json + assert len(data_list["records"]) == n_records + 1 + ids = [record["did"] for record in data_list["records"] if "did" in record] + assert rec_1["did"] in ids + + def test_index_list_by_size(client, user): # post two records of different size data = get_doc() diff --git a/tests/test_driver_alchemy_crud.py b/tests/test_driver_alchemy_crud.py index 91967510..4a887848 100644 --- a/tests/test_driver_alchemy_crud.py +++ b/tests/test_driver_alchemy_crud.py @@ -109,6 +109,35 @@ def test_driver_add_object_record(): assert record[4] is None, "record size non-null" +@util.removes("index.sq3") +def test_driver_add_bundle_record(): + """ + Tests creation of a record. + """ + with sqlite3.connect("index.sq3") as conn: + + driver = SQLAlchemyIndexDriver("sqlite:///index.sq3") + + driver.add_blank_bundle() + + count = conn.execute( + """ + SELECT COUNT(*) FROM drs_bundle_record + """ + ).fetchone()[0] + + assert count == 1, "driver did not create record" + + record = conn.execute( + """ + SELECT * FROM drs_bundle_record + """ + ).fetchone() + + assert record != None + assert len(record) == 10 + + @util.removes("index.sq3") def test_driver_add_container_record(): """ @@ -141,6 +170,37 @@ def test_driver_add_container_record(): assert record[4] == None, "record size non-null" +@util.removes("index.sq3") +def test_driver_add_bundles_record(): + """ + Tests creation of a record. 
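+    Specifically, that add_bundle() creates a row in drs_bundle_record with the
+    given name and populated timestamps.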
+ """ + with sqlite3.connect("index.sq3") as conn: + + driver = SQLAlchemyIndexDriver("sqlite:///index.sq3") + + driver.add_bundle(name="bundle",) + + count = conn.execute( + """ + SELECT COUNT(*) FROM drs_bundle_record + """ + ).fetchone()[0] + + assert count == 1, "driver did not create record" + + record = conn.execute( + """ + SELECT * FROM drs_bundle_record + """ + ).fetchone() + assert record[0], "record id not populated" + assert record[1], "record name not populated" + assert record[1] == "bundle", "record name is not bundle" + assert record[2], "record created date not populated" + assert record[3], "record updated date not populated" + + @util.removes("index.sq3") def test_driver_add_multipart_record(): """ @@ -860,3 +920,41 @@ def test_driver_delete_fails_with_invalid_rev(): with pytest.raises(RevisionMismatch): driver.delete(did, "some_revision") + + +@util.removes("index.sq3") +def test_driver_get_bundle(): + """ + Tests retrieval of a record. + """ + with sqlite3.connect("index.sq3") as conn: + + driver = SQLAlchemyIndexDriver("sqlite:///index.sq3") + + bundle_id = str(uuid.uuid4()) + checksum = "iuhd91h9ufh928jidsoajh9du328" + size = 512 + name = "object" + created_time = updated_time = datetime.now() + bundle_data = "{'bundle_data': [{'access_methods': [{'access_id': 's3', 'access_url': {'url': 's3://endpointurl/bucket/key'}, 'region': '', 'type': 's3'}], 'aliases': [], 'checksums': [{'checksum': '8b9942cf415384b27cadf1f4d2d682e5', 'type': 'md5'}], 'contents': [], 'created_time': '2020-04-23T21:42:36.506404', 'description': '', 'id': 'testprefix:7e677693-9da3-455a-b51c-03467d5498b0', 'mime_type': 'application/json', 'name': None, 'self_uri': 'drs://fictitious-commons.io/testprefix:7e677693-9da3-455a-b51c-03467d5498b0', 'size': 123, 'updated_time': '2020-04-23T21:42:36.506410', 'version': '3c995667'}], 'bundle_id': '1ff381ef-55c7-42b9-b33f-81ac0689d131', 'checksum': '65b464c1aea98176ef2fa38e8b6b9fc7', 'created_time': '2020-04-23T21:42:36.564808', 'name': 'test_bundle', 'size': 123, 'updated_time': '2020-04-23T21:42:36.564819'}" + conn.execute( + """ + INSERT INTO drs_bundle_record(bundle_id, name, checksum, size, bundle_data, created_time, updated_time) VALUES (?,?,?,?,?,?,?) 
+ """, + (bundle_id, name, checksum, size, bundle_data, created_time, updated_time), + ) + + conn.commit() + + record = driver.get_bundle(bundle_id) + + assert record["id"] == bundle_id, "record id does not match" + assert record["checksum"] == checksum, "record revision does not match" + assert record["size"] == size, "record size does not match" + assert record["name"] == name, "record name does not match" + assert ( + record["created_time"] == created_time.isoformat() + ), "created date does not match" + assert ( + record["updated_time"] == updated_time.isoformat() + ), "created date does not match" diff --git a/tests/test_drs.py b/tests/test_drs.py index 0035560b..ad2284fb 100644 --- a/tests/test_drs.py +++ b/tests/test_drs.py @@ -3,6 +3,7 @@ import requests import responses from tests.default_test_settings import settings +from tests.test_bundles import get_bundle_doc def generate_presigned_url_response(did, protocol="", status=200): @@ -80,14 +81,28 @@ def test_drs_list(client, user): submitted_guids = [] for _ in range(record_length): res_1 = client.post("/index/", json=data, headers=user) - submitted_guids.append(res_1.json["did"]) + did = res_1.json["did"] + submitted_guids.append(did) + bundle_data = get_bundle_doc(bundles=[did]) + res2 = client.post("/bundle/", json=bundle_data, headers=user) assert res_1.status_code == 200 + res_2 = client.get("/ga4gh/drs/v1/objects") assert res_2.status_code == 200 rec_2 = res_2.json - assert len(rec_2["drs_objects"]) == record_length + assert len(rec_2["drs_objects"]) == 2 * record_length assert submitted_guids.sort() == [r["id"] for r in rec_2["drs_objects"]].sort() + res_3 = client.get("/ga4gh/drs/v1/objects/?form=bundle") + assert res_3.status_code == 200 + rec_3 = res_3.json + assert len(rec_3["drs_objects"]) == record_length + + res_4 = client.get("/ga4gh/drs/v1/objects/?form=object") + assert res_4.status_code == 200 + rec_4 = res_4.json + assert len(rec_4["drs_objects"]) == record_length + def test_get_drs_record_not_found(client, user): # test exception raised at nonexistent diff --git a/tests/test_schema_migration.py b/tests/test_schema_migration.py index a06af9cd..584110eb 100644 --- a/tests/test_schema_migration.py +++ b/tests/test_schema_migration.py @@ -39,6 +39,18 @@ ("hash_value", "character varying"), ], "index_record_url": [("did", "character varying"), ("url", "character varying")], + "drs_bundle_record": [ + ("bundle_id", "character varying"), + ("name", "character varying"), + ("created_time", "timestamp without time zone"), + ("updated_time", "timestamp without time zone"), + ("checksum", "character varying"), + ("size", "bigint"), + ("bundle_data", "text"), + ("description", "text"), + ("version", "character varying"), + ("aliases", "character varying"), + ], } diff --git a/tests/test_setup.py b/tests/test_setup.py index 282154a3..c361209d 100644 --- a/tests/test_setup.py +++ b/tests/test_setup.py @@ -35,6 +35,18 @@ (1, "url", "VARCHAR", 1, None, 1 if OLD_SQLITE else 2), ], "index_schema_version": [(0, "version", "INTEGER", 1, None, 1)], + "drs_bundle_record": [ + (0, "bundle_id", "VARCHAR", 1, None, 1), + (1, "name", "VARCHAR", 0, None, 0), + (2, "created_time", "DATETIME", 0, None, 0), + (3, "updated_time", "DATETIME", 0, None, 0), + (4, "checksum", "VARCHAR", 0, None, 0), + (5, "size", "BIGINT", 0, None, 0), + (6, "bundle_data", "TEXT", 0, None, 0), + (7, "description", "TEXT", 0, None, 0), + (8, "version", "VARCHAR", 0, None, 0), + (9, "aliases", "VARCHAR", 0, None, 0), + ], } ALIAS_TABLES = {