chore: deploy staging <- main (#3213) (#3221)
* fix: remove self example from openapi (#3173)

* chore: cherry pick changes from #3172: surface dataset batch_condition in curation API (#3177)

* feat: surface dataset batch_condition in curation API

* resolve merge conflict

* fix: document upload links that are supported (#3176)

* Document the types of links that are supported
* Update documentation

* docs: Renaming scExpression to Gene Expression (#3091)

* docs: Renaming scExpression to Gene Expression

- changing route to gene-expression instead of scExpression

* chore(fe): upgrade to nextjs12 blueprint4 node16 (#3119)

* chore(fe): upgrade to nextjs12 blueprint4 node16

* upgrade packages to resolve vulns

* fix(lint): lint stuff

* fix(lint): lint stuff

* restore css usage

* update Dockerfile

* clean up css

* fix bp4 css change

* update docsite image css

* fix: remove access_type from curation API (#3080)

* remove access_type from curation API

* remove is_allowed check from reshape_for_curation_api

* update openapi

* Fix the error message when the curator query parameter is used by a non-super curator

* fix errors

* Move all reshaping collection code into reshape_for_curation_api.

* fix: change the datasets response shape (#3178)

change name to title in the dataset response
change is_primary_data to a list of booleans in the dataset response

* fix(curation api):Remove h5ad suffix requirements (#3151)

- remove .h5ad suffix from uploads and curator tags
- check for embedded UUID in curator tag

* fix: backend/Makefile local db improvements, migration README (#3029)

* fix: backend/Makefile local db improvements, migration README

- Rename the targets for loading data and loading schema into local test
  db
- Adapt the commands for the above two targets to actually work

* doc updates

* additional doc update

* style: remove unused regex and consolidate curator tag logic (#3200)

Co-authored-by: Trent Smith <1429913+Bento007@users.noreply.github.com>
Co-authored-by: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com>
Co-authored-by: ashin-czi <109984998+ashin-czi@users.noreply.github.com>
Co-authored-by: Timmy Huang <tihuan@users.noreply.github.com>

5 people authored Sep 1, 2022
1 parent 6881de9 commit b05fde4
Showing 128 changed files with 22,339 additions and 4,912 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -141,3 +141,6 @@ dmypy.json
 .envrc
 .env.ecr
 *.sqlc
+
+# Typescript
+tsconfig.tsbuildinfo
16 changes: 9 additions & 7 deletions backend/Makefile
@@ -63,17 +63,19 @@ db/dump:
 	PGPASSWORD=${DB_PW} pg_dump -Fc --dbname=corpora_${DEPLOYMENT_STAGE} --file=$(OUTFILE) --host 0.0.0.0 --username corpora_${DEPLOYMENT_STAGE}
 	$(MAKE) db/tunnel/down

-db/load/local:
+db/local/load-data:
 	# Loads corpora_dev.sqlc into the local Docker env corpora database
-	# Usage: make db/load/local INFILE=<file>
-	docker-compose exec database pg_restore --clean --no-owner --username corpora --dbname corpora $(INFILE)
+	# Usage: make db/local/load-data INFILE=<file>
+	$(eval DB_PW = $(shell aws secretsmanager get-secret-value --secret-id corpora/backend/test/database --region us-west-2 | jq -r '.SecretString | match(":([^:]*)@").captures[0].string'))
+	PGPASSWORD=${DB_PW} pg_restore --clean --no-owner --host 0.0.0.0 --username corpora --dbname corpora $(INFILE)

-db/load/schema:
+db/local/load-schema:
 	# Imports the corpora_dev.sqlc schema (schema ONLY) into the corpora_test database
-	# Usage: DEPLOYMENT_STAGE=test make db/import/schema
-	pg_restore --schema-only --clean --no-owner --dbname corpora_test corpora_$(DEPLOYMENT_STAGE).sqlc
+	# Usage: make db/local/load-schema INFILE=<file>
+	$(eval DB_PW = $(shell aws secretsmanager get-secret-value --secret-id corpora/backend/test/database --region us-west-2 | jq -r '.SecretString | match(":([^:]*)@").captures[0].string'))
+	PGPASSWORD=${DB_PW} pg_restore --schema-only --clean --no-owner --host 0.0.0.0 --dbname corpora $(INFILE)
 	# Also import alembic schema version
-	pg_restore --data-only --table=alembic_version --no-owner --dbname corpora_test corpora_$(DEPLOYMENT_STAGE).sqlc
+	PGPASSWORD=${DB_PW} pg_restore --data-only --table=alembic_version --no-owner --host 0.0.0.0 --dbname corpora $(INFILE)

 db/dump_schema:
 ifeq ($(DEPLOYMENT_STAGE),test)
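For context on the new targets: the jq filter extracts the database password from the secret's connection URI. A minimal Python sketch of the same flow (the function name and error handling are illustrative, not part of the commit; assumes boto3 credentials that can read the secret):

import os
import re
import subprocess

import boto3

def load_local_data(infile: str) -> None:
    # Same secret the make target reads; SecretString contains a
    # postgresql://user:password@host/dbname connection URI.
    secret = boto3.client("secretsmanager", region_name="us-west-2").get_secret_value(
        SecretId="corpora/backend/test/database"
    )
    # Equivalent of jq's match(":([^:]*)@"): capture the colon-free run
    # ending at "@", i.e. the password segment of the URI.
    db_pw = re.search(r":([^:]*)@", secret["SecretString"]).group(1)
    subprocess.run(
        ["pg_restore", "--clean", "--no-owner", "--host", "0.0.0.0",
         "--username", "corpora", "--dbname", "corpora", infile],
        env={**os.environ, "PGPASSWORD": db_pw},
        check=True,
    )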
57 changes: 26 additions & 31 deletions backend/config/curation-api.yml
@@ -73,10 +73,6 @@ paths:
             schema:
               type: string
           examples:
-            current_curator:
-              summary: Return collections owned by current curator.
-              value:
-                name: "self"
             specified_curator:
               summary: Return collections owned by the specified curator
               value:
@@ -94,8 +90,12 @@ paths:
                     items:
                       allOf:
                         - $ref: "#/components/schemas/collection_list"
+        "400":
+          $ref: "#/components/responses/400"
         "401":
           $ref: "#/components/responses/401"
+        "403":
+          $ref: "#/components/responses/403"
     post:
       summary: Create a Collection
       description: Create a new Collection
@@ -337,8 +337,7 @@ paths:
               additionalProperties: false
               properties:
                 curator_tag:
-                  type: string
-                  description: curator-provided tag
+                  $ref: "#/components/schemas/curator_tag"
             example:
               curator_tag: "new/curator_tag"
       responses:
@@ -359,6 +358,8 @@ paths:
         request body; if an existing Dataset has this tag, the existing Dataset SHALL be replaced, otherwise a new
         Dataset will be added. MAY include the **id** of an existing Dataset, in which case the existing Dataset
         SHALL be replaced.
+
+        Only presigned AWS S3 URLs and Dropbox shared file links are supported.
       operationId: backend.corpora.lambdas.api.v1.curation.collections.collection_id.datasets.upload_link.put
       tags:
         - collection
@@ -374,7 +375,7 @@ paths:
                 additionalProperties: false
                 properties:
                   curator_tag:
-                    type: string
+                    $ref: "#/components/schemas/curator_tag"
                   id:
                     $ref: "#/components/schemas/dataset_id"
                   link:
@@ -431,9 +432,8 @@ paths:
         To upload the files to S3, use the Python boto3 package. Once a file is successfully uploaded, it will be
         processed and added to the Collection (specified in the S3 key path) with no further user action required.
         Include an email address in the S3 file upload metadata to receive processing and error notifications.
-        Alternatively, use `/curation/collections/{collection_id}/datasets/status` to check on the processing
-        status for a given Dataset.
+        Use `GET /curation/collections/{collection_id}/datasets/` with `curator_tag` or `id` query parameter to check
+        on the processing status for a given Dataset.
       security:
         - curatorAccess: []
@@ -470,7 +470,7 @@ paths:
       tags:
         - collection
       description: >-
-        Generate URLs to download the files associated with the dataset. MUST include *one* of the **curator_tag**
+        Generate presigned S3 URLs to download the files associated with the dataset. MUST include *one* of the **curator_tag**
         and **dataset_id** parameters.
       operationId: backend.corpora.lambdas.api.v1.curation.collections.collection_id.assets.get
       parameters:
@@ -517,18 +517,20 @@ components:
           type: array
         curator_tag:
           "$ref": "#/components/schemas/curator_tag"
+          nullable: true
         dataset_id:
           "$ref": "#/components/schemas/dataset_id"
       type: object
+    batch_condition:
+      description: These keys define the batches that a normalization or integration algorithm should be aware of
+      type: array
+      items:
+        type: string
+      nullable: true
+      example: ["patient", "seqBatch"]
     collection_list:
       description: Full Collection metadata
       properties:
-        access_type:
-          enum:
-            - READ
-            - WRITE
-          nullable: true
-          type: string
         collection_url:
           type: string
         contact_email:
@@ -581,12 +583,6 @@ components:
     collection_get:
       description: Full Collection metadata
       properties:
-        access_type:
-          enum:
-            - READ
-            - WRITE
-          nullable: true
-          type: string
         collection_url:
           type: string
         contact_email:
@@ -680,6 +676,8 @@ components:
       allOf:
         - "$ref": "#/components/schemas/dataset_preview"
         - properties:
+            batch_condition:
+              "$ref": "#/components/schemas/batch_condition"
             assay:
               "$ref": "#/components/schemas/ontology_elements"
             cell_count:
@@ -710,7 +708,7 @@ components:
             mean_genes_per_cell:
               nullable: true
               type: number
-            name:
+            title:
               nullable: true
               type: string
             organism:
@@ -790,13 +788,10 @@ components:
             is_primary_data:
               description:
                 Describes whether cellular observations for this dataset are all
-                canonical (PRIMARY), all non-canonical (SECONDARY), or contain a mixture (BOTH).
-              enum:
-                - PRIMARY
-                - SECONDARY
-                - BOTH
-              nullable: true
-              type: string
+                canonical (True), all non-canonical (False), or contain a mixture (True, False).
+              items:
+                type: boolean
+              type: array
             links:
               items:
                 additionalProperties: false
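Taken together, the schema changes above reshape Dataset objects in curation API responses: name becomes title, is_primary_data changes from a PRIMARY/SECONDARY/BOTH enum to a list of booleans, and batch_condition is newly surfaced. An illustrative fragment of the new shape as a Python literal (all values invented, not a captured response):

dataset = {
    "id": "2cf9b561-06a5-486a-a0a5-4eee87826ecc",  # hypothetical dataset id
    "curator_tag": "mouse-liver-v2",               # nullable
    "title": "Mouse liver atlas",                  # previously "name"
    "is_primary_data": [True, False],              # previously a single enum value
    "batch_condition": ["patient", "seqBatch"],    # newly exposed
}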
1 change: 0 additions & 1 deletion backend/corpora/common/corpora_config.py
@@ -25,7 +25,6 @@ def get_defaults_template(self):
         collections_base_url = f"https://cellxgene.{deployment_stage}.single-cell.czi.technology"

         template = {
-            "upload_file_formats": ["h5ad"],
             "upload_max_file_size_gb": 30,
             "submission_bucket": os.getenv("DATASET_SUBMISSIONS_BUCKET", "cellxgene-dataset-submissions-test"),
             "collections_base_url": collections_base_url,
8 changes: 3 additions & 5 deletions backend/corpora/common/upload.py
@@ -12,12 +12,12 @@
 from .utils.authorization_checks import owner_or_allowed
 from .utils.exceptions import (
     MaxFileSizeExceededException,
-    InvalidFileFormatException,
     NonExistentCollectionException,
     InvalidProcessingStateException,
     NonExistentDatasetException,
 )
 from .utils.math_utils import GB
+from .utils.regex import validate_curator_tag

 _stepfunctions_client = None

@@ -49,7 +49,6 @@ def upload(
     collection_id: str,
     url: str,
     file_size: int,
-    file_extension: str,
     user: str,
     scope: str = None,
     dataset_id: str = None,
@@ -59,9 +58,8 @@
     if file_size is not None and file_size > max_file_size_gb:
         raise MaxFileSizeExceededException(f"{url} exceeds the maximum allowed file size of {max_file_size_gb} Gb")

-    allowed_file_formats = CorporaConfig().upload_file_formats
-    if file_extension not in allowed_file_formats:
-        raise InvalidFileFormatException(f"{url} must be in the file format(s): {allowed_file_formats}")
+    if curator_tag:
+        validate_curator_tag(curator_tag)

     # Check if datasets can be added to the collection
     collection = Collection.get_collection(
23 changes: 21 additions & 2 deletions backend/corpora/common/utils/regex.py
@@ -1,7 +1,26 @@
 import re

 USERNAME_REGEX = r"(?P<username>[\w\-\|]+)"
 ID_REGEX = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
-EXTENSION_REGEX = r"(?P<extension>h5ad)"
 DATASET_ID_REGEX = f"(?P<dataset_id>{ID_REGEX})"
 COLLECTION_ID_REGEX = f"(?P<collection_id>{ID_REGEX})"
-CURATOR_TAG_PREFIX_REGEX = r"(?P<tag_prefix>.*)"
+CONTROL_CHARS = r"[\x00-\x1f\x7f-\xa0]"
+CURATOR_TAG_REGEX = r"(?P<tag>.+)"
+
+
+def validate_curator_tag(curator_tag: str) -> bool:
+    """
+    Verify the correct curator tag format is obeyed (i.e., it is not a UUID)
+    :param curator_tag: the tag name to validate.
+    :return: True if CURATOR_TAG_REGEX is matched.
+    """
+    regex = f"^({DATASET_ID_REGEX}|{CURATOR_TAG_REGEX})$"
+    matched = re.match(regex, curator_tag)
+    if matched:
+        matches = matched.groupdict()
+        if matches.get("tag"):
+            return True
+        elif matches.get("dataset_id"):
+            raise ValueError("Curator tag cannot assume UUID format.")
+    raise ValueError("Curator tag cannot be empty.")
12 changes: 5 additions & 7 deletions backend/corpora/dataset_submissions/app.py
@@ -13,13 +13,12 @@
 from backend.corpora.common.utils.regex import (
     USERNAME_REGEX,
     COLLECTION_ID_REGEX,
-    EXTENSION_REGEX,
     DATASET_ID_REGEX,
-    CURATOR_TAG_PREFIX_REGEX,
+    CURATOR_TAG_REGEX,
 )

 logger = logging.getLogger(__name__)
-REGEX = f"^{USERNAME_REGEX}/{COLLECTION_ID_REGEX}/({DATASET_ID_REGEX}|{CURATOR_TAG_PREFIX_REGEX})\\.{EXTENSION_REGEX}$"
+REGEX = f"^{USERNAME_REGEX}/{COLLECTION_ID_REGEX}/({DATASET_ID_REGEX}|{CURATOR_TAG_REGEX})$"


 def dataset_submissions_handler(s3_event: dict, unused_context) -> None:
@@ -39,8 +38,6 @@ def dataset_submissions_handler(s3_event: dict, unused_context) -> None:
     parsed = parse_key(key)
     if not parsed:
         raise CorporaException(f"Missing collection ID, curator tag, and/or dataset ID for {key=}")
-    if parsed["tag_prefix"]:
-        parsed["tag"] = f"{parsed['tag_prefix']}.{parsed['extension']}"
     logger.debug(parsed)

     with db_session_manager() as session:
@@ -66,7 +63,6 @@ def dataset_submissions_handler(s3_event: dict, unused_context) -> None:
             user=collection_owner,
             url=s3_uri,
             file_size=size,
-            file_extension=parsed["extension"],
             dataset_id=dataset_id,
             curator_tag=parsed.get("tag"),
         )
@@ -111,12 +107,14 @@ def get_dataset_info(
 ) -> Tuple[Optional[str], Optional[str]]:
     if dataset_id:  # If a dataset uuid was provided
         dataset = Dataset.get(session, dataset_id)
-    else:  # if incoming_curator_tag
+    elif incoming_curator_tag:  # if incoming_curator_tag
         dataset = Dataset.get_dataset_from_curator_tag(session, collection_id, incoming_curator_tag)
         if not dataset:  # New dataset
             collection = Collection.get_collection(session, collection_id)
             if collection:
                 return collection.owner, None
+    else:
+        raise CorporaException("No dataset identifier provided")
     if dataset:
         return dataset.collection.owner, dataset.id
     return None, None
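With the .h5ad requirement gone, an S3 object key for a curator-tag upload parses like this (regex definitions copied from the diff; the username, collection id, and tag are invented for illustration):

import re

USERNAME_REGEX = r"(?P<username>[\w\-\|]+)"
ID_REGEX = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
DATASET_ID_REGEX = f"(?P<dataset_id>{ID_REGEX})"
COLLECTION_ID_REGEX = f"(?P<collection_id>{ID_REGEX})"
CURATOR_TAG_REGEX = r"(?P<tag>.+)"
REGEX = f"^{USERNAME_REGEX}/{COLLECTION_ID_REGEX}/({DATASET_ID_REGEX}|{CURATOR_TAG_REGEX})$"

key = "curator|demo/2cf9b561-06a5-486a-a0a5-4eee87826ecc/mouse-liver-v2"
print(re.match(REGEX, key).groupdict())
# {'username': 'curator|demo',
#  'collection_id': '2cf9b561-06a5-486a-a0a5-4eee87826ecc',
#  'dataset_id': None, 'tag': 'mouse-liver-v2'}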
2 changes: 0 additions & 2 deletions backend/corpora/lambdas/api/v1/collection_id/upload.py
@@ -54,15 +54,13 @@ def upload_from_link(collection_id: str, token_info: dict, url: str, dataset_id:
         raise InvalidParametersHTTPException(detail=ex.detail)

     file_size = resp.get("size")
-    file_extension = resp["name"].rsplit(".")[-1].lower()

     try:
         return upload(
             db_session,
             collection_id=collection_id,
             url=url,
             file_size=file_size,
-            file_extension=file_extension,
             user=token_info["sub"],
             scope=token_info["scope"],
             dataset_id=dataset_id,
34 changes: 18 additions & 16 deletions backend/corpora/lambdas/api/v1/curation/collections/actions.py
@@ -1,9 +1,8 @@
 from flask import jsonify, g
-from .common import reshape_for_curation_api_and_is_allowed, add_collection_level_processing_status
-from .common import EntityColumns
-from ...authorization import is_super_curator
+from .common import reshape_for_curation_api
+from ...authorization import is_super_curator, owner_or_allowed
 from ......common.corpora_orm import CollectionVisibility, DbCollection
-from ......common.utils.http_exceptions import UnauthorizedError
+from ......common.utils.http_exceptions import ForbiddenHTTPException
 from backend.corpora.api_server.db import dbconnect

@@ -18,23 +17,26 @@ def get(visibility: str, token_info: dict, curator: str = None):
     @return: Response
     """
     filters = [DbCollection.tombstone == False]  # noqa

-    if visibility == CollectionVisibility.PRIVATE.name and not token_info:
-        raise UnauthorizedError()
-    elif visibility:
+    if visibility:
         filters.append(DbCollection.visibility == getattr(CollectionVisibility, visibility))
+        if visibility == CollectionVisibility.PRIVATE.name:
+            if not token_info:
+                raise ForbiddenHTTPException(detail="Not authorized to query for PRIVATE collection.")
+            else:
+                owner_filter = owner_or_allowed(token_info)
+                if owner_filter:  # None means the user is a super curator and don't need to filter by owner.
+                    filters.append(DbCollection.owner == owner_filter)

-    if curator and not is_super_curator(token_info):
-        raise UnauthorizedError()
-    elif curator:  # user want collections from a specific curator
-        filters.append(DbCollection.curator_name == curator)
+    if curator:
+        if not is_super_curator(token_info):
+            raise ForbiddenHTTPException(detail="Not authorized to use the curator query parameter.")
+        else:
+            filters.append(DbCollection.curator_name == curator)

     db_session = g.db_session
     resp_collections = []
     for collection in db_session.query(DbCollection).filter(*filters).all():
-        resp_collection = collection.to_dict_keep(EntityColumns.columns_for_collections)
-        resp_collection["processing_status"] = add_collection_level_processing_status(collection)
-        if reshape_for_curation_api_and_is_allowed(db_session, resp_collection, token_info, preview=True):
-            resp_collections.append(resp_collection)
+        resp_collection = reshape_for_curation_api(db_session, collection, token_info, preview=True)
+        resp_collections.append(resp_collection)

     return jsonify({"collections": resp_collections})
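Net effect of the rewrite: missing or insufficient credentials now produce 403s instead of 401s, and only super curators may filter by curator name or list private collections they do not own. A condensed, self-contained restatement of the access rules (a sketch, not the production code; owner filtering is omitted):

from typing import Optional

def may_list(visibility: str, token_info: Optional[dict], curator: Optional[str], is_super: bool) -> bool:
    if visibility == "PRIVATE" and not token_info:
        return False  # 403: anonymous callers cannot query PRIVATE collections
    if curator and not is_super:
        return False  # 403: the curator filter is reserved for super curators
    return True  # otherwise the visibility/owner/curator filters are applied

assert may_list("PUBLIC", None, None, False)
assert not may_list("PRIVATE", None, None, False)
assert not may_list("PUBLIC", {"sub": "user"}, "alice", False)
assert may_list("PRIVATE", {"sub": "admin"}, "alice", True)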
@@ -6,9 +6,7 @@
     normalize_and_get_doi,
 )
 from ..common import (
-    add_collection_level_processing_status,
-    reshape_for_curation_api_and_is_allowed,
-    EntityColumns,
+    reshape_for_curation_api,
 )
 from backend.corpora.api_server.db import dbconnect
 from backend.corpora.common.corpora_orm import (
@@ -44,11 +42,7 @@ def get(collection_id: str, token_info: dict):
     collection = Collection.get_collection(db_session, collection_id, include_tombstones=False)
     if not collection:
         raise NotFoundHTTPException
-    collection_response: dict = collection.to_dict_keep(EntityColumns.columns_for_collection_id)
-
-    collection_response["processing_status"] = add_collection_level_processing_status(collection)
-    reshape_for_curation_api_and_is_allowed(db_session, collection_response, token_info, id_provided=True)
-
+    collection_response = reshape_for_curation_api(db_session, collection, token_info)
     return jsonify(collection_response)