chore: deploy staging <- main #3213

Merged · 11 commits · Aug 30, 2022
3 changes: 3 additions & 0 deletions .gitignore
@@ -141,3 +141,6 @@ dmypy.json
.envrc
.env.ecr
*.sqlc

# Typescript
tsconfig.tsbuildinfo
16 changes: 9 additions & 7 deletions backend/Makefile
@@ -63,17 +63,19 @@ db/dump:
PGPASSWORD=${DB_PW} pg_dump -Fc --dbname=corpora_${DEPLOYMENT_STAGE} --file=$(OUTFILE) --host 0.0.0.0 --username corpora_${DEPLOYMENT_STAGE}
$(MAKE) db/tunnel/down

db/load/local:
db/local/load-data:
# Loads corpora_dev.sqlc into the local Docker env corpora database
# Usage: make db/load/local INFILE=<file>
docker-compose exec database pg_restore --clean --no-owner --username corpora --dbname corpora $(INFILE)
# Usage: make db/local/load-data INFILE=<file>
$(eval DB_PW = $(shell aws secretsmanager get-secret-value --secret-id corpora/backend/test/database --region us-west-2 | jq -r '.SecretString | match(":([^:]*)@").captures[0].string'))
PGPASSWORD=${DB_PW} pg_restore --clean --no-owner --host 0.0.0.0 --username corpora --dbname corpora $(INFILE)

db/load/schema:
db/local/load-schema:
# Imports the corpora_dev.sqlc schema (schema ONLY) into the corpora_test database
# Usage: DEPLOYMENT_STAGE=test make db/import/schema
pg_restore --schema-only --clean --no-owner --dbname corpora_test corpora_$(DEPLOYMENT_STAGE).sqlc
# Usage: make db/local/load-schema INFILE=<file>
$(eval DB_PW = $(shell aws secretsmanager get-secret-value --secret-id corpora/backend/test/database --region us-west-2 | jq -r '.SecretString | match(":([^:]*)@").captures[0].string'))
PGPASSWORD=${DB_PW} pg_restore --schema-only --clean --no-owner --host 0.0.0.0 --dbname corpora $(INFILE)
# Also import alembic schema version
pg_restore --data-only --table=alembic_version --no-owner --dbname corpora_test corpora_$(DEPLOYMENT_STAGE).sqlc
PGPASSWORD=${DB_PW} pg_restore --data-only --table=alembic_version --no-owner --host 0.0.0.0 --dbname corpora $(INFILE)

db/dump_schema:
ifeq ($(DEPLOYMENT_STAGE),test)
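Both of the new db/local/* targets resolve the database password the same way; as a reading aid, here is a rough Python equivalent of that `aws secretsmanager ... | jq` pipeline. The secret id and region are copied from the Makefile; the helper name and error message are illustrative only.

```python
import re

import boto3  # assumes boto3 is installed and AWS credentials are available


def get_local_db_password(secret_id: str = "corpora/backend/test/database",
                          region: str = "us-west-2") -> str:
    """Fetch the secret and extract the password, mirroring the Makefile's
    jq expression: .SecretString | match(":([^:]*)@").captures[0].string"""
    client = boto3.client("secretsmanager", region_name=region)
    secret_string = client.get_secret_value(SecretId=secret_id)["SecretString"]
    match = re.search(r":([^:]*)@", secret_string)
    if match is None:
        raise ValueError("secret does not contain a ':<password>@' segment")
    return match.group(1)
```

The extracted value is what the targets export as PGPASSWORD before running pg_restore.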
57 changes: 26 additions & 31 deletions backend/config/curation-api.yml
@@ -73,10 +73,6 @@ paths:
schema:
type: string
examples:
current_curator:
summary: Return collections owned by current curator.
value:
name: "self"
specified_curator:
summary: Return collections owned by the specified curator
value:
@@ -94,8 +90,12 @@
items:
allOf:
- $ref: "#/components/schemas/collection_list"
"400":
$ref: "#/components/responses/400"
"401":
$ref: "#/components/responses/401"
"403":
$ref: "#/components/responses/403"
post:
summary: Create a Collection
description: Create a new Collection
@@ -337,8 +337,7 @@ paths:
additionalProperties: false
properties:
curator_tag:
type: string
description: curator-provided tag
$ref: "#/components/schemas/curator_tag"
example:
curator_tag: "new/curator_tag"
responses:
@@ -359,6 +358,8 @@
request body; if an existing Dataset has this tag, the existing Dataset SHALL be replaced, otherwise a new
Dataset will be added. MAY include the **id** of an existing Dataset, in which case the existing Dataset
SHALL be replaced.

Only presigned AWS S3 URLs and Dropbox shared file links are supported.
operationId: backend.corpora.lambdas.api.v1.curation.collections.collection_id.datasets.upload_link.put
tags:
- collection
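To make the request shape above concrete, a hedged client-side sketch follows; the base URL, exact path, and auth header are assumptions inferred from the operationId, and the Dropbox link and ids are made up. Only the body fields (`curator_tag`, `id`, `link`) come from this spec.

```python
import requests  # illustrative client sketch only

BASE = "https://api.example.org/curation/v1"  # placeholder host/prefix, not defined in this diff
collection_id = "01234567-89ab-cdef-0123-456789abcdef"  # hypothetical Collection id

body = {
    # Either curator_tag or id may point at an existing Dataset to replace;
    # an unused curator_tag adds a new Dataset to the Collection.
    "curator_tag": "new/curator_tag",
    # Must be a presigned AWS S3 URL or a Dropbox shared file link.
    "link": "https://www.dropbox.com/s/abc123/example.h5ad?dl=0",
}
resp = requests.put(
    f"{BASE}/collections/{collection_id}/datasets/upload-link",  # path assumed from the operationId
    json=body,
    headers={"Authorization": "Bearer <curator-access-token>"},
)
resp.raise_for_status()
```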
@@ -374,7 +375,7 @@
additionalProperties: false
properties:
curator_tag:
type: string
$ref: "#/components/schemas/curator_tag"
id:
$ref: "#/components/schemas/dataset_id"
link:
@@ -431,9 +432,8 @@

To upload the files to S3, use the Python boto3 package. Once a file is successfully uploaded, it will be
processed and added to the Collection (specified in the S3 key path) with no further user action required.
Include an email address in the S3 file upload metadata to receive processing and error notifications.
Alternatively, use `/curation/collections/{collection_id}/datasets/status` to check on the processing
status for a given Dataset.
Use `GET /curation/collections/{collection_id}/datasets/` with `curator_tag` or `id` query parameter to check
on the processing status for a given Dataset.

security:
- curatorAccess: []
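A minimal sketch of the boto3 upload the description refers to. The bucket name is the default from corpora_config.py later in this diff, the key follows the username/collection_id/curator_tag layout used by the dataset-submissions lambda, and the specific names are made up.

```python
import boto3  # assumes AWS credentials with write access to the submissions bucket

s3 = boto3.client("s3")

bucket = "cellxgene-dataset-submissions-test"  # deployment-specific in practice
key = "jane-doe/01234567-89ab-cdef-0123-456789abcdef/liver_atlas.h5ad"  # hypothetical

s3.upload_file(Filename="liver_atlas.h5ad", Bucket=bucket, Key=key)

# Processing status can then be polled with
# GET /curation/collections/{collection_id}/datasets/?curator_tag=liver_atlas.h5ad
```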
@@ -470,7 +470,7 @@ paths:
tags:
- collection
description: >-
Generate URLs to download the files associated with the dataset. MUST include *one* of the **curator_tag**
Generate Presigned S3 URLs to download the files associated with the dataset. MUST include *one* of the **curator_tag**
and **dataset_id** parameters.
operationId: backend.corpora.lambdas.api.v1.curation.collections.collection_id.assets.get
parameters:
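As an aside on what generating a presigned S3 URL amounts to, a minimal boto3 sketch; the bucket, key, and expiry below are illustrative, not values taken from this spec.

```python
import boto3  # illustrative only; the API owns the real bucket, keys, and expiry policy

s3 = boto3.client("s3")
url = s3.generate_presigned_url(
    "get_object",
    Params={"Bucket": "example-dataset-assets", "Key": "datasets/example.h5ad"},
    ExpiresIn=3600,  # seconds
)
# `url` can be downloaded with plain HTTP until it expires.
```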
@@ -517,18 +517,20 @@ components:
type: array
curator_tag:
"$ref": "#/components/schemas/curator_tag"
nullable: true
dataset_id:
"$ref": "#/components/schemas/dataset_id"
type: object
batch_condition:
description: These keys define the batches that a normalization or integration algorithm should be aware of
type: array
items:
type: string
nullable: true
example: ["patient", "seqBatch"]
collection_list:
description: Full Collection metadata
properties:
access_type:
enum:
- READ
- WRITE
nullable: true
type: string
collection_url:
type: string
contact_email:
@@ -581,12 +583,6 @@ components:
collection_get:
description: Full Collection metadata
properties:
access_type:
enum:
- READ
- WRITE
nullable: true
type: string
collection_url:
type: string
contact_email:
@@ -680,6 +676,8 @@ components:
allOf:
- "$ref": "#/components/schemas/dataset_preview"
- properties:
batch_condition:
"$ref": "#/components/schemas/batch_condition"
assay:
"$ref": "#/components/schemas/ontology_elements"
cell_count:
@@ -710,7 +708,7 @@ components:
mean_genes_per_cell:
nullable: true
type: number
name:
title:
nullable: true
type: string
organism:
@@ -790,13 +788,10 @@ components:
is_primary_data:
description:
Describes whether cellular observations for this dataset are all
canonical (PRIMARY), all non-canonical (SECONDARY), or contain a mixture (BOTH).
enum:
- PRIMARY
- SECONDARY
- BOTH
nullable: true
type: string
canonical (True), all non-canonical (False), or contain a mixture (True, False).
items:
type: boolean
type: array
links:
items:
additionalProperties: false
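For clarity, the reworked `is_primary_data` field above is now a list of booleans rather than the old PRIMARY/SECONDARY/BOTH enum; a minimal sketch of the three cases, with illustrative values:

```python
all_canonical = {"is_primary_data": [True]}
all_non_canonical = {"is_primary_data": [False]}
mixture = {"is_primary_data": [True, False]}
```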
1 change: 0 additions & 1 deletion backend/corpora/common/corpora_config.py
@@ -25,7 +25,6 @@ def get_defaults_template(self):
collections_base_url = f"https://cellxgene.{deployment_stage}.single-cell.czi.technology"

template = {
"upload_file_formats": ["h5ad"],
"upload_max_file_size_gb": 30,
"submission_bucket": os.getenv("DATASET_SUBMISSIONS_BUCKET", "cellxgene-dataset-submissions-test"),
"collections_base_url": collections_base_url,
8 changes: 3 additions & 5 deletions backend/corpora/common/upload.py
@@ -12,12 +12,12 @@
from .utils.authorization_checks import owner_or_allowed
from .utils.exceptions import (
MaxFileSizeExceededException,
InvalidFileFormatException,
NonExistentCollectionException,
InvalidProcessingStateException,
NonExistentDatasetException,
)
from .utils.math_utils import GB
from .utils.regex import validate_curator_tag

_stepfunctions_client = None

@@ -49,7 +49,6 @@ def upload(
collection_id: str,
url: str,
file_size: int,
file_extension: str,
user: str,
scope: str = None,
dataset_id: str = None,
@@ -59,9 +58,8 @@ def upload(
if file_size is not None and file_size > max_file_size_gb:
raise MaxFileSizeExceededException(f"{url} exceeds the maximum allowed file size of {max_file_size_gb} Gb")

allowed_file_formats = CorporaConfig().upload_file_formats
if file_extension not in allowed_file_formats:
raise InvalidFileFormatException(f"{url} must be in the file format(s): {allowed_file_formats}")
if curator_tag:
validate_curator_tag(curator_tag)

# Check if datasets can be added to the collection
collection = Collection.get_collection(
23 changes: 21 additions & 2 deletions backend/corpora/common/utils/regex.py
@@ -1,7 +1,26 @@
import re

USERNAME_REGEX = r"(?P<username>[\w\-\|]+)"
ID_REGEX = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
EXTENSION_REGEX = r"(?P<extension>h5ad)"
DATASET_ID_REGEX = f"(?P<dataset_id>{ID_REGEX})"
COLLECTION_ID_REGEX = f"(?P<collection_id>{ID_REGEX})"
CURATOR_TAG_PREFIX_REGEX = r"(?P<tag_prefix>.*)"
CONTROL_CHARS = r"[\x00-\x1f\x7f-\xa0]"
CURATOR_TAG_REGEX = r"(?P<tag>.+)"


def validate_curator_tag(curator_tag: str) -> bool:
"""
Verify the correct curator tag format is obeyed (i.e., it is not a UUID)

:param curator_tag: the tag name to validate.
:return: True if CURATOR_TAG_REGEX is matched.
"""
regex = f"^({DATASET_ID_REGEX}|{CURATOR_TAG_REGEX})$"
matched = re.match(regex, curator_tag)
if matched:
matches = matched.groupdict()
if matches.get("tag"):
return True
elif matches.get("dataset_id"):
raise ValueError("Curator tag cannot assume UUID format.")
raise ValueError("Curator tag cannot be empty.")
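A quick usage sketch of the new validator; the tag strings are made up.

```python
from backend.corpora.common.utils.regex import validate_curator_tag

validate_curator_tag("liver/atlas_v2.h5ad")  # any non-empty, non-UUID string: returns True

try:
    validate_curator_tag("01234567-89ab-cdef-0123-456789abcdef")
except ValueError as err:
    print(err)  # Curator tag cannot assume UUID format.

try:
    validate_curator_tag("")
except ValueError as err:
    print(err)  # Curator tag cannot be empty.
```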
12 changes: 5 additions & 7 deletions backend/corpora/dataset_submissions/app.py
@@ -13,13 +13,12 @@
from backend.corpora.common.utils.regex import (
USERNAME_REGEX,
COLLECTION_ID_REGEX,
EXTENSION_REGEX,
DATASET_ID_REGEX,
CURATOR_TAG_PREFIX_REGEX,
CURATOR_TAG_REGEX,
)

logger = logging.getLogger(__name__)
REGEX = f"^{USERNAME_REGEX}/{COLLECTION_ID_REGEX}/({DATASET_ID_REGEX}|{CURATOR_TAG_PREFIX_REGEX})\\.{EXTENSION_REGEX}$"
REGEX = f"^{USERNAME_REGEX}/{COLLECTION_ID_REGEX}/({DATASET_ID_REGEX}|{CURATOR_TAG_REGEX})$"


def dataset_submissions_handler(s3_event: dict, unused_context) -> None:
@@ -39,8 +38,6 @@ def dataset_submissions_handler(s3_event: dict, unused_context) -> None:
parsed = parse_key(key)
if not parsed:
raise CorporaException(f"Missing collection ID, curator tag, and/or dataset ID for {key=}")
if parsed["tag_prefix"]:
parsed["tag"] = f"{parsed['tag_prefix']}.{parsed['extension']}"
logger.debug(parsed)

with db_session_manager() as session:
@@ -66,7 +63,6 @@ def dataset_submissions_handler(s3_event: dict, unused_context) -> None:
user=collection_owner,
url=s3_uri,
file_size=size,
file_extension=parsed["extension"],
dataset_id=dataset_id,
curator_tag=parsed.get("tag"),
)
@@ -111,12 +107,14 @@ def get_dataset_info(
) -> Tuple[Optional[str], Optional[str]]:
if dataset_id: # If a dataset uuid was provided
dataset = Dataset.get(session, dataset_id)
else: # if incoming_curator_tag
elif incoming_curator_tag: # if incoming_curator_tag
dataset = Dataset.get_dataset_from_curator_tag(session, collection_id, incoming_curator_tag)
if not dataset: # New dataset
collection = Collection.get_collection(session, collection_id)
if collection:
return collection.owner, None
else:
raise CorporaException("No dataset identifier provided")
if dataset:
return dataset.collection.owner, dataset.id
return None, None
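To make the simplified key format concrete, here is how the new REGEX (no separate extension group any more) parses a submission key; the building blocks are copied from regex.py above and the key itself is made up.

```python
import re

USERNAME_REGEX = r"(?P<username>[\w\-\|]+)"
ID_REGEX = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
DATASET_ID_REGEX = f"(?P<dataset_id>{ID_REGEX})"
COLLECTION_ID_REGEX = f"(?P<collection_id>{ID_REGEX})"
CURATOR_TAG_REGEX = r"(?P<tag>.+)"
REGEX = f"^{USERNAME_REGEX}/{COLLECTION_ID_REGEX}/({DATASET_ID_REGEX}|{CURATOR_TAG_REGEX})$"

# The final segment is either a dataset UUID or a curator tag; a tag keeps its
# extension, so "liver_atlas.h5ad" is the tag, not "liver_atlas" plus ".h5ad".
key = "jane-doe/11111111-2222-3333-4444-555555555555/liver_atlas.h5ad"
print(re.match(REGEX, key).groupdict())
# {'username': 'jane-doe', 'collection_id': '11111111-2222-3333-4444-555555555555',
#  'dataset_id': None, 'tag': 'liver_atlas.h5ad'}
```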
2 changes: 0 additions & 2 deletions backend/corpora/lambdas/api/v1/collection_id/upload.py
@@ -54,15 +54,13 @@ def upload_from_link(collection_id: str, token_info: dict, url: str, dataset_id:
raise InvalidParametersHTTPException(detail=ex.detail)

file_size = resp.get("size")
file_extension = resp["name"].rsplit(".")[-1].lower()

try:
return upload(
db_session,
collection_id=collection_id,
url=url,
file_size=file_size,
file_extension=file_extension,
user=token_info["sub"],
scope=token_info["scope"],
dataset_id=dataset_id,
34 changes: 18 additions & 16 deletions backend/corpora/lambdas/api/v1/curation/collections/actions.py
@@ -1,9 +1,8 @@
from flask import jsonify, g
from .common import reshape_for_curation_api_and_is_allowed, add_collection_level_processing_status
from .common import EntityColumns
from ...authorization import is_super_curator
from .common import reshape_for_curation_api
from ...authorization import is_super_curator, owner_or_allowed
from ......common.corpora_orm import CollectionVisibility, DbCollection
from ......common.utils.http_exceptions import UnauthorizedError
from ......common.utils.http_exceptions import ForbiddenHTTPException
from backend.corpora.api_server.db import dbconnect


@@ -18,23 +17,26 @@ def get(visibility: str, token_info: dict, curator: str = None):
@return: Response
"""
filters = [DbCollection.tombstone == False] # noqa

if visibility == CollectionVisibility.PRIVATE.name and not token_info:
raise UnauthorizedError()
elif visibility:
if visibility:
filters.append(DbCollection.visibility == getattr(CollectionVisibility, visibility))
if visibility == CollectionVisibility.PRIVATE.name:
if not token_info:
raise ForbiddenHTTPException(detail="Not authorized to query for PRIVATE collection.")
else:
owner_filter = owner_or_allowed(token_info)
if owner_filter: # None means the user is a super curator, so no owner filter is needed.
filters.append(DbCollection.owner == owner_filter)

if curator and not is_super_curator(token_info):
raise UnauthorizedError()
elif curator: # user want collections from a specific curator
filters.append(DbCollection.curator_name == curator)
if curator:
if not is_super_curator(token_info):
raise ForbiddenHTTPException(detail="Not authorized to use the curator query parameter.")
else:
filters.append(DbCollection.curator_name == curator)

db_session = g.db_session
resp_collections = []
for collection in db_session.query(DbCollection).filter(*filters).all():
resp_collection = collection.to_dict_keep(EntityColumns.columns_for_collections)
resp_collection["processing_status"] = add_collection_level_processing_status(collection)
if reshape_for_curation_api_and_is_allowed(db_session, resp_collection, token_info, preview=True):
resp_collections.append(resp_collection)
resp_collection = reshape_for_curation_api(db_session, collection, token_info, preview=True)
resp_collections.append(resp_collection)

return jsonify({"collections": resp_collections})
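For context, a hedged sketch of how a client would exercise the revised filtering; the base URL, path, and auth header are assumptions, while the `visibility` and `curator` query parameters and the 403 behaviour come from this change.

```python
import requests  # illustrative client sketch only

BASE = "https://api.example.org/curation/v1"  # placeholder, not defined in this diff
headers = {"Authorization": "Bearer <curator-access-token>"}

# PRIVATE listings require a token and are scoped to the caller's own collections
# unless the caller is a super curator; without a token the API now answers 403.
private = requests.get(f"{BASE}/collections",
                       params={"visibility": "PRIVATE"}, headers=headers)

# The curator filter is reserved for super curators; other callers also receive a 403.
by_curator = requests.get(f"{BASE}/collections",
                          params={"curator": "some-curator-name"}, headers=headers)
```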
@@ -6,9 +6,7 @@
normalize_and_get_doi,
)
from ..common import (
add_collection_level_processing_status,
reshape_for_curation_api_and_is_allowed,
EntityColumns,
reshape_for_curation_api,
)
from backend.corpora.api_server.db import dbconnect
from backend.corpora.common.corpora_orm import (
@@ -44,11 +42,7 @@ def get(collection_id: str, token_info: dict):
collection = Collection.get_collection(db_session, collection_id, include_tombstones=False)
if not collection:
raise NotFoundHTTPException
collection_response: dict = collection.to_dict_keep(EntityColumns.columns_for_collection_id)

collection_response["processing_status"] = add_collection_level_processing_status(collection)
reshape_for_curation_api_and_is_allowed(db_session, collection_response, token_info, id_provided=True)

collection_response = reshape_for_curation_api(db_session, collection, token_info)
return jsonify(collection_response)

