Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Document file_type validation #467

Merged
merged 4 commits into from
Sep 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions creator/analyses/file_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Registry of known file types, keyed by the three-letter code stored in
# File.file_type. Each entry carries a human-readable ``name`` and the
# ``required_columns`` a version must contain for a file of that type
# (an empty list means no column requirements).
FILE_TYPES = {
    "OTH": {"name": "Other", "required_columns": []},
    "SEQ": {"name": "Sequencing Manifest", "required_columns": []},
    "SHM": {"name": "Shipping Manifest", "required_columns": []},
    "CLN": {"name": "Clinical Data", "required_columns": []},
    "DBG": {"name": "dbGaP Submission File", "required_columns": []},
    "FAM": {"name": "Familial Relationships", "required_columns": []},
    "S3S": {
        # Singular, to match the label used by the File model's choices
        # and the corresponding migration.
        "name": "S3 Scrape",
        "required_columns": ["Bucket", "Key", "Size", "ETag"],
    },
}
18 changes: 18 additions & 0 deletions creator/files/migrations/0019_add_file_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 2.2.13 on 2020-09-01 14:13

from django.db import migrations, models


class Migration(migrations.Migration):
    """Alter ``File.file_type`` so its choices include the ``S3S`` code."""

    dependencies = [
        ("files", "0018_add_study_to_version"),
    ]

    operations = [
        migrations.AlterField(
            model_name="file",
            name="file_type",
            field=models.CharField(
                choices=[
                    ("OTH", "Other"),
                    ("SEQ", "Sequencing Manifest"),
                    ("SHM", "Shipping Manifest"),
                    ("CLN", "Clinical Data"),
                    ("DBG", "dbGaP Submission File"),
                    ("FAM", "Familial Relationships"),
                    ("S3S", "S3 Scrape"),
                ],
                default="OTH",
                max_length=3,
            ),
        ),
    ]
52 changes: 42 additions & 10 deletions creator/files/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
from django.conf import settings
from django.db import models
from django.core.validators import MinValueValidator
from django.core.exceptions import ValidationError
from django.utils import timezone
from django.contrib.auth import get_user_model
from django.contrib.postgres.fields import ArrayField
from creator.studies.models import Study
from creator.fields import KFIDField, kf_id_generator
from creator.analyses.file_types import FILE_TYPES

User = get_user_model()

Expand Down Expand Up @@ -64,16 +66,18 @@ class Meta:
)

# Three-letter code identifying what kind of document this file is.
# The codes match the keys of FILE_TYPES; "OTH" is used when no type is
# given.
file_type = models.CharField(
    max_length=3,
    choices=(
        ("OTH", "Other"),
        ("SEQ", "Sequencing Manifest"),
        ("SHM", "Shipping Manifest"),
        ("CLN", "Clinical Data"),
        ("DBG", "dbGaP Submission File"),
        ("FAM", "Familial Relationships"),
        ("S3S", "S3 Scrape"),
    ),
    default="OTH",
)

tags = ArrayField(
models.CharField(max_length=50, blank=True),
Expand All @@ -82,6 +86,34 @@ class Meta:
help_text="Tags to group the files by",
)

def clean(self):
    """
    Validate the file's study linkage and the columns its type requires.

    Two rules are enforced against the most recently created version:

    1. Either the file itself or its latest version must be linked to a
       study.
    2. If the file's type lists required columns in FILE_TYPES, the
       latest version's analysis must contain every one of them. Extra
       columns are allowed.

    Raises:
        ValidationError: when either rule is violated, or when a rule
            needs a version to check against and the file has none.
    """
    required_columns = set(
        FILE_TYPES.get(self.file_type, {}).get("required_columns", [])
    )

    # Neither rule needs the latest version in this case, so avoid the
    # query altogether.
    if self.study is not None and not required_columns:
        return

    # Fetch the latest version once. ``first()`` yields None for a file
    # without versions, where ``latest()`` would raise DoesNotExist and
    # surface as an unhandled error instead of a validation failure.
    latest_version = self.versions.order_by("-created_at").first()

    if self.study is None and (
        latest_version is None or latest_version.study is None
    ):
        raise ValidationError(
            "Study must be specified or the version given must have a "
            "linked study"
        )

    if not required_columns:
        return

    file_type = FILE_TYPES[self.file_type]
    if latest_version is None:
        raise ValidationError(
            f"The file has no version to validate against the "
            f"{file_type['name']} type"
        )

    # NOTE(review): a version with no analysis (or an analysis with no
    # columns) is treated as having no columns at all, so every required
    # column is reported missing — confirm this is the desired outcome.
    analysis = getattr(latest_version, "analysis", None)
    columns = getattr(analysis, "columns", None) or []
    version_columns = {c["name"] for c in columns}

    missing_columns = required_columns - version_columns
    if missing_columns:
        # Sorted so the message is stable between runs.
        raise ValidationError(
            f"The version is missing columns required for the "
            f"{file_type['name']} type: "
            f"{', '.join(sorted(missing_columns))}"
        )

def __str__(self):
    """Return the file's ``kf_id`` formatted as a string."""
    return format(self.kf_id)

Expand Down
24 changes: 13 additions & 11 deletions creator/files/mutations/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,19 @@ def mutate(
):
raise GraphQLError("Not allowed")

root_file = File(
name=name,
study=study,
creator=user,
description=description,
file_type=fileType,
tags=tags,
)
root_file.save()
version.root_file = root_file
version.save()
with transaction.atomic():
root_file = File(
name=name,
study=study,
creator=user,
description=description,
file_type=fileType,
tags=tags,
)
root_file.save()
version.root_file = root_file
version.save()
root_file.full_clean()

return CreateFileMutation(file=root_file)

Expand Down
39 changes: 29 additions & 10 deletions creator/urls.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from django.urls import path, include
from django.core.exceptions import ValidationError
from django.http import HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.conf import settings
Expand All @@ -7,19 +8,37 @@


def health_check(request):
    """Health-check endpoint: answer any request with the body ``ok``."""
    return HttpResponse("ok")


class GraphQLView(FileUploadGraphQLView):
    """
    Custom view that overwrites error formatting to handle Django form
    validation errors more natively.
    """

    @staticmethod
    def format_error(error):
        """
        Format a GraphQL error, flattening Django validation messages.

        The parent implementation produces the base error dict. When the
        error wraps a Django ``ValidationError``, the ``message`` entry is
        replaced by a comma-separated list of the validation messages:

        - dict-style errors: the messages filed under ``__all__``; if
          there is no ``__all__`` entry the parent's message is kept.
        - list/string-style errors: every message the error carries.

        Returns:
            The formatted error dict.
        """
        # Zero-argument super() is not available inside a staticmethod,
        # so the classes are named explicitly; lookup starts after
        # FileUploadGraphQLView in its MRO.
        formatted_error = super(
            FileUploadGraphQLView, FileUploadGraphQLView
        ).format_error(error)

        original_error = getattr(error, "original_error", None)
        if not isinstance(original_error, ValidationError):
            return formatted_error

        # ``error_dict`` only exists when the ValidationError was built
        # from a dict; list/string-style errors expose ``error_list``
        # instead, so reading ``error_dict`` unguarded would raise
        # AttributeError while formatting the response.
        error_dict = getattr(original_error, "error_dict", None)
        if error_dict is None:
            formatted_error["message"] = ", ".join(original_error.messages)
        elif "__all__" in error_dict:
            formatted_error["message"] = ", ".join(
                [e.message for e in error_dict["__all__"]]
            )

        return formatted_error


urlpatterns = [
path('health_check', health_check),
path(
r'',
csrf_exempt(FileUploadGraphQLView.as_view(graphiql=True))
),
path(
r'graphql',
csrf_exempt(FileUploadGraphQLView.as_view(graphiql=True))
),
path("health_check", health_check),
path(r"", csrf_exempt(GraphQLView.as_view(graphiql=True))),
path(r"graphql", csrf_exempt(GraphQLView.as_view(graphiql=True))),
path("django-rq/", include("django_rq.urls")),
path(
r'download/study/<study_id>/file/<file_id>/version/<version_id>',
Expand Down
3 changes: 3 additions & 0 deletions tests/data/SD_ME0WME0W/s3_scrape.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Bucket,Key,Size,ETag
kf-study-us-east-1-dev-sd-00000000,source/my-file.bam,1234567,33a64df551425fcc55e4d42a148795d9f25f89d4
kf-study-us-east-1-dev-sd-00000000,source/hg32.fq,83823434,d41d8cd98f00b204e9800998ecf8427e-32
3 changes: 3 additions & 0 deletions tests/data/SD_ME0WME0W/s3_scrape_extra.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Bucket,Key,Size,ETag,Participant
kf-study-us-east-1-dev-sd-00000000,source/my-file.bam,1234567,33a64df551425fcc55e4d42a148795d9f25f89d4,SUBJ1
kf-study-us-east-1-dev-sd-00000000,source/hg32.fq,83823434,d41d8cd98f00b204e9800998ecf8427e-32,SUBJ2
3 changes: 3 additions & 0 deletions tests/data/SD_ME0WME0W/s3_scrape_partial.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Bucket,Key
kf-study-us-east-1-dev-sd-00000000,source/my-file.bam
kf-study-us-east-1-dev-sd-00000000,source/hg32.fq
145 changes: 145 additions & 0 deletions tests/files/test_file_type_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import pytest
from django.contrib.auth import get_user_model
from graphql_relay import to_global_id

from creator.studies.factories import StudyFactory
from creator.studies.models import Study, Membership
from creator.files.models import Version

from creator.studies.factories import StudyFactory
from creator.files.factories import FileFactory

User = get_user_model()


# GraphQL mutation shared by the tests in this module. It calls createFile
# for an already-uploaded version and asks for the new file's fields along
# with its versions (ordered by "-created_at") and each version's analysis
# id.
CREATE_FILE = """
mutation (
$version: ID!,
$name: String!,
$description: String!,
$fileType: FileFileType!,
$study: ID
$tags: [String]
) {
createFile(
version: $version,
name: $name,
study: $study,
description: $description,
fileType: $fileType
tags: $tags
) {
file {
id
kfId
name
description
fileType
tags
versions(orderBy: "-created_at") {
edges {
node {
id
analysis {
id
}
}
}
}
}
}
}
"""


def test_missing_columns(db, clients, upload_version):
    """
    Creating an S3S file from a clinical CSV version must fail with an
    error message that names the missing columns (ETag and Key among them).
    """
    admin = clients.get("Administrators")
    study = StudyFactory(kf_id="SD_ME0WME0W")

    upload = upload_version(
        "SD_ME0WME0W/FV_4DP2P2Y2_clinical.csv",
        study_id=study.kf_id,
        client=admin,
    )
    version_id = upload.json()["data"]["createVersion"]["version"]["id"]

    payload = {
        "query": CREATE_FILE,
        "variables": {
            "version": version_id,
            "name": "Test file",
            "study": to_global_id("StudyNode", study.kf_id),
            "description": "This is my test file",
            "fileType": "S3S",
            "tags": [],
        },
    }
    resp = admin.post(
        "/graphql", content_type="application/json", data=payload
    )
    body = resp.json()

    assert "errors" in body
    message = body["errors"][0]["message"]
    assert "missing columns" in message
    assert "ETag" in message
    assert "Key" in message


def test_partially_missing_columns(db, clients, upload_version):
    """
    When the version only has some of the required columns (Bucket and
    Key), the error must list just the absent ones (ETag and Size).
    """
    admin = clients.get("Administrators")
    study = StudyFactory(kf_id="SD_ME0WME0W")

    upload = upload_version(
        "SD_ME0WME0W/s3_scrape_partial.csv",
        study_id=study.kf_id,
        client=admin,
    )
    version_id = upload.json()["data"]["createVersion"]["version"]["id"]

    payload = {
        "query": CREATE_FILE,
        "variables": {
            "version": version_id,
            "name": "Test file",
            "study": to_global_id("StudyNode", study.kf_id),
            "description": "This is my test file",
            "fileType": "S3S",
            "tags": [],
        },
    }
    resp = admin.post(
        "/graphql", content_type="application/json", data=payload
    )
    body = resp.json()

    assert "errors" in body
    message = body["errors"][0]["message"]
    assert "missing columns" in message
    # Columns absent from the upload are reported...
    assert "ETag" in message
    assert "Size" in message
    # ...while the ones it does contain are not.
    assert "Bucket" not in message
    assert "Key" not in message


def test_extra_columns(db, clients, upload_version):
    """
    A version that has every required column plus an extra one
    (Participant) must be accepted as an S3S file without errors.
    """
    admin = clients.get("Administrators")
    study = StudyFactory(kf_id="SD_ME0WME0W")

    upload = upload_version(
        "SD_ME0WME0W/s3_scrape_extra.csv", study_id=study.kf_id, client=admin
    )
    version_id = upload.json()["data"]["createVersion"]["version"]["id"]

    payload = {
        "query": CREATE_FILE,
        "variables": {
            "version": version_id,
            "name": "Test file",
            "study": to_global_id("StudyNode", study.kf_id),
            "description": "This is my test file",
            "fileType": "S3S",
            "tags": [],
        },
    }
    resp = admin.post(
        "/graphql", content_type="application/json", data=payload
    )

    assert "errors" not in resp.json()