Skip to content

Commit

Permalink
Merge pull request #494 from kids-first/gen-workflow-output-doc-type
Browse files Browse the repository at this point in the history
✨ Add Genomic Workflow Output Manifest
  • Loading branch information
znatty22 authored Oct 27, 2020
2 parents 2bc60fd + d5fbe6c commit a9101a9
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 30 deletions.
15 changes: 15 additions & 0 deletions creator/analyses/file_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,19 @@
],
"template": "family_trio_config.py",
},
"GWO": {
"name": "Genomic Workflow Output Manifest",
"required_columns": [
'Cavatica ID',
'Cavatica Task ID',
'KF Biospecimen ID',
'KF Participant ID',
'KF Family ID',
'Filepath',
'Data Type',
'Workflow Type',
'Source Read'
],
"template": "genomic_workflow_output_manifest_config.py",
},
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
This is an extract config intended for Genomic Workflow Output Manifests
produced by the Bix team. This manifest contains the list of files produced
by the genomic harmonization workflows along with the attached specimens,
and source genomic files.
To use this extract config, you can make a copy of it and add it to your
ingest package or you can import it as a module in an existing extract config
and override at least the `source_data_url`. You may also append additional
operations to the `operations` list.
"""

from kf_lib_data_ingest.common.concept_schema import CONCEPT
from kf_lib_data_ingest.etl.extract.operations import (
keep_map, value_map, Split
)

# Placeholder for the location of the manifest to extract.
# NOTE(review): presumably substituted with the file's download URL when this
# template is rendered for a user -- confirm against the serving view.
source_data_url = "{{ download_url }}"

# Column mappings from the manifest onto ingest concepts.
operations = [
    # Manifest "Data Type" column -> genomic file data type.
    keep_map(in_col="Data Type", out_col=CONCEPT.GENOMIC_FILE.DATA_TYPE),
    # Manifest "Filepath" column is used as the genomic file identifier.
    keep_map(in_col="Filepath", out_col=CONCEPT.GENOMIC_FILE.ID),
    # A single cell may list several biospecimen IDs separated by commas;
    # the cell is split on "," and the pieces are wrapped in Split.
    # NOTE(review): Split presumably makes the extract stage emit one row per
    # ID -- confirm against the kf_lib_data_ingest documentation.
    value_map(
        in_col="KF Biospecimen ID",
        out_col=CONCEPT.BIOSPECIMEN.ID,
        m=lambda ids: Split(ids.split(",")),
    ),
]
18 changes: 18 additions & 0 deletions creator/files/migrations/0022_add_genomic_workflow_output_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 2.2.13 on 2020-10-23 20:14

from django.db import migrations, models


class Migration(migrations.Migration):
    """Alter File.file_type so that 'GWO' is among its allowed choices."""

    # Must run after the migration that last defined the file type choices.
    dependencies = [
        ('files', '0021_add_file_types'),
    ]

    operations = [
        migrations.AlterField(
            model_name='file',
            name='file_type',
            field=models.CharField(
                # Each choice uses the same three-letter code for both the
                # stored value and the display label.
                choices=[
                    ('OTH', 'OTH'),
                    ('SEQ', 'SEQ'),
                    ('SHM', 'SHM'),
                    ('CLN', 'CLN'),
                    ('DBG', 'DBG'),
                    ('FAM', 'FAM'),
                    ('S3S', 'S3S'),
                    ('PDA', 'PDA'),
                    ('FTR', 'FTR'),
                    ('GWO', 'GWO'),
                ],
                default='OTH',
                max_length=3,
            ),
        ),
    ]
9 changes: 5 additions & 4 deletions creator/files/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class FileType(Enum):
S3S = "S3S"
PDA = "PDA"
FTR = "FTR"
GWO = "GWO"


class File(models.Model):
Expand Down Expand Up @@ -256,10 +257,10 @@ class Meta:
null=False,
help_text='Time the version was created')
size = models.BigIntegerField(
validators=[
MinValueValidator(0, 'File size must be a positive number')
],
help_text='Size of the version in bytes')
validators=[
MinValueValidator(0, 'File size must be a positive number')
],
help_text='Size of the version in bytes')

root_file = models.ForeignKey(
File,
Expand Down
37 changes: 37 additions & 0 deletions tests/data/genomic_workflow_output_manifest_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
This is an extract config intended for Genomic Workflow Output Manifests
produced by the Bix team. This manifest contains the list of files produced
by the genomic harmonization workflows along with the attached specimens,
and source genomic files.
To use this extract config, you can make a copy of it and add it to your
ingest package or you can import it as a module in an existing extract config
and override at least the `source_data_url`. You may also append additional
operations to the `operations` list.
"""

from kf_lib_data_ingest.common.concept_schema import CONCEPT
from kf_lib_data_ingest.etl.extract.operations import (
keep_map, value_map, Split
)

# Fixed download URL used by the tests in place of a rendered template value.
source_data_url = (
    'https://localhost:5002/download/study/SD_ME0WME0W/'
    'file/SF_Y1JMXTTT/version/FV_4RYEMD72'
)

# Column mappings from the manifest onto ingest concepts.
operations = [
    # Manifest "Data Type" column -> genomic file data type.
    keep_map(in_col="Data Type", out_col=CONCEPT.GENOMIC_FILE.DATA_TYPE),
    # Manifest "Filepath" column is used as the genomic file identifier.
    keep_map(in_col="Filepath", out_col=CONCEPT.GENOMIC_FILE.ID),
    # A single cell may list several biospecimen IDs separated by commas;
    # the cell is split on "," and the pieces are wrapped in Split.
    # NOTE(review): Split presumably makes the extract stage emit one row per
    # ID -- confirm against the kf_lib_data_ingest documentation.
    value_map(
        in_col="KF Biospecimen ID",
        out_col=CONCEPT.BIOSPECIMEN.ID,
        m=lambda ids: Split(ids.split(",")),
    ),
]
File renamed without changes.
64 changes: 38 additions & 26 deletions tests/files/test_extract_config.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,46 @@
import os
import pytest
import json
import boto3
from moto import mock_s3
from django.http.response import HttpResponse

from creator.files.models import Version, File
from creator.studies.factories import StudyFactory
from creator.files.factories import FileFactory
from creator.analyses.file_types import FILE_TYPES


@pytest.mark.parametrize(
"file_type,has_config",
"file_type",
[
("OTH", False),
("SEQ", False),
("SHM", False),
("CLN", False),
("DBG", False),
("FAM", False),
("S3S", True),
file_type
for file_type, params in FILE_TYPES.items()
if not params["required_columns"]
],
)
def test_valid_file_type(clients, db, mocker, file_type, has_config):
def test_no_extract_configs(clients, db, mocker, file_type):
"""
Test that user cannot download extract config for non-expedited file types
"""
client = clients.get("Administrators")
study = StudyFactory()
file = FileFactory(study=study)
file.file_type = file_type
file.save()
resp = client.get(f"/extract_config/study/{study.kf_id}/file/{file.kf_id}")
assert resp.status_code == 404


@pytest.mark.parametrize(
"file_type,content",
[
("S3S", "S3 object manifests"),
("GWO", "Genomic Workflow Output Manifest"),
],
)
def test_has_extract_config(clients, db, mocker, file_type, content):
"""
Test that user can download extract config for expedited file types
"""
mock_resp = mocker.patch("creator.files.views.HttpResponse")
mock_resp.return_value = HttpResponse(
open("tests/data/s3_scrape_config_file.py")
open(f"tests/data/{FILE_TYPES[file_type]['template']}")
)
client = clients.get("Administrators")
study = StudyFactory()
Expand All @@ -35,19 +49,17 @@ def test_valid_file_type(clients, db, mocker, file_type, has_config):
file.save()
version = file.versions.latest("created_at")
resp = client.get(f"/extract_config/study/{study.kf_id}/file/{file.kf_id}")
if has_config:
assert resp.status_code == 200
assert resp.get("Content-Disposition") == (
f"attachment; filename*=UTF-8''" f"{version.kf_id}_config.py"
)
assert b"/download/study/" in resp.content
else:
assert resp.status_code == 404
assert resp.status_code == 200
assert resp.get("Content-Disposition") == (
f"attachment; filename*=UTF-8''" f"{version.kf_id}_config.py"
)
assert b"/download/study/" in resp.content
assert bytes(content, "utf-8") in resp.content


def test_file_not_exist(clients, db):
client = clients.get("Administrators")
resp = client.get(f"/extract_config/study/study_id/file/123")
resp = client.get("/extract_config/study/study_id/file/123")
assert resp.status_code == 404


Expand All @@ -71,10 +83,10 @@ def test_config_auth(db, mocker, clients, user_group, allowed):
client = clients.get(user_group)
study = StudyFactory()
file = FileFactory(study=study)
file.file_type = 'S3S'
file.file_type = "S3S"
file.save()
version = file.versions.latest("created_at")
version.key = open(f"tests/data/manifest.txt")
version.key = open("tests/data/manifest.txt")
mock_resp = mocker.patch("creator.files.views._resolve_version")
mock_resp.return_value = (file, version)

Expand Down

0 comments on commit a9101a9

Please sign in to comment.