Merge pull request #494 from kids-first/gen-workflow-output-doc-type

✨ Add Genomic Workflow Output Manifest
kids-first · Oct 27, 2020 · a9101a9 · a9101a9
2 parents 2bc60fd + d5fbe6c
commit a9101a9
Show file tree

Hide file tree

Showing 7 changed files with 148 additions and 30 deletions.
diff --git a/creator/analyses/file_types.py b/creator/analyses/file_types.py
@@ -50,4 +50,19 @@
         ],
         "template": "family_trio_config.py",
     },
+    "GWO": {
+        "name": "Genomic Workflow Output Manifest",
+        "required_columns": [
+            'Cavatica ID',
+            'Cavatica Task ID',
+            'KF Biospecimen ID',
+            'KF Participant ID',
+            'KF Family ID',
+            'Filepath',
+            'Data Type',
+            'Workflow Type',
+            'Source Read'
+        ],
+        "template": "genomic_workflow_output_manifest_config.py",
+    },
 }
diff --git a/creator/extract_configs/templates/genomic_workflow_output_manifest_config.py b/creator/extract_configs/templates/genomic_workflow_output_manifest_config.py
@@ -0,0 +1,35 @@
+"""
+This is an extract config intended for Genomic Workflow Output Manifests
+produced by the Bix team. This manifest contains the list of files produced
+by the genomic harmonization workflows along with the attached specimens,
+and source genomic files.
+
+To use this extract config, you can make a copy of it and add it to your
+ingest package or you can import it as a module in an existing extract config
+and override at least the `source_data_url`.
+
+You may also append additional operations to the `operations` list as well.
+"""
+
+from kf_lib_data_ingest.common.concept_schema import CONCEPT
+from kf_lib_data_ingest.etl.extract.operations import (
+    keep_map, value_map, Split
+)
+
+source_data_url = "{{ download_url }}"
+
+operations = [
+    keep_map(
+        in_col="Data Type",
+        out_col=CONCEPT.GENOMIC_FILE.DATA_TYPE,
+    ),
+    keep_map(
+        in_col="Filepath",
+        out_col=CONCEPT.GENOMIC_FILE.ID,
+    ),
+    value_map(
+        in_col="KF Biospecimen ID",
+        m=lambda x: Split(x.split(",")),
+        out_col=CONCEPT.BIOSPECIMEN.ID,
+    ),
+]
diff --git a/creator/files/migrations/0022_add_genomic_workflow_output_type.py b/creator/files/migrations/0022_add_genomic_workflow_output_type.py
@@ -0,0 +1,18 @@
+# Generated by Django 2.2.13 on 2020-10-23 20:14
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('files', '0021_add_file_types'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='file',
+            name='file_type',
+            field=models.CharField(choices=[('OTH', 'OTH'), ('SEQ', 'SEQ'), ('SHM', 'SHM'), ('CLN', 'CLN'), ('DBG', 'DBG'), ('FAM', 'FAM'), ('S3S', 'S3S'), ('PDA', 'PDA'), ('FTR', 'FTR'), ('GWO', 'GWO')], default='OTH', max_length=3),
+        ),
+    ]
diff --git a/creator/files/models.py b/creator/files/models.py
@@ -38,6 +38,7 @@ class FileType(Enum):
     S3S = "S3S"
     PDA = "PDA"
     FTR = "FTR"
+    GWO = "GWO"
 
 
 class File(models.Model):
@@ -256,10 +257,10 @@ class Meta:
                                       null=False,
                                       help_text='Time the version was created')
     size = models.BigIntegerField(
-            validators=[
-                MinValueValidator(0, 'File size must be a positive number')
-            ],
-            help_text='Size of the version in bytes')
+        validators=[
+            MinValueValidator(0, 'File size must be a positive number')
+        ],
+        help_text='Size of the version in bytes')
 
     root_file = models.ForeignKey(
         File,

diff --git a/tests/data/genomic_workflow_output_manifest_config.py b/tests/data/genomic_workflow_output_manifest_config.py
@@ -0,0 +1,37 @@
+"""
+This is an extract config intended for Genomic Workflow Output Manifests
+produced by the Bix team. This manifest contains the list of files produced
+by the genomic harmonization workflows along with the attached specimens,
+and source genomic files.
+
+To use this extract config, you can make a copy of it and add it to your
+ingest package or you can import it as a module in an existing extract config
+and override at least the `source_data_url`.
+
+You may also append additional operations to the `operations` list as well.
+"""
+
+from kf_lib_data_ingest.common.concept_schema import CONCEPT
+from kf_lib_data_ingest.etl.extract.operations import (
+    keep_map, value_map, Split
+)
+
+source_data_url = (
+    'https://localhost:5002/download/study/SD_ME0WME0W/'
+    'file/SF_Y1JMXTTT/version/FV_4RYEMD72'
+)
+operations = [
+    keep_map(
+        in_col="Data Type",
+        out_col=CONCEPT.GENOMIC_FILE.DATA_TYPE,
+    ),
+    keep_map(
+        in_col="Filepath",
+        out_col=CONCEPT.GENOMIC_FILE.ID,
+    ),
+    value_map(
+        in_col="KF Biospecimen ID",
+        m=lambda x: Split(x.split(",")),
+        out_col=CONCEPT.BIOSPECIMEN.ID,
+    ),
+]
diff --git a/tests/data/s3_scrape_config_file.py → tests/data/s3_scrape_config.py b/tests/data/s3_scrape_config_file.py → tests/data/s3_scrape_config.py
diff --git a/tests/files/test_extract_config.py b/tests/files/test_extract_config.py
@@ -1,32 +1,46 @@
-import os
 import pytest
-import json
-import boto3
-from moto import mock_s3
 from django.http.response import HttpResponse
 
-from creator.files.models import Version, File
 from creator.studies.factories import StudyFactory
 from creator.files.factories import FileFactory
+from creator.analyses.file_types import FILE_TYPES
 
 
 @pytest.mark.parametrize(
-    "file_type,has_config",
+    "file_type",
     [
-        ("OTH", False),
-        ("SEQ", False),
-        ("SHM", False),
-        ("CLN", False),
-        ("DBG", False),
-        ("FAM", False),
-        ("S3S", True),
+        file_type
+        for file_type, params in FILE_TYPES.items()
+        if not params["required_columns"]
     ],
 )
-def test_valid_file_type(clients, db, mocker, file_type, has_config):
+def test_no_extract_configs(clients, db, mocker, file_type):
+    """
+    Test that user cannot download extract config for non-expedited file types
+    """
     client = clients.get("Administrators")
+    study = StudyFactory()
+    file = FileFactory(study=study)
+    file.file_type = file_type
+    file.save()
+    resp = client.get(f"/extract_config/study/{study.kf_id}/file/{file.kf_id}")
+    assert resp.status_code == 404
+
+
+@pytest.mark.parametrize(
+    "file_type,content",
+    [
+        ("S3S", "S3 object manifests"),
+        ("GWO", "Genomic Workflow Output Manifest"),
+    ],
+)
+def test_has_extract_config(clients, db, mocker, file_type, content):
+    """
+    Test that user can download extract config for expedited file types
+    """
     mock_resp = mocker.patch("creator.files.views.HttpResponse")
     mock_resp.return_value = HttpResponse(
-        open("tests/data/s3_scrape_config_file.py")
+        open(f"tests/data/{FILE_TYPES[file_type]['template']}")
     )
     client = clients.get("Administrators")
     study = StudyFactory()
@@ -35,19 +49,17 @@ def test_valid_file_type(clients, db, mocker, file_type, has_config):
     file.save()
     version = file.versions.latest("created_at")
     resp = client.get(f"/extract_config/study/{study.kf_id}/file/{file.kf_id}")
-    if has_config:
-        assert resp.status_code == 200
-        assert resp.get("Content-Disposition") == (
-            f"attachment; filename*=UTF-8''" f"{version.kf_id}_config.py"
-        )
-        assert b"/download/study/" in resp.content
-    else:
-        assert resp.status_code == 404
+    assert resp.status_code == 200
+    assert resp.get("Content-Disposition") == (
+        f"attachment; filename*=UTF-8''" f"{version.kf_id}_config.py"
+    )
+    assert b"/download/study/" in resp.content
+    assert bytes(content, "utf-8") in resp.content
 
 
 def test_file_not_exist(clients, db):
     client = clients.get("Administrators")
-    resp = client.get(f"/extract_config/study/study_id/file/123")
+    resp = client.get("/extract_config/study/study_id/file/123")
     assert resp.status_code == 404
 
 
@@ -71,10 +83,10 @@ def test_config_auth(db, mocker, clients, user_group, allowed):
     client = clients.get(user_group)
     study = StudyFactory()
     file = FileFactory(study=study)
-    file.file_type = 'S3S'
+    file.file_type = "S3S"
     file.save()
     version = file.versions.latest("created_at")
-    version.key = open(f"tests/data/manifest.txt")
+    version.key = open("tests/data/manifest.txt")
     mock_resp = mocker.patch("creator.files.views._resolve_version")
     mock_resp.return_value = (file, version)