Skip to content

Commit

Permalink
finished added the version calcs and fixed version retrieval, checkin…
Browse files Browse the repository at this point in the history
…g if transcripts exists is still broken, the case where transcript_id and no transcripts are listed in the variant is the case that needs to be fixed
  • Loading branch information
SeriousHorncat committed Sep 17, 2024
1 parent 2f88d17 commit ec04208
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 70 deletions.
16 changes: 11 additions & 5 deletions backend/src/core/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,18 @@ def process_tasks(
if annotation_unit.has_dependencies():
missing_dependencies = annotation_unit.get_missing_dependencies()
for missing_dataset_name in missing_dependencies:
dependency_dataset = analysis_collection.get_manifest_dataset_config(
# logger.info(f"for annotation_unit '%s' looking for missing '%s'", annotation_unit.to_name_string(), missing_dataset_name)
analysis_manifest_dataset = analysis_collection.get_manifest_dataset_config(
analysis_name, missing_dataset_name
)
if analysis_manifest_dataset is None:
continue
# logger.info('manifest entry in %s for %s datset: %s', analysis_name, missing_dataset_name, analysis_manifest_dataset)

dependency_annotation_unit = AnnotationUnit(
annotation_unit.genomic_unit, dependency_dataset
annotation_unit.genomic_unit, analysis_manifest_dataset
)
dependency_annotation_unit.set_latest_version(analysis_manifest_dataset['version'])
annotation_value = genomic_unit_collection.find_genomic_unit_annotation_value(
dependency_annotation_unit
)
Expand Down Expand Up @@ -149,10 +155,10 @@ def process_tasks(
task_process_result = future.result()
if isinstance(task, VersionAnnotationTask):
annotation_unit = task.annotation_unit
annotation_unit.set_latest_version(task_process_result)
version = task.extract_version(task_process_result)
annotation_unit.set_latest_version(version)
logger.info(
'%s Version Calculated %s...', format_annotation_logging(annotation_unit),
task_process_result
'%s Version Calculated %s...', format_annotation_logging(annotation_unit), version
)
annotation_queue.put(annotation_unit)
else:
Expand Down
40 changes: 16 additions & 24 deletions backend/src/core/annotation_task.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Tasks for annotating a genomic unit with datasets"""
from abc import abstractmethod
from datetime import date
import json
from random import randint
import time
Expand Down Expand Up @@ -46,10 +47,10 @@ def aggregate_string_replacements(self, base_string) -> str:
https://grch37.rest.ensembl.org/vep/human/hgvs/NM_001017980.3:c.164G>T?content-type=application/json;CADD=1;refseq=1;
example base string:
.[].transcript_consequences[] | select( .transcript_id | contains(\"{transcript}\") )
| { CADD: .cadd_phred }
return value: .[].transcript_consequences[] | select( .transcript_id |
contains(\"NM_001017980\") ) | { CADD: .cadd_phred }
.[].transcript_consequences[] | select( .transcript_id | contains(\"{transcript}\") ) | { CADD: .cadd_phred }
return value:
.[].transcript_consequences[] | select( .transcript_id | contains(\"NM_001017980\") ) | { CADD: .cadd_phred }
genomic unit within the annotation unit in this task to be
{
Expand All @@ -75,8 +76,7 @@ def annotate(self):

def __json_extract__(self, jq_query, json_to_parse):
"""Private ethod to execute jq to extract JSON"""
replaced_attributes = self.aggregate_string_replacements(jq_query)
jq_results = iter(jq.compile(replaced_attributes).input(json_to_parse).all())
jq_results = iter(jq.compile(jq_query).input(json_to_parse).all())
return jq_results

def extract(self, incomming_json):
Expand All @@ -94,7 +94,8 @@ def extract(self, incomming_json):

jq_results = empty_gen()
try:
jq_results = self.__json_extract__(self.annotation_unit.dataset['attribute'], incomming_json)
replaced_attributes = self.aggregate_string_replacements(self.annotation_unit.dataset['attribute'])
jq_results = self.__json_extract__(replaced_attributes, incomming_json)
except ValueError as value_error:
logger.info((
'Failed to annotate "%s" from "%s" on %s with error "%s"', annotation_unit_json['data_set'],
Expand Down Expand Up @@ -127,12 +128,12 @@ def extract_version(self, incoming_version_json):
""" Interface extraction method for Version Annotation tasks"""

version = ""
annotation_unit_json = {"version": self.annotation_unit.version}

if self.annotation_unit.dataset['versioning_type'] == "rosalution":
annotation_unit_json["version"] = incoming_version_json["rosalution"]
elif self.annotation_unit.dataset['versioning_type'] == "date":
annotation_unit_json["version"] = incoming_version_json["date"]
versioning_type = self.annotation_unit.get_dataset_version_type()
if versioning_type == "rosalution":
version = incoming_version_json["rosalution"]
elif versioning_type == "date":
version = incoming_version_json["date"]
else:
jq_query = self.annotation_unit.dataset['version_attribute']

Expand All @@ -142,9 +143,8 @@ def extract_version(self, incoming_version_json):
except ValueError as value_error:
logger.info(('Failed to extract version', value_error))
jq_result = next(jq_result, None)
annotation_unit_json["version"] = jq_result
version = jq_result

version = annotation_unit_json
return version


Expand Down Expand Up @@ -251,18 +251,11 @@ def get_annotation_version_from_rest(self):
"""Gets version for rest type and returns the version data"""
version = {"rest": "rosalution-temp-manifest-00"}

url_to_query = self.build_versioning_url()
url_to_query = self.annotation_unit.dataset['version_url']
result = requests.get(url_to_query, verify=False, headers={"Accept": "application/json"}, timeout=30)
version = result.json()
return version

def build_versioning_url(self):
"""
Builds the version URL from aggregate_string_replacements and
then appends the list of query parameters for the list of datasets.
"""
return self.aggregate_string_replacements(self.annotation_unit.dataset['version_url'])

def get_annotation_version_from_rosalution(self):
"""Gets version for rosalution type and returns the version data"""

Expand All @@ -271,9 +264,8 @@ def get_annotation_version_from_rosalution(self):

def get_annotation_version_from_date(self):
"""Gets version for date type and returns the version data"""
# getting version from date

version = {"date": "rosalution-temp-manifest-00"}
version = {"date": str(date.today())}
return version


Expand Down
4 changes: 4 additions & 0 deletions backend/src/core/annotation_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ def get_dataset_source(self):
"""Returns the dataset's source"""
return self.dataset['data_source']

def get_dataset_version_type(self):
"""Returns the dataset's versioning type"""
return self.dataset['versioning_type']

def is_transcript_dataset(self):
"""Returns true if the dataset is for a transcript"""
return 'transcript' in self.dataset
Expand Down
19 changes: 13 additions & 6 deletions backend/src/repository/analysis_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
from typing import List
from uuid import uuid4
import logging

from pymongo import ReturnDocument

Expand All @@ -14,6 +15,7 @@

# pylint: disable=too-many-public-methods
# Disabling too few public metods due to utilizing Pydantic/FastAPI BaseSettings class
logger = logging.getLogger(__name__)


class AnalysisCollection:
Expand Down Expand Up @@ -136,8 +138,9 @@ def add_dataset_to_manifest(self, analysis_name: str, annotation_unit: Annotatio
"""Adds this dataset and its version to this Analysis."""

dataset = {
annotation_unit.get_dataset_name(): {annotation_unit.get_dataset_source(),
annotation_unit.get_version()}
annotation_unit.get_dataset_name(): {
'data_source': annotation_unit.get_dataset_source(), 'version': annotation_unit.get_version()
}
}

updated_document = self.collection.find_one_and_update({"name": analysis_name},
Expand All @@ -149,14 +152,18 @@ def add_dataset_to_manifest(self, analysis_name: str, annotation_unit: Annotatio
def get_manifest_dataset_config(self, analysis_name: str, dataset_name: str):
""" Returns an individual dataset manifest """
dataset_attribute = f"manifest.{dataset_name}"
result = self.collection.find_one({"name": analysis_name, dataset_attribute: {'$exists': True}})
projection = {"manifest.$": 1}
analysis = self.collection.find_one({"name": analysis_name, dataset_attribute: {'$exists': True}}, projection)

if not result:
if not analysis:
return None

manifest_entry = next((dataset for dataset in analysis['manifest'] if dataset_name in dataset), None)

# logger.info('manifest entry in %s for %s datset: %s', analysis_name, dataset_name, manifest_entry)
return {
"data_set": dataset_name, "data_source": result[dataset_name]['data_source'],
"version": result[dataset_name]['version']
"data_set": dataset_name, "data_source": manifest_entry[dataset_name]['data_source'],
"version": manifest_entry[dataset_name]['version']
}

def get_dataset_manifest(self, analysis_name):
Expand Down
21 changes: 14 additions & 7 deletions backend/src/repository/genomic_unit_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,10 @@ def __find_annotation_query__(self, annotation_unit: AnnotationUnit):
datasource_attribute = f"{dataset_attribute_base}.data_source"
version_attribute = f"{dataset_attribute_base}.version"

return {
**find_query, **{
dataset_attribute_base: {'$exists': True}, datasource_attribute: annotation_unit.get_dataset_source(),
version_attribute: annotation_unit.get_version()
}
}
find_query[dataset_attribute_base] = {'$exists': True}
find_query[datasource_attribute] = annotation_unit.get_dataset_source()
find_query[version_attribute] = annotation_unit.get_version()
return find_query

def all(self):
""" Returns all genomic units that are currently stored """
Expand All @@ -82,6 +80,10 @@ def annotation_exist(self, annotation_unit: AnnotationUnit):
if annotation_unit.is_transcript_dataset():
hgvs_genomic_unit = self.collection.find_one(find_query)

logger.info(
'%s (%s): dataset - %s', annotation_unit.to_name_string(), annotation_unit.get_version(),
hgvs_genomic_unit
)
for transcript in hgvs_genomic_unit['transcripts']:
dataset_in_transcript_annotation = next((
annotation for annotation in transcript['annotations']
Expand All @@ -100,11 +102,16 @@ def find_genomic_unit_annotation_value(self, annotation_unit: AnnotationUnit):
""" Returns the annotation value for a genomic unit according the the dataset"""

dataset_name = annotation_unit.get_dataset_name()
find_query = self.__find_annotation_query__(annotation_unit)

find_query = self.__find_annotation_query__(annotation_unit)
projection = {f"annotations.{dataset_name}.value.$": 1, "_id": 0}

# logger.info('find query: %s', find_query)
# logger.info('projection: %s', projection)
result = self.collection.find_one(find_query, projection)

# logger.info('retrieved the genomic unit value for "%s"', result);

if result is None:
return None

Expand Down
6 changes: 0 additions & 6 deletions backend/tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,6 @@ def _create_dataset_manifest(analysis_name, dataset_name):
return _create_dataset_manifest


# @pytest.fixture(name="cpam0046_analysis")
# def fixture_cpam0046_analysis(cpam0046_analysis_json):
# """Returns the Analysis for CPAM0046 to verify creating annotation tasks"""
# return Analysis(**cpam0046_analysis_json)


@pytest.fixture(name="genomic_units_with_types")
def fixture_genomic_units_with_types(analysis_collection_json):
"""Returns the multiple analyses being mocked as an array"""
Expand Down
16 changes: 10 additions & 6 deletions backend/tests/unit/core/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,10 @@ def fixture_extract_and_annotate_cpam0002(cpam0002_annotation_queue, get_dataset
}]

with (
patch("src.core.annotation_task.AnnotationTaskInterface.extract", return_value=mock_extract_result) as
extract_task_annotate, patch("src.core.annotation_task.VersionAnnotationTask.annotate") as
patch("src.core.annotation_task.AnnotationTaskInterface.extract",
return_value=mock_extract_result) as extract_task_annotate,
patch("src.core.annotation_task.AnnotationTaskInterface.extract_version", return_value='fake-version') as
extract_task_version_annotate, patch("src.core.annotation_task.VersionAnnotationTask.annotate") as
version_task_annotate, patch("src.core.annotation_task.ForgeAnnotationTask.annotate") as forge_task_annotate,
patch("src.core.annotation_task.HttpAnnotationTask.annotate") as http_task_annotate,
patch("src.core.annotation_task.NoneAnnotationTask.annotate") as none_task_annotate
Expand All @@ -117,7 +119,7 @@ def fixture_extract_and_annotate_cpam0002(cpam0002_annotation_queue, get_dataset
yield {
'extract': extract_task_annotate, 'version': version_task_annotate, 'http': http_task_annotate,
'none': none_task_annotate, 'forge': forge_task_annotate,
'genomic_unit_collection': mock_genomic_unit_collection
'genomic_unit_collection': mock_genomic_unit_collection, 'extract_version': extract_task_version_annotate
}


Expand Down Expand Up @@ -153,8 +155,10 @@ def fixture_extract_and_annotate_cpam0046(cpam0046_annotation_queue, get_dataset
}]

with (
patch("src.core.annotation_task.AnnotationTaskInterface.extract", return_value=mock_extract_result) as
extract_task_annotate, patch("src.core.annotation_task.VersionAnnotationTask.annotate") as
patch("src.core.annotation_task.AnnotationTaskInterface.extract",
return_value=mock_extract_result) as extract_task_annotate,
patch("src.core.annotation_task.AnnotationTaskInterface.extract_version", return_value='fake-version') as
extract_task_version_annotate, patch("src.core.annotation_task.VersionAnnotationTask.annotate") as
version_task_annotate, patch("src.core.annotation_task.ForgeAnnotationTask.annotate") as forge_task_annotate,
patch("src.core.annotation_task.HttpAnnotationTask.annotate") as http_task_annotate,
patch("src.core.annotation_task.NoneAnnotationTask.annotate") as none_task_annotate
Expand All @@ -175,5 +179,5 @@ def fixture_extract_and_annotate_cpam0046(cpam0046_annotation_queue, get_dataset
yield {
'extract': extract_task_annotate, 'version': version_task_annotate, 'http': http_task_annotate,
'none': none_task_annotate, 'forge': forge_task_annotate,
'genomic_unit_collection': mock_genomic_unit_collection
'genomic_unit_collection': mock_genomic_unit_collection, 'extract_version': extract_task_version_annotate
}
32 changes: 21 additions & 11 deletions backend/tests/unit/core/test_annotation_task.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""Tests Annotation Tasks and the creation of them"""
from datetime import date
from unittest.mock import Mock, patch
import pytest
import requests

from src.core.annotation_task import AnnotationTaskFactory, ForgeAnnotationTask, \
HttpAnnotationTask, VersionAnnotationTask
Expand Down Expand Up @@ -119,28 +122,35 @@ def test_annotation_versioning_task_created(genomic_unit, dataset_name, get_anno
('NM_001017980.3:c.164G>T', 'ClinVar_Variantion_Id', {"rosalution": "rosalution-manifest-00"}),
('VMA21', 'Ensembl Gene Id', {"releases": [112]}),
('NM_001017980.3:c.164G>T', 'Polyphen Prediction', {"releases": [112]}),
('VMA21', 'HPO_NCBI_GENE_ID', {"date": "rosalution-temp-manifest-00"}),
('LMNA', 'OMIM', {"date": "rosalution-temp-manifest-00"}),
('VMA21', 'HPO_NCBI_GENE_ID', {"date": "2024-09-16"}),
('LMNA', 'OMIM', {"date": "2024-09-16"}),
]
)
def test_process_annotation_versioning_all_types(genomic_unit, dataset_name, expected, get_version_task):
"""Verifies that Version Annotation Tasks process and annotate for all 3 versioning types- date, rest, rosalution"""
task = get_version_task(genomic_unit, dataset_name)
actual_version_json = task.annotate()
assert actual_version_json == expected

mock_response = Mock(spec=requests.Response)
mock_response.json.return_value = {"releases": [112]}

with (patch("requests.get", return_value=mock_response), patch('src.core.annotation_task.date') as mock_date):
mock_date.today.return_value = date(2024, 9, 16)

task = get_version_task(genomic_unit, dataset_name)
actual_version_json = task.annotate()
assert actual_version_json == expected


@pytest.mark.parametrize(
"genomic_unit,dataset_name,expected", [
('VMA21', 'Entrez Gene Id', {"version": "rosalution-manifest-00"}),
('VMA21', 'Ensembl Gene Id', {"version": 112}),
('LMNA', 'OMIM', {"version": "rosalution-temp-manifest-00"}),
"genomic_unit,dataset_name,version_to_extract,expected", [
('VMA21', 'Entrez Gene Id', {"rosalution": "rosalution-manifest-00"}, "rosalution-manifest-00"),
('VMA21', 'Ensembl Gene Id', {"releases": [112]}, 112),
('LMNA', 'OMIM', {"date": "rosalution-temp-manifest-00"}, "rosalution-temp-manifest-00"),
]
)
def test_version_extraction(genomic_unit, dataset_name, expected, get_version_task):
def test_version_extraction(genomic_unit, dataset_name, expected, version_to_extract, get_version_task):
""" Verifies extraction for datasets for all 3 versioning types - rest, date, rosalution"""
task = get_version_task(genomic_unit, dataset_name)
actual_version_extraction = task.extract_version(task.annotate())
actual_version_extraction = task.extract_version(version_to_extract)
assert actual_version_extraction == expected


Expand Down
13 changes: 8 additions & 5 deletions backend/tests/unit/repository/test_genomic_unit_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@ def test_find_genomic_units(genomic_unit_collection):
"transcript_annotation_unit", [
('Polyphen Prediction', 'Ensembl', '112', '120', False),
('Polyphen Prediction', 'Ensembl', '112', '112', True),
('transcript_id', 'Ensembl', '', '120', False),
('Polyphen Prediction', 'Ensembl', '', '120', False),
],
indirect=True,
ids=["polyphen_exists_different_version", "polyphen_exists", "polyphen_not_exists"]
ids=["polyphen_exists_different_version", "polyphen_exists", "no_transcripts_yet", "polyphen_not_exists"]
)
def test_transcripts_annotations_exists(transcript_annotation_unit, genomic_unit_collection):
""" Tests if a transcript annotation does not exist """
Expand Down Expand Up @@ -284,10 +285,12 @@ def variant_with_datasets_annotation_unit(request, get_annotation_unit, get_anno
if dataset in transcript_annotation:
transcript_annotation[dataset].data_source = data_source
transcript_annotation[dataset].version = existing_version
elif dataset == 'transcript_id':
variant_in_database_json['transcripts'] = []
else:
variant_in_database_json['transcripts'] = [
transcript_annotation for transcript_annotation in variant_in_database_json['transcripts']
if dataset not in transcript_annotation
]
for transcript in variant_in_database_json['transcripts']:
transcript['annotations'] = [
annotation for annotation in transcript['annotations'] if dataset not in annotation
]

return (annotation_unit, variant_in_database_json, expected)

0 comments on commit ec04208

Please sign in to comment.