finished added the version calcs and fixed version retrieval, checkin…

…g if transcripts exists is still broken, the case where transcript_id and no transcripts are listed in the variant is the case that needs to be fixed
uab-cgds-worthey · Sep 17, 2024 · ec04208 · ec04208
1 parent 2f88d17
commit ec04208
Show file tree

Hide file tree

Showing 9 changed files with 97 additions and 70 deletions.
diff --git a/backend/src/core/annotation.py b/backend/src/core/annotation.py
@@ -110,12 +110,18 @@ def process_tasks(
  if annotation_unit.has_dependencies():
  missing_dependencies = annotation_unit.get_missing_dependencies()
  for missing_dataset_name in missing_dependencies:
- dependency_dataset = analysis_collection.get_manifest_dataset_config(
+ # logger.info(f"for annotation_unit '%s' looking for missing '%s'", annotation_unit.to_name_string(), missing_dataset_name)
+ analysis_manifest_dataset = analysis_collection.get_manifest_dataset_config(
  analysis_name, missing_dataset_name
  )
+ if analysis_manifest_dataset is None:
+ continue
+ # logger.info('manifest entry in %s for %s datset: %s', analysis_name, missing_dataset_name, analysis_manifest_dataset)
+
  dependency_annotation_unit = AnnotationUnit(
- annotation_unit.genomic_unit, dependency_dataset
+ annotation_unit.genomic_unit, analysis_manifest_dataset
  )
+ dependency_annotation_unit.set_latest_version(analysis_manifest_dataset['version'])
  annotation_value = genomic_unit_collection.find_genomic_unit_annotation_value(
  dependency_annotation_unit
  )
@@ -149,10 +155,10 @@ def process_tasks(
  task_process_result = future.result()
  if isinstance(task, VersionAnnotationTask):
  annotation_unit = task.annotation_unit
- annotation_unit.set_latest_version(task_process_result)
+ version = task.extract_version(task_process_result)
+ annotation_unit.set_latest_version(version)
  logger.info(
- '%s Version Calculated %s...', format_annotation_logging(annotation_unit),
- task_process_result
+ '%s Version Calculated %s...', format_annotation_logging(annotation_unit), version
  )
  annotation_queue.put(annotation_unit)
  else:

diff --git a/backend/src/core/annotation_task.py b/backend/src/core/annotation_task.py
@@ -1,5 +1,6 @@
 """Tasks for annotating a genomic unit with datasets"""
 from abc import abstractmethod
+from datetime import date
 import json
 from random import randint
 import time
@@ -46,10 +47,10 @@ def aggregate_string_replacements(self, base_string) -> str:
  https://grch37.rest.ensembl.org/vep/human/hgvs/NM_001017980.3:c.164G>T?content-type=application/json;CADD=1;refseq=1;
  
  example base string:
-  .[].transcript_consequences[] | select( .transcript_id | contains(\"{transcript}\") ) 
-  | { CADD: .cadd_phred }
-  return value: .[].transcript_consequences[] | select( .transcript_id | 
-   contains(\"NM_001017980\") ) | { CADD: .cadd_phred }
+ .[].transcript_consequences[] | select( .transcript_id | contains(\"{transcript}\") ) | { CADD: .cadd_phred }
+ 
+ return value: 
+ .[].transcript_consequences[] | select( .transcript_id | contains(\"NM_001017980\") ) | { CADD: .cadd_phred }
 
  genomic unit within the annotation unit in this task to be
  {
@@ -75,8 +76,7 @@ def annotate(self):
 
  def __json_extract__(self, jq_query, json_to_parse):
  """Private ethod to execute jq to extract JSON"""
- replaced_attributes = self.aggregate_string_replacements(jq_query)
- jq_results = iter(jq.compile(replaced_attributes).input(json_to_parse).all())
+ jq_results = iter(jq.compile(jq_query).input(json_to_parse).all())
  return jq_results
 
  def extract(self, incomming_json):
@@ -94,7 +94,8 @@ def extract(self, incomming_json):
 
  jq_results = empty_gen()
  try:
- jq_results = self.__json_extract__(self.annotation_unit.dataset['attribute'], incomming_json)
+ replaced_attributes = self.aggregate_string_replacements(self.annotation_unit.dataset['attribute'])
+ jq_results = self.__json_extract__(replaced_attributes, incomming_json)
  except ValueError as value_error:
  logger.info((
  'Failed to annotate "%s" from "%s" on %s with error "%s"', annotation_unit_json['data_set'],
@@ -127,12 +128,12 @@ def extract_version(self, incoming_version_json):
  """ Interface extraction method for Version Annotation tasks"""
 
  version = ""
- annotation_unit_json = {"version": self.annotation_unit.version}
 
- if self.annotation_unit.dataset['versioning_type'] == "rosalution":
- annotation_unit_json["version"] = incoming_version_json["rosalution"]
- elif self.annotation_unit.dataset['versioning_type'] == "date":
- annotation_unit_json["version"] = incoming_version_json["date"]
+ versioning_type = self.annotation_unit.get_dataset_version_type()
+ if versioning_type == "rosalution":
+ version = incoming_version_json["rosalution"]
+ elif versioning_type == "date":
+ version = incoming_version_json["date"]
  else:
  jq_query = self.annotation_unit.dataset['version_attribute']
 
@@ -142,9 +143,8 @@ def extract_version(self, incoming_version_json):
  except ValueError as value_error:
  logger.info(('Failed to extract version', value_error))
  jq_result = next(jq_result, None)
- annotation_unit_json["version"] = jq_result
+ version = jq_result
 
- version = annotation_unit_json
  return version
 
 
@@ -251,18 +251,11 @@ def get_annotation_version_from_rest(self):
  """Gets version for rest type and returns the version data"""
  version = {"rest": "rosalution-temp-manifest-00"}
 
- url_to_query = self.build_versioning_url()
+ url_to_query = self.annotation_unit.dataset['version_url']
  result = requests.get(url_to_query, verify=False, headers={"Accept": "application/json"}, timeout=30)
  version = result.json()
  return version
 
- def build_versioning_url(self):
- """
- Builds the version URL from aggregate_string_replacements and 
- then appends the list of query parameters for the list of datasets.
- """
- return self.aggregate_string_replacements(self.annotation_unit.dataset['version_url'])
-
  def get_annotation_version_from_rosalution(self):
  """Gets version for rosalution type and returns the version data"""
 
@@ -271,9 +264,8 @@ def get_annotation_version_from_rosalution(self):
 
  def get_annotation_version_from_date(self):
  """Gets version for date type and returns the version data"""
- # getting version from date
 
- version = {"date": "rosalution-temp-manifest-00"}
+ version = {"date": str(date.today())}
  return version
 
 

diff --git a/backend/src/core/annotation_unit.py b/backend/src/core/annotation_unit.py
@@ -23,6 +23,10 @@ def get_dataset_source(self):
  """Returns the dataset's source"""
  return self.dataset['data_source']
 
+ def get_dataset_version_type(self):
+ """Returns the dataset's versioning type"""
+ return self.dataset['versioning_type']
+
  def is_transcript_dataset(self):
  """Returns true if the dataset is for a transcript"""
  return 'transcript' in self.dataset

diff --git a/backend/src/repository/analysis_collection.py b/backend/src/repository/analysis_collection.py
@@ -3,6 +3,7 @@
 """
 from typing import List
 from uuid import uuid4
+import logging
 
 from pymongo import ReturnDocument
 
@@ -14,6 +15,7 @@
 
 # pylint: disable=too-many-public-methods
 # Disabling too few public metods due to utilizing Pydantic/FastAPI BaseSettings class
+logger = logging.getLogger(__name__)
 
 
 class AnalysisCollection:
@@ -136,8 +138,9 @@ def add_dataset_to_manifest(self, analysis_name: str, annotation_unit: Annotatio
  """Adds this dataset and its version to this Analysis."""
 
  dataset = {
- annotation_unit.get_dataset_name(): {annotation_unit.get_dataset_source(),
- annotation_unit.get_version()}
+ annotation_unit.get_dataset_name(): {
+ 'data_source': annotation_unit.get_dataset_source(), 'version': annotation_unit.get_version()
+ }
  }
 
  updated_document = self.collection.find_one_and_update({"name": analysis_name},
@@ -149,14 +152,18 @@ def add_dataset_to_manifest(self, analysis_name: str, annotation_unit: Annotatio
  def get_manifest_dataset_config(self, analysis_name: str, dataset_name: str):
  """ Returns an individual dataset manifest """
  dataset_attribute = f"manifest.{dataset_name}"
- result = self.collection.find_one({"name": analysis_name, dataset_attribute: {'$exists': True}})
+ projection = {"manifest.$": 1}
+ analysis = self.collection.find_one({"name": analysis_name, dataset_attribute: {'$exists': True}}, projection)
 
- if not result:
+ if not analysis:
  return None
 
+ manifest_entry = next((dataset for dataset in analysis['manifest'] if dataset_name in dataset), None)
+
+ # logger.info('manifest entry in %s for %s datset: %s', analysis_name, dataset_name, manifest_entry)
  return {
- "data_set": dataset_name, "data_source": result[dataset_name]['data_source'],
- "version": result[dataset_name]['version']
+ "data_set": dataset_name, "data_source": manifest_entry[dataset_name]['data_source'],
+ "version": manifest_entry[dataset_name]['version']
  }
 
  def get_dataset_manifest(self, analysis_name):

diff --git a/backend/src/repository/genomic_unit_collection.py b/backend/src/repository/genomic_unit_collection.py
@@ -60,12 +60,10 @@ def __find_annotation_query__(self, annotation_unit: AnnotationUnit):
  datasource_attribute = f"{dataset_attribute_base}.data_source"
  version_attribute = f"{dataset_attribute_base}.version"
 
- return {
- **find_query, **{
- dataset_attribute_base: {'$exists': True}, datasource_attribute: annotation_unit.get_dataset_source(),
- version_attribute: annotation_unit.get_version()
- }
- }
+ find_query[dataset_attribute_base] = {'$exists': True}
+ find_query[datasource_attribute] = annotation_unit.get_dataset_source()
+ find_query[version_attribute] = annotation_unit.get_version()
+ return find_query
 
  def all(self):
  """ Returns all genomic units that are currently stored """
@@ -82,6 +80,10 @@ def annotation_exist(self, annotation_unit: AnnotationUnit):
  if annotation_unit.is_transcript_dataset():
  hgvs_genomic_unit = self.collection.find_one(find_query)
 
+ logger.info(
+ '%s (%s): dataset - %s', annotation_unit.to_name_string(), annotation_unit.get_version(),
+ hgvs_genomic_unit
+ )
  for transcript in hgvs_genomic_unit['transcripts']:
  dataset_in_transcript_annotation = next((
  annotation for annotation in transcript['annotations']
@@ -100,11 +102,16 @@ def find_genomic_unit_annotation_value(self, annotation_unit: AnnotationUnit):
  """ Returns the annotation value for a genomic unit according the the dataset"""
 
  dataset_name = annotation_unit.get_dataset_name()
- find_query = self.__find_annotation_query__(annotation_unit)
 
+ find_query = self.__find_annotation_query__(annotation_unit)
  projection = {f"annotations.{dataset_name}.value.$": 1, "_id": 0}
+
+ # logger.info('find query: %s', find_query)
+ # logger.info('projection: %s', projection)
  result = self.collection.find_one(find_query, projection)
 
+ # logger.info('retrieved the genomic unit value for "%s"', result);
+
  if result is None:
  return None
 

diff --git a/backend/tests/unit/conftest.py b/backend/tests/unit/conftest.py
@@ -137,12 +137,6 @@ def _create_dataset_manifest(analysis_name, dataset_name):
  return _create_dataset_manifest
 
 
-# @pytest.fixture(name="cpam0046_analysis")
-# def fixture_cpam0046_analysis(cpam0046_analysis_json):
-# """Returns the Analysis for CPAM0046 to verify creating annotation tasks"""
-# return Analysis(**cpam0046_analysis_json)
-
-
 @pytest.fixture(name="genomic_units_with_types")
 def fixture_genomic_units_with_types(analysis_collection_json):
  """Returns the multiple analyses being mocked as an array"""

diff --git a/backend/tests/unit/core/test_annotate.py b/backend/tests/unit/core/test_annotate.py
@@ -94,8 +94,10 @@ def fixture_extract_and_annotate_cpam0002(cpam0002_annotation_queue, get_dataset
  }]
 
  with (
- patch("src.core.annotation_task.AnnotationTaskInterface.extract", return_value=mock_extract_result) as
- extract_task_annotate, patch("src.core.annotation_task.VersionAnnotationTask.annotate") as
+ patch("src.core.annotation_task.AnnotationTaskInterface.extract",
+ return_value=mock_extract_result) as extract_task_annotate,
+ patch("src.core.annotation_task.AnnotationTaskInterface.extract_version", return_value='fake-version') as
+ extract_task_version_annotate, patch("src.core.annotation_task.VersionAnnotationTask.annotate") as
  version_task_annotate, patch("src.core.annotation_task.ForgeAnnotationTask.annotate") as forge_task_annotate,
  patch("src.core.annotation_task.HttpAnnotationTask.annotate") as http_task_annotate,
  patch("src.core.annotation_task.NoneAnnotationTask.annotate") as none_task_annotate
@@ -117,7 +119,7 @@ def fixture_extract_and_annotate_cpam0002(cpam0002_annotation_queue, get_dataset
  yield {
  'extract': extract_task_annotate, 'version': version_task_annotate, 'http': http_task_annotate,
  'none': none_task_annotate, 'forge': forge_task_annotate,
- 'genomic_unit_collection': mock_genomic_unit_collection
+ 'genomic_unit_collection': mock_genomic_unit_collection, 'extract_version': extract_task_version_annotate
  }
 
 
@@ -153,8 +155,10 @@ def fixture_extract_and_annotate_cpam0046(cpam0046_annotation_queue, get_dataset
  }]
 
  with (
- patch("src.core.annotation_task.AnnotationTaskInterface.extract", return_value=mock_extract_result) as
- extract_task_annotate, patch("src.core.annotation_task.VersionAnnotationTask.annotate") as
+ patch("src.core.annotation_task.AnnotationTaskInterface.extract",
+ return_value=mock_extract_result) as extract_task_annotate,
+ patch("src.core.annotation_task.AnnotationTaskInterface.extract_version", return_value='fake-version') as
+ extract_task_version_annotate, patch("src.core.annotation_task.VersionAnnotationTask.annotate") as
  version_task_annotate, patch("src.core.annotation_task.ForgeAnnotationTask.annotate") as forge_task_annotate,
  patch("src.core.annotation_task.HttpAnnotationTask.annotate") as http_task_annotate,
  patch("src.core.annotation_task.NoneAnnotationTask.annotate") as none_task_annotate
@@ -175,5 +179,5 @@ def fixture_extract_and_annotate_cpam0046(cpam0046_annotation_queue, get_dataset
  yield {
  'extract': extract_task_annotate, 'version': version_task_annotate, 'http': http_task_annotate,
  'none': none_task_annotate, 'forge': forge_task_annotate,
- 'genomic_unit_collection': mock_genomic_unit_collection
+ 'genomic_unit_collection': mock_genomic_unit_collection, 'extract_version': extract_task_version_annotate
  }
diff --git a/backend/tests/unit/core/test_annotation_task.py b/backend/tests/unit/core/test_annotation_task.py
@@ -1,5 +1,8 @@
 """Tests Annotation Tasks and the creation of them"""
+from datetime import date
+from unittest.mock import Mock, patch
 import pytest
+import requests
 
 from src.core.annotation_task import AnnotationTaskFactory, ForgeAnnotationTask, \
  HttpAnnotationTask, VersionAnnotationTask
@@ -119,28 +122,35 @@ def test_annotation_versioning_task_created(genomic_unit, dataset_name, get_anno
  ('NM_001017980.3:c.164G>T', 'ClinVar_Variantion_Id', {"rosalution": "rosalution-manifest-00"}),
  ('VMA21', 'Ensembl Gene Id', {"releases": [112]}),
  ('NM_001017980.3:c.164G>T', 'Polyphen Prediction', {"releases": [112]}),
- ('VMA21', 'HPO_NCBI_GENE_ID', {"date": "rosalution-temp-manifest-00"}),
- ('LMNA', 'OMIM', {"date": "rosalution-temp-manifest-00"}),
+ ('VMA21', 'HPO_NCBI_GENE_ID', {"date": "2024-09-16"}),
+ ('LMNA', 'OMIM', {"date": "2024-09-16"}),
  ]
 )
 def test_process_annotation_versioning_all_types(genomic_unit, dataset_name, expected, get_version_task):
  """Verifies that Version Annotation Tasks process and annotate for all 3 versioning types- date, rest, rosalution"""
- task = get_version_task(genomic_unit, dataset_name)
- actual_version_json = task.annotate()
- assert actual_version_json == expected
+
+ mock_response = Mock(spec=requests.Response)
+ mock_response.json.return_value = {"releases": [112]}
+
+ with (patch("requests.get", return_value=mock_response), patch('src.core.annotation_task.date') as mock_date):
+ mock_date.today.return_value = date(2024, 9, 16)
+
+ task = get_version_task(genomic_unit, dataset_name)
+ actual_version_json = task.annotate()
+ assert actual_version_json == expected
 
 
 @pytest.mark.parametrize(
- "genomic_unit,dataset_name,expected", [
- ('VMA21', 'Entrez Gene Id', {"version": "rosalution-manifest-00"}),
- ('VMA21', 'Ensembl Gene Id', {"version": 112}),
- ('LMNA', 'OMIM', {"version": "rosalution-temp-manifest-00"}),
+ "genomic_unit,dataset_name,version_to_extract,expected", [
+ ('VMA21', 'Entrez Gene Id', {"rosalution": "rosalution-manifest-00"}, "rosalution-manifest-00"),
+ ('VMA21', 'Ensembl Gene Id', {"releases": [112]}, 112),
+ ('LMNA', 'OMIM', {"date": "rosalution-temp-manifest-00"}, "rosalution-temp-manifest-00"),
  ]
 )
-def test_version_extraction(genomic_unit, dataset_name, expected, get_version_task):
+def test_version_extraction(genomic_unit, dataset_name, expected, version_to_extract, get_version_task):
  """ Verifies extraction for datasets for all 3 versioning types - rest, date, rosalution"""
  task = get_version_task(genomic_unit, dataset_name)
- actual_version_extraction = task.extract_version(task.annotate())
+ actual_version_extraction = task.extract_version(version_to_extract)
  assert actual_version_extraction == expected
 
 

diff --git a/backend/tests/unit/repository/test_genomic_unit_collection.py b/backend/tests/unit/repository/test_genomic_unit_collection.py
@@ -19,10 +19,11 @@ def test_find_genomic_units(genomic_unit_collection):
  "transcript_annotation_unit", [
  ('Polyphen Prediction', 'Ensembl', '112', '120', False),
  ('Polyphen Prediction', 'Ensembl', '112', '112', True),
+ ('transcript_id', 'Ensembl', '', '120', False),
  ('Polyphen Prediction', 'Ensembl', '', '120', False),
  ],
  indirect=True,
- ids=["polyphen_exists_different_version", "polyphen_exists", "polyphen_not_exists"]
+ ids=["polyphen_exists_different_version", "polyphen_exists", "no_transcripts_yet", "polyphen_not_exists"]
 )
 def test_transcripts_annotations_exists(transcript_annotation_unit, genomic_unit_collection):
  """ Tests if a transcript annotation does not exist """
@@ -284,10 +285,12 @@ def variant_with_datasets_annotation_unit(request, get_annotation_unit, get_anno
  if dataset in transcript_annotation:
  transcript_annotation[dataset].data_source = data_source
  transcript_annotation[dataset].version = existing_version
+ elif dataset == 'transcript_id':
+ variant_in_database_json['transcripts'] = []
  else:
- variant_in_database_json['transcripts'] = [
- transcript_annotation for transcript_annotation in variant_in_database_json['transcripts']
- if dataset not in transcript_annotation
- ]
+ for transcript in variant_in_database_json['transcripts']:
+ transcript['annotations'] = [
+  annotation for annotation in transcript['annotations'] if dataset not in annotation
+  ]
 
  return (annotation_unit, variant_in_database_json, expected)