bento-platform · davidlougheed · Oct 31, 2024 · Oct 29, 2024 · Oct 29, 2024 · Oct 29, 2024
diff --git a/chord_metadata_service/discovery/stats.py b/chord_metadata_service/discovery/stats.py
@@ -25,12 +25,16 @@ async def individual_experiment_type_stats(
     """
     Used for a fixed-response public API and beacon.
     returns count and bento_public format list of stats for experiment type
-    note that queryset_stats_for_field() does not count "missing" correctly when the field has multiple foreign keys
+    Note: queryset_stats_for_field() does not count "missing" correctly when the field has multiple foreign keys.
     """
+
+    # Note: the queryset used to join through phenopackets, but individuals can be created without a phenopacket (which
+    # occurs sometimes in tests or in the case of a new packet model), which would cause this to return the wrong stats.
+
     return await bento_public_format_count_and_stats_list(
         queryset
-        .values(label=F("phenopackets__biosamples__experiment__experiment_type"))
-        .annotate(value=Count("phenopackets__biosamples__experiment", distinct=True)),
+        .values(label=F("biosamples__experiment__experiment_type"))
+        .annotate(value=Count("biosamples__experiment", distinct=True)),
         discovery,
         field_permissions,
     )
@@ -43,10 +47,14 @@ async def individual_biosample_tissue_stats(
     Used for a fixed-response public API and beacon.
     returns count and bento_public format list of stats for biosample sampled_tissue
     """
+
+    # Note: the queryset used to join through phenopackets, but individuals can be created without a phenopacket (which
+    # occurs sometimes in tests or in the case of a new packet model), which would cause this to return the wrong stats.
+
     return await bento_public_format_count_and_stats_list(
         queryset
-        .values(label=F("phenopackets__biosamples__sampled_tissue__label"))
-        .annotate(value=Count("phenopackets__biosamples", distinct=True)),
+        .values(label=F("biosamples__sampled_tissue__label"))
+        .annotate(value=Count("biosamples", distinct=True)),
         discovery,
         field_permissions,
     )

diff --git a/chord_metadata_service/discovery/tests/constants.py b/chord_metadata_service/discovery/tests/constants.py
@@ -23,7 +23,7 @@
     "search": [
         {
             "section_title": "First Section",
-            "fields": ["sex", "age", "tissues"]
+            "fields": ["sex", "age", "tissues", "extraction_protocol"]
         }
     ],
     "fields": {
@@ -87,6 +87,16 @@
                 "units": "mm"
             }
         },
+        "extraction_protocol": {
+            "mapping": "experiment/extraction_protocol",
+            "mapping_for_search_filter": "individual/biosamples/experiment/extraction_protocol",
+            "title": "Experiment Extraction Protocol",
+            "description": "experiment extraction protocol",
+            "datatype": "string",
+            "config": {
+                "enum": ["NGS"]
+            },
+        },
     },
     "rules": {
         "count_threshold": 5,

diff --git a/chord_metadata_service/discovery/utils.py b/chord_metadata_service/discovery/utils.py
@@ -234,6 +234,19 @@ def get_discovery_field_set_permissions(
 
 
 def get_public_model_scoped_queryset(scope: ValidatedDiscoveryScope, mn: PublicModelName) -> QuerySet:
+    """
+    Discovery models can be scoped to either a project or dataset; for downstream filtering, we need to pre-scope the
+    model queryset to the project/dataset being queried.
+
+    Since downstream filtering may be applied to (possibly deeply)nested fields/models (e.g., biosamples, experiments),
+    we use `.distinct()` instead of `.all()`. Otherwise, there may be multiple instances of the same top-level object
+    (individual especially, which has this deep nesting) for each nested instance, for example, in the case of multiple
+    experiments for a biosample for an individual.
+
+    :param scope: The scope to filter the queryset to.
+    :param mn: The discovery/"public" model name for
+    """
+
     filter_scope: PublicScopeFilterKeys
     if scope.dataset_id:
         filter_scope = "dataset"
@@ -242,9 +255,14 @@ def get_public_model_scoped_queryset(scope: ValidatedDiscoveryScope, mn: PublicM
         filter_scope = "project"
         value = scope.project_id
     else:
-        return PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.all()
+        return PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.distinct()
 
     filter_query = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[mn][filter_scope]["filter"]
     prefetch = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[mn][filter_scope]["prefetch_related"]
 
-    return PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.prefetch_related(*prefetch).filter(**{filter_query: value})
+    return (
+        PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects
+        .distinct()
+        .prefetch_related(*prefetch)
+        .filter(**{filter_query: value})
+    )
diff --git a/chord_metadata_service/patients/tests/test_api.py b/chord_metadata_service/patients/tests/test_api.py
@@ -20,6 +20,8 @@
     CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY
 )
 from chord_metadata_service.discovery.types import DiscoveryConfig
+from chord_metadata_service.experiments import models as ex_m
+from chord_metadata_service.experiments.tests import constants as ex_c
 from chord_metadata_service.patients.models import Individual
 from chord_metadata_service.phenopackets import models as ph_m
 from chord_metadata_service.phenopackets.tests import constants as ph_c
@@ -380,13 +382,13 @@ def setUp(self):
             project=self.project_2,
         )
 
-        individuals = [
+        self.individuals = [
             c.generate_valid_individual(date_of_consent_range=(2020, 2023))
             for _ in range(self.num_individuals)
         ]
 
-        individual_objs = [Individual.objects.create(**individual) for individual in individuals]
-        ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(Individual.objects.all()[0]))
+        individual_objs = [Individual.objects.create(**individual) for individual in self.individuals]
+        biosample = ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(Individual.objects.all()[0]))
 
         for idx, individual in enumerate(individual_objs, 1):
             self.meta_data = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1)
@@ -396,6 +398,13 @@ def setUp(self):
                 meta_data=self.meta_data,
                 dataset=self.dataset,
             )
+            if idx == 1:
+                self.phenopacket.biosamples.add(biosample)
+                self.phenopacket.save()
+
+        instrument = ex_m.Instrument.objects.create(**ex_c.valid_instrument())
+        ex_m.Experiment.objects.create(**ex_c.valid_experiment(biosample, instrument, self.dataset, 1))
+        ex_m.Experiment.objects.create(**ex_c.valid_experiment(biosample, instrument, self.dataset, 2))
 
         random.seed(self.random_seed)
 
@@ -694,6 +703,15 @@ def test_public_filtering_mapping_for_search_filter(self):
         response_obj = response.json()
         self.assertEqual(1, response_obj["count"])
 
+    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST_NO_THRESHOLD)
+    def test_public_filtering_two_experiments(self):
+        response = self.dt_authz_counts_get(f"/api/public?sex={self.individuals[0]['sex']}&extraction_protocol=NGS")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        response_obj = response.json()
+        self.assertEqual(response_obj["count"], 1)
+        self.assertEqual(response_obj["biosamples"]["count"], 1)
+        self.assertEqual(response_obj["experiments"]["count"], 2)
+
     @override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST)
     def test_public_overview_sex(self):
         response = self.dt_authz_counts_get('/api/public_search_fields')