Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: wrong search count for some searches across nested properties #551

Merged
merged 5 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions chord_metadata_service/discovery/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,16 @@ async def individual_experiment_type_stats(
"""
Used for a fixed-response public API and beacon.
returns count and bento_public format list of stats for experiment type
note that queryset_stats_for_field() does not count "missing" correctly when the field has multiple foreign keys
Note: queryset_stats_for_field() does not count "missing" correctly when the field has multiple foreign keys.
"""

# Note: the queryset used to join through phenopackets, but individuals can be created without a phenopacket (which
# occurs sometimes in tests or in the case of a new packet model), which would cause this to return the wrong stats.

return await bento_public_format_count_and_stats_list(
queryset
.values(label=F("phenopackets__biosamples__experiment__experiment_type"))
.annotate(value=Count("phenopackets__biosamples__experiment", distinct=True)),
.values(label=F("biosamples__experiment__experiment_type"))
.annotate(value=Count("biosamples__experiment", distinct=True)),
davidlougheed marked this conversation as resolved.
Show resolved Hide resolved
discovery,
field_permissions,
)
Expand All @@ -43,10 +47,14 @@ async def individual_biosample_tissue_stats(
Used for a fixed-response public API and beacon.
returns count and bento_public format list of stats for biosample sampled_tissue
"""

# Note: the queryset used to join through phenopackets, but individuals can be created without a phenopacket (which
# occurs sometimes in tests or in the case of a new packet model), which would cause this to return the wrong stats.

return await bento_public_format_count_and_stats_list(
queryset
.values(label=F("phenopackets__biosamples__sampled_tissue__label"))
.annotate(value=Count("phenopackets__biosamples", distinct=True)),
.values(label=F("biosamples__sampled_tissue__label"))
.annotate(value=Count("biosamples", distinct=True)),
davidlougheed marked this conversation as resolved.
Show resolved Hide resolved
discovery,
field_permissions,
)
Expand Down
12 changes: 11 additions & 1 deletion chord_metadata_service/discovery/tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"search": [
{
"section_title": "First Section",
"fields": ["sex", "age", "tissues"]
"fields": ["sex", "age", "tissues", "extraction_protocol"]
}
],
"fields": {
Expand Down Expand Up @@ -87,6 +87,16 @@
"units": "mm"
}
},
"extraction_protocol": {
"mapping": "experiment/extraction_protocol",
"mapping_for_search_filter": "individual/biosamples/experiment/extraction_protocol",
"title": "Experiment Extraction Protocol",
"description": "experiment extraction protocol",
"datatype": "string",
"config": {
"enum": ["NGS"]
},
},
},
"rules": {
"count_threshold": 5,
Expand Down
22 changes: 20 additions & 2 deletions chord_metadata_service/discovery/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,19 @@ def get_discovery_field_set_permissions(


def get_public_model_scoped_queryset(scope: ValidatedDiscoveryScope, mn: PublicModelName) -> QuerySet:
"""
Discovery models can be scoped to either a project or dataset; for downstream filtering, we need to pre-scope the
model queryset to the project/dataset being queried.

Since downstream filtering may be applied to (possibly deeply)nested fields/models (e.g., biosamples, experiments),
we use `.distinct()` instead of `.all()`. Otherwise, there may be multiple instances of the same top-level object
(individual especially, which has this deep nesting) for each nested instance, for example, in the case of multiple
experiments for a biosample for an individual.

:param scope: The scope to filter the queryset to.
:param mn: The discovery/"public" model name for
"""

filter_scope: PublicScopeFilterKeys
if scope.dataset_id:
filter_scope = "dataset"
Expand All @@ -242,9 +255,14 @@ def get_public_model_scoped_queryset(scope: ValidatedDiscoveryScope, mn: PublicM
filter_scope = "project"
value = scope.project_id
else:
return PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.all()
return PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.distinct()
davidlougheed marked this conversation as resolved.
Show resolved Hide resolved

filter_query = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[mn][filter_scope]["filter"]
prefetch = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[mn][filter_scope]["prefetch_related"]

return PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.prefetch_related(*prefetch).filter(**{filter_query: value})
return (
PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects
.distinct()
davidlougheed marked this conversation as resolved.
Show resolved Hide resolved
.prefetch_related(*prefetch)
.filter(**{filter_query: value})
)
24 changes: 21 additions & 3 deletions chord_metadata_service/patients/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
CONFIG_PUBLIC_TEST_SEARCH_SEX_ONLY
)
from chord_metadata_service.discovery.types import DiscoveryConfig
from chord_metadata_service.experiments import models as ex_m
from chord_metadata_service.experiments.tests import constants as ex_c
from chord_metadata_service.patients.models import Individual
from chord_metadata_service.phenopackets import models as ph_m
from chord_metadata_service.phenopackets.tests import constants as ph_c
Expand Down Expand Up @@ -380,13 +382,13 @@ def setUp(self):
project=self.project_2,
)

individuals = [
self.individuals = [
c.generate_valid_individual(date_of_consent_range=(2020, 2023))
for _ in range(self.num_individuals)
]

individual_objs = [Individual.objects.create(**individual) for individual in individuals]
ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(Individual.objects.all()[0]))
individual_objs = [Individual.objects.create(**individual) for individual in self.individuals]
biosample = ph_m.Biosample.objects.create(**ph_c.valid_biosample_1(Individual.objects.all()[0]))

for idx, individual in enumerate(individual_objs, 1):
self.meta_data = ph_m.MetaData.objects.create(**ph_c.VALID_META_DATA_1)
Expand All @@ -396,6 +398,13 @@ def setUp(self):
meta_data=self.meta_data,
dataset=self.dataset,
)
if idx == 1:
self.phenopacket.biosamples.add(biosample)
self.phenopacket.save()

instrument = ex_m.Instrument.objects.create(**ex_c.valid_instrument())
ex_m.Experiment.objects.create(**ex_c.valid_experiment(biosample, instrument, self.dataset, 1))
ex_m.Experiment.objects.create(**ex_c.valid_experiment(biosample, instrument, self.dataset, 2))

random.seed(self.random_seed)

Expand Down Expand Up @@ -694,6 +703,15 @@ def test_public_filtering_mapping_for_search_filter(self):
response_obj = response.json()
self.assertEqual(1, response_obj["count"])

@override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST_NO_THRESHOLD)
def test_public_filtering_two_experiments(self):
response = self.dt_authz_counts_get(f"/api/public?sex={self.individuals[0]['sex']}&extraction_protocol=NGS")
self.assertEqual(response.status_code, status.HTTP_200_OK)
response_obj = response.json()
self.assertEqual(response_obj["count"], 1)
self.assertEqual(response_obj["biosamples"]["count"], 1)
self.assertEqual(response_obj["experiments"]["count"], 2)

@override_settings(CONFIG_PUBLIC=DISCOVERY_CONFIG_TEST)
def test_public_overview_sex(self):
response = self.dt_authz_counts_get('/api/public_search_fields')
Expand Down