From 29f0f01ad4242ff8533c561bb753cc24bab60ed9 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Wed, 9 Aug 2023 19:48:57 -0700 Subject: [PATCH 1/4] Initial expansion of association table search support --- backend/src/monarch_py/cli.py | 1 + .../implementations/solr/solr_query_utils.py | 15 +++++++++++++-- backend/src/monarch_py/service/solr_service.py | 2 +- backend/src/monarch_py/solr_cli.py | 3 ++- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/backend/src/monarch_py/cli.py b/backend/src/monarch_py/cli.py index 40ba69e15..a34a7a273 100644 --- a/backend/src/monarch_py/cli.py +++ b/backend/src/monarch_py/cli.py @@ -247,6 +247,7 @@ def association_table( help="The association category to get associations for, ex. biolink:GeneToPhenotypicFeatureAssociation", ), q: str = typer.Option(None, "--query", "-q"), + sort: List[str] = typer.Option(None, "--sort", "-s"), limit: int = typer.Option(5, "--limit", "-l"), offset: int = typer.Option(0, "--offset"), fmt: str = typer.Option( diff --git a/backend/src/monarch_py/implementations/solr/solr_query_utils.py b/backend/src/monarch_py/implementations/solr/solr_query_utils.py index 51bb2a0ff..62e101d43 100644 --- a/backend/src/monarch_py/implementations/solr/solr_query_utils.py +++ b/backend/src/monarch_py/implementations/solr/solr_query_utils.py @@ -56,8 +56,9 @@ def build_association_query( if q: # We don't yet have tokenization strategies for the association index, initially we'll limit searching to # the visible fields in an association table plus their ID equivalents and use a wildcard query for substring matching - query.q = f"*{q}*" - query.query_fields = "subject subject_label predicate object object_label" + query.q = q + query.def_type = "edismax" + query.query_fields = association_search_query_fields() if sort: query.sort = ", ".join(sort) if facet_fields: @@ -214,3 +215,13 @@ def entity_query_fields(): since the field list and boosts are currently the same """ return "id^100 name^10 name_t^5 name_ac symbol^10 symbol_t^5 symbol_ac synonym synonym_t synonym_ac" + +def association_search_query_fields(): + """ + Shared field list for free text search on associations (e.g. for the association table) + """ + + return ("subject subject_label^2 subject_closure subject_closure_label" + " predicate " + " object object_label^2 object_closure object_closure_label" + " publications has_evidence primary_knowledge_source aggregator_knowledge_source provided_by ") diff --git a/backend/src/monarch_py/service/solr_service.py b/backend/src/monarch_py/service/solr_service.py index da84d45a5..f6999e6cb 100644 --- a/backend/src/monarch_py/service/solr_service.py +++ b/backend/src/monarch_py/service/solr_service.py @@ -26,7 +26,7 @@ def get(self, id): def query(self, q: SolrQuery) -> SolrQueryResult: url = f"{self.base_url}/{self.core.value}/select?{q.query_string()}" response = requests.get(url) - + logger.debug(f"SolrService.query: {url}") data = json.loads(response.text) if "error" in data: logger.error("Solr error message: " + data["error"]["msg"]) diff --git a/backend/src/monarch_py/solr_cli.py b/backend/src/monarch_py/solr_cli.py index e45807375..d2a25f295 100644 --- a/backend/src/monarch_py/solr_cli.py +++ b/backend/src/monarch_py/solr_cli.py @@ -343,6 +343,7 @@ def association_table( help="The association category to get associations for, ex. biolink:GeneToPhenotypicFeatureAssociation", ), q: str = typer.Option(None, "--query", "-q"), + sort: List[str] = typer.Option(None, "--sort", "-s"), limit: int = typer.Option(5, "--limit", "-l"), offset: int = typer.Option(0, "--offset"), fmt: str = typer.Option( @@ -354,7 +355,7 @@ def association_table( output: str = typer.Option(None, "--output", "-O", help="The path to the output file"), ): solr = get_solr(update=False) - response = solr.get_association_table(entity=entity, category=category, q=q, limit=limit, offset=offset) + response = solr.get_association_table(entity=entity, category=category, sort=sort, q=q, limit=limit, offset=offset) format_output(fmt, response, output) From a7914055b8a9f68145fdbaa4fb4dbfc77fcb9b78 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Wed, 22 Nov 2023 12:15:54 -0800 Subject: [PATCH 2/4] Add text tokenized association fields to association search query --- .../src/monarch_py/implementations/solr/solr_query_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/src/monarch_py/implementations/solr/solr_query_utils.py b/backend/src/monarch_py/implementations/solr/solr_query_utils.py index 62e101d43..969c11ff1 100644 --- a/backend/src/monarch_py/implementations/solr/solr_query_utils.py +++ b/backend/src/monarch_py/implementations/solr/solr_query_utils.py @@ -221,7 +221,7 @@ def association_search_query_fields(): Shared field list for free text search on associations (e.g. for the association table) """ - return ("subject subject_label^2 subject_closure subject_closure_label" - " predicate " - " object object_label^2 object_closure object_closure_label" + return ("subject subject_label^2 subject_label_t subject_closure subject_closure_label subject_closure_label_t" + " predicate predicate_t" + " object object_label^2 object_label_t object_closure object_closure_label object_closure_label_t" " publications has_evidence primary_knowledge_source aggregator_knowledge_source provided_by ") From 6ce23b19d189edcbb05bfbd6b4fc53f55b26cf93 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Wed, 22 Nov 2023 13:08:17 -0800 Subject: [PATCH 3/4] Regenerate fixtures, add partial text match association query tests --- .../solr/solr_implementation.py | 3 +++ .../fixtures/association_counts_response.py | 2 +- .../fixtures/association_query_direct.py | 4 +-- .../fixtures/association_query_indirect.py | 4 +-- .../fixtures/association_table_response.py | 2 +- backend/tests/fixtures/histopheno_response.py | 2 +- .../fixtures/phenotype_explorer_compare.py | 12 ++++----- backend/tests/fixtures/search_response.py | 2 +- .../integration/test_solr_association.py | 25 +++++++++++++++++++ .../fixtures/phenotype-explorer-compare.json | 18 ++++++------- 10 files changed, 51 insertions(+), 23 deletions(-) diff --git a/backend/src/monarch_py/implementations/solr/solr_implementation.py b/backend/src/monarch_py/implementations/solr/solr_implementation.py index 0b2cc581d..778121c5c 100644 --- a/backend/src/monarch_py/implementations/solr/solr_implementation.py +++ b/backend/src/monarch_py/implementations/solr/solr_implementation.py @@ -224,6 +224,7 @@ def get_associations( object_closure: str = None, entity: List[str] = None, direct: bool = None, + q: str = None, offset: int = 0, limit: int = 20, ) -> AssociationResults: @@ -237,6 +238,7 @@ def get_associations( subject_closure: Filter to only associations with the specified term ID as an ancestor of the subject. Defaults to None object_closure: Filter to only associations with the specified term ID as an ancestor of the object. Defaults to None entity: Filter to only associations where the specified entities are the subject or the object. Defaults to None. + q: Query string to search within matches. Defaults to None. offset: Result offset, for pagination. Defaults to 0. limit: Limit results to specified number. Defaults to 20. @@ -254,6 +256,7 @@ def get_associations( subject_closure=subject_closure, object_closure=object_closure, direct=direct, + q=q, offset=offset, limit=limit, ) diff --git a/backend/tests/fixtures/association_counts_response.py b/backend/tests/fixtures/association_counts_response.py index a19f9b0e4..b5dd2bad8 100644 --- a/backend/tests/fixtures/association_counts_response.py +++ b/backend/tests/fixtures/association_counts_response.py @@ -5,7 +5,7 @@ def association_counts_response(): return { "responseHeader": { - "QTime": 2, + "QTime": 1, "params": { "facet.query": [ '(category:"biolink:DiseaseToPhenotypicFeatureAssociation") AND (subject:"MONDO:0020121" OR subject_closure:"MONDO:0020121")', diff --git a/backend/tests/fixtures/association_query_direct.py b/backend/tests/fixtures/association_query_direct.py index c41d974d1..b79a4dfd3 100644 --- a/backend/tests/fixtures/association_query_direct.py +++ b/backend/tests/fixtures/association_query_direct.py @@ -4,7 +4,7 @@ @pytest.fixture def association_query_direct(): return { - "q": "*test:q*", + "q": "test:q", "rows": 100, "start": 100, "facet": True, @@ -19,7 +19,7 @@ def association_query_direct(): "object_closure:TEST\\:0000004", 'subject:"TEST\\:0000005" OR object:"TEST\\:0000005"', ], - "query_fields": "subject subject_label predicate object object_label", + "query_fields": "subject subject_label^2 subject_label_t subject_closure subject_closure_label subject_closure_label_t predicate predicate_t object object_label^2 object_label_t object_closure object_closure_label object_closure_label_t publications has_evidence primary_knowledge_source aggregator_knowledge_source provided_by ", "def_type": "edismax", "q_op": "AND", "mm": "100%", diff --git a/backend/tests/fixtures/association_query_indirect.py b/backend/tests/fixtures/association_query_indirect.py index bbeab8090..901ca154b 100644 --- a/backend/tests/fixtures/association_query_indirect.py +++ b/backend/tests/fixtures/association_query_indirect.py @@ -4,7 +4,7 @@ @pytest.fixture def association_query_indirect(): return { - "q": "*test:q*", + "q": "test:q", "rows": 100, "start": 100, "facet": True, @@ -19,7 +19,7 @@ def association_query_indirect(): "object_closure:TEST\\:0000004", 'subject:"TEST\\:0000005" OR subject_closure:"TEST\\:0000005" OR object:"TEST\\:0000005" OR object_closure:"TEST\\:0000005"', ], - "query_fields": "subject subject_label predicate object object_label", + "query_fields": "subject subject_label^2 subject_label_t subject_closure subject_closure_label subject_closure_label_t predicate predicate_t object object_label^2 object_label_t object_closure object_closure_label object_closure_label_t publications has_evidence primary_knowledge_source aggregator_knowledge_source provided_by ", "def_type": "edismax", "q_op": "AND", "mm": "100%", diff --git a/backend/tests/fixtures/association_table_response.py b/backend/tests/fixtures/association_table_response.py index 820b111af..5e7d23aa6 100644 --- a/backend/tests/fixtures/association_table_response.py +++ b/backend/tests/fixtures/association_table_response.py @@ -5,7 +5,7 @@ def association_table_response(): return { "responseHeader": { - "QTime": 1, + "QTime": 0, "params": { "mm": "100%", "q": "*:*", diff --git a/backend/tests/fixtures/histopheno_response.py b/backend/tests/fixtures/histopheno_response.py index 67fb5632e..44789afcf 100644 --- a/backend/tests/fixtures/histopheno_response.py +++ b/backend/tests/fixtures/histopheno_response.py @@ -5,7 +5,7 @@ def histopheno_response(): return { "responseHeader": { - "QTime": 2, + "QTime": 1, "params": { "facet.query": [ 'object_closure:"HP:0000924"', diff --git a/backend/tests/fixtures/phenotype_explorer_compare.py b/backend/tests/fixtures/phenotype_explorer_compare.py index 48dccccd0..0e0776f24 100644 --- a/backend/tests/fixtures/phenotype_explorer_compare.py +++ b/backend/tests/fixtures/phenotype_explorer_compare.py @@ -5,8 +5,8 @@ def phenotype_explorer_compare(): return { "subject_termset": { - "MP:0002169": {"id": "MP:0002169", "label": "no abnormal phenotype detected (MPO)"}, "MP:0010771": {"id": "MP:0010771", "label": "integument phenotype (MPO)"}, + "MP:0002169": {"id": "MP:0002169", "label": "no abnormal phenotype detected (MPO)"}, }, "object_termset": {"HP:0004325": {"id": "HP:0004325", "label": "Decreased body weight (HPO)"}}, "subject_best_matches": { @@ -69,8 +69,8 @@ def phenotype_explorer_compare(): "HP:0004325": { "match_source": "HP:0004325", "match_source_label": "Decreased body weight (HPO)", - "match_target": "MP:0002169", - "match_target_label": "no abnormal phenotype detected (MPO)", + "match_target": "MP:0010771", + "match_target_label": "integument phenotype (MPO)", "score": 1.4431977534690428, "match_subsumer": None, "match_subsumer_label": None, @@ -78,7 +78,7 @@ def phenotype_explorer_compare(): "subject_id": "HP:0004325", "subject_label": None, "subject_source": None, - "object_id": "MP:0002169", + "object_id": "MP:0010771", "object_label": None, "object_source": None, "ancestor_id": "UPHENO:0001003", @@ -87,10 +87,10 @@ def phenotype_explorer_compare(): "object_information_content": None, "subject_information_content": None, "ancestor_information_content": 1.4431977534690428, - "jaccard_similarity": 0.16216216216216217, + "jaccard_similarity": 0.3333333333333333, "cosine_similarity": None, "dice_similarity": None, - "phenodigm_score": 0.48376861011243283, + "phenodigm_score": 0.6935891563620457, }, } }, diff --git a/backend/tests/fixtures/search_response.py b/backend/tests/fixtures/search_response.py index 07e1cdf67..c096e99ff 100644 --- a/backend/tests/fixtures/search_response.py +++ b/backend/tests/fixtures/search_response.py @@ -5,7 +5,7 @@ def search_response(): return { "responseHeader": { - "QTime": 0, + "QTime": 1, "params": { "mm": "100%", "q": "fanconi", diff --git a/backend/tests/integration/test_solr_association.py b/backend/tests/integration/test_solr_association.py index e9cb04a9a..524fa588b 100644 --- a/backend/tests/integration/test_solr_association.py +++ b/backend/tests/integration/test_solr_association.py @@ -82,3 +82,28 @@ def test_multi_entity_associations(): for c in response[0].associated_categories: if c.counterpart_category == "biolink:Disease": assert c.total > 0 + + +@pytest.mark.parametrize("q", ["eyebrow", "thick", "Thick", "Thick eyebrow", "thick eyebrow", "Thick eyebrow (HPO)"]) +def test_association_search_partial_match(q: str): + si = SolrImplementation() + response = si.get_associations( + q=q, + subject="MONDO:0011518", + category="biolink:DiseaseToPhenotypicFeatureAssociation" + ) + assert response + assert response.total > 0 + assert "HP:0000574" in [item.object for item in response.items] + +@pytest.mark.parametrize("q", ["eyebrow", "thick", "Thick", "Thick eyebrow", "thick eyebrow", "Thick eyebrow (HPO)"]) +def test_association_table_search_partial_match(q: str): + si = SolrImplementation() + response = si.get_association_table( + entity="MONDO:0011518", + category="biolink:DiseaseToPhenotypicFeatureAssociation", + q=q, + ) + assert response + assert response.total > 0 + assert "HP:0000574" in [item.object for item in response.items] \ No newline at end of file diff --git a/frontend/fixtures/phenotype-explorer-compare.json b/frontend/fixtures/phenotype-explorer-compare.json index da14ac58e..a623376e6 100644 --- a/frontend/fixtures/phenotype-explorer-compare.json +++ b/frontend/fixtures/phenotype-explorer-compare.json @@ -1,12 +1,12 @@ { "subject_termset": { - "MP:0002169": { - "id": "MP:0002169", - "label": "no abnormal phenotype detected (MPO)" - }, "MP:0010771": { "id": "MP:0010771", "label": "integument phenotype (MPO)" + }, + "MP:0002169": { + "id": "MP:0002169", + "label": "no abnormal phenotype detected (MPO)" } }, "object_termset": { @@ -75,8 +75,8 @@ "HP:0004325": { "match_source": "HP:0004325", "match_source_label": "Decreased body weight (HPO)", - "match_target": "MP:0002169", - "match_target_label": "no abnormal phenotype detected (MPO)", + "match_target": "MP:0010771", + "match_target_label": "integument phenotype (MPO)", "score": 1.4431977534690428, "match_subsumer": null, "match_subsumer_label": null, @@ -84,7 +84,7 @@ "subject_id": "HP:0004325", "subject_label": null, "subject_source": null, - "object_id": "MP:0002169", + "object_id": "MP:0010771", "object_label": null, "object_source": null, "ancestor_id": "UPHENO:0001003", @@ -93,10 +93,10 @@ "object_information_content": null, "subject_information_content": null, "ancestor_information_content": 1.4431977534690428, - "jaccard_similarity": 0.16216216216216217, + "jaccard_similarity": 0.3333333333333333, "cosine_similarity": null, "dice_similarity": null, - "phenodigm_score": 0.48376861011243283 + "phenodigm_score": 0.6935891563620457 } } }, From 5c2311a6109f9bb02f0f0b547d349d76db9e5682 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Wed, 22 Nov 2023 13:11:34 -0800 Subject: [PATCH 4/4] Formatting --- .../implementations/solr/solr_query_utils.py | 11 +++++++---- backend/tests/integration/test_solr_association.py | 7 +++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/backend/src/monarch_py/implementations/solr/solr_query_utils.py b/backend/src/monarch_py/implementations/solr/solr_query_utils.py index 969c11ff1..df4b9af94 100644 --- a/backend/src/monarch_py/implementations/solr/solr_query_utils.py +++ b/backend/src/monarch_py/implementations/solr/solr_query_utils.py @@ -216,12 +216,15 @@ def entity_query_fields(): """ return "id^100 name^10 name_t^5 name_ac symbol^10 symbol_t^5 symbol_ac synonym synonym_t synonym_ac" + def association_search_query_fields(): """ Shared field list for free text search on associations (e.g. for the association table) """ - return ("subject subject_label^2 subject_label_t subject_closure subject_closure_label subject_closure_label_t" - " predicate predicate_t" - " object object_label^2 object_label_t object_closure object_closure_label object_closure_label_t" - " publications has_evidence primary_knowledge_source aggregator_knowledge_source provided_by ") + return ( + "subject subject_label^2 subject_label_t subject_closure subject_closure_label subject_closure_label_t" + " predicate predicate_t" + " object object_label^2 object_label_t object_closure object_closure_label object_closure_label_t" + " publications has_evidence primary_knowledge_source aggregator_knowledge_source provided_by " + ) diff --git a/backend/tests/integration/test_solr_association.py b/backend/tests/integration/test_solr_association.py index 524fa588b..58d23c13b 100644 --- a/backend/tests/integration/test_solr_association.py +++ b/backend/tests/integration/test_solr_association.py @@ -88,14 +88,13 @@ def test_multi_entity_associations(): def test_association_search_partial_match(q: str): si = SolrImplementation() response = si.get_associations( - q=q, - subject="MONDO:0011518", - category="biolink:DiseaseToPhenotypicFeatureAssociation" + q=q, subject="MONDO:0011518", category="biolink:DiseaseToPhenotypicFeatureAssociation" ) assert response assert response.total > 0 assert "HP:0000574" in [item.object for item in response.items] + @pytest.mark.parametrize("q", ["eyebrow", "thick", "Thick", "Thick eyebrow", "thick eyebrow", "Thick eyebrow (HPO)"]) def test_association_table_search_partial_match(q: str): si = SolrImplementation() @@ -106,4 +105,4 @@ def test_association_table_search_partial_match(q: str): ) assert response assert response.total > 0 - assert "HP:0000574" in [item.object for item in response.items] \ No newline at end of file + assert "HP:0000574" in [item.object for item in response.items]