Mock PubMed API and update filtering logic in paper ranking model. (#1291)

This pull request addresses two issues related to the paper ranking
model and associated testing.

**1. Mocking `pubmed_client` to improve test speed and stability**
- Mocks the `pubmed_client` in the unit test to avoid API calls,
improving test speed and stability.
- A new file, `mock_pubmed_data.json`, has been added, containing mock
data to simulate PubMed API responses for fetching metadata.
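In isolation, the mocking pattern can be sketched like this (the one-record fixture and the standalone `MagicMock` are illustrative; the actual test patches `pubmed_client.get_metadata_for_ids` with `@patch` and loads the full `mock_pubmed_data.json`):

```python
import json
from unittest.mock import MagicMock

# Hypothetical one-record fixture mirroring the shape of mock_pubmed_data.json:
# PubMed IDs (as strings) mapped to title/abstract metadata.
MOCK_PUBMED_JSON = """
{
  "10592255": {"title": "The ENZYME database in 2000", "abstract": "..."}
}
"""

# Stand-in for pubmed_client.get_metadata_for_ids: a MagicMock whose return
# value comes from the JSON fixture, so no network request is ever made.
mock_get_metadata_for_ids = MagicMock(return_value=json.loads(MOCK_PUBMED_JSON))

metadata = mock_get_metadata_for_ids(["10592255"])
print(metadata["10592255"]["title"])  # The ENZYME database in 2000
```

Because the mock returns canned data, the test no longer depends on PubMed availability or rate limits.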

**2. Update filtering logic for curated publications**
- Modified the logic to exclude PubMed IDs from `curated_papers_df`, `publication_df`, and `curation_df` when fetching new PubMed papers, ensuring no duplicates from already curated or published entries.
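The exclusion-set construction can be sketched with toy data (the three frames below are illustrative stand-ins; the real ones hold Bioregistry curation and publication records):

```python
import pandas as pd

# Toy stand-ins for the three DataFrames named above, each with a
# "pubmed" column of PubMed IDs.
curated_papers_df = pd.DataFrame({"pubmed": [11, 22]})
publication_df = pd.DataFrame({"pubmed": [22, 33]})
curation_df = pd.DataFrame({"pubmed": [44]})

# Union of the three "pubmed" columns: any PMID already seen in any of
# the frames is excluded from the next fetch, preventing duplicates.
curated_pmids = set(curated_papers_df["pubmed"]).union(
    publication_df["pubmed"], curation_df["pubmed"]
)
print(sorted(curated_pmids))  # [11, 22, 33, 44]
```

`set.union` accepts any iterables, so the two extra `Series` can be passed directly without converting them to sets first.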
nagutm authored Dec 7, 2024
1 parent a22f2e1 commit a60ff2a
Showing 3 changed files with 37 additions and 11 deletions.
4 changes: 3 additions & 1 deletion src/bioregistry/analysis/paper_ranking.py
@@ -397,7 +397,9 @@ def main(bioregistry_file: Path, start_date: str, end_date: str) -> None:
     importances_df.to_csv(importance_path, sep="\t", index=False)
 
     # These have already been curated and will therefore be filtered out
-    curated_pmids = set(curated_papers_df["pubmed"])
+    curated_pmids = set(curated_papers_df["pubmed"]).union(
+        publication_df["pubmed"], curation_df["pubmed"]
+    )
 
     new_pub_df = fetch_pubmed_papers(curated_pmids)
     if not new_pub_df.empty:
22 changes: 22 additions & 0 deletions tests/mock_pubmed_data.json
@@ -0,0 +1,22 @@
{
"10592255": {
"title": "The ENZYME database in 2000",
"abstract": "The ENZYME database is a repository of information related to the nomenclature of enzymes. In recent years it has became an indispensable resource for the development of metabolic databases. The current version contains information on 3705 enzymes. It is available through the ExPASy WWW server (http://www.expasy.ch/enzyme/ )."
},
"38991851": {
"title": "PEPhub: a database, web interface, and API for editing, sharing, and validating biological sample metadata",
"abstract": "Background: As biological data increase, we need additional infrastructure to share them and promote interoperability. While major effort has been put into sharing data, relatively less emphasis is placed on sharing metadata. Yet, sharing metadata is also important and in some ways has a wider scope than sharing data themselves. Results: Here, we present PEPhub, an approach to improve sharing and interoperability of biological metadata. PEPhub provides an API, natural-language search, and user-friendly web-based sharing and editing of sample metadata tables. We used PEPhub to process more than 100,000 published biological research projects and index them with fast semantic natural-language search. PEPhub thus provides a fast and user-friendly way to finding existing biological research data or to share new data. Availability: https://pephub.databio.org."
},
"39005357": {
"title": "Alzheimer's Disease Knowledge Graph Enhances Knowledge Discovery and Disease Prediction",
"abstract": "Background: Alzheimer's disease (AD), a progressive neurodegenerative disorder, continues to increase in prevalence without any effective treatments to date. In this context, knowledge graphs (KGs) have emerged as a pivotal tool in biomedical research, offering new perspectives on drug repurposing and biomarker discovery by analyzing intricate network structures. Our study seeks to build an AD-specific knowledge graph, highlighting interactions among AD, genes, variants, chemicals, drugs, and other diseases. The goal is to shed light on existing treatments, potential targets, and diagnostic methods for AD, thereby aiding in drug repurposing and the identification of biomarkers. Results: We annotated 800 PubMed abstracts and leveraged GPT-4 for text augmentation to enrich our training data for named entity recognition (NER) and relation classification. A comprehensive data mining model, integrating NER and relationship classification, was trained on the annotated corpus. This model was subsequently applied to extract relation triplets from unannotated abstracts. To enhance entity linking, we utilized a suite of reference biomedical databases and refine the linking accuracy through abbreviation resolution. As a result, we successfully identified 3,199,276 entity mentions and 633,733 triplets, elucidating connections between 5,000 unique entities. These connections were pivotal in constructing a comprehensive Alzheimer's Disease Knowledge Graph (ADKG). We also integrated the ADKG constructed after entity linking with other biomedical databases. The ADKG served as a training ground for Knowledge Graph Embedding models with the high-ranking predicted triplets supported by evidence, underscoring the utility of ADKG in generating testable scientific hypotheses. Further application of ADKG in predictive modeling using the UK Biobank data revealed models based on ADKG outperforming others, as evidenced by higher values in the areas under the receiver operating characteristic (ROC) curves. Conclusion: The ADKG is a valuable resource for generating hypotheses and enhancing predictive models, highlighting its potential to advance AD's disease research and treatment strategies."
},
"39010878": {
"title": "MotifbreakR v2: extended capability and database integration",
"abstract": "MotifbreakR is a software tool that scans genetic variants against position weight matrices of transcription factors (TF) to determine the potential for the disruption of TF binding at the site of the variant. It leverages the Bioconductor suite of software packages and annotations to operate across a diverse array of genomes and motif databases. Initially developed to interrogate the effect of single nucleotide variants (common and rare SNVs) on potential TF binding sites, in motifbreakR v2, we have updated the functionality. New features include the ability to query other types of more complex genetic variants, such as short insertions and deletions (indels). This function allows modeling a more extensive array of variants that may have more significant effects on TF binding. Additionally, while TF binding is based partly on sequence preference, predictions of TF binding based on sequence preference alone can indicate many more potential binding events than observed. Adding information from DNA-binding sequencing datasets lends confidence to motif disruption prediction by demonstrating TF binding in cell lines and tissue types. Therefore, motifbreakR implements querying the ReMap2022 database for evidence that a TF matching the disrupted motif binds over the disrupting variant. Finally, in motifbreakR, in addition to the existing interface, we have implemented an R/Shiny graphical user interface to simplify and enhance access to researchers with different skill sets."
},
"39014503": {
"title": "CREdb: A comprehensive database of Cis-Regulatory Elements and their activity in human cells and tissues",
"abstract": "Background: Cis-regulatory elements (CREs) play a pivotal role in gene expression regulation, allowing cells to serve diverse functions and respond to external stimuli. Understanding CREs is essential for personalized medicine and disease research, as an increasing number of genetic variants associated with phenotypes and diseases overlap with CREs. However, existing databases often focus on subsets of regulatory elements and present each identified instance of element individually, confounding the effort to obtain a comprehensive view. To address this gap, we have created CREdb, a comprehensive database with over 10 million human regulatory elements across 1,058 cell types and 315 tissues harmonized from different data sources. We curated and aligned the cell types and tissues to standard ontologies for efficient data query. Results: Data from 11 sources were curated and mapped to standard ontological terms. 11,223,434 combined elements are present in the final database, and these were merged into 5,666,240 consensus elements representing the combined ranges of the individual elements informed by their overlap. Each consensus element contains curated metadata including the number of elements supporting it and a hash linking to the source databases. The inferred activity of each consensus element in various cell-type and tissue context is also provided. Examples presented here show the potential utility of CREdb in annotating non-coding genetic variants and informing chromatin accessibility profiling analysis. Conclusions: We developed CREdb, a comprehensive database of CREs, to simplify the analysis of CREs by providing a unified framework for researchers. CREdb compiles consensus ranges for each element by integrating the information from all instances identified across various source databases. This unified database facilitates the functional annotation of non-coding genetic variants and complements chromatin accessibility profiling analysis. CREdb will serve as an important resource in expanding our knowledge of the epigenome and its role in human diseases."
}
}
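The fixture's contract can be checked with a small stdlib-only sketch (the trimmed two-record sample below is illustrative; the real file holds five full PubMed records):

```python
import json

# Trimmed, hypothetical sample with the same shape as the fixture above.
sample = json.loads("""
{
  "10592255": {"title": "The ENZYME database in 2000", "abstract": "..."},
  "39010878": {"title": "MotifbreakR v2: extended capability and database integration", "abstract": "..."}
}
""")

# Each entry is keyed by a PMID string and carries the title and abstract
# fields the ranking pipeline reads from the metadata response.
for pmid, record in sample.items():
    assert pmid.isdigit(), "keys are PubMed IDs"
    assert {"title", "abstract"} <= record.keys()

print(len(sample))  # 2
```

Keeping the fixture keyed by PMID string means it can be dropped in directly as the mocked return value without any reshaping.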
22 changes: 12 additions & 10 deletions tests/test_paper_ranking.py
@@ -1,5 +1,6 @@
 """Test for checking the paper ranking model."""
 
+import json
 import datetime
 import unittest
 from pathlib import Path
@@ -18,16 +19,24 @@ def setUp(self):
         root_dir = Path(__file__).resolve().parent.parent
         self.bioregistry_file = root_dir / "src" / "bioregistry" / "data" / "bioregistry.json"
         self.output_directory = root_dir / "exports" / "analyses" / "paper_ranking"
+        self.mock_data_path = root_dir / "tests" / "mock_pubmed_data.json"
 
-        # Check if bioregistry file exists
+        # Check if bioregistry and mock data files exist
+        self.assertTrue(self.mock_data_path.exists(), "Mock data file does not exist")
         self.assertTrue(self.bioregistry_file.exists(), "Bioregistry file does not exist")
 
-    @patch("pandas.DataFrame.to_csv")
-    def test_pipeline(self, mock_to_csv):
+    @patch("bioregistry.analysis.paper_ranking.pubmed_client.get_metadata_for_ids")
+    def test_pipeline(self, mock_get_metadata_for_ids):
         """Smoke test to ensure pipeline runs successfully without error."""
         start_date = datetime.date.today().isoformat()
         end_date = datetime.date.today().isoformat()
 
+        # Mock return value for get_metadata_for_ids
+        with open(self.mock_data_path, "r", encoding="utf-8") as file:
+            mock_data = json.load(file)
+
+        mock_get_metadata_for_ids.return_value = mock_data
 
         runner = CliRunner()
 
         result = runner.invoke(
@@ -56,13 +65,6 @@ def test_pipeline(self, mock_to_csv):
         importances_file = self.output_directory.joinpath("importances.tsv")
         self.assertTrue(importances_file.exists(), f"{importances_file} was not created")
 
-        # Check call count of to_csv is 3 for evaluation, importances and prediction file.
-        self.assertEqual(
-            mock_to_csv.call_count,
-            3,
-            f"Expected to_csv call count is 3. It was called {mock_to_csv.call_count} times",
-        )
-


if __name__ == "__main__":
unittest.main()
