Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update curated papers, add new PDB provider (FURNA). #1193

Merged
merged 27 commits into from
Oct 19, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
01b9685
update curated papers list
Oct 1, 2024
1824a51
update curated papers list with papers identified on Aug 9 batch. Cur…
Oct 7, 2024
c3967ac
Add contributor information, update regex pattern and examples for pe…
Oct 7, 2024
129f370
add example_extras to pephub
Oct 7, 2024
17a3490
amend 'relevancy_type' and 'relevant' data for two entries in curate…
Oct 7, 2024
4ab43a2
remove PEPhub as a prefix
Oct 8, 2024
65f1d89
update curated_papers with orcid and date
Oct 9, 2024
541b9d0
Merge branch 'main' into update_curated_papers
cthoyt Oct 10, 2024
31a6106
add unit test for validating input in curated_papers file.
Oct 11, 2024
a2f731b
fix style issues
Oct 11, 2024
c74c63a
Refactor
cthoyt Oct 12, 2024
0d7a8bf
Add example full rows
cthoyt Oct 12, 2024
3587530
Centralize code into reusable module
cthoyt Oct 12, 2024
b3e6c0a
Add TODO
cthoyt Oct 12, 2024
7bcf7e9
Add documentation
cthoyt Oct 12, 2024
7e4cc4d
Update curation.rst
cthoyt Oct 12, 2024
a9b5cac
Merge branch 'main' into pr/1193
cthoyt Oct 12, 2024
50ec62b
remove prefix column from TSV file and simplify relevancy check
Oct 15, 2024
7816e3b
Add docstrings for CurationRelevance and re-order TSV file
Oct 17, 2024
462f7f8
Handle empty inputs for notes and pr_added fields
Oct 17, 2024
4948f46
Fix typo
bgyori Oct 17, 2024
e29deef
Merge branch 'main' into update_curated_papers
bgyori Oct 17, 2024
387ce61
Merge branch 'main' into update_curated_papers
bgyori Oct 18, 2024
0169e36
add publication info for FURNA
Oct 19, 2024
98eb707
Update test_curated_papers.py
cthoyt Oct 19, 2024
269204b
Update literature.py
cthoyt Oct 19, 2024
ad69445
Merge branch 'main' into update_curated_papers
cthoyt Oct 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/source/curation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@ There are several curation workflows implemented in :mod:`bioregistry.curation`.
Bulk Import
-----------
.. automodapi:: bioregistry.curation.bulk_import

Semi-automated Literature Curation
----------------------------------
.. automodapi:: bioregistry.curation.literature
1 change: 1 addition & 0 deletions src/bioregistry/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
COLLECTIONS_PATH = DATA_DIRECTORY / "collections.json"
MISMATCH_PATH = DATA_DIRECTORY / "mismatch.json"
CONTEXTS_PATH = DATA_DIRECTORY / "contexts.json"
CURATED_PAPERS_PATH = DATA_DIRECTORY / "curated_papers.tsv"

BIOREGISTRY_MODULE = pystow.module("bioregistry")

Expand Down
1 change: 0 additions & 1 deletion src/bioregistry/curation/curated_papers.csv

This file was deleted.

44 changes: 44 additions & 0 deletions src/bioregistry/curation/literature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Utilities for working with the data produced by the semi-automated curation workflow."""

import enum

import click

__all__ = [
"CurationRelevance",
"COLUMNS",
]

# Column names expected in the curated papers TSV file. The test suite
# asserts that every one of these keys is present in each parsed row.
COLUMNS = [
    # PubMed identifier of the paper (validated as digits only)
    "pmid",
    # "1" or "0" flag (the only two values the tests accept)
    "relevant",
    # must be the name of a member of CurationRelevance
    "relevancy_type",
    # ORCID identifier (presumably the curator's - confirm)
    "orcid",
    # date in YYYY-MM-DD format
    "date_curated",
    # free-text remarks about the curation decision
    "notes",
    # links back to the PR where curations were done
    "pr_added",
]


class CurationRelevance(str, enum.Enum):
    """An enumeration for curation relevance.

    The member *names* form the controlled vocabulary for the
    ``relevancy_type`` column of the curated papers TSV file; the test suite
    validates that column against ``{member.name for member in CurationRelevance}``.

    NOTE(review): because of the ``str`` mixin, :func:`enum.auto` does not
    produce the member name as its value, so look members up by name
    (``CurationRelevance["new_prefix"]``) rather than by value.
    """

    #: A resource for new primary identifiers
    new_prefix = enum.auto()
    #: A resource that provides or resolves identifiers of an existing prefix
    new_provider = enum.auto()
    #: Presumably a new publication for a resource that is already
    #: registered - TODO confirm
    new_publication = enum.auto()
    #: The paper describes a resource, but not one that issues identifiers
    #: (wording inferred from the curation notes - confirm)
    not_identifiers_resource = enum.auto()
    #: Presumably the described resource has no usable website - TODO confirm
    no_website = enum.auto()
    #: The resource is already present in the Bioregistry
    existing = enum.auto()
    #: It is unclear how (or whether) the resource should be curated
    unclear = enum.auto()
    #: Presumably not relevant for a reason not covered by the other
    #: categories - TODO confirm
    irrelevant_other = enum.auto()


@click.command()
def main():
    """Import data from the literature curation into the Bioregistry."""
    # Placeholder entry point: the import workflow is not written yet, so the
    # command fails loudly rather than silently doing nothing.
    raise NotImplementedError


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
9 changes: 9 additions & 0 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -85828,6 +85828,15 @@
"prefix": "pdbj",
"uri_format": "http://service.pdbj.org/mine/Detail?PDBID=$1&PAGEID=Summary"
},
"providers": [
{
"code": "furna",
"description": "FURNA (Functions of RNAs) is a database of ligand-RNA interactions and Gene Ontology annotations for RNAs in the Protein Data Bank (PDB).",
"homepage": "https://seq2fun.dcmb.med.umich.edu/furna/",
"name": "furna",
"uri_format": "https://seq2fun.dcmb.med.umich.edu/furna/pdb.cgi?pdbid=$1"
}
],
"publications": [
{
"doi": "10.1002/pro.4211",
Expand Down
21 changes: 21 additions & 0 deletions src/bioregistry/data/curated_papers.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
pmid relevant relevancy_type prefix notes pr_added orcid date_curated
39104285 1 new_provider uniprot Provider for UniProt IDs, issue with curation due to multiple URI formats depending on plant species 1193 0009-0009-5240-7463 2024-09-24
39074139 1 new_provider pdb Resolver for PDB IDs 1193 0009-0009-5240-7463 2024-09-24
bgyori marked this conversation as resolved.
Show resolved Hide resolved
39014503 0 no_website 1193 0009-0009-5240-7463 2024-09-25
39047988 0 irrelevant_other 0009-0009-5240-7463 2024-09-25
39115390 0 irrelevant_other 0009-0009-5240-7463 2024-09-26
39095357 0 irrelevant_other 0009-0009-5240-7463 2024-09-26
39084442 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-27
38991851 1 unclear identifiers for sharing, retrieving, and validating sample metadata. Unclear if this should be curated as a prefix, provider or a separate registry 0009-0009-5240-7463 2024-09-28
38991828 0 irrelevant_other 0009-0009-5240-7463 2024-09-28
39049520 0 not_identifiers_resource 0009-0009-5240-7463 2024-09-30
39104826 1 existing Already present in the bioregistry as a provider for mesh prefix 0009-0009-5240-7463 2024-10-01
39050757 0 irrelevant_other 0009-0009-5240-7463 2024-10-01
39064021 0 irrelevant_other 0009-0009-5240-7463 2024-10-01
39028894 0 not_identifiers_resource 0009-0009-5240-7463 2024-10-04
39044201 0 not_identifiers_resource Potential resource for rare diseases identifiers, but not identifier information 0009-0009-5240-7463 2024-10-04
39088253 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
39119155 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
39005357 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
39044130 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
39010878 0 irrelevant_other 0009-0009-5240-7463 2024-10-05
62 changes: 62 additions & 0 deletions tests/test_curated_papers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-

"""Test for checking the integrity of the curated_papers TSV file."""

import csv
import unittest
from datetime import datetime

import bioregistry
from bioregistry.constants import CURATED_PAPERS_PATH, ORCID_PATTERN
from bioregistry.curation.literature import COLUMNS, CurationRelevance


class TestTSV(unittest.TestCase):
    """Tests for curated_papers tsv file."""

    def setUp(self):
        """Set up the test case."""
        # The TSV stores the *names* of the enum members, so build the
        # allowed vocabulary from names rather than values.
        self.relevancy_types = {r.name for r in CurationRelevance}

    def validate_row(self, row):
        """Validate a single row from the TSV file.

        :param row: A mapping from column name to cell text, as produced by
            :class:`csv.DictReader`. Cells missing from a short line are
            ``None``; empty cells are the empty string.
        """
        for field in COLUMNS:
            self.assertIn(field, row)

        self.assertTrue(row["pmid"].isdigit(), msg="PubMed identifier should be an integer")

        # The pull request is optional: only check the format when a value
        # is given (an empty cell or a missing trailing cell is accepted).
        pr_added = row["pr_added"]
        if pr_added:
            self.assertTrue(pr_added.isdigit(), msg="Pull Request should be an integer")

        # Validate relevant is 0 or 1
        self.assertIn(row["relevant"], ["0", "1"])

        if row["relevant"] == "1":
            # Use .get() so that a missing column is reported through the
            # assertion message below instead of surfacing as a KeyError.
            prefix = row.get("prefix")
            self.assertIsNotNone(prefix, msg="prefix should be set for all relevant entries")
            self.assertNotEqual("", prefix, msg="prefix should not be empty for relevant entries")
            self.assertEqual(
                bioregistry.normalize_prefix(prefix),
                prefix,
                msg="prefix should be standardized for relevant entries",
            )

        # Validate relevancy_type is in relevancy_vocab
        self.assertIn(row["relevancy_type"], self.relevancy_types)

        self.assertRegex(row["orcid"], ORCID_PATTERN)

        # Notes are optional. DictReader yields None for a cell that is
        # missing from a short line, so normalize before the quote checks.
        notes = row["notes"] or ""
        self.assertFalse(notes.startswith('"'))
        self.assertFalse(notes.endswith('"'))

        # Validate date_curated format
        try:
            datetime.strptime(row["date_curated"], "%Y-%m-%d")
        except ValueError:
            self.fail("date_curated should follow format YYYY-MM-DD")

    def test_tsv_file(self):
        """Tests all rows in TSV file are valid."""
        # newline="" is what the csv module asks for when handing it a file
        # object; the explicit encoding keeps the test platform-independent.
        with CURATED_PAPERS_PATH.open(newline="", encoding="utf-8") as tsv_file:
            reader = csv.DictReader(tsv_file, delimiter="\t")
            # ``row`` is the 1-based index of the data row (header excluded)
            for row, data in enumerate(reader, start=1):
                with self.subTest(row=row, data=data):
                    self.validate_row(data)
Loading