diff --git a/bigbio/biodatasets/an_em/an_em.py b/bigbio/biodatasets/an_em/an_em.py index f3460349..4d956684 100644 --- a/bigbio/biodatasets/an_em/an_em.py +++ b/bigbio/biodatasets/an_em/an_em.py @@ -29,9 +29,10 @@ import bigbio.utils.parsing as parse from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ANATOMY] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/anat_em/anat_em.py b/bigbio/biodatasets/anat_em/anat_em.py index c74125c2..c58f6fb1 100644 --- a/bigbio/biodatasets/anat_em/anat_em.py +++ b/bigbio/biodatasets/anat_em/anat_em.py @@ -27,9 +27,10 @@ import bigbio.utils.parsing as parsing from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ANATOMY] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py index dee74515..53bc81a9 100644 --- a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py +++ b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py @@ -21,11 +21,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "ask_a_patient" +_TAGS = [Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bc5cdr/bc5cdr.py b/bigbio/biodatasets/bc5cdr/bc5cdr.py index 47af693c..45ed49a7 100644 --- a/bigbio/biodatasets/bc5cdr/bc5cdr.py +++ b/bigbio/biodatasets/bc5cdr/bc5cdr.py @@ -31,10 +31,11 @@ from bigbio.utils import schemas from 
bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.DISEASE, Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py b/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py index 2e9ca9e9..50543a18 100644 --- a/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py +++ b/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.COVID] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py b/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py index 05db39fd..afab0059 100644 --- a/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py +++ b/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py @@ -27,10 +27,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.LEXICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bio_simlex/bio_simlex.py b/bigbio/biodatasets/bio_simlex/bio_simlex.py index 6b8fc6f8..2a9cecea 100644 --- a/bigbio/biodatasets/bio_simlex/bio_simlex.py +++ b/bigbio/biodatasets/bio_simlex/bio_simlex.py @@ -27,10 +27,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: 
Add BibTeX citation +_TAGS = [Tags.LEXICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py index 4672c3f5..7fd13d83 100644 --- a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py +++ b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py @@ -51,9 +51,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py index f5668647..685ac4e4 100644 --- a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py +++ b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py @@ -32,9 +32,16 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [ + Tags.YESNO, + Tags.FACTOID, + Tags.FACTOID_LIST, + Tags.ABSTRACTIVE, + Tags.EXTRACTIVE, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py b/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py index 8012f380..2aeeb729 100644 --- a/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py +++ b/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GRANT] _LANGUAGES = [Lang.EN] _PUBMED = True 
_LOCAL = True diff --git a/bigbio/biodatasets/bioinfer/bioinfer.py b/bigbio/biodatasets/bioinfer/bioinfer.py index 8a71bbf5..dd1a7cfd 100644 --- a/bigbio/biodatasets/bioinfer/bioinfer.py +++ b/bigbio/biodatasets/bioinfer/bioinfer.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.PPI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py index 75117290..282050c6 100644 --- a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py +++ b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.HOW, Tags.WHY] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/biomrc/biomrc.py b/bigbio/biodatasets/biomrc/biomrc.py index a80f0955..43dd1f72 100644 --- a/bigbio/biodatasets/biomrc/biomrc.py +++ b/bigbio/biodatasets/biomrc/biomrc.py @@ -31,9 +31,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.MULTIPLE_CHOICE, Tags.MRC, Tags.CLOZE_TEST] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py b/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py index 1f32a25d..4e330313 100644 --- 
a/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py +++ b/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py @@ -21,10 +21,15 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import brat_parse_to_bigbio_kb, parse_brat_file +# http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=4605&copyownerid=320 +# Task 1. Event detection and characterization +# Task 2. Event argument recognition +# Task 3. Recognition of negations and speculations +_TAGS = [Tags.PPI, Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py b/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py index 4c2d5991..7a6ea0ab 100644 --- a/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py +++ b/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py @@ -21,13 +21,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_epi" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.EPIGENETICS, Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py b/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py index 112c03a4..3eab0c71 100644 --- a/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py +++ b/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants
import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_ge" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py index 1d640ac3..775c56fc 100644 --- a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py +++ b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py @@ -20,13 +20,21 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_id" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [ + Tags.DISEASE, + Tags.GENE, + Tags.CHEMICAL, + Tags.SPECIES, + Tags.SPECULATION, + Tags.NEGATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py b/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py index d6539fbb..92a9c3b2 100644 --- a/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py +++ b/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_rel" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.PART_OF, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py index 
f99326ec..6365cd7e 100644 --- a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py +++ b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py @@ -20,12 +20,22 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_cg" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [ + Tags.DISEASE, + Tags.CANCER, + Tags.TISSUE, + Tags.SPECIES, + Tags.CELL, + Tags.GENE, + Tags.CHEMICAL, + Tags.PATHWAY_CURATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py b/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py index 93dfa58f..74a76bde 100644 --- a/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py +++ b/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_ge" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py index 277dfcec..bc61c02e 100644 --- a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py +++ b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py @@ -21,13 +21,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_gro" 
_SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.GENE, Tags.SPECIES, Tags.CELL, Tags.TISSUE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py index 69fd79f9..cee27dfa 100644 --- a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py +++ b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py @@ -20,12 +20,19 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_pc" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [ + Tags.GENE, + Tags.CHEMICAL, + Tags.PATHWAY_CURATION, + Tags.NEGATION, + Tags.SPECULATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py index 026c8337..f399df66 100644 --- a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py +++ b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2019_bb" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biored/biored.py b/bigbio/biodatasets/biored/biored.py index 250ce837..b45bdacd 100644 --- a/bigbio/biodatasets/biored/biored.py +++ b/bigbio/biodatasets/biored/biored.py @@ -26,10 +26,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, 
Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.VARIANT, Tags.PPI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biorelex/biorelex.py b/bigbio/biodatasets/biorelex/biorelex.py index f6dac279..1b1d2a12 100644 --- a/bigbio/biodatasets/biorelex/biorelex.py +++ b/bigbio/biodatasets/biorelex/biorelex.py @@ -35,10 +35,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.VARIANT, Tags.NEGATION, Tags.SPECULATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bioscope/bioscope.py b/bigbio/biodatasets/bioscope/bioscope.py index 5af2077a..9e7d2e22 100644 --- a/bigbio/biodatasets/bioscope/bioscope.py +++ b/bigbio/biodatasets/bioscope/bioscope.py @@ -35,9 +35,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.NEGATION, Tags.SPECULATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biosses/biosses.py b/bigbio/biodatasets/biosses/biosses.py index 059a0306..7f7e72ee 100644 --- a/bigbio/biodatasets/biosses/biosses.py +++ b/bigbio/biodatasets/biosses/biosses.py @@ -28,11 +28,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "biosses" +_TAGS = [Tags.SENTENCE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False 
diff --git a/bigbio/biodatasets/cadec/cadec.py b/bigbio/biodatasets/cadec/cadec.py index 3eb3f6da..f9c60446 100644 --- a/bigbio/biodatasets/cadec/cadec.py +++ b/bigbio/biodatasets/cadec/cadec.py @@ -35,9 +35,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [Tags.DISEASE, Tags.ADR, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cantemist/cantemist.py b/bigbio/biodatasets/cantemist/cantemist.py index 6a140d2a..9b4af046 100644 --- a/bigbio/biodatasets/cantemist/cantemist.py +++ b/bigbio/biodatasets/cantemist/cantemist.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.CANCER, Tags.DISEASE, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cas/cas.py b/bigbio/biodatasets/cas/cas.py index d563be29..6b45d7d2 100644 --- a/bigbio/biodatasets/cas/cas.py +++ b/bigbio/biodatasets/cas/cas.py @@ -6,9 +6,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.POS] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/cellfinder/cellfinder.py b/bigbio/biodatasets/cellfinder/cellfinder.py index 935a919c..04b36b52 100644 --- a/bigbio/biodatasets/cellfinder/cellfinder.py +++ b/bigbio/biodatasets/cellfinder/cellfinder.py @@ -28,9 +28,10 @@ import bigbio.utils.parsing as parsing from bigbio.utils import schemas from 
bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE, Tags.CELL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py index c6e96c6f..aeb5f48b 100644 --- a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py +++ b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py @@ -21,10 +21,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import parse_brat_file +_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemdner/chemdner.py b/bigbio/biodatasets/chemdner/chemdner.py index 7b0b974f..4e237b6b 100644 --- a/bigbio/biodatasets/chemdner/chemdner.py +++ b/bigbio/biodatasets/chemdner/chemdner.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.CHEMICAL, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemprot/chemprot.py b/bigbio/biodatasets/chemprot/chemprot.py index 620a1a44..fc2aa679 100644 --- a/bigbio/biodatasets/chemprot/chemprot.py +++ b/bigbio/biodatasets/chemprot/chemprot.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from 
bigbio.utils.license import Licenses +_TAGS = [Tags.CHEMICAL, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chia/chia.py b/bigbio/biodatasets/chia/chia.py index 2328a459..b837bb91 100644 --- a/bigbio/biodatasets/chia/chia.py +++ b/bigbio/biodatasets/chia/chia.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py index 0713a87f..28169f96 100644 --- a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py +++ b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py @@ -24,9 +24,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -59,11 +60,11 @@ _URLS = { _DATASETNAME: [ - "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusCorpus.zip"] + "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusCorpus.zip" + ] } -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, - Tasks.NAMED_ENTITY_DISAMBIGUATION] +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] _SOURCE_VERSION = "1.0.0" @@ -72,8 +73,8 @@ class CitationGIATestCollection(datasets.GeneratorBasedBuilder): """ - The Citation GIA Test Collection was 
recently created for gene indexing at the NLM and includes - 151 PubMed abstracts with both mention-level and document-level annotations. + The Citation GIA Test Collection was recently created for gene indexing at the NLM and includes + 151 PubMed abstracts with both mention-level and document-level annotations. They are selected because both have a focus on human genes. """ @@ -94,7 +95,7 @@ class CitationGIATestCollection(datasets.GeneratorBasedBuilder): description="citation_gia_test_collection BigBio schema", schema="bigbio_kb", subset_id="citation_gia_test_collection", - ) + ), ] DEFAULT_CONFIG_NAME = "citation_gia_test_collection_source" @@ -126,7 +127,7 @@ def _info(self) -> datasets.DatasetInfo: } ], } - ] + ], } ) @@ -150,16 +151,18 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ - "filepath": os.path.join(data_dir[0], "GNormPlusCorpus/NLMIAT.BioC.xml"), + "filepath": os.path.join( + data_dir[0], "GNormPlusCorpus/NLMIAT.BioC.xml" + ), "split": "NLMIAT", }, ), ] def _get_entities(self, annot_d: dict) -> dict: - '''' + """' Converts annotation dict to entity dict. - ''' + """ ent = { "id": str(uuid.uuid4()), "type": annot_d["type"], @@ -175,13 +178,15 @@ def _get_entities(self, annot_d: dict) -> dict: return ent - def _get_offsets_entities(child, parent_text: str, child_text: str, offset: int) -> List[int]: - ''' - Extracts child text offsets from parent text for entities. + def _get_offsets_entities( + child, parent_text: str, child_text: str, offset: int + ) -> List[int]: + """ + Extracts child text offsets from parent text for entities. Some offsets that were present in the datset were wrong mainly because of string encodings. - Also a little fraction of parent strings doesn't contain its respective child strings. - Hence few assertion errors in the entitity offsets checking test. 
- ''' + Also a little fraction of parent strings doesn't contain its respective child strings. + Hence few assertion errors in the entitity offsets checking test. + """ if child_text in parent_text: index = parent_text.index(child_text) start = index + offset @@ -193,10 +198,10 @@ def _get_offsets_entities(child, parent_text: str, child_text: str, offset: int) return [start, end] def _process_annot(self, annot: ET.Element, passages: dict) -> dict: - '''' + """' Converts annotation XML Element to Python dict. - ''' - parent_text = " ".join([p['text'] for p in passages.values()]) + """ + parent_text = " ".join([p["text"] for p in passages.values()]) annot_d = dict() a_d = {a.tag: a.text for a in annot} @@ -205,21 +210,21 @@ def _process_annot(self, annot: ET.Element, passages: dict) -> dict: if a.tag == "location": offset = int(a.attrib["offset"]) annot_d["offsets"] = self._get_offsets_entities( - html.escape(parent_text[offset:]), - html.escape(a_d["text"]), offset) + html.escape(parent_text[offset:]), html.escape(a_d["text"]), offset + ) elif a.tag != "infon": annot_d[a.tag] = html.escape(a.text) else: annot_d[a.attrib["key"]] = html.escape(a.text) - + return annot_d def _parse_elem(self, elem: ET.Element) -> dict: - '''' + """' Converts document XML Element to Python dict. 
- ''' + """ elem_d = dict() passages = dict() annotations = elem.findall(".//annotation") @@ -230,8 +235,21 @@ def _parse_elem(self, elem: ET.Element) -> dict: for child in elem: if child.tag == "passage": - elem_d[child.tag].append({c.tag: html.escape(" ".join(list(filter( - lambda item: item, [t.strip('\n') for t in c.itertext()])))) for c in child}) + elem_d[child.tag].append( + { + c.tag: html.escape( + " ".join( + list( + filter( + lambda item: item, + [t.strip("\n") for t in c.itertext()], + ) + ) + ) + ) + for c in child + } + ) elif child.tag == "id": elem_d[child.tag] = html.escape(child.text) @@ -242,11 +260,10 @@ def _parse_elem(self, elem: ET.Element) -> dict: passages[infon] = passage elem_d["passages"] = passages - elem_d.pop('passage', None) + elem_d.pop("passage", None) for a in annotations: - elem_d["entities"].append( - self._process_annot(a, elem_d["passages"])) + elem_d["entities"].append(self._process_annot(a, elem_d["passages"])) return elem_d @@ -260,31 +277,35 @@ def _generate_examples(self, filepath, split): row = self._parse_elem(elem) uid += 1 passages = row["passages"] - yield uid, { + yield uid, { "id": str(uid), "passages": [ { "id": str(uuid.uuid4()), "type": "title", "text": [passages["title"]["text"]], - "offsets": [[ - int(passages["title"]["offset"]), - int(passages["title"]["offset"]) + - len(passages["title"]["text"]) - ]], + "offsets": [ + [ + int(passages["title"]["offset"]), + int(passages["title"]["offset"]) + + len(passages["title"]["text"]), + ] + ], }, { "id": str(uuid.uuid4()), "type": "abstract", "text": [passages["abstract"]["text"]], - "offsets": [[ - int(passages["abstract"]["offset"]), - int(passages["abstract"]["offset"]) + - len(passages["abstract"]["text"]) - ]], - } + "offsets": [ + [ + int(passages["abstract"]["offset"]), + int(passages["abstract"]["offset"]) + + len(passages["abstract"]["text"]), + ] + ], + }, ], - "entities": [self._get_entities(a) for a in row["entities"]] + "entities": 
[self._get_entities(a) for a in row["entities"]], } elif self.config.schema == "bigbio_kb": @@ -293,7 +314,7 @@ def _generate_examples(self, filepath, split): row = self._parse_elem(elem) uid += 1 passages = row["passages"] - yield uid, { + yield uid, { "id": str(uid), "document_id": str(uuid.uuid4()), "passages": [ @@ -301,26 +322,29 @@ def _generate_examples(self, filepath, split): "id": str(uuid.uuid4()), "type": "title", "text": [passages["title"]["text"]], - "offsets": [[ - int(passages["title"]["offset"]), - int(passages["title"]["offset"]) + - len(passages["title"] - ["text"]) - ]], + "offsets": [ + [ + int(passages["title"]["offset"]), + int(passages["title"]["offset"]) + + len(passages["title"]["text"]), + ] + ], }, { "id": str(uuid.uuid4()), "type": "abstract", "text": [passages["abstract"]["text"]], - "offsets": [[ - int(passages["abstract"]["offset"]), - int(passages["abstract"]["offset"]) + - len(passages["abstract"]["text"]) - ]], - } + "offsets": [ + [ + int(passages["abstract"]["offset"]), + int(passages["abstract"]["offset"]) + + len(passages["abstract"]["text"]), + ] + ], + }, ], "entities": [self._get_entities(a) for a in row["entities"]], "relations": [], "events": [], - "coreferences": [] + "coreferences": [], } diff --git a/bigbio/biodatasets/codiesp/codiesp.py b/bigbio/biodatasets/codiesp/codiesp.py index 1cede622..b9c551e0 100644 --- a/bigbio/biodatasets/codiesp/codiesp.py +++ b/bigbio/biodatasets/codiesp/codiesp.py @@ -35,9 +35,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE, Tags.DOCUMENT_INDEXING, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cord_ner/cord_ner.py b/bigbio/biodatasets/cord_ner/cord_ner.py index 8724cf64..f29c298e 100644 --- 
a/bigbio/biodatasets/cord_ner/cord_ner.py +++ b/bigbio/biodatasets/cord_ner/cord_ner.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.COVID, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ctebmsp/ctebmsp.py b/bigbio/biodatasets/ctebmsp/ctebmsp.py index 92ca3519..0831f48b 100644 --- a/bigbio/biodatasets/ctebmsp/ctebmsp.py +++ b/bigbio/biodatasets/ctebmsp/ctebmsp.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ANATOMY, Tags.CHEMICAL, Tags.DISEASE, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py index 4d8fb893..7ff25476 100644 --- a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py +++ b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py @@ -27,9 +27,10 @@ import bigbio.utils.parsing as parsing from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DDI, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py index a9f4a927..8dcc4ac1 100644 --- a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py +++ b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py @@ -27,9 +27,10 @@ from bigbio.utils import parsing, schemas 
from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DOCUMENT_INDEXING, Tags.DISEASE] _LANGUAGES = [Lang.EN, Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/distemist/distemist.py b/bigbio/biodatasets/distemist/distemist.py index b9dfaf5d..1471653f 100644 --- a/bigbio/biodatasets/distemist/distemist.py +++ b/bigbio/biodatasets/distemist/distemist.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/ebm_pico/ebm_pico.py b/bigbio/biodatasets/ebm_pico/ebm_pico.py index 5e7078e0..0abb1904 100644 --- a/bigbio/biodatasets/ebm_pico/ebm_pico.py +++ b/bigbio/biodatasets/ebm_pico/ebm_pico.py @@ -26,9 +26,10 @@ import datasets from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.PICO, Tags.POS] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -65,7 +66,9 @@ _LICENSE = Licenses.UNKNOWN -_URLS = {_DATASETNAME: "https://github.com/bepnye/EBM-NLP/raw/master/ebm_nlp_2_00.tar.gz"} +_URLS = { + _DATASETNAME: "https://github.com/bepnye/EBM-NLP/raw/master/ebm_nlp_2_00.tar.gz" +} _SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] @@ -138,7 +141,9 @@ def _partition(alist, indices): for _indices in multiple_indices: high_level_type = LABEL_DECODERS["starting_spans"][annotation_type][1] - fine_grained_type = LABEL_DECODERS["hierarchical_labels"][annotation_type][annotations[_indices[0]]] + fine_grained_type = 
LABEL_DECODERS["hierarchical_labels"][ + annotation_type + ][annotations[_indices[0]]] annotation_text = " ".join([tokenized[ind] for ind in _indices]) char_start = document_content.find(annotation_text) @@ -221,7 +226,9 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: data_dir = dl_manager.download_and_extract(urls) documents_folder = Path(data_dir) / "ebm_nlp_2_00" / "documents" - annotations_folder = Path(data_dir) / "ebm_nlp_2_00" / "annotations" / "aggregated" + annotations_folder = ( + Path(data_dir) / "ebm_nlp_2_00" / "annotations" / "aggregated" + ) return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, @@ -241,7 +248,9 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: ), ] - def _generate_examples(self, documents_folder, annotations_folder, split_folder: str) -> Tuple[int, Dict]: + def _generate_examples( + self, documents_folder, annotations_folder, split_folder: str + ) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" annotation_types = ["interventions", "outcomes", "participants"] @@ -264,11 +273,15 @@ def _generate_examples(self, documents_folder, annotations_folder, split_folder: with open( f"{annotations_folder}/hierarchical_labels/{annotation_type}/{split_folder}/{document}" ) as fp: - annotation_dict[annotation_type] = [int(x) for x in fp.read().splitlines()] + annotation_dict[annotation_type] = [ + int(x) for x in fp.read().splitlines() + ] except OSError: annotation_dict[annotation_type] = [] - ents = _get_entities_pico(annotation_dict, tokenized=tokenized, document_content=document_content) + ents = _get_entities_pico( + annotation_dict, tokenized=tokenized, document_content=document_content + ) if self.config.schema == "source": @@ -279,7 +292,9 @@ def _generate_examples(self, documents_folder, annotations_folder, split_folder: { "text": ent["annotation_text"], "annotation_type": ent["high_level_annotation_type"], - "fine_grained_annotation_type": 
ent["fine_grained_annotation_type"], + "fine_grained_annotation_type": ent[ + "fine_grained_annotation_type" + ], "start": ent["char_start"], "end": ent["char_end"], } diff --git a/bigbio/biodatasets/ehr_rel/ehr_rel.py b/bigbio/biodatasets/ehr_rel/ehr_rel.py index 90235ee4..f9b0967e 100644 --- a/bigbio/biodatasets/ehr_rel/ehr_rel.py +++ b/bigbio/biodatasets/ehr_rel/ehr_rel.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE, Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/essai/essai.py b/bigbio/biodatasets/essai/essai.py index 275aa115..4cff31a1 100644 --- a/bigbio/biodatasets/essai/essai.py +++ b/bigbio/biodatasets/essai/essai.py @@ -6,9 +6,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.DISEASE, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/euadr/euadr.py b/bigbio/biodatasets/euadr/euadr.py index 35b27664..6923b5ad 100644 --- a/bigbio/biodatasets/euadr/euadr.py +++ b/bigbio/biodatasets/euadr/euadr.py @@ -4,9 +4,18 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [ + Tags.ADR, + Tags.DRUG, + Tags.GENE, + Tags.DISEASE, + Tags.VARIANT, + Tags.NEGATION, + Tags.SPECULATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/evidence_inference/evidence_inference.py 
b/bigbio/biodatasets/evidence_inference/evidence_inference.py index 83fd2ca7..cade748f 100644 --- a/bigbio/biodatasets/evidence_inference/evidence_inference.py +++ b/bigbio/biodatasets/evidence_inference/evidence_inference.py @@ -32,9 +32,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/gad/gad.py b/bigbio/biodatasets/gad/gad.py index 4a9286ce..d12e7b5b 100644 --- a/bigbio/biodatasets/gad/gad.py +++ b/bigbio/biodatasets/gad/gad.py @@ -6,13 +6,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "gad" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = False _CITATION = """\ diff --git a/bigbio/biodatasets/genetag/genetag.py b/bigbio/biodatasets/genetag/genetag.py index e53b4918..2faf3558 100644 --- a/bigbio/biodatasets/genetag/genetag.py +++ b/bigbio/biodatasets/genetag/genetag.py @@ -29,9 +29,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py b/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py index 0e3f2536..ed874166 100644 --- a/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py +++ b/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py @@ 
-29,9 +29,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = False _CITATION = """\ diff --git a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py index f0a73059..f010eb3a 100644 --- a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py +++ b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE, Tags.PART_OF] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py index 0ae321ce..66b55cf8 100644 --- a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py +++ b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE, Tags.CELL, Tags.ANATOMY, Tags.TISSUE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py b/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py index 1954035f..5b37531a 100644 --- a/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py +++ b/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, 
Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/gnormplus/gnormplus.py b/bigbio/biodatasets/gnormplus/gnormplus.py index 7fd0e750..fc1a2367 100644 --- a/bigbio/biodatasets/gnormplus/gnormplus.py +++ b/bigbio/biodatasets/gnormplus/gnormplus.py @@ -23,10 +23,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py index 83d19030..973bf970 100644 --- a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py +++ b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py @@ -18,9 +18,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE, Tags.CANCER] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -65,7 +66,7 @@ _URLs = { "corpus": "https://github.com/sb895/Hallmarks-of-Cancer/archive/refs/heads/master.zip", - "split_indices": "https://microsoft.github.io/BLURB/sample_code/data_generation.tar.gz" + "split_indices": "https://microsoft.github.io/BLURB/sample_code/data_generation.tar.gz", } _SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] @@ -73,17 +74,17 @@ _BIGBIO_VERSION = "1.0.0" _CLASS_NAMES = [ - 'evading growth suppressors', - 'tumor promoting inflammation', - 'enabling replicative immortality', - 'cellular energetics', - 'resisting cell death', - 'activating 
invasion and metastasis', - 'genomic instability and mutation', - 'none', - 'inducing angiogenesis', - 'sustaining proliferative signaling', - 'avoiding immune destruction' + "evading growth suppressors", + "tumor promoting inflammation", + "enabling replicative immortality", + "cellular energetics", + "resisting cell death", + "activating invasion and metastasis", + "genomic instability and mutation", + "none", + "inducing angiogenesis", + "sustaining proliferative signaling", + "avoiding immune destruction", ] @@ -143,21 +144,24 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, gen_kwargs={ "corpuspath": Path(data_dir["corpus"]), - "indicespath": Path(data_dir["split_indices"]) / "data_generation/indexing/HoC/train_pmid.tsv" + "indicespath": Path(data_dir["split_indices"]) + / "data_generation/indexing/HoC/train_pmid.tsv", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ "corpuspath": Path(data_dir["corpus"]), - "indicespath": Path(data_dir["split_indices"]) / "data_generation/indexing/HoC/test_pmid.tsv" + "indicespath": Path(data_dir["split_indices"]) + / "data_generation/indexing/HoC/test_pmid.tsv", }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, gen_kwargs={ "corpuspath": Path(data_dir["corpus"]), - "indicespath": Path(data_dir["split_indices"]) / "data_generation/indexing/HoC/dev_pmid.tsv" + "indicespath": Path(data_dir["split_indices"]) + / "data_generation/indexing/HoC/dev_pmid.tsv", }, ), ] @@ -183,13 +187,15 @@ def _generate_examples(self, corpuspath: Path, indicespath: Path): sentence, label = example_pair label = label.strip() - + if label == "": label = "none" multi_labels = [m_label.strip() for m_label in label.split("AND")] unique_multi_labels = { - m_label.split("--")[0].lower().lstrip() for m_label in multi_labels if m_label != "NULL" + m_label.split("--")[0].lower().lstrip() + for m_label in multi_labels + if m_label != "NULL" } arrow_file_unique_key = 100 * document_index + example_index 
diff --git a/bigbio/biodatasets/hprd50/hprd50.py b/bigbio/biodatasets/hprd50/hprd50.py index 91b18470..834bc1c5 100644 --- a/bigbio/biodatasets/hprd50/hprd50.py +++ b/bigbio/biodatasets/hprd50/hprd50.py @@ -38,10 +38,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.GENE, Tags.PPI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/iepa/iepa.py b/bigbio/biodatasets/iepa/iepa.py index 5efffd9f..be945fb6 100644 --- a/bigbio/biodatasets/iepa/iepa.py +++ b/bigbio/biodatasets/iepa/iepa.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.CHEMICAL, Tags.DRUG, Tags.DDI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/jnlpba/jnlpba.py b/bigbio/biodatasets/jnlpba/jnlpba.py index d163c385..9e03eaea 100644 --- a/bigbio/biodatasets/jnlpba/jnlpba.py +++ b/bigbio/biodatasets/jnlpba/jnlpba.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE, Tags.CELL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/linnaeus/linnaeus.py b/bigbio/biodatasets/linnaeus/linnaeus.py index 14c1b6ef..80518887 100644 --- a/bigbio/biodatasets/linnaeus/linnaeus.py +++ b/bigbio/biodatasets/linnaeus/linnaeus.py @@ -32,9 +32,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/lll/lll.py b/bigbio/biodatasets/lll/lll.py index 34259f12..6dfe9914 100644 --- a/bigbio/biodatasets/lll/lll.py +++ b/bigbio/biodatasets/lll/lll.py @@ -36,9 +36,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import BigBioValues, Lang, Tasks +from bigbio.utils.constants import BigBioValues, Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py index e014f006..8eb1891c 100644 --- a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py +++ b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py @@ -22,9 +22,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN, Lang.FR, Lang.DE, Lang.NL, Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mayosrs/mayosrs.py b/bigbio/biodatasets/mayosrs/mayosrs.py index 033a93b8..e0b63b87 100644 --- a/bigbio/biodatasets/mayosrs/mayosrs.py +++ b/bigbio/biodatasets/mayosrs/mayosrs.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/med_qa/med_qa.py b/bigbio/biodatasets/med_qa/med_qa.py index e83b70e9..5e000263 100644 --- 
a/bigbio/biodatasets/med_qa/med_qa.py +++ b/bigbio/biodatasets/med_qa/med_qa.py @@ -29,9 +29,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.MULTIPLE_CHOICE, Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medal/medal.py b/bigbio/biodatasets/medal/medal.py index 2766f97b..0ceeb8cb 100644 --- a/bigbio/biodatasets/medal/medal.py +++ b/bigbio/biodatasets/medal/medal.py @@ -26,11 +26,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses logger = datasets.logging.get_logger(__name__) +_TAGS = [Tags.ABBREVIATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -73,10 +74,11 @@ _BIGBIO_VERSION = "1.0.0" + class MedalDataset(datasets.GeneratorBasedBuilder): """The Repository for Medical Dataset for Abbreviation Disambiguation for Natural Language Understanding (MeDAL) is -a large medical text dataset curated for abbreviation disambiguation, designed for natural language understanding -pre-training in the medical domain.""" + a large medical text dataset curated for abbreviation disambiguation, designed for natural language understanding + pre-training in the medical domain.""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) @@ -123,7 +125,9 @@ def _info(self) -> datasets.DatasetInfo: citation=_CITATION, ) - def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" urls = _URLS @@ -168,7 +172,7 @@ def 
_generate_offsets(self, text, location): Returns ------- - dict + dict "word": str, "offsets": tuple (int, int) """ @@ -178,7 +182,7 @@ def _generate_offsets(self, text, location): offset_end = offset_start + len(word) # return word and offsets - return {"word":word, "offsets":(offset_start, offset_end)} + return {"word": word, "offsets": (offset_start, offset_end)} def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" diff --git a/bigbio/biodatasets/meddialog/meddialog.py b/bigbio/biodatasets/meddialog/meddialog.py index 90e77e55..ee647d08 100644 --- a/bigbio/biodatasets/meddialog/meddialog.py +++ b/bigbio/biodatasets/meddialog/meddialog.py @@ -20,11 +20,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "meddialog" +_TAGS = [Tags.DIALOGUE] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/meddocan/meddocan.py b/bigbio/biodatasets/meddocan/meddocan.py index a2e66d64..d2dc14e9 100644 --- a/bigbio/biodatasets/meddocan/meddocan.py +++ b/bigbio/biodatasets/meddocan/meddocan.py @@ -29,9 +29,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ANONYMIZATION] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medhop/medhop.py b/bigbio/biodatasets/medhop/medhop.py index 1b6012a7..19649008 100644 --- a/bigbio/biodatasets/medhop/medhop.py +++ b/bigbio/biodatasets/medhop/medhop.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.MULTIPLE_CHOICE, Tags.MRC] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/medical_data/medical_data.py b/bigbio/biodatasets/medical_data/medical_data.py index 98632452..48929faa 100644 --- a/bigbio/biodatasets/medical_data/medical_data.py +++ b/bigbio/biodatasets/medical_data/medical_data.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DRUG, Tags.SENTIMENT_ANALYSIS] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/mediqa_nli/mediqa_nli.py b/bigbio/biodatasets/mediqa_nli/mediqa_nli.py index 3b82f39f..153df024 100644 --- a/bigbio/biodatasets/mediqa_nli/mediqa_nli.py +++ b/bigbio/biodatasets/mediqa_nli/mediqa_nli.py @@ -44,9 +44,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py index 0e85d926..5af9b45b 100644 --- a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py +++ b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.FACTOID, Tags.DISEASE, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py 
b/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py index 9b9fe79e..ad61f531 100644 --- a/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py +++ b/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medmentions/medmentions.py b/bigbio/biodatasets/medmentions/medmentions.py index a1e8e2d9..633b86dd 100644 --- a/bigbio/biodatasets/medmentions/medmentions.py +++ b/bigbio/biodatasets/medmentions/medmentions.py @@ -43,9 +43,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mednli/mednli.py b/bigbio/biodatasets/mednli/mednli.py index 5e6c8cac..4488852f 100644 --- a/bigbio/biodatasets/mednli/mednli.py +++ b/bigbio/biodatasets/mednli/mednli.py @@ -42,9 +42,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/meqsum/meqsum.py b/bigbio/biodatasets/meqsum/meqsum.py index 684877dd..a2a3d8be 100644 --- a/bigbio/biodatasets/meqsum/meqsum.py +++ b/bigbio/biodatasets/meqsum/meqsum.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/minimayosrs/minimayosrs.py b/bigbio/biodatasets/minimayosrs/minimayosrs.py index 1169fa67..cd2eba50 100644 --- a/bigbio/biodatasets/minimayosrs/minimayosrs.py +++ b/bigbio/biodatasets/minimayosrs/minimayosrs.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mirna/mirna.py b/bigbio/biodatasets/mirna/mirna.py index 2b128f21..44babefe 100644 --- a/bigbio/biodatasets/mirna/mirna.py +++ b/bigbio/biodatasets/mirna/mirna.py @@ -1,365 +1,384 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import xml.etree.ElementTree as ET -from typing import Dict, Iterator, List, Tuple - -import datasets - -from bigbio.utils import schemas -from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import xml.etree.ElementTree as ET +from typing import Dict, Iterator, List, Tuple + +import datasets + +from bigbio.utils import schemas +from bigbio.utils.configs import BigBioConfig +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses - + +_TAGS = [Tags.MIRNA, Tags.GENE, Tags.DISEASE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False -_CITATION = """\ -@Article{Bagewadi2014, -author={Bagewadi, Shweta -and Bobi{\'{c}}, Tamara -and Hofmann-Apitius, Martin -and Fluck, Juliane -and Klinger, Roman}, -title={Detecting miRNA Mentions and Relations in Biomedical Literature}, -journal={F1000Research}, -year={2014}, -month={Aug}, -day={28}, -publisher={F1000Research}, -volume={3}, -pages={205-205}, -keywords={MicroRNAs; corpus; prediction algorithms}, -abstract={ - INTRODUCTION: MicroRNAs (miRNAs) have demonstrated their potential as post-transcriptional - gene expression regulators, participating in a wide spectrum of regulatory events such as - apoptosis, differentiation, and stress response. Apart from the role of miRNAs in normal - physiology, their dysregulation is implicated in a vast array of diseases. Dissection of - miRNA-related associations are valuable for contemplating their mechanism in diseases, - leading to the discovery of novel miRNAs for disease prognosis, diagnosis, and therapy. 
- MOTIVATION: Apart from databases and prediction tools, miRNA-related information is largely - available as unstructured text. Manual retrieval of these associations can be labor-intensive - due to steadily growing number of publications. Additionally, most of the published miRNA - entity recognition methods are keyword based, further subjected to manual inspection for - retrieval of relations. Despite the fact that several databases host miRNA-associations - derived from text, lower sensitivity and lack of published details for miRNA entity - recognition and associated relations identification has motivated the need for developing - comprehensive methods that are freely available for the scientific community. Additionally, - the lack of a standard corpus for miRNA-relations has caused difficulty in evaluating the - available systems. We propose methods to automatically extract mentions of miRNAs, species, - genes/proteins, disease, and relations from scientific literature. Our generated corpora, - along with dictionaries, and miRNA regular expression are freely available for academic - purposes. To our knowledge, these resources are the most comprehensive developed so far. - RESULTS: The identification of specific miRNA mentions reaches a recall of 0.94 and - precision of 0.93. Extraction of miRNA-disease and miRNA-gene relations lead to an - F1 score of up to 0.76. A comparison of the information extracted by our approach to - the databases miR2Disease and miRSel for the extraction of Alzheimer's disease - related relations shows the capability of our proposed methods in identifying correct - relations with improved sensitivity. The published resources and described methods can - help the researchers for maximal retrieval of miRNA-relations and generation of - miRNA-regulatory networks. 
AVAILABILITY: The training and test corpora, annotation - guidelines, developed dictionaries, and supplementary files are available at - http://www.scai.fraunhofer.de/mirna-corpora.html. -}, -note={26535109[pmid]}, -note={PMC4602280[pmcid]}, -issn={2046-1402}, -url={https://pubmed.ncbi.nlm.nih.gov/26535109}, -language={eng} -} -""" - -_DATASETNAME = "mirna" - -_DESCRIPTION = """\ -The corpus consists of 301 Medline citations. The documents were screened for -mentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually -annotated. The corpus comprises of two separate files, a train and a test set, coming -from 201 and 100 documents respectively. -""" - -_HOMEPAGE = "https://www.scai.fraunhofer.de/en/business-research-areas/bioinformatics/downloads/download-mirna-test-corpus.html" - -_LICENSE = Licenses.CC_BY_NC_3p0 - -_BASE = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-" - -_URLs = { - "source": { - "train": _BASE + "Train-Corpus.xml", - "test": _BASE + "Test-Corpus.xml", - }, - "bigbio_kb": { - "train": _BASE + "Train-Corpus.xml", - "test": _BASE + "Test-Corpus.xml", - }, -} - -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] -_SOURCE_VERSION = "1.0.0" -_BIGBIO_VERSION = "1.0.0" - - -class miRNADataset(datasets.GeneratorBasedBuilder): - """mirna""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) - - BUILDER_CONFIGS = [ - BigBioConfig( - name="mirna_source", - version=SOURCE_VERSION, - description="mirna source schema", - schema="source", - subset_id="mirna", - ), - BigBioConfig( - name="mirna_bigbio_kb", - version=BIGBIO_VERSION, - description="mirna BigBio schema", - schema="bigbio_kb", - subset_id="mirna", - ), - ] - - DEFAULT_CONFIG_NAME = "mirna_source" - - def _info(self): - - if self.config.schema == "source": - - features = datasets.Features( - { - "passages": [ - { - "document_id": 
datasets.Value("string"), - "type": datasets.Value("string"), - "text": datasets.Value("string"), - "offset": datasets.Value("int32"), - "entities": [ - { - "id": datasets.Value("string"), - "offsets": [[datasets.Value("int32")]], - "text": [datasets.Value("string")], - "type": datasets.Value("string"), - "normalized": [ - { - "db_name": datasets.Value("string"), - "db_id": datasets.Value("string"), - } - ], - } - ], - } - ] - } - ) - - elif self.config.schema == "bigbio_kb": - features = schemas.kb_features - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - supervised_keys=None, - homepage=_HOMEPAGE, - license=str(_LICENSE), - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - - my_urls = _URLs[self.config.schema] - - path_xml_train = dl_manager.download(my_urls["train"]) - path_xml_test = dl_manager.download(my_urls["test"]) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": path_xml_train, - "split": "train", - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": path_xml_test, - "split": "test", - }, - ), - ] - - def _get_passages_and_entities(self, d) -> Tuple[List[Dict], List[List[Dict]]]: - - sentences: List[Dict] = [] - entities: List[List[Dict]] = [] - relations: List[List[Dict]] = [] - - text_total_length = 0 - - po_start = 0 - - # Get sentences of the document - for _, s in enumerate(d): - - # annotation used only for document indexing - if s.attrib["text"] is None or len(s.attrib["text"]) <= 0: - continue - - # annotation used only for document indexing - if len(s) <= 0: - continue - - text_total_length += len(s.attrib["text"]) + 1 - - po_end = po_start + len(s.attrib["text"]) - - start = po_start - - dp = { - "text": s.attrib["text"], - "type": "title" if ".s0" in 
s.attrib["id"] else "abstract", - "offsets": [(po_start, po_end)], - "offset": 0, # original offset - } - - po_start = po_end + 1 - - sentences.append(dp) - - pe = [] # entities - re = [] # relations - - # For each entity - for a in s: - - # If correspond to a entity - if a.tag == "entity": - - length = len(a.attrib["text"]) - - if a.attrib["text"] is None or length <= 0: - continue - - # no in-text annotation: only for document indexing - if a.attrib["type"] in ["MeSH_Indexing_Chemical", "OTHER"]: - continue - - startOffset, endOffset = a.attrib["charOffset"].split("-") - startOffset, endOffset = int(startOffset), int(endOffset) - - pe.append( - { - "id": a.attrib["id"], - "type": a.attrib["type"], - "text": (a.attrib["text"],), - "offsets": [(start + startOffset, start + endOffset + 1)], - "normalized": [{"db_name": "miRNA-corpus", "db_id": a.attrib["id"]}], - } - ) - - # If correspond to relation pair - elif a.tag == "pair": - - re.append( - { - "id": a.attrib["id"], - "type": a.attrib["type"], - "arg1_id": a.attrib["e1"], - "arg2_id": a.attrib["e2"], - "normalized": [], - } - ) - - entities.append(pe) - relations.append(re) - - return sentences, entities, relations - - def _generate_examples( - self, - filepath: str, - split: str, - ) -> Iterator[Tuple[int, Dict]]: - """Yields examples as (key, example) tuples.""" - - reader = ET.fromstring(open(str(filepath), "r").read()) - - if self.config.schema == "source": - - for uid, doc in enumerate(reader): - - sentences, sentences_entities, relations = self._get_passages_and_entities(doc) - - if len(sentences) < 1 or len(sentences_entities) < 1 or len(sentences_entities) != len(sentences): - continue - - for p, pe, re in zip(sentences, sentences_entities, relations): - - p.pop("offsets") # BioC has only start for passages offsets - - p["document_id"] = doc.attrib["id"] - p["entities"] = pe # BioC has per passage entities - - yield uid, {"passages": sentences} - - elif self.config.schema == "bigbio_kb": - - uid = 0 - - 
for idx, doc in enumerate(reader): - - sentences, sentences_entities, relations = self._get_passages_and_entities(doc) - - if len(sentences) < 1 or len(sentences_entities) < 1 or len(sentences_entities) != len(sentences): - continue - - # global id - uid += 1 - - # unpack per-sentence entities - entities = [e for pe in sentences_entities for e in pe] - - for p in sentences: - p.pop("offset") # drop original offset - p["text"] = (p["text"],) # text in sentence is Sequence - p["id"] = uid - uid += 1 - - for e in entities: - e["id"] = uid - uid += 1 - - # unpack per-sentence relations - relations = [r for re in relations for r in re] - - for r in relations: - r["id"] = uid - uid += 1 - - yield idx, { - "id": uid, - "document_id": doc.attrib["id"], - "passages": sentences, - "entities": entities, - "events": [], - "coreferences": [], - "relations": relations, - } +_CITATION = """\ +@Article{Bagewadi2014, +author={Bagewadi, Shweta +and Bobi{\'{c}}, Tamara +and Hofmann-Apitius, Martin +and Fluck, Juliane +and Klinger, Roman}, +title={Detecting miRNA Mentions and Relations in Biomedical Literature}, +journal={F1000Research}, +year={2014}, +month={Aug}, +day={28}, +publisher={F1000Research}, +volume={3}, +pages={205-205}, +keywords={MicroRNAs; corpus; prediction algorithms}, +abstract={ + INTRODUCTION: MicroRNAs (miRNAs) have demonstrated their potential as post-transcriptional + gene expression regulators, participating in a wide spectrum of regulatory events such as + apoptosis, differentiation, and stress response. Apart from the role of miRNAs in normal + physiology, their dysregulation is implicated in a vast array of diseases. Dissection of + miRNA-related associations are valuable for contemplating their mechanism in diseases, + leading to the discovery of novel miRNAs for disease prognosis, diagnosis, and therapy. + MOTIVATION: Apart from databases and prediction tools, miRNA-related information is largely + available as unstructured text. 
Manual retrieval of these associations can be labor-intensive + due to steadily growing number of publications. Additionally, most of the published miRNA + entity recognition methods are keyword based, further subjected to manual inspection for + retrieval of relations. Despite the fact that several databases host miRNA-associations + derived from text, lower sensitivity and lack of published details for miRNA entity + recognition and associated relations identification has motivated the need for developing + comprehensive methods that are freely available for the scientific community. Additionally, + the lack of a standard corpus for miRNA-relations has caused difficulty in evaluating the + available systems. We propose methods to automatically extract mentions of miRNAs, species, + genes/proteins, disease, and relations from scientific literature. Our generated corpora, + along with dictionaries, and miRNA regular expression are freely available for academic + purposes. To our knowledge, these resources are the most comprehensive developed so far. + RESULTS: The identification of specific miRNA mentions reaches a recall of 0.94 and + precision of 0.93. Extraction of miRNA-disease and miRNA-gene relations lead to an + F1 score of up to 0.76. A comparison of the information extracted by our approach to + the databases miR2Disease and miRSel for the extraction of Alzheimer's disease + related relations shows the capability of our proposed methods in identifying correct + relations with improved sensitivity. The published resources and described methods can + help the researchers for maximal retrieval of miRNA-relations and generation of + miRNA-regulatory networks. AVAILABILITY: The training and test corpora, annotation + guidelines, developed dictionaries, and supplementary files are available at + http://www.scai.fraunhofer.de/mirna-corpora.html. 
+}, +note={26535109[pmid]}, +note={PMC4602280[pmcid]}, +issn={2046-1402}, +url={https://pubmed.ncbi.nlm.nih.gov/26535109}, +language={eng} +} +""" + +_DATASETNAME = "mirna" + +_DESCRIPTION = """\ +The corpus consists of 301 Medline citations. The documents were screened for +mentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually +annotated. The corpus comprises of two separate files, a train and a test set, coming +from 201 and 100 documents respectively. +""" + +_HOMEPAGE = "https://www.scai.fraunhofer.de/en/business-research-areas/bioinformatics/downloads/download-mirna-test-corpus.html" + +_LICENSE = Licenses.CC_BY_NC_3p0 + +_BASE = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-" + +_URLs = { + "source": { + "train": _BASE + "Train-Corpus.xml", + "test": _BASE + "Test-Corpus.xml", + }, + "bigbio_kb": { + "train": _BASE + "Train-Corpus.xml", + "test": _BASE + "Test-Corpus.xml", + }, +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] +_SOURCE_VERSION = "1.0.0" +_BIGBIO_VERSION = "1.0.0" + + +class miRNADataset(datasets.GeneratorBasedBuilder): + """mirna""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="mirna_source", + version=SOURCE_VERSION, + description="mirna source schema", + schema="source", + subset_id="mirna", + ), + BigBioConfig( + name="mirna_bigbio_kb", + version=BIGBIO_VERSION, + description="mirna BigBio schema", + schema="bigbio_kb", + subset_id="mirna", + ), + ] + + DEFAULT_CONFIG_NAME = "mirna_source" + + def _info(self): + + if self.config.schema == "source": + + features = datasets.Features( + { + "passages": [ + { + "document_id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Value("string"), + "offset": datasets.Value("int32"), + "entities": [ + { + "id": datasets.Value("string"), + 
"offsets": [[datasets.Value("int32")]], + "text": [datasets.Value("string")], + "type": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } + ] + } + ) + + elif self.config.schema == "bigbio_kb": + features = schemas.kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_HOMEPAGE, + license=str(_LICENSE), + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + + my_urls = _URLs[self.config.schema] + + path_xml_train = dl_manager.download(my_urls["train"]) + path_xml_test = dl_manager.download(my_urls["test"]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": path_xml_train, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": path_xml_test, + "split": "test", + }, + ), + ] + + def _get_passages_and_entities(self, d) -> Tuple[List[Dict], List[List[Dict]]]: + + sentences: List[Dict] = [] + entities: List[List[Dict]] = [] + relations: List[List[Dict]] = [] + + text_total_length = 0 + + po_start = 0 + + # Get sentences of the document + for _, s in enumerate(d): + + # annotation used only for document indexing + if s.attrib["text"] is None or len(s.attrib["text"]) <= 0: + continue + + # annotation used only for document indexing + if len(s) <= 0: + continue + + text_total_length += len(s.attrib["text"]) + 1 + + po_end = po_start + len(s.attrib["text"]) + + start = po_start + + dp = { + "text": s.attrib["text"], + "type": "title" if ".s0" in s.attrib["id"] else "abstract", + "offsets": [(po_start, po_end)], + "offset": 0, # original offset + } + + po_start = po_end + 1 + + sentences.append(dp) + + pe = [] # entities + re = [] # relations + + 
# For each entity + for a in s: + + # If correspond to a entity + if a.tag == "entity": + + length = len(a.attrib["text"]) + + if a.attrib["text"] is None or length <= 0: + continue + + # no in-text annotation: only for document indexing + if a.attrib["type"] in ["MeSH_Indexing_Chemical", "OTHER"]: + continue + + startOffset, endOffset = a.attrib["charOffset"].split("-") + startOffset, endOffset = int(startOffset), int(endOffset) + + pe.append( + { + "id": a.attrib["id"], + "type": a.attrib["type"], + "text": (a.attrib["text"],), + "offsets": [(start + startOffset, start + endOffset + 1)], + "normalized": [ + {"db_name": "miRNA-corpus", "db_id": a.attrib["id"]} + ], + } + ) + + # If correspond to relation pair + elif a.tag == "pair": + + re.append( + { + "id": a.attrib["id"], + "type": a.attrib["type"], + "arg1_id": a.attrib["e1"], + "arg2_id": a.attrib["e2"], + "normalized": [], + } + ) + + entities.append(pe) + relations.append(re) + + return sentences, entities, relations + + def _generate_examples( + self, + filepath: str, + split: str, + ) -> Iterator[Tuple[int, Dict]]: + """Yields examples as (key, example) tuples.""" + + reader = ET.fromstring(open(str(filepath), "r").read()) + + if self.config.schema == "source": + + for uid, doc in enumerate(reader): + + ( + sentences, + sentences_entities, + relations, + ) = self._get_passages_and_entities(doc) + + if ( + len(sentences) < 1 + or len(sentences_entities) < 1 + or len(sentences_entities) != len(sentences) + ): + continue + + for p, pe, re in zip(sentences, sentences_entities, relations): + + p.pop("offsets") # BioC has only start for passages offsets + + p["document_id"] = doc.attrib["id"] + p["entities"] = pe # BioC has per passage entities + + yield uid, {"passages": sentences} + + elif self.config.schema == "bigbio_kb": + + uid = 0 + + for idx, doc in enumerate(reader): + + ( + sentences, + sentences_entities, + relations, + ) = self._get_passages_and_entities(doc) + + if ( + len(sentences) < 1 + or 
len(sentences_entities) < 1 + or len(sentences_entities) != len(sentences) + ): + continue + + # global id + uid += 1 + + # unpack per-sentence entities + entities = [e for pe in sentences_entities for e in pe] + + for p in sentences: + p.pop("offset") # drop original offset + p["text"] = (p["text"],) # text in sentence is Sequence + p["id"] = uid + uid += 1 + + for e in entities: + e["id"] = uid + uid += 1 + + # unpack per-sentence relations + relations = [r for re in relations for r in re] + + for r in relations: + r["id"] = uid + uid += 1 + + yield idx, { + "id": uid, + "document_id": doc.attrib["id"], + "passages": sentences, + "entities": entities, + "events": [], + "coreferences": [], + "relations": relations, + } diff --git a/bigbio/biodatasets/mlee/mlee.py b/bigbio/biodatasets/mlee/mlee.py index d4e3db09..5582f219 100644 --- a/bigbio/biodatasets/mlee/mlee.py +++ b/bigbio/biodatasets/mlee/mlee.py @@ -25,13 +25,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "mlee" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.GENE, Tags.DRUG, Tags.CELL, Tags.ORGAN, Tags.TISSUE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mqp/mqp.py b/bigbio/biodatasets/mqp/mqp.py index 6adf36a9..f1d47b1f 100644 --- a/bigbio/biodatasets/mqp/mqp.py +++ b/bigbio/biodatasets/mqp/mqp.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/msh_wsd/msh_wsd.py b/bigbio/biodatasets/msh_wsd/msh_wsd.py index 59525ce3..b4765633 100644 --- 
a/bigbio/biodatasets/msh_wsd/msh_wsd.py +++ b/bigbio/biodatasets/msh_wsd/msh_wsd.py @@ -40,9 +40,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ABBREVIATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/muchmore/muchmore.py b/bigbio/biodatasets/muchmore/muchmore.py index f744477f..bc5d1335 100644 --- a/bigbio/biodatasets/muchmore/muchmore.py +++ b/bigbio/biodatasets/muchmore/muchmore.py @@ -73,9 +73,10 @@ # Buitelaar, Paul / Declerck, Thierry / Sacaleanu, Bogdan / Vintar, Spela / Raileanu, Diana / Crispi, Claudia: A Multi-Layered, XML-Based Approach to the Integration of Linguistic and Semantic Annotations. In: Proceedings of EACL 2003 Workshop on Language Technology and the Semantic Web (NLPXML’03), Budapest, Hungary, April 2003. from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.POS] _LANGUAGES = [Lang.EN, Lang.DE] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/multi_xscience/multi_xscience.py b/bigbio/biodatasets/multi_xscience/multi_xscience.py index ab8c55b6..a5f9fcd3 100644 --- a/bigbio/biodatasets/multi_xscience/multi_xscience.py +++ b/bigbio/biodatasets/multi_xscience/multi_xscience.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mutation_finder/mutation_finder.py b/bigbio/biodatasets/mutation_finder/mutation_finder.py 
index 277d5db4..e14b715a 100644 --- a/bigbio/biodatasets/mutation_finder/mutation_finder.py +++ b/bigbio/biodatasets/mutation_finder/mutation_finder.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py index f3cac12f..cde53908 100644 --- a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py +++ b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py @@ -65,12 +65,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/14/5/550/720189 +_TAGS = [Tags.ANONYMIZATION] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py index 68840046..9d0b1a99 100644 --- a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py +++ b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py @@ -63,12 +63,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/15/1/14/779738 +_TAGS = [Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py index 0167def0..bb2f37e3 100644 --- 
a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py +++ b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py @@ -71,12 +71,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2008" # https://academic.oup.com/jamia/article/16/4/561/766997 +_TAGS = [Tags.DIAGNOSIS, Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py index 3d9328a9..742ce095 100644 --- a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py +++ b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py @@ -57,9 +57,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py index 277081cf..3b095e3f 100644 --- a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py +++ b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py @@ -52,9 +52,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE, Tags.DIAGNOSIS, Tags.NEGATION] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py index 44328533..7ab93a59 100644 --- a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py +++ b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py @@ -72,12 +72,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import 
Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 +_TAGS = [Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py b/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py index 1e3992a1..75f972cb 100644 --- a/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py +++ b/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py @@ -59,9 +59,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py b/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py index 524a48fc..fec27a82 100644 --- a/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py +++ b/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py @@ -59,9 +59,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py index 27d0f5ae..0c18374c 100644 --- a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py +++ b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py @@ -43,9 +43,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from 
bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py index ff26a9eb..9862ea22 100644 --- a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py +++ b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py @@ -46,9 +46,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DRUG, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/nagel/nagel.py b/bigbio/biodatasets/nagel/nagel.py index 260224c6..c80f80db 100644 --- a/bigbio/biodatasets/nagel/nagel.py +++ b/bigbio/biodatasets/nagel/nagel.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [Tags.VARIANT, Tags.GENE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py index 4d85e9ac..c2b1d748 100644 --- a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py +++ b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_gene/nlm_gene.py b/bigbio/biodatasets/nlm_gene/nlm_gene.py index 2d7e1a4b..d084ad47 100644 --- 
a/bigbio/biodatasets/nlm_gene/nlm_gene.py +++ b/bigbio/biodatasets/nlm_gene/nlm_gene.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py index 01620230..3882db16 100644 --- a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py +++ b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py @@ -53,9 +53,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ABBREVIATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/nlmchem/nlmchem.py b/bigbio/biodatasets/nlmchem/nlmchem.py index ec83fe2e..10472c24 100644 --- a/bigbio/biodatasets/nlmchem/nlmchem.py +++ b/bigbio/biodatasets/nlmchem/nlmchem.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.CHEMICAL, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py index 7066df6e..35e93d53 100644 --- a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py +++ b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py @@ -63,9 +63,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import 
BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE, Tags.SENTIMENT_ANALYSIS] _LANGUAGES = [Lang.EN, Lang.ZH, Lang.JA] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/osiris/osiris.py b/bigbio/biodatasets/osiris/osiris.py index 3929ca5d..19b0872c 100644 --- a/bigbio/biodatasets/osiris/osiris.py +++ b/bigbio/biodatasets/osiris/osiris.py @@ -24,9 +24,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/paramed/paramed.py b/bigbio/biodatasets/paramed/paramed.py index 6791791e..50966a93 100644 --- a/bigbio/biodatasets/paramed/paramed.py +++ b/bigbio/biodatasets/paramed/paramed.py @@ -26,12 +26,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses logger = datasets.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pcr/pcr.py b/bigbio/biodatasets/pcr/pcr.py index e2e10566..8295b177 100644 --- a/bigbio/biodatasets/pcr/pcr.py +++ b/bigbio/biodatasets/pcr/pcr.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.CHEMICAL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pdr/pdr.py 
b/bigbio/biodatasets/pdr/pdr.py index a41255e6..efa60062 100644 --- a/bigbio/biodatasets/pdr/pdr.py +++ b/bigbio/biodatasets/pdr/pdr.py @@ -28,9 +28,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pharmaconer/pharmaconer.py b/bigbio/biodatasets/pharmaconer/pharmaconer.py index 61a28ab9..ac5aade0 100644 --- a/bigbio/biodatasets/pharmaconer/pharmaconer.py +++ b/bigbio/biodatasets/pharmaconer/pharmaconer.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.CHEMICAL, Tags.GENE, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pho_ner/pho_ner.py b/bigbio/biodatasets/pho_ner/pho_ner.py index 28f8829a..821cd0d0 100644 --- a/bigbio/biodatasets/pho_ner/pho_ner.py +++ b/bigbio/biodatasets/pho_ner/pho_ner.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [Tags.DISEASE, Tags.COVID] _LANGUAGES = [Lang.VI] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pico_extraction/pico_extraction.py b/bigbio/biodatasets/pico_extraction/pico_extraction.py index b0509261..7fba82ab 100644 --- a/bigbio/biodatasets/pico_extraction/pico_extraction.py +++ b/bigbio/biodatasets/pico_extraction/pico_extraction.py @@ -27,9 +27,10 @@ from bigbio.utils import schemas from 
bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.PICO] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pmc_patients/pmc_patients.py b/bigbio/biodatasets/pmc_patients/pmc_patients.py index b12a79ae..05823f40 100644 --- a/bigbio/biodatasets/pmc_patients/pmc_patients.py +++ b/bigbio/biodatasets/pmc_patients/pmc_patients.py @@ -27,9 +27,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/progene/progene.py b/bigbio/biodatasets/progene/progene.py index 49aec1db..3456fdd2 100644 --- a/bigbio/biodatasets/progene/progene.py +++ b/bigbio/biodatasets/progene/progene.py @@ -22,9 +22,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/psytar/psytar.py b/bigbio/biodatasets/psytar/psytar.py index 61a16aa6..fef90eed 100644 --- a/bigbio/biodatasets/psytar/psytar.py +++ b/bigbio/biodatasets/psytar/psytar.py @@ -51,9 +51,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DRUG, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/pubhealth/pubhealth.py b/bigbio/biodatasets/pubhealth/pubhealth.py index 
63c411bf..6d64352e 100644 --- a/bigbio/biodatasets/pubhealth/pubhealth.py +++ b/bigbio/biodatasets/pubhealth/pubhealth.py @@ -26,11 +26,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses logger = datasets.utils.logging.get_logger(__name__) +_TAGS = [Tags.FACT_CHECKING] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py index c0e0228f..7203b608 100644 --- a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py +++ b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py @@ -27,9 +27,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import BigBioValues, Lang, Tasks +from bigbio.utils.constants import BigBioValues, Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [Tags.YESNO, Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pubtator_central/pubtator_central.py b/bigbio/biodatasets/pubtator_central/pubtator_central.py index 972000e6..8ba512d3 100644 --- a/bigbio/biodatasets/pubtator_central/pubtator_central.py +++ b/bigbio/biodatasets/pubtator_central/pubtator_central.py @@ -48,9 +48,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CELL, Tags.SPECIES, Tags.VARIANT, Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/quaero/quaero.py b/bigbio/biodatasets/quaero/quaero.py index 09a8e059..4edc5c45 100644 --- a/bigbio/biodatasets/quaero/quaero.py +++ 
b/bigbio/biodatasets/quaero/quaero.py @@ -5,10 +5,19 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [ + Tags.CHEMICAL, + Tags.ANATOMY, + Tags.DRUG, + Tags.SPECIES, + Tags.PROCEDURE, + Tags.DISEASE, + Tags.DIAGNOSIS, +] _LANGUAGES = [Lang.FR] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_chemical/scai_chemical.py b/bigbio/biodatasets/scai_chemical/scai_chemical.py index 1abe0fb0..2935b9a0 100644 --- a/bigbio/biodatasets/scai_chemical/scai_chemical.py +++ b/bigbio/biodatasets/scai_chemical/scai_chemical.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_disease/scai_disease.py b/bigbio/biodatasets/scai_disease/scai_disease.py index 711e54b1..d4bdb3f9 100644 --- a/bigbio/biodatasets/scai_disease/scai_disease.py +++ b/bigbio/biodatasets/scai_disease/scai_disease.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scicite/scicite.py b/bigbio/biodatasets/scicite/scicite.py index 0fe74149..0626f7b7 100644 --- a/bigbio/biodatasets/scicite/scicite.py +++ b/bigbio/biodatasets/scicite/scicite.py @@ -37,9 +37,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig 
-from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.INTENT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scielo/scielo.py b/bigbio/biodatasets/scielo/scielo.py index 73aea998..44659df7 100644 --- a/bigbio/biodatasets/scielo/scielo.py +++ b/bigbio/biodatasets/scielo/scielo.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ES, Lang.PT] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scifact/scifact.py b/bigbio/biodatasets/scifact/scifact.py index 22065ec2..c537fcfb 100644 --- a/bigbio/biodatasets/scifact/scifact.py +++ b/bigbio/biodatasets/scifact/scifact.py @@ -22,9 +22,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.FACT_CHECKING] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/sciq/sciq.py b/bigbio/biodatasets/sciq/sciq.py index 2f23906d..eee43620 100644 --- a/bigbio/biodatasets/sciq/sciq.py +++ b/bigbio/biodatasets/sciq/sciq.py @@ -20,11 +20,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "sciq" +_TAGS = [Tags.MULTIPLE_CHOICE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scitail/scitail.py b/bigbio/biodatasets/scitail/scitail.py index b945bdaa..c5dcdca5 100644 --- a/bigbio/biodatasets/scitail/scitail.py +++ 
b/bigbio/biodatasets/scitail/scitail.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/seth_corpus/seth_corpus.py b/bigbio/biodatasets/seth_corpus/seth_corpus.py index 70d4c6d4..fbf5c754 100644 --- a/bigbio/biodatasets/seth_corpus/seth_corpus.py +++ b/bigbio/biodatasets/seth_corpus/seth_corpus.py @@ -28,9 +28,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py index 1cf5812a..3936b230 100644 --- a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py +++ b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py @@ -64,9 +64,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ADR, Tags.DRUG, Tags.NEGATION] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py index 9a1ff076..4ece98c1 100644 --- a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py +++ b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py @@ -38,11 +38,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "swedish_medical_ner" +_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.ANATOMY] _LANGUAGES = [Lang.SV] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py index 6e7c2484..d55c650a 100644 --- a/bigbio/biodatasets/thomas2011/thomas2011.py +++ b/bigbio/biodatasets/thomas2011/thomas2011.py @@ -49,10 +49,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense # TODO: Add BibTeX citation +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py index e2d59b74..93c910e8 100644 --- a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py +++ b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py index 8e766d02..a3518bbf 100644 --- a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py +++ b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py index 
c5b7d93d..197a33fc 100644 --- a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py +++ b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py @@ -22,7 +22,7 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _CITATION = """\ @@ -44,6 +44,7 @@ copyright = {Creative Commons Attribution 4.0 International} } """ +_TAGS = [Tags.VARIANT, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/twadrl/twadrl.py b/bigbio/biodatasets/twadrl/twadrl.py index d7308912..1735b2ec 100644 --- a/bigbio/biodatasets/twadrl/twadrl.py +++ b/bigbio/biodatasets/twadrl/twadrl.py @@ -21,11 +21,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "twadrl" +_TAGS = [Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/umnsrs/umnsrs.py b/bigbio/biodatasets/umnsrs/umnsrs.py index 6ec1416a..07f603e8 100644 --- a/bigbio/biodatasets/umnsrs/umnsrs.py +++ b/bigbio/biodatasets/umnsrs/umnsrs.py @@ -29,9 +29,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py index be5f625e..2464a95f 100644 --- a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py +++ b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py @@ -32,9 +32,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from 
bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.VARIANT, Tags.CANCER] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/utils/constants.py b/bigbio/utils/constants.py index 78a57416..8405bae3 100644 --- a/bigbio/utils/constants.py +++ b/bigbio/utils/constants.py @@ -6,9 +6,19 @@ from bigbio.utils import resources from bigbio.utils.license import Licenses -from bigbio.utils.schemas import (entailment_features, kb_features, - pairs_features, qa_features, - text2text_features, text_features) +from bigbio.utils.schemas import ( + entailment_features, + kb_features, + pairs_features, + qa_features, + text2text_features, + text_features, +) + + +_TAGS = json.loads(pkg_resources.read_text(resources, "tags.json")) +Tags = Enum("Tags", _TAGS) + BigBioValues = SimpleNamespace(NULL="") diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json new file mode 100644 index 00000000..dc3f93eb --- /dev/null +++ b/bigbio/utils/resources/tags.json @@ -0,0 +1,49 @@ +{ + "SOCIAL_MEDIA" : "Social media", + "ANATOMY" : "Anatomy", + "ORGAN" : "Organ", + "VARIANT" : "Variant/Mutation", + "TISSUE" : "Tissue", + "CELL" : "Cells and/or cell lines", + "SPECIES" : "Species", + "GENE" : "Gene, proteins, gene products, ...", + "DISEASE" : "Disease", + "DRUG" : "Drug", + "CHEMICAL" : "Chemical", + "COVID" : "Coronavirus disease 2019 (COVID-19)", + "LEXICAL" : "Lexical data (e.g. 
word, verbs,...)", + "YESNO" : "QA with yes no answer", + "HOW" : "`How` question", + "WHY" : "`Why` question", + "FACTOID" : "QA with factoid answer", + "FACTOID_LIST": "QA with list of factoid answer", + "ABSTRACTIVE" : "Abstractive summary/answer", + "EXTRACTIVE" : "Extractive summary/answer", + "CLOZE_TEST" : "Cloze test", + "GRANT" : "Grants data", + "PPI" : "Protein-protein interaction", + "MRC" : "Machine Reading Comprehension", + "MULTIPLE_CHOICE" : "QA with multiple choice", + "NEGATION" : "Negation", + "SPECULATION" : "Speculation", + "EPIGENETICS" : "Epigenetics", + "PART_OF" : "Part-of relations", + "CANCER" : "Cancer", + "PATHWAY_CURATION" : "Pathway curation", + "DOCUMENT_INDEXING" : "Document indexing", + "ADR" : "Adverse Drug Reaction", + "POS" : "Part of Speech Tagging", + "PICO" : "(P)articipants, (I)nterventions, and (O)utcomes", + "DDI" : "Drug-drug interaction", + "CONCEPT" : "Concept, Multi-word expression (MWE)", + "SENTENCE" : "Sentence", + "PROCEDURE" : "Procedure, treatment", + "DIALOGUE" : "Dialogue", + "ANONYMIZATION" : "Anonymization (De-identification)", + "SENTIMENT_ANALYSIS" : "Sentiment analysis", + "MIRNA" : "miRNA", + "ABBREVIATION" : "Abbreviation", + "FACT_CHECKING" : "Fact-checking", + "INTENT" : "Intent", + "DIAGNOSIS" : "DIAGNOSIS" +} diff --git a/examples/bc5cdr.py b/examples/bc5cdr.py index 111d7bee..0c94c3ad 100644 --- a/examples/bc5cdr.py +++ b/examples/bc5cdr.py @@ -31,10 +31,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.DISEASE, Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/bioasq_task_b.py b/examples/bioasq_task_b.py index da38146a..8be34588 100644 --- a/examples/bioasq_task_b.py +++ b/examples/bioasq_task_b.py @@
-32,9 +32,16 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [ + Tags.YESNO, + Tags.FACTOID, + Tags.FACTOID_LIST, + Tags.ABSTRACTIVE, + Tags.EXTRACTIVE, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/examples/biosses.py b/examples/biosses.py index 059a0306..91323949 100644 --- a/examples/biosses.py +++ b/examples/biosses.py @@ -28,11 +28,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses _DATASETNAME = "biosses" +_TAGS = [Tags.SENTENCE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/examples/chemprot.py b/examples/chemprot.py index 1db648c7..3a43c319 100644 --- a/examples/chemprot.py +++ b/examples/chemprot.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [Tags.CHEMICAL, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/hallmarks_of_cancer.py b/examples/hallmarks_of_cancer.py index ae8673b4..09f37794 100644 --- a/examples/hallmarks_of_cancer.py +++ b/examples/hallmarks_of_cancer.py @@ -19,9 +19,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [Tags.DISEASE, Tags.CANCER] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/mlee.py b/examples/mlee.py index 2f6b09dd..b98bf327 100644 --- a/examples/mlee.py +++ 
b/examples/mlee.py @@ -25,13 +25,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses _DATASETNAME = "mlee" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.GENE, Tags.DRUG, Tags.CELL, Tags.ORGAN, Tags.TISSUE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/mqp.py b/examples/mqp.py index b42cbd53..c9e122bc 100644 --- a/examples/mqp.py +++ b/examples/mqp.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/examples/muchmore.py b/examples/muchmore.py index da6bc743..6ce74b9f 100644 --- a/examples/muchmore.py +++ b/examples/muchmore.py @@ -73,9 +73,10 @@ # Buitelaar, Paul / Declerck, Thierry / Sacaleanu, Bogdan / Vintar, Spela / Raileanu, Diana / Crispi, Claudia: A Multi-Layered, XML-Based Approach to the Integration of Linguistic and Semantic Annotations. In: Proceedings of EACL 2003 Workshop on Language Technology and the Semantic Web (NLPXML’03), Budapest, Hungary, April 2003. 
from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [Tags.POS] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/n2c2_2011.py b/examples/n2c2_2011.py index 44328533..ca6f47e1 100644 --- a/examples/n2c2_2011.py +++ b/examples/n2c2_2011.py @@ -72,12 +72,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 +_TAGS = [Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/examples/nlmchem.py b/examples/nlmchem.py index 945461bf..6b943859 100644 --- a/examples/nlmchem.py +++ b/examples/nlmchem.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.CHEMICAL, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/paramed.py b/examples/paramed.py index 6791791e..518d7e62 100644 --- a/examples/paramed.py +++ b/examples/paramed.py @@ -1,7 +1,7 @@ -# coding=utf-8 +# coding=utf-8 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. # -# Licensed under the Apache License, Version 2.0 (the "License"); +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # @@ -26,12 +26,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses logger = datasets.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/examples/scitail.py b/examples/scitail.py index d7bf14dd..1be23c7c 100644 --- a/examples/scitail.py +++ b/examples/scitail.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/scripts/gather_dataset_tasks.py b/scripts/gather_dataset_tasks.py new file mode 100644 index 00000000..7523e8f4 --- /dev/null +++ b/scripts/gather_dataset_tasks.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Generate counts of tasks and fine-grained tasks +""" + +from bigbio.dataloader import BigBioConfigHelpers + + +def main(): + """ + Collect the unique (dataset_name, task) pairs over all config helpers and print the resulting set + """ + + configs = BigBioConfigHelpers() + + dataset_task = set() + + for conf in configs: + for task in conf.tasks: + # set.add() takes exactly one element, so the pair is stored as a single tuple + dataset_task.add((conf.dataset_name, str(task))) + + print(dataset_task) + + +if __name__ == "__main__": + main()