From 4aa3d84a12d56c2f0eb3ead1aebb0f0bc4f9992b Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Tue, 7 Jun 2022 18:43:16 +0200 Subject: [PATCH 01/20] add _TAGS --- bigbio/biodatasets/an_em/an_em.py | 3 ++- bigbio/biodatasets/anat_em/anat_em.py | 3 ++- bigbio/biodatasets/ask_a_patient/ask_a_patient.py | 3 ++- bigbio/biodatasets/bc5cdr/bc5cdr.py | 3 ++- bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py | 3 ++- bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py | 3 ++- bigbio/biodatasets/bio_simlex/bio_simlex.py | 3 ++- .../bioasq_2021_mesinesp/bioasq_2021_mesinesp.py | 3 ++- bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py | 3 ++- .../bioasq_task_c_2017/bioasq_task_c_2017.py | 3 ++- bigbio/biodatasets/bioinfer/bioinfer.py | 3 ++- .../biology_how_why_corpus/biology_how_why_corpus.py | 3 ++- bigbio/biodatasets/biomrc/biomrc.py | 3 ++- .../bionlp_shared_task_2009.py | 7 ++++++- .../bionlp_st_2011_epi/bionlp_st_2011_epi.py | 3 ++- .../bionlp_st_2011_ge/bionlp_st_2011_ge.py | 3 ++- .../bionlp_st_2011_id/bionlp_st_2011_id.py | 10 +++++++++- .../bionlp_st_2011_rel/bionlp_st_2011_rel.py | 3 ++- .../bionlp_st_2013_cg/bionlp_st_2013_cg.py | 12 +++++++++++- .../bionlp_st_2013_ge/bionlp_st_2013_ge.py | 3 ++- .../bionlp_st_2013_gro/bionlp_st_2013_gro.py | 3 ++- .../bionlp_st_2013_pc/bionlp_st_2013_pc.py | 3 ++- .../bionlp_st_2019_bb/bionlp_st_2019_bb.py | 3 ++- bigbio/biodatasets/biored/biored.py | 3 ++- bigbio/biodatasets/biorelex/biorelex.py | 3 ++- bigbio/biodatasets/bioscope/bioscope.py | 3 ++- bigbio/biodatasets/biosses/biosses.py | 3 ++- bigbio/biodatasets/cadec/cadec.py | 3 ++- bigbio/biodatasets/cantemist/cantemist.py | 3 ++- bigbio/biodatasets/cas/cas.py | 3 ++- bigbio/biodatasets/cellfinder/cellfinder.py | 3 ++- bigbio/biodatasets/chebi_nactem/chebi_nactem.py | 3 ++- bigbio/biodatasets/chemdner/chemdner.py | 3 ++- bigbio/biodatasets/chemprot/chemprot.py | 3 ++- bigbio/biodatasets/chia/chia.py | 3 ++- .../citation_gia_test_collection.py | 3 ++- 
bigbio/biodatasets/codiesp/codiesp.py | 3 ++- bigbio/biodatasets/cord_ner/cord_ner.py | 3 ++- bigbio/biodatasets/ctebmsp/ctebmsp.py | 3 ++- bigbio/biodatasets/ddi_corpus/ddi_corpus.py | 3 ++- .../biodatasets/diann_iber_eval/diann_iber_eval.py | 3 ++- bigbio/biodatasets/distemist/distemist.py | 3 ++- bigbio/biodatasets/ebm_pico/ebm_pico.py | 3 ++- bigbio/biodatasets/ehr_rel/ehr_rel.py | 3 ++- bigbio/biodatasets/essai/essai.py | 3 ++- bigbio/biodatasets/euadr/euadr.py | 3 ++- .../evidence_inference/evidence_inference.py | 3 ++- bigbio/biodatasets/gad/gad.py | 3 ++- bigbio/biodatasets/genetag/genetag.py | 3 ++- .../genia_ptm_event_corpus/genia_ptm_event_corpus.py | 3 ++- .../genia_relation_corpus/genia_relation_corpus.py | 3 ++- .../genia_term_corpus/genia_term_corpus.py | 3 ++- bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py | 3 ++- bigbio/biodatasets/gnormplus/gnormplus.py | 3 ++- .../hallmarks_of_cancer/hallmarks_of_cancer.py | 3 ++- bigbio/biodatasets/hprd50/hprd50.py | 3 ++- bigbio/biodatasets/iepa/iepa.py | 3 ++- bigbio/biodatasets/jnlpba/jnlpba.py | 3 ++- bigbio/biodatasets/linnaeus/linnaeus.py | 3 ++- bigbio/biodatasets/lll/lll.py | 1 + bigbio/biodatasets/mantra_gsc/mantra_gsc.py | 3 ++- bigbio/biodatasets/mayosrs/mayosrs.py | 3 ++- bigbio/biodatasets/med_qa/med_qa.py | 3 ++- bigbio/biodatasets/medal/medal.py | 3 ++- bigbio/biodatasets/meddialog/meddialog.py | 3 ++- bigbio/biodatasets/meddocan/meddocan.py | 3 ++- bigbio/biodatasets/medhop/medhop.py | 3 ++- bigbio/biodatasets/medical_data/medical_data.py | 3 ++- bigbio/biodatasets/mediqa_nli/mediqa_nli.py | 3 ++- bigbio/biodatasets/mediqa_qa/mediqa_qa.py | 3 ++- bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py | 3 ++- bigbio/biodatasets/medmentions/medmentions.py | 3 ++- bigbio/biodatasets/mednli/mednli.py | 3 ++- bigbio/biodatasets/meqsum/meqsum.py | 3 ++- bigbio/biodatasets/minimayosrs/minimayosrs.py | 3 ++- bigbio/biodatasets/mirna/mirna.py | 3 ++- bigbio/biodatasets/mlee/mlee.py | 3 ++- bigbio/biodatasets/mqp/mqp.py 
| 3 ++- bigbio/biodatasets/msh_wsd/msh_wsd.py | 3 ++- bigbio/biodatasets/muchmore/muchmore.py | 3 ++- bigbio/biodatasets/multi_xscience/multi_xscience.py | 3 ++- .../biodatasets/mutation_finder/mutation_finder.py | 3 ++- bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py | 3 ++- .../n2c2_2006_smokers/n2c2_2006_smokers.py | 3 ++- bigbio/biodatasets/n2c2_2008/n2c2_2008.py | 3 ++- bigbio/biodatasets/n2c2_2009/n2c2_2009.py | 3 ++- bigbio/biodatasets/n2c2_2010/n2c2_2010.py | 3 ++- bigbio/biodatasets/n2c2_2011/n2c2_2011.py | 3 ++- bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py | 3 ++- .../n2c2_2014_risk_factors/n2c2_2014_risk_factors.py | 3 ++- .../biodatasets/n2c2_2018_track1/n2c2_2018_track1.py | 3 ++- .../biodatasets/n2c2_2018_track2/n2c2_2018_track2.py | 3 ++- bigbio/biodatasets/nagel/nagel.py | 3 ++- bigbio/biodatasets/ncbi_disease/ncbi_disease.py | 3 ++- bigbio/biodatasets/nlm_gene/nlm_gene.py | 3 ++- bigbio/biodatasets/nlm_wsd/nlm_wsd.py | 3 ++- bigbio/biodatasets/nlmchem/nlmchem.py | 3 ++- .../biodatasets/ntcir_13_medweb/ntcir_13_medweb.py | 3 ++- bigbio/biodatasets/osiris/osiris.py | 3 ++- bigbio/biodatasets/paramed/paramed.py | 3 ++- bigbio/biodatasets/pcr/pcr.py | 3 ++- bigbio/biodatasets/pdr/pdr.py | 3 ++- bigbio/biodatasets/pharmaconer/pharmaconer.py | 3 ++- bigbio/biodatasets/pho_ner/pho_ner.py | 3 ++- .../biodatasets/pico_extraction/pico_extraction.py | 3 ++- bigbio/biodatasets/pmc_patients/pmc_patients.py | 3 ++- bigbio/biodatasets/progene/progene.py | 3 ++- bigbio/biodatasets/psytar/psytar.py | 3 ++- bigbio/biodatasets/pubhealth/pubhealth.py | 3 ++- bigbio/biodatasets/pubmed_qa/pubmed_qa.py | 1 + .../biodatasets/pubtator_central/pubtator_central.py | 3 ++- bigbio/biodatasets/quaero/quaero.py | 3 ++- bigbio/biodatasets/scai_chemical/scai_chemical.py | 3 ++- bigbio/biodatasets/scai_disease/scai_disease.py | 3 ++- bigbio/biodatasets/scicite/scicite.py | 3 ++- bigbio/biodatasets/scielo/scielo.py | 3 ++- bigbio/biodatasets/scifact/scifact.py | 3 ++- 
bigbio/biodatasets/sciq/sciq.py | 3 ++- bigbio/biodatasets/scitail/scitail.py | 3 ++- bigbio/biodatasets/seth_corpus/seth_corpus.py | 3 ++- bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py | 3 ++- .../swedish_medical_ner/swedish_medical_ner.py | 3 ++- bigbio/biodatasets/thomas2011/thomas2011.py | 3 ++- bigbio/biodatasets/tmvar_v1/tmvar_v1.py | 3 ++- bigbio/biodatasets/tmvar_v2/tmvar_v2.py | 3 ++- bigbio/biodatasets/tmvar_v3/tmvar_v3.py | 3 ++- bigbio/biodatasets/twadrl/twadrl.py | 3 ++- bigbio/biodatasets/umnsrs/umnsrs.py | 3 ++- bigbio/biodatasets/verspoor_2013/verspoor_2013.py | 3 ++- 129 files changed, 276 insertions(+), 127 deletions(-) diff --git a/bigbio/biodatasets/an_em/an_em.py b/bigbio/biodatasets/an_em/an_em.py index f3460349..4d956684 100644 --- a/bigbio/biodatasets/an_em/an_em.py +++ b/bigbio/biodatasets/an_em/an_em.py @@ -29,9 +29,10 @@ import bigbio.utils.parsing as parse from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ANATOMY] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/anat_em/anat_em.py b/bigbio/biodatasets/anat_em/anat_em.py index c74125c2..c58f6fb1 100644 --- a/bigbio/biodatasets/anat_em/anat_em.py +++ b/bigbio/biodatasets/anat_em/anat_em.py @@ -27,9 +27,10 @@ import bigbio.utils.parsing as parsing from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ANATOMY] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py index dee74515..0b4eeffe 100644 --- a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py +++ 
b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py @@ -21,11 +21,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "ask_a_patient" +_TAGS = [Tags.SOCIAL_MEDIA] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bc5cdr/bc5cdr.py b/bigbio/biodatasets/bc5cdr/bc5cdr.py index 47af693c..5e729b27 100644 --- a/bigbio/biodatasets/bc5cdr/bc5cdr.py +++ b/bigbio/biodatasets/bc5cdr/bc5cdr.py @@ -31,10 +31,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.CHEMICAL_DISEASE_RELATION, Tags.MESH] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py b/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py index 2e9ca9e9..50543a18 100644 --- a/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py +++ b/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.COVID] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py b/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py index 05db39fd..afab0059 100644 --- a/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py +++ b/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py @@ -27,10 +27,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from 
bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.LEXICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bio_simlex/bio_simlex.py b/bigbio/biodatasets/bio_simlex/bio_simlex.py index 6b8fc6f8..2a9cecea 100644 --- a/bigbio/biodatasets/bio_simlex/bio_simlex.py +++ b/bigbio/biodatasets/bio_simlex/bio_simlex.py @@ -27,10 +27,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.LEXICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py index 4672c3f5..680de353 100644 --- a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py +++ b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py @@ -51,9 +51,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DECS] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py index f5668647..b17ed382 100644 --- a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py +++ b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py @@ -32,9 +32,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] 
_PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py b/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py index 8012f380..2aeeb729 100644 --- a/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py +++ b/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GRANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/bioinfer/bioinfer.py b/bigbio/biodatasets/bioinfer/bioinfer.py index 8a71bbf5..dd1a7cfd 100644 --- a/bigbio/biodatasets/bioinfer/bioinfer.py +++ b/bigbio/biodatasets/bioinfer/bioinfer.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.PPI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py index 75117290..41e8cca7 100644 --- a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py +++ b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.QA_HOW, Tags.QA_WHY] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/biomrc/biomrc.py b/bigbio/biodatasets/biomrc/biomrc.py index a80f0955..df849298 100644 --- 
a/bigbio/biodatasets/biomrc/biomrc.py +++ b/bigbio/biodatasets/biomrc/biomrc.py @@ -31,9 +31,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.QA_MULTIPLE_CHOICE, Tags.MRC, Tags.QA_CLOZE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py b/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py index 1f32a25d..4e330313 100644 --- a/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py +++ b/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py @@ -21,10 +21,15 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import brat_parse_to_bigbio_kb, parse_brat_file +# http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=4605&copyownerid=320 +# Task 1. Event detection and characterization +# Task 2. Event argument recognition +# Task 3. 
Recognition of negations and speculations +_TAGS = [Tags.PPI, Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py b/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py index 4c2d5991..7a6ea0ab 100644 --- a/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py +++ b/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py @@ -21,13 +21,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_epi" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.EPIGENETICS, Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py b/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py index 112c03a4..3eab0c71 100644 --- a/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py +++ b/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_ge" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py index 1d640ac3..c5e0734d 100644 --- a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py +++ b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py @@ -20,13 +20,21 @@ from 
bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_id" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [ + Tags.DISEASE, + Tags.GENE, + Tags.CHEMICAL, + Tags.ORGANISM, + Tags.SPECULATION, + Tags.NEGATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py b/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py index d6539fbb..92a9c3b2 100644 --- a/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py +++ b/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_rel" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.PART_OF, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py index f99326ec..a72d0386 100644 --- a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py +++ b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py @@ -20,12 +20,22 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_cg" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [ + Tags.DISEASE, + Tags.CANCER, + Tags.TISSUE, + Tags.ORGANISM, + Tags.CELL, + Tags.GENE, + Tags.CHEMICAL, + Tags.PATHWAY, +] _LANGUAGES = [Lang.EN] _PUBMED = 
True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py b/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py index 93dfa58f..74a76bde 100644 --- a/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py +++ b/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_ge" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py index 277dfcec..1241b22c 100644 --- a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py +++ b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py @@ -21,13 +21,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_gro" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.GENE, Tags.ORGANISM, Tags.CELL, Tags.TISSUE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py index 69fd79f9..f685ff3e 100644 --- a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py +++ b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py @@ -20,12 +20,13 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants 
import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_pc" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.PATHWAY, Tags.NEGATION, Tags.SPECULATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py index 026c8337..8d464b85 100644 --- a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py +++ b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2019_bb" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.ORGANISM] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biored/biored.py b/bigbio/biodatasets/biored/biored.py index 250ce837..b45bdacd 100644 --- a/bigbio/biodatasets/biored/biored.py +++ b/bigbio/biodatasets/biored/biored.py @@ -26,10 +26,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.VARIANT, Tags.PPI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biorelex/biorelex.py b/bigbio/biodatasets/biorelex/biorelex.py index f6dac279..1b1d2a12 100644 --- a/bigbio/biodatasets/biorelex/biorelex.py +++ b/bigbio/biodatasets/biorelex/biorelex.py @@ -35,10 +35,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, 
Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.VARIANT, Tags.NEGATION, Tags.SPECULATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bioscope/bioscope.py b/bigbio/biodatasets/bioscope/bioscope.py index 5af2077a..9e7d2e22 100644 --- a/bigbio/biodatasets/bioscope/bioscope.py +++ b/bigbio/biodatasets/bioscope/bioscope.py @@ -35,9 +35,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.NEGATION, Tags.SPECULATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biosses/biosses.py b/bigbio/biodatasets/biosses/biosses.py index 059a0306..a55a313c 100644 --- a/bigbio/biodatasets/biosses/biosses.py +++ b/bigbio/biodatasets/biosses/biosses.py @@ -28,11 +28,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "biosses" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cadec/cadec.py b/bigbio/biodatasets/cadec/cadec.py index 3eb3f6da..13784fd6 100644 --- a/bigbio/biodatasets/cadec/cadec.py +++ b/bigbio/biodatasets/cadec/cadec.py @@ -35,9 +35,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cantemist/cantemist.py b/bigbio/biodatasets/cantemist/cantemist.py index 6a140d2a..9d0c9d89 100644 --- 
a/bigbio/biodatasets/cantemist/cantemist.py +++ b/bigbio/biodatasets/cantemist/cantemist.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cas/cas.py b/bigbio/biodatasets/cas/cas.py index d563be29..6c421ca8 100644 --- a/bigbio/biodatasets/cas/cas.py +++ b/bigbio/biodatasets/cas/cas.py @@ -6,9 +6,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/cellfinder/cellfinder.py b/bigbio/biodatasets/cellfinder/cellfinder.py index 935a919c..9987ee5f 100644 --- a/bigbio/biodatasets/cellfinder/cellfinder.py +++ b/bigbio/biodatasets/cellfinder/cellfinder.py @@ -28,9 +28,10 @@ import bigbio.utils.parsing as parsing from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py index c6e96c6f..b7edd94f 100644 --- a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py +++ b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py @@ -21,10 +21,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing 
import parse_brat_file +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemdner/chemdner.py b/bigbio/biodatasets/chemdner/chemdner.py index 7b0b974f..c1ec7c88 100644 --- a/bigbio/biodatasets/chemdner/chemdner.py +++ b/bigbio/biodatasets/chemdner/chemdner.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemprot/chemprot.py b/bigbio/biodatasets/chemprot/chemprot.py index 620a1a44..c91d5aa8 100644 --- a/bigbio/biodatasets/chemprot/chemprot.py +++ b/bigbio/biodatasets/chemprot/chemprot.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chia/chia.py b/bigbio/biodatasets/chia/chia.py index 2328a459..cc1b3d7a 100644 --- a/bigbio/biodatasets/chia/chia.py +++ b/bigbio/biodatasets/chia/chia.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py index 0713a87f..63efad00 100644 --- 
a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py +++ b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py @@ -24,9 +24,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/codiesp/codiesp.py b/bigbio/biodatasets/codiesp/codiesp.py index 1cede622..aea9c786 100644 --- a/bigbio/biodatasets/codiesp/codiesp.py +++ b/bigbio/biodatasets/codiesp/codiesp.py @@ -35,9 +35,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cord_ner/cord_ner.py b/bigbio/biodatasets/cord_ner/cord_ner.py index 8724cf64..38f956da 100644 --- a/bigbio/biodatasets/cord_ner/cord_ner.py +++ b/bigbio/biodatasets/cord_ner/cord_ner.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ctebmsp/ctebmsp.py b/bigbio/biodatasets/ctebmsp/ctebmsp.py index 92ca3519..42c23ef2 100644 --- a/bigbio/biodatasets/ctebmsp/ctebmsp.py +++ b/bigbio/biodatasets/ctebmsp/ctebmsp.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] 
_LANGUAGES = [Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py index 4d8fb893..970cdbb6 100644 --- a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py +++ b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py @@ -27,9 +27,10 @@ import bigbio.utils.parsing as parsing from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py index a9f4a927..9ae95846 100644 --- a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py +++ b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py @@ -27,9 +27,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/distemist/distemist.py b/bigbio/biodatasets/distemist/distemist.py index b9dfaf5d..798c568a 100644 --- a/bigbio/biodatasets/distemist/distemist.py +++ b/bigbio/biodatasets/distemist/distemist.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/ebm_pico/ebm_pico.py b/bigbio/biodatasets/ebm_pico/ebm_pico.py index 5e7078e0..f20a3379 100644 --- a/bigbio/biodatasets/ebm_pico/ebm_pico.py +++ b/bigbio/biodatasets/ebm_pico/ebm_pico.py @@ -26,9 
+26,10 @@ import datasets from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ehr_rel/ehr_rel.py b/bigbio/biodatasets/ehr_rel/ehr_rel.py index 90235ee4..2ad2f965 100644 --- a/bigbio/biodatasets/ehr_rel/ehr_rel.py +++ b/bigbio/biodatasets/ehr_rel/ehr_rel.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/essai/essai.py b/bigbio/biodatasets/essai/essai.py index 275aa115..289055a6 100644 --- a/bigbio/biodatasets/essai/essai.py +++ b/bigbio/biodatasets/essai/essai.py @@ -6,9 +6,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/euadr/euadr.py b/bigbio/biodatasets/euadr/euadr.py index 35b27664..e68a1feb 100644 --- a/bigbio/biodatasets/euadr/euadr.py +++ b/bigbio/biodatasets/euadr/euadr.py @@ -4,9 +4,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/evidence_inference/evidence_inference.py b/bigbio/biodatasets/evidence_inference/evidence_inference.py index 83fd2ca7..e21ce4f4 100644 
--- a/bigbio/biodatasets/evidence_inference/evidence_inference.py +++ b/bigbio/biodatasets/evidence_inference/evidence_inference.py @@ -32,9 +32,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/gad/gad.py b/bigbio/biodatasets/gad/gad.py index 4a9286ce..d12e7b5b 100644 --- a/bigbio/biodatasets/gad/gad.py +++ b/bigbio/biodatasets/gad/gad.py @@ -6,13 +6,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "gad" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = False _CITATION = """\ diff --git a/bigbio/biodatasets/genetag/genetag.py b/bigbio/biodatasets/genetag/genetag.py index e53b4918..bfe13bf5 100644 --- a/bigbio/biodatasets/genetag/genetag.py +++ b/bigbio/biodatasets/genetag/genetag.py @@ -29,9 +29,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py b/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py index 0e3f2536..ed874166 100644 --- a/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py +++ b/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py @@ -29,9 +29,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants 
import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = False _CITATION = """\ diff --git a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py index f0a73059..81c83368 100644 --- a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py +++ b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py index 0ae321ce..7516e830 100644 --- a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py +++ b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py b/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py index 1954035f..5b37531a 100644 --- a/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py +++ b/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git 
a/bigbio/biodatasets/gnormplus/gnormplus.py b/bigbio/biodatasets/gnormplus/gnormplus.py index 7fd0e750..28d16d36 100644 --- a/bigbio/biodatasets/gnormplus/gnormplus.py +++ b/bigbio/biodatasets/gnormplus/gnormplus.py @@ -23,10 +23,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py index 83d19030..73439fe0 100644 --- a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py +++ b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py @@ -18,9 +18,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/hprd50/hprd50.py b/bigbio/biodatasets/hprd50/hprd50.py index 91b18470..63de6001 100644 --- a/bigbio/biodatasets/hprd50/hprd50.py +++ b/bigbio/biodatasets/hprd50/hprd50.py @@ -38,10 +38,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/iepa/iepa.py b/bigbio/biodatasets/iepa/iepa.py index 5efffd9f..15789356 100644 --- a/bigbio/biodatasets/iepa/iepa.py +++ b/bigbio/biodatasets/iepa/iepa.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from 
bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/jnlpba/jnlpba.py b/bigbio/biodatasets/jnlpba/jnlpba.py index d163c385..a10a4298 100644 --- a/bigbio/biodatasets/jnlpba/jnlpba.py +++ b/bigbio/biodatasets/jnlpba/jnlpba.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/linnaeus/linnaeus.py b/bigbio/biodatasets/linnaeus/linnaeus.py index 14c1b6ef..4a079d1d 100644 --- a/bigbio/biodatasets/linnaeus/linnaeus.py +++ b/bigbio/biodatasets/linnaeus/linnaeus.py @@ -32,9 +32,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/lll/lll.py b/bigbio/biodatasets/lll/lll.py index 34259f12..ccc4eca8 100644 --- a/bigbio/biodatasets/lll/lll.py +++ b/bigbio/biodatasets/lll/lll.py @@ -39,6 +39,7 @@ from bigbio.utils.constants import BigBioValues, Lang, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py index e014f006..0db20bd5 100644 --- a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py +++ b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py @@ -22,9 +22,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from 
bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.FR, Lang.DE, Lang.NL, Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mayosrs/mayosrs.py b/bigbio/biodatasets/mayosrs/mayosrs.py index 033a93b8..160a6666 100644 --- a/bigbio/biodatasets/mayosrs/mayosrs.py +++ b/bigbio/biodatasets/mayosrs/mayosrs.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/med_qa/med_qa.py b/bigbio/biodatasets/med_qa/med_qa.py index e83b70e9..4cdbc1d9 100644 --- a/bigbio/biodatasets/med_qa/med_qa.py +++ b/bigbio/biodatasets/med_qa/med_qa.py @@ -29,9 +29,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medal/medal.py b/bigbio/biodatasets/medal/medal.py index 2766f97b..03df40fc 100644 --- a/bigbio/biodatasets/medal/medal.py +++ b/bigbio/biodatasets/medal/medal.py @@ -26,11 +26,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses logger = datasets.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/meddialog/meddialog.py b/bigbio/biodatasets/meddialog/meddialog.py index 90e77e55..4d0e95b6 100644 --- a/bigbio/biodatasets/meddialog/meddialog.py 
+++ b/bigbio/biodatasets/meddialog/meddialog.py @@ -20,11 +20,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "meddialog" +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/meddocan/meddocan.py b/bigbio/biodatasets/meddocan/meddocan.py index a2e66d64..e1fb393d 100644 --- a/bigbio/biodatasets/meddocan/meddocan.py +++ b/bigbio/biodatasets/meddocan/meddocan.py @@ -29,9 +29,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medhop/medhop.py b/bigbio/biodatasets/medhop/medhop.py index 1b6012a7..96c92639 100644 --- a/bigbio/biodatasets/medhop/medhop.py +++ b/bigbio/biodatasets/medhop/medhop.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/medical_data/medical_data.py b/bigbio/biodatasets/medical_data/medical_data.py index 98632452..80ddfdef 100644 --- a/bigbio/biodatasets/medical_data/medical_data.py +++ b/bigbio/biodatasets/medical_data/medical_data.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ 
diff --git a/bigbio/biodatasets/mediqa_nli/mediqa_nli.py b/bigbio/biodatasets/mediqa_nli/mediqa_nli.py index 3b82f39f..153df024 100644 --- a/bigbio/biodatasets/mediqa_nli/mediqa_nli.py +++ b/bigbio/biodatasets/mediqa_nli/mediqa_nli.py @@ -44,9 +44,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py index 0e85d926..1c26254e 100644 --- a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py +++ b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py b/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py index 9b9fe79e..ad61f531 100644 --- a/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py +++ b/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medmentions/medmentions.py b/bigbio/biodatasets/medmentions/medmentions.py index a1e8e2d9..a1322f7e 100644 --- a/bigbio/biodatasets/medmentions/medmentions.py +++ b/bigbio/biodatasets/medmentions/medmentions.py @@ -43,9 +43,10 @@ from bigbio.utils import 
schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mednli/mednli.py b/bigbio/biodatasets/mednli/mednli.py index 5e6c8cac..4488852f 100644 --- a/bigbio/biodatasets/mednli/mednli.py +++ b/bigbio/biodatasets/mednli/mednli.py @@ -42,9 +42,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/meqsum/meqsum.py b/bigbio/biodatasets/meqsum/meqsum.py index 684877dd..21fe7f58 100644 --- a/bigbio/biodatasets/meqsum/meqsum.py +++ b/bigbio/biodatasets/meqsum/meqsum.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/minimayosrs/minimayosrs.py b/bigbio/biodatasets/minimayosrs/minimayosrs.py index 1169fa67..f8f095bb 100644 --- a/bigbio/biodatasets/minimayosrs/minimayosrs.py +++ b/bigbio/biodatasets/minimayosrs/minimayosrs.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mirna/mirna.py b/bigbio/biodatasets/mirna/mirna.py index 2b128f21..aa7e7279 100644 --- a/bigbio/biodatasets/mirna/mirna.py +++ 
b/bigbio/biodatasets/mirna/mirna.py @@ -19,9 +19,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mlee/mlee.py b/bigbio/biodatasets/mlee/mlee.py index d4e3db09..47807962 100644 --- a/bigbio/biodatasets/mlee/mlee.py +++ b/bigbio/biodatasets/mlee/mlee.py @@ -25,13 +25,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "mlee" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mqp/mqp.py b/bigbio/biodatasets/mqp/mqp.py index 6adf36a9..f1d47b1f 100644 --- a/bigbio/biodatasets/mqp/mqp.py +++ b/bigbio/biodatasets/mqp/mqp.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/msh_wsd/msh_wsd.py b/bigbio/biodatasets/msh_wsd/msh_wsd.py index 59525ce3..2195106a 100644 --- a/bigbio/biodatasets/msh_wsd/msh_wsd.py +++ b/bigbio/biodatasets/msh_wsd/msh_wsd.py @@ -40,9 +40,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/muchmore/muchmore.py 
b/bigbio/biodatasets/muchmore/muchmore.py index f744477f..3ae9d047 100644 --- a/bigbio/biodatasets/muchmore/muchmore.py +++ b/bigbio/biodatasets/muchmore/muchmore.py @@ -73,9 +73,10 @@ # Buitelaar, Paul / Declerck, Thierry / Sacaleanu, Bogdan / Vintar, Spela / Raileanu, Diana / Crispi, Claudia: A Multi-Layered, XML-Based Approach to the Integration of Linguistic and Semantic Annotations. In: Proceedings of EACL 2003 Workshop on Language Technology and the Semantic Web (NLPXML’03), Budapest, Hungary, April 2003. from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.DE] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/multi_xscience/multi_xscience.py b/bigbio/biodatasets/multi_xscience/multi_xscience.py index ab8c55b6..6be1347b 100644 --- a/bigbio/biodatasets/multi_xscience/multi_xscience.py +++ b/bigbio/biodatasets/multi_xscience/multi_xscience.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mutation_finder/mutation_finder.py b/bigbio/biodatasets/mutation_finder/mutation_finder.py index 277d5db4..5dc113a7 100644 --- a/bigbio/biodatasets/mutation_finder/mutation_finder.py +++ b/bigbio/biodatasets/mutation_finder/mutation_finder.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git 
a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py index f3cac12f..9144f25f 100644 --- a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py +++ b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py @@ -65,12 +65,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/14/5/550/720189 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py index 68840046..6e0fc920 100644 --- a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py +++ b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py @@ -63,12 +63,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/15/1/14/779738 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py index 0167def0..4b3054ac 100644 --- a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py +++ b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py @@ -71,12 +71,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2008" # https://academic.oup.com/jamia/article/16/4/561/766997 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git 
a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py index 3d9328a9..88f1e60c 100644 --- a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py +++ b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py @@ -57,9 +57,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py index 277081cf..549ac121 100644 --- a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py +++ b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py @@ -52,9 +52,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py index 44328533..67fc5e68 100644 --- a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py +++ b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py @@ -72,12 +72,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py b/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py index 1e3992a1..75f972cb 100644 --- a/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py +++ b/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py @@ -59,9 +59,10 @@ from bigbio.utils import schemas from 
bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py b/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py index 524a48fc..fec27a82 100644 --- a/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py +++ b/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py @@ -59,9 +59,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py index 27d0f5ae..59411a29 100644 --- a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py +++ b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py @@ -43,9 +43,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py index ff26a9eb..13ddc19b 100644 --- a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py +++ b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py @@ -46,9 +46,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import 
Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/nagel/nagel.py b/bigbio/biodatasets/nagel/nagel.py index 260224c6..fd8a05f6 100644 --- a/bigbio/biodatasets/nagel/nagel.py +++ b/bigbio/biodatasets/nagel/nagel.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py index 4d85e9ac..1efee20e 100644 --- a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py +++ b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_gene/nlm_gene.py b/bigbio/biodatasets/nlm_gene/nlm_gene.py index 2d7e1a4b..1a6c0e06 100644 --- a/bigbio/biodatasets/nlm_gene/nlm_gene.py +++ b/bigbio/biodatasets/nlm_gene/nlm_gene.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py index 01620230..7437d8df 100644 --- a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py +++ b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py @@ -53,9 +53,10 @@ from bigbio.utils import schemas 
from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/nlmchem/nlmchem.py b/bigbio/biodatasets/nlmchem/nlmchem.py index ec83fe2e..e816e378 100644 --- a/bigbio/biodatasets/nlmchem/nlmchem.py +++ b/bigbio/biodatasets/nlmchem/nlmchem.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py index 7066df6e..ff873473 100644 --- a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py +++ b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py @@ -63,9 +63,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH, Lang.JA] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/osiris/osiris.py b/bigbio/biodatasets/osiris/osiris.py index 3929ca5d..b8326256 100644 --- a/bigbio/biodatasets/osiris/osiris.py +++ b/bigbio/biodatasets/osiris/osiris.py @@ -24,9 +24,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git 
a/bigbio/biodatasets/paramed/paramed.py b/bigbio/biodatasets/paramed/paramed.py index 6791791e..50966a93 100644 --- a/bigbio/biodatasets/paramed/paramed.py +++ b/bigbio/biodatasets/paramed/paramed.py @@ -26,12 +26,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses logger = datasets.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pcr/pcr.py b/bigbio/biodatasets/pcr/pcr.py index e2e10566..28e3987e 100644 --- a/bigbio/biodatasets/pcr/pcr.py +++ b/bigbio/biodatasets/pcr/pcr.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pdr/pdr.py b/bigbio/biodatasets/pdr/pdr.py index a41255e6..1c7bb9f7 100644 --- a/bigbio/biodatasets/pdr/pdr.py +++ b/bigbio/biodatasets/pdr/pdr.py @@ -28,9 +28,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pharmaconer/pharmaconer.py b/bigbio/biodatasets/pharmaconer/pharmaconer.py index 61a28ab9..f20fd87f 100644 --- a/bigbio/biodatasets/pharmaconer/pharmaconer.py +++ b/bigbio/biodatasets/pharmaconer/pharmaconer.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from 
bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pho_ner/pho_ner.py b/bigbio/biodatasets/pho_ner/pho_ner.py index 28f8829a..4ae3852d 100644 --- a/bigbio/biodatasets/pho_ner/pho_ner.py +++ b/bigbio/biodatasets/pho_ner/pho_ner.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.VI] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pico_extraction/pico_extraction.py b/bigbio/biodatasets/pico_extraction/pico_extraction.py index b0509261..ab4c36f2 100644 --- a/bigbio/biodatasets/pico_extraction/pico_extraction.py +++ b/bigbio/biodatasets/pico_extraction/pico_extraction.py @@ -27,9 +27,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pmc_patients/pmc_patients.py b/bigbio/biodatasets/pmc_patients/pmc_patients.py index b12a79ae..05823f40 100644 --- a/bigbio/biodatasets/pmc_patients/pmc_patients.py +++ b/bigbio/biodatasets/pmc_patients/pmc_patients.py @@ -27,9 +27,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/progene/progene.py b/bigbio/biodatasets/progene/progene.py index 49aec1db..f1ce6223 100644 --- 
a/bigbio/biodatasets/progene/progene.py +++ b/bigbio/biodatasets/progene/progene.py @@ -22,9 +22,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/psytar/psytar.py b/bigbio/biodatasets/psytar/psytar.py index 61a16aa6..e0931739 100644 --- a/bigbio/biodatasets/psytar/psytar.py +++ b/bigbio/biodatasets/psytar/psytar.py @@ -51,9 +51,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/pubhealth/pubhealth.py b/bigbio/biodatasets/pubhealth/pubhealth.py index 63c411bf..5320c16e 100644 --- a/bigbio/biodatasets/pubhealth/pubhealth.py +++ b/bigbio/biodatasets/pubhealth/pubhealth.py @@ -26,11 +26,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses logger = datasets.utils.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py index c0e0228f..4bdf1506 100644 --- a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py +++ b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py @@ -30,6 +30,7 @@ from bigbio.utils.constants import BigBioValues, Lang, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pubtator_central/pubtator_central.py 
b/bigbio/biodatasets/pubtator_central/pubtator_central.py index 972000e6..50048a96 100644 --- a/bigbio/biodatasets/pubtator_central/pubtator_central.py +++ b/bigbio/biodatasets/pubtator_central/pubtator_central.py @@ -48,9 +48,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/quaero/quaero.py b/bigbio/biodatasets/quaero/quaero.py index 09a8e059..29558a11 100644 --- a/bigbio/biodatasets/quaero/quaero.py +++ b/bigbio/biodatasets/quaero/quaero.py @@ -5,10 +5,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.FR] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_chemical/scai_chemical.py b/bigbio/biodatasets/scai_chemical/scai_chemical.py index 1abe0fb0..e3c4ef80 100644 --- a/bigbio/biodatasets/scai_chemical/scai_chemical.py +++ b/bigbio/biodatasets/scai_chemical/scai_chemical.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_disease/scai_disease.py b/bigbio/biodatasets/scai_disease/scai_disease.py index 711e54b1..4b7905d9 100644 --- a/bigbio/biodatasets/scai_disease/scai_disease.py +++ b/bigbio/biodatasets/scai_disease/scai_disease.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs 
import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scicite/scicite.py b/bigbio/biodatasets/scicite/scicite.py index 0fe74149..3a0f3284 100644 --- a/bigbio/biodatasets/scicite/scicite.py +++ b/bigbio/biodatasets/scicite/scicite.py @@ -37,9 +37,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scielo/scielo.py b/bigbio/biodatasets/scielo/scielo.py index 73aea998..44659df7 100644 --- a/bigbio/biodatasets/scielo/scielo.py +++ b/bigbio/biodatasets/scielo/scielo.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ES, Lang.PT] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scifact/scifact.py b/bigbio/biodatasets/scifact/scifact.py index 22065ec2..14a48e21 100644 --- a/bigbio/biodatasets/scifact/scifact.py +++ b/bigbio/biodatasets/scifact/scifact.py @@ -22,9 +22,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/sciq/sciq.py b/bigbio/biodatasets/sciq/sciq.py index 2f23906d..54f44e62 100644 --- a/bigbio/biodatasets/sciq/sciq.py +++ b/bigbio/biodatasets/sciq/sciq.py @@ -20,11 +20,12 
@@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "sciq" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scitail/scitail.py b/bigbio/biodatasets/scitail/scitail.py index b945bdaa..c5dcdca5 100644 --- a/bigbio/biodatasets/scitail/scitail.py +++ b/bigbio/biodatasets/scitail/scitail.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/seth_corpus/seth_corpus.py b/bigbio/biodatasets/seth_corpus/seth_corpus.py index 70d4c6d4..82b84b53 100644 --- a/bigbio/biodatasets/seth_corpus/seth_corpus.py +++ b/bigbio/biodatasets/seth_corpus/seth_corpus.py @@ -28,9 +28,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py index 1cf5812a..a6b16123 100644 --- a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py +++ b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py @@ -64,9 +64,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git 
a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py index 9a1ff076..1ec26aca 100644 --- a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py +++ b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py @@ -38,11 +38,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "swedish_medical_ner" +_TAGS = [] _LANGUAGES = [Lang.SV] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py index 6e7c2484..d2747c4a 100644 --- a/bigbio/biodatasets/thomas2011/thomas2011.py +++ b/bigbio/biodatasets/thomas2011/thomas2011.py @@ -49,10 +49,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense # TODO: Add BibTeX citation +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py index e2d59b74..f6cb22e5 100644 --- a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py +++ b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py index 8e766d02..b522524b 100644 --- a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py +++ b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py @@ -23,9 +23,10 @@ from bigbio.utils 
import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py index c5b7d93d..1e2bb9dd 100644 --- a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py +++ b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py @@ -22,7 +22,7 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _CITATION = """\ @@ -44,6 +44,7 @@ copyright = {Creative Commons Attribution 4.0 International} } """ +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/twadrl/twadrl.py b/bigbio/biodatasets/twadrl/twadrl.py index d7308912..abfb83a9 100644 --- a/bigbio/biodatasets/twadrl/twadrl.py +++ b/bigbio/biodatasets/twadrl/twadrl.py @@ -21,11 +21,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "twadrl" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/umnsrs/umnsrs.py b/bigbio/biodatasets/umnsrs/umnsrs.py index 6ec1416a..8bb39e55 100644 --- a/bigbio/biodatasets/umnsrs/umnsrs.py +++ b/bigbio/biodatasets/umnsrs/umnsrs.py @@ -29,9 +29,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git 
a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py index be5f625e..58f61313 100644 --- a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py +++ b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py @@ -32,9 +32,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False From 4f328b4ff898e06d273b9668e906e195f75f35c1 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Tue, 7 Jun 2022 18:43:25 +0200 Subject: [PATCH 02/20] add _TAGS --- examples/bc5cdr.py | 3 ++- examples/bioasq_task_b.py | 8 +++++++- examples/biosses.py | 3 ++- examples/chemprot.py | 3 ++- examples/hallmarks_of_cancer.py | 3 ++- examples/mlee.py | 3 ++- examples/mqp.py | 3 ++- examples/muchmore.py | 3 ++- examples/n2c2_2011.py | 3 ++- examples/nlmchem.py | 3 ++- examples/paramed.py | 7 ++++--- examples/scitail.py | 3 ++- 12 files changed, 31 insertions(+), 14 deletions(-) diff --git a/examples/bc5cdr.py b/examples/bc5cdr.py index 111d7bee..ee325c16 100644 --- a/examples/bc5cdr.py +++ b/examples/bc5cdr.py @@ -31,10 +31,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.CHEMICAL_DISEASE_RELATION, Tags.MESH] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/bioasq_task_b.py b/examples/bioasq_task_b.py index da38146a..9026918f 100644 --- a/examples/bioasq_task_b.py +++ b/examples/bioasq_task_b.py @@ -32,9 +32,15 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from 
bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [ + Tags.QA_YESNO + Tags.QA_FACTOID, + Tags.QA_LIST, + Tags.QA_SUMMARY, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/examples/biosses.py b/examples/biosses.py index 059a0306..80aa75b3 100644 --- a/examples/biosses.py +++ b/examples/biosses.py @@ -28,11 +28,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses _DATASETNAME = "biosses" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/examples/chemprot.py b/examples/chemprot.py index 1db648c7..c29b362a 100644 --- a/examples/chemprot.py +++ b/examples/chemprot.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/hallmarks_of_cancer.py b/examples/hallmarks_of_cancer.py index ae8673b4..50600def 100644 --- a/examples/hallmarks_of_cancer.py +++ b/examples/hallmarks_of_cancer.py @@ -19,9 +19,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/mlee.py b/examples/mlee.py index 2f6b09dd..e0330d53 100644 --- a/examples/mlee.py +++ b/examples/mlee.py @@ -25,13 +25,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from 
bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses _DATASETNAME = "mlee" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/mqp.py b/examples/mqp.py index b42cbd53..c9e122bc 100644 --- a/examples/mqp.py +++ b/examples/mqp.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/examples/muchmore.py b/examples/muchmore.py index da6bc743..9afb2982 100644 --- a/examples/muchmore.py +++ b/examples/muchmore.py @@ -73,9 +73,10 @@ # Buitelaar, Paul / Declerck, Thierry / Sacaleanu, Bogdan / Vintar, Spela / Raileanu, Diana / Crispi, Claudia: A Multi-Layered, XML-Based Approach to the Integration of Linguistic and Semantic Annotations. In: Proceedings of EACL 2003 Workshop on Language Technology and the Semantic Web (NLPXML’03), Budapest, Hungary, April 2003. 
from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/n2c2_2011.py b/examples/n2c2_2011.py index 44328533..d1dd79f7 100644 --- a/examples/n2c2_2011.py +++ b/examples/n2c2_2011.py @@ -72,12 +72,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/examples/nlmchem.py b/examples/nlmchem.py index 945461bf..88523446 100644 --- a/examples/nlmchem.py +++ b/examples/nlmchem.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/paramed.py b/examples/paramed.py index 6791791e..518d7e62 100644 --- a/examples/paramed.py +++ b/examples/paramed.py @@ -1,7 +1,7 @@ -# coding=utf-8 +# bcoding=utf-8 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. # -# Licensed under the Apache License, Version 2.0 (the "License"); +# bicensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # @@ -26,12 +26,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses logger = datasets.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/examples/scitail.py b/examples/scitail.py index d7bf14dd..1be23c7c 100644 --- a/examples/scitail.py +++ b/examples/scitail.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False From 515d9acc3278969adc1a5df4f06d1bf79cfed9be Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Tue, 7 Jun 2022 18:43:36 +0200 Subject: [PATCH 03/20] create Tags Enum --- bigbio/utils/constants.py | 16 ++++++++++++--- bigbio/utils/resources/tags.json | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 bigbio/utils/resources/tags.json diff --git a/bigbio/utils/constants.py b/bigbio/utils/constants.py index 78a57416..8405bae3 100644 --- a/bigbio/utils/constants.py +++ b/bigbio/utils/constants.py @@ -6,9 +6,19 @@ from bigbio.utils import resources from bigbio.utils.license import Licenses -from bigbio.utils.schemas import (entailment_features, kb_features, - pairs_features, qa_features, - text2text_features, text_features) +from bigbio.utils.schemas import ( + entailment_features, + kb_features, + pairs_features, + qa_features, + text2text_features, + text_features, +) + + +_TAGS = json.loads(pkg_resources.read_text(resources, "tags.json")) +Tags = Enum("Tags", _TAGS) + BigBioValues = SimpleNamespace(NULL="") diff --git a/bigbio/utils/resources/tags.json 
b/bigbio/utils/resources/tags.json new file mode 100644 index 00000000..f2daadbb --- /dev/null +++ b/bigbio/utils/resources/tags.json @@ -0,0 +1,34 @@ +{ + "SOCIAL_MEDIA" : "Social media", + "ANATOMY" : "Anatomy", + "ORGANISM" : "Organism", + "VARIANT" : "Variant/Mutation", + "TISSUE" : "Tissue", + "CELL" : "Cells and/or cell lines", + "SPECIES" : "Species", + "GENE" : "Gene, proteins, gene products, ...", + "DISEASE" : "Disease", + "CHEMICAL" : "Chemical", + "UMLS" : "Unified Medical Language System", + "COVID" : "Coronavirus disease 2019 (COVID-19)", + "LEXICAL" : "Lexical data (e.g. word, verbs,...)", + "DECS" : "Descriptores en Ciencias de la Salud", + "QA_YESNO" : "QA with yes no answer", + "QA_FACTOID" : "QA with factoid answer", + "QA_LIST": "QA with list of factoid answer", + "QA_SUMMARY_ANSWER" : "QA with abstractive summary answer", + "QA_HOW" : "`How` question", + "QA_WHY" : "`Why` question", + "GRANT" : "Grants data", + "PPI" : "Protein-protein interaction", + "QA_CLOZE" : "Cloze test", + "MRC" : "Machine Reading Comprehension", + "QA_MULTIPLE_CHOICE" : "QA with multiple choice", + "NEGATION" : "Negation", + "SPECULATION" : "Speculation", + "EPIGENETICS" : "Epigenetics", + "PART_OF" : "Part-of relations", + "CANCER" : "Cancer", + "PATHWAY" : "Pathway", + "MESH" : "Medical Subject Headings (MeSH)" +} From beb1eb6f51cd914f235f84429d9d8f47a10bf515 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 12:20:44 +0200 Subject: [PATCH 04/20] update _TAGS --- .../ask_a_patient/ask_a_patient.py | 2 +- bigbio/biodatasets/bc5cdr/bc5cdr.py | 2 +- .../bioasq_2021_mesinesp.py | 2 +- .../bioasq_task_b/bioasq_task_b.py | 8 +- .../biology_how_why_corpus.py | 2 +- bigbio/biodatasets/biomrc/biomrc.py | 2 +- bigbio/biodatasets/cadec/cadec.py | 2 +- bigbio/biodatasets/cantemist/cantemist.py | 2 +- bigbio/biodatasets/cas/cas.py | 2 +- bigbio/biodatasets/cellfinder/cellfinder.py | 2 +- .../biodatasets/chebi_nactem/chebi_nactem.py | 2 +- 
bigbio/biodatasets/chemdner/chemdner.py | 2 +- bigbio/biodatasets/chemprot/chemprot.py | 2 +- bigbio/biodatasets/chia/chia.py | 2 +- .../citation_gia_test_collection.py | 137 ++++++++++-------- bigbio/biodatasets/codiesp/codiesp.py | 2 +- bigbio/biodatasets/cord_ner/cord_ner.py | 2 +- bigbio/biodatasets/ctebmsp/ctebmsp.py | 2 +- bigbio/biodatasets/ddi_corpus/ddi_corpus.py | 2 +- .../diann_iber_eval/diann_iber_eval.py | 2 +- bigbio/biodatasets/distemist/distemist.py | 2 +- bigbio/biodatasets/ebm_pico/ebm_pico.py | 30 +++- bigbio/biodatasets/ehr_rel/ehr_rel.py | 2 +- bigbio/biodatasets/essai/essai.py | 2 +- bigbio/biodatasets/euadr/euadr.py | 10 +- .../evidence_inference/evidence_inference.py | 2 +- bigbio/biodatasets/genetag/genetag.py | 2 +- .../genia_relation_corpus.py | 2 +- .../genia_term_corpus/genia_term_corpus.py | 2 +- bigbio/biodatasets/gnormplus/gnormplus.py | 2 +- .../hallmarks_of_cancer.py | 41 +++--- bigbio/biodatasets/hprd50/hprd50.py | 2 +- bigbio/biodatasets/iepa/iepa.py | 2 +- bigbio/biodatasets/jnlpba/jnlpba.py | 2 +- bigbio/biodatasets/linnaeus/linnaeus.py | 2 +- bigbio/biodatasets/lll/lll.py | 2 +- bigbio/biodatasets/mantra_gsc/mantra_gsc.py | 2 +- bigbio/biodatasets/mayosrs/mayosrs.py | 2 +- bigbio/biodatasets/med_qa/med_qa.py | 2 +- bigbio/biodatasets/meddialog/meddialog.py | 2 +- bigbio/biodatasets/meddocan/meddocan.py | 2 +- bigbio/biodatasets/medhop/medhop.py | 2 +- .../biodatasets/medical_data/medical_data.py | 2 +- bigbio/biodatasets/mediqa_qa/mediqa_qa.py | 2 +- bigbio/biodatasets/medmentions/medmentions.py | 2 +- bigbio/utils/resources/tags.json | 34 +++-- 46 files changed, 203 insertions(+), 137 deletions(-) diff --git a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py index 0b4eeffe..bd89c502 100644 --- a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py +++ b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py @@ -26,7 +26,7 @@ _DATASETNAME = "ask_a_patient" -_TAGS = 
[Tags.SOCIAL_MEDIA] +_TAGS = [Tags.SOCIAL_MEDIA, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bc5cdr/bc5cdr.py b/bigbio/biodatasets/bc5cdr/bc5cdr.py index 5e729b27..45ed49a7 100644 --- a/bigbio/biodatasets/bc5cdr/bc5cdr.py +++ b/bigbio/biodatasets/bc5cdr/bc5cdr.py @@ -35,7 +35,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.CHEMICAL_DISEASE_RELATION, Tags.MESH] +_TAGS = [Tags.DISEASE, Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py index 680de353..7fd13d83 100644 --- a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py +++ b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py @@ -54,7 +54,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DECS] +_TAGS = [Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py index b17ed382..685ac4e4 100644 --- a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py +++ b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py @@ -35,7 +35,13 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [ + Tags.YESNO, + Tags.FACTOID, + Tags.FACTOID_LIST, + Tags.ABSTRACTIVE, + Tags.EXTRACTIVE, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py index 41e8cca7..282050c6 100644 --- a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py +++ 
b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py @@ -33,7 +33,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.QA_HOW, Tags.QA_WHY] +_TAGS = [Tags.HOW, Tags.WHY] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/biomrc/biomrc.py b/bigbio/biodatasets/biomrc/biomrc.py index df849298..43dd1f72 100644 --- a/bigbio/biodatasets/biomrc/biomrc.py +++ b/bigbio/biodatasets/biomrc/biomrc.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.QA_MULTIPLE_CHOICE, Tags.MRC, Tags.QA_CLOZE] +_TAGS = [Tags.MULTIPLE_CHOICE, Tags.MRC, Tags.CLOZE_TEST] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/cadec/cadec.py b/bigbio/biodatasets/cadec/cadec.py index 13784fd6..c604c092 100644 --- a/bigbio/biodatasets/cadec/cadec.py +++ b/bigbio/biodatasets/cadec/cadec.py @@ -38,7 +38,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tags.SOCIAL_MEDIA, Tags.DISEASE, Tags.ADR, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cantemist/cantemist.py b/bigbio/biodatasets/cantemist/cantemist.py index 9d0c9d89..9b4af046 100644 --- a/bigbio/biodatasets/cantemist/cantemist.py +++ b/bigbio/biodatasets/cantemist/cantemist.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CANCER, Tags.DISEASE, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cas/cas.py b/bigbio/biodatasets/cas/cas.py index 6c421ca8..6b45d7d2 100644 --- a/bigbio/biodatasets/cas/cas.py +++ b/bigbio/biodatasets/cas/cas.py @@ -9,7 +9,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = 
[] +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.POS] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/cellfinder/cellfinder.py b/bigbio/biodatasets/cellfinder/cellfinder.py index 9987ee5f..04b36b52 100644 --- a/bigbio/biodatasets/cellfinder/cellfinder.py +++ b/bigbio/biodatasets/cellfinder/cellfinder.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.CELL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py index b7edd94f..aeb5f48b 100644 --- a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py +++ b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py @@ -25,7 +25,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import parse_brat_file -_TAGS = [] +_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemdner/chemdner.py b/bigbio/biodatasets/chemdner/chemdner.py index c1ec7c88..4e237b6b 100644 --- a/bigbio/biodatasets/chemdner/chemdner.py +++ b/bigbio/biodatasets/chemdner/chemdner.py @@ -26,7 +26,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemprot/chemprot.py b/bigbio/biodatasets/chemprot/chemprot.py index c91d5aa8..fc2aa679 100644 --- a/bigbio/biodatasets/chemprot/chemprot.py +++ b/bigbio/biodatasets/chemprot/chemprot.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chia/chia.py 
b/bigbio/biodatasets/chia/chia.py index cc1b3d7a..da93b98d 100644 --- a/bigbio/biodatasets/chia/chia.py +++ b/bigbio/biodatasets/chia/chia.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py index 63efad00..28169f96 100644 --- a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py +++ b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py @@ -27,7 +27,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -60,11 +60,11 @@ _URLS = { _DATASETNAME: [ - "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusCorpus.zip"] + "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusCorpus.zip" + ] } -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, - Tasks.NAMED_ENTITY_DISAMBIGUATION] +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] _SOURCE_VERSION = "1.0.0" @@ -73,8 +73,8 @@ class CitationGIATestCollection(datasets.GeneratorBasedBuilder): """ - The Citation GIA Test Collection was recently created for gene indexing at the NLM and includes - 151 PubMed abstracts with both mention-level and document-level annotations. + The Citation GIA Test Collection was recently created for gene indexing at the NLM and includes + 151 PubMed abstracts with both mention-level and document-level annotations. They are selected because both have a focus on human genes. 
""" @@ -95,7 +95,7 @@ class CitationGIATestCollection(datasets.GeneratorBasedBuilder): description="citation_gia_test_collection BigBio schema", schema="bigbio_kb", subset_id="citation_gia_test_collection", - ) + ), ] DEFAULT_CONFIG_NAME = "citation_gia_test_collection_source" @@ -127,7 +127,7 @@ def _info(self) -> datasets.DatasetInfo: } ], } - ] + ], } ) @@ -151,16 +151,18 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ - "filepath": os.path.join(data_dir[0], "GNormPlusCorpus/NLMIAT.BioC.xml"), + "filepath": os.path.join( + data_dir[0], "GNormPlusCorpus/NLMIAT.BioC.xml" + ), "split": "NLMIAT", }, ), ] def _get_entities(self, annot_d: dict) -> dict: - '''' + """' Converts annotation dict to entity dict. - ''' + """ ent = { "id": str(uuid.uuid4()), "type": annot_d["type"], @@ -176,13 +178,15 @@ def _get_entities(self, annot_d: dict) -> dict: return ent - def _get_offsets_entities(child, parent_text: str, child_text: str, offset: int) -> List[int]: - ''' - Extracts child text offsets from parent text for entities. + def _get_offsets_entities( + child, parent_text: str, child_text: str, offset: int + ) -> List[int]: + """ + Extracts child text offsets from parent text for entities. Some offsets that were present in the datset were wrong mainly because of string encodings. - Also a little fraction of parent strings doesn't contain its respective child strings. - Hence few assertion errors in the entitity offsets checking test. - ''' + Also a little fraction of parent strings doesn't contain its respective child strings. + Hence few assertion errors in the entitity offsets checking test. 
+ """ if child_text in parent_text: index = parent_text.index(child_text) start = index + offset @@ -194,10 +198,10 @@ def _get_offsets_entities(child, parent_text: str, child_text: str, offset: int) return [start, end] def _process_annot(self, annot: ET.Element, passages: dict) -> dict: - '''' + """' Converts annotation XML Element to Python dict. - ''' - parent_text = " ".join([p['text'] for p in passages.values()]) + """ + parent_text = " ".join([p["text"] for p in passages.values()]) annot_d = dict() a_d = {a.tag: a.text for a in annot} @@ -206,21 +210,21 @@ def _process_annot(self, annot: ET.Element, passages: dict) -> dict: if a.tag == "location": offset = int(a.attrib["offset"]) annot_d["offsets"] = self._get_offsets_entities( - html.escape(parent_text[offset:]), - html.escape(a_d["text"]), offset) + html.escape(parent_text[offset:]), html.escape(a_d["text"]), offset + ) elif a.tag != "infon": annot_d[a.tag] = html.escape(a.text) else: annot_d[a.attrib["key"]] = html.escape(a.text) - + return annot_d def _parse_elem(self, elem: ET.Element) -> dict: - '''' + """' Converts document XML Element to Python dict. 
- ''' + """ elem_d = dict() passages = dict() annotations = elem.findall(".//annotation") @@ -231,8 +235,21 @@ def _parse_elem(self, elem: ET.Element) -> dict: for child in elem: if child.tag == "passage": - elem_d[child.tag].append({c.tag: html.escape(" ".join(list(filter( - lambda item: item, [t.strip('\n') for t in c.itertext()])))) for c in child}) + elem_d[child.tag].append( + { + c.tag: html.escape( + " ".join( + list( + filter( + lambda item: item, + [t.strip("\n") for t in c.itertext()], + ) + ) + ) + ) + for c in child + } + ) elif child.tag == "id": elem_d[child.tag] = html.escape(child.text) @@ -243,11 +260,10 @@ def _parse_elem(self, elem: ET.Element) -> dict: passages[infon] = passage elem_d["passages"] = passages - elem_d.pop('passage', None) + elem_d.pop("passage", None) for a in annotations: - elem_d["entities"].append( - self._process_annot(a, elem_d["passages"])) + elem_d["entities"].append(self._process_annot(a, elem_d["passages"])) return elem_d @@ -261,31 +277,35 @@ def _generate_examples(self, filepath, split): row = self._parse_elem(elem) uid += 1 passages = row["passages"] - yield uid, { + yield uid, { "id": str(uid), "passages": [ { "id": str(uuid.uuid4()), "type": "title", "text": [passages["title"]["text"]], - "offsets": [[ - int(passages["title"]["offset"]), - int(passages["title"]["offset"]) + - len(passages["title"]["text"]) - ]], + "offsets": [ + [ + int(passages["title"]["offset"]), + int(passages["title"]["offset"]) + + len(passages["title"]["text"]), + ] + ], }, { "id": str(uuid.uuid4()), "type": "abstract", "text": [passages["abstract"]["text"]], - "offsets": [[ - int(passages["abstract"]["offset"]), - int(passages["abstract"]["offset"]) + - len(passages["abstract"]["text"]) - ]], - } + "offsets": [ + [ + int(passages["abstract"]["offset"]), + int(passages["abstract"]["offset"]) + + len(passages["abstract"]["text"]), + ] + ], + }, ], - "entities": [self._get_entities(a) for a in row["entities"]] + "entities": 
[self._get_entities(a) for a in row["entities"]], } elif self.config.schema == "bigbio_kb": @@ -294,7 +314,7 @@ def _generate_examples(self, filepath, split): row = self._parse_elem(elem) uid += 1 passages = row["passages"] - yield uid, { + yield uid, { "id": str(uid), "document_id": str(uuid.uuid4()), "passages": [ @@ -302,26 +322,29 @@ def _generate_examples(self, filepath, split): "id": str(uuid.uuid4()), "type": "title", "text": [passages["title"]["text"]], - "offsets": [[ - int(passages["title"]["offset"]), - int(passages["title"]["offset"]) + - len(passages["title"] - ["text"]) - ]], + "offsets": [ + [ + int(passages["title"]["offset"]), + int(passages["title"]["offset"]) + + len(passages["title"]["text"]), + ] + ], }, { "id": str(uuid.uuid4()), "type": "abstract", "text": [passages["abstract"]["text"]], - "offsets": [[ - int(passages["abstract"]["offset"]), - int(passages["abstract"]["offset"]) + - len(passages["abstract"]["text"]) - ]], - } + "offsets": [ + [ + int(passages["abstract"]["offset"]), + int(passages["abstract"]["offset"]) + + len(passages["abstract"]["text"]), + ] + ], + }, ], "entities": [self._get_entities(a) for a in row["entities"]], "relations": [], "events": [], - "coreferences": [] + "coreferences": [], } diff --git a/bigbio/biodatasets/codiesp/codiesp.py b/bigbio/biodatasets/codiesp/codiesp.py index aea9c786..65671fcd 100644 --- a/bigbio/biodatasets/codiesp/codiesp.py +++ b/bigbio/biodatasets/codiesp/codiesp.py @@ -38,7 +38,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.DOCUMENT_INDEXING, Tags.PROCEDURE] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cord_ner/cord_ner.py b/bigbio/biodatasets/cord_ner/cord_ner.py index 38f956da..5457155d 100644 --- a/bigbio/biodatasets/cord_ner/cord_ner.py +++ b/bigbio/biodatasets/cord_ner/cord_ner.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks 
from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.COVID, Tags.ORGANISM] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ctebmsp/ctebmsp.py b/bigbio/biodatasets/ctebmsp/ctebmsp.py index 42c23ef2..f5a3fc2b 100644 --- a/bigbio/biodatasets/ctebmsp/ctebmsp.py +++ b/bigbio/biodatasets/ctebmsp/ctebmsp.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ANATOMY, Tags.CHEMICAL, Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py index 970cdbb6..7ff25476 100644 --- a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py +++ b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py @@ -30,7 +30,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DDI, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py index 9ae95846..8dcc4ac1 100644 --- a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py +++ b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py @@ -30,7 +30,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DOCUMENT_INDEXING, Tags.DISEASE] _LANGUAGES = [Lang.EN, Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/distemist/distemist.py b/bigbio/biodatasets/distemist/distemist.py index 798c568a..1471653f 100644 --- a/bigbio/biodatasets/distemist/distemist.py +++ b/bigbio/biodatasets/distemist/distemist.py @@ -24,7 +24,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE] _LANGUAGES = 
[Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/ebm_pico/ebm_pico.py b/bigbio/biodatasets/ebm_pico/ebm_pico.py index f20a3379..0abb1904 100644 --- a/bigbio/biodatasets/ebm_pico/ebm_pico.py +++ b/bigbio/biodatasets/ebm_pico/ebm_pico.py @@ -29,7 +29,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.PICO, Tags.POS] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -66,7 +66,9 @@ _LICENSE = Licenses.UNKNOWN -_URLS = {_DATASETNAME: "https://github.com/bepnye/EBM-NLP/raw/master/ebm_nlp_2_00.tar.gz"} +_URLS = { + _DATASETNAME: "https://github.com/bepnye/EBM-NLP/raw/master/ebm_nlp_2_00.tar.gz" +} _SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] @@ -139,7 +141,9 @@ def _partition(alist, indices): for _indices in multiple_indices: high_level_type = LABEL_DECODERS["starting_spans"][annotation_type][1] - fine_grained_type = LABEL_DECODERS["hierarchical_labels"][annotation_type][annotations[_indices[0]]] + fine_grained_type = LABEL_DECODERS["hierarchical_labels"][ + annotation_type + ][annotations[_indices[0]]] annotation_text = " ".join([tokenized[ind] for ind in _indices]) char_start = document_content.find(annotation_text) @@ -222,7 +226,9 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: data_dir = dl_manager.download_and_extract(urls) documents_folder = Path(data_dir) / "ebm_nlp_2_00" / "documents" - annotations_folder = Path(data_dir) / "ebm_nlp_2_00" / "annotations" / "aggregated" + annotations_folder = ( + Path(data_dir) / "ebm_nlp_2_00" / "annotations" / "aggregated" + ) return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, @@ -242,7 +248,9 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: ), ] - def _generate_examples(self, documents_folder, annotations_folder, split_folder: str) -> Tuple[int, Dict]: + def _generate_examples( + self, documents_folder, annotations_folder, split_folder: str + ) -> 
Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" annotation_types = ["interventions", "outcomes", "participants"] @@ -265,11 +273,15 @@ def _generate_examples(self, documents_folder, annotations_folder, split_folder: with open( f"{annotations_folder}/hierarchical_labels/{annotation_type}/{split_folder}/{document}" ) as fp: - annotation_dict[annotation_type] = [int(x) for x in fp.read().splitlines()] + annotation_dict[annotation_type] = [ + int(x) for x in fp.read().splitlines() + ] except OSError: annotation_dict[annotation_type] = [] - ents = _get_entities_pico(annotation_dict, tokenized=tokenized, document_content=document_content) + ents = _get_entities_pico( + annotation_dict, tokenized=tokenized, document_content=document_content + ) if self.config.schema == "source": @@ -280,7 +292,9 @@ def _generate_examples(self, documents_folder, annotations_folder, split_folder: { "text": ent["annotation_text"], "annotation_type": ent["high_level_annotation_type"], - "fine_grained_annotation_type": ent["fine_grained_annotation_type"], + "fine_grained_annotation_type": ent[ + "fine_grained_annotation_type" + ], "start": ent["char_start"], "end": ent["char_end"], } diff --git a/bigbio/biodatasets/ehr_rel/ehr_rel.py b/bigbio/biodatasets/ehr_rel/ehr_rel.py index 2ad2f965..f9b0967e 100644 --- a/bigbio/biodatasets/ehr_rel/ehr_rel.py +++ b/bigbio/biodatasets/ehr_rel/ehr_rel.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/essai/essai.py b/bigbio/biodatasets/essai/essai.py index 289055a6..aab44638 100644 --- a/bigbio/biodatasets/essai/essai.py +++ b/bigbio/biodatasets/essai/essai.py @@ -9,7 +9,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.DISEASE, 
Tags.PROCEDURE] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/euadr/euadr.py b/bigbio/biodatasets/euadr/euadr.py index e68a1feb..6923b5ad 100644 --- a/bigbio/biodatasets/euadr/euadr.py +++ b/bigbio/biodatasets/euadr/euadr.py @@ -7,7 +7,15 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [ + Tags.ADR, + Tags.DRUG, + Tags.GENE, + Tags.DISEASE, + Tags.VARIANT, + Tags.NEGATION, + Tags.SPECULATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/evidence_inference/evidence_inference.py b/bigbio/biodatasets/evidence_inference/evidence_inference.py index e21ce4f4..d17594ca 100644 --- a/bigbio/biodatasets/evidence_inference/evidence_inference.py +++ b/bigbio/biodatasets/evidence_inference/evidence_inference.py @@ -35,7 +35,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genetag/genetag.py b/bigbio/biodatasets/genetag/genetag.py index bfe13bf5..2faf3558 100644 --- a/bigbio/biodatasets/genetag/genetag.py +++ b/bigbio/biodatasets/genetag/genetag.py @@ -32,7 +32,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py index 81c83368..f010eb3a 100644 --- a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py +++ b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.PART_OF] _LANGUAGES = [Lang.EN] _PUBMED = True 
_LOCAL = False diff --git a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py index 7516e830..66b55cf8 100644 --- a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py +++ b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.CELL, Tags.ANATOMY, Tags.TISSUE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/gnormplus/gnormplus.py b/bigbio/biodatasets/gnormplus/gnormplus.py index 28d16d36..fc1a2367 100644 --- a/bigbio/biodatasets/gnormplus/gnormplus.py +++ b/bigbio/biodatasets/gnormplus/gnormplus.py @@ -27,7 +27,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py index 73439fe0..973bf970 100644 --- a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py +++ b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py @@ -21,7 +21,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.CANCER] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -66,7 +66,7 @@ _URLs = { "corpus": "https://github.com/sb895/Hallmarks-of-Cancer/archive/refs/heads/master.zip", - "split_indices": "https://microsoft.github.io/BLURB/sample_code/data_generation.tar.gz" + "split_indices": "https://microsoft.github.io/BLURB/sample_code/data_generation.tar.gz", } _SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] @@ -74,17 +74,17 @@ _BIGBIO_VERSION = "1.0.0" _CLASS_NAMES = [ - 'evading growth suppressors', - 'tumor promoting inflammation', - 'enabling 
replicative immortality', - 'cellular energetics', - 'resisting cell death', - 'activating invasion and metastasis', - 'genomic instability and mutation', - 'none', - 'inducing angiogenesis', - 'sustaining proliferative signaling', - 'avoiding immune destruction' + "evading growth suppressors", + "tumor promoting inflammation", + "enabling replicative immortality", + "cellular energetics", + "resisting cell death", + "activating invasion and metastasis", + "genomic instability and mutation", + "none", + "inducing angiogenesis", + "sustaining proliferative signaling", + "avoiding immune destruction", ] @@ -144,21 +144,24 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, gen_kwargs={ "corpuspath": Path(data_dir["corpus"]), - "indicespath": Path(data_dir["split_indices"]) / "data_generation/indexing/HoC/train_pmid.tsv" + "indicespath": Path(data_dir["split_indices"]) + / "data_generation/indexing/HoC/train_pmid.tsv", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ "corpuspath": Path(data_dir["corpus"]), - "indicespath": Path(data_dir["split_indices"]) / "data_generation/indexing/HoC/test_pmid.tsv" + "indicespath": Path(data_dir["split_indices"]) + / "data_generation/indexing/HoC/test_pmid.tsv", }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, gen_kwargs={ "corpuspath": Path(data_dir["corpus"]), - "indicespath": Path(data_dir["split_indices"]) / "data_generation/indexing/HoC/dev_pmid.tsv" + "indicespath": Path(data_dir["split_indices"]) + / "data_generation/indexing/HoC/dev_pmid.tsv", }, ), ] @@ -184,13 +187,15 @@ def _generate_examples(self, corpuspath: Path, indicespath: Path): sentence, label = example_pair label = label.strip() - + if label == "": label = "none" multi_labels = [m_label.strip() for m_label in label.split("AND")] unique_multi_labels = { - m_label.split("--")[0].lower().lstrip() for m_label in multi_labels if m_label != "NULL" + m_label.split("--")[0].lower().lstrip() + for m_label in 
multi_labels + if m_label != "NULL" } arrow_file_unique_key = 100 * document_index + example_index diff --git a/bigbio/biodatasets/hprd50/hprd50.py b/bigbio/biodatasets/hprd50/hprd50.py index 63de6001..834bc1c5 100644 --- a/bigbio/biodatasets/hprd50/hprd50.py +++ b/bigbio/biodatasets/hprd50/hprd50.py @@ -42,7 +42,7 @@ from bigbio.utils.license import Licenses # TODO: Add BibTeX citation -_TAGS = [] +_TAGS = [Tags.GENE, Tags.PPI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/iepa/iepa.py b/bigbio/biodatasets/iepa/iepa.py index 15789356..be945fb6 100644 --- a/bigbio/biodatasets/iepa/iepa.py +++ b/bigbio/biodatasets/iepa/iepa.py @@ -33,7 +33,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.DRUG, Tags.DDI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/jnlpba/jnlpba.py b/bigbio/biodatasets/jnlpba/jnlpba.py index a10a4298..9e03eaea 100644 --- a/bigbio/biodatasets/jnlpba/jnlpba.py +++ b/bigbio/biodatasets/jnlpba/jnlpba.py @@ -29,7 +29,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.CELL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/linnaeus/linnaeus.py b/bigbio/biodatasets/linnaeus/linnaeus.py index 4a079d1d..80518887 100644 --- a/bigbio/biodatasets/linnaeus/linnaeus.py +++ b/bigbio/biodatasets/linnaeus/linnaeus.py @@ -35,7 +35,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/lll/lll.py b/bigbio/biodatasets/lll/lll.py index ccc4eca8..560185a5 100644 --- a/bigbio/biodatasets/lll/lll.py +++ b/bigbio/biodatasets/lll/lll.py @@ -39,7 +39,7 @@ from bigbio.utils.constants import BigBioValues, Lang, Tasks from 
bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py index 0db20bd5..cf572db0 100644 --- a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py +++ b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py @@ -25,7 +25,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.EN, Lang.FR, Lang.DE, Lang.NL, Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mayosrs/mayosrs.py b/bigbio/biodatasets/mayosrs/mayosrs.py index 160a6666..e0b63b87 100644 --- a/bigbio/biodatasets/mayosrs/mayosrs.py +++ b/bigbio/biodatasets/mayosrs/mayosrs.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/med_qa/med_qa.py b/bigbio/biodatasets/med_qa/med_qa.py index 4cdbc1d9..5e000263 100644 --- a/bigbio/biodatasets/med_qa/med_qa.py +++ b/bigbio/biodatasets/med_qa/med_qa.py @@ -32,7 +32,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.MULTIPLE_CHOICE, Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/meddialog/meddialog.py b/bigbio/biodatasets/meddialog/meddialog.py index 4d0e95b6..ee647d08 100644 --- a/bigbio/biodatasets/meddialog/meddialog.py +++ b/bigbio/biodatasets/meddialog/meddialog.py @@ -25,7 +25,7 @@ _DATASETNAME = "meddialog" -_TAGS = [] +_TAGS = [Tags.DIALOGUE] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/meddocan/meddocan.py b/bigbio/biodatasets/meddocan/meddocan.py index e1fb393d..d2dc14e9 100644 --- 
a/bigbio/biodatasets/meddocan/meddocan.py +++ b/bigbio/biodatasets/meddocan/meddocan.py @@ -32,7 +32,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ANONYMIZATION] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medhop/medhop.py b/bigbio/biodatasets/medhop/medhop.py index 96c92639..19649008 100644 --- a/bigbio/biodatasets/medhop/medhop.py +++ b/bigbio/biodatasets/medhop/medhop.py @@ -23,7 +23,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.MULTIPLE_CHOICE, Tags.MRC] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/medical_data/medical_data.py b/bigbio/biodatasets/medical_data/medical_data.py index 80ddfdef..48929faa 100644 --- a/bigbio/biodatasets/medical_data/medical_data.py +++ b/bigbio/biodatasets/medical_data/medical_data.py @@ -24,7 +24,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DRUG, Tags.SENTIMENT_ANALYSIS] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py index 1c26254e..5af9b45b 100644 --- a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py +++ b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.FACTOID, Tags.DISEASE, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medmentions/medmentions.py b/bigbio/biodatasets/medmentions/medmentions.py index a1322f7e..9c974663 100644 --- a/bigbio/biodatasets/medmentions/medmentions.py +++ b/bigbio/biodatasets/medmentions/medmentions.py @@ -46,7 +46,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import 
Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.ORGANISM] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index f2daadbb..c4e07a69 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -8,27 +8,37 @@ "SPECIES" : "Species", "GENE" : "Gene, proteins, gene products, ...", "DISEASE" : "Disease", + "DRUG" : "Drug", "CHEMICAL" : "Chemical", - "UMLS" : "Unified Medical Language System", "COVID" : "Coronavirus disease 2019 (COVID-19)", "LEXICAL" : "Lexical data (e.g. word, verbs,...)", - "DECS" : "Descriptores en Ciencias de la Salud", - "QA_YESNO" : "QA with yes no answer", - "QA_FACTOID" : "QA with factoid answer", - "QA_LIST": "QA with list of factoid answer", - "QA_SUMMARY_ANSWER" : "QA with abstractive summary answer", - "QA_HOW" : "`How` question", - "QA_WHY" : "`Why` question", + "YESNO" : "QA with yes no answer", + "HOW" : "`How` question", + "WHY" : "`Why` question", + "FACTOID" : "QA with factoid answer", + "FACTOIND_LIST": "QA with list of factoid answer", + "ABSTRACTIVE" : "Abstractive summary/answer", + "EXTRACTIVE" : "Extractive summary/answer", + "CLOZE_TEST" : "Cloze test", "GRANT" : "Grants data", "PPI" : "Protein-protein interaction", - "QA_CLOZE" : "Cloze test", "MRC" : "Machine Reading Comprehension", - "QA_MULTIPLE_CHOICE" : "QA with multiple choice", + "MULTIPLE_CHOICE" : "QA with multiple choice", "NEGATION" : "Negation", "SPECULATION" : "Speculation", "EPIGENETICS" : "Epigenetics", "PART_OF" : "Part-of relations", "CANCER" : "Cancer", - "PATHWAY" : "Pathway", - "MESH" : "Medical Subject Headings (MeSH)" + "PATHWAY_CURATION" : "Pathway curation", + "DOCUMENT_INDEXING" : "Document indexing", + "ADR" : "Adverse Drug Reaction", + "POS" : "Part of Speech Tagging", + "PICO" : "(P)articipants, (I)nterventions, and (O)utcomes", + "DDI" : "Drug-drug interaction", + "CONCEPT" : "Concept, Multi-word expression (MWE)", + 
"SENTENCE" : "Sentence", + "PROCEDURE" : "Procedure, treatment", + "DIALOGUE" : "Dialogue", + "ANONYMIZATION" : "Anonymizatio (De-identification)" + "SENTIMENT_ANALYSIS" : "Sentiment analysis" } From 9ebbdf4d2cfd77f2a26c7a4b526751a0541fbb80 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 13:09:42 +0200 Subject: [PATCH 05/20] new tags --- bigbio/utils/resources/tags.json | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index c4e07a69..2dcc382e 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -2,6 +2,7 @@ "SOCIAL_MEDIA" : "Social media", "ANATOMY" : "Anatomy", "ORGANISM" : "Organism", + "ORGAN" : "Organ", "VARIANT" : "Variant/Mutation", "TISSUE" : "Tissue", "CELL" : "Cells and/or cell lines", @@ -39,6 +40,8 @@ "SENTENCE" : "Sentence", "PROCEDURE" : "Procedure, treatment", "DIALOGUE" : "Dialogue", - "ANONYMIZATION" : "Anonymizatio (De-identification)" - "SENTIMENT_ANALYSIS" : "Sentiment analysis" + "ANONYMIZATION" : "Anonymizatio (De-identification)", + "SENTIMENT_ANALYSIS" : "Sentiment analysis", + "MIRNA" : "miRNA", + "ABBREVIATION" : "Abbreviation" } From 02deb9a37a62ecb926773c5cf0b2ecf0a2cac1c2 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 13:10:50 +0200 Subject: [PATCH 06/20] add tags --- bigbio/biodatasets/medal/medal.py | 15 +++++++++------ bigbio/biodatasets/meqsum/meqsum.py | 2 +- bigbio/biodatasets/minimayosrs/minimayosrs.py | 2 +- bigbio/biodatasets/mlee/mlee.py | 2 +- bigbio/biodatasets/msh_wsd/msh_wsd.py | 2 +- bigbio/biodatasets/muchmore/muchmore.py | 2 +- .../biodatasets/multi_xscience/multi_xscience.py | 2 +- .../mutation_finder/mutation_finder.py | 2 +- bigbio/biodatasets/nagel/nagel.py | 2 +- bigbio/biodatasets/ncbi_disease/ncbi_disease.py | 2 +- bigbio/biodatasets/nlm_gene/nlm_gene.py | 2 +- bigbio/biodatasets/nlm_wsd/nlm_wsd.py | 2 +- bigbio/biodatasets/nlmchem/nlmchem.py | 2 +- 
.../ntcir_13_medweb/ntcir_13_medweb.py | 2 +- bigbio/biodatasets/osiris/osiris.py | 2 +- bigbio/biodatasets/pcr/pcr.py | 2 +- bigbio/biodatasets/pdr/pdr.py | 2 +- 17 files changed, 25 insertions(+), 22 deletions(-) diff --git a/bigbio/biodatasets/medal/medal.py b/bigbio/biodatasets/medal/medal.py index 03df40fc..0ceeb8cb 100644 --- a/bigbio/biodatasets/medal/medal.py +++ b/bigbio/biodatasets/medal/medal.py @@ -31,7 +31,7 @@ logger = datasets.logging.get_logger(__name__) -_TAGS = [] +_TAGS = [Tags.ABBREVIATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -74,10 +74,11 @@ _BIGBIO_VERSION = "1.0.0" + class MedalDataset(datasets.GeneratorBasedBuilder): """The Repository for Medical Dataset for Abbreviation Disambiguation for Natural Language Understanding (MeDAL) is -a large medical text dataset curated for abbreviation disambiguation, designed for natural language understanding -pre-training in the medical domain.""" + a large medical text dataset curated for abbreviation disambiguation, designed for natural language understanding + pre-training in the medical domain.""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) @@ -124,7 +125,9 @@ def _info(self) -> datasets.DatasetInfo: citation=_CITATION, ) - def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" urls = _URLS @@ -169,7 +172,7 @@ def _generate_offsets(self, text, location): Returns ------- - dict + dict "word": str, "offsets": tuple (int, int) """ @@ -179,7 +182,7 @@ def _generate_offsets(self, text, location): offset_end = offset_start + len(word) # return word and offsets - return {"word":word, "offsets":(offset_start, offset_end)} + return {"word": word, "offsets": (offset_start, offset_end)} def _generate_examples(self, filepath, split: str) -> Tuple[int, 
Dict]: """Yields examples as (key, example) tuples.""" diff --git a/bigbio/biodatasets/meqsum/meqsum.py b/bigbio/biodatasets/meqsum/meqsum.py index 21fe7f58..a2a3d8be 100644 --- a/bigbio/biodatasets/meqsum/meqsum.py +++ b/bigbio/biodatasets/meqsum/meqsum.py @@ -33,7 +33,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/minimayosrs/minimayosrs.py b/bigbio/biodatasets/minimayosrs/minimayosrs.py index f8f095bb..cd2eba50 100644 --- a/bigbio/biodatasets/minimayosrs/minimayosrs.py +++ b/bigbio/biodatasets/minimayosrs/minimayosrs.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mlee/mlee.py b/bigbio/biodatasets/mlee/mlee.py index 47807962..5582f219 100644 --- a/bigbio/biodatasets/mlee/mlee.py +++ b/bigbio/biodatasets/mlee/mlee.py @@ -32,7 +32,7 @@ _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DRUG, Tags.CELL, Tags.ORGAN, Tags.TISSUE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/msh_wsd/msh_wsd.py b/bigbio/biodatasets/msh_wsd/msh_wsd.py index 2195106a..b4765633 100644 --- a/bigbio/biodatasets/msh_wsd/msh_wsd.py +++ b/bigbio/biodatasets/msh_wsd/msh_wsd.py @@ -43,7 +43,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ABBREVIATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/muchmore/muchmore.py b/bigbio/biodatasets/muchmore/muchmore.py index 3ae9d047..bc5d1335 100644 --- a/bigbio/biodatasets/muchmore/muchmore.py +++ b/bigbio/biodatasets/muchmore/muchmore.py @@ -76,7 +76,7 @@ from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.POS] _LANGUAGES = [Lang.EN, Lang.DE] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/multi_xscience/multi_xscience.py b/bigbio/biodatasets/multi_xscience/multi_xscience.py index 6be1347b..a5f9fcd3 100644 --- a/bigbio/biodatasets/multi_xscience/multi_xscience.py +++ b/bigbio/biodatasets/multi_xscience/multi_xscience.py @@ -24,7 +24,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mutation_finder/mutation_finder.py b/bigbio/biodatasets/mutation_finder/mutation_finder.py index 5dc113a7..e14b715a 100644 --- a/bigbio/biodatasets/mutation_finder/mutation_finder.py +++ b/bigbio/biodatasets/mutation_finder/mutation_finder.py @@ -23,7 +23,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nagel/nagel.py b/bigbio/biodatasets/nagel/nagel.py index fd8a05f6..0f5990ff 100644 --- a/bigbio/biodatasets/nagel/nagel.py +++ b/bigbio/biodatasets/nagel/nagel.py @@ -26,7 +26,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tags.VARIANT, Tags.GENE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py index 1efee20e..c2b1d748 100644 --- a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py +++ b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py @@ -29,7 +29,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED
= True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_gene/nlm_gene.py b/bigbio/biodatasets/nlm_gene/nlm_gene.py index 1a6c0e06..d084ad47 100644 --- a/bigbio/biodatasets/nlm_gene/nlm_gene.py +++ b/bigbio/biodatasets/nlm_gene/nlm_gene.py @@ -26,7 +26,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py index 7437d8df..3882db16 100644 --- a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py +++ b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py @@ -56,7 +56,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ABBREVIATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/nlmchem/nlmchem.py b/bigbio/biodatasets/nlmchem/nlmchem.py index e816e378..10472c24 100644 --- a/bigbio/biodatasets/nlmchem/nlmchem.py +++ b/bigbio/biodatasets/nlmchem/nlmchem.py @@ -26,7 +26,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py index ff873473..26e972f2 100644 --- a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py +++ b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py @@ -66,7 +66,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.SOCIAL_MEDIA, Tags.SENTIMENT_ANALYSIS] _LANGUAGES = [Lang.EN, Lang.ZH, Lang.JA] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/osiris/osiris.py b/bigbio/biodatasets/osiris/osiris.py index b8326256..19b0872c 100644 --- 
a/bigbio/biodatasets/osiris/osiris.py +++ b/bigbio/biodatasets/osiris/osiris.py @@ -27,7 +27,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pcr/pcr.py b/bigbio/biodatasets/pcr/pcr.py index 28e3987e..8295b177 100644 --- a/bigbio/biodatasets/pcr/pcr.py +++ b/bigbio/biodatasets/pcr/pcr.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pdr/pdr.py b/bigbio/biodatasets/pdr/pdr.py index 1c7bb9f7..efa60062 100644 --- a/bigbio/biodatasets/pdr/pdr.py +++ b/bigbio/biodatasets/pdr/pdr.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False From 5d42aebe66ae44456f1f1ed04d0671164ce0fae5 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 13:14:42 +0200 Subject: [PATCH 07/20] add tags --- bigbio/biodatasets/mirna/mirna.py | 742 +++++++++++++++--------------- 1 file changed, 380 insertions(+), 362 deletions(-) diff --git a/bigbio/biodatasets/mirna/mirna.py b/bigbio/biodatasets/mirna/mirna.py index aa7e7279..44babefe 100644 --- a/bigbio/biodatasets/mirna/mirna.py +++ b/bigbio/biodatasets/mirna/mirna.py @@ -1,366 +1,384 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import xml.etree.ElementTree as ET -from typing import Dict, Iterator, List, Tuple - -import datasets - -from bigbio.utils import schemas -from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tags, Tasks +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import xml.etree.ElementTree as ET +from typing import Dict, Iterator, List, Tuple + +import datasets + +from bigbio.utils import schemas +from bigbio.utils.configs import BigBioConfig +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses - -_TAGS = [] + +_TAGS = [Tags.MIRNA, Tags.GENE, Tags.DISEASE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False -_CITATION = """\ -@Article{Bagewadi2014, -author={Bagewadi, Shweta -and Bobi{\'{c}}, Tamara -and Hofmann-Apitius, Martin -and Fluck, Juliane -and Klinger, Roman}, -title={Detecting miRNA Mentions and Relations in Biomedical Literature}, -journal={F1000Research}, -year={2014}, -month={Aug}, -day={28}, -publisher={F1000Research}, -volume={3}, -pages={205-205}, -keywords={MicroRNAs; corpus; prediction algorithms}, -abstract={ - INTRODUCTION: MicroRNAs (miRNAs) have demonstrated their potential as post-transcriptional - gene expression regulators, participating in a wide spectrum of regulatory events such as - apoptosis, differentiation, and stress response. Apart from the role of miRNAs in normal - physiology, their dysregulation is implicated in a vast array of diseases. Dissection of - miRNA-related associations are valuable for contemplating their mechanism in diseases, - leading to the discovery of novel miRNAs for disease prognosis, diagnosis, and therapy. - MOTIVATION: Apart from databases and prediction tools, miRNA-related information is largely - available as unstructured text. Manual retrieval of these associations can be labor-intensive - due to steadily growing number of publications. Additionally, most of the published miRNA - entity recognition methods are keyword based, further subjected to manual inspection for - retrieval of relations. 
Despite the fact that several databases host miRNA-associations - derived from text, lower sensitivity and lack of published details for miRNA entity - recognition and associated relations identification has motivated the need for developing - comprehensive methods that are freely available for the scientific community. Additionally, - the lack of a standard corpus for miRNA-relations has caused difficulty in evaluating the - available systems. We propose methods to automatically extract mentions of miRNAs, species, - genes/proteins, disease, and relations from scientific literature. Our generated corpora, - along with dictionaries, and miRNA regular expression are freely available for academic - purposes. To our knowledge, these resources are the most comprehensive developed so far. - RESULTS: The identification of specific miRNA mentions reaches a recall of 0.94 and - precision of 0.93. Extraction of miRNA-disease and miRNA-gene relations lead to an - F1 score of up to 0.76. A comparison of the information extracted by our approach to - the databases miR2Disease and miRSel for the extraction of Alzheimer's disease - related relations shows the capability of our proposed methods in identifying correct - relations with improved sensitivity. The published resources and described methods can - help the researchers for maximal retrieval of miRNA-relations and generation of - miRNA-regulatory networks. AVAILABILITY: The training and test corpora, annotation - guidelines, developed dictionaries, and supplementary files are available at - http://www.scai.fraunhofer.de/mirna-corpora.html. -}, -note={26535109[pmid]}, -note={PMC4602280[pmcid]}, -issn={2046-1402}, -url={https://pubmed.ncbi.nlm.nih.gov/26535109}, -language={eng} -} -""" - -_DATASETNAME = "mirna" - -_DESCRIPTION = """\ -The corpus consists of 301 Medline citations. The documents were screened for -mentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually -annotated. 
The corpus comprises of two separate files, a train and a test set, coming -from 201 and 100 documents respectively. -""" - -_HOMEPAGE = "https://www.scai.fraunhofer.de/en/business-research-areas/bioinformatics/downloads/download-mirna-test-corpus.html" - -_LICENSE = Licenses.CC_BY_NC_3p0 - -_BASE = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-" - -_URLs = { - "source": { - "train": _BASE + "Train-Corpus.xml", - "test": _BASE + "Test-Corpus.xml", - }, - "bigbio_kb": { - "train": _BASE + "Train-Corpus.xml", - "test": _BASE + "Test-Corpus.xml", - }, -} - -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] -_SOURCE_VERSION = "1.0.0" -_BIGBIO_VERSION = "1.0.0" - - -class miRNADataset(datasets.GeneratorBasedBuilder): - """mirna""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) - - BUILDER_CONFIGS = [ - BigBioConfig( - name="mirna_source", - version=SOURCE_VERSION, - description="mirna source schema", - schema="source", - subset_id="mirna", - ), - BigBioConfig( - name="mirna_bigbio_kb", - version=BIGBIO_VERSION, - description="mirna BigBio schema", - schema="bigbio_kb", - subset_id="mirna", - ), - ] - - DEFAULT_CONFIG_NAME = "mirna_source" - - def _info(self): - - if self.config.schema == "source": - - features = datasets.Features( - { - "passages": [ - { - "document_id": datasets.Value("string"), - "type": datasets.Value("string"), - "text": datasets.Value("string"), - "offset": datasets.Value("int32"), - "entities": [ - { - "id": datasets.Value("string"), - "offsets": [[datasets.Value("int32")]], - "text": [datasets.Value("string")], - "type": datasets.Value("string"), - "normalized": [ - { - "db_name": datasets.Value("string"), - "db_id": datasets.Value("string"), - } - ], - } - ], - } - ] - } - ) - - elif self.config.schema == "bigbio_kb": - features = schemas.kb_features - - return datasets.DatasetInfo( - 
description=_DESCRIPTION, - features=features, - supervised_keys=None, - homepage=_HOMEPAGE, - license=str(_LICENSE), - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - - my_urls = _URLs[self.config.schema] - - path_xml_train = dl_manager.download(my_urls["train"]) - path_xml_test = dl_manager.download(my_urls["test"]) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": path_xml_train, - "split": "train", - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": path_xml_test, - "split": "test", - }, - ), - ] - - def _get_passages_and_entities(self, d) -> Tuple[List[Dict], List[List[Dict]]]: - - sentences: List[Dict] = [] - entities: List[List[Dict]] = [] - relations: List[List[Dict]] = [] - - text_total_length = 0 - - po_start = 0 - - # Get sentences of the document - for _, s in enumerate(d): - - # annotation used only for document indexing - if s.attrib["text"] is None or len(s.attrib["text"]) <= 0: - continue - - # annotation used only for document indexing - if len(s) <= 0: - continue - - text_total_length += len(s.attrib["text"]) + 1 - - po_end = po_start + len(s.attrib["text"]) - - start = po_start - - dp = { - "text": s.attrib["text"], - "type": "title" if ".s0" in s.attrib["id"] else "abstract", - "offsets": [(po_start, po_end)], - "offset": 0, # original offset - } - - po_start = po_end + 1 - - sentences.append(dp) - - pe = [] # entities - re = [] # relations - - # For each entity - for a in s: - - # If correspond to a entity - if a.tag == "entity": - - length = len(a.attrib["text"]) - - if a.attrib["text"] is None or length <= 0: - continue - - # no in-text annotation: only for document indexing - if a.attrib["type"] in ["MeSH_Indexing_Chemical", "OTHER"]: - continue - - startOffset, endOffset = 
a.attrib["charOffset"].split("-") - startOffset, endOffset = int(startOffset), int(endOffset) - - pe.append( - { - "id": a.attrib["id"], - "type": a.attrib["type"], - "text": (a.attrib["text"],), - "offsets": [(start + startOffset, start + endOffset + 1)], - "normalized": [{"db_name": "miRNA-corpus", "db_id": a.attrib["id"]}], - } - ) - - # If correspond to relation pair - elif a.tag == "pair": - - re.append( - { - "id": a.attrib["id"], - "type": a.attrib["type"], - "arg1_id": a.attrib["e1"], - "arg2_id": a.attrib["e2"], - "normalized": [], - } - ) - - entities.append(pe) - relations.append(re) - - return sentences, entities, relations - - def _generate_examples( - self, - filepath: str, - split: str, - ) -> Iterator[Tuple[int, Dict]]: - """Yields examples as (key, example) tuples.""" - - reader = ET.fromstring(open(str(filepath), "r").read()) - - if self.config.schema == "source": - - for uid, doc in enumerate(reader): - - sentences, sentences_entities, relations = self._get_passages_and_entities(doc) - - if len(sentences) < 1 or len(sentences_entities) < 1 or len(sentences_entities) != len(sentences): - continue - - for p, pe, re in zip(sentences, sentences_entities, relations): - - p.pop("offsets") # BioC has only start for passages offsets - - p["document_id"] = doc.attrib["id"] - p["entities"] = pe # BioC has per passage entities - - yield uid, {"passages": sentences} - - elif self.config.schema == "bigbio_kb": - - uid = 0 - - for idx, doc in enumerate(reader): - - sentences, sentences_entities, relations = self._get_passages_and_entities(doc) - - if len(sentences) < 1 or len(sentences_entities) < 1 or len(sentences_entities) != len(sentences): - continue - - # global id - uid += 1 - - # unpack per-sentence entities - entities = [e for pe in sentences_entities for e in pe] - - for p in sentences: - p.pop("offset") # drop original offset - p["text"] = (p["text"],) # text in sentence is Sequence - p["id"] = uid - uid += 1 - - for e in entities: - e["id"] = uid - 
uid += 1 - - # unpack per-sentence relations - relations = [r for re in relations for r in re] - - for r in relations: - r["id"] = uid - uid += 1 - - yield idx, { - "id": uid, - "document_id": doc.attrib["id"], - "passages": sentences, - "entities": entities, - "events": [], - "coreferences": [], - "relations": relations, - } +_CITATION = """\ +@Article{Bagewadi2014, +author={Bagewadi, Shweta +and Bobi{\'{c}}, Tamara +and Hofmann-Apitius, Martin +and Fluck, Juliane +and Klinger, Roman}, +title={Detecting miRNA Mentions and Relations in Biomedical Literature}, +journal={F1000Research}, +year={2014}, +month={Aug}, +day={28}, +publisher={F1000Research}, +volume={3}, +pages={205-205}, +keywords={MicroRNAs; corpus; prediction algorithms}, +abstract={ + INTRODUCTION: MicroRNAs (miRNAs) have demonstrated their potential as post-transcriptional + gene expression regulators, participating in a wide spectrum of regulatory events such as + apoptosis, differentiation, and stress response. Apart from the role of miRNAs in normal + physiology, their dysregulation is implicated in a vast array of diseases. Dissection of + miRNA-related associations are valuable for contemplating their mechanism in diseases, + leading to the discovery of novel miRNAs for disease prognosis, diagnosis, and therapy. + MOTIVATION: Apart from databases and prediction tools, miRNA-related information is largely + available as unstructured text. Manual retrieval of these associations can be labor-intensive + due to steadily growing number of publications. Additionally, most of the published miRNA + entity recognition methods are keyword based, further subjected to manual inspection for + retrieval of relations. 
Despite the fact that several databases host miRNA-associations + derived from text, lower sensitivity and lack of published details for miRNA entity + recognition and associated relations identification has motivated the need for developing + comprehensive methods that are freely available for the scientific community. Additionally, + the lack of a standard corpus for miRNA-relations has caused difficulty in evaluating the + available systems. We propose methods to automatically extract mentions of miRNAs, species, + genes/proteins, disease, and relations from scientific literature. Our generated corpora, + along with dictionaries, and miRNA regular expression are freely available for academic + purposes. To our knowledge, these resources are the most comprehensive developed so far. + RESULTS: The identification of specific miRNA mentions reaches a recall of 0.94 and + precision of 0.93. Extraction of miRNA-disease and miRNA-gene relations lead to an + F1 score of up to 0.76. A comparison of the information extracted by our approach to + the databases miR2Disease and miRSel for the extraction of Alzheimer's disease + related relations shows the capability of our proposed methods in identifying correct + relations with improved sensitivity. The published resources and described methods can + help the researchers for maximal retrieval of miRNA-relations and generation of + miRNA-regulatory networks. AVAILABILITY: The training and test corpora, annotation + guidelines, developed dictionaries, and supplementary files are available at + http://www.scai.fraunhofer.de/mirna-corpora.html. +}, +note={26535109[pmid]}, +note={PMC4602280[pmcid]}, +issn={2046-1402}, +url={https://pubmed.ncbi.nlm.nih.gov/26535109}, +language={eng} +} +""" + +_DATASETNAME = "mirna" + +_DESCRIPTION = """\ +The corpus consists of 301 Medline citations. The documents were screened for +mentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually +annotated. 
The corpus comprises of two separate files, a train and a test set, coming +from 201 and 100 documents respectively. +""" + +_HOMEPAGE = "https://www.scai.fraunhofer.de/en/business-research-areas/bioinformatics/downloads/download-mirna-test-corpus.html" + +_LICENSE = Licenses.CC_BY_NC_3p0 + +_BASE = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-" + +_URLs = { + "source": { + "train": _BASE + "Train-Corpus.xml", + "test": _BASE + "Test-Corpus.xml", + }, + "bigbio_kb": { + "train": _BASE + "Train-Corpus.xml", + "test": _BASE + "Test-Corpus.xml", + }, +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] +_SOURCE_VERSION = "1.0.0" +_BIGBIO_VERSION = "1.0.0" + + +class miRNADataset(datasets.GeneratorBasedBuilder): + """mirna""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="mirna_source", + version=SOURCE_VERSION, + description="mirna source schema", + schema="source", + subset_id="mirna", + ), + BigBioConfig( + name="mirna_bigbio_kb", + version=BIGBIO_VERSION, + description="mirna BigBio schema", + schema="bigbio_kb", + subset_id="mirna", + ), + ] + + DEFAULT_CONFIG_NAME = "mirna_source" + + def _info(self): + + if self.config.schema == "source": + + features = datasets.Features( + { + "passages": [ + { + "document_id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Value("string"), + "offset": datasets.Value("int32"), + "entities": [ + { + "id": datasets.Value("string"), + "offsets": [[datasets.Value("int32")]], + "text": [datasets.Value("string")], + "type": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } + ] + } + ) + + elif self.config.schema == "bigbio_kb": + features = schemas.kb_features + + return datasets.DatasetInfo( + 
description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_HOMEPAGE, + license=str(_LICENSE), + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + + my_urls = _URLs[self.config.schema] + + path_xml_train = dl_manager.download(my_urls["train"]) + path_xml_test = dl_manager.download(my_urls["test"]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": path_xml_train, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": path_xml_test, + "split": "test", + }, + ), + ] + + def _get_passages_and_entities(self, d) -> Tuple[List[Dict], List[List[Dict]]]: + + sentences: List[Dict] = [] + entities: List[List[Dict]] = [] + relations: List[List[Dict]] = [] + + text_total_length = 0 + + po_start = 0 + + # Get sentences of the document + for _, s in enumerate(d): + + # annotation used only for document indexing + if s.attrib["text"] is None or len(s.attrib["text"]) <= 0: + continue + + # annotation used only for document indexing + if len(s) <= 0: + continue + + text_total_length += len(s.attrib["text"]) + 1 + + po_end = po_start + len(s.attrib["text"]) + + start = po_start + + dp = { + "text": s.attrib["text"], + "type": "title" if ".s0" in s.attrib["id"] else "abstract", + "offsets": [(po_start, po_end)], + "offset": 0, # original offset + } + + po_start = po_end + 1 + + sentences.append(dp) + + pe = [] # entities + re = [] # relations + + # For each entity + for a in s: + + # If correspond to a entity + if a.tag == "entity": + + length = len(a.attrib["text"]) + + if a.attrib["text"] is None or length <= 0: + continue + + # no in-text annotation: only for document indexing + if a.attrib["type"] in ["MeSH_Indexing_Chemical", "OTHER"]: + continue + + startOffset, endOffset = 
a.attrib["charOffset"].split("-") + startOffset, endOffset = int(startOffset), int(endOffset) + + pe.append( + { + "id": a.attrib["id"], + "type": a.attrib["type"], + "text": (a.attrib["text"],), + "offsets": [(start + startOffset, start + endOffset + 1)], + "normalized": [ + {"db_name": "miRNA-corpus", "db_id": a.attrib["id"]} + ], + } + ) + + # If correspond to relation pair + elif a.tag == "pair": + + re.append( + { + "id": a.attrib["id"], + "type": a.attrib["type"], + "arg1_id": a.attrib["e1"], + "arg2_id": a.attrib["e2"], + "normalized": [], + } + ) + + entities.append(pe) + relations.append(re) + + return sentences, entities, relations + + def _generate_examples( + self, + filepath: str, + split: str, + ) -> Iterator[Tuple[int, Dict]]: + """Yields examples as (key, example) tuples.""" + + reader = ET.fromstring(open(str(filepath), "r").read()) + + if self.config.schema == "source": + + for uid, doc in enumerate(reader): + + ( + sentences, + sentences_entities, + relations, + ) = self._get_passages_and_entities(doc) + + if ( + len(sentences) < 1 + or len(sentences_entities) < 1 + or len(sentences_entities) != len(sentences) + ): + continue + + for p, pe, re in zip(sentences, sentences_entities, relations): + + p.pop("offsets") # BioC has only start for passages offsets + + p["document_id"] = doc.attrib["id"] + p["entities"] = pe # BioC has per passage entities + + yield uid, {"passages": sentences} + + elif self.config.schema == "bigbio_kb": + + uid = 0 + + for idx, doc in enumerate(reader): + + ( + sentences, + sentences_entities, + relations, + ) = self._get_passages_and_entities(doc) + + if ( + len(sentences) < 1 + or len(sentences_entities) < 1 + or len(sentences_entities) != len(sentences) + ): + continue + + # global id + uid += 1 + + # unpack per-sentence entities + entities = [e for pe in sentences_entities for e in pe] + + for p in sentences: + p.pop("offset") # drop original offset + p["text"] = (p["text"],) # text in sentence is Sequence + p["id"] = 
uid + uid += 1 + + for e in entities: + e["id"] = uid + uid += 1 + + # unpack per-sentence relations + relations = [r for re in relations for r in re] + + for r in relations: + r["id"] = uid + uid += 1 + + yield idx, { + "id": uid, + "document_id": doc.attrib["id"], + "passages": sentences, + "entities": entities, + "events": [], + "coreferences": [], + "relations": relations, + } From 52c561d3ee3170d49a7e867880a9883d24797c2a Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:29:25 +0200 Subject: [PATCH 08/20] new tags --- bigbio/utils/resources/tags.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index 2dcc382e..51ed9f1b 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -43,5 +43,7 @@ "ANONYMIZATION" : "Anonymizatio (De-identification)", "SENTIMENT_ANALYSIS" : "Sentiment analysis", "MIRNA" : "miRNA", - "ABBREVIATION" : "Abbreviation" + "ABBREVIATION" : "Abbreviation", + "FACT_CHECKING" : "Fact-checking", + "INTENT" : "Intent" } From 14fff7959cf7625abb58fe05f7517da17c5e6c3f Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:29:33 +0200 Subject: [PATCH 09/20] complete adding tags --- bigbio/biodatasets/ask_a_patient/ask_a_patient.py | 2 +- bigbio/biodatasets/pharmaconer/pharmaconer.py | 2 +- bigbio/biodatasets/pho_ner/pho_ner.py | 2 +- bigbio/biodatasets/pico_extraction/pico_extraction.py | 2 +- bigbio/biodatasets/progene/progene.py | 2 +- bigbio/biodatasets/psytar/psytar.py | 2 +- bigbio/biodatasets/pubhealth/pubhealth.py | 2 +- bigbio/biodatasets/pubmed_qa/pubmed_qa.py | 4 ++-- bigbio/biodatasets/pubtator_central/pubtator_central.py | 2 +- bigbio/biodatasets/quaero/quaero.py | 9 ++++++++- bigbio/biodatasets/scai_chemical/scai_chemical.py | 2 +- bigbio/biodatasets/scai_disease/scai_disease.py | 2 +- bigbio/biodatasets/scicite/scicite.py | 2 +- bigbio/biodatasets/scifact/scifact.py | 2 +- 
bigbio/biodatasets/sciq/sciq.py | 2 +- bigbio/biodatasets/seth_corpus/seth_corpus.py | 2 +- bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py | 2 +- .../swedish_medical_ner/swedish_medical_ner.py | 2 +- bigbio/biodatasets/thomas2011/thomas2011.py | 2 +- bigbio/biodatasets/tmvar_v1/tmvar_v1.py | 2 +- bigbio/biodatasets/tmvar_v2/tmvar_v2.py | 2 +- bigbio/biodatasets/tmvar_v3/tmvar_v3.py | 2 +- bigbio/biodatasets/twadrl/twadrl.py | 2 +- bigbio/biodatasets/umnsrs/umnsrs.py | 2 +- bigbio/biodatasets/verspoor_2013/verspoor_2013.py | 2 +- 25 files changed, 33 insertions(+), 26 deletions(-) diff --git a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py index bd89c502..53bc81a9 100644 --- a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py +++ b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py @@ -26,7 +26,7 @@ _DATASETNAME = "ask_a_patient" -_TAGS = [Tags.SOCIAL_MEDIA, Tags.ADR] +_TAGS = [Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pharmaconer/pharmaconer.py b/bigbio/biodatasets/pharmaconer/pharmaconer.py index f20fd87f..ac5aade0 100644 --- a/bigbio/biodatasets/pharmaconer/pharmaconer.py +++ b/bigbio/biodatasets/pharmaconer/pharmaconer.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.GENE, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pho_ner/pho_ner.py b/bigbio/biodatasets/pho_ner/pho_ner.py index 4ae3852d..32e0e4e0 100644 --- a/bigbio/biodatasets/pho_ner/pho_ner.py +++ b/bigbio/biodatasets/pho_ner/pho_ner.py @@ -23,7 +23,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tag.DISEASE, Tag.COVID] _LANGUAGES = [Lang.VI] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pico_extraction/pico_extraction.py 
b/bigbio/biodatasets/pico_extraction/pico_extraction.py index ab4c36f2..7fba82ab 100644 --- a/bigbio/biodatasets/pico_extraction/pico_extraction.py +++ b/bigbio/biodatasets/pico_extraction/pico_extraction.py @@ -30,7 +30,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.PICO] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/progene/progene.py b/bigbio/biodatasets/progene/progene.py index f1ce6223..3456fdd2 100644 --- a/bigbio/biodatasets/progene/progene.py +++ b/bigbio/biodatasets/progene/progene.py @@ -25,7 +25,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/psytar/psytar.py b/bigbio/biodatasets/psytar/psytar.py index e0931739..fef90eed 100644 --- a/bigbio/biodatasets/psytar/psytar.py +++ b/bigbio/biodatasets/psytar/psytar.py @@ -54,7 +54,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DRUG, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/pubhealth/pubhealth.py b/bigbio/biodatasets/pubhealth/pubhealth.py index 5320c16e..6d64352e 100644 --- a/bigbio/biodatasets/pubhealth/pubhealth.py +++ b/bigbio/biodatasets/pubhealth/pubhealth.py @@ -31,7 +31,7 @@ logger = datasets.utils.logging.get_logger(__name__) -_TAGS = [] +_TAGS = [Tags.FACT_CHECKING] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py index 4bdf1506..7203b608 100644 --- a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py +++ b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py @@ -27,10 +27,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from 
bigbio.utils.constants import BigBioValues, Lang, Tasks +from bigbio.utils.constants import BigBioValues, Lang, Tasks, Tags from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.YESNO, Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pubtator_central/pubtator_central.py b/bigbio/biodatasets/pubtator_central/pubtator_central.py index 50048a96..8ba512d3 100644 --- a/bigbio/biodatasets/pubtator_central/pubtator_central.py +++ b/bigbio/biodatasets/pubtator_central/pubtator_central.py @@ -51,7 +51,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CELL, Tags.SPECIES, Tags.VARIANT, Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/quaero/quaero.py b/bigbio/biodatasets/quaero/quaero.py index 29558a11..2d949b02 100644 --- a/bigbio/biodatasets/quaero/quaero.py +++ b/bigbio/biodatasets/quaero/quaero.py @@ -9,7 +9,14 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [ + Tags.CHEMICAL, + Tags.ANATOMY, + Tags.DRUG, + Tags.SPECIES, + Tags.PROCEDURE, + Tags.DISEASE, +] _LANGUAGES = [Lang.FR] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_chemical/scai_chemical.py b/bigbio/biodatasets/scai_chemical/scai_chemical.py index e3c4ef80..2935b9a0 100644 --- a/bigbio/biodatasets/scai_chemical/scai_chemical.py +++ b/bigbio/biodatasets/scai_chemical/scai_chemical.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_disease/scai_disease.py b/bigbio/biodatasets/scai_disease/scai_disease.py index 4b7905d9..d4bdb3f9 100644 --- a/bigbio/biodatasets/scai_disease/scai_disease.py +++ 
b/bigbio/biodatasets/scai_disease/scai_disease.py @@ -33,7 +33,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scicite/scicite.py b/bigbio/biodatasets/scicite/scicite.py index 3a0f3284..0626f7b7 100644 --- a/bigbio/biodatasets/scicite/scicite.py +++ b/bigbio/biodatasets/scicite/scicite.py @@ -40,7 +40,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.INTENT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scifact/scifact.py b/bigbio/biodatasets/scifact/scifact.py index 14a48e21..c537fcfb 100644 --- a/bigbio/biodatasets/scifact/scifact.py +++ b/bigbio/biodatasets/scifact/scifact.py @@ -25,7 +25,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.FACT_CHECKING] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/sciq/sciq.py b/bigbio/biodatasets/sciq/sciq.py index 54f44e62..eee43620 100644 --- a/bigbio/biodatasets/sciq/sciq.py +++ b/bigbio/biodatasets/sciq/sciq.py @@ -25,7 +25,7 @@ _DATASETNAME = "sciq" -_TAGS = [] +_TAGS = [Tags.MULTIPLE_CHOICE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/seth_corpus/seth_corpus.py b/bigbio/biodatasets/seth_corpus/seth_corpus.py index 82b84b53..fbf5c754 100644 --- a/bigbio/biodatasets/seth_corpus/seth_corpus.py +++ b/bigbio/biodatasets/seth_corpus/seth_corpus.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py index 
a6b16123..3936b230 100644 --- a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py +++ b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py @@ -67,7 +67,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ADR, Tags.DRUG, Tags.NEGATION] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py index 1ec26aca..4ece98c1 100644 --- a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py +++ b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py @@ -43,7 +43,7 @@ _DATASETNAME = "swedish_medical_ner" -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.ANATOMY] _LANGUAGES = [Lang.SV] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py index d2747c4a..d55c650a 100644 --- a/bigbio/biodatasets/thomas2011/thomas2011.py +++ b/bigbio/biodatasets/thomas2011/thomas2011.py @@ -53,7 +53,7 @@ from bigbio.utils.license import CustomLicense # TODO: Add BibTeX citation -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py index f6cb22e5..93c910e8 100644 --- a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py +++ b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py @@ -26,7 +26,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py index b522524b..a3518bbf 100644 --- a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py +++ b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py @@ -26,7 +26,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license 
import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py index 1e2bb9dd..197a33fc 100644 --- a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py +++ b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py @@ -44,7 +44,7 @@ copyright = {Creative Commons Attribution 4.0 International} } """ -_TAGS = [] +_TAGS = [Tags.VARIANT, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/twadrl/twadrl.py b/bigbio/biodatasets/twadrl/twadrl.py index abfb83a9..1735b2ec 100644 --- a/bigbio/biodatasets/twadrl/twadrl.py +++ b/bigbio/biodatasets/twadrl/twadrl.py @@ -26,7 +26,7 @@ _DATASETNAME = "twadrl" -_TAGS = [] +_TAGS = [Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/umnsrs/umnsrs.py b/bigbio/biodatasets/umnsrs/umnsrs.py index 8bb39e55..07f603e8 100644 --- a/bigbio/biodatasets/umnsrs/umnsrs.py +++ b/bigbio/biodatasets/umnsrs/umnsrs.py @@ -32,7 +32,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py index 58f61313..2464a95f 100644 --- a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py +++ b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py @@ -35,7 +35,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT, Tags.CANCER] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False From 71ceed5f1fddd71d5895eb9499983ade88320ab8 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:32:13 +0200 Subject: [PATCH 10/20] ORGANISM is SPECIES, SOCIAL_MEDIA belongs to `source` not `subtask` --- bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py 
| 2 +- bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py | 2 +- bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py | 2 +- bigbio/biodatasets/cadec/cadec.py | 2 +- bigbio/biodatasets/cord_ner/cord_ner.py | 2 +- bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py index c5e0734d..775c56fc 100644 --- a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py +++ b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py @@ -31,7 +31,7 @@ Tags.DISEASE, Tags.GENE, Tags.CHEMICAL, - Tags.ORGANISM, + Tags.SPECIES, Tags.SPECULATION, Tags.NEGATION, ] diff --git a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py index 1241b22c..bc61c02e 100644 --- a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py +++ b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py @@ -28,7 +28,7 @@ _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [Tags.GENE, Tags.ORGANISM, Tags.CELL, Tags.TISSUE] +_TAGS = [Tags.GENE, Tags.SPECIES, Tags.CELL, Tags.TISSUE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py index 8d464b85..f399df66 100644 --- a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py +++ b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py @@ -27,7 +27,7 @@ _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [Tags.ORGANISM] +_TAGS = [Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/cadec/cadec.py b/bigbio/biodatasets/cadec/cadec.py index c604c092..f9c60446 100644 --- a/bigbio/biodatasets/cadec/cadec.py +++ b/bigbio/biodatasets/cadec/cadec.py @@ -38,7 +38,7 @@ from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [Tags.SOCIAL_MEDIA, Tags.DISEASE, Tags.ADR, Tags.DRUG] +_TAGS = [Tags.DISEASE, Tags.ADR, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cord_ner/cord_ner.py b/bigbio/biodatasets/cord_ner/cord_ner.py index 5457155d..f29c298e 100644 --- a/bigbio/biodatasets/cord_ner/cord_ner.py +++ b/bigbio/biodatasets/cord_ner/cord_ner.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.COVID, Tags.ORGANISM] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.COVID, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py index 26e972f2..35e93d53 100644 --- a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py +++ b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py @@ -66,7 +66,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DISEASE, Tags.SOCIAL_MEDIA, Tags.SENTIMENT_ANALYSIS] +_TAGS = [Tags.DISEASE, Tags.SENTIMENT_ANALYSIS] _LANGUAGES = [Lang.EN, Lang.ZH, Lang.JA] _PUBMED = False _LOCAL = True From 1b0b89eae1c383cdfb56ab2bfe06fd4f1d5d1f2b Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:32:37 +0200 Subject: [PATCH 11/20] rm ORGANISM --- bigbio/utils/resources/tags.json | 1 - 1 file changed, 1 deletion(-) diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index 51ed9f1b..46e62468 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -1,7 +1,6 @@ { "SOCIAL_MEDIA" : "Social media", "ANATOMY" : "Anatomy", - "ORGANISM" : "Organism", "ORGAN" : "Organ", "VARIANT" : "Variant/Mutation", "TISSUE" : "Tissue", From 
f5cc0525c39b0d76cfe766f2ac959548767891fa Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:53:17 +0200 Subject: [PATCH 12/20] add DIAGNOSIS tag --- bigbio/biodatasets/chia/chia.py | 2 +- bigbio/biodatasets/codiesp/codiesp.py | 2 +- bigbio/biodatasets/ctebmsp/ctebmsp.py | 2 +- bigbio/biodatasets/essai/essai.py | 2 +- bigbio/biodatasets/evidence_inference/evidence_inference.py | 2 +- bigbio/biodatasets/mantra_gsc/mantra_gsc.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bigbio/biodatasets/chia/chia.py b/bigbio/biodatasets/chia/chia.py index da93b98d..b837bb91 100644 --- a/bigbio/biodatasets/chia/chia.py +++ b/bigbio/biodatasets/chia/chia.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.PROCEDURE] +_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/codiesp/codiesp.py b/bigbio/biodatasets/codiesp/codiesp.py index 65671fcd..b9c551e0 100644 --- a/bigbio/biodatasets/codiesp/codiesp.py +++ b/bigbio/biodatasets/codiesp/codiesp.py @@ -38,7 +38,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DISEASE, Tags.DOCUMENT_INDEXING, Tags.PROCEDURE] +_TAGS = [Tags.DISEASE, Tags.DOCUMENT_INDEXING, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/ctebmsp/ctebmsp.py b/bigbio/biodatasets/ctebmsp/ctebmsp.py index f5a3fc2b..0831f48b 100644 --- a/bigbio/biodatasets/ctebmsp/ctebmsp.py +++ b/bigbio/biodatasets/ctebmsp/ctebmsp.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.ANATOMY, Tags.CHEMICAL, Tags.DISEASE, Tags.PROCEDURE] +_TAGS = [Tags.ANATOMY, Tags.CHEMICAL, Tags.DISEASE, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.ES] 
_PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/essai/essai.py b/bigbio/biodatasets/essai/essai.py index aab44638..4cff31a1 100644 --- a/bigbio/biodatasets/essai/essai.py +++ b/bigbio/biodatasets/essai/essai.py @@ -9,7 +9,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.DISEASE, Tags.PROCEDURE] +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.DISEASE, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/evidence_inference/evidence_inference.py b/bigbio/biodatasets/evidence_inference/evidence_inference.py index d17594ca..cade748f 100644 --- a/bigbio/biodatasets/evidence_inference/evidence_inference.py +++ b/bigbio/biodatasets/evidence_inference/evidence_inference.py @@ -35,7 +35,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.PROCEDURE] +_TAGS = [Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py index cf572db0..8eb1891c 100644 --- a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py +++ b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py @@ -25,7 +25,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.GENE, Tags.DISEASE, Tags.PROCEDURE] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN, Lang.FR, Lang.DE, Lang.NL, Lang.ES] _PUBMED = True _LOCAL = False From f4b528e02486f06647640aba776f657523a325df Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:53:34 +0200 Subject: [PATCH 13/20] add n2c2 datasets --- bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py | 2 +- bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py | 2 +- bigbio/biodatasets/n2c2_2008/n2c2_2008.py | 2 +- 
bigbio/biodatasets/n2c2_2009/n2c2_2009.py | 2 +- bigbio/biodatasets/n2c2_2010/n2c2_2010.py | 2 +- bigbio/biodatasets/n2c2_2011/n2c2_2011.py | 2 +- bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py | 2 +- bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py index 9144f25f..cde53908 100644 --- a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py +++ b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py @@ -71,7 +71,7 @@ _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/14/5/550/720189 -_TAGS = [] +_TAGS = [Tags.ANONYMIZATION] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py index 6e0fc920..9d0b1a99 100644 --- a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py +++ b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py @@ -69,7 +69,7 @@ _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/15/1/14/779738 -_TAGS = [] +_TAGS = [Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py index 4b3054ac..bb2f37e3 100644 --- a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py +++ b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py @@ -77,7 +77,7 @@ _DATASETNAME = "n2c2_2008" # https://academic.oup.com/jamia/article/16/4/561/766997 -_TAGS = [] +_TAGS = [Tags.DIAGNOSIS, Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py index 88f1e60c..742ce095 100644 --- a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py +++ b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py @@ -60,7 +60,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks 
from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py index 549ac121..3b095e3f 100644 --- a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py +++ b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py @@ -55,7 +55,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.DIAGNOSIS, Tags.NEGATION] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py index 67fc5e68..478fba48 100644 --- a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py +++ b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py @@ -78,7 +78,7 @@ _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.TREATMENT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py index 59411a29..0c18374c 100644 --- a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py +++ b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py @@ -46,7 +46,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py index 13ddc19b..9862ea22 100644 --- a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py +++ b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py @@ -49,7 +49,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DRUG, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = 
True From fbfdc7063ce71db88be2e321f1b8366692456ff8 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:53:44 +0200 Subject: [PATCH 14/20] add diagnosis tag --- bigbio/biodatasets/quaero/quaero.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigbio/biodatasets/quaero/quaero.py b/bigbio/biodatasets/quaero/quaero.py index 2d949b02..4edc5c45 100644 --- a/bigbio/biodatasets/quaero/quaero.py +++ b/bigbio/biodatasets/quaero/quaero.py @@ -16,6 +16,7 @@ Tags.SPECIES, Tags.PROCEDURE, Tags.DISEASE, + Tags.DIAGNOSIS, ] _LANGUAGES = [Lang.FR] _PUBMED = True From 7f96f08e3164ea6129f496da89226cf6c8004e8d Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:53:50 +0200 Subject: [PATCH 15/20] update tags --- bigbio/utils/resources/tags.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index 46e62468..f58f2779 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -44,5 +44,6 @@ "MIRNA" : "miRNA", "ABBREVIATION" : "Abbreviation", "FACT_CHECKING" : "Fact-checking", - "INTENT" : "Intent" + "INTENT" : "Intent", + "DIAGNOSIS" : "DIAGNOSIS" } From 4109ebfcb05d3e96e119ff33b3e0e7f4c37fd1e6 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:58:29 +0200 Subject: [PATCH 16/20] format --- bigbio/biodatasets/biosses/biosses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigbio/biodatasets/biosses/biosses.py b/bigbio/biodatasets/biosses/biosses.py index a55a313c..7f7e72ee 100644 --- a/bigbio/biodatasets/biosses/biosses.py +++ b/bigbio/biodatasets/biosses/biosses.py @@ -33,7 +33,7 @@ _DATASETNAME = "biosses" -_TAGS = [] +_TAGS = [Tags.SENTENCE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False From a7728abf2df44dd8e18c50e9ee40fe3426665cc0 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:58:34 +0200 Subject: [PATCH 17/20] add tags to examples --- examples/bc5cdr.py | 2 +- 
examples/bioasq_task_b.py | 9 +++++---- examples/biosses.py | 2 +- examples/chemprot.py | 2 +- examples/hallmarks_of_cancer.py | 2 +- examples/mlee.py | 2 +- examples/muchmore.py | 2 +- examples/n2c2_2011.py | 2 +- examples/nlmchem.py | 2 +- 9 files changed, 13 insertions(+), 12 deletions(-) diff --git a/examples/bc5cdr.py b/examples/bc5cdr.py index ee325c16..0c94c3ad 100644 --- a/examples/bc5cdr.py +++ b/examples/bc5cdr.py @@ -35,7 +35,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.CHEMICAL_DISEASE_RELATION, Tags.MESH] +_TAGS = [Tags.DISEASE, Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/bioasq_task_b.py b/examples/bioasq_task_b.py index 9026918f..8be34588 100644 --- a/examples/bioasq_task_b.py +++ b/examples/bioasq_task_b.py @@ -36,10 +36,11 @@ from bigbio.utils.license import Licenses _TAGS = [ - Tags.QA_YESNO - Tags.QA_FACTOID, - Tags.QA_LIST, - Tags.QA_SUMMARY, + Tags.YESNO, + Tags.FACTOID, + Tags.FACTOID_LIST, + Tags.ABSTRACTIVE, + Tags.EXTRACTIVE, ] _LANGUAGES = [Lang.EN] _PUBMED = True diff --git a/examples/biosses.py b/examples/biosses.py index 80aa75b3..91323949 100644 --- a/examples/biosses.py +++ b/examples/biosses.py @@ -33,7 +33,7 @@ _DATASETNAME = "biosses" -_TAGS = [] +_TAGS = [Tags.SENTENCE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/examples/chemprot.py b/examples/chemprot.py index c29b362a..3a43c319 100644 --- a/examples/chemprot.py +++ b/examples/chemprot.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/hallmarks_of_cancer.py b/examples/hallmarks_of_cancer.py index 50600def..09f37794 100644 --- a/examples/hallmarks_of_cancer.py +++ b/examples/hallmarks_of_cancer.py @@ -22,7 +22,7 
@@ from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.CANCER] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/mlee.py b/examples/mlee.py index e0330d53..b98bf327 100644 --- a/examples/mlee.py +++ b/examples/mlee.py @@ -32,7 +32,7 @@ _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DRUG, Tags.CELL, Tags.ORGAN, Tags.TISSUE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/muchmore.py b/examples/muchmore.py index 9afb2982..6ce74b9f 100644 --- a/examples/muchmore.py +++ b/examples/muchmore.py @@ -76,7 +76,7 @@ from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.POS] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/n2c2_2011.py b/examples/n2c2_2011.py index d1dd79f7..2495432f 100644 --- a/examples/n2c2_2011.py +++ b/examples/n2c2_2011.py @@ -78,7 +78,7 @@ _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.TREATMENT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/examples/nlmchem.py b/examples/nlmchem.py index 88523446..6b943859 100644 --- a/examples/nlmchem.py +++ b/examples/nlmchem.py @@ -26,7 +26,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False From dda930f9b0cdd73e4bf70f93b129a604a52db316 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 18:05:37 +0200 Subject: [PATCH 18/20] fix missing/errors --- bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py | 4 ++-- bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py | 8 +++++++- bigbio/biodatasets/lll/lll.py | 2 +- 
bigbio/biodatasets/medmentions/medmentions.py | 2 +- bigbio/biodatasets/n2c2_2011/n2c2_2011.py | 2 +- bigbio/biodatasets/nagel/nagel.py | 2 +- bigbio/biodatasets/pho_ner/pho_ner.py | 2 +- bigbio/utils/resources/tags.json | 2 +- 8 files changed, 15 insertions(+), 9 deletions(-) diff --git a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py index a72d0386..6365cd7e 100644 --- a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py +++ b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py @@ -30,11 +30,11 @@ Tags.DISEASE, Tags.CANCER, Tags.TISSUE, - Tags.ORGANISM, + Tags.SPECIES, Tags.CELL, Tags.GENE, Tags.CHEMICAL, - Tags.PATHWAY, + Tags.PATHWAY_CURATION, ] _LANGUAGES = [Lang.EN] _PUBMED = True diff --git a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py index f685ff3e..cee27dfa 100644 --- a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py +++ b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py @@ -26,7 +26,13 @@ _DATASETNAME = "bionlp_st_2013_pc" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.PATHWAY, Tags.NEGATION, Tags.SPECULATION] +_TAGS = [ + Tags.GENE, + Tags.CHEMICAL, + Tags.PATHWAY_CURATION, + Tags.NEGATION, + Tags.SPECULATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/lll/lll.py b/bigbio/biodatasets/lll/lll.py index 560185a5..6dfe9914 100644 --- a/bigbio/biodatasets/lll/lll.py +++ b/bigbio/biodatasets/lll/lll.py @@ -36,7 +36,7 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import BigBioValues, Lang, Tasks +from bigbio.utils.constants import BigBioValues, Lang, Tasks, Tags from bigbio.utils.license import Licenses _TAGS = [Tags.GENE] diff --git a/bigbio/biodatasets/medmentions/medmentions.py b/bigbio/biodatasets/medmentions/medmentions.py index 9c974663..633b86dd 
100644 --- a/bigbio/biodatasets/medmentions/medmentions.py +++ b/bigbio/biodatasets/medmentions/medmentions.py @@ -46,7 +46,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.ORGANISM] +_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py index 478fba48..7ab93a59 100644 --- a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py +++ b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py @@ -78,7 +78,7 @@ _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 -_TAGS = [Tags.DISEASE, Tags.TREATMENT] +_TAGS = [Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/nagel/nagel.py b/bigbio/biodatasets/nagel/nagel.py index 0f5990ff..c80f80db 100644 --- a/bigbio/biodatasets/nagel/nagel.py +++ b/bigbio/biodatasets/nagel/nagel.py @@ -26,7 +26,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [Tags.MUTATION, Tags.GENE, Tags.SPECIES] +_TAGS = [Tags.VARIANT, Tags.GENE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pho_ner/pho_ner.py b/bigbio/biodatasets/pho_ner/pho_ner.py index 32e0e4e0..821cd0d0 100644 --- a/bigbio/biodatasets/pho_ner/pho_ner.py +++ b/bigbio/biodatasets/pho_ner/pho_ner.py @@ -23,7 +23,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [Tag.DISEASE, Tag.COVID] +_TAGS = [Tags.DISEASE, Tags.COVID] _LANGUAGES = [Lang.VI] _PUBMED = False _LOCAL = False diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index f58f2779..dc3f93eb 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -16,7 +16,7 @@ "HOW" : "`How` question", 
"WHY" : "`Why` question", "FACTOID" : "QA with factoid answer", - "FACTOIND_LIST": "QA with list of factoid answer", + "FACTOID_LIST": "QA with list of factoid answer", "ABSTRACTIVE" : "Abstractive summary/answer", "EXTRACTIVE" : "Extractive summary/answer", "CLOZE_TEST" : "Cloze test", From 3493d0f5a19cff5f4174a39a6590b09eb896ae58 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 18:05:49 +0200 Subject: [PATCH 19/20] treatment is procedure --- examples/n2c2_2011.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/n2c2_2011.py b/examples/n2c2_2011.py index 2495432f..ca6f47e1 100644 --- a/examples/n2c2_2011.py +++ b/examples/n2c2_2011.py @@ -78,7 +78,7 @@ _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 -_TAGS = [Tags.DISEASE, Tags.TREATMENT] +_TAGS = [Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True From 44bce0ced4836f5ce6b848aba02f04c9e48ae465 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 18:06:06 +0200 Subject: [PATCH 20/20] add script to gather (fine-grained) tasks counts --- scripts/gather_dataset_tasks.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 scripts/gather_dataset_tasks.py diff --git a/scripts/gather_dataset_tasks.py b/scripts/gather_dataset_tasks.py new file mode 100644 index 00000000..7523e8f4 --- /dev/null +++ b/scripts/gather_dataset_tasks.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Generate counts of tasks and fine-grained tasks +""" + +from bigbio.dataloader import BigBioConfigHelpers + + +def main(): + """ + Collect the unique (dataset name, task) pairs over all BigBio config helpers and print the resulting set + """ + + configs = BigBioConfigHelpers() + + dataset_task = set() + + for conf in configs: + for task in conf.tasks: + dataset_task.add((conf.dataset_name, str(task))) + + print(dataset_task) + + +if __name__ == "__main__": + main()