From b6f473099f28ebc3b50440494e2363d7200dba9f Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 17 Aug 2022 09:59:05 -0400 Subject: [PATCH] More parsers (#248) * Release/2.8.0 (#198) * Bumping version * support for extracting dug elements from graph (#197) * support for extracting dug elements from graph * adding flag for enabling dug element extraction from graph * adding new config for node_to dug element parsing * adding more parameters to crawler to able configuration to element extraction logic * add tests * add tests for crawler Co-authored-by: Yaphetkg * Update _version.py * Update _version.py updating version for final push to master * Update factory.py Adding more comments Co-authored-by: Carl Schreep Co-authored-by: Yaphetkg * Release/v2.9.0 (#201) * Bumping version * support for extracting dug elements from graph (#197) * support for extracting dug elements from graph * adding flag for enabling dug element extraction from graph * adding new config for node_to dug element parsing * adding more parameters to crawler to able configuration to element extraction logic * add tests * add tests for crawler Co-authored-by: Yaphetkg * Display es scores (#199) * Include ES scores in variable results * Round ES score to 6 * Update _version.py (#200) * Update _version.py Co-authored-by: Carl Schreep Co-authored-by: Yaphetkg Co-authored-by: Ginnie Hench * consolidate dbgap format parser in single file , adds crdc and kfdrc parsers * adding tests * bump version * parser when versions of studies are > 9 * test for version * fix long text issues, and encoding errors * nltk initialization * change nltk approach for sliding window Co-authored-by: Carl Schreep Co-authored-by: Yaphetkg Co-authored-by: Ginnie Hench --- src/dug/_version.py | 2 +- src/dug/core/annotate.py | 35 ++++++++++++++++++++-- src/dug/core/parsers/__init__.py | 8 +++-- src/dug/core/parsers/anvil_dbgap_parser.py | 6 ---- src/dug/core/parsers/dbgap_parser.py | 
def sliding_window(self, text, max_characters=2000, padding_words=5):
    """
    Yield overlapping chunks of text, each at most max_characters long.

    The text is split on single spaces and words are packed greedily into a
    chunk. Every chunk after the first starts with the last padding_words
    words of the previous chunk, so context is preserved across chunk
    boundaries. Every chunk except the final one ends with a trailing space.

    Example: "aaaa bbb ccc ddd eeee" with max_characters=9 and
    padding_words=1 yields "aaaa bbb ", "bbb ccc ", "ccc ddd ", "ddd eeee".

    A word that is too long to fit in a chunk on its own is cut into
    character slices that do fit; the slices are then treated as separate
    words (so they are space-separated in the output).

    :param text: text to split; an empty string yields a single empty chunk
    :param max_characters: upper bound on the length of each yielded chunk
    :param padding_words: number of trailing words repeated at the start of
        the next chunk; the overlap shrinks when a chunk holds too few words
        for the window to still move forward
    :raises ValueError: if max_characters < 3 or padding_words < 0 (raised
        when iteration starts, since this is a generator)
    """
    if max_characters < 3:
        raise ValueError("max_characters must be at least 3")
    if padding_words < 0:
        raise ValueError("padding_words must not be negative")

    # A word fits into an empty chunk only when len(word) + 1 < max_characters,
    # so anything longer than this is sliced up front. Without this, a single
    # oversized word would stall the window forever.
    longest_word = max_characters - 2
    words = []
    for word in text.split(' '):
        if len(word) > longest_word:
            words.extend(
                word[offset:offset + longest_word]
                for offset in range(0, len(word), longest_word)
            )
        else:
            words.append(word)

    total_words = len(words)
    start = 0
    while True:
        end = start
        length = 0
        while end < total_words:
            word = words[end]
            if length + len(word) + 1 >= max_characters:
                break
            # The first word of a chunk has no leading separator.
            length += len(word) + (1 if end > start else 0)
            end += 1

        chunk = " ".join(words[start:end])
        if end == total_words:
            # Everything that is left fitted: this is the last chunk.
            yield chunk
            return

        yield chunk + " "
        # Step back to repeat the padding words, but always advance by at
        # least one word so the generator is guaranteed to terminate.
        start = max(end - padding_words, start + 1)


def annotate(self, text, http_session):
    """
    Annotate text chunk by chunk and return the combined identifiers.

    The text is cut with sliding_window (default limits) and the annotator
    is called once per chunk; the per-chunk results are concatenated in
    order. Identifiers found in overlapping padding may therefore appear
    more than once.
    """
    logger.debug(f"Annotating: {text}")
    identifiers = []
    for chunk_text in self.sliding_window(text):
        identifiers += self(chunk_text, http_session)
    return identifiers
class AnvilDbGaPParser(DbGaPParser):
    """DbGaPParser variant that reports "AnVIL" as its element type."""

    def _get_element_type(self):
        # Only the element type label differs from the base parser.
        return "AnVIL"


class CRDCDbGaPParser(DbGaPParser):
    """DbGaPParser variant that reports "Cancer Data Commons" as its element type."""

    def _get_element_type(self):
        # Only the element type label differs from the base parser.
        return "Cancer Data Commons"


class KFDRCDbGaPParser(DbGaPParser):
    """DbGaPParser variant that reports "Kids First" as its element type."""

    def _get_element_type(self):
        # Only the element type label differs from the base parser.
        return "Kids First"
def test_crdc_parser():
    """The CRDC parser reads the sample data dictionary and labels every element."""
    data_dict = TEST_DATA_DIR / "phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml"
    crdc_parser = CRDCDbGaPParser()
    parsed = crdc_parser(str(data_dict))
    # The sample file is expected to produce exactly three elements.
    assert len(parsed) == 3
    assert all(item.type == "Cancer Data Commons" for item in parsed)


def test_kfdrc_parser():
    """The KFDRC parser reads the sample data dictionary and labels every element."""
    data_dict = TEST_DATA_DIR / "phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml"
    kfdrc_parser = KFDRCDbGaPParser()
    parsed = kfdrc_parser(str(data_dict))
    # The sample file is expected to produce exactly three elements.
    assert len(parsed) == 3
    assert all(item.type == "Kids First" for item in parsed)
def test_yield_partial_text():
    """Chunks produced by sliding_window can be stitched back into the original text."""
    annotator = Annotator('foo')
    # A long single-line variable description, well beyond the 2000-character
    # window used below, so several chunks are produced.
    text = """COG Protocol number on which the patient was enrolled [901=Trial of mouse monoclonal Anti-GD-2 antibody 14.G2A plus IL-2 with or without GM-CSF in children with refractory NBL or melanoma; 911=I-131-MIBG for therapy of advanced neuroblastoma; 914=A dose escalation study of cisplatin, doxorubicin, VP-16, and ifosfamide followed by GM-CSF in advanced NBL and peripheral neuroepithelioma; 925=Study of topotecan; 935=Study of ch14.18 with GM-CSF in children with NBL and other GD2 positive malignancies immediately post ABMT or PBSC; 937=Phase I trial of ZD1694, an inhibitor of thymidylate synthase, in pediatric patients with advanced neoplastic disease; 9709=A phase I study of fenretinide in children with high risk solid tumors; 321P2=New intensive chemotherapy for CCG stage II (with N-myc amplification), stage III and stage IV neuroblastoma; 321P3=Treatment of poor prognosis neuroblastoma before disease progression with intensive multimodal therapy and BMT; 323P=Cyclic combination chemotherapy for newly diagnosed stage III neuroblastoma age 2 and older and stage IV Nneuroblastoma all ages; 3881=Biology and therapy of good, intermediate, and selected poor prognosis neuroblastoma; 3891=Conventional dose chemoradiotherapy vs ablative chemoradiotherapy with autologous BMT for high-risk neuroblastoma; 3951=Phase I pilot study of multiple cycles of high dose chemotherapy with peripheral blood stem cell infusions in advanced stage neuroblastoma.; 4941=National Wilms tumor study V - therapeutic trial & biology study; 8605=Study of the combination of ifosfamide, mesna, and VP-16 in children and young adults with recurrent sarcomas, PNET and other tumors; 8742=Phase III portion of 8741 for neuroblastoma; 9047=Neuroblastoma biology protocol; 9082=Protocol for the development of intervention strategies to reduce the time between symptom onset and diagnosis of childhood cancer -a pediatric oncology group cancer control study; 9140=Therapy for patients with recurrent or refractory neuroblastoma - a phase II study; 9262=A Phase II study of taxol in children with recurrent/refractory soft-tissue sarcoma, rhabdomyosarcoma, osteosarcoma, Ewing's sarcoma, neuroblastoma, germ cell tumors, Wilms' tumor, hepatoblastoma, and hepatocellular carcinoma, a POG study; 9280=Neuroblastoma epidemiology protocol - A Non-Therapeutic Study - A Joint Project of: The University of North Carolina, The Pediatric Oncology Group and The Children's Cancer Study Group; 9340=Treatment of patients >365 days at diagnosis with stage IV NBL: Upfront Phase II Window - A Phase II Study; 9341=Treatment of patients >365 days at diagnosis with stage IV and stage IIB/III (N-myc) NBL - a phase III study; 9342=Neuroblastoma #5, bone marrow transplant - a phase III study; 9343=Interleukin-6 in children receiving autologous bone marrow transplantation for advanced neuroblastoma - a pediatric oncology group phase I trial; 9361=Topotecan in pediatric patients with recurrent or progressive solid tumors - a pediatric oncology group phase II study; 9375=Topotecan plus cyclophosphamide in children with solid tumors - a pediatric oncology group phase I trial; 9464=Cyclophosphamide plus topotecan in children with recurrent or refractory solid tumors - a pediatric oncology group phase II study; 9640=Treatment of patients with high risk neuroblastoma (a feasibility pilot) using two cycles of marrow ablative chemotherapy followed by rescue With peripheral blood stem cells (PBSC), radiation therapy; A3973=A randomized study of purged vs. unpurged PBSC transplant following dose intensive induction therapy for high risk NBL; AADM01P1=Protocol for registration and consent to the childhood cancer research network: a limited institution pilot; AAML00P2=A dose finding study of the safety of gemtuzumab ozogamicin combined with conventional chemotherapy for patients with relapsed or refractory acute myeloid leukemia; ACCL0331=A Randomized double blind placebo controlled clinical trial to assess the efficacy of traumeelĀ® S (IND # 66649) for the prevention and treatment of mucositis in children undergoing hematopoietic stem cell transplantation; ACCRN07=Protocol for the enrollment on the official COG registry, The Childhood Cancer Research Network (CCRN); ADVL0018=Phase I study of hu14.18-IL2 fusion protein in patients with refractory neuroblastoma and other refractory GD2 expressing tumors; ADVL0212=A Phase I study of depsipeptide (NSC#630176, IND# 51810) in pediatric patients with refractory solid tumors and leukemias; ADVL0214=A phase I study of single agent OSI-774 (Tarceva) (NSC # 718781, IND #63383) followed by OSI-774 with temozolomide for patients with selected recurrent/refractory solid tumors, including brain tumors; ADVL0215=A phase I study of decitabine in combination with doxorubicin and cyclophosphamide in the treatment of relapsed or refractory solid tumors; ADVL0421=A phase II study of oxaliplatin in children with recurrent solid tumors; ADVL0524=Phase II trial of ixabepilone (BMS-247550), an epothilone B analog, in children and young adults with refractory solid tumors; ADVL0525=A phase II study of pemetrexed in children with recurrent malignancies; ADVL06B1=A pharmacokinetic-pharmacodynamic-pharmacogenetic study of actinomycin-D and vincristine in children with cancer; ADVL0714=A phase I study of VEGF trap (NSC# 724770, IND# 100137) in children with refractory solid tumors; ALTE03N1=Key adverse events after childhood cancer; ALTE05N1=Umbrella long-term follow-up protocol; ANBL0032=Phase III randomized study of chimeric antibody 14.18 (Ch14.18) in high risk neuroblastoma following myeloablative therapy and autologous stem cell rescue; ANBL00B1=Neuroblastoma biology studies; ANBL00P1=A pilot study of tandem high dose chemotherapy with stem cell rescue following induction therapy in children with high risk neuroblastoma; ANBL02P1=A pilot induction regimen incorporating dose-intensive topotecan and cyclophosphamide for treatment of newly diagnosed high risk neuroblastoma; ANBL0321=Phase II study of fenretinide in pediatric patients with resistant or recurrent neuroblastoma; ANBL0322=A phase II study of hu14.18-IL2 (BB-IND-9728) in children with recurrent or refractory neuroblastoma; ANBL0532=Phase III randomized trial of single vs. tandem myeloablative as consolidation therapy for high-risk neuroblastoma; ANBL0621=A phase II study of ABT-751, an orally bioavailable tubulin binding agent, in children with relapsed or refractory neuroblastoma; B003=Diagnostic & prognostic studies in NBL; B903=Childhood cancer genetics; B947=Protocol for collection of biology specimens for research studies; B954=Opsoclonus-myoclonus-ataxia syndrome, neuroblastoma and the presence of anti-neuronal antibodies; B973=Laboratory-clinical studies of neuroblastoma; E04=Self-administered epidemiology questionnaire; E18=A case-control study of risk factors for neuroblastoma; I03=Neuroblastoma, diagnostic/prognostic; N891=Parents' perceptions of randomization; P9462=Randomized treatment of recurrent neuroblastoma with topotecan regimens following desferrioxamine (POG only) in an investigational window; P9641=Primary surgical therapy for biologically defined low-risk neuroblastoma; P9761=A phase II trial of irinotecan in children with refractory solid tumors; P9963=A phase II trial of rebeccamycin analogue (NSC #655649) in children with solid tumors; R9702=Prognostic implications of MIBG uptake in patients with neuroblastoma previously treated on CCG-3891; S31=Right atrial catheter study; S921=Comparison of urokinase vs heparin in preventing Infection in central venous devices in children with malignancies]"""
    max_chars = 2000
    padding_words = 3
    print(len(text))

    pieces = []
    counter = 0
    window = annotator.sliding_window(text=text, max_characters=max_chars, padding_words=padding_words)
    for chunk in window:
        # No chunk may exceed the requested window size.
        assert len(chunk) <= max_chars
        counter += 1
        if counter == 1:
            # The first chunk has no overlap with a predecessor; keep it whole.
            pieces.append(chunk)
        else:
            # Later chunks start with padding_words words repeated from the
            # previous chunk; drop them so nothing is duplicated.
            pieces.append(" ".join(chunk.split(" ")[padding_words:]))

    print(counter)
    chunks = "".join(pieces)
    # With the overlaps removed, the concatenated chunks must equal the input exactly.
    assert chunks == text