From b6f473099f28ebc3b50440494e2363d7200dba9f Mon Sep 17 00:00:00 2001
From: YaphetKG <45075777+YaphetKG@users.noreply.github.com>
Date: Wed, 17 Aug 2022 09:59:05 -0400
Subject: [PATCH] More parsers (#248)

* Release/2.8.0 (#198)

* Bumping version

* support for extracting dug elements from graph (#197)

* support for extracting dug elements from graph

* adding flag for enabling dug element extraction from graph

* adding new config for node_to dug element parsing

* adding more parameters to crawler to enable configuration of element extraction logic

* add tests

* add tests for crawler

Co-authored-by: Yaphetkg <yaphetkg@renci.org>

* Update _version.py

* Update _version.py

updating version for final push to master

* Update factory.py

Adding more comments

Co-authored-by: Carl Schreep <schreepc@renci.org>
Co-authored-by: Yaphetkg <yaphetkg@renci.org>

* Release/v2.9.0 (#201)

* Bumping version

* support for extracting dug elements from graph (#197)

* support for extracting dug elements from graph

* adding flag for enabling dug element extraction from graph

* adding new config for node_to dug element parsing

* adding more parameters to crawler to enable configuration of element extraction logic

* add tests

* add tests for crawler

Co-authored-by: Yaphetkg <yaphetkg@renci.org>

* Display es scores (#199)

* Include ES scores in variable results

* Round ES score to 6

* Update _version.py (#200)

* Update _version.py

Co-authored-by: Carl Schreep <schreepc@renci.org>
Co-authored-by: Yaphetkg <yaphetkg@renci.org>
Co-authored-by: Ginnie Hench <vhench@rti.org>

* consolidate dbgap format parsers in a single file, add crdc and kfdrc parsers

* adding tests

* bump version

* fix parser when study versions are > 9

* test for version

* fix long text issues, and encoding errors

* nltk initialization

* change nltk approach for sliding window

Co-authored-by: Carl Schreep <schreepc@renci.org>
Co-authored-by: Yaphetkg <yaphetkg@renci.org>
Co-authored-by: Ginnie Hench <vhench@rti.org>
---
 src/dug/_version.py                        |  2 +-
 src/dug/core/annotate.py                   | 35 ++++++++++++++++++++--
 src/dug/core/parsers/__init__.py           |  8 +++--
 src/dug/core/parsers/anvil_dbgap_parser.py |  6 ----
 src/dug/core/parsers/dbgap_parser.py       | 18 +++++++++--
 tests/integration/test_parsers.py          | 25 +++++++++++++++-
 tests/unit/test_annotate.py                | 26 ++++++++++++++++
 7 files changed, 105 insertions(+), 15 deletions(-)
 delete mode 100644 src/dug/core/parsers/anvil_dbgap_parser.py

diff --git a/src/dug/_version.py b/src/dug/_version.py
index 451b49b6..7eec20a5 100644
--- a/src/dug/_version.py
+++ b/src/dug/_version.py
@@ -1 +1 @@
-__version__ = "2.9.3dev"
+__version__ = "2.9.4dev"
diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py
index 0adef1f1..91976f75 100644
--- a/src/dug/core/annotate.py
+++ b/src/dug/core/annotate.py
@@ -3,12 +3,12 @@
 import os
 import urllib.parse
 from typing import TypeVar, Generic, Union, List, Tuple, Optional
-
 import requests
 from requests import Session
 
 import dug.core.tranql as tql
 
+
 logger = logging.getLogger('dug')
 
 logging.getLogger("requests").setLevel(logging.WARNING)
@@ -271,9 +271,40 @@ class Annotator(ApiClient[str, List[Identifier]]):
     def __init__(self, url: str):
         self.url = url
 
+    def sliding_window(self, text, max_characters=2000, padding_words=5):
+        """
+        Yield overlapping chunks of ``text``, each at most max_characters long.
+
+        Words are split on single spaces. A chunk cut at the limit ends with a
+        trailing space, and the next chunk restarts padding_words words before
+        the first word that did not fit, so context is preserved. For example
+        "a b c d e f g h" with max_characters=8 and padding_words=1 yields
+        "a b c d ", "d e f g " and "g h". A single word too long for any chunk
+        is yielded on its own (exceeding the limit) so no text is dropped.
+        """
+        words = text.split(' ')
+        start = 0
+        while start < len(words):
+            current_string = ""
+            for index, word in enumerate(words[start:]):
+                if len(current_string) + len(word) + 1 >= max_characters:
+                    break
+                current_string += word if index == 0 else " " + word
+            else:
+                # Every remaining word fit, so this is the final chunk.
+                yield current_string
+                return
+            # An oversized first word is emitted alone rather than as "".
+            yield (current_string or words[start]) + " "
+            # Step back for padding, but always advance so the loop terminates.
+            start += max(index - padding_words, 1)
+
     def annotate(self, text, http_session):
         logger.debug(f"Annotating: {text}")
-        return self(text, http_session)
+        identifiers = []
+        for chunk_text in self.sliding_window(text):
+            identifiers += self(chunk_text, http_session)
+        return identifiers
 
     def make_request(self, value: Input, http_session: Session):
         value = urllib.parse.quote(value)
diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py
index 867257e9..70331119 100644
--- a/src/dug/core/parsers/__init__.py
+++ b/src/dug/core/parsers/__init__.py
@@ -4,12 +4,12 @@
 import pluggy
 
 from ._base import DugElement, DugConcept, Indexable, Parser, FileParser
-from .dbgap_parser import DbGaPParser
+from .dbgap_parser import DbGaPParser, AnvilDbGaPParser, KFDRCDbGaPParser, CRDCDbGaPParser
 from .nida_parser import NIDAParser
 from .scicrunch_parser import SciCrunchParser
 from .topmed_tag_parser import TOPMedTagParser
 from .topmed_csv_parser import TOPMedCSVParser
-from .anvil_dbgap_parser import AnvilDbGaPParser
+
 
 logger = logging.getLogger('dug')
 
@@ -23,7 +23,9 @@ def define_parsers(parser_dict: Dict[str, Parser]):
     parser_dict["topmedtag"] = TOPMedTagParser()
     parser_dict["topmedcsv"] = TOPMedCSVParser()
     parser_dict["scicrunch"] = SciCrunchParser()
-    parser_dict["anvil"] = AnvilDbGaPParser() 
+    parser_dict["anvil"] = AnvilDbGaPParser()
+    parser_dict["crdc"] = CRDCDbGaPParser()
+    parser_dict["kfdrc"] = KFDRCDbGaPParser()
 
 
 class ParserNotFoundException(Exception):
diff --git a/src/dug/core/parsers/anvil_dbgap_parser.py b/src/dug/core/parsers/anvil_dbgap_parser.py
deleted file mode 100644
index 72d3d2e4..00000000
--- a/src/dug/core/parsers/anvil_dbgap_parser.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from .dbgap_parser import DbGaPParser
-
-
-class AnvilDbGaPParser(DbGaPParser):
-    def _get_element_type(self):
-        return "AnVIL"
diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py
index 2dd553eb..8173c5e7 100644
--- a/src/dug/core/parsers/dbgap_parser.py
+++ b/src/dug/core/parsers/dbgap_parser.py
@@ -15,7 +15,7 @@ class DbGaPParser(FileParser):
     @staticmethod
     def parse_study_name_from_filename(filename: str):
         # Parse the study name from the xml filename if it exists. Return None if filename isn't right format to get id from
-        dbgap_file_pattern = re.compile(r'.*/*phs[0-9]+\.v[0-9]\.pht[0-9]+\.v[0-9]\.(.+)\.data_dict.*')
+        dbgap_file_pattern = re.compile(r'.*/*phs[0-9]+\.v[0-9]+\.pht[0-9]+\.v[0-9]+\.(.+)\.data_dict.*')
         match = re.match(dbgap_file_pattern, filename)
         if match is not None:
             return match.group(1)
@@ -26,7 +26,7 @@ def _get_element_type(self):
 
     def __call__(self, input_file: InputFile) -> List[Indexable]:
         logger.debug(input_file)
-        tree = ET.parse(input_file)
+        tree = ET.parse(input_file, ET.XMLParser(encoding='iso-8859-5'))
         root = tree.getroot()
         study_id = root.attrib['study_id']
         participant_set = root.get('participant_set','0')
@@ -58,3 +58,17 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
 
         # You don't actually create any concepts
         return elements
+
+
+class AnvilDbGaPParser(DbGaPParser):
+    def _get_element_type(self):
+        return "AnVIL"
+
+
+class CRDCDbGaPParser(DbGaPParser):
+    def _get_element_type(self):
+        return "Cancer Data Commons"
+
+class KFDRCDbGaPParser(DbGaPParser):
+    def _get_element_type(self):
+        return "Kids First"
\ No newline at end of file
diff --git a/tests/integration/test_parsers.py b/tests/integration/test_parsers.py
index 157e8735..5f647b90 100644
--- a/tests/integration/test_parsers.py
+++ b/tests/integration/test_parsers.py
@@ -1,4 +1,5 @@
-from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser, AnvilDbGaPParser
+from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser, AnvilDbGaPParser,\
+    CRDCDbGaPParser, KFDRCDbGaPParser
 from tests.integration.conftest import TEST_DATA_DIR
 
 def test_dbgap_parse_study_name_from_filename():
@@ -6,6 +7,10 @@ def test_dbgap_parse_study_name_from_filename():
     filename = "whatever/phs000166.v2.pht000700.v1.CAMP_CData.data_dict_2009_09_03.xml"
     studyname = parser.parse_study_name_from_filename(filename)
     assert studyname == "CAMP_CData"
+    # test if version numbers are > 9
+    filename = "whatever/phs000166.v23.pht000700.v13.CAMP_CData.data_dict_2009_09_03.xml"
+    studyname = parser.parse_study_name_from_filename(filename)
+    assert studyname == "CAMP_CData"
 
 def test_nida_parse_study_name_from_filename():
     parser = NIDAParser()
@@ -70,3 +75,21 @@ def test_anvil_parser():
     assert len(elements) == 3
     for element in elements:
         assert element.type == "AnVIL"
+
+
+def test_crdc_parser():
+    parser = CRDCDbGaPParser()
+    parse_file = str(TEST_DATA_DIR / "phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml")
+    elements = parser(parse_file)
+    assert len(elements) == 3
+    for element in elements:
+        assert element.type == "Cancer Data Commons"
+
+
+def test_kfdrc_parser():
+    parser = KFDRCDbGaPParser()
+    parse_file = str(TEST_DATA_DIR / "phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml")
+    elements = parser(parse_file)
+    assert len(elements) == 3
+    for element in elements:
+        assert element.type == "Kids First"
\ No newline at end of file
diff --git a/tests/unit/test_annotate.py b/tests/unit/test_annotate.py
index 053c80d4..deb479ab 100644
--- a/tests/unit/test_annotate.py
+++ b/tests/unit/test_annotate.py
@@ -234,3 +234,29 @@ def test_ontology_helper(ontology_api):
     assert name == 'primary circulatory organ'
     assert description == 'A hollow, muscular organ, which, by contracting rhythmically, keeps up the circulation of the blood or analogs[GO,modified].'
     assert ontology_type == 'anatomical entity'
+
+
+def test_yield_partial_text():
+    annotator = Annotator('foo')
+    # long text (several thousand characters, with embedded new lines) that must be split into chunks
+    text = """COG Protocol number on which the patient was enrolled [901=Trial of mouse monoclonal Anti-GD-2 antibody 14.G2A plus IL-2 with or without GM-CSF in children with refractory NBL or melanoma; 911=I-131-MIBG for therapy of advanced neuroblastoma; 914=A dose escalation study of cisplatin, doxorubicin, VP-16, and ifosfamide followed by GM-CSF in advanced NBL and peripheral neuroepithelioma; 925=Study of topotecan; 935=Study of ch14.18 with GM-CSF in children with NBL and other GD2 positive malignancies immediately post ABMT or PBSC; 937=Phase I trial of ZD1694, an inhibitor of thymidylate synthase, in pediatric patients with advanced neoplastic disease; 9709=A phase I study of fenretinide in children with high risk solid tumors; 321P2=New intensive chemotherapy for CCG stage II (with N-myc amplification), stage III and stage IV neuroblastoma; 321P3=Treatment of poor prognosis neuroblastoma before disease progression with intensive multimodal therapy and BMT; 323P=Cyclic combination chemotherapy for newly diagnosed stage III neuroblastoma age 2 and older and stage IV Nneuroblastoma all ages; 3881=Biology and therapy of good, intermediate, and selected poor prognosis neuroblastoma; 3891=Conventional dose chemoradiotherapy vs ablative chemoradiotherapy with autologous BMT for high-risk neuroblastoma; 3951=Phase I pilot study of multiple cycles of high dose chemotherapy with peripheral blood stem cell infusions in advanced stage neuroblastoma.; 4941=National Wilms tumor study V - therapeutic trial & biology study; 8605=Study of the combination of ifosfamide, mesna, and VP-16 in children and young adults with recurrent sarcomas, PNET and other tumors; 8742=Phase III portion of 8741 for neuroblastoma; 9047=Neuroblastoma biology protocol; 9082=Protocol for the development of intervention strategies to reduce the time between symptom onset and diagnosis of childhood cancer -a pediatric oncology group cancer control study; 9140=Therapy for patients with recurrent 
or refractory neuroblastoma - a phase II study; 9262=A Phase II study of taxol in children with recurrent/refractory soft-tissue sarcoma, rhabdomyosarcoma, osteosarcoma, Ewing's sarcoma, neuroblastoma, germ cell tumors, Wilms' tumor, hepatoblastoma, and hepatocellular carcinoma, a POG study; 9280=Neuroblastoma epidemiology protocol - A Non-Therapeutic Study - A Joint Project of: The University of North Carolina, The Pediatric Oncology Group and The Children's Cancer Study Group; 9340=Treatment of patients >365 days at diagnosis with stage IV NBL: Upfront Phase II Window - A Phase II Study; 9341=Treatment of patients >365 days at diagnosis with stage IV and stage IIB/III (N-myc) NBL - a phase III study; 9342=Neuroblastoma #5, bone marrow transplant - a phase III study; 9343=Interleukin-6 in children receiving autologous bone marrow transplantation for advanced neuroblastoma - a pediatric oncology group phase I trial; 9361=Topotecan in pediatric patients with recurrent or progressive solid tumors - a pediatric oncology group phase II study; 9375=Topotecan plus cyclophosphamide in children with solid tumors - a pediatric oncology group phase I trial; 9464=Cyclophosphamide plus topotecan in children with recurrent or refractory solid tumors - a pediatric oncology group phase II study; 9640=Treatment of patients with high risk neuroblastoma (a feasibility pilot) using two cycles of marrow ablative chemotherapy followed by rescue With peripheral blood stem cells (PBSC), radiation therapy; A3973=A randomized study of purged vs. 
unpurged PBSC transplant following dose intensive induction therapy for high risk NBL; AADM01P1=Protocol for registration and consent to the childhood cancer research network: a limited institution pilot; AAML00P2=A dose finding study of the safety of gemtuzumab ozogamicin combined with conventional chemotherapy for patients with relapsed or refractory acute myeloid leukemia; ACCL0331=A Randomized double blind placebo controlled clinical trial to assess the efficacy of traumeel® S (IND # 66649) for the prevention and treatment of mucositis in children undergoing hematopoietic stem cell transplantation; ACCRN07=Protocol for the enrollment on the official COG registry, The Childhood Cancer Research Network (CCRN); ADVL0018=Phase I study of hu14.18-IL2 fusion protein in patients with refractory neuroblastoma and other refractory GD2 expressing tumors; ADVL0212=A Phase I study of depsipeptide (NSC#630176, IND# 51810) in pediatric patients with refractory solid tumors and leukemias; ADVL0214=A phase I study of single agent OSI-774 (Tarceva) (NSC # 718781, IND #63383) followed by OSI-774 with temozolomide for patients with selected recurrent/refractory solid tumors, including brain tumors; ADVL0215=A phase I study of decitabine in combination with doxorubicin and cyclophosphamide in the treatment of relapsed or refractory solid tumors; ADVL0421=A phase II study of oxaliplatin in children with recurrent solid tumors; ADVL0524=Phase II trial of ixabepilone (BMS-247550), an epothilone B analog, in children and young adults with refractory solid tumors; ADVL0525=A phase II study of pemetrexed in children with recurrent malignancies; ADVL06B1=A pharmacokinetic-pharmacodynamic-pharmacogenetic study of actinomycin-D and vincristine in children with cancer; ADVL0714=A phase I study of VEGF trap (NSC# 724770, IND# 100137) in children with refractory solid tumors; ALTE03N1=Key adverse events after childhood cancer; ALTE05N1=Umbrella long-term follow-up protocol; ANBL0032=Phase III 
randomized study of chimeric antibody 14.18 (Ch14.18) in high risk neuroblastoma following myeloablative therapy and autologous stem cell rescue; ANBL00B1=Neuroblastoma biology studies; ANBL00P1=A pilot study of tandem high dose chemotherapy with stem cell rescue following induction therapy in children with high risk neuroblastoma; ANBL02P1=A pilot induction regimen incorporating dose-intensive topotecan and cyclophosphamide for treatment of newly diagnosed high risk neuroblastoma; ANBL0321=Phase II study of fenretinide in pediatric patients with resistant or recurrent neuroblastoma; ANBL0322=A phase II study of hu14.18-IL2 (BB-IND-9728) in children with recurrent or refractory neuroblastoma; ANBL0532=Phase III randomized trial of single vs. tandem myeloablative as consolidation therapy for high-risk neuroblastoma; ANBL0621=A phase II study of ABT-751, an orally bioavailable tubulin binding agent, in children with relapsed or refractory neuroblastoma; B003=Diagnostic & prognostic studies in NBL; B903=Childhood cancer genetics; B947=Protocol for collection of biology specimens for research studies; B954=Opsoclonus-myoclonus-ataxia syndrome, neuroblastoma and the presence of anti-neuronal antibodies; B973=Laboratory-clinical studies of neuroblastoma; E04=Self-administered epidemiology questionnaire; E18=A case-control study of risk factors for neuroblastoma; I03=Neuroblastoma, diagnostic/prognostic; N891=Parents' perceptions of randomization; P9462=Randomized treatment of recurrent neuroblastoma with topotecan regimens following desferrioxamine (POG only) in an investigational window; P9641=Primary surgical therapy for biologically defined low-risk neuroblastoma; P9761=A phase II trial of irinotecan in children with refractory solid tumors; P9963=A phase II trial of rebeccamycin analogue (NSC #655649) in children with solid tumors; R9702=Prognostic implications of MIBG uptake in patients with neuroblastoma previously treated on CCG-3891; S31=Right atrial catheter 
study; S921=Comparison of urokinase vs heparin in preventing Infection in central venous devices in children with malignancies]"""
+    chunks = ""
+    is_the_beginning = True
+    max_chars = 2000
+    padding_words = 3
+    counter = 0
+    print(len(text))
+    # divvy up into chunks; the chunks, minus their padding words, should concatenate to the original text.
+    for chunk in annotator.sliding_window(text=text, max_characters=max_chars, padding_words= padding_words):
+        assert len(chunk) <= max_chars
+        counter += 1
+        if is_the_beginning:
+            chunks += chunk
+        else:
+            # remove redundant padded words from final result
+            chunks += " ".join(chunk.split(" ")[padding_words:])
+        is_the_beginning = False
+
+    print(counter)
+    # with the padding words removed, the reassembled chunks must match the original text exactly
+    assert chunks == text
\ No newline at end of file