jakelever · creisle · Mar 8, 2023 · Mar 8, 2023 · Mar 8, 2023 · Mar 9, 2023
diff --git a/.gitignore b/.gitignore
@@ -33,3 +33,4 @@ snakejob.*
 
 listings/
 dist
+debug
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
     long_description = fh.read()
 
 
-DEV_REQS = ['black', 'flake8', 'isort', 'mypy']
+DEV_REQS = ['black', 'flake8', 'isort', 'mypy', 'requests-cache']
 TEST_REQS = ['biopython', 'snakemake', 'ftputil', 'requests', 'pytest', 'pytest-cov', 'hypothesis']
 
 setup(
@@ -17,7 +17,7 @@
     description='Convert between NCBI pubmed/PMC and BIOC formats',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    install_requires=['bioc>=2.0', 'typing_extensions'],
+    install_requires=['bioc>=2.0', 'typing_extensions', 'unidecode'],
     extras_require={'dev': DEV_REQS + TEST_REQS, 'test': TEST_REQS},
     python_requires='>=3.6',
     author='Jake Lever',

diff --git a/src/bioconverters/constants.py b/src/bioconverters/constants.py
@@ -0,0 +1,50 @@
+GREEK_ALPHABET = {  # maps a single Greek letter character to its spelled-out English name
+    '\u0391': 'Alpha',  # uppercase letters (U+0391..U+03A9) map to capitalized names
+    '\u0392': 'Beta',
+    '\u0393': 'Gamma',
+    '\u0394': 'Delta',
+    '\u0395': 'Epsilon',
+    '\u0396': 'Zeta',
+    '\u0397': 'Eta',
+    '\u0398': 'Theta',
+    '\u0399': 'Iota',
+    '\u039A': 'Kappa',
+    '\u039B': 'Lambda',
+    '\u039C': 'Mu',
+    '\u039D': 'Nu',
+    '\u039E': 'Xi',
+    '\u039F': 'Omicron',
+    '\u03A0': 'Pi',
+    '\u03A1': 'Rho',
+    '\u03A3': 'Sigma',  # U+03A2 is skipped: it is not an assigned code point
+    '\u03A4': 'Tau',
+    '\u03A5': 'Upsilon',
+    '\u03A6': 'Phi',
+    '\u03A7': 'Chi',
+    '\u03A8': 'Psi',
+    '\u03A9': 'Omega',
+    '\u03B1': 'alpha',  # lowercase letters (U+03B1..U+03C9) map to all-lowercase names
+    '\u03B2': 'beta',
+    '\u03B3': 'gamma',
+    '\u03B4': 'delta',
+    '\u03B5': 'epsilon',
+    '\u03B6': 'zeta',
+    '\u03B7': 'eta',
+    '\u03B8': 'theta',
+    '\u03B9': 'iota',
+    '\u03BA': 'kappa',
+    '\u03BB': 'lambda',
+    '\u03BC': 'mu',
+    '\u03BD': 'nu',
+    '\u03BE': 'xi',
+    '\u03BF': 'omicron',
+    '\u03C0': 'pi',
+    '\u03C1': 'rho',
+    '\u03C3': 'sigma',  # NOTE(review): U+03C2 (final sigma) has no entry - confirm this is intended
+    '\u03C4': 'tau',
+    '\u03C5': 'upsilon',
+    '\u03C6': 'phi',
+    '\u03C7': 'chi',
+    '\u03C8': 'psi',
+    '\u03C9': 'omega',
+}
diff --git a/src/bioconverters/pmcxml.py b/src/bioconverters/pmcxml.py
@@ -33,6 +33,7 @@
     "authors’ contributions",
     "background",
     "case report",
+    "case presentation",
     "competing interests",
     "conclusion",
     "conclusions",
@@ -41,6 +42,7 @@
     "consent",
     "data analysis",
     "data collection",
+    "disclosure statement",
     "discussion",
     "ethics statement",
     "funding",
@@ -90,7 +92,7 @@ class PmcArticle(TypedDict):
     journal: str
     journalISO: str
     textSources: TextSource
-    annotations: Dict[str, str] = {}
+    annotations: Dict[str, str]
 
 
 def extract_article_content(
@@ -372,6 +374,7 @@ def pmcxml2bioc(
     trim_sentences: bool = False,
     all_xml_path_infon: bool = False,
     mark_citations: bool = False,
+    sectioning_delimiter: str = "//",
 ) -> Iterator[Iterable[bioc.BioCDocument]]:
     """
     Convert a PMC XML file into its Bioc equivalent
@@ -419,9 +422,16 @@ def pmcxml2bioc(
                     subsection_check = text_source.lower().strip("01234567890. ")
                     if subsection_check in allowed_subsections:
                         subsection = subsection_check
+                    elif chunk.section:
+                        subsection = re.sub(
+                            r"^\s*\d+(\.\d+)*\s*\.\s*", "", chunk.section.lower()
+                        )
 
                     passage.infons["section"] = group_name
                     passage.infons["subsection"] = subsection
+                    passage.infons["sectioning"] = sectioning_delimiter.join(
+                        chunk.sections
+                    )
 
                     if chunk.xml_path:
                         if all_xml_path_infon or set(chunk.xml_path.split('/')) & {