Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/parsing cases #17

Draft
wants to merge 15 commits into
base: master
Choose a base branch
from
Draft
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ snakejob.*

listings/
dist
debug
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
long_description = fh.read()


DEV_REQS = ['black', 'flake8', 'isort', 'mypy']
DEV_REQS = ['black', 'flake8', 'isort', 'mypy', 'requests-cache']
TEST_REQS = ['biopython', 'snakemake', 'ftputil', 'requests', 'pytest', 'pytest-cov', 'hypothesis']

setup(
Expand All @@ -17,7 +17,7 @@
description='Convert between NCBI pubmed/PMC and BIOC formats',
long_description=long_description,
long_description_content_type='text/markdown',
install_requires=['bioc>=2.0', 'typing_extensions'],
install_requires=['bioc>=2.0', 'typing_extensions', 'unidecode'],
extras_require={'dev': DEV_REQS + TEST_REQS, 'test': TEST_REQS},
python_requires='>=3.6',
author='Jake Lever',
Expand Down
50 changes: 50 additions & 0 deletions src/bioconverters/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Maps each letter of the Greek alphabet (a single Unicode character) to its
# English name. The case of the name mirrors the case of the letter: capital
# letters map to capitalised names ('Alpha'), small letters map to lowercase
# names ('alpha').
#
# U+03A2 is intentionally absent: that code point is unassigned in Unicode.
GREEK_ALPHABET = {
    # Capital letters
    '\u0391': 'Alpha',
    '\u0392': 'Beta',
    '\u0393': 'Gamma',
    '\u0394': 'Delta',
    '\u0395': 'Epsilon',
    '\u0396': 'Zeta',
    '\u0397': 'Eta',
    '\u0398': 'Theta',
    '\u0399': 'Iota',
    '\u039A': 'Kappa',
    '\u039B': 'Lambda',
    '\u039C': 'Mu',
    '\u039D': 'Nu',
    '\u039E': 'Xi',
    '\u039F': 'Omicron',
    '\u03A0': 'Pi',
    '\u03A1': 'Rho',
    '\u03A3': 'Sigma',
    '\u03A4': 'Tau',
    '\u03A5': 'Upsilon',
    '\u03A6': 'Phi',
    '\u03A7': 'Chi',
    '\u03A8': 'Psi',
    '\u03A9': 'Omega',
    # Small letters
    '\u03B1': 'alpha',
    '\u03B2': 'beta',
    '\u03B3': 'gamma',
    '\u03B4': 'delta',
    '\u03B5': 'epsilon',
    '\u03B6': 'zeta',
    '\u03B7': 'eta',
    '\u03B8': 'theta',
    '\u03B9': 'iota',
    '\u03BA': 'kappa',
    '\u03BB': 'lambda',
    '\u03BC': 'mu',
    '\u03BD': 'nu',
    '\u03BE': 'xi',
    '\u03BF': 'omicron',
    '\u03C0': 'pi',
    '\u03C1': 'rho',
    # Word-final form of small sigma; it has no capital counterpart of its
    # own, so it is spelled out exactly like the regular small sigma below.
    '\u03C2': 'sigma',
    '\u03C3': 'sigma',
    '\u03C4': 'tau',
    '\u03C5': 'upsilon',
    '\u03C6': 'phi',
    '\u03C7': 'chi',
    '\u03C8': 'psi',
    '\u03C9': 'omega',
}
12 changes: 11 additions & 1 deletion src/bioconverters/pmcxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"authors’ contributions",
"background",
"case report",
"case presentation",
"competing interests",
"conclusion",
"conclusions",
Expand All @@ -41,6 +42,7 @@
"consent",
"data analysis",
"data collection",
"disclosure statement",
"discussion",
"ethics statement",
"funding",
Expand Down Expand Up @@ -90,7 +92,7 @@ class PmcArticle(TypedDict):
journal: str
journalISO: str
textSources: TextSource
annotations: Dict[str, str] = {}
annotations: Dict[str, str]


def extract_article_content(
Expand Down Expand Up @@ -372,6 +374,7 @@ def pmcxml2bioc(
trim_sentences: bool = False,
all_xml_path_infon: bool = False,
mark_citations: bool = False,
sectioning_delimiter: str = "//",
) -> Iterator[Iterable[bioc.BioCDocument]]:
"""
Convert a PMC XML file into its Bioc equivalent
Expand Down Expand Up @@ -419,9 +422,16 @@ def pmcxml2bioc(
subsection_check = text_source.lower().strip("01234567890. ")
if subsection_check in allowed_subsections:
subsection = subsection_check
elif chunk.section:
subsection = re.sub(
r"^\s*\d+(\.\d+)*\s*\.\s*", "", chunk.section.lower()
)

passage.infons["section"] = group_name
passage.infons["subsection"] = subsection
passage.infons["sectioning"] = sectioning_delimiter.join(
chunk.sections
)

if chunk.xml_path:
if all_xml_path_infon or set(chunk.xml_path.split('/')) & {
Expand Down
Loading
Loading