Merge pull request #6 from rmnldwg/release-0.0.4

Release 0.0.4
rmnldwg · Oct 11, 2024 · 65bb8b5 · 65bb8b5
2 parents 1223fd9 + a5c261d
commit 65bb8b5
Show file tree

Hide file tree

Showing 9 changed files with 649 additions and 129 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,41 @@
 
 All notable changes to this project will be documented in this file.
 
+## [0.0.4] - 2024-10-11
+
+### 🚀 Features
+
+- [**breaking**] Make several helper functions private (e.g., `_max_likelihood()`)
+- *(utils)* Add more shortname columns, like `surgery` for `("patient", "#", "neck_dissection")`
+- *(load)* Allow search for datasets at different locations on disk
+- *(query)* Add `C` object for easier `Q` creation
+- *(query)* Add `in_` to `C` object
+- *(validate)* Add `transform_to_lyprox` function
+
+### 🐛 Bug Fixes
+
+- *(load)* Resolve circular import of `_repo`
+
+### 📚 Documentation
+
+- Add intersphinx mapping to pandera
+- Expand module docstrings
+- Update `README.md` with library examples
+
+### 🧪 Testing
+
+- Fix failure due to changing order of items in set
+
+### Change
+
+- *(validate)* Add args to renamed validation
+- Import useful stuff as top-level
+- Make `main()` funcs private
+
+### Remove
+
+- *(load)* [**breaking**] `load_dataset()` not needed, one can just use `next(load_datasets())`
+
 ## [0.0.3] - 2024-10-01
 
 ### 🚀 Features

diff --git a/README.md b/README.md
@@ -71,7 +71,7 @@ We are in the process of collecting more data that we might publish soon. If you
 
 Every folder that corresponds to a dataset also contains a `CITATION.cff` file which may be used to cite the respective dataset. To cite the entire repository with all datasets inside, use the `CITATION.cff` at the root of the repository (or just click the *Cite this repository* button on the right).
 
-## Requirements
+## Library
 
 Besides the data, this repository provides a Python library for loading, manipulating, and validating the available datasets.
 
@@ -93,17 +93,83 @@ pip install -U pip
 pip install .
 ```
 
-You may have noticed that there are also `requirements.*` files here. These are independent of this library and instead related to reproducing the output of the Python files in the `scripts/` folder. To reproduce these, run the following commands:
+You may have noticed that there are also `requirements.*` files here. These are independent of this library and instead related to reproducing the output of the Python files in the `scripts/` folder. You may ignore them.
+
+### Usage of Python Utilities
+
+The first and most common use case would probably be listing and loading the published datasets:
+
+```python
+import lydata
+
+for dataset_spec in lydata.available_datasets(
+    year=2023,              # show all datasets added in 2023
+    skip_disk=True,         # do not search on disk, but rather on GitHub
+    ref="61a17e",           # may be some specific hash/tag/branch
+):
+    print(dataset_spec.name)
+
+# output:
+# 2023-clb-multisite
+# 2023-isb-multisite
+
+merged_data = lydata.join_datasets(
+    subsite="oropharynx",   # merge data that include oropharyngeal tumor patients
+    skip_disk=True,         # again, search GitHub, not on disk (which is the default)
+)
+print(merged_data.head())
+
+# output:
+#     patient                                          ... pathology
+#           #                                          ...      ipsi
+#          id                 institution     sex age  ...        VI VIII  IX   X
+# 0      P011          Centre Léon Bérard    male  67  ...       NaN  NaN NaN NaN
+# 1      P012          Centre Léon Bérard  female  62  ...       NaN  NaN NaN NaN
+# ..      ...                         ...     ...  ..  ...       ...  ...  ..  ..
+# 548     286  University Hospital Zurich    male  67  ...       NaN  NaN NaN NaN
+# 549     287  University Hospital Zurich    male  76  ...       NaN  NaN NaN NaN
+#
+# [550 rows x 242 columns]
+```
 
-```bash
-git clone https://github.com/rmnldwg/lydata
-cd lydata
-python -m venv .venv
-source .venv/bin/activate
-pip install -U pip
-pip install -r requirements.txt
+And since the three-level header of the tables is a little unwieldy at times, we also provide some shortcodes via a custom pandas accessor. As soon as `lydata` is imported it can be used like this:
+
+```python
+print(merged_data.ly.age)
+
+# output:
+# 0      67
+# 1      62
+#        ..
+# 548    67
+# 549    76
+# Name: (patient, #, age), Length: 550, dtype: int64
 ```
 
+And we have implemented `Q` and `C` objects inspired by Django that allow easier querying of the tables:
+
+```python
+from lydata import C
+
+# select patients younger than 50 that are not HPV positive (includes NaNs)
+query_result = merged_data.ly.query((C("age") < 50) & ~(C("hpv") == True))
+print(query_result)
+
+# output:
+#     patient                                          ... pathology
+#           #                                          ...      ipsi
+#          id                 institution     sex age  ...        VI VIII  IX   X
+# 11     P030          Centre Léon Bérard    male  49  ...       NaN  NaN NaN NaN
+# 12     P031          Centre Léon Bérard    male  46  ...       NaN  NaN NaN NaN
+# ..      ...                         ...     ...  ..  ...       ...  ... ... ...
+# 545     283  University Hospital Zurich    male  49  ...       NaN  NaN NaN NaN
+# 547     285  University Hospital Zurich    male  44  ...       NaN  NaN NaN NaN
+#
+# [20 rows x 242 columns]
+```
+
+For more details and further examples or use-cases, have a look at the [official documentation](https://lydata.readthedocs.org/).
+
 ## See also
 
 ### LyProX Interface

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -7,43 +7,45 @@
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
-project = 'lyDATA'
-copyright = '2024, Roman Ludwig'
-author = 'Roman Ludwig'
-gh_username = 'rmnldwg'
+project = "lyDATA"
+copyright = "2024, Roman Ludwig"
+author = "Roman Ludwig"
+gh_username = "rmnldwg"
 version = lydata.__version__
 release = lydata.__version__
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 
 extensions = [
-    'sphinx.ext.intersphinx',
-    'sphinx.ext.autodoc',
-    'sphinx.ext.mathjax',
-    'sphinx.ext.viewcode',
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.viewcode",
     "myst_parser",
 ]
 
-source_suffix = ['.rst', '.md']
-templates_path = ['_templates']
+source_suffix = [".rst", ".md"]
+templates_path = ["_templates"]
 exclude_patterns = []
 
 # document classes and their constructors
-autoclass_content = 'class'
+autoclass_content = "class"
 
 # sort members by source
-autodoc_member_order = 'bysource'
+autodoc_member_order = "bysource"
 
 # show type hints
-autodoc_typehints = 'signature'
+autodoc_typehints = "signature"
 
 # create links to other projects
 intersphinx_mapping = {
-    'python': ('https://docs.python.org/3.10', None),
-    'lymph': ('https://lymph-model.readthedocs.io/latest/', None),
-    'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
-    'numpy': ('https://numpy.org/doc/stable/', None),
+    "python": ("https://docs.python.org/3.10", None),
+    "lymph": ("https://lymph-model.readthedocs.io/latest/", None),
+    "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
+    "numpy": ("https://numpy.org/doc/stable/", None),
+    "pandera": ("https://pandera.readthedocs.io/en/stable/", None),
+    "pydantic": ("https://docs.pydantic.dev/latest/", None),
 }
 
 # -- MyST configuration ------------------------------------------------------
@@ -57,7 +59,7 @@
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
-html_theme = 'sphinx_book_theme'
+html_theme = "sphinx_book_theme"
 html_theme_options = {
     "repository_url": f"https://github.com/{gh_username}/{project}",
     "repository_branch": "main",
@@ -66,7 +68,7 @@
     "home_page_in_toc": True,
 }
 
-html_static_path = ['_static']
+html_static_path = ["_static"]
 html_css_files = [
     "css/custom.css",
 ]
diff --git a/lydata/__init__.py b/lydata/__init__.py
@@ -3,13 +3,29 @@
 import logging
 
 import lydata._version as _version
+from lydata.accessor import C, Q
+from lydata.loader import (
+    available_datasets,
+    join_datasets,
+    load_datasets,
+)
+from lydata.validator import validate_datasets
 
 __author__ = "Roman Ludwig"
 __email__ = "roman.ludwig@usz.ch"
 __uri__ = "https://github.com/rmnldwg/lydata"
-_repo = __uri__.replace("https://github.com/", "")
 __version__ = _version.__version__
 
+__all__ = [
+    "accessor",
+    "Q",
+    "C",
+    "available_datasets",
+    "join_datasets",
+    "load_datasets",
+    "validate_datasets",
+]
+
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 logger.setLevel(logging.WARNING)