Merge pull request #7 from rmnldwg/release-0.1.0
Release 0.1.0
rmnldwg authored Oct 28, 2024
2 parents 65bb8b5 + 6bf14ae commit 3bbb3c4
Showing 6 changed files with 103 additions and 26 deletions.
22 changes: 22 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,26 @@

All notable changes to this project will be documented in this file.

## [0.1.0] - 2024-10-28

### 🚀 Features

- *(utils)* Add the often-needed `enhance` function to complete sub-/superlevel involvement and infer the maximum likelihood status.

### 🐛 Bug Fixes

- Avoid `KeyError` in `infer_superlevels`

### ⚙️ Miscellaneous Tasks

- Add link to release 0.0.4

### Change

- `infer_su(b|per)levels` skips inferring involvement of sub-/super LNLs that are already present
- *(load)* Rename `skip_disk` to `use_github`
- *(query)* Rename `in_` to `isin` for `C` object

## [0.0.4] - 2024-10-11

### 🚀 Features
@@ -119,6 +139,8 @@ Initial implementation of the lyDATA library.
<!-- generated by git-cliff -->
<!-- markdownlint-disable-file MD024 -->

[0.1.0]: https://github.com/rmnldwg/lydata/compare/0.0.4..0.1.0
[0.0.4]: https://github.com/rmnldwg/lydata/compare/0.0.3..0.0.4
[0.0.3]: https://github.com/rmnldwg/lydata/compare/0.0.2..0.0.3
[0.0.2]: https://github.com/rmnldwg/lydata/compare/0.0.1..0.0.2
[0.0.1]: https://github.com/rmnldwg/lydata/compare/63b2d867888aa8f583c498ff3fc3f94cdb48765c..0.0.1
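
Taken together, the 0.1.0 additions and renames look as follows in downstream code. This is a minimal editorial sketch, not part of the diff, assuming a standard lydata install with the bundled datasets:

```python
import lydata

# New in 0.1.0: `enhance` completes sub-/superlevel involvement and adds a
# maximum likelihood estimate under the top-level `max_llh` key.
df = lydata.join_datasets(year="2023")
enhanced = lydata.enhance(df)

# Renamed in 0.1.0: pass `use_github=True` (formerly `skip_disk=True`) to
# fetch the datasets from GitHub instead of the local disk.
remote = lydata.join_datasets(year="2023", use_github=True)
```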
2 changes: 2 additions & 0 deletions lydata/__init__.py
@@ -9,6 +9,7 @@
join_datasets,
load_datasets,
)
from lydata.utils import enhance
from lydata.validator import validate_datasets

__author__ = "Roman Ludwig"
@@ -24,6 +25,7 @@
"join_datasets",
"load_datasets",
"validate_datasets",
"enhance",
]

logger = logging.getLogger(__name__)
22 changes: 16 additions & 6 deletions lydata/accessor.py
@@ -46,6 +46,8 @@
)
from lydata.validator import construct_schema

warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


def _get_all_true(df: pd.DataFrame) -> pd.Series:
"""Return a mask with all entries set to ``True``."""
@@ -293,10 +295,10 @@ def __ne__(self, value: Any) -> Q:
"""
return Q(self.column, "!=", value)

-def in_(self, value: list[Any]) -> Q:
+def isin(self, value: list[Any]) -> Q:
"""Create a query object for checking if the column values are in a list.
->>> C('foo').in_([1, 2, 3])
+>>> C('foo').isin([1, 2, 3])
Q('foo', 'in', [1, 2, 3])
"""
return Q(self.column, "in", value)
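
For downstream code the rename is mechanical, and the new name matches pandas' own `Series.isin`. A small sketch based on the doctest above (not part of the diff):

```python
from lydata.accessor import C

# Before 0.1.0: C("foo").in_([1, 2, 3])
# Since 0.1.0:  C("foo").isin([1, 2, 3])
query = C("foo").isin([1, 2, 3])
print(repr(query))  # Q('foo', 'in', [1, 2, 3])
```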
@@ -756,8 +758,9 @@ def infer_sublevels(
"V": ["a", "b"],
}
-The resulting DataFrame will only contain the newly inferred sublevel columns.
-Thus, one can simply :py:meth:`~pandas.DataFrame.update` the original DataFrame
+The resulting DataFrame will only contain the newly inferred sublevel columns
+and only for those sublevels that were not already present in the DataFrame.
+Thus, one can simply :py:meth:`~pandas.DataFrame.join` the original DataFrame
 with the result.
>>> df = pd.DataFrame({
@@ -795,6 +798,9 @@ def infer_sublevels(

for subid in subids:
sublevel = superlevel + subid
if sublevel in self._obj[modality, side]:
continue

result.loc[is_healthy, (modality, side, sublevel)] = False
result.loc[~is_healthy, (modality, side, sublevel)] = None
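
With the new guard above, the returned frame contains only genuinely new sublevel columns, so joining it back cannot clobber existing data. A sketch of the pattern; the column tuples and the `modalities` keyword are assumptions based on the surrounding docstrings, not shown in this diff:

```python
import pandas as pd
import lydata  # assumed to register the `ly` accessor on import

df = pd.DataFrame({
    ("MRI", "ipsi", "I"): [True, False],
    ("MRI", "ipsi", "Ia"): [True, None],  # already present -> now skipped
})
new = df.ly.infer_sublevels(modalities=["MRI"])  # `modalities` kwarg assumed
df = df.join(new)  # adds e.g. ("MRI", "ipsi", "Ib"); "Ia" stays untouched
```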

@@ -815,8 +821,9 @@ def infer_superlevels(
The superlevel's status is computed for the specified ``modalities``. Which
sublevels (if any) a superlevel has is specified in ``subdivisions``.
-The resulting DataFrame will only contain the newly inferred superlevel columns.
-This way, it is straightforward to :py:meth:`~pandas.DataFrame.update` the
+The resulting DataFrame will only contain the newly inferred superlevel columns
+and only for those superlevels that were not already present in the DataFrame.
+This way, it is straightforward to :py:meth:`~pandas.DataFrame.join` it with the
 original DataFrame.
>>> df = pd.DataFrame({
@@ -857,6 +864,9 @@
except KeyError:
continue

if superlevel in self._obj[modality, side]:
continue

result.loc[are_all_healthy, (modality, side, superlevel)] = False
result.loc[is_any_involved, (modality, side, superlevel)] = True
result.loc[is_unknown, (modality, side, superlevel)] = None
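
The three assignments above encode a three-valued logic: a superlevel is healthy only if all of its sublevels are, involved if any sublevel is, and unknown otherwise. A toy illustration in plain pandas, independent of the accessor:

```python
import pandas as pd

# Sublevel involvement of LNL II for three patients (None = unknown).
sub = pd.DataFrame({"IIa": [False, True, None], "IIb": [False, None, None]})

all_healthy = sub.eq(False).all(axis=1)   # every sublevel reported healthy
any_involved = sub.eq(True).any(axis=1)   # at least one sublevel involved

superlevel = pd.Series([None] * len(sub), dtype=object)
superlevel[all_healthy] = False   # healthy
superlevel[any_involved] = True   # involved
print(superlevel.tolist())        # [False, True, None]
```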
36 changes: 18 additions & 18 deletions lydata/loader.py
@@ -128,12 +128,12 @@ def get_description(self) -> str:

def load(
self,
-skip_disk: bool = False,
+use_github: bool = False,
**load_kwargs,
) -> pd.DataFrame:
"""Load the ``data.csv`` file from disk or from GitHub.
-One can also choose to ``skip_disk``. Any keyword arguments are passed to
+One can also choose to ``use_github``. Any keyword arguments are passed to
:py:func:`pandas.read_csv`.
The method will store the output of :py:meth:`~pydantic.BaseModel.model_dump`
@@ -144,15 +144,15 @@ def load(
>>> df_from_disk = conf.load()
>>> df_from_disk.shape
(263, 82)
->>> df_from_github = conf.load(skip_disk=True)
+>>> df_from_github = conf.load(use_github=True)
>>> np.all(df_from_disk.fillna(0) == df_from_github.fillna(0))
np.True_
"""
kwargs = {"header": [0, 1, 2]}
kwargs.update(load_kwargs)

try:
-if skip_disk:
+if use_github:
logger.info(f"Skipping loading from {self.path}.")
raise SkipDiskError
df = pd.read_csv(self.path, **kwargs)
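
A hedged usage sketch of the renamed flag; the constructor fields are inferred from the dataset naming scheme and are not shown in this diff:

```python
from lydata.loader import LyDatasetConfig

# Field names assumed from the "2021-usz-oropharynx" naming scheme.
conf = LyDatasetConfig(year=2021, institution="usz", subsite="oropharynx")
df_local = conf.load()                  # reads data.csv from disk
df_remote = conf.load(use_github=True)  # was: conf.load(skip_disk=True)
```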
@@ -273,7 +273,7 @@ def available_datasets(
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
-skip_disk: bool = False,
+use_github: bool = False,
repo: str = _repo,
ref: str = "main",
) -> Generator[LyDatasetConfig, None, None]:
@@ -288,17 +288,17 @@
the parent directory of the directory containing this file. If the library is
installed, this will be the ``site-packages`` directory.
-With ``skip_disk`` set to ``True``, the function will not look for datasets on disk,
-but will instead look for them on GitHub. The ``repo`` and ``ref`` arguments can be
-used to specify the repository and the branch/tag/commit to look in.
+With ``use_github`` set to ``True``, the function will not look for datasets on
+disk, but will instead look for them on GitHub. The ``repo`` and ``ref`` arguments
+can be used to specify the repository and the branch/tag/commit to look in.
>>> avail_gen = available_datasets()
>>> sorted([ds.name for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE
['2021-clb-oropharynx',
'2021-usz-oropharynx',
'2023-clb-multisite',
'2023-isb-multisite']
->>> avail_gen = available_datasets(skip_disk=True)
+>>> avail_gen = available_datasets(use_github=True)
>>> sorted([ds.name for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE
['2021-clb-oropharynx',
'2021-usz-oropharynx',
@@ -307,15 +307,15 @@
>>> avail_gen = available_datasets(
... institution="hvh",
... ref="6ac98d",
-...     skip_disk=True,
+...     use_github=True,
... )
>>> sorted([ds.get_url("") for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE
['https://raw.githubusercontent.com/rmnldwg/lydata/6ac98d/2024-hvh-oropharynx/']
"""
-if not skip_disk:
+if not use_github:
if repo != _repo or ref != "main":
warnings.warn(
"Parameters `repo` and `ref` are ignored, unless `skip_disk` "
"Parameters `repo` and `ref` are ignored, unless `use_github` "
"is set to `True`."
)
yield from _available_datasets_on_disk(
@@ -339,7 +339,7 @@ def load_datasets(
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
-skip_disk: bool = False,
+use_github: bool = False,
repo: str = _repo,
ref: str = "main",
**kwargs,
@@ -355,20 +355,20 @@
institution=institution,
subsite=subsite,
search_paths=search_paths,
-skip_disk=skip_disk,
+use_github=use_github,
repo=repo,
ref=ref,
)
for dset_conf in dset_confs:
-yield dset_conf.load(skip_disk=skip_disk, **kwargs)
+yield dset_conf.load(use_github=use_github, **kwargs)


def join_datasets(
year: int | str = "*",
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
-skip_disk: bool = False,
+use_github: bool = False,
repo: str = _repo,
ref: str = "main",
**kwargs,
@@ -381,15 +381,15 @@
>>> join_datasets(year="2023").shape
(705, 219)
>>> join_datasets(year="2023", skip_disk=True).shape
>>> join_datasets(year="2023", use_github=True).shape
(705, 219)
"""
gen = load_datasets(
year=year,
institution=institution,
subsite=subsite,
search_paths=search_paths,
-skip_disk=skip_disk,
+use_github=use_github,
repo=repo,
ref=ref,
**kwargs,
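
The rename propagates through the loader's module-level functions as well. A sketch with illustrative filter values:

```python
from lydata.loader import available_datasets, load_datasets

# Enumerate dataset configs on GitHub at a given ref instead of on disk.
for conf in available_datasets(use_github=True, ref="main"):
    print(conf.name)

# Stream the matching DataFrames straight from GitHub.
dfs = list(load_datasets(year="2023", use_github=True))
```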
43 changes: 43 additions & 0 deletions lydata/utils.py
@@ -121,6 +121,49 @@ def get_default_modalities() -> dict[str, ModalityConfig]:
}


def enhance(
dataset: pd.DataFrame,
infer_sublevels_kwargs: dict[str, Any] | None = None,
infer_superlevels_kwargs: dict[str, Any] | None = None,
combine_kwargs: dict[str, Any] | None = None,
) -> pd.DataFrame:
"""Enhance the dataset by inferring additional columns from the data.
This performs the following steps in order:
1. Infer the superlevel involvement for each diagnostic modality using the
:py:meth:`~lydata.accessor.LyDataAccessor.infer_superlevels` method.
2. Infer the sublevel involvement for each diagnostic modality using the
:py:meth:`~lydata.accessor.LyDataAccessor.infer_sublevels` method. This skips
all LNLs that were computed in the previous step.
3. Compute the maximum likelihood estimate of the true state of the patient using
the :py:meth:`~lydata.accessor.LyDataAccessor.combine` method.
.. important::
Performing these operations in any other order may lead to the loss of some
information or even to conflicting LNL involvement information.
The result contains all LNLs of interest in the head and neck region, as well as
the best estimate of the true state of the patient under the top-level key
``max_llh``.
"""
infer_sublevels_kwargs = infer_sublevels_kwargs or {}
infer_superlevels_kwargs = infer_superlevels_kwargs or {}
combine_kwargs = combine_kwargs or {}

result = dataset.copy()

result = result.join(result.ly.infer_sublevels(**infer_sublevels_kwargs))
result = result.join(result.ly.infer_superlevels(**infer_superlevels_kwargs))

max_llh = pd.concat(
{"max_llh": result.ly.combine(**combine_kwargs)},
axis="columns",
)
return result.join(max_llh)


def _main() -> None:
"""Run the main function."""
...
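
A short usage sketch of the new `enhance` function; the dataset filter values are illustrative:

```python
import lydata

# Illustrative filters; `load_datasets` yields one DataFrame per dataset.
df = next(lydata.load_datasets(year=2021, institution="usz"))
enhanced = lydata.enhance(df)

# Sub-/superlevels are completed, and the maximum likelihood estimate of
# the true state now sits under the top-level `max_llh` key.
print(enhanced["max_llh"].head())
```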
4 changes: 2 additions & 2 deletions lydata/validator.py
@@ -118,7 +118,7 @@ def validate_datasets(
year: int | str = "*",
institution: str = "*",
subsite: str = "*",
-skip_disk: bool = False,
+use_github: bool = False,
repo: str = "rmnldwg/lydata",
ref: str = "main",
**kwargs,
@@ -140,7 +140,7 @@
year=year,
institution=institution,
subsite=subsite,
-skip_disk=skip_disk,
+use_github=use_github,
repo=repo,
ref=ref,
):
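
The validator follows the same rename. A sketch:

```python
from lydata.validator import validate_datasets

# Validate datasets fetched from GitHub instead of local copies.
validate_datasets(use_github=True, ref="main")
```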
