Commit
Merge pull request #8 from rmnldwg/release-0.1.1
Release 0.1.1
rmnldwg authored Oct 31, 2024
2 parents 3bbb3c4 + 0398c3e commit 3d34db5
Showing 3 changed files with 94 additions and 26 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,14 @@

All notable changes to this project will be documented in this file.

## [0.1.1] - 2024-10-31

### 🚀 Features

- *(load)* add `get_repo()` method that fetches remote repository information for a `LyDatasetConfig`
- *(load)* make authentication more flexible
- *(utils)* put sub-/superlevel inference in its own utility function

## [0.1.0] - 2024-10-28

### 🚀 Features
@@ -139,6 +147,7 @@ Initial implementation of the lyDATA library.
<!-- generated by git-cliff -->
<!-- markdownlint-disable-file MD024 -->

[0.1.1]: https://github.com/rmnldwg/lydata/compare/0.1.0..0.1.1
[0.1.0]: https://github.com/rmnldwg/lydata/compare/0.0.4..0.1.0
[0.0.4]: https://github.com/rmnldwg/lydata/compare/0.0.3..0.0.4
[0.0.3]: https://github.com/rmnldwg/lydata/compare/0.0.2..0.0.3
77 changes: 59 additions & 18 deletions lydata/loader.py
@@ -32,14 +32,14 @@
import mistletoe
import numpy as np # noqa: F401
import pandas as pd
from github import Auth, Github
from github import Auth, Github, Repository
from mistletoe.block_token import Heading
from mistletoe.markdown_renderer import MarkdownRenderer
from mistletoe.token import Token
from pydantic import BaseModel, Field, constr

logger = logging.getLogger(__name__)
_repo = "rmnldwg/lydata"
_default_repo_name = "rmnldwg/lydata"
low_min1_str = constr(to_lower=True, min_length=1)


@@ -59,7 +59,10 @@ class LyDatasetConfig(BaseModel):
description="Institution's short code. E.g., University Hospital Zurich: `usz`."
)
subsite: low_min1_str = Field(description="Subsite(s) this dataset covers.")
repo: low_min1_str = Field(default=_repo, description="GitHub `repository/owner`.")
repo_name: low_min1_str = Field(
default=_default_repo_name,
description="GitHub `repository/owner`.",
)
ref: low_min1_str = Field(
default="main",
description="Branch/tag/commit of the repo.",
@@ -99,17 +102,52 @@ def get_url(self, file: str) -> str:
"""
return (
"https://raw.githubusercontent.com/"
f"{self.repo}/{self.ref}/"
f"{self.year}-{self.institution}-{self.subsite}/"
f"{self.repo_name}/{self.ref}/{self.name}/"
) + file
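
For illustration, with the default ``repo_name`` ("rmnldwg/lydata") and ``ref`` ("main"), the method builds a raw-content URL like the one in the comment below; the dataset values are only an example:

from lydata.loader import LyDatasetConfig

conf = LyDatasetConfig(year=2021, institution="clb", subsite="oropharynx")
print(conf.get_url("data.csv"))
# https://raw.githubusercontent.com/rmnldwg/lydata/main/2021-clb-oropharynx/data.csv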

def get_description(self) -> str:
def get_repo(
self,
token: str | None = None,
user: str | None = None,
password: str | None = None,
) -> Repository:
"""Get the GitHub repository object.
With the arguments ``token`` or ``user`` and ``password``, one can authenticate
with GitHub. If no authentication is provided, the function will try to use the
environment variables ``GITHUB_TOKEN`` or ``GITHUB_USER`` and
``GITHUB_PASSWORD``.
>>> conf = LyDatasetConfig(
... year=2021,
... institution="clb",
... subsite="oropharynx",
... repo="rmnldwg/lydata",
... )
>>> conf.get_repo().full_name == conf.repo_name
True
>>> conf.get_repo().visibility
'public'
"""
auth = _get_github_auth(token=token, user=user, password=password)
gh = Github(auth=auth)
return gh.get_repo(self.repo_name)
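
A minimal usage sketch for ``get_repo()``; the token is a placeholder, and omitting all arguments assumes ``GITHUB_TOKEN`` (or ``GITHUB_USER``/``GITHUB_PASSWORD``) is set in the environment:

from lydata.loader import LyDatasetConfig

conf = LyDatasetConfig(year=2021, institution="clb", subsite="oropharynx")
repo = conf.get_repo(token="ghp_your_token_here")  # or user=... and password=...
print(repo.full_name)   # 'rmnldwg/lydata'
print(repo.visibility)  # 'public'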

def get_description(
self,
token: str | None = None,
user: str | None = None,
password: str | None = None,
) -> str:
"""Get the description of the dataset.
First, try to load it from the ``README.md`` file that should sit right next to
the ``data.csv`` file. If that fails, try to look for the ``README.md`` file in
the GitHub repository.
In the latter case, see :py:func:`.get_repo` for how to authenticate with
GitHub, if necessary.
>>> conf = LyDatasetConfig(year=2021, institution="clb", subsite="oropharynx")
>>> print(conf.get_description()) # doctest: +ELLIPSIS
# 2021 CLB Oropharynx
@@ -121,8 +159,7 @@ def get_description(self) -> str:
return format_description(readme, short=True)

logger.info(f"Readme not found at {readme_path}. Searching on GitHub...")
gh = Github(auth=_get_github_auth())
repo = gh.get_repo(self.repo)
repo = self.get_repo(token=token, user=user, password=password)
readme = repo.get_contents(f"{self.name}/README.md").decoded_content.decode()
return format_description(readme, short=True)
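
A sketch of the GitHub fallback: assuming no local ``README.md`` is found next to ``data.csv``, the description is fetched from the repository. The token is a placeholder; see ``get_repo()`` above for the environment-variable fallback:

from lydata.loader import LyDatasetConfig

conf = LyDatasetConfig(year=2021, institution="clb", subsite="oropharynx")
print(conf.get_description(token="ghp_your_token_here"))
# Prints the short description, starting with "# 2021 CLB Oropharynx".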

@@ -222,10 +259,14 @@ def _available_datasets_on_disk(
)


def _get_github_auth() -> Auth:
token = os.getenv("GITHUB_TOKEN")
user = os.getenv("GITHUB_USER")
password = os.getenv("GITHUB_PASSWORD")
def _get_github_auth(
token: str | None = None,
user: str | None = None,
password: str | None = None,
) -> Auth:
token = token or os.getenv("GITHUB_TOKEN")
user = user or os.getenv("GITHUB_USER")
password = password or os.getenv("GITHUB_PASSWORD")

if token:
logger.debug("Using GITHUB_TOKEN for authentication.")
@@ -242,7 +283,7 @@ def _available_datasets_on_github(
year: int | str = "*",
institution: str = "*",
subsite: str = "*",
repo: str = _repo,
repo: str = _default_repo_name,
ref: str = "main",
) -> Generator[LyDatasetConfig, None, None]:
gh = Github(auth=_get_github_auth())
@@ -263,7 +304,7 @@ def _available_datasets_on_github(
year=year,
institution=institution,
subsite=subsite,
repo=repo.full_name,
repo_name=repo.full_name,
ref=ref,
)

@@ -274,7 +315,7 @@ def available_datasets(
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo: str = _repo,
repo: str = _default_repo_name,
ref: str = "main",
) -> Generator[LyDatasetConfig, None, None]:
"""Generate :py:class:`.LyDatasetConfig` instances of available datasets.
@@ -313,7 +354,7 @@ def available_datasets(
['https://raw.githubusercontent.com/rmnldwg/lydata/6ac98d/2024-hvh-oropharynx/']
"""
if not use_github:
if repo != _repo or ref != "main":
if repo != _default_repo_name or ref != "main":
warnings.warn(
"Parameters `repo` and `ref` are ignored, unless `use_github` "
"is set to `True`."
@@ -340,7 +381,7 @@ def load_datasets(
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo: str = _repo,
repo: str = _default_repo_name,
ref: str = "main",
**kwargs,
) -> Generator[pd.DataFrame, None, None]:
@@ -369,7 +410,7 @@ def join_datasets(
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo: str = _repo,
repo: str = _default_repo_name,
ref: str = "main",
**kwargs,
) -> pd.DataFrame:
34 changes: 26 additions & 8 deletions lydata/utils.py
@@ -121,6 +121,27 @@ def get_default_modalities() -> dict[str, ModalityConfig]:
}


def infer_all_levels(
dataset: pd.DataFrame,
infer_sublevels_kwargs: dict[str, Any] | None = None,
infer_superlevels_kwargs: dict[str, Any] | None = None,
) -> pd.DataFrame:
"""Infer all levels of involvement for each diagnostic modality.
This function first infers sublevel (e.g. 'IIa", and 'IIb') involvement for each
modality using :py:meth:`~lydata.accessor.LyDataAccessor.infer_sublevels`. Then,
it infers superlevel (e.g. 'II') involvement for each modality using
:py:meth:`~lydata.accessor.LyDataAccessor.infer_superlevels`.
"""
infer_sublevels_kwargs = infer_sublevels_kwargs or {}
infer_superlevels_kwargs = infer_superlevels_kwargs or {}

result = dataset.copy()

result = result.join(result.ly.infer_sublevels(**infer_sublevels_kwargs))
return result.join(result.ly.infer_superlevels(**infer_superlevels_kwargs))
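
A minimal sketch of ``infer_all_levels`` on a loaded dataset; it assumes ``load_datasets`` (from ``lydata.loader``) accepts the same year/institution/subsite filters as ``available_datasets``, and that importing ``lydata`` registers the ``.ly`` accessor used internally:

import lydata  # noqa: F401 -- assumed to register the `.ly` accessor
from lydata.loader import load_datasets
from lydata.utils import infer_all_levels

dataset = next(load_datasets(year=2021, institution="clb", subsite="oropharynx", use_github=True))
enhanced = infer_all_levels(dataset)
# `enhanced` now also carries inferred sublevel (e.g. 'IIa'/'IIb') and
# superlevel (e.g. 'II') involvement columns for every diagnostic modality.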


def enhance(
dataset: pd.DataFrame,
infer_sublevels_kwargs: dict[str, Any] | None = None,
@@ -148,15 +169,12 @@ def enhance(
the best estimate of the true state of the patient under the top-level key
``max_llh``.
"""
infer_sublevels_kwargs = infer_sublevels_kwargs or {}
infer_superlevels_kwargs = infer_superlevels_kwargs or {}
result = infer_all_levels(
dataset,
infer_sublevels_kwargs=infer_sublevels_kwargs,
infer_superlevels_kwargs=infer_superlevels_kwargs,
)
combine_kwargs = combine_kwargs or {}

result = dataset.copy()

result = result.join(result.ly.infer_sublevels(**infer_sublevels_kwargs))
result = result.join(result.ly.infer_superlevels(**infer_superlevels_kwargs))

max_llh = pd.concat(
{"max_llh": result.ly.combine(**combine_kwargs)},
axis="columns",
