Merge pull request #11 from rmnldwg/release-0.2.0
Release 0.2.0
rmnldwg authored Nov 14, 2024
2 parents 93f8a6b + d51065d commit 336835b
Showing 3 changed files with 88 additions and 82 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,19 @@

All notable changes to this project will be documented in this file.

## [0.2.0] - 2024-11-14

### Changed

- [**breaking**] Add, rename, and delete several methods (see the migration sketch after this list):
  - `LyDatasetConfig` is now just `LyDataset`
  - the `path` property is now `path_on_disk`
  - the `get_url()` method has been removed
  - the `get_description()` method has been removed
  - added `get_content_file()` method to fetch and store remote content
  - `load()` was renamed to `get_dataframe()`
  - the `repo` argument was changed to `repo_name`
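
A minimal migration sketch under the renames listed above; the dataset spec values are illustrative, and the old 0.1.x calls are shown as comments:

```python
from lydata.loader import LyDataset  # formerly LyDatasetConfig

conf = LyDataset(
    year=2021,
    institution="clb",
    subsite="oropharynx",
    repo_name="rmnldwg/lydata",  # formerly the `repo` argument
)

# 0.1.x:  df = conf.load()
df = conf.get_dataframe()

# 0.1.x:  print(conf.path)
print(conf.path_on_disk)
```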

## [0.1.2] - 2024-10-31

### 🐛 Bug Fixes
@@ -153,6 +166,7 @@ Initial implementation of the lyDATA library.
<!-- generated by git-cliff -->
<!-- markdownlint-disable-file MD024 -->

[0.2.0]: https://github.com/rmnldwg/lydata/compare/0.1.2..0.2.0
[0.1.2]: https://github.com/rmnldwg/lydata/compare/0.1.1..0.1.2
[0.1.1]: https://github.com/rmnldwg/lydata/compare/0.1.0..0.1.1
[0.1.0]: https://github.com/rmnldwg/lydata/compare/0.0.4..0.1.0
144 changes: 68 additions & 76 deletions lydata/loader.py
@@ -1,18 +1,18 @@
"""Provides functions to easily load lyDATA CSV tables as :py:class:`pandas.DataFrame`.
The loading itself is implemented in the :py:class:`.LyDatasetConfig` class, which
The loading itself is implemented in the :py:class:`.LyDataset` class, which
is a :py:class:`pydantic.BaseModel` subclass. It validates the unique specification
that identifies a dataset and then allows loading it from the disk (if present) or
from GitHub.
The :py:func:`available_datasets` function can be used to create a generator of such
:py:class:`.LyDatasetConfig` instances, corresponding to all available datasets that
:py:class:`.LyDataset` instances, corresponding to all available datasets that
are either found on disk or on GitHub.
Consequently, the :py:func:`load_datasets` function can be used to load all datasets
matching the given specs/pattern. It takes the same arguments as the function
:py:func:`available_datasets` but returns a generator of :py:class:`pandas.DataFrame`
instead of :py:class:`.LyDatasetConfig`.
instead of :py:class:`.LyDataset`.
Lastly, with the :py:func:`join_datasets` function, one can load and concatenate all
datasets matching the given specs/pattern into a single :py:class:`pandas.DataFrame`.
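
A short, hedged usage sketch of the three entry points described above (the year pattern is illustrative):

```python
from lydata.loader import available_datasets, join_datasets, load_datasets

# Iterate over the specs of all matching datasets found on disk.
for dset in available_datasets(year=2023):
    print(dset.name)

# Load each matching dataset as its own DataFrame...
frames = list(load_datasets(year=2023))

# ...or concatenate everything into a single DataFrame.
joined = join_datasets(year=2023)
print(joined.shape)
```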
@@ -33,10 +33,11 @@
import numpy as np # noqa: F401
import pandas as pd
from github import Auth, Github, Repository
from github.ContentFile import ContentFile
from mistletoe.block_token import Heading
from mistletoe.markdown_renderer import MarkdownRenderer
from mistletoe.token import Token
from pydantic import BaseModel, Field, constr
from pydantic import BaseModel, Field, PrivateAttr, constr

logger = logging.getLogger(__name__)
_default_repo_name = "rmnldwg/lydata"
@@ -47,7 +48,7 @@ class SkipDiskError(Exception):
"""Raised when the user wants to skip loading from disk."""


class LyDatasetConfig(BaseModel):
class LyDataset(BaseModel):
"""Specification of a dataset."""

year: int = Field(
@@ -58,7 +59,9 @@
institution: low_min1_str = Field(
description="Institution's short code. E.g., University Hospital Zurich: `usz`."
)
subsite: low_min1_str = Field(description="Subsite(s) this dataset covers.")
subsite: low_min1_str = Field(
description="Tumor subsite(s) patients in this dataset were diagnosed with.",
)
repo_name: low_min1_str = Field(
default=_default_repo_name,
description="GitHub `repository/owner`.",
@@ -67,44 +70,29 @@
default="main",
description="Branch/tag/commit of the repo.",
)
_content_file: ContentFile | None = PrivateAttr(default=None)

@property
def name(self) -> str:
"""Get the name of the dataset.
>>> conf = LyDatasetConfig(year=2023, institution="clb", subsite="multisite")
>>> conf = LyDataset(year=2023, institution="clb", subsite="multisite")
>>> conf.name
'2023-clb-multisite'
"""
return f"{self.year}-{self.institution}-{self.subsite}"

@property
def path(self) -> Path:
def path_on_disk(self) -> Path:
"""Get the path to the dataset.
>>> conf = LyDatasetConfig(year="2021", institution="usz", subsite="oropharynx")
>>> conf.path.exists()
>>> conf = LyDataset(year="2021", institution="usz", subsite="oropharynx")
>>> conf.path_on_disk.exists()
True
"""
install_loc = Path(__file__).parent.parent
return install_loc / self.name / "data.csv"

def get_url(self, file: str) -> str:
"""Get the URL to the dataset's directory, CSV file, or README file.
>>> LyDatasetConfig(
... year=2021,
... institution="clb",
... subsite="oropharynx",
... ref="6ac98d",
... ).get_url("data.csv")
'https://raw.githubusercontent.com/rmnldwg/lydata/6ac98d/2021-clb-oropharynx/data.csv'
"""
return (
"https://raw.githubusercontent.com/"
f"{self.repo_name}/{self.ref}/{self.name}/"
) + file

def get_repo(
self,
token: str | None = None,
@@ -118,11 +106,10 @@ def get_repo(
environment variables ``GITHUB_TOKEN`` or ``GITHUB_USER`` and
``GITHUB_PASSWORD``.
>>> conf = LyDatasetConfig(
>>> conf = LyDataset(
... year=2021,
... institution="clb",
... subsite="oropharynx",
... repo="rmnldwg/lydata",
... )
>>> conf.get_repo().full_name == conf.repo_name
True
@@ -133,39 +120,41 @@ def get_repo(
gh = Github(auth=auth)
return gh.get_repo(self.repo_name)
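
A hedged sketch of the credential fallback the docstring above describes; the token value is a placeholder, not a real credential:

```python
import os

from lydata.loader import LyDataset

# Assumption: a personal access token is supplied via the environment,
# as the docstring suggests (GITHUB_TOKEN).
os.environ["GITHUB_TOKEN"] = "<your-token>"  # placeholder

conf = LyDataset(year=2021, institution="clb", subsite="oropharynx")
repo = conf.get_repo()  # no explicit credentials: falls back to GITHUB_TOKEN
print(repo.full_name)   # 'rmnldwg/lydata'
```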

def get_description(
def get_content_file(
self,
token: str | None = None,
user: str | None = None,
password: str | None = None,
) -> str:
"""Get the description of the dataset.
) -> ContentFile:
"""Get the GitHub content file of the data CSV.
First, try to load it from the ``README.md`` file that should sit right next to
the ``data.csv`` file. If that fails, try to look for the ``README.md`` file in
the GitHub repository.
This method always tries to fetch the most recent version of the file.
In the latter case, see :py:func:`.get_repo` for how to authenticate with
GitHub, if necessary.
>>> conf = LyDatasetConfig(year=2021, institution="clb", subsite="oropharynx")
>>> print(conf.get_description()) # doctest: +ELLIPSIS
# 2021 CLB Oropharynx
...
>>> conf = LyDataset(
... year=2023,
... institution="usz",
... subsite="hypopharynx-larynx",
... repo_name="rmnldwg/lydata.private",
... ref="2023-usz-hypopharynx-larynx",
... )
>>> conf.get_content_file()
ContentFile(path="2023-usz-hypopharynx-larynx/data.csv")
"""
readme_path = self.path.with_name("README.md")
if readme_path.exists():
with open(readme_path, encoding="utf-8") as readme:
return format_description(readme, short=True)
if self._content_file is not None:
if self._content_file.update():
logger.info(f"Content file of {self.name} was updated.")
return self._content_file

logger.info(f"Readme not found at {readme_path}. Searching on GitHub...")
repo = self.get_repo(token=token, user=user, password=password)
readme = repo.get_contents(f"{self.name}/README.md").decoded_content.decode()
return format_description(readme, short=True)
self._content_file = repo.get_contents(f"{self.name}/data.csv", ref=self.ref)
return self._content_file
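
A sketch of the caching behavior implemented above: the fetched `ContentFile` is kept in the private attribute, and later calls only refresh it via PyGithub's conditional `update()` request:

```python
from lydata.loader import LyDataset

conf = LyDataset(year=2021, institution="clb", subsite="oropharynx")

cf_first = conf.get_content_file()  # fetches from GitHub and caches
cf_again = conf.get_content_file()  # reuses the cache; only refetches
                                    # if GitHub reports a change
assert cf_first is cf_again
print(cf_again.download_url)
```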

def load(
def get_dataframe(
self,
use_github: bool = False,
token: str | None = None,
user: str | None = None,
password: str | None = None,
**load_kwargs,
) -> pd.DataFrame:
"""Load the ``data.csv`` file from disk or from GitHub.
@@ -177,11 +166,11 @@ def load(
in the :py:attr:`~pandas.DataFrame.attrs` attribute of the returned
:py:class:`~pandas.DataFrame`.
>>> conf = LyDatasetConfig(year=2021, institution="clb", subsite="oropharynx")
>>> df_from_disk = conf.load()
>>> conf = LyDataset(year=2021, institution="clb", subsite="oropharynx")
>>> df_from_disk = conf.get_dataframe()
>>> df_from_disk.shape
(263, 82)
>>> df_from_github = conf.load(use_github=True)
>>> df_from_github = conf.get_dataframe(use_github=True)
>>> np.all(df_from_disk.fillna(0) == df_from_github.fillna(0))
np.True_
"""
@@ -190,15 +179,20 @@

try:
if use_github:
logger.info(f"Skipping loading from {self.path}.")
logger.info(f"Skipping loading from {self.path_on_disk}.")
raise SkipDiskError
df = pd.read_csv(self.path, **kwargs)
df = pd.read_csv(self.path_on_disk, **kwargs)

except (FileNotFoundError, pd.errors.ParserError, SkipDiskError) as err:
if isinstance(err, FileNotFoundError | pd.errors.ParserError):
logger.info(f"Could not load from {self.path}. Trying GitHub...")
logger.info(
f"Could not load from {self.path_on_disk}. Trying GitHub..."
)

df = pd.read_csv(self.get_url("data.csv"), **kwargs)
download_url = self.get_content_file(
token=token, user=user, password=password
).download_url
df = pd.read_csv(download_url, **kwargs)

df.attrs.update(self.model_dump())
return df
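
Because the spec is written into `df.attrs` just before returning (see above), the provenance of a loaded table stays recoverable; a small sketch:

```python
from lydata.loader import LyDataset

conf = LyDataset(year=2021, institution="clb", subsite="oropharynx")
df = conf.get_dataframe()

# Fields from `model_dump()` travel with the DataFrame.
print(df.attrs["year"], df.attrs["institution"], df.attrs["subsite"])
```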
@@ -244,15 +238,15 @@ def _available_datasets_on_disk(
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
) -> Generator[LyDatasetConfig, None, None]:
) -> Generator[LyDataset, None, None]:
pattern = f"{str(year)}-{institution}-{subsite}"
search_paths = search_paths or [Path(__file__).parent.parent]

for search_path in search_paths:
for match in search_path.glob(pattern):
if match.is_dir() and (match / "data.csv").exists():
year, institution, subsite = match.name.split("-")
yield LyDatasetConfig(
yield LyDataset(
year=year,
institution=institution,
subsite=subsite,
@@ -283,12 +277,12 @@ def _available_datasets_on_github(
year: int | str = "*",
institution: str = "*",
subsite: str = "*",
repo: str = _default_repo_name,
repo_name: str = _default_repo_name,
ref: str = "main",
) -> Generator[LyDatasetConfig, None, None]:
) -> Generator[LyDataset, None, None]:
gh = Github(auth=_get_github_auth())

repo = gh.get_repo(repo)
repo = gh.get_repo(repo_name)
contents = repo.get_contents(path="", ref=ref)

matches = []
@@ -300,7 +294,7 @@ def _available_datasets_on_github(

for match in matches:
year, institution, subsite = match.name.split("-", maxsplit=2)
yield LyDatasetConfig(
yield LyDataset(
year=year,
institution=institution,
subsite=subsite,
@@ -315,10 +309,10 @@ def available_datasets(
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo: str = _default_repo_name,
repo_name: str = _default_repo_name,
ref: str = "main",
) -> Generator[LyDatasetConfig, None, None]:
"""Generate :py:class:`.LyDatasetConfig` instances of available datasets.
) -> Generator[LyDataset, None, None]:
"""Generate :py:class:`.LyDataset` instances of available datasets.
The arguments ``year``, ``institution``, and ``subsite`` represent glob patterns
and all datasets matching these patterns can be iterated over using the returned
@@ -340,7 +334,7 @@ def available_datasets(
'2023-clb-multisite',
'2023-isb-multisite']
>>> avail_gen = available_datasets(
... repo="rmnldwg/lydata.private",
... repo_name="rmnldwg/lydata.private",
... ref="2024-umcg-hypopharynx-larynx",
... use_github=True,
... )
@@ -355,11 +349,9 @@
... ref="6ac98d",
... use_github=True,
... )
>>> sorted([ds.get_url("") for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE
['https://raw.githubusercontent.com/rmnldwg/lydata/6ac98d/2024-hvh-oropharynx/']
"""
if not use_github:
if repo != _default_repo_name or ref != "main":
if repo_name != _default_repo_name or ref != "main":
warnings.warn(
"Parameters `repo` and `ref` are ignored, unless `use_github` "
"is set to `True`."
@@ -375,7 +367,7 @@
year=year,
institution=institution,
subsite=subsite,
repo=repo,
repo_name=repo_name,
ref=ref,
)
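
A sketch of the glob matching that the docstring of `available_datasets` describes (the patterns here are examples):

```python
from lydata.loader import available_datasets

# Glob patterns select datasets by year, institution, and subsite.
for dset in available_datasets(year="20*", subsite="*pharynx*"):
    print(dset.name)  # e.g. '2021-usz-oropharynx'
```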

@@ -386,13 +378,13 @@ def load_datasets(
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo: str = _default_repo_name,
repo_name: str = _default_repo_name,
ref: str = "main",
**kwargs,
) -> Generator[pd.DataFrame, None, None]:
"""Load matching datasets from the disk.
It loads every dataset from the :py:class:`.LyDatasetConfig` instances generated by
It loads every dataset from the :py:class:`.LyDataset` instances generated by
the :py:func:`available_datasets` function, which also receives all arguments of
this function.
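
A streaming usage sketch per the description above; the institution pattern is illustrative:

```python
from lydata.loader import load_datasets

# DataFrames are yielded one at a time, so matching datasets can be
# processed without first collecting them all.
for df in load_datasets(institution="usz"):
    print(df.attrs["subsite"], df.shape)
```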
"""
Expand All @@ -402,11 +394,11 @@ def load_datasets(
subsite=subsite,
search_paths=search_paths,
use_github=use_github,
repo=repo,
repo_name=repo_name,
ref=ref,
)
for dset_conf in dset_confs:
yield dset_conf.load(use_github=use_github, **kwargs)
yield dset_conf.get_dataframe(use_github=use_github, **kwargs)


def join_datasets(
Expand All @@ -415,7 +407,7 @@ def join_datasets(
subsite: str = "*",
search_paths: list[Path] | None = None,
use_github: bool = False,
repo: str = _default_repo_name,
repo_name: str = _default_repo_name,
ref: str = "main",
**kwargs,
) -> pd.DataFrame:
@@ -436,7 +428,7 @@
subsite=subsite,
search_paths=search_paths,
use_github=use_github,
repo=repo,
repo_name=repo_name,
ref=ref,
**kwargs,
)
