From 6d07306d76380dfb9590c23b63661d74811b56af Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 7 Mar 2024 08:29:46 +0100 Subject: [PATCH 01/20] test(uni): check if loading empty data works Related: #79 --- tests/binary_unilateral_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/binary_unilateral_test.py b/tests/binary_unilateral_test.py index 57db93e..750342b 100644 --- a/tests/binary_unilateral_test.py +++ b/tests/binary_unilateral_test.py @@ -210,6 +210,12 @@ def setUp(self): self.model.set_params(**self.create_random_params()) self.load_patient_data(filename="2021-usz-oropharynx.csv") + def test_load_empty_dataframe(self): + """Make sure the patient data is loaded correctly.""" + self.model.load_patient_data(self.raw_data.iloc[:0]) + self.assertEqual(len(self.model.patient_data), 0) + self.assertEqual(self.model.likelihood(), 0.) + def test_load_patient_data(self): """Make sure the patient data is loaded correctly.""" self.assertEqual(len(self.model.patient_data), len(self.raw_data)) From df5f8037208c5e67f639211b70a9e758c6932627 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 7 Mar 2024 08:31:30 +0100 Subject: [PATCH 02/20] fix(uni): catch error when `apply` to empty data For some reason, calling `apply` on an empty `DataFrame` returns an entirely different type than calling it on a non-empty one. This caused issue #79 and has now been fixed. 
Fixes: #79 --- lymph/models/unilateral.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index 9c2c76c..c176721 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -583,14 +583,21 @@ def load_patient_data( column = patient_data[modality, side, name] patient_data["_model", modality, name] = column - patient_data[T_STAGE_COL] = patient_data.apply(lambda_mapping, axis=1) + if len(patient_data) == 0: + patient_data[T_STAGE_COL] = None + else: + patient_data[T_STAGE_COL] = patient_data.apply(lambda_mapping, axis=1) + self._patient_data = patient_data + self._cache_version += 1 for t_stage in self.get_t_stages("distributions"): if t_stage not in patient_data[T_STAGE_COL].values: - warnings.warn(f"No data for T-stage {t_stage} found.") + warnings.warn( + message=f"No data for T-stage {t_stage} found.", + category=types.MissingTStageWarning, + ) - self._cache_version += 1 @property From 8fe81e2583faf7cec83198710d9efd27fb73b2c1 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 7 Mar 2024 08:31:51 +0100 Subject: [PATCH 03/20] change: add custom warnings about missing T-stage --- lymph/models/midline.py | 2 +- lymph/types.py | 4 ++++ tests/fixtures.py | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lymph/models/midline.py b/lymph/models/midline.py index 1efc9b9..30fab85 100644 --- a/lymph/models/midline.py +++ b/lymph/models/midline.py @@ -517,7 +517,7 @@ def load_patient_data( if self.marginalize_unknown and is_unknown.sum() > 0: self.unknown.load_patient_data(patient_data[is_unknown], mapping) - else: + elif is_unknown.sum() > 0: warnings.warn( f"Discarding {is_unknown.sum()} patients where midline extension " "is unknown." 
diff --git a/lymph/types.py b/lymph/types.py index b7f0f54..64feebd 100644 --- a/lymph/types.py +++ b/lymph/types.py @@ -9,6 +9,10 @@ from pandas._libs.missing import NAType +class MissingTStageWarning(UserWarning): + """Warning that is raised when a defined T-stage is missing from the data.""" + + class HasSetParams(Protocol): """Protocol for classes that have a ``set_params`` method.""" def set_params(self, *args: float, **kwargs: float) -> tuple[float]: diff --git a/tests/fixtures.py b/tests/fixtures.py index 3770e98..ef068ad 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -15,7 +15,7 @@ from lymph import diagnose_times from lymph.modalities import Clinical, Modality, Pathological from lymph.models import Unilateral -from lymph.types import PatternType +from lymph.types import MissingTStageWarning, PatternType MODALITIES = { "CT": Clinical(spec=0.81, sens=0.86), @@ -30,6 +30,7 @@ class IgnoreWarningsTestCase(unittest.TestCase): def setUp(self) -> None: """Ignore warnings.""" warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning) + warnings.simplefilter("ignore", category=MissingTStageWarning) super().setUp() From 5747148430ff64e4abca92956f00918e48995840 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 7 Mar 2024 18:17:33 +0100 Subject: [PATCH 04/20] test(uni): make sure likelihood is deterministic --- tests/binary_unilateral_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/binary_unilateral_test.py b/tests/binary_unilateral_test.py index 750342b..2c67714 100644 --- a/tests/binary_unilateral_test.py +++ b/tests/binary_unilateral_test.py @@ -319,6 +319,12 @@ def test_likelihood_invalid_params_isinf(self): ) self.assertEqual(likelihood, -np.inf) + def test_compute_likelihood_twice(self): + """Make sure the likelihood is the same when computed twice.""" + likelihood = self.model.likelihood(log=True, mode="HMM") + likelihood_again = self.model.likelihood(log=True, 
mode="HMM") + self.assertEqual(likelihood, likelihood_again) + class RiskTestCase( fixtures.BinaryUnilateralModelMixin, From 2f215026893a2cb1816a5ef67b7968af2555e006 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 7 Mar 2024 18:22:09 +0100 Subject: [PATCH 05/20] change: hash magically changes... --- lymph/matrix.py | 32 ++----------- lymph/models/midline.py | 5 +- lymph/models/unilateral.py | 86 ++++++++++------------------------ tests/binary_bilateral_test.py | 2 + 4 files changed, 34 insertions(+), 91 deletions(-) diff --git a/lymph/matrix.py b/lymph/matrix.py index b034e39..1d5d466 100644 --- a/lymph/matrix.py +++ b/lymph/matrix.py @@ -188,19 +188,16 @@ def generate_data_encoding( patient_data: pd.DataFrame, modalities: dict[str, Modality], lnls: list[str], -) -> pd.DataFrame: +) -> np.ndarray: """Generate the data matrix for a specific T-stage from patient data. - The :py:attr:`~lymph.models.Unilateral.patient_data` needs to contain the column + The :py:attr:`.models.Unilateral.patient_data` needs to contain the column ``"_model"``, which is constructed when loading the data into the model. From this, - a data matrix is constructed for the given ``t_stage``. If ``"_BN"`` is selected, - as T-stage, the data matrix for all patients is returned. This is mainly used for - the computation of the Bayesian network likelihood. + a data matrix is constructed for all present diagnostic modalities. The returned matrix has the shape :math:`2^{N \\cdot \\mathcal{O}} \\times M`, where :math:`N` is the number of lymph node levels, :math:`\\mathcal{O}` is the - number of diagnostic modalities and :math:`M` is the number of patients with the - given ``t_stage`` (or just all patients). + number of diagnostic modalities and :math:`M` is the number of patients. 
""" result = np.ones( shape=(2 ** (len(lnls) * len(modalities)), len(patient_data)), @@ -221,26 +218,7 @@ def generate_data_encoding( result[:,i] = patient_encoding - mi = pd.MultiIndex.from_product([ - ["_model"], ["_encoding"], range(result.shape[0]), - ]) - return pd.DataFrame(result.T, columns=mi) - - -def generate_diagnose_probs( - observation_matrix: np.ndarray, - data_matrix: np.ndarray, -) -> pd.DataFrame: - """Generate the diagnose matrix for a specific T-stage. - - The diagnose matrix is the product of the observation matrix and the data matrix - for the given ``t_stage``. - """ - result = observation_matrix @ data_matrix.T - mi = pd.MultiIndex.from_product([ - ["_model"], ["_diagnose_prob"], range(result.shape[0]), - ]) - return pd.DataFrame(result.T, columns=mi) + return result.T @lru_cache diff --git a/lymph/models/midline.py b/lymph/models/midline.py index 30fab85..1679afe 100644 --- a/lymph/models/midline.py +++ b/lymph/models/midline.py @@ -353,8 +353,9 @@ def get_params( This includes the spread parameters from the call to :py:meth:`get_spread_params` and the distribution parameters from the call to :py:meth:`get_distribution_params`. """ - params = self.get_spread_params(as_flat=as_flat) + params = {} params["mixing"] = self.mixing_param + params.update(self.get_spread_params(as_flat=as_flat)) params["midext_prob"] = self.midext_prob params.update(self.get_distribution_params(as_flat=as_flat)) @@ -473,9 +474,9 @@ def set_params( Combines the calls to :py:meth:`.set_spread_params` and :py:meth:`.set_distribution_params`. 
""" - args = self.set_spread_params(*args, **kwargs) first, args = popfirst(args) self.midext_prob = kwargs.get("midext_prob", first) or self.midext_prob + args = self.set_spread_params(*args, **kwargs) return self.set_distribution_params(*args, **kwargs) diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index c176721..2f7eea6 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -455,39 +455,25 @@ def data_matrix(self, t_stage: str | None = None) -> np.ndarray: if self._patient_data is None: raise AttributeError("No patient data loaded yet.") - # Compute entire data matrix and store it in the patient data DataFrame if it - # is not in the cache - _hash = hash((None, self.modalities_hash(), self._cache_version)) - if _hash not in self._data_matrix_cache: - self.del_data_matrix() - data_encoding = matrix.generate_data_encoding( + # Compute entire data matrix if it is not in the cache + full_hash = hash((None, self.modalities_hash(), self._cache_version)) + if full_hash not in self._data_matrix_cache: + self._data_matrix_cache[full_hash] = matrix.generate_data_encoding( patient_data=self._patient_data, modalities=self.get_all_modalities(), lnls=list(self.graph.lnls.keys()), ) - self._patient_data = pd.concat([self._patient_data, data_encoding], axis=1) - self._data_matrix_cache[_hash] = self._patient_data[ENCODING_COL].to_numpy() - # Return a cache hit - _hash = hash((t_stage, self.modalities_hash(), self._cache_version)) - if _hash in self._data_matrix_cache: - return self._data_matrix_cache[_hash] - - # Extract a subset of the data matrix for a given T-stage from the entire - # data matrix and store it in the cache - has_t_stage = self.patient_data[T_STAGE_COL] == t_stage - has_t_stage = slice(None) if t_stage is None else has_t_stage - result = self.patient_data.loc[has_t_stage, ENCODING_COL].to_numpy() - self._data_matrix_cache[_hash] = result - return result - - def del_data_matrix(self) -> None: - """Delete the data 
matrix.""" - if ( - self._patient_data is not None - and ENCODING_COL in self._patient_data.columns - ): - self._patient_data.drop(columns=ENCODING_COL, inplace=True) + # Extract a subset of the data matrix for a given T-stage. If `t_stage` is + # `None`, this will be skipped and the entire data matrix will be returned. + t_hash = hash((t_stage, self.modalities_hash(), self._cache_version)) + if t_hash not in self._data_matrix_cache: + has_t_stage = self.patient_data[T_STAGE_COL] == t_stage + full_data_matrix = self._data_matrix_cache[full_hash] + t_data_matrix = full_data_matrix[has_t_stage] + self._data_matrix_cache[t_hash] = t_data_matrix + + return self._data_matrix_cache[t_hash] def diagnose_matrix(self, t_stage: str | None = None) -> np.ndarray: @@ -495,42 +481,18 @@ def diagnose_matrix(self, t_stage: str | None = None) -> np.ndarray: For every patient this matrix stores the probability to observe this patient's diagnosis, given one of the possible hidden states of the model. It is computed - by multiplying the :py:attr:`~data_matrix` with the - :py:attr:`~observation_matrix`. + by multiplying the :py:meth:`.data_matrix` with the + :py:meth:`.observation_matrix`. """ - # Compute the entire diagnose matrix and store it in the patient data DataFrame - # if it is not in the cache. Note that this requires the data matrix to be - # computed as well. - _hash = hash((None, self.modalities_hash(), self._cache_version)) + # Compute the entire diagnose matrix if it is not in the cache. Note that this + # requires the data matrix to be computed as well. 
+ _hash = hash((t_stage, self.modalities_hash(), self._cache_version)) if _hash not in self._diagnose_matrix_cache: - self.del_diagnose_matrix() - diagnose_probs = matrix.generate_diagnose_probs( - self.observation_matrix(), self.data_matrix(), + self._diagnose_matrix_cache[_hash] = ( + self.observation_matrix() @ self.data_matrix(t_stage).T ) - self._patient_data = pd.concat([self._patient_data, diagnose_probs], axis=1) - diagnose_matrix = self._patient_data[DIAG_PROB_COL].to_numpy() - self._diagnose_matrix_cache[_hash] = diagnose_matrix - # Return a cache hit - _hash = hash((t_stage, self.modalities_hash(), self._cache_version)) - if _hash in self._diagnose_matrix_cache: - return self._diagnose_matrix_cache[_hash] - - # Extract a subset of the diagnose matrix for a given T-stage from the entire - # diagnose matrix and store it in the cache - has_t_stage = self.patient_data[T_STAGE_COL] == t_stage - has_t_stage = slice(None) if t_stage is None else has_t_stage - result = self.patient_data.loc[has_t_stage, DIAG_PROB_COL].to_numpy() - self._diagnose_matrix_cache[_hash] = result - return result - - def del_diagnose_matrix(self) -> None: - """Delete the diagnose matrix.""" - if ( - self._patient_data is not None - and DIAG_PROB_COL in self._patient_data.columns - ): - self._patient_data.drop(columns=DIAG_PROB_COL, inplace=True) + return self._diagnose_matrix_cache[_hash].T def load_patient_data( @@ -564,8 +526,6 @@ def load_patient_data( .drop(columns="_model", errors="ignore") .reset_index(drop=True) ) - mapping = dict_to_func(mapping) if isinstance(mapping, dict) else mapping - lambda_mapping = lambda row: mapping(row["tumor", "1", "t_stage"]) for modality in self.get_all_modalities().keys(): if modality not in patient_data.columns.levels[0]: @@ -586,6 +546,8 @@ def load_patient_data( if len(patient_data) == 0: patient_data[T_STAGE_COL] = None else: + mapping = dict_to_func(mapping) if isinstance(mapping, dict) else mapping + lambda_mapping = lambda row: 
mapping(row["tumor", "1", "t_stage"]) patient_data[T_STAGE_COL] = patient_data.apply(lambda_mapping, axis=1) self._patient_data = patient_data diff --git a/tests/binary_bilateral_test.py b/tests/binary_bilateral_test.py index 9de4674..db4b54c 100644 --- a/tests/binary_bilateral_test.py +++ b/tests/binary_bilateral_test.py @@ -287,8 +287,10 @@ def setUp(self): def test_compute_likelihood_twice(self): """Test that the likelihood is computed correctly.""" + tmp = hash(1) first_llh = self.model.likelihood(log=True) second_llh = self.model.likelihood(log=True) + self.assertEqual(tmp, hash(1)) self.assertEqual(first_llh, second_llh) From ed5843e813c8687867d0f379f8350673eca743fb Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 7 Mar 2024 22:27:54 +0100 Subject: [PATCH 06/20] fix(bi): data reload loads wrong side Now the data does not get reloaded anymore, which was actually unnecessary in the first place. --- lymph/matrix.py | 2 ++ lymph/modalities.py | 28 ---------------------------- lymph/models/unilateral.py | 19 ++++++------------- tests/binary_bilateral_test.py | 2 -- tests/binary_unilateral_test.py | 14 +++++++++++--- 5 files changed, 19 insertions(+), 46 deletions(-) diff --git a/lymph/matrix.py b/lymph/matrix.py index 1d5d466..1e35fdf 100644 --- a/lymph/matrix.py +++ b/lymph/matrix.py @@ -4,6 +4,7 @@ # pylint: disable=too-few-public-methods from __future__ import annotations +import warnings from functools import lru_cache from typing import Iterable @@ -208,6 +209,7 @@ def generate_data_encoding( patient_encoding = np.ones(shape=1, dtype=bool) for modality_name in modalities.keys(): if modality_name not in patient_row: + warnings.warn(f"Modality {modality_name} not in data. 
Skipping.") continue diagnose_encoding = compute_encoding( lnls=lnls, diff --git a/lymph/modalities.py b/lymph/modalities.py index 5876f74..5b91e57 100644 --- a/lymph/modalities.py +++ b/lymph/modalities.py @@ -10,7 +10,6 @@ import warnings from abc import ABC, abstractmethod -from contextlib import contextmanager from typing import Literal, TypeVar import numpy as np @@ -182,7 +181,6 @@ def __init__( modality_children = {} # ignore any provided children self._modality_children = modality_children - self._stored_modalities_hash = self.modalities_hash() @property @@ -220,32 +218,6 @@ def modalities_hash(self: MC) -> int: return hash_res - def have_modalities_changed(self: MC) -> bool: - """Return whether the modalities have changed since the last check.""" - return self._stored_modalities_hash != self.modalities_hash() - - def acknowledge_modalities_change(self: MC) -> None: - """Acknowledge that the modalities have changed.""" - self._stored_modalities_hash = self.modalities_hash() - - - @contextmanager - def modality_context(self: MC): - """Context that yields the check if the modalities have changed. - - The context binds the result of calling :py:meth:`.have_modalities_changed` to - the ``as`` clause. Then, inside the ``with`` block, one can check if the - modalities have changed since the last check. Upon exiting the ``with`` - context, the stored hash of the modalities is updated and - :py:meth:`.have_modalities_changed` will return ``False`` until the - modalities are changed again. 
- """ - try: - yield self.have_modalities_changed() - finally: - self.acknowledge_modalities_change() - - def get_modality(self: MC, name: str) -> Modality: """Return the modality with the given ``name``.""" return self.get_all_modalities()[name] diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index 2f7eea6..a098be0 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -527,10 +527,8 @@ def load_patient_data( .reset_index(drop=True) ) - for modality in self.get_all_modalities().keys(): - if modality not in patient_data.columns.levels[0]: - raise ValueError(f"{modality} data not found.") - + data_modalities = set(patient_data.columns.levels[0]) - {"patient", "tumor"} + for modality in data_modalities: if side not in patient_data[modality]: raise ValueError(f"{side}lateral involvement data not found.") @@ -547,8 +545,10 @@ def load_patient_data( patient_data[T_STAGE_COL] = None else: mapping = dict_to_func(mapping) if isinstance(mapping, dict) else mapping - lambda_mapping = lambda row: mapping(row["tumor", "1", "t_stage"]) - patient_data[T_STAGE_COL] = patient_data.apply(lambda_mapping, axis=1) + patient_data[T_STAGE_COL] = patient_data.apply( + lambda row: mapping(row["tumor", "1", "t_stage"]), + axis=1, + ) self._patient_data = patient_data self._cache_version += 1 @@ -583,13 +583,6 @@ def patient_data(self) -> pd.DataFrame: if self._patient_data is None: raise AttributeError("No patient data loaded yet.") - with self.modality_context() as has_changed: - # we need to reload the patient data when the modalities have changed, - # since it stores only those diagnoses under the ``"_model"`` header that - # are relevant for the current modalities (which can change). 
- if has_changed: - self.load_patient_data(self._patient_data) - # if not present, this will recompute the full data and diagnose matrices _ = self.diagnose_matrix() diff --git a/tests/binary_bilateral_test.py b/tests/binary_bilateral_test.py index db4b54c..9de4674 100644 --- a/tests/binary_bilateral_test.py +++ b/tests/binary_bilateral_test.py @@ -287,10 +287,8 @@ def setUp(self): def test_compute_likelihood_twice(self): """Test that the likelihood is computed correctly.""" - tmp = hash(1) first_llh = self.model.likelihood(log=True) second_llh = self.model.likelihood(log=True) - self.assertEqual(tmp, hash(1)) self.assertEqual(first_llh, second_llh) diff --git a/tests/binary_unilateral_test.py b/tests/binary_unilateral_test.py index 2c67714..4ef1e0c 100644 --- a/tests/binary_unilateral_test.py +++ b/tests/binary_unilateral_test.py @@ -282,10 +282,18 @@ def test_diagnose_matrices(self): )) def test_modality_replacement(self) -> None: - """Check if the patient data gets updated when the modalities change.""" + """Check if the data & diagnose matrices get updated when modalities change.""" + data_matrix = self.model.data_matrix() + diagnose_matrix = self.model.diagnose_matrix() self.model.replace_all_modalities({"PET": Clinical(spec=0.8, sens=0.8)}) - self.assertTrue("PET" in self.model.patient_data["_model"].columns) - self.assertFalse("CT" in self.model.patient_data["_model"].columns) + self.assertNotEqual( + hash(data_matrix.tobytes()), + hash(self.model.data_matrix().tobytes()), + ) + self.assertNotEqual( + hash(diagnose_matrix.tobytes()), + hash(self.model.diagnose_matrix().tobytes()), + ) class LikelihoodTestCase( From fb69f0b80c2f54b497e28308cc922ca0d7eec4bd Mon Sep 17 00:00:00 2001 From: rmnldwg <48687784+rmnldwg@users.noreply.github.com> Date: Fri, 8 Mar 2024 08:35:30 +0100 Subject: [PATCH 07/20] change(uni)!: shorten two (unused) method names --- lymph/models/unilateral.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff 
--git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index a098be0..0c6daa6 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -11,7 +11,7 @@ from lymph import diagnose_times, graph, matrix, modalities, types # pylint: disable=unused-import -from lymph.helper import ( # nopycln: import +from lymph.utils import ( # nopycln: import add_or_mult, dict_to_func, draw_diagnoses, @@ -23,9 +23,9 @@ warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning) -ENCODING_COL = ("_model", "_encoding") -DIAG_PROB_COL = ("_model", "_diagnose_prob") -T_STAGE_COL = ("_model", "#", "t_stage") + +MAP_T_COL = ("_model", "#", "t_stage") +RAW_T_COL = ("tumor", "1", "t_stage") class Unilateral( @@ -158,7 +158,7 @@ def get_t_stages( if which in ("valid", "data"): try: - data_t_stages = self.patient_data[T_STAGE_COL].unique() + data_t_stages = self.patient_data[MAP_T_COL].unique() except AttributeError: data_t_stages = [] @@ -290,7 +290,7 @@ def set_params(self, *args: float, **kwargs: float) -> tuple[float]: return self.set_distribution_params(*args, **kwargs) - def comp_transition_prob( + def transition_prob( self, newstate: list[int], assign: bool = False @@ -313,7 +313,7 @@ def comp_transition_prob( return trans_prob - def comp_diagnose_prob( + def diagnose_prob( self, diagnoses: pd.Series | dict[str, dict[str, bool]] ) -> float: @@ -468,7 +468,7 @@ def data_matrix(self, t_stage: str | None = None) -> np.ndarray: # `None`, this will be skipped and the entire data matrix will be returned. 
t_hash = hash((t_stage, self.modalities_hash(), self._cache_version)) if t_hash not in self._data_matrix_cache: - has_t_stage = self.patient_data[T_STAGE_COL] == t_stage + has_t_stage = self.patient_data[MAP_T_COL] == t_stage full_data_matrix = self._data_matrix_cache[full_hash] t_data_matrix = full_data_matrix[has_t_stage] self._data_matrix_cache[t_hash] = t_data_matrix @@ -542,11 +542,11 @@ def load_patient_data( patient_data["_model", modality, name] = column if len(patient_data) == 0: - patient_data[T_STAGE_COL] = None + patient_data[MAP_T_COL] = None else: mapping = dict_to_func(mapping) if isinstance(mapping, dict) else mapping - patient_data[T_STAGE_COL] = patient_data.apply( - lambda row: mapping(row["tumor", "1", "t_stage"]), + patient_data[MAP_T_COL] = patient_data.apply( + lambda row: mapping(row[RAW_T_COL]), axis=1, ) @@ -554,7 +554,7 @@ def load_patient_data( self._cache_version += 1 for t_stage in self.get_t_stages("distributions"): - if t_stage not in patient_data[T_STAGE_COL].values: + if t_stage not in patient_data[MAP_T_COL].values: warnings.warn( message=f"No data for T-stage {t_stage} found.", category=types.MissingTStageWarning, @@ -959,6 +959,6 @@ def draw_patients( multi_cols = pd.MultiIndex.from_product([modality_names, ["ipsi"], lnl_names]) dataset = pd.DataFrame(drawn_obs, columns=multi_cols) - dataset[("tumor", "1", "t_stage")] = drawn_t_stages + dataset[(RAW_T_COL)] = drawn_t_stages return dataset From 889595f19f9203ab31ba3971b70ff584dab9c2d9 Mon Sep 17 00:00:00 2001 From: rmnldwg <48687784+rmnldwg@users.noreply.github.com> Date: Fri, 8 Mar 2024 08:35:52 +0100 Subject: [PATCH 08/20] change!: `helpers` are now `utils` --- lymph/__init__.py | 2 +- lymph/diagnose_times.py | 2 +- lymph/graph.py | 2 +- lymph/matrix.py | 2 +- lymph/models/bilateral.py | 2 +- lymph/models/midline.py | 2 +- lymph/{helper.py => utils.py} | 0 tests/binary_bilateral_test.py | 2 +- tests/doc_test.py | 4 ++-- tests/trinary_unilateral_test.py | 2 +- 10 files changed, 
10 insertions(+), 10 deletions(-) rename lymph/{helper.py => utils.py} (100%) diff --git a/lymph/__init__.py b/lymph/__init__.py index ed0a6e1..73df9a3 100644 --- a/lymph/__init__.py +++ b/lymph/__init__.py @@ -17,7 +17,7 @@ # nopycln: file from lymph import diagnose_times, graph, matrix, models -from lymph.helper import clinical, pathological +from lymph.utils import clinical, pathological __all__ = [ "diagnose_times", "matrix", diff --git a/lymph/diagnose_times.py b/lymph/diagnose_times.py index 0c5671b..6480f56 100644 --- a/lymph/diagnose_times.py +++ b/lymph/diagnose_times.py @@ -21,7 +21,7 @@ import numpy as np -from lymph.helper import flatten, popfirst, unflatten_and_split +from lymph.utils import flatten, popfirst, unflatten_and_split logger = logging.getLogger(__name__) diff --git a/lymph/graph.py b/lymph/graph.py index f23cb43..342b9d6 100644 --- a/lymph/graph.py +++ b/lymph/graph.py @@ -18,7 +18,7 @@ import numpy as np -from lymph.helper import ( +from lymph.utils import ( check_unique_names, comp_transition_tensor, flatten, diff --git a/lymph/matrix.py b/lymph/matrix.py index 1e35fdf..cccbef9 100644 --- a/lymph/matrix.py +++ b/lymph/matrix.py @@ -12,7 +12,7 @@ import pandas as pd from lymph import graph -from lymph.helper import get_state_idx_matrix, row_wise_kron, tile_and_repeat +from lymph.utils import get_state_idx_matrix, row_wise_kron, tile_and_repeat from lymph.modalities import Modality diff --git a/lymph/models/bilateral.py b/lymph/models/bilateral.py index 630f761..c8d4ca6 100644 --- a/lymph/models/bilateral.py +++ b/lymph/models/bilateral.py @@ -8,7 +8,7 @@ import pandas as pd from lymph import diagnose_times, matrix, modalities, models, types -from lymph.helper import ( +from lymph.utils import ( add_or_mult, early_late_mapping, flatten, diff --git a/lymph/models/midline.py b/lymph/models/midline.py index 1679afe..095bdf1 100644 --- a/lymph/models/midline.py +++ b/lymph/models/midline.py @@ -8,7 +8,7 @@ import pandas as pd from lymph import 
diagnose_times, matrix, modalities, models, types -from lymph.helper import ( +from lymph.utils import ( add_or_mult, draw_diagnoses, early_late_mapping, diff --git a/lymph/helper.py b/lymph/utils.py similarity index 100% rename from lymph/helper.py rename to lymph/utils.py diff --git a/tests/binary_bilateral_test.py b/tests/binary_bilateral_test.py index 9de4674..9afc18d 100644 --- a/tests/binary_bilateral_test.py +++ b/tests/binary_bilateral_test.py @@ -5,7 +5,7 @@ import numpy as np from lymph import models -from lymph.helper import flatten +from lymph.utils import flatten from . import fixtures diff --git a/tests/doc_test.py b/tests/doc_test.py index 7265cf3..140dd43 100644 --- a/tests/doc_test.py +++ b/tests/doc_test.py @@ -4,7 +4,7 @@ import doctest import unittest -from lymph import diagnose_times, graph, helper, matrix, modalities +from lymph import diagnose_times, graph, matrix, modalities, utils from lymph.models import bilateral, unilateral @@ -12,7 +12,7 @@ def load_tests(loader, tests: unittest.TestSuite, ignore): """Load doctests from the lymph package.""" tests.addTests(doctest.DocTestSuite(diagnose_times)) tests.addTests(doctest.DocTestSuite(graph)) - tests.addTests(doctest.DocTestSuite(helper)) + tests.addTests(doctest.DocTestSuite(utils)) tests.addTests(doctest.DocTestSuite(matrix)) tests.addTests(doctest.DocTestSuite(modalities)) diff --git a/tests/trinary_unilateral_test.py b/tests/trinary_unilateral_test.py index d57617a..d022019 100644 --- a/tests/trinary_unilateral_test.py +++ b/tests/trinary_unilateral_test.py @@ -4,7 +4,7 @@ import pandas as pd from lymph.graph import LymphNodeLevel -from lymph.helper import set_params_for +from lymph.utils import set_params_for from . 
import fixtures From d7d57acb904643cd52b1aef7ce1dc42ea151abc8 Mon Sep 17 00:00:00 2001 From: rmnldwg <48687784+rmnldwg@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:32:40 +0100 Subject: [PATCH 09/20] fix(uni): return correctly in `get_spread_params` --- docs/source/api.rst | 2 +- docs/source/helper.rst | 12 ------------ docs/source/utils.rst | 12 ++++++++++++ lymph/models/unilateral.py | 18 +++++++++++++----- lymph/types.py | 4 ++++ 5 files changed, 30 insertions(+), 18 deletions(-) delete mode 100644 docs/source/helper.rst create mode 100644 docs/source/utils.rst diff --git a/docs/source/api.rst b/docs/source/api.rst index 0b49f89..418711c 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -9,7 +9,7 @@ Detailed API models components types - helper + utils Index & search diff --git a/docs/source/helper.rst b/docs/source/helper.rst deleted file mode 100644 index 687b661..0000000 --- a/docs/source/helper.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. module: helper - -.. _helper: - - -Helper Functions -================ - -.. automodule:: lymph.helper - :members: - :special-members: __init__ - :show-inheritance: diff --git a/docs/source/utils.rst b/docs/source/utils.rst new file mode 100644 index 0000000..ae6237c --- /dev/null +++ b/docs/source/utils.rst @@ -0,0 +1,12 @@ +.. module: utils + +.. _utils: + + +Utility Functions +================= + +.. 
automodule:: lymph.utils + :members: + :special-members: __init__ + :show-inheritance: diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index 0c6daa6..489cfc5 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -202,10 +202,13 @@ def get_spread_params( as_flat: bool = True, ) -> Iterable[float] | dict[str, float]: """Get the parameters of the spread edges.""" - return { - **self.get_tumor_spread_params(as_dict, as_flat), - **self.get_lnl_spread_params(as_dict, as_flat), - } + params = self.get_tumor_spread_params(as_flat=as_flat) + params.update(self.get_lnl_spread_params(as_flat=as_flat)) + + if as_flat or not as_dict: + params = flatten(params) + + return params if as_dict else params.values() def get_params( @@ -530,7 +533,12 @@ def load_patient_data( data_modalities = set(patient_data.columns.levels[0]) - {"patient", "tumor"} for modality in data_modalities: if side not in patient_data[modality]: - raise ValueError(f"{side}lateral involvement data not found.") + warnings.warn( + f"{side}lateral involvement data not found. 
Skipping " + f"modality {modality}.", + category=types.InvalidDataModalityWarning, + ) + continue for name in self.graph.lnls.keys(): modality_side_data = patient_data[modality, side] diff --git a/lymph/types.py b/lymph/types.py index 64feebd..8ebd740 100644 --- a/lymph/types.py +++ b/lymph/types.py @@ -9,6 +9,10 @@ from pandas._libs.missing import NAType +class InvalidDataModalityWarning(UserWarning): + """Warning that is raised when a header in the data is not a valid modality.""" + + class MissingTStageWarning(UserWarning): """Warning that is raised when a defined T-stage is missing from the data.""" From 3dc1fd038e65de53119af9867c3f2eabc84fe743 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 11 Mar 2024 15:48:06 +0100 Subject: [PATCH 10/20] feat(type): make warnings more granular --- lymph/models/unilateral.py | 20 +++++++++++++------- lymph/types.py | 14 +++++++++++--- tests/fixtures.py | 4 ++-- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index 489cfc5..9df3406 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -536,18 +536,24 @@ def load_patient_data( warnings.warn( f"{side}lateral involvement data not found. Skipping " f"modality {modality}.", - category=types.InvalidDataModalityWarning, + category=types.MissingLateralisationWarning, ) continue - for name in self.graph.lnls.keys(): + for lnl in self.graph.lnls.keys(): modality_side_data = patient_data[modality, side] - if name not in modality_side_data: - raise ValueError(f"Involvement data for LNL {name} not found.") - - column = patient_data[modality, side, name] - patient_data["_model", modality, name] = column + if lnl not in modality_side_data: + warnings.warn( + f"Modality {modality} does not contain involvement data for " + f"LNL {lnl}. 
Assuming unknown.", + category=types.MissingLNLWarning, + ) + column = None + else: + column = patient_data[modality, side, lnl] + + patient_data["_model", modality, lnl] = column if len(patient_data) == 0: patient_data[MAP_T_COL] = None diff --git a/lymph/types.py b/lymph/types.py index 8ebd740..c9a576b 100644 --- a/lymph/types.py +++ b/lymph/types.py @@ -9,11 +9,19 @@ from pandas._libs.missing import NAType -class InvalidDataModalityWarning(UserWarning): - """Warning that is raised when a header in the data is not a valid modality.""" +class DataWarning(UserWarning): + """Parent class of all warnings related to potential data issues.""" -class MissingTStageWarning(UserWarning): +class MissingLNLWarning(DataWarning): + """Raised when a lymph node level is missing from the data.""" + + +class MissingLateralisationWarning(DataWarning): + """Raised when a lateralisation is missing from the data.""" + + +class MissingTStageWarning(DataWarning): """Warning that is raised when a defined T-stage is missing from the data.""" diff --git a/tests/fixtures.py b/tests/fixtures.py index ef068ad..4b0903c 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -15,7 +15,7 @@ from lymph import diagnose_times from lymph.modalities import Clinical, Modality, Pathological from lymph.models import Unilateral -from lymph.types import MissingTStageWarning, PatternType +from lymph.types import DataWarning, PatternType MODALITIES = { "CT": Clinical(spec=0.81, sens=0.86), @@ -30,7 +30,7 @@ class IgnoreWarningsTestCase(unittest.TestCase): def setUp(self) -> None: """Ignore warnings.""" warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning) - warnings.simplefilter("ignore", category=MissingTStageWarning) + warnings.simplefilter("ignore", category=DataWarning) super().setUp() From 5423214b40ff826f105209b95163b05539d9717b Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 11 Mar 2024 15:49:34 +0100 Subject: [PATCH 11/20] 
fix(mid): consume & return params in same order --- lymph/models/midline.py | 5 ++--- tests/binary_midline_test.py | 11 +++++++++++ tests/binary_unilateral_test.py | 7 ++++--- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/lymph/models/midline.py b/lymph/models/midline.py index 095bdf1..ee16ec0 100644 --- a/lymph/models/midline.py +++ b/lymph/models/midline.py @@ -8,6 +8,7 @@ import pandas as pd from lymph import diagnose_times, matrix, modalities, models, types +from lymph.types import DiagnoseType, PatternType from lymph.utils import ( add_or_mult, draw_diagnoses, @@ -16,7 +17,6 @@ popfirst, unflatten_and_split, ) -from lymph.types import DiagnoseType, PatternType warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning) logger = logging.getLogger(__name__) @@ -354,9 +354,8 @@ def get_params( and the distribution parameters from the call to :py:meth:`get_distribution_params`. """ params = {} - params["mixing"] = self.mixing_param - params.update(self.get_spread_params(as_flat=as_flat)) params["midext_prob"] = self.midext_prob + params.update(self.get_spread_params(as_flat=as_flat)) params.update(self.get_distribution_params(as_flat=as_flat)) if as_flat or not as_dict: diff --git a/tests/binary_midline_test.py b/tests/binary_midline_test.py index bc07f96..98caa80 100644 --- a/tests/binary_midline_test.py +++ b/tests/binary_midline_test.py @@ -71,6 +71,17 @@ def test_set_spread_params(self) -> None: ) + def test_get_set_params_order(self) -> None: + """Check if the order of getter and setter is the same.""" + num_dims = self.model.get_num_dims() + params_to_set = np.linspace(0., 1., num_dims + 1) + unused_param = self.model.set_params(*params_to_set) + returned_params = list(self.model.get_params(as_dict=False)) + + self.assertEqual(unused_param, params_to_set[-1]) + self.assertEqual(params_to_set[:-1].tolist(), returned_params) + + class MidlineLikelihoodTestCase(fixtures.IgnoreWarningsTestCase): """Check that the likelihood function 
works correctly.""" diff --git a/tests/binary_unilateral_test.py b/tests/binary_unilateral_test.py index 4ef1e0c..20f8228 100644 --- a/tests/binary_unilateral_test.py +++ b/tests/binary_unilateral_test.py @@ -1,7 +1,10 @@ """Test the binary unilateral system.""" +import warnings + import numpy as np +from lymph import types from lymph.graph import LymphNodeLevel, Tumor from lymph.modalities import Clinical @@ -205,6 +208,7 @@ class PatientDataTestCase( def setUp(self): """Load patient data.""" super().setUp() + warnings.simplefilter("ignore", category=types.InvalidDataWarning) self.model.replace_all_modalities(fixtures.MODALITIES) self.init_diag_time_dists(early="frozen", late="parametric", foo="frozen") self.model.set_params(**self.create_random_params()) @@ -219,9 +223,6 @@ def test_load_empty_dataframe(self): def test_load_patient_data(self): """Make sure the patient data is loaded correctly.""" self.assertEqual(len(self.model.patient_data), len(self.raw_data)) - self.assertRaises( - ValueError, self.model.load_patient_data, self.raw_data, side="foo" - ) def test_t_stages(self): """Make sure all T-stages are present.""" From 1e4fa5dc2ac4c71026f1e03155d65c0b6f58fe2b Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 14 Mar 2024 13:35:54 +0100 Subject: [PATCH 12/20] change(type): add type definition for graph dict --- lymph/models/bilateral.py | 2 +- lymph/models/midline.py | 2 +- lymph/models/unilateral.py | 2 +- lymph/types.py | 19 ++++++++++++++++++- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/lymph/models/bilateral.py b/lymph/models/bilateral.py index c8d4ca6..7496f59 100644 --- a/lymph/models/bilateral.py +++ b/lymph/models/bilateral.py @@ -39,7 +39,7 @@ class Bilateral( """ def __init__( self, - graph_dict: dict[tuple[str], list[str]], + graph_dict: types.GraphDictType, is_symmetric: dict[str, bool] | None = None, uni_kwargs: dict[str, Any] | None = None, ipsi_kwargs: dict[str, Any] | None = 
None, diff --git a/lymph/models/midline.py b/lymph/models/midline.py index ee16ec0..4d01fc3 100644 --- a/lymph/models/midline.py +++ b/lymph/models/midline.py @@ -56,7 +56,7 @@ class Midline( """ def __init__( self, - graph_dict: dict[tuple[str], list[str]], + graph_dict: types.GraphDictType, is_symmetric: dict[str, bool] | None = None, use_mixing: bool = True, use_central: bool = True, diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index 9df3406..033d215 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -43,7 +43,7 @@ class Unilateral( """ def __init__( self, - graph_dict: dict[tuple[str], list[str]], + graph_dict: types.GraphDictType, tumor_state: int | None = None, allowed_states: list[int] | None = None, max_time: int = 10, diff --git a/lymph/types.py b/lymph/types.py index c9a576b..a73897b 100644 --- a/lymph/types.py +++ b/lymph/types.py @@ -41,13 +41,30 @@ def get_params( ... -PatternType = dict[str, bool | NAType | None] +GraphDictType = dict[tuple[str, str], list[str]] +"""Type alias for a graph dictionary. + +A dictionary of this form specifies the structure of the underlying graph. Example: + +>>> graph_dict = { +... ("tumor", "T"): ["I", "II", "III"], +... ("lnl", "I"): ["II"], +... ("lnl", "II"): ["III"], +... ("lnl", "III"): [], +... } +""" + + +PatternType = dict[str, bool | str | NAType | None] """Type alias for an involvement pattern. An involvement pattern is a dictionary with keys for the lymph node levels and values for the involvement of the respective lymph nodes. The values are either True, False, or None, which means that the involvement is unknown. +TODO: Document the new possibilities to specify trinary involvement. 
+See :py:func:`.matrix.compute_encoding` + >>> pattern = {"I": True, "II": False, "III": None} """ From a03e3fcc5291ff8680be679dedc4eb0390ba9a59 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 14 Mar 2024 15:46:04 +0100 Subject: [PATCH 13/20] change(diag): use partials to save parametric dist --- lymph/diagnose_times.py | 41 ++++++++++++++++++++------------- lymph/types.py | 12 +++++----- tests/binary_unilateral_test.py | 2 +- 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/lymph/diagnose_times.py b/lymph/diagnose_times.py index 6480f56..c47ee9f 100644 --- a/lymph/diagnose_times.py +++ b/lymph/diagnose_times.py @@ -17,6 +17,7 @@ import logging import warnings from abc import ABC +from functools import partial from typing import Any, Iterable, TypeVar import numpy as np @@ -36,6 +37,7 @@ def __init__( self, distribution: Iterable[float] | callable, max_time: int | None = None, + **kwargs, ) -> None: """Initialize a distribution over diagnose times. @@ -59,23 +61,29 @@ def __init__( ``max_time`` + 1 don't match, in case it is accidentally provided. 
""" if callable(distribution): - self._init_from_callable(distribution, max_time) + self._init_from_callable(distribution, max_time, **kwargs) elif isinstance(distribution, Distribution): self._init_from_instance(distribution) else: self._init_from_frozen(distribution, max_time) - def _init_from_callable(self, distribution: callable, max_time: int | None = None): + def _init_from_callable( + self, + distribution: callable, + max_time: int | None = None, + **kwargs, + ): """Initialize the distribution from a callable distribution.""" if max_time is None: raise ValueError("max_time must be provided if a function is passed") if max_time < 0: raise ValueError("max_time must be a positive integer") + func_kwargs = self.extract_kwargs(distribution) + func_kwargs.update(kwargs) self.max_time = max_time - self._kwargs = self.extract_kwargs(distribution) - self._func = distribution + self._func = partial(distribution, **func_kwargs) self._frozen = self.pmf @@ -85,8 +93,7 @@ def _init_from_instance(self, instance: Distribution): self._init_from_frozen(instance.pmf, instance.max_time) else: self.max_time = instance.max_time - self._kwargs = instance._kwargs.copy() - self._func = instance._func + self._func = partial(instance._func, **instance._func.keywords) self._frozen = self.pmf @@ -102,7 +109,6 @@ def _init_from_frozen(self, distribution: Iterable[float], max_time: int | None ) self.max_time = max_time - self._kwargs = {} self._func = None self._frozen = self.normalize(distribution) @@ -140,9 +146,12 @@ def __eq__(self, other) -> bool: if not isinstance(other, Distribution): return False + if not self.is_updateable and not other.is_updateable: + return np.all(self.pmf == other.pmf) + return ( self.is_updateable == other.is_updateable - and self._kwargs == other._kwargs + and self._func.keywords == other._func.keywords and np.all(self.pmf == other.pmf) ) @@ -156,8 +165,8 @@ def __hash__(self) -> int: :py:meth:`.is_updateable` returns ``True`` -- the stored keyword 
arguments of the parametric distribution. """ - kwarg_tpl = tuple(self._kwargs.items()) - return hash((self.is_updateable, kwarg_tpl, self.pmf.tobytes())) + args_and_kwargs_tpl = self._func.args + tuple(self._func.keywords.items()) + return hash((self.is_updateable, args_and_kwargs_tpl, self.pmf.tobytes())) @property @@ -186,7 +195,7 @@ def normalize(distribution: np.ndarray) -> np.ndarray: def pmf(self) -> np.ndarray: """Return the probability mass function of the distribution if it is frozen.""" if not hasattr(self, "_frozen") or self._frozen is None: - self._frozen = self.normalize(self._func(self.support, **self._kwargs)) + self._frozen = self.normalize(self._func(self.support)) return self._frozen @@ -213,7 +222,7 @@ def get_params( warnings.warn("Distribution is not updateable, returning empty dict") return {} if as_dict else None - return self._kwargs if as_dict else self._kwargs.values() + return self._func.keywords if as_dict else self._func.keywords.values() def set_params(self, *args: float, **kwargs: float) -> tuple[float]: @@ -231,18 +240,18 @@ def set_params(self, *args: float, **kwargs: float) -> tuple[float]: warnings.warn("Distribution is not updateable, ignoring parameters") return args - old_kwargs = self._kwargs.copy() + old_kwargs = self._func.keywords.copy() - for name, value in self._kwargs.items(): + for name, value in self._func.keywords.items(): first, args = popfirst(args) - self._kwargs[name] = first or kwargs.get(name, value) + self._func.keywords[name] = first or kwargs.get(name, value) if hasattr(self, "_frozen"): del self._frozen try: _ = self.pmf except ValueError as val_err: - self._kwargs = old_kwargs + self._func.keywords.update(old_kwargs) raise ValueError("Invalid params provided to distribution") from val_err return args diff --git a/lymph/types.py b/lymph/types.py index a73897b..6bd1a53 100644 --- a/lymph/types.py +++ b/lymph/types.py @@ -78,7 +78,7 @@ def get_params( """ -M = TypeVar("M", bound="Model") +ModelT = 
TypeVar("ModelT", bound="Model") class Model(ABC): """Abstract base class for models. @@ -88,7 +88,7 @@ class Model(ABC): """ @abstractmethod def get_params( - self: M, + self: ModelT, as_dict: bool = True, as_flat: bool = True, ) -> Iterable[float] | dict[str, float]: @@ -102,7 +102,7 @@ def get_params( """ raise NotImplementedError - def get_num_dims(self: M, mode: Literal["HMM", "BN"] = "HMM") -> int: + def get_num_dims(self: ModelT, mode: Literal["HMM", "BN"] = "HMM") -> int: """Return the number of dimensions of the parameter space. A hidden Markov model (``mode="HMM"``) typically has more parameters than a @@ -117,7 +117,7 @@ def get_num_dims(self: M, mode: Literal["HMM", "BN"] = "HMM") -> int: return num @abstractmethod - def set_params(self: M, *args: float, **kwargs: float) -> tuple[float]: + def set_params(self: ModelT, *args: float, **kwargs: float) -> tuple[float]: """Set the parameters of the model. The parameters may be passed as positional or keyword arguments. The positional @@ -128,7 +128,7 @@ def set_params(self: M, *args: float, **kwargs: float) -> tuple[float]: @abstractmethod def load_patient_data( - self: M, + self: ModelT, patient_data: pd.DataFrame, ) -> None: """Load patient data in `LyProX`_ format into the model. 
@@ -139,7 +139,7 @@ def load_patient_data( @abstractmethod def likelihood( - self: M, + self: ModelT, given_params: Iterable[float] | dict[str, float] | None = None, log: bool = True, ) -> float: diff --git a/tests/binary_unilateral_test.py b/tests/binary_unilateral_test.py index 20f8228..f7d2326 100644 --- a/tests/binary_unilateral_test.py +++ b/tests/binary_unilateral_test.py @@ -208,7 +208,7 @@ class PatientDataTestCase( def setUp(self): """Load patient data.""" super().setUp() - warnings.simplefilter("ignore", category=types.InvalidDataWarning) + warnings.simplefilter("ignore", category=types.DataWarning) self.model.replace_all_modalities(fixtures.MODALITIES) self.init_diag_time_dists(early="frozen", late="parametric", foo="frozen") self.model.set_params(**self.create_random_params()) From 3c03a0fffd4c3f927c07a6000a59b27610e36a6d Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Fri, 15 Mar 2024 14:09:21 +0100 Subject: [PATCH 14/20] fix(uni): allow `mapping=None` when loading data --- lymph/models/unilateral.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index 033d215..f8ac429 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -502,7 +502,7 @@ def load_patient_data( self, patient_data: pd.DataFrame, side: str = "ipsi", - mapping: callable | dict[int, Any] = early_late_mapping, + mapping: callable | dict[int, Any] | None = None, ) -> None: """Load patient data in `LyProX`_ format into the model. @@ -522,6 +522,9 @@ def load_patient_data( .. 
_LyProX: https://lyprox.org/ """ + if mapping is None: + mapping = early_late_mapping + # pylint: disable=unnecessary-lambda-assignment patient_data = ( patient_data From 08bef6379df639f98978dfe2e63199019892a2a3 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:34:48 +0100 Subject: [PATCH 15/20] change(type): use only one warning type --- lymph/models/unilateral.py | 6 +++--- lymph/types.py | 14 +------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index f8ac429..0b98c68 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -539,7 +539,7 @@ def load_patient_data( warnings.warn( f"{side}lateral involvement data not found. Skipping " f"modality {modality}.", - category=types.MissingLateralisationWarning, + category=types.DataWarning, ) continue @@ -550,7 +550,7 @@ def load_patient_data( warnings.warn( f"Modality {modality} does not contain involvement data for " f"LNL {lnl}. 
Assuming unknown.", - category=types.MissingLNLWarning, + category=types.DataWarning, ) column = None else: @@ -574,7 +574,7 @@ def load_patient_data( if t_stage not in patient_data[MAP_T_COL].values: warnings.warn( message=f"No data for T-stage {t_stage} found.", - category=types.MissingTStageWarning, + category=types.DataWarning, ) diff --git a/lymph/types.py b/lymph/types.py index 6bd1a53..6e8b3f8 100644 --- a/lymph/types.py +++ b/lymph/types.py @@ -10,19 +10,7 @@ class DataWarning(UserWarning): - """Parent class of all warnings related to potential data issues.""" - - -class MissingLNLWarning(DataWarning): - """Raised when a lymph node level is missing from the data.""" - - -class MissingLateralisationWarning(DataWarning): - """Raised when a lateralisation is missing from the data.""" - - -class MissingTStageWarning(DataWarning): - """Warning that is raised when a defined T-stage is missing from the data.""" + """Warnings related to potential data issues.""" class HasSetParams(Protocol): From ba42a76c3a44e1b85eb62ee953dab559f2453f50 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:00:52 +0100 Subject: [PATCH 16/20] chore: update changelog --- CHANGELOG.md | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 643778f..160d36c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,43 @@ All notable changes to this project will be documented in this file. + + +## [1.0.0] - 2024-03-18 + +### Bug Fixes + +- (**uni**) Catch error when `apply` to empty data. Fixes [#79].\ + For some reason, using `apply` on an empty `DataFrame` has an entirely + different return type than when it is not empty. This caused the issue + [#79] and has now been fixed. +- (**bi**) Data reload loads wrong side.\ + Now the data does not get reloaded anymore, which was actually + unnecessary in the first place. 
+- (**uni**) Return correctly in `get_spread_params`. +- (**mid**) Consume & return params in same order. +- (**uni**) Allow `mapping=None` when loading data. + +### Testing + +- (**uni**) Check if loading empty data works. Related [#79]. +- (**uni**) Make sure likelihood is deterministic. + +### Change + +- ⚠ **BREAKING** (**uni**) Shorten two (unused) method names. +- ⚠ **BREAKING** `helpers` are now `utils`. +- (**type**) Add type definition for graph dict. +- (**diag**) Use [partials] to save parametric dist. + +[partials]: https://docs.python.org/3.10/library/functools.html#functools.partial + +### Merge + +- Branch 'main' into 'dev'. +- Branch '79-loading-an-empty-dataframe-raises-error' into 'dev'. + + ## [1.0.0.rc2] - 2024-03-06 @@ -567,7 +604,8 @@ Almost the entire API has changed. I'd therefore recommend to have a look at the - add pre-commit hook to check commit msg -[Unreleased]: https://github.com/rmnldwg/lymph/compare/1.0.0.rc2...HEAD +[Unreleased]: https://github.com/rmnldwg/lymph/compare/1.0.0...HEAD +[1.0.0]: https://github.com/rmnldwg/lymph/compare/1.0.0.rc2...1.0.0 [1.0.0.rc2]: https://github.com/rmnldwg/lymph/compare/1.0.0.rc1...1.0.0.rc2 [1.0.0.rc1]: https://github.com/rmnldwg/lymph/compare/1.0.0.a6...1.0.0.rc1 [1.0.0.a6]: https://github.com/rmnldwg/lymph/compare/1.0.0.a5...1.0.0.a6 @@ -582,6 +620,7 @@ Almost the entire API has changed. 
I'd therefore recommend to have a look at the [0.4.1]: https://github.com/rmnldwg/lymph/compare/0.4.0...0.4.1 [0.4.0]: https://github.com/rmnldwg/lymph/compare/0.3.10...0.4.0 +[#79]: https://github.com/rmnldwg/lymph/issues/79 [#77]: https://github.com/rmnldwg/lymph/issues/77 [#74]: https://github.com/rmnldwg/lymph/issues/74 [#72]: https://github.com/rmnldwg/lymph/issues/72 From ffb3b9c457e01ff71953321501d391762d7cf6eb Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:01:17 +0100 Subject: [PATCH 17/20] docs: allow nb execution to run longer --- docs/source/conf.py | 1 + docs/source/quickstart_unilateral_old.ipynb | 373 -------------------- 2 files changed, 1 insertion(+), 373 deletions(-) delete mode 100644 docs/source/quickstart_unilateral_old.ipynb diff --git a/docs/source/conf.py b/docs/source/conf.py index c32e97f..02af062 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -52,6 +52,7 @@ # MyST settings myst_enable_extensions = ["colon_fence", "dollarmath"] nb_execution_mode = "auto" +nb_execution_timeout = 120 # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/source/quickstart_unilateral_old.ipynb b/docs/source/quickstart_unilateral_old.ipynb deleted file mode 100644 index 9748219..0000000 --- a/docs/source/quickstart_unilateral_old.ipynb +++ /dev/null @@ -1,373 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Getting started\n", - "\n", - "A lot of people get diagnosed with squamous cell carcinoma in the head & neck region ([HNSCC](https://en.wikipedia.org/wiki/Head_and_neck_cancer)), which frequently metastasizes via the lymphatic system. We set out to develop a methodology to predict the risk of a new patient having metastases in so-called lymph node levels (LNLs), based on their personal diagnose (e.g. 
findings from a CT scan) and information of previously diagnosed and treated patients. And that's exactly what this code enables you to do as well.\n", - "\n", - "As mentioned, this package is meant to be a relatively simple-to-use frontend. The math is done under the hood and one does not need to worry about it a lot. But let's have a quick look at what we're doing here.\n", - "\n", - "We will assume that you have already read the section on how to install the module and followed its instructions.\n", - "\n", - "## Importing\n", - "\n", - "First, let's import the package." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import lymph" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Graph\n", - "\n", - "The model is based on the assumption that one can represent the lymphatic system as a directed graph. The arcs in that graph represent the direction of the lymphatic flow and therefore also the direction of metastatic spread. Hence, the first thing to do is to define a graph that represents the drainage pathways of the lymphatic system aptly.\n", - "\n", - "Here, this is done via a dictionary:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "graph = {\n", - " ('tumor', 'primary') : ['I', 'II', 'III', 'IV'], \n", - " ('lnl' , 'I') : ['II'], \n", - " ('lnl' , 'II') : ['III'], \n", - " ('lnl' , 'III'): ['IV'], \n", - " ('lnl' , 'IV') : []\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Every key in this dictionary is a tuple of the form `({type}, {name})` and represents either a tumor - in which case the `{type}` must be `'tumor'` - or a lymph node level (`{type}` must be `'lnl'`). The value of each of those nodes is then a list of names for nodes it connects to. 
So, for example the primary tumor `('tumor', 'primary')` in the `graph` above has directed arcs to `a` and `b`, while the LNL `c` does not have any outgoing connections.\n", - "\n", - "We can simply create an instance of `System` using only this graph and let it report itself:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unilateral_model = lymph.Unilateral(graph=graph)\n", - "print(unilateral_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The percentages between two nodes represents the probability rate that metastatic spread occurs along it. In the case of the tumor spreading to LNL `a` we call this probability *base probability rate* $\\tilde{b}_a$. For the spread between lymph node levels, we call it *transition probability rate*, e.g. $\\tilde{t}_{ab}$. The difference to the base probability rate is that it only plays a role if the parent LNL is already ivolved with metastases, while the tumor always spreads, of course.\n", - "\n", - "We can change these spread probability rates by setting the attribute `spread_probs` of the `System`. It can be set with an array of these spread sprobability rates." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unilateral_model.spread_probs = [0.05, 0.3, 0.2, 0.15, 0.1, 0.25, 0.2]\n", - "print(unilateral_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Reversely, we can also read them out:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "spread_probabilities = unilateral_model.spread_probs\n", - "print(spread_probabilities)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Diagnostic Modalities\n", - "\n", - "To ultimately compute the likelihoods of observations, we need to fix the sensitivities and specificities of the obtained diagnoses. And since we might have multiple diagnostic modalities available, we need to tell the system which of them comes with which specificity and sensitivity. We do this by creating a dictionary of specificity/sensitivity pairs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mri_and_pet_spsn = {\"MRI\": [0.63, 0.81], \n", - " \"PET\": [0.86, 0.79]}\n", - "# ^ ^\n", - "# specificty sensitivity" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can pass this to the system by setting the `modalities` attribute, which expects a dictionary containing the diagnostic modalities and as a key to it the numbers for specificity & sensitivity." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unilateral_model.modalities = mri_and_pet_spsn" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data / Observations\n", - "\n", - "To compute the likelihood of a set of probability (rates) given a patient cohort we need such a patient cohort, of course. We can provide it to the system in the form of a `pandas` `DataFrame`. 
Here is an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from lyscripts.data.clean import lyprox_to_lymph\n", - "\n", - "dataset_url = \"https://raw.githubusercontent.com/rmnldwg/lydata/main/2021-usz-oropharynx/data.csv\"\n", - "example_cols = [\n", - " (\"info\", \"t_stage\"),\n", - " (\"PET\", \"I\"),\n", - " (\"PET\", \"II\"),\n", - " (\"PET\", \"III\"),\n", - " (\"PET\", \"IV\"),\n", - " (\"MRI\", \"I\"),\n", - " (\"MRI\", \"II\"),\n", - " (\"MRI\", \"III\"),\n", - " (\"MRI\", \"IV\"),\n", - "]\n", - "\n", - "dataset = lyprox_to_lymph(pd.read_csv(dataset_url, header=[0,1,2]))\n", - "dataset[example_cols]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that this data has two header-rows, defining not only the individual column's content, but also to which over-arching category they belong. The \"Info\" category plays a special role here along with its sub-category \"T-stage\". It will later tell the system which time prior to use according to a dictionary of these distributions.\n", - "\n", - "The \"pathology\" section denotes that this dataset contains observations from a pathologic diagnostic modality (neck dissections in this case). How this is termed is irrelevant, as we will be telling the system what to look for. Import is, however, that - if we had multiple diagnostic modalities - they all contain a column for each lymph node level in the system we have set up. Obvioulsy, this dataset here does not match the system set up earlier, so let's fix that." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unilateral_model.modalities = {\"PET\": [0.86, 0.79]}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To feed the dataset into the system, we assign the dataset to the attribute `patient_data`. 
What the system then does here is creating a diagnose matrix for every T-stage in the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unilateral_model.patient_data = dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, we get a warning that we have no distributions defined for marginalizing over diagnose times. This is the next step.\n", - "\n", - "## Distribution over diagnose times\n", - "\n", - "The last ingredient to set up (at least when using the hidden Markov model) would now be the time prior. Since this dataset contains only early T-stage patients the exact shape does not matter too much, as long as it is \"reasonable\". If we also had late T-stage patients in the cohort, we would need to think about how the two time priors relate to each other.\n", - "\n", - "For now we are going to use binomial distributions for this. Their shape makes intuitive sense: Since the time prior $p_T(t)$ is a distribution over the probability of diagnosing a patient after $t$ time steps, given his T-stage $T$ we would expect that a very early detection of the cancer is similarly unlikely as a very late one." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import scipy as sp\n", - "import scipy.stats\n", - "import matplotlib.pyplot as plt\n", - "\n", - "max_t = 10\n", - "time_steps = np.arange(max_t+1)\n", - "p = 0.4\n", - "\n", - "early_prior = sp.stats.binom.pmf(time_steps, max_t, p)\n", - "\n", - "plt.plot(time_steps, early_prior, \"o-\");\n", - "plt.xlabel(\"time step $t$\");\n", - "plt.ylabel(\"probability $p$\");" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unilateral_model.diag_time_dists[\"early\"] = early_prior" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from scipy.special import factorial\n", - "\n", - "def binom_pmf(k: np.ndarray, n: int, p: float):\n", - " \"\"\"Binomial PMF\"\"\"\n", - " if p > 1. or p < 0.:\n", - " raise ValueError(\"Binomial prob must be btw. 0 and 1\")\n", - " q = (1. - p)\n", - " binom_coeff = factorial(n) / (factorial(k) * factorial(n - k))\n", - " return binom_coeff * p**k * q**(n - k)\n", - "\n", - "def parametric_binom_pmf(n: int):\n", - " \"\"\"Return a parametric binomial PMF\"\"\"\n", - " def inner(t, p):\n", - " \"\"\"Parametric binomial PMF\"\"\"\n", - " return binom_pmf(t, n, p)\n", - " return inner" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unilateral_model.diag_time_dists[\"late\"] = parametric_binom_pmf(max_t)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Likelihood\n", - "\n", - "With everything set up like this, we can compute the likelihood of seeing the above dataset given a set of base and transition probability (rates)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_probabilities = np.array([0.02, 0.24, 0.03, 0.2, 0.23, 0.18, 0.18, 0.5])\n", - "\n", - "llh = unilateral_model.likelihood(given_params=test_probabilities, log=True)\n", - "\n", - "print(f\"log-likelihood is {llh:.2f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From here it is up to the user what to do with this quantity. Most *likely* though, one would want to perform MCMC sampling with this.\n", - "\n", - "## Summary\n", - "\n", - "To set up a model for lymphatic metastatic spread, you need to do the following things:\n", - "\n", - "1. Define a graph that connects the lymph node levels via a dictionary\n", - "2. Provide the specificity & sensitivity of the diagnostic modalities to the `modalities` attribute\n", - "3. Assign your correctly formatted pandas `DataFrame` to the attribute `patient_data` of the model\n", - "4. For each T-stage in the data, define a distribution over possible diagnose times\n", - "\n", - "Then, you can use the `log_likelihood` method to compute the log-likelihood of the stored data given an array of parameters - the spread probabilities." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 ('.venv': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": true - }, - "vscode": { - "interpreter": { - "hash": "1b6eded5f386e55fd051b894079e4370359bf13f51a44183870a4399bfd4d593" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From ec63db902f5278108f4fd7546d67c6b88cc0cca2 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:05:45 +0100 Subject: [PATCH 18/20] remove(uni): delete `print_info()` method for now --- lymph/models/unilateral.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index 0b98c68..2d86647 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -106,13 +106,13 @@ def __init__( @classmethod - def binary(cls, graph_dict: dict[tuple[str], set[str]], **kwargs) -> Unilateral: + def binary(cls, graph_dict: types.GraphDictType, **kwargs) -> Unilateral: """Create an instance of the :py:class:`~Unilateral` class with binary LNLs.""" return cls(graph_dict, allowed_states=[0, 1], **kwargs) @classmethod - def trinary(cls, graph_dict: dict[tuple[str], set[str]], **kwargs) -> Unilateral: + def trinary(cls, graph_dict: types.GraphDictType, **kwargs) -> Unilateral: """Create an instance of the :py:class:`~Unilateral` class with trinary LNLs.""" return 
cls(graph_dict, allowed_states=[0, 1, 2], **kwargs) @@ -122,19 +122,6 @@ def __str__(self) -> str: return f"Unilateral with {len(self.graph.tumors)} tumors and {len(self.graph.lnls)} LNLs" - def print_info(self): - """Print detailed information about the instance.""" - num_tumors = len(self.graph.tumors) - num_lnls = len(self.graph.lnls) - string = ( - f"Unilateral lymphatic system with {num_tumors} tumor(s) " - f"and {num_lnls} LNL(s).\n" - + " ".join([f"{e} {e.spread_prob}%" for e in self.graph.tumor_edges]) + "\n" + " ".join([f"{e} {e.spread_prob}%" for e in self.graph.lnl_edges]) - + f"\n the growth probability is: {self.graph.growth_edges[0].spread_prob}" + f" the micro mod is {self.graph.lnl_edges[0].micro_mod}" - ) - print(string) - - @property def is_trinary(self) -> bool: """Return whether the model is trinary.""" From a379316222b08006929aef5aa56cdcf6dc048509 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:14:25 +0100 Subject: [PATCH 19/20] refactor(type): use type alias for params --- lymph/diagnose_times.py | 5 +++-- lymph/graph.py | 5 ++--- lymph/models/bilateral.py | 14 +++++++------- lymph/models/midline.py | 10 +++++----- lymph/models/unilateral.py | 14 +++++++------- lymph/types.py | 11 ++++++++--- lymph/utils.py | 14 +++++++------- 7 files changed, 39 insertions(+), 34 deletions(-) diff --git a/lymph/diagnose_times.py b/lymph/diagnose_times.py index c47ee9f..168a540 100644 --- a/lymph/diagnose_times.py +++ b/lymph/diagnose_times.py @@ -22,6 +22,7 @@ import numpy as np +from lymph import types from lymph.utils import flatten, popfirst, unflatten_and_split logger = logging.getLogger(__name__) @@ -209,7 +210,7 @@ def get_params( self, as_dict: bool = True, **_kwargs, - ) -> float | Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """If updateable, return the dist's ``param`` value or all params in a dict. 
See Also: @@ -473,7 +474,7 @@ def get_distribution_params( self: DC, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Return the parameters of all distributions.""" params = {} diff --git a/lymph/graph.py b/lymph/graph.py index 342b9d6..a4188bc 100644 --- a/lymph/graph.py +++ b/lymph/graph.py @@ -14,7 +14,6 @@ import base64 import warnings from itertools import product -from typing import Iterable import numpy as np @@ -403,7 +402,7 @@ def get_params( self, as_dict: bool = True, **_kwargs, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Return the value of the parameter ``param`` or all params in a dict. See Also: @@ -791,7 +790,7 @@ def get_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Return the parameters of the edges in the graph. If ``as_dict`` is ``False``, return an iterable of all parameter values. If diff --git a/lymph/models/bilateral.py b/lymph/models/bilateral.py index 7496f59..b4988eb 100644 --- a/lymph/models/bilateral.py +++ b/lymph/models/bilateral.py @@ -166,7 +166,7 @@ def get_tumor_spread_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Return the parameters of the model's spread from tumor to LNLs. If the attribute dictionary :py:attr:`.is_symmetric` stores the key-value pair @@ -198,7 +198,7 @@ def get_lnl_spread_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Return the parameters of the model's spread from LNLs to tumor. Similarily to the :py:meth:`.get_tumor_spread_params` method, this returns only @@ -230,7 +230,7 @@ def get_spread_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Return the parameters of the model's spread edges. 
Depending on the symmetries (i.e. the ``is_symmetric`` attribute), this returns @@ -285,7 +285,7 @@ def get_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Return the parameters of the model. It returns the combination of the call to the :py:meth:`.Unilateral.get_params` @@ -522,7 +522,7 @@ def _hmm_likelihood(self, log: bool = True, t_stage: str | None = None) -> float def likelihood( self, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: types.ParamsType | None = None, log: bool = True, mode: Literal["HMM", "BN"] = "HMM", for_t_stage: str | None = None, @@ -566,7 +566,7 @@ def likelihood( def comp_posterior_joint_state_dist( self, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: types.ParamsType | None = None, given_diagnoses: dict[str, types.DiagnoseType] | None = None, t_stage: str | int = "early", mode: Literal["HMM", "BN"] = "HMM", @@ -620,7 +620,7 @@ def comp_posterior_joint_state_dist( def risk( self, involvement: types.PatternType | None = None, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: types.ParamsType | None = None, given_diagnoses: dict[str, types.DiagnoseType] | None = None, t_stage: str = "early", mode: Literal["HMM", "BN"] = "HMM", diff --git a/lymph/models/midline.py b/lymph/models/midline.py index 4d01fc3..6daf805 100644 --- a/lymph/models/midline.py +++ b/lymph/models/midline.py @@ -347,7 +347,7 @@ def get_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Return all the parameters of the model. 
This includes the spread parameters from the call to :py:meth:`get_spread_params` @@ -366,7 +366,7 @@ def get_params( def set_tumor_spread_params( self, *args: float, **kwargs: float, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Set the spread parameters of the midline model. In analogy to the :py:meth:`get_tumor_spread_params` method, this method sets @@ -467,7 +467,7 @@ def set_spread_params(self, *args: float, **kwargs: float) -> Iterable[float]: def set_params( self, *args: float, **kwargs: float, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Set all parameters of the model. Combines the calls to :py:meth:`.set_spread_params` and @@ -630,7 +630,7 @@ def _hmm_likelihood(self, log: bool = True, for_t_stage: str | None = None) -> f def likelihood( self, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: types.ParamsType | None = None, log: bool = True, mode: Literal["HMM", "BN"] = "HMM", for_t_stage: str | None = None, @@ -673,7 +673,7 @@ def likelihood( def risk( self, involvement: PatternType | None = None, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: types.ParamsType | None = None, given_diagnoses: dict[str, DiagnoseType] | None = None, t_stage: str = "early", midline_extension: bool = False, diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index 2d86647..b17e4c2 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -165,7 +165,7 @@ def get_tumor_spread_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Get the parameters of the tumor spread edges.""" return get_params_from(self.graph.tumor_edges, as_dict, as_flat) @@ -174,7 +174,7 @@ def get_lnl_spread_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Get the parameters of the LNL spread edges. 
In the trinary case, this includes the growth parameters as well as the @@ -187,7 +187,7 @@ def get_spread_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Get the parameters of the spread edges.""" params = self.get_tumor_spread_params(as_flat=as_flat) params.update(self.get_lnl_spread_params(as_flat=as_flat)) @@ -202,7 +202,7 @@ def get_params( self, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> types.ParamsType: """Get the parameters of the model. If ``as_dict`` is ``True``, the parameters are returned as a dictionary. If @@ -711,7 +711,7 @@ def _hmm_likelihood(self, log: bool = True, t_stage: str | None = None) -> float def likelihood( self, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: types.ParamsType | None = None, log: bool = True, mode: Literal["HMM", "BN"] = "HMM", for_t_stage: str | None = None, @@ -767,7 +767,7 @@ def compute_encoding( def posterior_state_dist( self, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: types.ParamsType | None = None, given_diagnoses: types.DiagnoseType | None = None, t_stage: str | int = "early", mode: Literal["HMM", "BN"] = "HMM", @@ -825,7 +825,7 @@ def posterior_state_dist( def risk( self, involvement: types.PatternType | None = None, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: types.ParamsType | None = None, given_diagnoses: dict[str, types.PatternType] | None = None, t_stage: str = "early", mode: Literal["HMM", "BN"] = "HMM", diff --git a/lymph/types.py b/lymph/types.py index 6e8b3f8..e07392a 100644 --- a/lymph/types.py +++ b/lymph/types.py @@ -42,6 +42,11 @@ def get_params( ... } """ +ParamsType = Iterable[float] | dict[str, float] +"""Type alias for how parameters are passed around. + +This is e.g. the type that the :py:meth:`Model.get_params` method returns. 
+""" PatternType = dict[str, bool | str | NAType | None] """Type alias for an involvement pattern. @@ -79,7 +84,7 @@ def get_params( self: ModelT, as_dict: bool = True, as_flat: bool = True, - ) -> Iterable[float] | dict[str, float]: + ) -> ParamsType: """Return the parameters of the model. The parameters are returned as a dictionary if ``as_dict`` is True, and as @@ -128,7 +133,7 @@ def load_patient_data( @abstractmethod def likelihood( self: ModelT, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: ParamsType | None = None, log: bool = True, ) -> float: """Return the likelihood of the model given the parameters. @@ -143,7 +148,7 @@ def likelihood( def risk( self, involvement: PatternType | None = None, - given_params: Iterable[float] | dict[str, float] | None = None, + given_params: ParamsType | None = None, given_diagnoses: dict[str, PatternType] | None = None, ) -> float | np.ndarray: """Return the risk of ``involvement``, given the parameters and diagnoses.""" diff --git a/lymph/utils.py b/lymph/utils.py index 29ae9f5..a0ebcf6 100644 --- a/lymph/utils.py +++ b/lymph/utils.py @@ -3,11 +3,11 @@ """ import logging from functools import cached_property, lru_cache, wraps -from typing import Any, Iterable, Sequence +from typing import Any, Sequence import numpy as np -from lymph.types import HasGetParams, HasSetParams +from lymph import types logger = logging.getLogger(__name__) @@ -316,10 +316,10 @@ def unflatten_and_split( def get_params_from( - objects: dict[str, HasGetParams], + objects: dict[str, types.HasGetParams], as_dict: bool = True, as_flat: bool = True, -) -> Iterable[float] | dict[str, float]: +) -> types.ParamsType: """Get the parameters from each ``get_params()`` method of the ``objects``.""" params = {} for key, obj in objects.items(): @@ -332,7 +332,7 @@ def get_params_from( def set_params_for( - objects: dict[str, HasSetParams], + objects: dict[str, types.HasSetParams], *args: float, **kwargs: float, ) -> 
tuple[float]: @@ -348,8 +348,8 @@ def set_params_for( def synchronize_params( - get_from: dict[str, HasGetParams], - set_to: dict[str, HasSetParams], + get_from: dict[str, types.HasGetParams], + set_to: dict[str, types.HasSetParams], ) -> None: """Get the parameters from one object and set them to another.""" for key, obj in set_to.items(): From 1b86d01bae3ee33c0b0503d768a3ed05c8ed2870 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:26:12 +0100 Subject: [PATCH 20/20] docs: fix typos in docstrings --- lymph/models/bilateral.py | 10 +++++----- lymph/models/midline.py | 3 ++- lymph/models/unilateral.py | 8 ++++---- tests/binary_bilateral_test.py | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/lymph/models/bilateral.py b/lymph/models/bilateral.py index b4988eb..b642689 100644 --- a/lymph/models/bilateral.py +++ b/lymph/models/bilateral.py @@ -243,7 +243,7 @@ def get_spread_params( symmetric, the leading ``ipsi_`` or ``contra_`` is omitted, since it's valid for both sides. - This is consistent with how the :py:meth:`~lymph.models.Bilteral.set_params` + This is consistent with how the :py:meth:`.set_params` method expects the keyword arguments in case of the symmetry configurations. >>> model = Bilateral(graph_dict={ @@ -564,7 +564,7 @@ def likelihood( raise ValueError("Invalid mode. Must be either 'HMM' or 'BN'.") - def comp_posterior_joint_state_dist( + def posterior_joint_state_dist( self, given_params: types.ParamsType | None = None, given_diagnoses: dict[str, types.DiagnoseType] | None = None, @@ -573,7 +573,7 @@ def comp_posterior_joint_state_dist( ) -> np.ndarray: """Compute joint post. dist. over ipsi & contra states, ``given_diagnoses``. - The ``given_diagnoses`` is a dictionary storing a :py:class:`types.DiagnoseType` + The ``given_diagnoses`` is a dictionary storing a :py:obj:`.types.DiagnoseType` for the ``"ipsi"`` and ``"contra"`` side of the neck. 
Essentially, this is the risk for any possible combination of ipsi- and @@ -630,7 +630,7 @@ def risk( The parameters can be set via the ``given_params`` and ``given_params``, both of which are passed to the :py:meth:`.set_params` method. The ``given_diagnoses`` must be a dictionary mapping the side of the neck to a - :py:class:`.types.DiagnoseType`. + :py:obj:`.types.DiagnoseType`. Note: The computation is much faster if no parameters are given, since then the @@ -645,7 +645,7 @@ def risk( only marginalizes over the states that match the involvement pattern. """ # TODO: test this method - posterior_state_probs = self.comp_posterior_joint_state_dist( + posterior_state_probs = self.posterior_joint_state_dist( given_params=given_params, given_diagnoses=given_diagnoses, t_stage=t_stage, diff --git a/lymph/models/midline.py b/lymph/models/midline.py index 6daf805..e942487 100644 --- a/lymph/models/midline.py +++ b/lymph/models/midline.py @@ -351,7 +351,8 @@ def get_params( """Return all the parameters of the model. This includes the spread parameters from the call to :py:meth:`get_spread_params` - and the distribution parameters from the call to :py:meth:`get_distribution_params`. + and the distribution parameters from the call to + :py:meth:`~.diagnose_times.Composite.get_distribution_params`. """ params = {} params["midext_prob"] = self.midext_prob diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index b17e4c2..e0ea4d6 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -638,7 +638,7 @@ def state_dist( which is essentially a marginalization of the evolution over the possible states as computed by :py:meth:`.state_dist_evo` with the distribution over diagnose times for the given T-stage from the dictionary returned by - :py:meth:`.get_all_dsitributions`. + :py:meth:`.get_all_distributions`. Or, when ``mode`` is set to ``"BN"``, compute the distribution over states for the Bayesian network. 
In that case, the ``t_stage`` parameter is ignored. @@ -669,12 +669,12 @@ def obs_dist( Returns an array of probabilities for each possible complete observation. This entails multiplying the distribution over states as returned by the - :py:meth:`.state_dist` method with the :py:attr:`.observation_matrix`. + :py:meth:`.state_dist` method with the :py:meth:`.observation_matrix`. Note that since the :py:attr:`.observation_matrix` can become very large, this method is not very efficient for inference. Instead, we compute the - :py:attr:`.diagnose_matrices` from the :py:attr:`.observation_matrix` and - the :py:attr:`.data_matrices` and use these to compute the likelihood. + :py:meth:`.diagnose_matrix` from the :py:meth:`.observation_matrix` and + the :py:meth:`.data_matrix` and use these to compute the likelihood. """ state_dist = self.state_dist(t_stage=t_stage, mode=mode) return state_dist @ self.observation_matrix() diff --git a/tests/binary_bilateral_test.py b/tests/binary_bilateral_test.py index 9afc18d..c90239e 100644 --- a/tests/binary_bilateral_test.py +++ b/tests/binary_bilateral_test.py @@ -318,7 +318,7 @@ def test_posterior_state_dist(self): random_parameters = self.create_random_params() random_diagnoses = self.create_random_diagnoses() - posterior = self.model.comp_posterior_joint_state_dist( + posterior = self.model.posterior_joint_state_dist( given_params=random_parameters, given_diagnoses=random_diagnoses, )