Commit 3fde74e

seems to work!
gykovacs committed Oct 11, 2024
1 parent 7d760dc commit 3fde74e
Showing 27 changed files with 4,843 additions and 3,562 deletions.
57 changes: 56 additions & 1 deletion mlscorecheck/auc/_acc_aggregated.py
@@ -11,14 +11,15 @@

from ._utils import prepare_intervals, translate_folding

from ._acc_single import acc_min, acc_max
from ._acc_single import acc_min, acc_max, acc_onmax
from ._auc_aggregated import R, check_cvxopt

__all__ = [
    "acc_min_aggregated",
    "acc_max_aggregated",
    "acc_rmin_aggregated",
    "acc_rmax_aggregated",
    "acc_onmax_aggregated",
    "acc_from_aggregated",
    "acc_lower_from_aggregated",
    "acc_upper_from_aggregated",
@@ -441,6 +442,60 @@ def acc_rmax_aggregated(
    return acc_rmax_solve(ps, ns, auc, return_solutions)


def acc_onmax_aggregated(
    auc: float, ps: np.array, ns: np.array, return_solutions: bool = False
):
    """
    The maximum mean accuracy based on one-node curves

    Args:
        auc (float): the average AUC
        ps (np.array): the numbers of positive samples
        ns (np.array): the numbers of negative samples
        return_solutions (bool): whether to return the solutions to the
                                underlying optimization problem

    Returns:
        float | (float, (np.array, np.array, np.array, np.array, np.array)):
        the mean accuracy, or additionally the fold-level AUC values, the
        vectors of ps and ns, and the lower and upper bounds

    Raises:
        ValueError: when auc < 0.5 or no optimal solution is found
    """

    if auc < 0.5:
        raise ValueError("auc too small (acc_onmax_aggregated)")

    ps = np.array(ps)
    ns = np.array(ns)

    k = len(ps)

    mins = np.array([min(p, n) for p, n in zip(ps, ns)])

    weights = mins / (ps + ns)

    lower_bounds = np.repeat(0.5, k)
    upper_bounds = np.repeat(1.0, k)

    sorting = np.argsort(weights)[::-1]

    ps = ps[sorting]
    ns = ns[sorting]

    aucs = R(auc, k, lower_bounds, upper_bounds)

    accs = np.array([acc_onmax(auc, p, n) for auc, p, n in zip(aucs, ps, ns)])

    results = float(np.mean(accs))

    if return_solutions:
        results = results, (aucs, ps, ns, lower_bounds, upper_bounds)

    return results


def acc_lower_from_aggregated(
    *,
    scores: dict,
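
For orientation, a minimal sketch of how the new aggregated helper could be called (the fold sizes and the AUC value below are made up for illustration; the import path simply mirrors the file shown above):

from mlscorecheck.auc._acc_aggregated import acc_onmax_aggregated

# two hypothetical folds with 40/60 and 60/40 positive/negative samples
ps = [40, 60]
ns = [60, 40]

# maximum mean accuracy achievable at an average AUC of 0.8 under the one-node curve model
acc = acc_onmax_aggregated(auc=0.8, ps=ps, ns=ns)

# with return_solutions=True the fold-level AUCs distributed by R, the sorted ps and ns,
# and the lower/upper bounds are returned alongside the mean accuracy
acc, (aucs, ps_sorted, ns_sorted, lbs, ubs) = acc_onmax_aggregated(
    auc=0.8, ps=ps, ns=ns, return_solutions=True
)
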
102 changes: 98 additions & 4 deletions mlscorecheck/auc/_acc_single.py
@@ -17,8 +17,13 @@
    "acc_min",
    "acc_rmin",
    "acc_max",
    "acc_max_grad",
    "acc_rmax",
    "acc_rmax_grad",
    "acc_onmax",
    "acc_onmax_grad",
    "macc_min",
    "macc_min_grad",
]


@@ -74,6 +79,21 @@ def acc_max(auc, p, n):
    return (auc * min(p, n) + max(p, n)) / (p + n)


def acc_max_grad(auc, p, n):
    """
    The gradient (with respect to AUC) of the maximum accuracy

    Args:
        auc (float): upper bound on AUC
        p (int): the number of positive test samples
        n (int): the number of negative test samples

    Returns:
        float: the gradient
    """
    return min(p, n) / (p + n)


def acc_rmax(auc, p, n):
    """
    The maximum accuracy on a regulated minimum curve given an AUC
@@ -94,6 +114,58 @@ def acc_rmax(auc, p, n):
    return (max(p, n) + min(p, n) * np.sqrt(2 * (auc - 0.5))) / (p + n)


def acc_rmax_grad(auc, p, n):
    """
    The gradient (with respect to AUC) of the regulated maximum accuracy

    Args:
        auc (float): upper bound on AUC
        p (int): the number of positive test samples
        n (int): the number of negative test samples

    Returns:
        float: the gradient
    """
    return np.sqrt(2) * min(p, n) / 2 / (np.sqrt(auc - 0.5) * (p + n))


def acc_onmax(auc, p, n):
    """
    The maximum accuracy on a one-node curve given an AUC

    Args:
        auc (float): upper bound on AUC
        p (int): the number of positive test samples
        n (int): the number of negative test samples

    Returns:
        float: the accuracy

    Raises:
        ValueError: when auc < 0.5
    """

    if auc < 0.5:
        raise ValueError("auc too small for acc_onmax")

    return (2 * auc * min(p, n) + max(p, n) - min(p, n)) / (p + n)


def acc_onmax_grad(auc, p, n):
    """
    The gradient (with respect to AUC) of the one-node maximum accuracy

    Args:
        auc (float): upper bound on AUC
        p (int): the number of positive test samples
        n (int): the number of negative test samples

    Returns:
        float: the gradient
    """
    return 2 * min(p, n) / (p + n)
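
Since acc_onmax is linear in the AUC, its gradient is constant; a small self-check with made-up numbers (the module path is the file shown above) confirms the closed form against a finite difference:

from mlscorecheck.auc._acc_single import acc_onmax, acc_onmax_grad

p, n, auc, h = 30, 70, 0.85, 1e-6

# finite-difference slope of the one-node maximum accuracy
numeric = (acc_onmax(auc + h, p, n) - acc_onmax(auc - h, p, n)) / (2 * h)
assert abs(numeric - acc_onmax_grad(auc, p, n)) < 1e-6

# boundary values: at auc=0.5 the bound degenerates to max(p, n)/(p + n), at auc=1.0 it reaches 1.0
assert acc_onmax(0.5, p, n) == max(p, n) / (p + n)
assert acc_onmax(1.0, p, n) == 1.0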


def macc_min(auc, p, n):
    """
    The minimum of the maximum accuracy
@@ -112,6 +184,24 @@ def macc_min(auc, p, n):
    return max(p, n) / (p + n)


def macc_min_grad(auc, p, n):
    """
    The gradient (with respect to AUC) of the minimum of the maximum accuracy

    Args:
        auc (float): lower bound on AUC
        p (int): the number of positive test samples
        n (int): the number of negative test samples

    Returns:
        float: the gradient
    """

    if auc >= 1 - min(p, n) / (2 * max(p, n)):
        return n * p / ((n + p) * np.sqrt(-2 * auc * n * p + 2 * n * p))

    return 0.0


def acc_lower_from(*, scores: dict, eps: float, p: int, n: int, lower: str = "min"):
    """
    This function applies the lower bound estimation schemes to estimate
@@ -157,7 +247,7 @@ def acc_upper_from(*, scores: dict, eps: float, p: int, n: int, upper: str = "ma
        eps (float): the numerical uncertainty
        p (int): the number of positive samples
        n (int): the number of negative samples
        upper (str): 'max'/'rmax' - the type of upper bound
        upper (str): 'max'/'rmax'/'onmax' - the type of upper bound

    Returns:
        float: the upper bound for the accuracy
@@ -176,6 +266,8 @@ def acc_upper_from(*, scores: dict, eps: float, p: int, n: int, upper: str = "ma
        upper0 = acc_max(intervals["auc"][1], p, n)
    elif upper == "rmax":
        upper0 = acc_rmax(intervals["auc"][1], p, n)
    elif upper == "onmax":
        upper0 = acc_onmax(intervals["auc"][1], p, n)
    else:
        raise ValueError(f"unsupported upper bound {upper}")

@@ -193,7 +285,7 @@ def acc_from(
        eps (float): the numerical uncertainty
        p (int): the number of positive samples
        n (int): the number of negative samples
        lower (str): 'min'/'rmin' - the type of lower bound
        upper (str): 'max'/'rmax'/'onmax' - the type of upper bound

    Returns:
@@ -253,7 +345,7 @@ def max_acc_upper_from(*, scores: dict, eps: float, p: int, n: int, upper: str =
        eps (float): the numerical uncertainty
        p (int): the number of positive samples
        n (int): the number of negative samples
        upper (str): 'max'/'rmax' - the type of upper bound
        upper (str): 'max'/'rmax'/'onmax' - the type of upper bound
Returns:
float: the upper bound for the maximum accuracy
Expand All @@ -272,6 +364,8 @@ def max_acc_upper_from(*, scores: dict, eps: float, p: int, n: int, upper: str =
upper0 = acc_max(intervals["auc"][1], p, n)
elif upper == "rmax":
upper0 = acc_rmax(intervals["auc"][1], p, n)
elif upper == "onmax":
upper0 = acc_onmax(intervals["auc"][1], p, n)
else:
raise ValueError(f"unsupported upper bound {upper}")

@@ -291,7 +385,7 @@ def max_acc_from(
p (int): the number of positive samples
n (int): the number of negative samples
lower (str): 'min'
upper (str): 'max'/'rmax' - the type of upper bound
upper (str): 'max'/'rmax'/'onmax' - the type of upper bound
Returns:
tuple(float, float): the interval for the maximum accuracy
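
The new 'onmax' option can be selected through the keyword interface in the same way as 'max' and 'rmax'; a sketch, assuming the scores dict only needs an 'auc' entry here (as the intervals["auc"] lookups above suggest) and using illustrative numbers:

from mlscorecheck.auc._acc_single import acc_upper_from

# upper bound on accuracy from an AUC of 0.9 +/- 1e-4, using the one-node curve model
upper = acc_upper_from(scores={"auc": 0.9}, eps=1e-4, p=50, n=150, upper="onmax")
# internally this evaluates acc_onmax(intervals["auc"][1], p, n), i.e. the bound
# at the upper end of the AUC interval
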
39 changes: 36 additions & 3 deletions mlscorecheck/auc/_auc_aggregated.py
@@ -22,6 +22,7 @@
__all__ = [
    "auc_min_aggregated",
    "auc_max_aggregated",
    "auc_onmin_aggregated",
    "auc_rmin_aggregated",
    "auc_maxa_evaluate",
    "auc_maxa_solve",
@@ -371,6 +372,36 @@ def auc_max_aggregated(
    return results


def auc_onmin_aggregated(
    fpr: float, tpr: float, k: int, return_solutions: bool = False
) -> float:
    """
    The average area under the one-node (onmin) curves at the average fpr, tpr

    Args:
        fpr (float): upper bound on the average false positive rate
        tpr (float): lower bound on the average true positive rate
        k (int): the number of test sets/folds
        return_solutions (bool): whether to return the solutions for the
                                underlying curves

    Returns:
        float | (float, (np.array, np.array, np.array, np.array)): the area, or
        the area together with the fpr and tpr solutions and the lower and
        upper bounds
    """

    results = float((1 - fpr + tpr) / 2.0)

    if return_solutions:
        results = results, (
            np.repeat(fpr, k),
            np.repeat(tpr, k),
            np.repeat(0.0, k),
            np.repeat(1.0, k),
        )

    return results
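
The onmin aggregated bound is simply the single-point ROC bound (1 - fpr + tpr) / 2 evaluated at the average rates; a tiny illustration with made-up numbers:

from mlscorecheck.auc._auc_aggregated import auc_onmin_aggregated

auc_lb = auc_onmin_aggregated(fpr=0.2, tpr=0.7, k=5)  # (1 - 0.2 + 0.7) / 2 = 0.75

# each of the k curves is assigned the same (fpr, tpr) point and the trivial [0, 1] bounds
auc_lb, (fprs, tprs, lbs, ubs) = auc_onmin_aggregated(
    fpr=0.2, tpr=0.7, k=5, return_solutions=True
)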


def auc_rmin_aggregated(
    fpr: float, tpr: float, k: int, return_solutions: bool = False
) -> float:
@@ -745,7 +776,7 @@ def check_applicability_lower_aggregated(intervals: dict, lower: str, ps: int, n
        ValueError: when the methods are not applicable with the
            specified scores
    """
    if lower in ["min", "rmin"]:
    if lower in ["min", "rmin", "onmin"]:
        if "fpr" not in intervals or "tpr" not in intervals:
            raise ValueError("fpr, tpr or their complements must be specified")
    if lower in ["amin", "armin"]:
@@ -805,7 +836,7 @@ def auc_lower_from_aggregated(
            ps and ns, contains the keys 'p', 'n', 'n_repeats',
            'n_folds', 'folding' (currently 'stratified_sklearn'
            supported for 'folding')
        lower (str): ('min'/'rmin'/'amin'/'armin') - the type of
        lower (str): ('min'/'rmin'/'amin'/'armin'/'onmin') - the type of
            estimation for the lower bound

    Returns:
@@ -833,6 +864,8 @@

    if lower == "min":
        lower0 = auc_min_aggregated(intervals["fpr"][1], intervals["tpr"][0], k)
    elif lower == "onmin":
        lower0 = auc_onmin_aggregated(intervals["fpr"][1], intervals["tpr"][0], k)
    elif lower == "rmin":
        lower0 = auc_rmin_aggregated(intervals["fpr"][0], intervals["tpr"][1], k)
    elif lower == "amin":
@@ -931,7 +964,7 @@ def auc_from_aggregated(
            ps and ns, contains the keys 'p', 'n', 'n_repeats',
            'n_folds', 'folding' (currently 'stratified_sklearn'
            supported for 'folding')
        lower (str): ('min'/'rmin'/'amin'/'armin') - the type of
        lower (str): ('min'/'rmin'/'amin'/'armin'/'onmin') - the type of
            estimation for the lower bound
        upper (str): ('max'/'maxa'/'amax') - the type of estimation for
            the upper bound
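
In isolation, the new 'onmin' branch of auc_lower_from_aggregated reduces to the call below (the interval values are hypothetical; fpr is taken at its upper end and tpr at its lower end, mirroring the branch shown above):

from mlscorecheck.auc._auc_aggregated import auc_onmin_aggregated

intervals = {"fpr": (0.18, 0.22), "tpr": (0.68, 0.72)}  # hypothetical score intervals
k = 5

lower0 = auc_onmin_aggregated(intervals["fpr"][1], intervals["tpr"][0], k)
# == (1 - 0.22 + 0.68) / 2 = 0.73, a conservative lower bound on the mean AUC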