New reports improvements. (#1414)
Liraim authored Jan 13, 2025
1 parent d7c11a1 commit a0b17ef
Showing 6 changed files with 121 additions and 25 deletions.
31 changes: 19 additions & 12 deletions src/evidently/calculations/classification_performance.py
@@ -101,9 +101,12 @@ def get_prediction_data(
target = data_columns.utility_columns.target

if isinstance(prediction, list) and len(prediction) > 2:
pred_data = data[prediction].idxmax(axis=1)
if is_integer_dtype(data[target]):
pred_data = pred_data.apply(lambda x: int(x) if x is not None else None)
# list of columns with prediction probas, should be same as target labels
return PredictionData(
predictions=data[prediction].idxmax(axis=1),
predictions=pred_data,
prediction_probas=data[prediction],
labels=prediction,
)
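
The added lines above keep integer targets comparable with the labels produced by idxmax. A minimal standalone sketch of that behavior, using made-up column names rather than the library's API:

```python
import pandas as pd
from pandas.api.types import is_integer_dtype

# Hypothetical frame: probability columns are named after integer class labels,
# but as strings, while the target column itself is integer-typed.
data = pd.DataFrame(
    {
        "target": [0, 2, 1],
        "0": [0.7, 0.1, 0.2],
        "1": [0.2, 0.2, 0.6],
        "2": [0.1, 0.7, 0.2],
    }
)
proba_columns = ["0", "1", "2"]

# idxmax(axis=1) returns the *column name* with the highest probability per row.
pred_data = data[proba_columns].idxmax(axis=1)

# Cast the names back to int so predictions compare equal to the integer target.
if is_integer_dtype(data["target"]):
    pred_data = pred_data.apply(lambda x: int(x) if x is not None else None)

print(pred_data.tolist())  # [0, 2, 1]
```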
@@ -154,25 +157,29 @@ def get_prediction_data(
and is_float_dtype(data[prediction])
):
pos_label = _check_pos_labels(pos_label, labels)
if prediction not in labels:
raise ValueError(
"No prediction for the target labels were found. "
"Consider to rename columns with the prediction to match target labels."
)

neg_label = None
if prediction in labels and pos_label != prediction:
neg_label = prediction
# get negative label for binary classification
labels = pd.Series(labels)
neg_label = labels[labels != pos_label].iloc[0]
if pos_label == prediction:
pos_preds = data[prediction]

else:
if neg_label is None:
for label in labels:
if label != pos_label:
neg_label = label
if neg_label is None:
raise ValueError("Failed to determine negative label")
if prediction in labels and neg_label == prediction:
neg_preds = data[prediction]
pos_preds = data[prediction].apply(lambda x: 1.0 - x)
else:
pos_preds = data[prediction]
neg_preds = data[prediction].apply(lambda x: 1.0 - x)

prediction_probas = pd.DataFrame.from_dict(
{
pos_label: pos_preds,
neg_label: pos_preds.apply(lambda x: 1.0 - x),
neg_label: neg_preds,
}
)
predictions = threshold_probability_labels(prediction_probas, pos_label, neg_label, threshold)
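
A rough, self-contained reading of the reworked binary-probability branch: the negative label is simply the other entry of the label list, and whichever class the single float column does not describe receives the complement probability. The helper and branch layout below are illustrative, not the exact library code:

```python
import pandas as pd

labels = pd.Series(["a", "b"])  # hypothetical binary label list
pos_label = "a"
neg_label = labels[labels != pos_label].iloc[0]  # -> "b", as in the hunk above

def binary_probas(prob: pd.Series, column_label: str) -> pd.DataFrame:
    """Build a {pos_label, neg_label} probability frame from one float column."""
    if column_label == pos_label:
        pos_preds, neg_preds = prob, prob.apply(lambda x: 1.0 - x)
    else:
        neg_preds, pos_preds = prob, prob.apply(lambda x: 1.0 - x)
    return pd.DataFrame({pos_label: pos_preds, neg_label: neg_preds})

# The raw column holds P("b"), so "a" gets the complement.
print(binary_probas(pd.Series([0.1, 0.8, 0.2]), column_label="b"))
```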
32 changes: 30 additions & 2 deletions src/evidently/future/metric_types.py
@@ -70,6 +70,30 @@ def widget(self, value: List[BaseWidgetInfo]):
def tests(self) -> Dict["BoundTest", "MetricTestResult"]:
return self._tests or {}

def to_dict(self):
config = self._metric.metric.dict() # type: ignore[attr-defined]
config_items = []
type = None
for field, value in config.items():
if field == "type":
type = value.split(":")[-1]
continue
elif value is None:
continue
elif isinstance(value, list):
if len(value) > 0:
config_items.append(f"{field}={','.join(str(x) for x in value)}")
continue
elif isinstance(value, dict):
continue
else:
config_items.append(f"{field}={str(value)}")
return {
"id": self._metric.id,
"metric_id": f"{type}({','.join(config_items)})",
"value": self.dict(),
}

@abc.abstractmethod
def dict(self) -> object:
raise NotImplementedError()
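
The new to_dict() renders the metric configuration into a compact metric_id string. A standalone sketch of that rendering, with a made-up config dict and an assumed "evidently:metric_v2:" style type prefix:

```python
def render_metric_id(config: dict) -> str:
    """Turn a metric config into a string like 'Accuracy(probas_threshold=0.5,labels=a,b)'."""
    metric_type = None
    items = []
    for field, value in config.items():
        if field == "type":
            metric_type = value.split(":")[-1]  # keep only the short class name
        elif value is None or isinstance(value, dict):
            continue  # empty and nested values are skipped
        elif isinstance(value, list):
            if value:
                items.append(f"{field}={','.join(str(x) for x in value)}")
        else:
            items.append(f"{field}={value}")
    return f"{metric_type}({','.join(items)})"

print(render_metric_id({"type": "evidently:metric_v2:Accuracy", "probas_threshold": 0.5, "labels": ["a", "b"]}))
# -> Accuracy(probas_threshold=0.5,labels=a,b)
```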
@@ -228,12 +252,16 @@ def get_default_render(title: str, result: TResult) -> List[BaseWidgetInfo]:
counter(
title=title,
size=WidgetSize.FULL,
counters=[CounterData(label="", value=str(result.value))],
counters=[CounterData(label="", value=f"{result.value:0.3f}")],
),
]
if isinstance(result, ByLabelValue):
return [
table_data(title=title, column_names=["Label", "Value"], data=[(k, v) for k, v in result.values.items()])
table_data(
title=title,
column_names=["Label", "Value"],
data=[(k, f"{v:0.3f}") for k, v in result.values.items()],
)
]
if isinstance(result, CountValue):
return [
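
The render change replaces raw str() output with fixed three-decimal formatting, e.g.:

```python
value = 0.8571428571428571
print(str(value))       # 0.8571428571428571
print(f"{value:0.3f}")  # 0.857
```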
18 changes: 14 additions & 4 deletions src/evidently/future/metrics/classification.py
@@ -13,6 +13,7 @@
from evidently.future.metric_types import SingleValueMetric
from evidently.future.metrics._legacy import LegacyMetricCalculation
from evidently.future.report import Context
from evidently.metric_results import Label
from evidently.metrics import ClassificationDummyMetric
from evidently.metrics import ClassificationQualityByClass as _ClassificationQualityByClass
from evidently.metrics.classification_performance.classification_dummy_metric import ClassificationDummyMetricResults
@@ -70,6 +71,15 @@ def get_tests(self, value: ByLabelValue) -> Generator[MetricTestResult, None, None]:
for test in tests:
yield test.to_test()(self, label_value)

def _relabel(self, context: "Context", label: Label):
classification = context.data_definition.get_classification("default")
if classification is None:
return label
labels = classification.labels
if labels is not None:
return labels[label]
return label


class F1ByLabel(ClassificationQualityByLabel):
pass
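
A simplified illustration of what _relabel enables: mapping raw class ids from the legacy per-class result onto human-readable names before building ByLabelValue. Treating classification.labels as an id-to-name mapping is an assumption of this sketch:

```python
raw_f1_by_class = {0: 0.91, 1: 0.78}  # hypothetical legacy result keyed by class id
labels = {0: "cat", 1: "dog"}         # hypothetical classification.labels mapping

# Fall back to the raw id when no mapping is available, as _relabel does.
relabeled = {labels.get(k, k): v for k, v in raw_f1_by_class.items()}
print(relabeled)  # {'cat': 0.91, 'dog': 0.78}
```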
@@ -82,7 +92,7 @@ def calculate_value(
legacy_result: ClassificationQualityByClassResult,
render: List[BaseWidgetInfo],
) -> ByLabelValue:
return ByLabelValue({k: v.f1 for k, v in legacy_result.current.metrics.items()})
return ByLabelValue({self._relabel(context, k): v.f1 for k, v in legacy_result.current.metrics.items()})

def display_name(self) -> str:
return "F1 by Label metric"
@@ -100,7 +110,7 @@ def calculate_value(
render: List[BaseWidgetInfo],
) -> ByLabelValue:
return ByLabelValue(
{k: v.precision for k, v in legacy_result.current.metrics.items()},
{self._relabel(context, k): v.precision for k, v in legacy_result.current.metrics.items()},
)

def display_name(self) -> str:
@@ -119,7 +129,7 @@ def calculate_value(
render: List[BaseWidgetInfo],
) -> ByLabelValue:
return ByLabelValue(
{k: v.recall for k, v in legacy_result.current.metrics.items()},
{self._relabel(context, k): v.recall for k, v in legacy_result.current.metrics.items()},
)

def display_name(self) -> str:
@@ -138,7 +148,7 @@ def calculate_value(
render: List[BaseWidgetInfo],
) -> ByLabelValue:
value = ByLabelValue(
{k: v.roc_auc for k, v in legacy_result.current.metrics.items()},
{self._relabel(context, k): v.roc_auc for k, v in legacy_result.current.metrics.items()},
)
value.widget = render
value.widget[0].params["counters"][0]["label"] = self.display_name()
4 changes: 3 additions & 1 deletion src/evidently/future/presets/classification.py
@@ -98,7 +98,9 @@ def metrics(self) -> List[Metric]:

def calculate(self, metric_results: Dict[MetricId, MetricResult]) -> PresetResult:
metric = RocAucByLabel(probas_threshold=self._probas_threshold, k=self._k)
return PresetResult(metric_results[metric.to_calculation().id].widget)
widget = metric_results[metric.to_calculation().id].widget[:]
widget[0].params["counters"][0]["label"] = "Classification Quality by Label"
return PresetResult(widget)


class ClassificationDummyQuality(MetricContainer):
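
The preset now takes a copy of the widget list and overrides the first counter's label instead of returning the metric's widget untouched. Roughly, with a made-up dict standing in for BaseWidgetInfo:

```python
# Hypothetical widget payload shaped like the params accessed in calculate().
widgets = [{"params": {"counters": [{"label": "ROC AUC by Label metric", "value": "0.9"}]}}]

widget = widgets[:]  # copy of the list, as in the preset's calculate()
widget[0]["params"]["counters"][0]["label"] = "Classification Quality by Label"
print(widget[0]["params"]["counters"][0]["label"])
```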
20 changes: 14 additions & 6 deletions src/evidently/future/report.py
@@ -102,11 +102,19 @@ def get_legacy_metric(self, metric: LegacyMetric[T]) -> Tuple[T, List[BaseWidgetInfo]]:
classification = self._input_data[0]._data_definition.get_classification("default")
reference = self._input_data[1].as_dataframe() if self._input_data[1] is not None else None
current = self._input_data[0].as_dataframe()
prediction: Optional[Union[str, List[str]]]
if classification is not None:
if isinstance(classification.prediction_probas, list):
prediction = classification.prediction_probas
elif classification.prediction_probas not in current.columns:
prediction = classification.prediction_labels
else:
prediction = classification.prediction_probas
else:
prediction = None
mapping = ColumnMapping(
target=classification.target if classification is not None else None,
prediction=(classification.prediction_probas or classification.prediction_labels)
if classification is not None
else None,
prediction=prediction,
pos_label=classification.pos_label if isinstance(classification, BinaryClassification) else None,
target_names=classification.labels if classification is not None else None,
)
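
A hedged sketch of the new prediction-column resolution in get_legacy_metric: prefer probability columns, but fall back to the label column when the single probas column is not present in the current frame. The helper name and frame are hypothetical:

```python
from typing import List, Optional, Union

import pandas as pd

def resolve_prediction(
    current: pd.DataFrame,
    prediction_probas: Optional[Union[str, List[str]]],
    prediction_labels: Optional[str],
) -> Optional[Union[str, List[str]]]:
    if isinstance(prediction_probas, list):
        return prediction_probas  # multiclass: list of proba columns
    if prediction_probas is None or prediction_probas not in current.columns:
        return prediction_labels  # fall back to the label column
    return prediction_probas      # single proba column that exists

df = pd.DataFrame({"pred": ["a", "b"], "proba": [0.9, 0.2]})
print(resolve_prediction(df, "proba", "pred"))    # proba
print(resolve_prediction(df, "missing", "pred"))  # pred
```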
@@ -204,10 +212,10 @@ def _repr_html_(self):

def dict(self) -> dict:
return {
"metrics": {
metric: self.context.get_metric_result(metric).dict() # type: ignore[attr-defined]
"metrics": [
self.context.get_metric_result(metric).to_dict() # type: ignore[attr-defined]
for metric in self.context._metrics_graph.keys()
},
],
"tests": {
test.get_fingerprint(): test_result.dict()
for metric in self.context._metrics_graph.keys()
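
An illustrative (not exact) shape of Report.dict() after this change: metric results become a list of to_dict() entries instead of a metric-id keyed mapping, while tests keep their fingerprint keys:

```python
example_report_dict = {
    "metrics": [
        {
            "id": "<metric calculation id>",      # placeholder, not a real id
            "metric_id": "F1ByLabel()",
            "value": {"cat": 0.91, "dog": 0.78},  # hypothetical ByLabel payload
        },
    ],
    "tests": {},  # unchanged: keyed by test fingerprint
}
```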
41 changes: 41 additions & 0 deletions tests/calculations/test_classification_performance.py
@@ -5,7 +5,10 @@

from evidently.calculations.classification_performance import calculate_confusion_by_classes
from evidently.calculations.classification_performance import calculate_metrics
from evidently.calculations.classification_performance import get_prediction_data
from evidently.metric_results import ConfusionMatrix
from evidently.metric_results import DatasetColumns
from evidently.metric_results import DatasetUtilityColumns
from evidently.metric_results import PredictionData
from evidently.pipeline.column_mapping import ColumnMapping

@@ -58,3 +61,41 @@ def test_calculate_metrics():
assert actual_result.rate_plots_data.fpr == [pytest.approx(v) for v in [0.0, 0.0, 0.4, 0.4, 0.6, 0.6, 1.0]]
assert actual_result.rate_plots_data.fnr == [pytest.approx(v) for v in [1.0, 0.8, 0.8, 0.2, 0.2, 0.0, 0.0]]
assert actual_result.rate_plots_data.tnr == [pytest.approx(v) for v in [1.0, 1.0, 0.6, 0.6, 0.4, 0.4, 0.0]]


@pytest.mark.parametrize(
"dataframe,target,prediction,target_names,pos_label,expected",
(
(
pd.DataFrame(data={"col": ["a", "b", "b", "a", "b"], "prob": [0.1, 0.1, 0.1, 0.8, 0.2]}),
"col",
"prob",
["a", "b"],
"a",
{"a": [0.1, 0.1, 0.1, 0.8, 0.2], "b": [0.9, 0.9, 0.9, 0.2, 0.8]},
),
(
pd.DataFrame(data={"col": ["a", "b", "b", "a", "b"], "prob": [0.1, 0.1, 0.1, 0.8, 0.2]}),
"col",
"prob",
["a", "b"],
"a",
{"a": [0.1, 0.1, 0.1, 0.8, 0.2], "b": [0.9, 0.9, 0.9, 0.2, 0.8]},
),
),
)
def test_get_prediction_data(dataframe, target, prediction, target_names, pos_label, expected):
data = get_prediction_data(
dataframe,
DatasetColumns(
utility_columns=DatasetUtilityColumns(target=target, prediction=prediction),
target_names=target_names,
num_feature_names=[],
cat_feature_names=[],
text_feature_names=[],
datetime_feature_names=[],
),
pos_label=pos_label,
)
for label in target_names:
assert np.allclose(data.prediction_probas[label], expected[label], atol=1e-6)
