diff --git a/examples/how_to_questions/metrics/data_integrity/dataset_rouge_summary_metric.ipynb b/examples/how_to_questions/metrics/data_integrity/dataset_rouge_summary_metric.ipynb
index 496bc42785..f0abc6298c 100644
--- a/examples/how_to_questions/metrics/data_integrity/dataset_rouge_summary_metric.ipynb
+++ b/examples/how_to_questions/metrics/data_integrity/dataset_rouge_summary_metric.ipynb
@@ -45,7 +45,7 @@
    "outputs": [],
    "source": [
     "report = Report(metrics=[\n",
-    "    ROUGESummaryMetric(column_name=\"summary\", rouge_n=1)\n",
+    "    ROUGESummaryMetric(column_name=\"summary\", rouge_n=2)\n",
     "])"
    ]
   },
@@ -75,11 +75,27 @@
    "source": [
     "report.as_dict()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report.as_dataframe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "evidently",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -97,5 +113,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/src/evidently/metrics/_registry.py b/src/evidently/metrics/_registry.py
index 1ed0ce8345..26f6e58a8a 100644
--- a/src/evidently/metrics/_registry.py
+++ b/src/evidently/metrics/_registry.py
@@ -138,6 +138,13 @@
     "evidently.metrics.data_integrity.dataset_summary_metric.DatasetSummaryMetric",
     "evidently:metric:DatasetSummaryMetric",
 )
+
+register_type_alias(
+    Metric,
+    "evidently.metrics.data_integrity.rouge_summary_metric.ROUGESummaryMetric",
+    "evidently:metric:ROUGESummaryMetric",
+)
+
 register_type_alias(
     Metric,
     "evidently.metrics.data_quality.column_category_metric.ColumnCategoryMetric",
@@ -570,6 +577,11 @@
     "evidently.metrics.data_integrity.dataset_summary_metric.DatasetSummaryMetricResult",
     "evidently:metric_result:DatasetSummaryMetricResult",
 )
+register_type_alias(
+    MetricResult,
+    "evidently.metrics.data_integrity.rouge_summary_metric.ROUGESummaryMetricResult",
+    "evidently:metric_result:ROUGESummaryMetricResult",
+)
 register_type_alias(
     MetricResult,
     "evidently.metrics.data_quality.column_category_metric.CategoryStat",
diff --git a/src/evidently/metrics/data_integrity/rouge_summary_metric.py b/src/evidently/metrics/data_integrity/rouge_summary_metric.py
index 959f29bbae..c9c53aeb2b 100644
--- a/src/evidently/metrics/data_integrity/rouge_summary_metric.py
+++ b/src/evidently/metrics/data_integrity/rouge_summary_metric.py
@@ -1,31 +1,37 @@
 from typing import List
-from typing import Union
 
 import evaluate
 import pandas as pd
 
-from evidently.base_metric import ColumnName
 from evidently.base_metric import InputData
 from evidently.base_metric import Metric
 from evidently.base_metric import MetricResult
 from evidently.core import IncludeTags
 from evidently.model.widget import BaseWidgetInfo
+from evidently.options.base import AnyOptions
 from evidently.renderers.base_renderer import MetricRenderer
 from evidently.renderers.base_renderer import default_renderer
 from evidently.renderers.html_widgets import header_text
 from evidently.renderers.html_widgets import table_data
+from evidently.renderers.html_widgets import text_widget
 
 
 class ROUGESummaryMetricResult(MetricResult):
     class Config:
         type_alias = "evidently:metric_result:ROUGESummaryMetricResult"
         field_tags = {
+            "current": {IncludeTags.Current},
+            "reference": {IncludeTags.Reference},
             "rouge_type": {IncludeTags.Parameter},
-            "value": {IncludeTags.Parameter},
+            "per_row_scores": {IncludeTags.Parameter},
+            "summary_score": {IncludeTags.Parameter},
         }
 
+    current: list
+    reference: list
     rouge_type: str
-    score: dict
+    per_row_scores: list
+    summary_score: float
 
 
 class ROUGESummaryMetric(Metric[ROUGESummaryMetricResult]):
@@ -36,38 +42,43 @@ class Config:
     column_name: str
     rouge_n: int
 
-    def __init__(self, column_name: Union[str, ColumnName], rouge_n: int):
+    def __init__(self, column_name: str, rouge_n: int, options: AnyOptions = None):
         self.column_name = column_name
         self.rouge_n = rouge_n
-        super().__init__()
+        super().__init__(options=options)
 
-    def _calculate_summary_rouge(self, current_data: pd.Series, reference_data: pd.Series):
+    def _calculate_summary_rouge(self, current: pd.Series, reference: pd.Series):
         rouge_evaluator = evaluate.load("rouge")
 
-        predictions = current_data.astype(str).tolist()
-        references = reference_data.astype(str).tolist()
+        current = current.astype(str).tolist()
+        reference = reference.astype(str).tolist()
 
         rouge_scores = rouge_evaluator.compute(
-            rouge_types=[f"rouge{self.rouge_n}"], predictions=predictions, references=references, use_aggregator=False
+            rouge_types=[f"rouge{self.rouge_n}"], predictions=current, references=reference, use_aggregator=False
         )
 
         per_row_rouge_scores = rouge_scores[f"rouge{self.rouge_n}"]
 
         summary_rouge_score = sum(per_row_rouge_scores) / len(per_row_rouge_scores)
 
-        return per_row_rouge_scores, summary_rouge_score
+        return per_row_rouge_scores, summary_rouge_score, current, reference
 
-    def calculate(self, data: InputData) -> MetricResult:
+    def calculate(self, data: InputData) -> ROUGESummaryMetricResult:
+        if data.current_data is None or data.reference_data is None:
+            raise ValueError("The current data or the reference data is None.")
         if len(data.current_data[self.column_name]) == 0 or len(data.reference_data[self.column_name]) == 0:
             raise ValueError("The current data or the reference data is empty.")
 
-        per_row_rouge_scores, summary_rouge_score = self._calculate_summary_rouge(
+        per_row_rouge_scores, summary_rouge_score, current, reference = self._calculate_summary_rouge(
             data.current_data[self.column_name], data.reference_data[self.column_name]
         )
 
         result = ROUGESummaryMetricResult(
             rouge_type=f"ROUGE-{self.rouge_n}",
-            score={"per_row_scores": per_row_rouge_scores, "summary_score": summary_rouge_score},
+            per_row_scores=per_row_rouge_scores,
+            summary_score=summary_rouge_score,
+            current=current,
+            reference=reference,
         )
         return result
 
@@ -75,11 +86,17 @@ def calculate(self, data: InputData) -> MetricResult:
 @default_renderer(wrap_type=ROUGESummaryMetric)
 class ROUGESummaryMetricRenderer(MetricRenderer):
     @staticmethod
-    def _get_table(metric, n: int = 2) -> BaseWidgetInfo:
-        column_names = ["Metric", "Value"]
-        rows = ([metric.rouge_type, metric.score],)
+    def _get_table(metric) -> BaseWidgetInfo:
+        column_names = ["Metric", "current", "reference", "score"]
+        rows = []
+        for i in range(len(metric.current)):
+            rows.append([metric.rouge_type, metric.current[i], metric.reference[i], metric.per_row_scores[i]])
         return table_data(title="", column_names=column_names, data=rows)
 
-    def render_html(self, obj: ROUGESummaryMetricResult) -> List[BaseWidgetInfo]:
+    def render_html(self, obj: ROUGESummaryMetric) -> List[BaseWidgetInfo]:
         metric = obj.get_result()
-        return [header_text(label="ROUGE Metric"), self._get_table(metric)]
+        return [
+            header_text(label="ROUGE Metric"),
+            self._get_table(metric),
+            text_widget(text=f"{metric.summary_score}", title="Overall ROUGE score"),
+        ]
diff --git a/tests/metrics/data_interity/test_dataset_rouge_summary_metric.py b/tests/metrics/data_interity/test_dataset_rouge_summary_metric.py
index 339550b88c..814bf39ec2 100644
--- a/tests/metrics/data_interity/test_dataset_rouge_summary_metric.py
+++ b/tests/metrics/data_interity/test_dataset_rouge_summary_metric.py
@@ -4,37 +4,7 @@
 import pytest
 
 from evidently.metrics.data_integrity.rouge_summary_metric import ROUGESummaryMetric
-from evidently.metrics.data_integrity.rouge_summary_metric import ROUGESummaryMetricResult
 from evidently.report.report import Report
-from tests.conftest import smart_assert_equal
-
-
-@pytest.mark.parametrize(
-    "current_df, reference_df, metric, expected_result",
-    (
-        (
-            pd.DataFrame(
-                {
-                    "summary": ["hello there", "general kenobi"],
-                }
-            ),
-            pd.DataFrame({"summary": ["hello there", "no de"]}),
-            ROUGESummaryMetric(column_name="summary", rouge_n=1),
-            ROUGESummaryMetricResult(rouge_type="ROUGE-1", score={"per_row_scores": [1.0, 0.0], "summary_score": 0.5}),
-        ),
-    ),
-)
-def test_rouge_summary_metric_success(
-    current_df: pd.DataFrame,
-    reference_df: pd.DataFrame,
-    metric,
-    expected_result: ROUGESummaryMetricResult,
-) -> None:
-    report = Report(metrics=[metric])
-
-    report.run(current_data=current_df, reference_data=reference_df)
-
-    smart_assert_equal(metric.get_result(), expected_result)
 
 
 @pytest.mark.parametrize(
@@ -48,7 +18,13 @@ def test_rouge_summary_metric_success(
             ),
             pd.DataFrame({"summary": ["hello there", "no de"]}),
             ROUGESummaryMetric(column_name="summary", rouge_n=1),
-            {"rouge_type": "ROUGE-1", "score": {"per_row_scores": [1.0, 0.0], "summary_score": 0.5}},
+            {
+                "current": ["hello there", "general kenobi"],
+                "reference": ["hello there", "no de"],
+                "rouge_type": "ROUGE-1",
+                "per_row_scores": [1.0, 0.0],
+                "summary_score": 0.5,
+            },
         ),
     ),
 )
diff --git a/tests/multitest/metrics/data_integrity.py b/tests/multitest/metrics/data_integrity.py
index d52ae6526a..7973928f44 100644
--- a/tests/multitest/metrics/data_integrity.py
+++ b/tests/multitest/metrics/data_integrity.py
@@ -16,6 +16,7 @@
 from evidently.metrics.data_integrity.column_summary_metric import NumericCharacteristics
 from evidently.metrics.data_integrity.dataset_missing_values_metric import DatasetMissingValuesMetric
 from evidently.metrics.data_integrity.dataset_summary_metric import DatasetSummaryMetric
+from evidently.metrics.data_integrity.rouge_summary_metric import ROUGESummaryMetric
 from tests.multitest.conftest import AssertExpectedResult
 from tests.multitest.conftest import Error
 from tests.multitest.conftest import NoopOutcome
@@ -206,6 +207,27 @@ def dataset_summary_metric():
     )
 
 
+@metric
+def rouge_summary_metric():
+    return TestMetric(
+        name="rouge_summary_metric",
+        metric=ROUGESummaryMetric(column_name="summary", rouge_n=1),
+        fingerprint="bfc616f760b973d2cbfbf0540c7b2c71",
+        outcomes=NoopOutcome(),
+        datasets=[
+            TestDataset(
+                "rouge_summary_metric_data",
+                current=pd.DataFrame(
+                    {
+                        "summary": ["hello there", "general kenobi"],
+                    }
+                ),
+                reference=pd.DataFrame({"summary": ["hello there", "no de"]}),
+            ),
+        ],
+    )
+
+
 @metric
 def column_reg_exp_metric():
     return TestMetric(
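
Usage sketch (assumes the patch above is applied): a minimal example of the new metric, following the example notebook; the sample rows and expected scores are taken from the test fixtures in this diff. rouge_n selects the n-gram order (1 for ROUGE-1, 2 for ROUGE-2).

import pandas as pd

from evidently.metrics.data_integrity.rouge_summary_metric import ROUGESummaryMetric
from evidently.report.report import Report

# Row-aligned inputs: each "summary" value in the current data is scored
# against the value in the same row of the reference data.
current = pd.DataFrame({"summary": ["hello there", "general kenobi"]})
reference = pd.DataFrame({"summary": ["hello there", "no de"]})

report = Report(metrics=[ROUGESummaryMetric(column_name="summary", rouge_n=1)])
report.run(current_data=current, reference_data=reference)

# per_row_scores holds one ROUGE-1 value per row ([1.0, 0.0] for this data);
# summary_score is their mean (0.5).
print(report.as_dict())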