Skip to content

Commit

Permalink
feat: Add BERTScore feature descriptor and feature (#1335)
Browse files Browse the repository at this point in the history
* feat: Add BERTScore feature descriptor and feature

* refactor: Refactor BERTScoreFeature class and add TF-IDF weighting option

* update token_counts type

* Refactor BERTScoreFeature class and convert IDF scores to numpy arrays

* Refactor BERTScoreFeature class and import dependencies locally

* added documentation

---------

Co-authored-by: Emeli Dral <emeli@evidentlyai.com>
  • Loading branch information
projectultra and emeli-dral authored Oct 29, 2024
1 parent 89baac3 commit 6bb10cb
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 1 deletion.
3 changes: 2 additions & 1 deletion docs/book/reference/all-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,8 @@ Use pre-trained machine learning models for evaluation.
| **Sentiment()** <ul><li>Analyzes the sentiment of the text using a word-based model. </li><li> Returns a score on a scale: -1 (negative) to 1 (positive). </li></ul>| **Required:**<br>n/a<br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **HuggingFaceModel()** <br><br> Scores the text using the user-selected HuggingFace model.| See [docs](../customization/huggingface_descriptor.md) with some example models (classification by topic, emotion, etc.)|
| **HuggingFaceToxicityModel()** <ul><li> Detects hate speech using [HuggingFace Model](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target). </li><li> Returns predicted probability for the “hate” label. </li><li> Scale: 0 to 1. </li></ul> | **Optional**: <ul><li>`toxic_label="hate"` (default)</li><li> `display_name`</li></ul> |

| **BERTScore()** <ul><li> Calculates similarity between two text columns based on token embeddings from a pre-trained BERT model.</li> <li>Returns [BERTScore](https://arxiv.org/pdf/1904.09675) (F1 Score) based on cosine similarity between token embeddings.</li></ul> | **Required:** <ul><li> `with_column` </li></ul> **Optional:** <ul><li> `model`: Name of the pre-trained BERT model to use (default: `"bert-base-uncased"`).</li><li>`tfidf_weighted`: Boolean indicating if embeddings should be weighted with inverse document frequency (IDF) scores (default: `False`).</li><li>`display_name`</li></ul> |

# Data Drift

**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.
Expand Down
13 changes: 13 additions & 0 deletions src/evidently/descriptors/BERTScore_descriptor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from evidently.features.BERTScore_feature import BERTScoreFeature
from evidently.features.generated_features import FeatureDescriptor
from evidently.features.generated_features import GeneratedFeatures


class BERTScore(FeatureDescriptor):
    """Descriptor that scores one text column against another with BERTScore.

    The described column is compared with ``with_column``; the actual
    computation is delegated to ``BERTScoreFeature``.
    """

    class Config:
        type_alias = "evidently:descriptor:BERTScore"

    # Name of the second text column to compare against.
    with_column: str
    # Name of the pre-trained BERT model; same default as BERTScoreFeature.
    model: str = "bert-base-uncased"
    # Whether token matches are weighted with IDF scores; same default as BERTScoreFeature.
    tfidf_weighted: bool = False

    def feature(self, column_name: str) -> GeneratedFeatures:
        """Build the BERTScore feature for ``column_name`` versus ``with_column``.

        Args:
            column_name: Name of the text column this descriptor is applied to.

        Returns:
            A ``BERTScoreFeature`` configured with this descriptor's model,
            weighting option and display name.
        """
        # Forward model/tfidf_weighted so the documented options actually reach
        # the feature instead of silently falling back to its defaults.
        return BERTScoreFeature(
            columns=[column_name, self.with_column],
            model=self.model,
            tfidf_weighted=self.tfidf_weighted,
            display_name=self.display_name,
        )
2 changes: 2 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import _registry
from .BERTScore_descriptor import BERTScore
from .contains_link_descriptor import ContainsLink
from .custom_descriptor import CustomColumnEval
from .custom_descriptor import CustomPairColumnEval
Expand Down Expand Up @@ -38,6 +39,7 @@
from .words_descriptor import WordNoMatch

__all__ = [
"BERTScore",
"CustomColumnEval",
"CustomPairColumnEval",
"HuggingFaceModel",
Expand Down
5 changes: 5 additions & 0 deletions src/evidently/descriptors/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@
"evidently.descriptors.semantic_similarity.SemanticSimilarity",
"evidently:descriptor:SemanticSimilarity",
)
# Registers the "evidently:descriptor:BERTScore" alias (the same string the BERTScore
# descriptor declares as Config.type_alias) for the class at the given dotted path.
# NOTE(review): presumably this lets the descriptor be resolved by alias without
# importing its module eagerly — confirm against register_type_alias.
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.BERTScore_descriptor.BERTScore",
"evidently:descriptor:BERTScore",
)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.sentence_count_descriptor.SentenceCount",
Expand Down
120 changes: 120 additions & 0 deletions src/evidently/features/BERTScore_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
from collections import defaultdict
from typing import ClassVar
from typing import Dict
from typing import List

import numpy as np
import pandas as pd

from evidently.base_metric import ColumnName
from evidently.core import ColumnType
from evidently.features.generated_features import GeneratedFeature
from evidently.utils.data_preprocessing import DataDefinition


class BERTScoreFeature(GeneratedFeature):
    """Row-wise BERTScore (F1) between two text columns.

    Both columns are tokenized and embedded with a pre-trained BERT model. Every
    token of one text is greedily matched to its most cosine-similar token in
    the other text; the (optionally IDF-weighted) average of those matches gives
    recall and precision, which are combined into an F1 score per row.
    """

    class Config:
        type_alias = "evidently:feature:BERTScoreFeature"

    __feature_type__: ClassVar = ColumnType.Numerical
    columns: List[str]  # [first column, second column] to compare
    model: str = "bert-base-uncased"  # Pretrained BERT model
    tfidf_weighted: bool = False  # Whether to weight embeddings with IDF

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Compute the BERTScore F1 for every row of ``data``.

        Args:
            data: Frame containing both columns named in ``self.columns``.
                Missing values are treated as empty strings.
            data_definition: Not used by this feature.

        Returns:
            A single-column DataFrame named by ``_feature_name()``, indexed like ``data``.
        """
        # Nothing to score: avoid loading the model and feeding it an empty batch.
        if len(data) == 0:
            return pd.DataFrame({self._feature_name(): pd.Series([], index=data.index, dtype=float)})

        # Imported locally so that transformers is only needed when the feature is used.
        from transformers import BertModel
        from transformers import BertTokenizer

        tokenizer = BertTokenizer.from_pretrained(self.model)
        model = BertModel.from_pretrained(self.model)

        first_col = data[self.columns[0]]
        second_col = data[self.columns[1]]

        # Tokenize sentences (each column is padded to its own longest sequence)
        tokens_first = tokenizer(first_col.fillna("").tolist(), return_tensors="pt", padding=True, truncation=True)
        tokens_second = tokenizer(second_col.fillna("").tolist(), return_tensors="pt", padding=True, truncation=True)

        # Get embeddings
        embeddings_first = model(**tokens_first).last_hidden_state.detach().numpy()
        embeddings_second = model(**tokens_second).last_hidden_state.detach().numpy()

        # Boolean masks of real (non-padding) token positions. Without them the
        # embeddings at padded positions take part in the matching and the
        # averages, so a row's score would depend on the other rows in the batch.
        mask_first = tokens_first["attention_mask"].numpy().astype(bool)
        mask_second = tokens_second["attention_mask"].numpy().astype(bool)

        # IDF scores are only needed (and only read) in the weighted mode.
        idf_scores = self.compute_idf_scores(first_col, second_col, tokenizer) if self.tfidf_weighted else None

        scores = []
        for i, (emb1, emb2) in enumerate(zip(embeddings_first, embeddings_second)):
            recall, precision = self.calculate_scores(emb1[mask_first[i]], emb2[mask_second[i]], idf_scores, i)
            total = precision + recall
            # Guard against division by zero (or a non-positive sum of similarities).
            scores.append(2 * (precision * recall) / total if total > 0 else 0.0)

        # Return as a DataFrame
        return pd.DataFrame(
            {
                self._feature_name(): pd.Series(scores, index=data.index),
            }
        )

    def compute_idf_scores(self, col1: pd.Series, col2: pd.Series, tokenizer) -> tuple:
        """Build per-token IDF weights for every sentence of both columns.

        Document frequencies are counted over all non-null sentences of both
        columns. Each token weight is ``-log(count / M) + 1`` (unseen tokens get 1).

        Args:
            col1: First text column.
            col2: Second text column.
            tokenizer: Tokenizer providing ``tokenize``, ``cls_token`` and ``sep_token``.

        Returns:
            A tuple ``(idf_arrays1, idf_arrays2)`` of 2-D arrays, one row per
            sentence, zero-padded on the right to the longest sentence of the column.
        """
        # Mirror the truncation applied when the sentences are encoded for the
        # model (truncation=True), so weights stay aligned with the embeddings.
        # Two positions are reserved for the [CLS] and [SEP] tokens.
        max_length = getattr(tokenizer, "model_max_length", None)
        max_tokens = max_length - 2 if isinstance(max_length, int) and max_length > 2 else None

        def tokenize_with_specials(sentence):
            tokens = tokenizer.tokenize(sentence)
            if max_tokens is not None:
                tokens = tokens[:max_tokens]
            # Add special tokens
            return [tokenizer.cls_token] + tokens + [tokenizer.sep_token]

        # Combine reference sentences
        reference_sentences = pd.concat([col1, col2]).dropna().tolist()
        M = len(reference_sentences)

        # Count, for each unique token, the number of sentences it occurs in
        token_counts: Dict[str, int] = defaultdict(int)
        for sentence in reference_sentences:
            for token in set(tokenize_with_specials(sentence)):
                token_counts[token] += 1

        idf_scores = {token: -np.log(count / M) for token, count in token_counts.items()}

        # Convert IDF scores to numpy arrays
        def convert_to_idf_arrays(sentences):
            # IDF score for each token including plus one smoothing
            idf_arrays = [
                np.array([idf_scores.get(token, 0) + 1 for token in tokenize_with_specials(sentence)])
                for sentence in sentences
            ]
            # Pad sequences to the same length; default=0 keeps an empty column from raising
            max_len = max((len(arr) for arr in idf_arrays), default=0)
            return np.array([np.pad(arr, (0, max_len - len(arr)), "constant") for arr in idf_arrays])

        idf_arrays1 = convert_to_idf_arrays(col1.fillna("").tolist())
        idf_arrays2 = convert_to_idf_arrays(col2.fillna("").tolist())
        return idf_arrays1, idf_arrays2

    def max_similarity(self, embeddings1, embeddings2):
        """Return, for each row of ``embeddings1``, its best cosine similarity to any row of ``embeddings2``.

        Args:
            embeddings1: 2-D array with one token embedding per row.
            embeddings2: 2-D array with one token embedding per row.

        Returns:
            1-D array with one value per row of ``embeddings1``.
        """
        # Pairwise dot products divided by the outer product of the row norms
        similarity_matrix = np.dot(embeddings1, embeddings2.T) / (
            np.linalg.norm(embeddings1, axis=1, keepdims=True) * np.linalg.norm(embeddings2, axis=1, keepdims=True).T
        )
        return similarity_matrix.max(axis=1)

    def calculate_scores(self, emb1, emb2, idf_scores, index):
        """Compute recall and precision for one pair of token-embedding matrices.

        Args:
            emb1: Token embeddings of the first text (one row per token).
            emb2: Token embeddings of the second text (one row per token).
            idf_scores: Tuple returned by ``compute_idf_scores``; only read when
                ``tfidf_weighted`` is set.
            index: Row position of this pair inside ``idf_scores``.

        Returns:
            A ``(recall, precision)`` tuple.
        """
        best_for_first = self.max_similarity(emb1, emb2)
        best_for_second = self.max_similarity(emb2, emb1)

        if self.tfidf_weighted:
            # IDF rows are padded to the column's longest sentence; keep only the
            # leading weights that correspond to the tokens actually scored.
            weights_first = idf_scores[0][index][: len(best_for_first)]
            weights_second = idf_scores[1][index][: len(best_for_second)]
            recall = np.multiply(best_for_first, weights_first).sum() / weights_first.sum()
            precision = np.multiply(best_for_second, weights_second).sum() / weights_second.sum()
        else:
            recall = best_for_first.mean()
            precision = best_for_second.mean()
        return recall, precision

    def _feature_name(self):
        """Return the generated column's name: the source column names joined by ``|``."""
        return "|".join(self.columns)

    def _as_column(self) -> "ColumnName":
        """Return the column reference for this feature, with a default display name."""
        return self._create_column(
            self._feature_name(),
            default_display_name=f"BERTScore for {' '.join(self.columns)}.",
        )
5 changes: 5 additions & 0 deletions src/evidently/features/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@
"evidently.features.semantic_similarity_feature.SemanticSimilarityFeature",
"evidently:feature:SemanticSimilarityFeature",
)
# Registers the "evidently:feature:BERTScoreFeature" alias (the same string
# BERTScoreFeature declares as Config.type_alias) for the class at the given dotted path.
# NOTE(review): presumably this lets the feature be resolved by alias without
# importing its module eagerly — confirm against register_type_alias.
register_type_alias(
GeneratedFeatures,
"evidently.features.BERTScore_feature.BERTScoreFeature",
"evidently:feature:BERTScoreFeature",
)
register_type_alias(
GeneratedFeatures, "evidently.features.sentence_count_feature.SentenceCount", "evidently:feature:SentenceCount"
)
Expand Down
40 changes: 40 additions & 0 deletions tests/features/test_bertscore_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pandas as pd
import pytest

from evidently.features.BERTScore_feature import BERTScoreFeature
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.utils.data_preprocessing import create_data_definition

# Sentence pairs for the BERTScore tests.
# NOTE(review): not referenced by the test visible in this module, which repeats
# the same pairs (with expected scores) in its own parametrize list — confirm
# whether this list is still needed.
test_data = [
("The quick brown fox jumps over the lazy dog", "A fast brown fox leaps over a lazy dog"),
("Hello world", "Hi universe"),
("Machine learning is fascinating", "Artificial intelligence is intriguing"),
("I love apples", "I adore oranges"),
("Python is a great programming language", "Python is an excellent coding language"),
]


@pytest.mark.parametrize(
    ("column_1", "column_2", "expected"),
    # expected values obtained from the BERTScore library https://github.com/Tiiiger/bert_score
    [
        ("The quick brown fox jumps over the lazy dog", "A fast brown fox leaps over a lazy dog", 0.8917),
        ("Hello world", "Hi universe", 0.7707),
        ("Machine learning is fascinating", "Artificial intelligence is intriguing", 0.8238),
        ("I love apples", "I adore oranges", 0.7017),
        ("Python is a great programming language", "Python is an excellent coding language", 0.8689),
    ],
)
def test_bert_score_feature(column_1: str, column_2: str, expected: float):
    """Check the unweighted BERTScore F1 of a single sentence pair against a reference value."""
    # One-row frame: each parametrized pair is scored on its own.
    frame = pd.DataFrame({"column_1": [column_1], "column_2": [column_2]})
    feature = BERTScoreFeature(columns=["column_1", "column_2"], tfidf_weighted=False)
    definition = create_data_definition(None, frame, ColumnMapping())

    scored = feature.generate_feature(data=frame, data_definition=definition)

    actual = scored[feature._feature_name()].iloc[0]
    # Compared with a 10% relative tolerance.
    assert actual == pytest.approx(expected, rel=0.1)

0 comments on commit 6bb10cb

Please sign in to comment.