diff --git a/docs/clustering.md b/docs/clustering.md
index a7c2a55..58df63f 100644
--- a/docs/clustering.md
+++ b/docs/clustering.md
@@ -188,6 +188,11 @@ top2vec = ClusteringTopicModel(
 
 Theoretically the model descriptions above should result in the same behaviour as the other two packages, but there might be minor changes in implementation. We do not intend to keep up with changes in Top2Vec's and BERTopic's internal implementation details indefinitely.
 
+### _(Optional)_ 5. Dynamic Modeling
+
+Clustering models are also capable of dynamic topic modeling. This is done by fitting a single clustering model over the entire corpus, as we assume that one semantic model generates the documents in all time periods.
+To obtain temporal representations for topics, the corpus is divided into equally sized or arbitrarily chosen time slices, and term importances are then estimated separately for each slice using Soft-c-TF-IDF, c-TF-IDF, or distances from cluster centroids. When centroid distances are used to estimate term importances in dynamic modeling, the cluster centroids are computed only from the documents and terms present in the given time slice.
+
 ## Considerations
 
 ### Strengths
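A minimal usage sketch of the workflow described in the section added above. The two loader helpers are hypothetical placeholders for your own documents and per-document timestamps; everything else (`ClusteringTopicModel`, `feature_importance="centroid"`, `fit_transform_dynamic()`, `bins`, `print_topics_over_time()`) follows the API introduced in this diff:

```python
from datetime import datetime

from turftopic import ClusteringTopicModel

# Hypothetical helpers: supply your own documents and one datetime per document.
corpus: list[str] = load_corpus()
timestamps: list[datetime] = load_timestamps()

model = ClusteringTopicModel(feature_importance="centroid")
# One clustering model is fitted on the whole corpus; term importances are then
# re-estimated within each of the 10 time bins to obtain temporal topic representations.
doc_topic_matrix = model.fit_transform_dynamic(corpus, timestamps=timestamps, bins=10)
model.print_topics_over_time()
```

After fitting, the per-bin term importances are also stored on the model as `temporal_components_` and `temporal_importance_`, as the implementation further down in this diff shows.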
diff --git a/docs/dynamic.md b/docs/dynamic.md
index 9e90628..3120d95 100644
--- a/docs/dynamic.md
+++ b/docs/dynamic.md
@@ -28,7 +28,7 @@ Dynamic topic models in Turftopic have a unified interface.
 To fit a dynamic topic model you will need a corpus, that has been annotated with timestamps.
 The timestamps need to be Python `datetime` objects, but pandas `Timestamp` object are also supported.
 
-Models that have dynamic modeling capabilities have a `fit_transform_dynamic()` method, that fits the model on the corpus over time.
+Models that have dynamic modeling capabilities (currently, `GMM` and `ClusteringTopicModel`) have a `fit_transform_dynamic()` method that fits the model on the corpus over time.
 
 ```python
 from datetime import datetime
diff --git a/tests/test_integration.py b/tests/test_integration.py
index ee31f30..e3d92c4 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 import tempfile
 from pathlib import Path
 
@@ -15,6 +16,21 @@
     SemanticSignalSeparation,
 )
 
+
+def generate_dates(
+    n_dates: int,
+) -> list[datetime]:
+    """Generate random dates to test dynamic models"""
+    dates = []
+    for n in range(n_dates):
+        d = np.random.randint(low=1, high=29)
+        m = np.random.randint(low=1, high=13)
+        y = np.random.randint(low=2000, high=2020)
+        date = datetime(year=y, month=m, day=d)
+        dates.append(date)
+    return dates
+
+
 newsgroups = fetch_20newsgroups(
     subset="all",
     categories=[
@@ -25,12 +41,13 @@ texts = newsgroups.data
 
 trf = SentenceTransformer("all-MiniLM-L6-v2")
 embeddings = np.asarray(trf.encode(texts))
+timestamps = generate_dates(n_dates=len(texts))
 
 models = [
     GMM(5, encoder=trf),
     SemanticSignalSeparation(5, encoder=trf),
-    KeyNMF(5, encoder=trf, keyword_scope='document'),
-    KeyNMF(5, encoder=trf, keyword_scope='corpus'),
+    KeyNMF(5, encoder=trf, keyword_scope="document"),
+    KeyNMF(5, encoder=trf, keyword_scope="corpus"),
     ClusteringTopicModel(
         n_reduce_to=5,
         feature_importance="c-tf-idf",
@@ -46,6 +63,22 @@
     AutoEncodingTopicModel(5, combined=True),
 ]
 
+dynamic_models = [
+    GMM(5, encoder=trf),
+    ClusteringTopicModel(
+        n_reduce_to=5,
+        feature_importance="centroid",
+        encoder=trf,
+        reduction_method="smallest",
+    ),
+    ClusteringTopicModel(
+        n_reduce_to=5,
+        feature_importance="soft-c-tf-idf",
+        encoder=trf,
+        reduction_method="smallest",
+    ),
+]
+
 
 @pytest.mark.parametrize("model", models)
 def test_fit_export_table(model):
@@ -56,3 +89,18 @@ def test_fit_export_table(model):
     with out_path.open("w") as out_file:
         out_file.write(table)
     df = pd.read_csv(out_path)
+
+
+@pytest.mark.parametrize("model", dynamic_models)
+def test_fit_dynamic(model):
+    doc_topic_matrix = model.fit_transform_dynamic(
+        texts,
+        embeddings=embeddings,
+        timestamps=timestamps,
+    )
+    table = model.export_topics(format="csv")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        out_path = Path(tmpdirname).joinpath("topics.csv")
+        with out_path.open("w") as out_file:
+            out_file.write(table)
+        df = pd.read_csv(out_path)
diff --git a/turftopic/base.py b/turftopic/base.py
index e0cd883..25b1ec7 100644
--- a/turftopic/base.py
+++ b/turftopic/base.py
@@ -23,7 +23,9 @@ def remove_whitespace(text: str) -> str:
 
 class ContextualModel(ABC, TransformerMixin, BaseEstimator):
     """Base class for contextual topic models in Turftopic."""
 
-    def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]]]]:
+    def get_topics(
+        self, top_k: int = 10
+    ) -> List[Tuple[Any, List[Tuple[str, float]]]]:
         """Returns high-level topic representations in form of
         the top K words in each topic.
@@ -135,8 +137,12 @@ def _highest_ranking_docs(
         except AttributeError:
             pass
         kth = min(top_k, document_topic_matrix.shape[0] - 1)
-        highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[:kth]
-        highest = highest[np.argsort(-document_topic_matrix[highest, topic_id])]
+        highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[
+            :kth
+        ]
+        highest = highest[
+            np.argsort(-document_topic_matrix[highest, topic_id])
+        ]
         scores = document_topic_matrix[highest, topic_id]
         columns = []
         columns.append("Document")
@@ -171,7 +177,9 @@ def print_highest_ranking_documents(
             topic_id, raw_documents, document_topic_matrix, top_k
         )
         table = Table(show_lines=True)
-        table.add_column("Document", justify="left", style="magenta", max_width=100)
+        table.add_column(
+            "Document", justify="left", style="magenta", max_width=100
+        )
         table.add_column("Score", style="blue", justify="right")
         for row in rows:
             table.add_row(*row)
@@ -223,7 +231,9 @@ def _topic_distribution(
     ) -> list[list[str]]:
         if topic_dist is None:
             if text is None:
-                raise ValueError("You should either pass a text or a distribution.")
+                raise ValueError(
+                    "You should either pass a text or a distribution."
+                )
             try:
                 topic_dist = self.transform([text])
             except AttributeError:
@@ -248,7 +258,9 @@ def _topic_distribution(
             rows.append([topic_names[ind], f"{score:.2f}"])
         return [columns, *rows]
 
-    def print_topic_distribution(self, text=None, topic_dist=None, top_k: int = 10):
+    def print_topic_distribution(
+        self, text=None, topic_dist=None, top_k: int = 10
+    ):
         """Pretty prints topic distribution in a document.
 
         Parameters
@@ -330,7 +342,9 @@ def fit_transform(
         """
         pass
 
-    def fit(self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None):
+    def fit(
+        self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
+    ):
         """Fits model on the given corpus.
 
         Parameters
@@ -396,9 +410,13 @@ def prepare_topic_data(
         if embeddings is None:
             embeddings = self.encode_documents(corpus)
         try:
-            document_topic_matrix = self.transform(corpus, embeddings=embeddings)
+            document_topic_matrix = self.transform(
+                corpus, embeddings=embeddings
+            )
         except (AttributeError, NotFittedError):
-            document_topic_matrix = self.fit_transform(corpus, embeddings=embeddings)
+            document_topic_matrix = self.fit_transform(
+                corpus, embeddings=embeddings
+            )
         dtm = self.vectorizer.transform(corpus)  # type: ignore
         res: TopicData = {
             "corpus": corpus,
diff --git a/turftopic/dynamic.py b/turftopic/dynamic.py
index d363580..8595157 100644
--- a/turftopic/dynamic.py
+++ b/turftopic/dynamic.py
@@ -199,7 +199,9 @@ def print_topics_over_time(
         show_scores: bool, default False
             Indicates whether to show importance scores for each word.
""" - columns, *rows = self._topics_over_time(top_k, show_scores, date_format) + columns, *rows = self._topics_over_time( + top_k, show_scores, date_format + ) table = Table(show_lines=True) for column in columns: table.add_column(column) diff --git a/turftopic/encoders/__init__.py b/turftopic/encoders/__init__.py index 34492d6..6797d16 100644 --- a/turftopic/encoders/__init__.py +++ b/turftopic/encoders/__init__.py @@ -9,5 +9,5 @@ "OpenAIEmbeddings", "VoyageEmbeddings", "ExternalEncoder", - "E5Encoder" + "E5Encoder", ] diff --git a/turftopic/encoders/utils.py b/turftopic/encoders/utils.py new file mode 100644 index 0000000..4a527f4 --- /dev/null +++ b/turftopic/encoders/utils.py @@ -0,0 +1,12 @@ +import itertools +from typing import Iterable, List + + +def batched(iterable, n: int) -> Iterable[List[str]]: + "Batch data into tuples of length n. The last batch may be shorter." + # batched('ABCDEFG', 3) --> ABC DEF G + if n < 1: + raise ValueError("n must be at least one") + it = iter(iterable) + while batch := list(itertools.islice(it, n)): + yield batch diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index dca7f95..092c359 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import Literal, Optional, Union import numpy as np @@ -11,6 +12,7 @@ from sklearn.preprocessing import label_binarize from turftopic.base import ContextualModel, Encoder +from turftopic.dynamic import DynamicTopicModel, bin_timestamps from turftopic.feature_importance import ( cluster_centroid_distance, ctf_idf, @@ -33,6 +35,10 @@ ClusteringTopicModel(n_reduce_to=10) """ +feature_message = """ +feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'centroid' +""" + def smallest_hierarchical_join( topic_vectors: np.ndarray, @@ -47,7 +53,9 @@ def smallest_hierarchical_join( classes = list(classes_) while len(classes) > n_to: smallest = np.argmin(topic_sizes) - dist = cosine_distances(np.atleast_2d(topic_vectors[smallest]), topic_vectors) + dist = cosine_distances( + np.atleast_2d(topic_vectors[smallest]), topic_vectors + ) closest = np.argsort(dist[0])[1] merge_inst.append((classes[smallest], classes[closest])) classes.pop(smallest) @@ -62,20 +70,26 @@ def smallest_hierarchical_join( def calculate_topic_vectors( - cluster_labels: np.ndarray, embeddings: np.ndarray + cluster_labels: np.ndarray, + embeddings: np.ndarray, + time_index: Optional[np.ndarray] = None, ) -> np.ndarray: """Calculates topic centroids.""" centroids = [] unique_labels = np.unique(cluster_labels) unique_labels = np.sort(unique_labels) for label in unique_labels: - centroid = np.mean(embeddings[cluster_labels == label], axis=0) + label_index = cluster_labels == label + if time_index is not None: + label_index = label_index * time_index + label_embeddings = embeddings[label_index] + centroid = np.mean(label_embeddings, axis=0) centroids.append(centroid) centroids = np.stack(centroids) return centroids -class ClusteringTopicModel(ContextualModel, ClusterMixin): +class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel): """Topic models, which assume topics to be clusters of documents in semantic space. Models also include a dimensionality reduction step to aid clustering. 
@@ -127,7 +141,9 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin):
     def __init__(
         self,
-        encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2",
+        encoder: Union[
+            Encoder, str
+        ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         dimensionality_reduction: Optional[TransformerMixin] = None,
         clustering: Optional[ClusterMixin] = None,
@@ -135,9 +151,13 @@ def __init__(
             "c-tf-idf", "soft-c-tf-idf", "centroid"
         ] = "soft-c-tf-idf",
         n_reduce_to: Optional[int] = None,
-        reduction_method: Literal["agglomerative", "smallest"] = "agglomerative",
+        reduction_method: Literal[
+            "agglomerative", "smallest"
+        ] = "agglomerative",
     ):
         self.encoder = encoder
+        if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]:
+            raise ValueError(feature_message)
         if isinstance(encoder, int):
             raise TypeError(integer_message)
         if isinstance(encoder, str):
@@ -153,7 +173,9 @@ def __init__(
         else:
             self.clustering = clustering
         if dimensionality_reduction is None:
-            self.dimensionality_reduction = TSNE(n_components=2, metric="cosine")
+            self.dimensionality_reduction = TSNE(
+                n_components=2, metric="cosine"
+            )
         else:
             self.dimensionality_reduction = dimensionality_reduction
         self.feature_importance = feature_importance
@@ -209,7 +231,9 @@ def _estimate_parameters(
         self.vocab_embeddings = self.encoder_.encode(
             self.vectorizer.get_feature_names_out()
         )  # type: ignore
-        document_topic_matrix = label_binarize(self.labels_, classes=self.classes_)
+        document_topic_matrix = label_binarize(
+            self.labels_, classes=self.classes_
+        )
         if self.feature_importance == "soft-c-tf-idf":
             self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix)  # type: ignore
         elif self.feature_importance == "centroid":
@@ -250,7 +274,9 @@ def fit_predict(
             self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents)
             console.log("Term extraction done.")
             status.update("Reducing Dimensionality")
-            reduced_embeddings = self.dimensionality_reduction.fit_transform(embeddings)
+            reduced_embeddings = self.dimensionality_reduction.fit_transform(
+                embeddings
+            )
             console.log("Dimensionality reduction done.")
             status.update("Clustering documents")
             self.labels_ = self.clustering.fit_predict(reduced_embeddings)
@@ -263,7 +289,9 @@ def fit_predict(
             console.log("Parameter estimation done.")
             if self.n_reduce_to is not None:
                 n_topics = self.classes_.shape[0]
-                status.update(f"Reducing topics from {n_topics} to {self.n_reduce_to}")
+                status.update(
+                    f"Reducing topics from {n_topics} to {self.n_reduce_to}"
+                )
                 if self.reduction_method == "agglomerative":
                     self.labels_ = self._merge_agglomerative(self.n_reduce_to)
                 else:
@@ -285,3 +313,63 @@ def fit_transform(
     ):
         labels = self.fit_predict(raw_documents, y, embeddings)
         return label_binarize(labels, classes=self.classes_)
+
+    def fit_transform_dynamic(
+        self,
+        raw_documents,
+        timestamps: list[datetime],
+        embeddings: Optional[np.ndarray] = None,
+        bins: Union[int, list[datetime]] = 10,
+    ):
+        time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
+        temporal_components = []
+        temporal_importances = []
+        if embeddings is None:
+            embeddings = self.encoder_.encode(raw_documents)
+        for i_timebin in np.arange(len(self.time_bin_edges) - 1):
+            if hasattr(self, "components_"):
+                doc_topic_matrix = label_binarize(
+                    self.labels_, classes=self.classes_
+                )
+            else:
+                doc_topic_matrix = self.fit_transform(
+                    raw_documents, embeddings=embeddings
+                )
+            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
+                axis=0
+            )
+            topic_importances = topic_importances / topic_importances.sum()
+            t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin]
+            t_doc_topic_matrix = doc_topic_matrix[time_labels == i_timebin]
+            if "c-tf-idf" in self.feature_importance:
+                if self.feature_importance == "soft-c-tf-idf":
+                    components = soft_ctf_idf(
+                        t_doc_topic_matrix, t_doc_term_matrix
+                    )
+                elif self.feature_importance == "c-tf-idf":
+                    components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix)
+            elif self.feature_importance == "centroid":
+                time_index = time_labels == i_timebin
+                t_topic_vectors = calculate_topic_vectors(
+                    self.labels_,
+                    embeddings,
+                    time_index,
+                )
+                # Topics with no documents in this time bin have all-nan centroids
+                topic_mask = np.isnan(t_topic_vectors).all(
+                    axis=1, keepdims=True
+                )
+                t_topic_vectors = np.nan_to_num(t_topic_vectors)
+                components = cluster_centroid_distance(
+                    t_topic_vectors,
+                    self.vocab_embeddings,
+                    metric="cosine",
+                )
+                # Zero out term importances for topics absent from the time bin
+                components *= np.logical_not(topic_mask)
+                mask_terms = t_doc_term_matrix.sum(axis=0).astype(np.float64)
+                mask_terms[mask_terms == 0] = np.nan
+                components *= mask_terms
+            temporal_components.append(components)
+            temporal_importances.append(topic_importances)
+        self.temporal_components_ = np.stack(temporal_components)
+        self.temporal_importance_ = np.stack(temporal_importances)
+        return doc_topic_matrix
diff --git a/turftopic/models/gmm.py b/turftopic/models/gmm.py
index 9764440..854fa34 100644
--- a/turftopic/models/gmm.py
+++ b/turftopic/models/gmm.py
@@ -64,7 +64,9 @@ class GMM(ContextualModel, DynamicTopicModel):
     def __init__(
         self,
         n_components: int,
-        encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2",
+        encoder: Union[
+            Encoder, str
+        ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         dimensionality_reduction: Optional[TransformerMixin] = None,
         weight_prior: Literal["dirichlet", "dirichlet_process", None] = None,
@@ -99,7 +101,6 @@ def __init__(
             self.gmm_ = make_pipeline(dimensionality_reduction, mixture)
         else:
             self.gmm_ = mixture
-        self.components_ = None
 
     def fit_transform(
         self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
@@ -118,7 +119,9 @@ def fit_transform(
             console.log("Mixture model fitted.")
             status.update("Estimating term importances.")
             document_topic_matrix = self.gmm_.predict_proba(embeddings)
-            self.components_ = soft_ctf_idf(document_topic_matrix, document_term_matrix)
+            self.components_ = soft_ctf_idf(
+                document_topic_matrix, document_term_matrix
+            )
             console.log("Model fitting done.")
         return document_topic_matrix
@@ -159,15 +162,21 @@ def fit_transform_dynamic(
         bins: Union[int, list[datetime]] = 10,
     ):
         time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
-        if self.components_ is not None:
-            doc_topic_matrix = self.transform(raw_documents, embeddings=embeddings)
+        if hasattr(self, "components_"):
+            doc_topic_matrix = self.transform(
+                raw_documents, embeddings=embeddings
+            )
         else:
-            doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
+            doc_topic_matrix = self.fit_transform(
+                raw_documents, embeddings=embeddings
+            )
         document_term_matrix = self.vectorizer.transform(raw_documents)
         temporal_components = []
         temporal_importances = []
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
-            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
+            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
+                axis=0
+            )
             # Normalizing
             topic_importances = topic_importances / topic_importances.sum()
             components = soft_ctf_idf(
diff --git a/turftopic/models/keynmf.py b/turftopic/models/keynmf.py
index 513c2e9..7d74834 100644
--- a/turftopic/models/keynmf.py
+++ b/turftopic/models/keynmf.py
@@ -89,9 +89,9 @@ def __init__(
         ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         top_n: int = 25,
-        keyword_scope: str = 'document',
+        keyword_scope: str = "document",
     ):
-        if keyword_scope not in ['document', 'corpus']:
+        if keyword_scope not in ["document", "corpus"]:
             raise ValueError("keyword_scope must be 'document' or 'corpus'")
         self.n_components = n_components
         self.top_n = top_n
@@ -123,7 +123,7 @@ def extract_keywords(
         for i in range(total):
             terms = document_term_matrix[i, :].todense()
             embedding = embeddings[i].reshape(1, -1)
-            if self.keyword_scope == 'document':
+            if self.keyword_scope == "document":
                 mask = terms > 0
             else:
                 tot_freq = document_term_matrix.sum(axis=0)
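As a closing note on the `keyword_scope` change above: only the two values exercised in the updated tests pass the validation added to `KeyNMF.__init__`. A minimal construction sketch, relying on the default encoder from the signature above:

```python
from turftopic import KeyNMF

# The two accepted scopes, mirroring the constructor calls in tests/test_integration.py
doc_scoped = KeyNMF(5, keyword_scope="document")
corpus_scoped = KeyNMF(5, keyword_scope="corpus")
# Any other value raises ValueError("keyword_scope must be 'document' or 'corpus'")
```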