From 40c54a9d2f05cacbbc3640b39bbe89f9d9ccb5d5 Mon Sep 17 00:00:00 2001
From: rbroc
Date: Fri, 15 Mar 2024 13:44:05 +0100
Subject: [PATCH 1/9] add fit_transform_dynamic to clustering models

---
 turftopic/models/cluster.py | 49 +++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index dca7f95..5cecf41 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 from typing import Literal, Optional, Union
 
 import numpy as np
@@ -11,6 +12,7 @@ from sklearn.preprocessing import label_binarize
 
 from turftopic.base import ContextualModel, Encoder
+from turftopic.dynamic import DynamicTopicModel, bin_timestamps
 from turftopic.feature_importance import (
     cluster_centroid_distance,
     ctf_idf,
@@ -285,3 +287,50 @@ def fit_transform(
     ):
         labels = self.fit_predict(raw_documents, y, embeddings)
         return label_binarize(labels, classes=self.classes_)
+
+    def fit_transform_dynamic(
+        self,
+        raw_documents,
+        timestamps: list[datetime],
+        embeddings: Optional[np.ndarray] = None,
+        bins: Union[int, list[datetime]] = 10,
+    ):
+        time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
+        temporal_components = []
+        temporal_importances = []
+        if embeddings is None:
+            embeddings = self.encoder_.encode(raw_documents)
+        for i_timebin in np.arange(len(self.time_bin_edges) - 1):
+            if self.labels_ is not None:
+                doc_topic_matrix = label_binarize(self.labels_, classes=self.classes_)
+            else:
+                doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
+            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
+            topic_importances = topic_importances / topic_importances.sum()
+            if "c-tf-idf" in self.feature_importance:
+                t_doc_topic_matrix = doc_topic_matrix[time_labels == i_timebin]
+                t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin]
+                if self.feature_importance == 'soft-c-tf-idf':
+                    components = soft_ctf_idf(
+                        t_doc_topic_matrix,
+                        t_doc_term_matrix
+                    )
+                elif self.feature_importance == 'c-tf-idf':
+                    components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix)
+            elif self.feature_importance == 'centroids':
+                t_labels = self.labels_[time_labels == i_timebin]
+                t_embeddings = embeddings[time_labels == i_timebin]  # type: ignore
+                t_topic_vectors = calculate_topic_vectors(t_labels, t_embeddings)
+                t_vocab_embeddings = self.encoder_.encode(
+                    self.vectorizer.get_feature_names_out()
+                )
+                components = cluster_centroid_distance(
+                    t_topic_vectors,
+                    t_vocab_embeddings,
+                    metric="cosine",
+                )
+            temporal_components.append(components)
+            temporal_importances.append(topic_importances)
+        self.temporal_components_ = np.stack(temporal_components)
+        self.temporal_importance_ = np.stack(temporal_importances)
+        return doc_topic_matrix
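A quick usage sketch for the API introduced in patch 1 (illustrative only, not part of the series: the corpus, timestamps, and bin count below are made up, and whether the default clustering finds stable topics on a corpus this small is not guaranteed):

```python
from datetime import datetime

import numpy as np
from turftopic import ClusteringTopicModel

# Tiny synthetic corpus: two themes scattered over three years.
corpus = ["space probe reaches orbit", "new vaccine trial results"] * 20
timestamps = [
    datetime(2020 + np.random.randint(0, 3), np.random.randint(1, 13), 1)
    for _ in corpus
]

model = ClusteringTopicModel(feature_importance="soft-c-tf-idf")
doc_topic_matrix = model.fit_transform_dynamic(corpus, timestamps, bins=3)
# One term-importance matrix per time bin:
print(model.temporal_components_.shape)  # expected (n_bins, n_topics, vocab_size)
```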
From d6147c8025fc2281ebb0a179a1a356f45476992e Mon Sep 17 00:00:00 2001
From: rbroc
Date: Tue, 19 Mar 2024 16:59:06 +0100
Subject: [PATCH 2/9] inherit from DynamicTopicModel

---
 turftopic/models/cluster.py | 2 +-
 turftopic/models/decomp.py  | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 5cecf41..e760703 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -77,7 +77,7 @@ def calculate_topic_vectors(
     return centroids
 
 
-class ClusteringTopicModel(ContextualModel, ClusterMixin):
+class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
     """Topic models, which assume topics to
     be clusters of documents in semantic space.
     Models also include a dimensionality reduction step to aid clustering.
diff --git a/turftopic/models/decomp.py b/turftopic/models/decomp.py
index c727490..8b2e628 100644
--- a/turftopic/models/decomp.py
+++ b/turftopic/models/decomp.py
@@ -47,6 +47,7 @@ def __init__(
         ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         objective: Literal["orthogonality", "independence"] = "independence",
+        subset: str = None
     ):
         self.n_components = n_components
         self.encoder = encoder
@@ -63,6 +64,7 @@ def __init__(
             self.decomposition = FastICA(n_components)
         else:
             self.decomposition = PCA(n_components)
+        self.subset = subset
 
     def fit_transform(
         self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
@@ -88,6 +90,12 @@ def fit_transform(
         console.log("Model fitting done.")
         return doc_topic
 
+    def abs_components(self, subset):
+        if subset == 'abs':
+            self.components_ = np.abs(self.components_)
+        if subset == 'neg':
+            self.components_ = -self.components_
+
     def transform(
         self, raw_documents, embeddings: Optional[np.ndarray] = None
     ) -> np.ndarray:

From ca5f9032c81af2104c5f18d4b2c59e70532a8c88 Mon Sep 17 00:00:00 2001
From: rbroc
Date: Tue, 19 Mar 2024 17:02:45 +0100
Subject: [PATCH 3/9] add ValueError if feature_importance is not a valid value

---
 turftopic/models/cluster.py | 8 ++++++++
 turftopic/models/decomp.py  | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index e760703..c20c5f2 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -35,6 +35,10 @@
 ClusteringTopicModel(n_reduce_to=10)
 """
 
+feature_message = """
+feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'centroids'
+"""
+
 
 def smallest_hierarchical_join(
     topic_vectors: np.ndarray,
@@ -140,6 +144,10 @@ def __init__(
         reduction_method: Literal["agglomerative", "smallest"] = "agglomerative",
     ):
         self.encoder = encoder
+        if feature_importance not in ["c-tf-idf",
+                                      "soft-c-tf-idf",
+                                      "centroid"]:
+            raise ValueError(feature_message)
         if isinstance(encoder, int):
             raise TypeError(integer_message)
         if isinstance(encoder, str):
diff --git a/turftopic/models/decomp.py b/turftopic/models/decomp.py
index 8b2e628..c727490 100644
--- a/turftopic/models/decomp.py
+++ b/turftopic/models/decomp.py
@@ -47,7 +47,6 @@ def __init__(
         ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         objective: Literal["orthogonality", "independence"] = "independence",
-        subset: str = None
     ):
         self.n_components = n_components
         self.encoder = encoder
@@ -63,7 +64,6 @@ def __init__(
             self.decomposition = FastICA(n_components)
         else:
             self.decomposition = PCA(n_components)
-        self.subset = subset
 
     def fit_transform(
         self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
@@ -88,12 +88,6 @@ def fit_transform(
         console.log("Model fitting done.")
         return doc_topic
 
-    def abs_components(self, subset):
-        if subset == 'abs':
-            self.components_ = np.abs(self.components_)
-        if subset == 'neg':
-            self.components_ = -self.components_
-
     def transform(
         self, raw_documents, embeddings: Optional[np.ndarray] = None
     ) -> np.ndarray:
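The guard added in patch 3 surfaces configuration mistakes at construction time rather than deep inside fitting. A quick sketch of the intended behaviour (illustrative, not from the patch); note that at this point in the series the error message still advertises 'centroids' while the check accepts 'centroid' — patch 5 reconciles the two:

```python
from turftopic import ClusteringTopicModel

try:
    ClusteringTopicModel(feature_importance="tf-idf")  # not a supported option
except ValueError as err:
    print(err)  # feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', ...
```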
From 389f3e9586cd1529c367eb238c286332a944c339 Mon Sep 17 00:00:00 2001
From: rbroc
Date: Tue, 19 Mar 2024 17:35:35 +0100
Subject: [PATCH 4/9] mask temporal_components according to whether a word is
 present in the time slice

---
 turftopic/models/cluster.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index c20c5f2..2fd7ba6 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -313,11 +313,12 @@ fit_transform_dynamic(
                 doc_topic_matrix = label_binarize(self.labels_, classes=self.classes_)
             else:
                 doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
+
             topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
             topic_importances = topic_importances / topic_importances.sum()
+            t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin]
+            t_doc_topic_matrix = doc_topic_matrix[time_labels == i_timebin]
             if "c-tf-idf" in self.feature_importance:
-                t_doc_topic_matrix = doc_topic_matrix[time_labels == i_timebin]
-                t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin]
                 if self.feature_importance == 'soft-c-tf-idf':
                     components = soft_ctf_idf(
                         t_doc_topic_matrix,
@@ -327,16 +328,15 @@ fit_transform_dynamic(
                 elif self.feature_importance == 'c-tf-idf':
                     components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix)
             elif self.feature_importance == 'centroids':
                 t_labels = self.labels_[time_labels == i_timebin]
-                t_embeddings = embeddings[time_labels == i_timebin]  # type: ignore
+                t_embeddings = embeddings[time_labels == i_timebin]
                 t_topic_vectors = calculate_topic_vectors(t_labels, t_embeddings)
-                t_vocab_embeddings = self.encoder_.encode(
-                    self.vectorizer.get_feature_names_out()
-                )
                 components = cluster_centroid_distance(
                     t_topic_vectors,
-                    t_vocab_embeddings,
+                    self.vocab_embeddings,
                     metric="cosine",
                 )
+                mask = t_doc_term_matrix.sum(axis=0)
+                components = components * mask
             temporal_components.append(components)
             temporal_importances.append(topic_importances)
         self.temporal_components_ = np.stack(temporal_components)
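The term mask introduced above can be illustrated in isolation. A standalone sketch with toy arrays (the names and shapes are made up): terms absent from a time slice get zero importance, while present terms are additionally scaled by their slice frequency — a side effect that patch 5 later replaces with explicit NaN masking:

```python
import numpy as np

components = np.ones((2, 4))  # (n_topics, n_terms) centroid-based importances
t_doc_term_matrix = np.array(
    [[1, 0, 2, 0],
     [3, 0, 1, 0]]
)  # term counts for the documents in one time slice
mask = t_doc_term_matrix.sum(axis=0)  # [4, 0, 3, 0]
print(components * mask)  # columns for unseen terms become 0
```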
From a9a015e39e6b6675f678fba96754c904ea6a3d02 Mon Sep 17 00:00:00 2001
From: rbroc
Date: Tue, 19 Mar 2024 19:23:08 +0100
Subject: [PATCH 5/9] NA centroids and terms when no documents are available
 for a topic within a time bin

---
 turftopic/models/cluster.py | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 2fd7ba6..f442edf 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -35,8 +35,8 @@
 ClusteringTopicModel(n_reduce_to=10)
 """
 
-feature_message = """
-feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'centroids'
+feature_message = """
+feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'centroid'
 """
 
 
@@ -68,14 +68,19 @@ def smallest_hierarchical_join(
 
 
 def calculate_topic_vectors(
-    cluster_labels: np.ndarray, embeddings: np.ndarray
+    cluster_labels: np.ndarray, embeddings: np.ndarray,
+    time_index: Optional[np.ndarray] = None,
 ) -> np.ndarray:
     """Calculates topic centroids."""
     centroids = []
     unique_labels = np.unique(cluster_labels)
     unique_labels = np.sort(unique_labels)
     for label in unique_labels:
-        centroid = np.mean(embeddings[cluster_labels == label], axis=0)
+        label_index = cluster_labels == label
+        if time_index is not None:
+            label_index = label_index * time_index
+        label_embeddings = embeddings[label_index]
+        centroid = np.mean(label_embeddings, axis=0)
         centroids.append(centroid)
     centroids = np.stack(centroids)
     return centroids
@@ -169,6 +174,7 @@ def __init__(
         self.feature_importance = feature_importance
         self.n_reduce_to = n_reduce_to
         self.reduction_method = reduction_method
+        self.components_ = None
 
     def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
         n_topics = self.components_.shape[0]
@@ -309,11 +315,10 @@ fit_transform_dynamic(
         if embeddings is None:
             embeddings = self.encoder_.encode(raw_documents)
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
-            if self.labels_ is not None:
+            if self.components_ is not None:
                 doc_topic_matrix = label_binarize(self.labels_, classes=self.classes_)
             else:
                 doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
-
             topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
             topic_importances = topic_importances / topic_importances.sum()
             t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin]
@@ -326,17 +331,24 @@ fit_transform_dynamic(
                 elif self.feature_importance == 'c-tf-idf':
                     components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix)
-            elif self.feature_importance == 'centroids':
-                t_labels = self.labels_[time_labels == i_timebin]
-                t_embeddings = embeddings[time_labels == i_timebin]
-                t_topic_vectors = calculate_topic_vectors(t_labels, t_embeddings)
+            elif self.feature_importance == 'centroid':
+                time_index = time_labels == i_timebin
+                t_topic_vectors = calculate_topic_vectors(
+                    self.labels_, embeddings, time_index,
+                )
+                topic_mask = np.isnan(t_topic_vectors).all(
+                    axis=1, keepdims=True
+                )
+                # zero out all-NaN centroids so the distance computation
+                # stays finite; their rows are NaN-ed below
+                t_topic_vectors[np.squeeze(topic_mask), :] = 0
                 components = cluster_centroid_distance(
                     t_topic_vectors,
                     self.vocab_embeddings,
                     metric="cosine",
                 )
-                mask = t_doc_term_matrix.sum(axis=0)
-                components = components * mask
+                # flag topics with no documents in this bin as NaN
+                components[np.squeeze(topic_mask), :] = np.nan
+                mask_terms = t_doc_term_matrix.sum(axis=0).astype(np.float64)
+                mask_terms[mask_terms == 0] = np.nan
+                components *= mask_terms
             temporal_components.append(components)
             temporal_importances.append(topic_importances)
         self.temporal_components_ = np.stack(temporal_components)
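The empty-topic handling above relies on `np.mean` over an empty selection returning NaN. A toy check of that path (standalone, with made-up arrays):

```python
import numpy as np

labels = np.array([0, 0, 1])
embeddings = np.random.rand(3, 4)
time_index = np.array([True, True, False])  # documents inside the current bin

label_index = (labels == 1) * time_index  # topic 1 has no documents in the bin
centroid = np.mean(embeddings[label_index], axis=0)  # empty mean -> NaN (with a RuntimeWarning)
print(np.isnan(centroid).all())  # True: the topic's row is flagged and NaN-ed downstream
```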
From e79d836c5aa793acbea09a9c9c0da1a716c15f55 Mon Sep 17 00:00:00 2001
From: rbroc
Date: Tue, 19 Mar 2024 19:23:26 +0100
Subject: [PATCH 6/9] add tests for dynamic models

---
 tests/test_integration.py  | 48 ++++++++++++++++++++++++++++++++++++-
 turftopic/encoders/utils.py | 12 ++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 turftopic/encoders/utils.py

diff --git a/tests/test_integration.py b/tests/test_integration.py
index ee31f30..da92554 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 import tempfile
 from pathlib import Path
 
@@ -12,9 +13,24 @@
     AutoEncodingTopicModel,
     ClusteringTopicModel,
     KeyNMF,
-    SemanticSignalSeparation,
+    SemanticSignalSeparation
 )
 
+
+def generate_dates(
+    n_dates: int,
+) -> list[datetime]:
+    """ Generate random dates to test dynamic models """
+    dates = []
+    for n in range(n_dates):
+        d = np.random.randint(low=1, high=29)
+        m = np.random.randint(low=1, high=13)
+        y = np.random.randint(low=2000, high=2020)
+        date = datetime(year=y, month=m, day=d)
+        dates.append(date)
+    return dates
+
+
 newsgroups = fetch_20newsgroups(
     subset="all",
     categories=[
@@ -25,6 +41,7 @@
 texts = newsgroups.data
 trf = SentenceTransformer("all-MiniLM-L6-v2")
 embeddings = np.asarray(trf.encode(texts))
+timestamps = generate_dates(n_dates=len(texts))
 
 models = [
     GMM(5, encoder=trf),
@@ -46,6 +63,22 @@
     AutoEncodingTopicModel(5, combined=True),
 ]
 
+dynamic_models = [
+    GMM(5, encoder=trf),
+    ClusteringTopicModel(
+        n_reduce_to=5,
+        feature_importance="centroid",
+        encoder=trf,
+        reduction_method="smallest"
+    ),
+    ClusteringTopicModel(
+        n_reduce_to=5,
+        feature_importance="soft-c-tf-idf",
+        encoder=trf,
+        reduction_method="smallest"
+    )
+]
+
 
 @pytest.mark.parametrize("model", models)
 def test_fit_export_table(model):
@@ -56,3 +89,16 @@ def test_fit_export_table(model):
     with out_path.open("w") as out_file:
         out_file.write(table)
     df = pd.read_csv(out_path)
+
+
+@pytest.mark.parametrize("model", dynamic_models)
+def test_fit_dynamic(model):
+    doc_topic_matrix = model.fit_transform_dynamic(
+        texts, embeddings=embeddings, timestamps=timestamps,
+    )
+    table = model.export_topics(format="csv")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        out_path = Path(tmpdirname).joinpath("topics.csv")
+        with out_path.open("w") as out_file:
+            out_file.write(table)
+        df = pd.read_csv(out_path)
diff --git a/turftopic/encoders/utils.py b/turftopic/encoders/utils.py
new file mode 100644
index 0000000..4a527f4
--- /dev/null
+++ b/turftopic/encoders/utils.py
@@ -0,0 +1,12 @@
+import itertools
+from typing import Iterable, List
+
+
+def batched(iterable, n: int) -> Iterable[List[str]]:
+    "Batch data into lists of length n. The last batch may be shorter."
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError("n must be at least one")
+    it = iter(iterable)
+    while batch := list(itertools.islice(it, n)):
+        yield batch
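The new `batched` helper is the well-known itertools recipe (the walrus operator requires Python 3.8+), except that it yields lists rather than tuples. An illustrative snippet, not from the patch — useful e.g. for chunking a corpus into batches of embedding requests:

```python
from turftopic.encoders.utils import batched

for batch in batched(["doc1", "doc2", "doc3", "doc4", "doc5"], n=2):
    print(batch)
# ['doc1', 'doc2']
# ['doc3', 'doc4']
# ['doc5']
```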
From 22129ff65dac33f236a344d7d80361d1296b1175 Mon Sep 17 00:00:00 2001
From: rbroc
Date: Wed, 20 Mar 2024 09:54:25 +0100
Subject: [PATCH 7/9] add reference to dynamic modeling in docs

---
 docs/clustering.md | 5 +++++
 docs/dynamic.md    | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/clustering.md b/docs/clustering.md
index a7c2a55..58df63f 100644
--- a/docs/clustering.md
+++ b/docs/clustering.md
@@ -188,6 +188,11 @@ top2vec = ClusteringTopicModel(
 Theoretically the model descriptions above should result in the same behaviour as the other two packages, but there might be minor changes in implementation. We do not intend to keep up with changes in Top2Vec's and BERTopic's internal implementation details indefinitely.
 
+### _(Optional)_ 5. Dynamic Modeling
+
+Clustering models are also capable of dynamic topic modeling. This happens by fitting a clustering model over the entire corpus, as we expect that there is only one semantic model generating the documents.
+To gain temporal representations for topics, the corpus is divided into equal time slices (or time bins with user-defined edges), and term importances are then estimated separately for each slice using Soft-c-TF-IDF, c-TF-IDF, or distance from the cluster centroids. When centroid distance is used to estimate term importances in dynamic modeling, the centroids are computed from only the documents and terms present within a given time slice.
+
 ## Considerations
 
 ### Strengths
diff --git a/docs/dynamic.md b/docs/dynamic.md
index 9e90628..3120d95 100644
--- a/docs/dynamic.md
+++ b/docs/dynamic.md
@@ -28,7 +28,7 @@ Dynamic topic models in Turftopic have a unified interface.
 To fit a dynamic topic model you will need a corpus, that has been annotated with timestamps.
 The timestamps need to be Python `datetime` objects, but pandas `Timestamp` object are also supported.
 
-Models that have dynamic modeling capabilities have a `fit_transform_dynamic()` method, that fits the model on the corpus over time.
+Models that have dynamic modeling capabilities (currently, `GMM` and `ClusteringTopicModel`) have a `fit_transform_dynamic()` method that fits the model on the corpus over time.
 
 ```python
 from datetime import datetime
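In the spirit of the docs/dynamic.md section referenced above, fitting and inspecting a dynamic model looks roughly like this (a sketch with a toy corpus; real corpora should be much larger, and `print_topics_over_time` is the pretty-printer shown in the turftopic/dynamic.py hunks below):

```python
from datetime import datetime
from turftopic import GMM

corpus = [
    "stock markets rallied on friday",
    "the senate passed the budget bill",
    "championship game went to overtime",
    "new telescope images released",
] * 5
timestamps = [datetime(2018 + i % 4, 1, 1) for i in range(len(corpus))]

model = GMM(3)
document_topic_matrix = model.fit_transform_dynamic(
    corpus, timestamps=timestamps, bins=4
)
model.print_topics_over_time()
```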
From 4fcab17381980547c801d9bfea23923333be882d Mon Sep 17 00:00:00 2001
From: rbroc
Date: Wed, 20 Mar 2024 16:10:13 +0100
Subject: [PATCH 8/9] linting

---
 tests/test_integration.py      | 34 ++++++++++---------
 turftopic/base.py              | 36 +++++++++++++++-----
 turftopic/dynamic.py           |  4 ++-
 turftopic/encoders/__init__.py |  2 +-
 turftopic/models/cluster.py    | 60 ++++++++++++++++++++++------------
 turftopic/models/gmm.py        | 20 +++++++++---
 turftopic/models/keynmf.py     |  6 ++--
 7 files changed, 107 insertions(+), 55 deletions(-)

diff --git a/tests/test_integration.py b/tests/test_integration.py
index da92554..e3d92c4 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -13,22 +13,22 @@
     AutoEncodingTopicModel,
     ClusteringTopicModel,
     KeyNMF,
-    SemanticSignalSeparation
+    SemanticSignalSeparation,
 )
 
 
 def generate_dates(
-    n_dates: int,
+    n_dates: int,
 ) -> list[datetime]:
-    """ Generate random dates to test dynamic models """
-    dates = []
-    for n in range(n_dates):
-        d = np.random.randint(low=1, high=29)
-        m = np.random.randint(low=1, high=13)
-        y = np.random.randint(low=2000, high=2020)
-        date = datetime(year=y, month=m, day=d)
-        dates.append(date)
-    return dates
+    """Generate random dates to test dynamic models"""
+    dates = []
+    for n in range(n_dates):
+        d = np.random.randint(low=1, high=29)
+        m = np.random.randint(low=1, high=13)
+        y = np.random.randint(low=2000, high=2020)
+        date = datetime(year=y, month=m, day=d)
+        dates.append(date)
+    return dates
@@ -46,8 +46,8 @@ def generate_dates(
 models = [
     GMM(5, encoder=trf),
     SemanticSignalSeparation(5, encoder=trf),
-    KeyNMF(5, encoder=trf, keyword_scope='document'),
-    KeyNMF(5, encoder=trf, keyword_scope='corpus'),
+    KeyNMF(5, encoder=trf, keyword_scope="document"),
+    KeyNMF(5, encoder=trf, keyword_scope="corpus"),
     ClusteringTopicModel(
         n_reduce_to=5,
         feature_importance="c-tf-idf",
@@ -69,14 +69,14 @@ def generate_dates(
         n_reduce_to=5,
         feature_importance="centroid",
         encoder=trf,
-        reduction_method="smallest"
+        reduction_method="smallest",
     ),
     ClusteringTopicModel(
         n_reduce_to=5,
         feature_importance="soft-c-tf-idf",
         encoder=trf,
         reduction_method="smallest"
-    )
+    ),
 ]
@@ -94,7 +94,9 @@ def test_fit_export_table(model):
 @pytest.mark.parametrize("model", dynamic_models)
 def test_fit_dynamic(model):
     doc_topic_matrix = model.fit_transform_dynamic(
-        texts, embeddings=embeddings, timestamps=timestamps,
+        texts,
+        embeddings=embeddings,
+        timestamps=timestamps,
     )
     table = model.export_topics(format="csv")
     with tempfile.TemporaryDirectory() as tmpdirname:
diff --git a/turftopic/base.py b/turftopic/base.py
index e0cd883..25b1ec7 100644
--- a/turftopic/base.py
+++ b/turftopic/base.py
@@ -23,7 +23,9 @@ def remove_whitespace(text: str) -> str:
 class ContextualModel(ABC, TransformerMixin, BaseEstimator):
     """Base class for contextual topic models in Turftopic."""
 
-    def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]]]]:
+    def get_topics(
+        self, top_k: int = 10
+    ) -> List[Tuple[Any, List[Tuple[str, float]]]]:
         """Returns high-level topic representations in form of the top K
         words in each topic.
@@ -135,8 +137,12 @@ def _highest_ranking_docs(
         except AttributeError:
             pass
         kth = min(top_k, document_topic_matrix.shape[0] - 1)
-        highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[:kth]
-        highest = highest[np.argsort(-document_topic_matrix[highest, topic_id])]
+        highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[
+            :kth
+        ]
+        highest = highest[
+            np.argsort(-document_topic_matrix[highest, topic_id])
+        ]
         scores = document_topic_matrix[highest, topic_id]
         columns = []
         columns.append("Document")
@@ -171,7 +177,9 @@ def print_highest_ranking_documents(
             topic_id, raw_documents, document_topic_matrix, top_k
         )
         table = Table(show_lines=True)
-        table.add_column("Document", justify="left", style="magenta", max_width=100)
+        table.add_column(
+            "Document", justify="left", style="magenta", max_width=100
+        )
         table.add_column("Score", style="blue", justify="right")
         for row in rows:
             table.add_row(*row)
@@ -223,7 +231,9 @@ def _topic_distribution(
     ) -> list[list[str]]:
         if topic_dist is None:
             if text is None:
-                raise ValueError("You should either pass a text or a distribution.")
+                raise ValueError(
+                    "You should either pass a text or a distribution."
+                )
             try:
                 topic_dist = self.transform([text])
             except AttributeError:
@@ -248,7 +258,9 @@ def _topic_distribution(
             rows.append([topic_names[ind], f"{score:.2f}"])
         return [columns, *rows]
 
-    def print_topic_distribution(self, text=None, topic_dist=None, top_k: int = 10):
+    def print_topic_distribution(
+        self, text=None, topic_dist=None, top_k: int = 10
+    ):
         """Pretty prints topic distribution in a document.
 
         Parameters
@@ -330,7 +342,9 @@ def fit_transform(
         """
         pass
 
-    def fit(self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None):
+    def fit(
+        self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
+    ):
         """Fits model on the given corpus.
 
         Parameters
@@ -396,9 +410,13 @@ def prepare_topic_data(
         if embeddings is None:
             embeddings = self.encode_documents(corpus)
         try:
-            document_topic_matrix = self.transform(corpus, embeddings=embeddings)
+            document_topic_matrix = self.transform(
+                corpus, embeddings=embeddings
+            )
         except (AttributeError, NotFittedError):
-            document_topic_matrix = self.fit_transform(corpus, embeddings=embeddings)
+            document_topic_matrix = self.fit_transform(
+                corpus, embeddings=embeddings
+            )
         dtm = self.vectorizer.transform(corpus)  # type: ignore
         res: TopicData = {
             "corpus": corpus,
diff --git a/turftopic/dynamic.py b/turftopic/dynamic.py
index d363580..8595157 100644
--- a/turftopic/dynamic.py
+++ b/turftopic/dynamic.py
@@ -199,7 +199,9 @@ def print_topics_over_time(
         show_scores: bool, default False
             Indicates whether to show importance scores for each word.
""" - columns, *rows = self._topics_over_time(top_k, show_scores, date_format) + columns, *rows = self._topics_over_time( + top_k, show_scores, date_format + ) table = Table(show_lines=True) for column in columns: table.add_column(column) diff --git a/turftopic/encoders/__init__.py b/turftopic/encoders/__init__.py index 34492d6..6797d16 100644 --- a/turftopic/encoders/__init__.py +++ b/turftopic/encoders/__init__.py @@ -9,5 +9,5 @@ "OpenAIEmbeddings", "VoyageEmbeddings", "ExternalEncoder", - "E5Encoder" + "E5Encoder", ] diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index f442edf..4e1ab22 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -53,7 +53,9 @@ def smallest_hierarchical_join( classes = list(classes_) while len(classes) > n_to: smallest = np.argmin(topic_sizes) - dist = cosine_distances(np.atleast_2d(topic_vectors[smallest]), topic_vectors) + dist = cosine_distances( + np.atleast_2d(topic_vectors[smallest]), topic_vectors + ) closest = np.argsort(dist[0])[1] merge_inst.append((classes[smallest], classes[closest])) classes.pop(smallest) @@ -68,7 +70,8 @@ def smallest_hierarchical_join( def calculate_topic_vectors( - cluster_labels: np.ndarray, embeddings: np.ndarray, + cluster_labels: np.ndarray, + embeddings: np.ndarray, time_index: Optional[np.ndarray] = None, ) -> np.ndarray: """Calculates topic centroids.""" @@ -138,7 +141,9 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel): def __init__( self, - encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2", + encoder: Union[ + Encoder, str + ] = "sentence-transformers/all-MiniLM-L6-v2", vectorizer: Optional[CountVectorizer] = None, dimensionality_reduction: Optional[TransformerMixin] = None, clustering: Optional[ClusterMixin] = None, @@ -146,12 +151,12 @@ def __init__( "c-tf-idf", "soft-c-tf-idf", "centroid" ] = "soft-c-tf-idf", n_reduce_to: Optional[int] = None, - reduction_method: Literal["agglomerative", "smallest"] = "agglomerative", + reduction_method: Literal[ + "agglomerative", "smallest" + ] = "agglomerative", ): self.encoder = encoder - if feature_importance not in ["c-tf-idf", - "soft-c-tf-idf", - "centroid"]: + if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]: raise ValueError(feature_message) if isinstance(encoder, int): raise TypeError(integer_message) @@ -168,7 +173,9 @@ def __init__( else: self.clustering = clustering if dimensionality_reduction is None: - self.dimensionality_reduction = TSNE(n_components=2, metric="cosine") + self.dimensionality_reduction = TSNE( + n_components=2, metric="cosine" + ) else: self.dimensionality_reduction = dimensionality_reduction self.feature_importance = feature_importance @@ -225,7 +232,9 @@ def _estimate_parameters( self.vocab_embeddings = self.encoder_.encode( self.vectorizer.get_feature_names_out() ) # type: ignore - document_topic_matrix = label_binarize(self.labels_, classes=self.classes_) + document_topic_matrix = label_binarize( + self.labels_, classes=self.classes_ + ) if self.feature_importance == "soft-c-tf-idf": self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix) # type: ignore elif self.feature_importance == "centroid": @@ -266,7 +275,9 @@ def fit_predict( self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents) console.log("Term extraction done.") status.update("Reducing Dimensionality") - reduced_embeddings = self.dimensionality_reduction.fit_transform(embeddings) + reduced_embeddings = 
+        reduced_embeddings = self.dimensionality_reduction.fit_transform(
+            embeddings
+        )
         console.log("Dimensionality reduction done.")
         status.update("Clustering documents")
         self.labels_ = self.clustering.fit_predict(reduced_embeddings)
@@ -279,7 +290,9 @@ def fit_predict(
         console.log("Parameter estimation done.")
         if self.n_reduce_to is not None:
             n_topics = self.classes_.shape[0]
-            status.update(f"Reducing topics from {n_topics} to {self.n_reduce_to}")
+            status.update(
+                f"Reducing topics from {n_topics} to {self.n_reduce_to}"
+            )
             if self.reduction_method == "agglomerative":
                 self.labels_ = self._merge_agglomerative(self.n_reduce_to)
             else:
@@ -316,25 +329,32 @@ def fit_transform_dynamic(
             embeddings = self.encoder_.encode(raw_documents)
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
             if self.components_ is not None:
-                doc_topic_matrix = label_binarize(self.labels_, classes=self.classes_)
+                doc_topic_matrix = label_binarize(
+                    self.labels_, classes=self.classes_
+                )
             else:
-                doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
-            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
+                doc_topic_matrix = self.fit_transform(
+                    raw_documents, embeddings=embeddings
+                )
+            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
+                axis=0
+            )
             topic_importances = topic_importances / topic_importances.sum()
             t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin]
             t_doc_topic_matrix = doc_topic_matrix[time_labels == i_timebin]
             if "c-tf-idf" in self.feature_importance:
-                if self.feature_importance == 'soft-c-tf-idf':
+                if self.feature_importance == "soft-c-tf-idf":
                     components = soft_ctf_idf(
-                        t_doc_topic_matrix,
-                        t_doc_term_matrix
+                        t_doc_topic_matrix, t_doc_term_matrix
                     )
-                elif self.feature_importance == 'c-tf-idf':
+                elif self.feature_importance == "c-tf-idf":
                     components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix)
-            elif self.feature_importance == 'centroid':
+            elif self.feature_importance == "centroid":
                 time_index = time_labels == i_timebin
                 t_topic_vectors = calculate_topic_vectors(
-                    self.labels_, embeddings, time_index,
+                    self.labels_,
+                    embeddings,
+                    time_index,
                 )
                 topic_mask = np.isnan(t_topic_vectors).all(
                     axis=1, keepdims=True
diff --git a/turftopic/models/gmm.py b/turftopic/models/gmm.py
index 9764440..ef34403 100644
--- a/turftopic/models/gmm.py
+++ b/turftopic/models/gmm.py
@@ -64,7 +64,9 @@ class GMM(ContextualModel, DynamicTopicModel):
     def __init__(
         self,
         n_components: int,
-        encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2",
+        encoder: Union[
+            Encoder, str
+        ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         dimensionality_reduction: Optional[TransformerMixin] = None,
         weight_prior: Literal["dirichlet", "dirichlet_process", None] = None,
@@ -118,7 +120,9 @@ def fit_transform(
         console.log("Mixture model fitted.")
         status.update("Estimating term importances.")
         document_topic_matrix = self.gmm_.predict_proba(embeddings)
-        self.components_ = soft_ctf_idf(document_topic_matrix, document_term_matrix)
+        self.components_ = soft_ctf_idf(
+            document_topic_matrix, document_term_matrix
+        )
         console.log("Model fitting done.")
         return document_topic_matrix
 
@@ -160,14 +164,20 @@ def fit_transform_dynamic(
     ):
         time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
         if self.components_ is not None:
-            doc_topic_matrix = self.transform(raw_documents, embeddings=embeddings)
+            doc_topic_matrix = self.transform(
+                raw_documents, embeddings=embeddings
+            )
         else:
-            doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
+            doc_topic_matrix = self.fit_transform(
+                raw_documents, embeddings=embeddings
+            )
         document_term_matrix = self.vectorizer.transform(raw_documents)
         temporal_components = []
         temporal_importances = []
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
-            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
+            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
+                axis=0
+            )
             # Normalizing
             topic_importances = topic_importances / topic_importances.sum()
             components = soft_ctf_idf(
diff --git a/turftopic/models/keynmf.py b/turftopic/models/keynmf.py
index 513c2e9..7d74834 100644
--- a/turftopic/models/keynmf.py
+++ b/turftopic/models/keynmf.py
@@ -89,9 +89,9 @@ def __init__(
         ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         top_n: int = 25,
-        keyword_scope: str = 'document',
+        keyword_scope: str = "document",
     ):
-        if keyword_scope not in ['document', 'corpus']:
+        if keyword_scope not in ["document", "corpus"]:
             raise ValueError("keyword_scope must be 'document' or 'corpus'")
         self.n_components = n_components
         self.top_n = top_n
@@ -123,7 +123,7 @@ def extract_keywords(
         for i in range(total):
             terms = document_term_matrix[i, :].todense()
             embedding = embeddings[i].reshape(1, -1)
-            if self.keyword_scope == 'document':
+            if self.keyword_scope == "document":
                 mask = terms > 0
             else:
                 tot_freq = document_term_matrix.sum(axis=0)

From 21944ddb20aa4bed006bd11eaaa87f6179a4df0a Mon Sep 17 00:00:00 2001
From: rbroc
Date: Wed, 20 Mar 2024 16:12:08 +0100
Subject: [PATCH 9/9] use hasattr to check if model has been fitted

---
 turftopic/models/cluster.py | 3 +--
 turftopic/models/gmm.py     | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 4e1ab22..092c359 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -181,7 +181,6 @@ def __init__(
         self.feature_importance = feature_importance
         self.n_reduce_to = n_reduce_to
         self.reduction_method = reduction_method
-        self.components_ = None
 
     def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
         n_topics = self.components_.shape[0]
@@ -328,7 +327,7 @@ def fit_transform_dynamic(
         if embeddings is None:
             embeddings = self.encoder_.encode(raw_documents)
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
-            if self.components_ is not None:
+            if hasattr(self, 'components_'):
                 doc_topic_matrix = label_binarize(
                     self.labels_, classes=self.classes_
                 )
diff --git a/turftopic/models/gmm.py b/turftopic/models/gmm.py
index ef34403..854fa34 100644
--- a/turftopic/models/gmm.py
+++ b/turftopic/models/gmm.py
@@ -101,7 +101,6 @@ def __init__(
             self.gmm_ = make_pipeline(dimensionality_reduction, mixture)
         else:
             self.gmm_ = mixture
-        self.components_ = None
 
     def fit_transform(
         self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
@@ -163,7 +162,7 @@ def fit_transform_dynamic(
         bins: Union[int, list[datetime]] = 10,
     ):
         time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
-        if self.components_ is not None:
+        if hasattr(self, 'components_'):
             doc_topic_matrix = self.transform(
                 raw_documents, embeddings=embeddings
             )
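A closing note on patch 9: dropping `self.components_ = None` from `__init__` follows scikit-learn's convention that trailing-underscore attributes exist only after fitting, so `hasattr` doubles as the fitted check. A minimal sketch of the pattern (illustrative, not from the series):

```python
from turftopic import GMM

model = GMM(5)
print(hasattr(model, "components_"))  # False: nothing fitted yet
# fit_transform_dynamic therefore falls back to fit_transform on the first
# call, and reuses the existing fit (via transform) on subsequent calls.
```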