From 6d5ac3356ccd5989bca717e12b722a9b6f761dbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Thu, 4 Apr 2024 11:09:18 +0200
Subject: [PATCH 1/5] Removed orthogonality objective, added max_iter and
 random_state to S3

---
 turftopic/models/decomp.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/turftopic/models/decomp.py b/turftopic/models/decomp.py
index d29d119..ad718c5 100644
--- a/turftopic/models/decomp.py
+++ b/turftopic/models/decomp.py
@@ -20,7 +20,7 @@ class SemanticSignalSeparation(ContextualModel):
 
     corpus: list[str] = ["some text", "more text", ...]
 
-    model = SemanticSignalSeparation(10, objective="independence").fit(corpus)
+    model = SemanticSignalSeparation(10).fit(corpus)
     model.print_topics()
     ```
 
@@ -33,10 +33,10 @@ class SemanticSignalSeparation(ContextualModel):
     vectorizer: CountVectorizer, default None
         Vectorizer used for term extraction.
         Can be used to prune or filter the vocabulary.
-    objective: 'orthogonality' or 'independence', default 'independence'
-        Indicates what the components should be optimized for.
-        When 'orthogonality', PCA is used to discover components,
-        when 'independence', ICA is used to discover components.
+    max_iter: int, default 200
+        Maximum number of iterations for ICA.
+    random_state: int, default None
+        Random state to use so that results are exactly reproducible.
     """
 
     def __init__(
@@ -46,7 +46,8 @@ def __init__(
         self,
         n_components: int,
         encoder: Union[
             Encoder, str
         ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
-        objective: Literal["orthogonality", "independence"] = "independence",
+        max_iter: int = 200,
+        random_state: Optional[int] = None,
     ):
         self.n_components = n_components
         self.encoder = encoder
@@ -58,11 +59,11 @@ def __init__(
             self.vectorizer = default_vectorizer()
         else:
             self.vectorizer = vectorizer
-        self.objective = objective
-        if objective == "independence":
-            self.decomposition = FastICA(n_components)
-        else:
-            self.decomposition = PCA(n_components)
+        self.max_iter = max_iter
+        self.random_state = random_state
+        self.decomposition = FastICA(
+            n_components, max_iter=max_iter, random_state=random_state
+        )
 
     def fit_transform(
         self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None

From 450184b2403c076a6e4c6e7f009c83999e0fbb13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Thu, 4 Apr 2024 11:23:13 +0200
Subject: [PATCH 2/5] Added random_state argument to all models so results are
 exactly reproducible.

---
 turftopic/models/cluster.py | 22 ++++++++++++++++++----
 turftopic/models/ctm.py     |  8 +++++++-
 turftopic/models/gmm.py     | 11 +++++++++--
 turftopic/models/keynmf.py  | 14 +++++++++++---
 4 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 092c359..b0c4c5c 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -137,6 +137,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
         The specified reduction method will be used to merge them.
         By default, topics are not merged.
     reduction_method: 'agglomerative', 'smallest'
+        Method used to reduce the number of topics post-hoc.
+        When 'agglomerative', BERTopic's topic reduction method is used,
+        where topic vectors are hierarchically clustered.
+        When 'smallest', the smallest topic gets merged into the closest
+        non-outlier cluster until the desired number
+        is achieved, similarly to Top2Vec.
+    random_state: int, default None
+        Random state to use so that results are exactly reproducible.
     """
 
     def __init__(
@@ -154,8 +162,10 @@ def __init__(
         reduction_method: Literal[
             "agglomerative", "smallest"
         ] = "agglomerative",
+        random_state: Optional[int] = None,
     ):
         self.encoder = encoder
+        self.random_state = random_state
         if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]:
             raise ValueError(feature_message)
         if isinstance(encoder, str):
@@ -174,7 +184,7 @@ def __init__(
         self.clustering = clustering
         if dimensionality_reduction is None:
             self.dimensionality_reduction = TSNE(
-                n_components=2, metric="cosine"
+                n_components=2, metric="cosine", random_state=random_state
             )
         else:
             self.dimensionality_reduction = dimensionality_reduction
@@ -196,7 +206,9 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
         )
         old_labels = [label for label in self.classes_ if label != -1]
         new_labels = AgglomerativeClustering(
-            n_clusters=n_reduce_to, metric="cosine", linkage="average"
+            n_clusters=n_reduce_to,
+            metric="cosine",
+            linkage="average",
         ).fit_predict(interesting_topic_vectors)
         res = {}
         if -1 in self.classes_:
@@ -235,7 +247,9 @@ def _estimate_parameters(
             self.labels_, classes=self.classes_
         )
         if self.feature_importance == "soft-c-tf-idf":
-            self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix)  # type: ignore
+            self.components_ = soft_ctf_idf(
+                document_topic_matrix, doc_term_matrix
+            )  # type: ignore
         elif self.feature_importance == "centroid":
             self.components_ = cluster_centroid_distance(
                 self.topic_vectors_,
@@ -327,7 +341,7 @@ def fit_transform_dynamic(
         if embeddings is None:
             embeddings = self.encoder_.encode(raw_documents)
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
-            if hasattr(self, 'components_'):
+            if hasattr(self, "components_"):
                 doc_topic_matrix = label_binarize(
                     self.labels_, classes=self.classes_
                 )
diff --git a/turftopic/models/ctm.py b/turftopic/models/ctm.py
index ba02e85..3a62ed7 100644
--- a/turftopic/models/ctm.py
+++ b/turftopic/models/ctm.py
@@ -1,4 +1,6 @@
 import math
+import random
+import sys
 from typing import Optional, Union
 
 import numpy as np
@@ -129,6 +131,8 @@ class AutoEncodingTopicModel(ContextualModel):
         Learning rate for the optimizer.
     n_epochs: int, default 50
         Number of epochs to run during training.
+    random_state: int, default None
+        Random state to use so that results are exactly reproducible.
     """
 
     def __init__(
@@ -144,8 +148,10 @@ def __init__(
         batch_size: int = 42,
         learning_rate: float = 1e-2,
         n_epochs: int = 50,
+        random_state: Optional[int] = None,
     ):
         self.n_components = n_components
+        self.random_state = random_state
         self.encoder = encoder
         if isinstance(encoder, str):
             self.encoder_ = SentenceTransformer(encoder)
@@ -205,7 +211,7 @@ def fit(
             status.update("Extracting terms.")
             document_term_matrix = self.vectorizer.fit_transform(raw_documents)
             console.log("Term extraction done.")
-            seed = 0
+            seed = self.random_state or random.randint(0, sys.maxint - 1)
             torch.manual_seed(seed)
             pyro.set_rng_seed(seed)
             device = torch.device(
diff --git a/turftopic/models/gmm.py b/turftopic/models/gmm.py
index 854fa34..5448c82 100644
--- a/turftopic/models/gmm.py
+++ b/turftopic/models/gmm.py
@@ -54,6 +54,8 @@ class GMM(ContextualModel, DynamicTopicModel):
         result in Gaussian components.
         For even larger datasets you can use IncrementalPCA to reduce
         memory load.
+    random_state: int, default None
+        Random state to use so that results are exactly reproducible.
 
     Attributes
     ----------
@@ -71,11 +73,13 @@ def __init__(
         dimensionality_reduction: Optional[TransformerMixin] = None,
         weight_prior: Literal["dirichlet", "dirichlet_process", None] = None,
         gamma: Optional[float] = None,
+        random_state: Optional[int] = None,
     ):
         self.n_components = n_components
         self.encoder = encoder
         self.weight_prior = weight_prior
         self.gamma = gamma
+        self.random_state = random_state
         if isinstance(encoder, str):
             self.encoder_ = SentenceTransformer(encoder)
         else:
@@ -94,9 +98,12 @@ def __init__(
                     else "dirichlet_process"
                 ),
                 weight_concentration_prior=gamma,
+                random_state=self.random_state,
             )
         else:
-            mixture = GaussianMixture(n_components)
+            mixture = GaussianMixture(
+                n_components, random_state=self.random_state
+            )
         if dimensionality_reduction is not None:
             self.gmm_ = make_pipeline(dimensionality_reduction, mixture)
         else:
@@ -162,7 +169,7 @@ def fit_transform_dynamic(
         bins: Union[int, list[datetime]] = 10,
     ):
         time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
-        if hasattr(self, 'components_'):
+        if hasattr(self, "components_"):
             doc_topic_matrix = self.transform(
                 raw_documents, embeddings=embeddings
             )
diff --git a/turftopic/models/keynmf.py b/turftopic/models/keynmf.py
index 7d74834..e970468 100644
--- a/turftopic/models/keynmf.py
+++ b/turftopic/models/keynmf.py
@@ -79,6 +79,8 @@ class KeyNMF(ContextualModel):
         is performed on the whole vocabulary ('corpus') or only using words
         that are included in the document ('document').
         Setting this to 'corpus' allows for multilingual topics.
+    random_state: int, default None
+        Random state to use so that results are exactly reproducible.
     """
 
     def __init__(
@@ -90,7 +92,9 @@ def __init__(
         vectorizer: Optional[CountVectorizer] = None,
         top_n: int = 25,
         keyword_scope: str = "document",
+        random_state: Optional[int] = None,
     ):
+        self.random_state = random_state
         if keyword_scope not in ["document", "corpus"]:
             raise ValueError("keyword_scope must be 'document' or 'corpus'")
         self.n_components = n_components
@@ -105,7 +109,7 @@ def __init__(
         else:
             self.vectorizer = vectorizer
         self.dict_vectorizer_ = DictVectorizer()
-        self.nmf_ = NMF(n_components)
+        self.nmf_ = NMF(n_components, random_state=self.random_state)
         self.keyword_scope = keyword_scope
 
     def extract_keywords(
@@ -172,7 +176,9 @@ def minibatch_train(
         console=None,
     ):
         self.dict_vectorizer_.fit(keywords)
-        self.nmf_ = MiniBatchNMF(self.n_components)
+        self.nmf_ = MiniBatchNMF(
+            self.n_components, random_state=self.random_state
+        )
         epoch_costs = []
         for i_epoch in range(max_epochs):
             epoch_cost = 0
@@ -220,7 +226,9 @@ def big_fit(
             console.log("Keywords extracted.")
             keywords = KeywordIterator(keyword_file)
             status.update("Fitting NMF.")
-            self.minibatch_train(keywords, max_epochs, batch_size, console=console)  # type: ignore
+            self.minibatch_train(
+                keywords, max_epochs, batch_size, console=console
+            )  # type: ignore
         console.log("NMF fitted.")
         return self

From 4a8487ed60ed52581c1ac5cd9471d0a6697bfcdf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Thu, 4 Apr 2024 11:32:04 +0200
Subject: [PATCH 3/5] Added option to use any decomposition method in S3

---
 turftopic/models/decomp.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/turftopic/models/decomp.py b/turftopic/models/decomp.py
index ad718c5..e274f1e 100644
--- a/turftopic/models/decomp.py
+++ b/turftopic/models/decomp.py
@@ -3,7 +3,8 @@
 import numpy as np
 from rich.console import Console
 from sentence_transformers import SentenceTransformer
-from sklearn.decomposition import PCA, FastICA
+from sklearn.base import TransformerMixin
+from sklearn.decomposition import FastICA
 from sklearn.feature_extraction.text import CountVectorizer
 
 from turftopic.base import ContextualModel, Encoder
@@ -33,6 +34,11 @@ class SemanticSignalSeparation(ContextualModel):
     vectorizer: CountVectorizer, default None
         Vectorizer used for term extraction.
         Can be used to prune or filter the vocabulary.
+    decomposition: TransformerMixin, default None
+        Custom decomposition method to use.
+        Can be an instance of FastICA or PCA, or any other dimensionality
+        reduction method that has `fit_transform` and `fit` methods.
+        If not specified, FastICA is used.
     max_iter: int, default 200
         Maximum number of iterations for ICA.
     random_state: int, default None
@@ -46,7 +52,8 @@ def __init__(
         self,
         n_components: int,
         encoder: Union[
             Encoder, str
         ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
+        decomposition: Optional[TransformerMixin] = None,
         max_iter: int = 200,
         random_state: Optional[int] = None,
     ):
@@ -61,9 +68,12 @@ def __init__(
             self.vectorizer = vectorizer
         self.max_iter = max_iter
         self.random_state = random_state
-        self.decomposition = FastICA(
-            n_components, max_iter=max_iter, random_state=random_state
-        )
+        if decomposition is None:
+            self.decomposition = FastICA(
+                n_components, max_iter=max_iter, random_state=random_state
+            )
+        else:
+            self.decomposition = decomposition
 
     def fit_transform(
         self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None

From 7b334cfbfc39f3b1860814f4170041736d6ce51b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Thu, 4 Apr 2024 11:37:22 +0200
Subject: [PATCH 4/5] Removed sys.maxint from ctm's seeding (int is unbounded
 in Python 3, stupid me)

---
 turftopic/models/ctm.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/turftopic/models/ctm.py b/turftopic/models/ctm.py
index 3a62ed7..f3d6665 100644
--- a/turftopic/models/ctm.py
+++ b/turftopic/models/ctm.py
@@ -1,6 +1,5 @@
 import math
 import random
-import sys
 from typing import Optional, Union
 
 import numpy as np
@@ -211,7 +210,7 @@ def fit(
             status.update("Extracting terms.")
             document_term_matrix = self.vectorizer.fit_transform(raw_documents)
             console.log("Term extraction done.")
-            seed = self.random_state or random.randint(0, sys.maxint - 1)
+            seed = self.random_state or random.randint(0, 10_000)
             torch.manual_seed(seed)
             pyro.set_rng_seed(seed)
             device = torch.device(

From ca4a95a94a45f0bc967771d1e6e3adfbe4d4cd02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Thu, 4 Apr 2024 11:41:44 +0200
Subject: [PATCH 5/5] Added default n_components to S3, so that if you pass a
 decomposition method you won't have to specify it

---
 turftopic/models/decomp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/turftopic/models/decomp.py b/turftopic/models/decomp.py
index e274f1e..234e36a 100644
--- a/turftopic/models/decomp.py
+++ b/turftopic/models/decomp.py
@@ -27,7 +27,7 @@ class SemanticSignalSeparation(ContextualModel):
 
     Parameters
     ----------
-    n_components: int
+    n_components: int, default 10
        Number of topics.
     encoder: str or SentenceTransformer
         Model to encode documents/terms, all-MiniLM-L6-v2 is the default.
@@ -47,7 +47,7 @@ class SemanticSignalSeparation(ContextualModel):
 
     def __init__(
         self,
-        n_components: int,
+        n_components: int = 10,
         encoder: Union[
             Encoder, str
         ] = "sentence-transformers/all-MiniLM-L6-v2",
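
Taken together, the five patches change how reproducibility and the decomposition step are configured. A minimal usage sketch follows; it is not part of the patch series, and it assumes `SemanticSignalSeparation` and `KeyNMF` are exported from the top-level `turftopic` package, with the placeholder corpus standing in for a realistically sized document collection:

```python
from sklearn.decomposition import PCA

from turftopic import KeyNMF, SemanticSignalSeparation

corpus: list[str] = ["some text", "more text", ...]  # placeholder corpus

# Patch 1/5: max_iter and random_state are forwarded to FastICA,
# so two runs with the same seed yield identical components.
model = SemanticSignalSeparation(10, max_iter=500, random_state=42)
model.fit(corpus)
model.print_topics()

# Patch 3/5: any estimator with fit()/fit_transform() can stand in for
# FastICA; patch 5/5 defaults n_components to 10, so it can be omitted
# when the decomposition object already specifies it.
pca_model = SemanticSignalSeparation(decomposition=PCA(n_components=10))
pca_model.fit(corpus)

# Patch 2/5: the same random_state argument is threaded through the other
# models (KeyNMF, GMM, ClusteringTopicModel, AutoEncodingTopicModel).
keynmf = KeyNMF(10, random_state=42).fit(corpus)
```

One wrinkle carried over from patch 2/5 and kept in patch 4/5: `seed = self.random_state or random.randint(0, 10_000)` treats `random_state=0` as unset, because `0` is falsy in Python, so a nonzero seed should be preferred for `AutoEncodingTopicModel`.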