Random States for models and max_iter for S3 #38

Merged · 5 commits · Apr 4, 2024

22 changes: 18 additions & 4 deletions turftopic/models/cluster.py
@@ -137,6 +137,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
The specified reduction method will be used to merge them.
By default, topics are not merged.
reduction_method: 'agglomerative', 'smallest'
Method used to reduce the number of topics post-hoc.
When 'agglomerative', BERTopic's topic reduction method is used,
where topic vectors are hierarchically clustered.
When 'smallest', the smallest topic gets merged into the closest
non-outlier cluster until the desired number of topics is reached,
similarly to Top2Vec.
random_state: int, default None
Random state to use so that results are exactly reproducible.
"""

def __init__(
@@ -154,8 +162,10 @@ def __init__(
reduction_method: Literal[
"agglomerative", "smallest"
] = "agglomerative",
random_state: Optional[int] = None,
):
self.encoder = encoder
self.random_state = random_state
if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]:
raise ValueError(feature_message)
if isinstance(encoder, int):
@@ -174,7 +184,7 @@ def __init__(
self.clustering = clustering
if dimensionality_reduction is None:
self.dimensionality_reduction = TSNE(
n_components=2, metric="cosine"
n_components=2, metric="cosine", random_state=random_state
)
else:
self.dimensionality_reduction = dimensionality_reduction
Expand All @@ -196,7 +206,9 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
)
old_labels = [label for label in self.classes_ if label != -1]
new_labels = AgglomerativeClustering(
n_clusters=n_reduce_to, metric="cosine", linkage="average"
n_clusters=n_reduce_to,
metric="cosine",
linkage="average",
).fit_predict(interesting_topic_vectors)
res = {}
if -1 in self.classes_:
@@ -235,7 +247,9 @@ def _estimate_parameters(
self.labels_, classes=self.classes_
)
if self.feature_importance == "soft-c-tf-idf":
self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix) # type: ignore
self.components_ = soft_ctf_idf(
document_topic_matrix, doc_term_matrix
) # type: ignore
elif self.feature_importance == "centroid":
self.components_ = cluster_centroid_distance(
self.topic_vectors_,
@@ -327,7 +341,7 @@ def fit_transform_dynamic(
if embeddings is None:
embeddings = self.encoder_.encode(raw_documents)
for i_timebin in np.arange(len(self.time_bin_edges) - 1):
if hasattr(self, 'components_'):
if hasattr(self, "components_"):
doc_topic_matrix = label_binarize(
self.labels_, classes=self.classes_
)
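As a quick illustration of the new parameter, here is a minimal usage sketch; the placeholder corpus and the top-level `turftopic` import path are assumptions, not part of this diff:

```python
from turftopic import ClusteringTopicModel

corpus = ["some text", "more text", "even more text"]  # placeholder corpus

# Passing random_state seeds the default TSNE dimensionality reduction,
# so repeated fits on the same corpus produce the same 2D projection
# (and hence the same input to clustering and topic reduction).
model = ClusteringTopicModel(random_state=42)
model.fit(corpus)
```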
7 changes: 6 additions & 1 deletion turftopic/models/ctm.py
@@ -1,4 +1,5 @@
import math
import random
from typing import Optional, Union

import numpy as np
@@ -129,6 +130,8 @@ class AutoEncodingTopicModel(ContextualModel):
Learning rate for the optimizer.
n_epochs: int, default 50
Number of epochs to run during training.
random_state: int, default None
Random state to use so that results are exactly reproducible.
"""

def __init__(
@@ -144,8 +147,10 @@ def __init__(
batch_size: int = 42,
learning_rate: float = 1e-2,
n_epochs: int = 50,
random_state: Optional[int] = None,
):
self.n_components = n_components
self.random_state = random_state
self.encoder = encoder
if isinstance(encoder, str):
self.encoder_ = SentenceTransformer(encoder)
@@ -205,7 +210,7 @@ def fit(
status.update("Extracting terms.")
document_term_matrix = self.vectorizer.fit_transform(raw_documents)
console.log("Term extraction done.")
seed = 0
seed = self.random_state or random.randint(0, 10_000)
torch.manual_seed(seed)
pyro.set_rng_seed(seed)
device = torch.device(
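A corresponding sketch for the autoencoding model (import path and corpus are assumed, as above). Note that with the `or` check in `fit()`, a `random_state` of `0` is falsy and would also fall back to a randomly drawn seed:

```python
from turftopic import AutoEncodingTopicModel

corpus = ["some text", "more text", "even more text"]  # placeholder corpus

# fit() now seeds both torch and pyro with random_state when it is set;
# left as None, a random seed from [0, 10_000] is drawn instead.
model = AutoEncodingTopicModel(10, random_state=42)
model.fit(corpus)
```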
37 changes: 24 additions & 13 deletions turftopic/models/decomp.py
@@ -3,7 +3,8 @@
import numpy as np
from rich.console import Console
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA, FastICA
from sklearn.base import TransformerMixin
from sklearn.decomposition import FastICA
from sklearn.feature_extraction.text import CountVectorizer

from turftopic.base import ContextualModel, Encoder
@@ -20,33 +21,40 @@ class SemanticSignalSeparation(ContextualModel):

corpus: list[str] = ["some text", "more text", ...]

model = SemanticSignalSeparation(10, objective="independence").fit(corpus)
model = SemanticSignalSeparation(10).fit(corpus)
model.print_topics()
```

Parameters
----------
n_components: int
n_components: int, default 10
Number of topics.
encoder: str or SentenceTransformer
Model to encode documents/terms, all-MiniLM-L6-v2 is the default.
vectorizer: CountVectorizer, default None
Vectorizer used for term extraction.
Can be used to prune or filter the vocabulary.
objective: 'orthogonality' or 'independence', default 'independence'
Indicates what the components should be optimized for.
When 'orthogonality', PCA is used to discover components,
when 'independence', ICA is used to discover components.
decomposition: TransformerMixin, default None
Custom decomposition method to use.
Can be an instance of FastICA or PCA, or any other dimensionality
reduction method that implements `fit_transform` and `fit`.
If not specified, FastICA is used.
max_iter: int, default 200
Maximum number of iterations for ICA.
random_state: int, default None
Random state to use so that results are exactly reproducible.
"""

def __init__(
self,
n_components: int,
n_components: int = 10,
encoder: Union[
Encoder, str
] = "sentence-transformers/all-MiniLM-L6-v2",
vectorizer: Optional[CountVectorizer] = None,
objective: Literal["orthogonality", "independence"] = "independence",
decomposition: Optional[TransformerMixin] = None,
max_iter: int = 200,
random_state: Optional[int] = None,
):
self.n_components = n_components
self.encoder = encoder
@@ -58,11 +66,14 @@ def __init__(
self.vectorizer = default_vectorizer()
else:
self.vectorizer = vectorizer
self.objective = objective
if objective == "independence":
self.decomposition = FastICA(n_components)
self.max_iter = max_iter
self.random_state = random_state
if decomposition is None:
self.decomposition = FastICA(
n_components, max_iter=max_iter, random_state=random_state
)
else:
self.decomposition = PCA(n_components)
self.decomposition = decomposition

def fit_transform(
self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
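Because `objective` is replaced by the more general `decomposition` parameter, the old `objective="orthogonality"` behaviour can presumably still be recovered by passing a PCA instance explicitly; a sketch under that assumption:

```python
from sklearn.decomposition import PCA

from turftopic import SemanticSignalSeparation

corpus = ["some text", "more text", "even more text"]  # placeholder corpus

# Default decomposition: FastICA(n_components, max_iter=max_iter,
# random_state=random_state). Any transformer with fit/fit_transform
# can be swapped in, e.g. PCA for orthogonal components as before.
model = SemanticSignalSeparation(
    10, decomposition=PCA(10, random_state=42), random_state=42
)
model.fit(corpus)
```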
11 changes: 9 additions & 2 deletions turftopic/models/gmm.py
@@ -54,6 +54,8 @@ class GMM(ContextualModel, DynamicTopicModel):
result in Gaussian components.
For even larger datasets you can use IncrementalPCA to reduce
memory load.
random_state: int, default None
Random state to use so that results are exactly reproducible.

Attributes
----------
@@ -71,11 +73,13 @@ def __init__(
dimensionality_reduction: Optional[TransformerMixin] = None,
weight_prior: Literal["dirichlet", "dirichlet_process", None] = None,
gamma: Optional[float] = None,
random_state: Optional[int] = None,
):
self.n_components = n_components
self.encoder = encoder
self.weight_prior = weight_prior
self.gamma = gamma
self.random_state = random_state
if isinstance(encoder, str):
self.encoder_ = SentenceTransformer(encoder)
else:
@@ -94,9 +98,12 @@ def __init__(
else "dirichlet_process"
),
weight_concentration_prior=gamma,
random_state=self.random_state,
)
else:
mixture = GaussianMixture(n_components)
mixture = GaussianMixture(
n_components, random_state=self.random_state
)
if dimensionality_reduction is not None:
self.gmm_ = make_pipeline(dimensionality_reduction, mixture)
else:
@@ -162,7 +169,7 @@ def fit_transform_dynamic(
bins: Union[int, list[datetime]] = 10,
):
time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
if hasattr(self, 'components_'):
if hasattr(self, "components_"):
doc_topic_matrix = self.transform(
raw_documents, embeddings=embeddings
)
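A usage sketch for GMM, again with an assumed import path and placeholder corpus:

```python
from turftopic import GMM

corpus = ["some text", "more text", "even more text"]  # placeholder corpus

# random_state is forwarded to GaussianMixture, or to
# BayesianGaussianMixture when a weight prior is set, making the
# EM initialization reproducible.
model = GMM(10, random_state=42)
model.fit(corpus)
```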
14 changes: 11 additions & 3 deletions turftopic/models/keynmf.py
@@ -79,6 +79,8 @@ class KeyNMF(ContextualModel):
is performed on the whole vocabulary ('corpus') or only
using words that are included in the document ('document').
Setting this to 'corpus' allows for multilingual topics.
random_state: int, default None
Random state to use so that results are exactly reproducible.
"""

def __init__(
@@ -90,7 +92,9 @@ def __init__(
vectorizer: Optional[CountVectorizer] = None,
top_n: int = 25,
keyword_scope: str = "document",
random_state: Optional[int] = None,
):
self.random_state = random_state
if keyword_scope not in ["document", "corpus"]:
raise ValueError("keyword_scope must be 'document' or 'corpus'")
self.n_components = n_components
@@ -105,7 +109,7 @@ def __init__(
else:
self.vectorizer = vectorizer
self.dict_vectorizer_ = DictVectorizer()
self.nmf_ = NMF(n_components)
self.nmf_ = NMF(n_components, random_state=self.random_state)
self.keyword_scope = keyword_scope

def extract_keywords(
@@ -172,7 +176,9 @@ def minibatch_train(
console=None,
):
self.dict_vectorizer_.fit(keywords)
self.nmf_ = MiniBatchNMF(self.n_components)
self.nmf_ = MiniBatchNMF(
self.n_components, random_state=self.random_state
)
epoch_costs = []
for i_epoch in range(max_epochs):
epoch_cost = 0
@@ -220,7 +226,9 @@ def big_fit(
console.log("Keywords extracted.")
keywords = KeywordIterator(keyword_file)
status.update("Fitting NMF.")
self.minibatch_train(keywords, max_epochs, batch_size, console=console) # type: ignore
self.minibatch_train(
keywords, max_epochs, batch_size, console=console
) # type: ignore
console.log("NMF fitted.")
return self

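And for KeyNMF, where the seed reaches both factorizations (import path and corpus assumed):

```python
from turftopic import KeyNMF

corpus = ["some text", "more text", "even more text"]  # placeholder corpus

# random_state seeds NMF in the regular fit path and MiniBatchNMF in
# minibatch_train()/big_fit(), so the same corpus and keywords yield
# the same factorization.
model = KeyNMF(10, random_state=42)
model.fit(corpus)
```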