Skip to content

Commit

Permalink
Added docstrings and meaningful error messages to estimate_components
Browse files: browse the repository at this point in the history
  • Loading branch information
x-tabdeveloping committed Aug 19, 2024
1 parent 49020d2 commit b8688e1
Showing 1 changed file with 31 additions and 2 deletions.
33 changes: 31 additions & 2 deletions turftopic/models/cluster.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,6 +7,7 @@
from sentence_transformers import SentenceTransformer
from sklearn.base import ClusterMixin, TransformerMixin
from sklearn.cluster import OPTICS, AgglomerativeClustering
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_distances
Expand Down Expand Up @@ -300,7 +301,7 @@ def estimate_components(
feature_importance: Literal[
"centroid", "soft-c-tf-idf", "bayes", "c-tf-idf"
],
) -> np.array:
) -> np.ndarray:
"""Estimates feature importances based on a fitted clustering.
Parameters
Expand All @@ -319,6 +320,10 @@ def estimate_components(
ndarray of shape (n_components, n_vocab)
Topic-term matrix.
"""
if getattr(self, "labels_", None) is None:
raise NotFittedError(
"The model has not been fitted yet, please fit the model before estimating temporal components."
)
clusters = np.unique(self.labels_)
self.classes_ = np.sort(clusters)
self.topic_sizes_ = np.array(
Expand Down Expand Up @@ -422,8 +427,31 @@ def estimate_temporal_components(
feature_importance: Literal[
"c-tf-idf", "soft-c-tf-idf", "centroid", "bayes"
],
):
) -> np.ndarray:
"""Estimates temporal components based on a fitted topic model.
Parameters
----------
feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
Method for estimating term importances.
'centroid' uses distances from cluster centroid similarly
to Top2Vec.
'c-tf-idf' uses BERTopic's c-tf-idf.
'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
be very similar to 'c-tf-idf'.
'bayes' uses Bayes' rule.
Returns
-------
ndarray of shape (n_time_bins, n_components, n_vocab)
Temporal topic-term matrix.
"""
if getattr(self, "components_", None) is None:
raise NotFittedError(
"The model has not been fitted yet, please fit the model before estimating temporal components."
)
n_comp, n_vocab = self.components_.shape
self.time_bin_edges = time_bin_edges
n_bins = len(self.time_bin_edges) - 1
self.temporal_components_ = np.full(
(n_bins, n_comp, n_vocab),
Expand Down Expand Up @@ -464,6 +492,7 @@ def estimate_temporal_components(
mask_terms = np.squeeze(np.asarray(mask_terms))
components[:, mask_terms == 0] = np.nan
self.temporal_components_[i_timebin] = components
return self.temporal_components_

def fit_transform_dynamic(
self,
Expand Down

0 comments on commit b8688e1

Please sign in to comment.