From 2fa138e40f57458e63be2fe80d91cbca8df3834e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 21 Mar 2024 13:57:51 +0100 Subject: [PATCH 1/8] Added positive-negative division in print/export_topics, made both defaultin S3 --- turftopic/base.py | 108 +++++++++++++++++++++++-------------- turftopic/models/decomp.py | 21 ++++++-- 2 files changed, 86 insertions(+), 43 deletions(-) diff --git a/turftopic/base.py b/turftopic/base.py index 25b1ec7..06d604d 100644 --- a/turftopic/base.py +++ b/turftopic/base.py @@ -23,9 +23,7 @@ def remove_whitespace(text: str) -> str: class ContextualModel(ABC, TransformerMixin, BaseEstimator): """Base class for contextual topic models in Turftopic.""" - def get_topics( - self, top_k: int = 10 - ) -> List[Tuple[Any, List[Tuple[str, float]]]]: + def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]]]]: """Returns high-level topic representations in form of the top K words in each topic. @@ -62,22 +60,53 @@ def get_topics( return topics def _topics_table( - self, top_k: int = 10, show_scores: bool = False + self, + top_k: int = 10, + show_scores: bool = False, + show_negative: bool = False, ) -> list[list[str]]: - topics = self.get_topics(top_k) - columns = ["Topic ID", f"Top {top_k} Words"] + columns = ["Topic ID", "Positive"] + if show_negative: + columns.append("Negative") rows = [] - for topic_id, terms in topics: + try: + classes = self.classes_ + except AttributeError: + classes = list(range(self.components_.shape[0])) + vocab = self.get_vocab() + for topic_id, component in zip(classes, self.components_): + highest = np.argpartition(-component, top_k)[:top_k] + highest = highest[np.argsort(-component[highest])] + lowest = np.argpartition(component, top_k)[:top_k] + lowest = lowest[np.argsort(component[lowest])] if show_scores: - concat_words = ", ".join( - [f"{word}({importance:.2f})" for word, importance in terms] + concat_positive = ", ".join( + [ + f"{word}({importance:.2f})" + for word, importance in zip(vocab[highest], component[highest]) + ] + ) + concat_negative = ", ".join( + [ + f"{word}({importance:.2f})" + for word, importance in zip(vocab[lowest], component[lowest]) + ] ) else: - concat_words = ", ".join([word for word, importance in terms]) - rows.append([f"{topic_id}", f"{concat_words}"]) + concat_positive = ", ".join([word for word in vocab[highest]]) + concat_negative = ", ".join([word for word in vocab[lowest]]) + row = [f"{topic_id}", f"{concat_positive}"] + if show_negative: + row.append(concat_negative) + rows.append(row) return [columns, *rows] - def print_topics(self, top_k: int = 10, show_scores: bool = False): + def print_topics( + self, + top_k: int = 10, + show_scores: bool = False, + show_negative: bool = False, + ): """Pretty prints topics in the model in a table. Parameters @@ -86,23 +115,36 @@ def print_topics(self, top_k: int = 10, show_scores: bool = False): Number of top words to return for each topic. show_scores: bool, default False Indicates whether to show importance scores for each word. + show_negative: bool, default False + Indicates whether the most negative terms should also be displayed. 
""" - columns, *rows = self._topics_table(top_k, show_scores) + columns, *rows = self._topics_table(top_k, show_scores, show_negative) table = Table(show_lines=True) - table.add_column(columns[0], style="blue", justify="right") + table.add_column("Topic ID", style="blue", justify="right") table.add_column( - columns[1], + "Positive", justify="left", style="magenta", max_width=100, ) + if show_negative: + table.add_column( + "Negative", + justify="left", + style="red", + max_width=100, + ) for row in rows: table.add_row(*row) console = Console() console.print(table) def export_topics( - self, top_k: int = 10, show_scores: bool = False, format: str = "csv" + self, + top_k: int = 10, + show_scores: bool = False, + show_negative: bool = False, + format: str = "csv", ) -> str: """Exports top K words from topics in a table in a given format. Returns table as a pure string. @@ -113,6 +155,8 @@ def export_topics( Number of top words to return for each topic. show_scores: bool, default False Indicates whether to show importance scores for each word. + show_negative: bool, default False + Indicates whether the most negative terms should also be displayed. format: 'csv', 'latex' or 'markdown' Specifies which format should be used. 'csv', 'latex' and 'markdown' are supported. @@ -137,12 +181,8 @@ def _highest_ranking_docs( except AttributeError: pass kth = min(top_k, document_topic_matrix.shape[0] - 1) - highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[ - :kth - ] - highest = highest[ - np.argsort(-document_topic_matrix[highest, topic_id]) - ] + highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[:kth] + highest = highest[np.argsort(-document_topic_matrix[highest, topic_id])] scores = document_topic_matrix[highest, topic_id] columns = [] columns.append("Document") @@ -177,9 +217,7 @@ def print_highest_ranking_documents( topic_id, raw_documents, document_topic_matrix, top_k ) table = Table(show_lines=True) - table.add_column( - "Document", justify="left", style="magenta", max_width=100 - ) + table.add_column("Document", justify="left", style="magenta", max_width=100) table.add_column("Score", style="blue", justify="right") for row in rows: table.add_row(*row) @@ -231,9 +269,7 @@ def _topic_distribution( ) -> list[list[str]]: if topic_dist is None: if text is None: - raise ValueError( - "You should either pass a text or a distribution." - ) + raise ValueError("You should either pass a text or a distribution.") try: topic_dist = self.transform([text]) except AttributeError: @@ -258,9 +294,7 @@ def _topic_distribution( rows.append([topic_names[ind], f"{score:.2f}"]) return [columns, *rows] - def print_topic_distribution( - self, text=None, topic_dist=None, top_k: int = 10 - ): + def print_topic_distribution(self, text=None, topic_dist=None, top_k: int = 10): """Pretty prints topic distribution in a document. Parameters @@ -342,9 +376,7 @@ def fit_transform( """ pass - def fit( - self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None - ): + def fit(self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None): """Fits model on the given corpus. 
Parameters @@ -410,13 +442,9 @@ def prepare_topic_data( if embeddings is None: embeddings = self.encode_documents(corpus) try: - document_topic_matrix = self.transform( - corpus, embeddings=embeddings - ) + document_topic_matrix = self.transform(corpus, embeddings=embeddings) except (AttributeError, NotFittedError): - document_topic_matrix = self.fit_transform( - corpus, embeddings=embeddings - ) + document_topic_matrix = self.fit_transform(corpus, embeddings=embeddings) dtm = self.vectorizer.transform(corpus) # type: ignore res: TopicData = { "corpus": corpus, diff --git a/turftopic/models/decomp.py b/turftopic/models/decomp.py index c727490..23c3109 100644 --- a/turftopic/models/decomp.py +++ b/turftopic/models/decomp.py @@ -42,9 +42,7 @@ class SemanticSignalSeparation(ContextualModel): def __init__( self, n_components: int, - encoder: Union[ - Encoder, str - ] = "sentence-transformers/all-MiniLM-L6-v2", + encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2", vectorizer: Optional[CountVectorizer] = None, objective: Literal["orthogonality", "independence"] = "independence", ): @@ -108,3 +106,20 @@ def transform( if embeddings is None: embeddings = self.encoder_.encode(raw_documents) return self.decomposition.transform(embeddings) + + def print_topics( + self, + top_k: int = 5, + show_scores: bool = False, + show_negative: bool = True, + ): + super().print_topics(top_k, show_scores, show_negative) + + def export_topics( + self, + top_k: int = 5, + show_scores: bool = False, + show_negative: bool = False, + format: str = "csv", + ) -> str: + return super().print_topics(top_k, show_scores, show_negative, format) From 02ca826b431ddfcb412e28727dca28b3eb8232e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Thu, 21 Mar 2024 14:16:44 +0100 Subject: [PATCH 2/8] Fixed export_topics method --- turftopic/base.py | 2 +- turftopic/models/decomp.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/turftopic/base.py b/turftopic/base.py index 06d604d..92a7cb1 100644 --- a/turftopic/base.py +++ b/turftopic/base.py @@ -161,7 +161,7 @@ def export_topics( Specifies which format should be used. 'csv', 'latex' and 'markdown' are supported. """ - table = self._topics_table(top_k, show_scores) + table = self._topics_table(top_k, show_scores, show_negative=show_negative) return export_table(table, format=format) def _highest_ranking_docs( diff --git a/turftopic/models/decomp.py b/turftopic/models/decomp.py index 23c3109..be7cfa2 100644 --- a/turftopic/models/decomp.py +++ b/turftopic/models/decomp.py @@ -119,7 +119,7 @@ def export_topics( self, top_k: int = 5, show_scores: bool = False, - show_negative: bool = False, + show_negative: bool = True, format: str = "csv", ) -> str: - return super().print_topics(top_k, show_scores, show_negative, format) + return super().export_topics(top_k, show_scores, show_negative, format) From d07802ead9fa8d8896576a7e48441ad03ed76742 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 22 Mar 2024 10:04:19 +0100 Subject: [PATCH 3/8] Changed positive/negative to highest/lowest ranking --- turftopic/base.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/turftopic/base.py b/turftopic/base.py index 92a7cb1..24e16f0 100644 --- a/turftopic/base.py +++ b/turftopic/base.py @@ -27,9 +27,7 @@ def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]] """Returns high-level topic representations in form of the top K words in each topic. 
- Parameters - ---------- - top_k: int, default 10 + Parameters ---------- top_k: int, default 10 Number of top words to return for each topic. Returns @@ -65,9 +63,9 @@ def _topics_table( show_scores: bool = False, show_negative: bool = False, ) -> list[list[str]]: - columns = ["Topic ID", "Positive"] + columns = ["Topic ID", "Highest Ranking"] if show_negative: - columns.append("Negative") + columns.append("Lowest Ranking") rows = [] try: classes = self.classes_ @@ -122,14 +120,14 @@ def print_topics( table = Table(show_lines=True) table.add_column("Topic ID", style="blue", justify="right") table.add_column( - "Positive", + "Highest Ranking", justify="left", style="magenta", max_width=100, ) if show_negative: table.add_column( - "Negative", + "Lowest Ranking", justify="left", style="red", max_width=100, From b662571c2868c9c60c3ce4916dfe4260b3acfa0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 22 Mar 2024 10:19:35 +0100 Subject: [PATCH 4/8] Added possibility to show most negative documents when interpreting models. --- turftopic/base.py | 102 +++++++++++++++++++++++++++++-------- turftopic/models/decomp.py | 38 +++++++++++++- 2 files changed, 119 insertions(+), 21 deletions(-) diff --git a/turftopic/base.py b/turftopic/base.py index 24e16f0..4f722fe 100644 --- a/turftopic/base.py +++ b/turftopic/base.py @@ -23,7 +23,9 @@ def remove_whitespace(text: str) -> str: class ContextualModel(ABC, TransformerMixin, BaseEstimator): """Base class for contextual topic models in Turftopic.""" - def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]]]]: + def get_topics( + self, top_k: int = 10 + ) -> List[Tuple[Any, List[Tuple[str, float]]]]: """Returns high-level topic representations in form of the top K words in each topic. @@ -81,13 +83,17 @@ def _topics_table( concat_positive = ", ".join( [ f"{word}({importance:.2f})" - for word, importance in zip(vocab[highest], component[highest]) + for word, importance in zip( + vocab[highest], component[highest] + ) ] ) concat_negative = ", ".join( [ f"{word}({importance:.2f})" - for word, importance in zip(vocab[lowest], component[lowest]) + for word, importance in zip( + vocab[lowest], component[lowest] + ) ] ) else: @@ -159,11 +165,18 @@ def export_topics( Specifies which format should be used. 'csv', 'latex' and 'markdown' are supported. """ - table = self._topics_table(top_k, show_scores, show_negative=show_negative) + table = self._topics_table( + top_k, show_scores, show_negative=show_negative + ) return export_table(table, format=format) - def _highest_ranking_docs( - self, topic_id, raw_documents, document_topic_matrix=None, top_k=5 + def _representative_docs( + self, + topic_id, + raw_documents, + document_topic_matrix=None, + top_k=5, + show_negative: bool = False, ) -> list[list[str]]: if document_topic_matrix is None: try: @@ -179,8 +192,12 @@ def _highest_ranking_docs( except AttributeError: pass kth = min(top_k, document_topic_matrix.shape[0] - 1) - highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[:kth] - highest = highest[np.argsort(-document_topic_matrix[highest, topic_id])] + highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[ + :kth + ] + highest = highest[ + np.argsort(-document_topic_matrix[highest, topic_id]) + ] scores = document_topic_matrix[highest, topic_id] columns = [] columns.append("Document") @@ -192,10 +209,30 @@ def _highest_ranking_docs( if len(doc) > 300: doc = doc[:300] + "..." 
rows.append([doc, f"{score:.2f}"]) + if show_negative: + rows.append(["...", ""]) + lowest = np.argpartition(document_topic_matrix[:, topic_id], kth)[ + :kth + ] + lowest = lowest[ + np.argsort(document_topic_matrix[lowest, topic_id]) + ] + scores = document_topic_matrix[lowest, topic_id] + for document_id, score in zip(lowest, scores): + doc = raw_documents[document_id] + doc = remove_whitespace(doc) + if len(doc) > 300: + doc = doc[:300] + "..." + rows.append([doc, f"{score:.2f}"]) return [columns, *rows] - def print_highest_ranking_documents( - self, topic_id, raw_documents, document_topic_matrix=None, top_k=5 + def print_representative_documents( + self, + topic_id, + raw_documents, + document_topic_matrix=None, + top_k=5, + show_negative: bool = False, ): """Pretty prints the highest ranking documents in a topic. @@ -210,24 +247,33 @@ def print_highest_ranking_documents( as they cannot infer topics from text. top_k: int, default 5 Top K documents to show. + show_negative: bool, default False + Indicates whether lowest ranking documents should also be shown. """ - columns, *rows = self._highest_ranking_docs( - topic_id, raw_documents, document_topic_matrix, top_k + columns, *rows = self._representative_docs( + topic_id, + raw_documents, + document_topic_matrix, + top_k, + show_negative, ) table = Table(show_lines=True) - table.add_column("Document", justify="left", style="magenta", max_width=100) + table.add_column( + "Document", justify="left", style="magenta", max_width=100 + ) table.add_column("Score", style="blue", justify="right") for row in rows: table.add_row(*row) console = Console() console.print(table) - def export_highest_ranking_documents( + def export_representative_documents( self, topic_id, raw_documents, document_topic_matrix=None, top_k=5, + show_negative: bool = False, format: str = "csv", ): """Exports the highest ranking documents in a topic as a text table. @@ -243,12 +289,18 @@ def export_highest_ranking_documents( as they cannot infer topics from text. top_k: int, default 5 Top K documents to show. + show_negative: bool, default False + Indicates whether lowest ranking documents should also be shown. format: 'csv', 'latex' or 'markdown' Specifies which format should be used. 'csv', 'latex' and 'markdown' are supported. """ table = self._highest_ranking_docs( - topic_id, raw_documents, document_topic_matrix, top_k + topic_id, + raw_documents, + document_topic_matrix, + top_k, + show_negative, ) return export_table(table, format=format) @@ -267,7 +319,9 @@ def _topic_distribution( ) -> list[list[str]]: if topic_dist is None: if text is None: - raise ValueError("You should either pass a text or a distribution.") + raise ValueError( + "You should either pass a text or a distribution." + ) try: topic_dist = self.transform([text]) except AttributeError: @@ -292,7 +346,9 @@ def _topic_distribution( rows.append([topic_names[ind], f"{score:.2f}"]) return [columns, *rows] - def print_topic_distribution(self, text=None, topic_dist=None, top_k: int = 10): + def print_topic_distribution( + self, text=None, topic_dist=None, top_k: int = 10 + ): """Pretty prints topic distribution in a document. Parameters @@ -374,7 +430,9 @@ def fit_transform( """ pass - def fit(self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None): + def fit( + self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None + ): """Fits model on the given corpus. 
Parameters @@ -440,9 +498,13 @@ def prepare_topic_data( if embeddings is None: embeddings = self.encode_documents(corpus) try: - document_topic_matrix = self.transform(corpus, embeddings=embeddings) + document_topic_matrix = self.transform( + corpus, embeddings=embeddings + ) except (AttributeError, NotFittedError): - document_topic_matrix = self.fit_transform(corpus, embeddings=embeddings) + document_topic_matrix = self.fit_transform( + corpus, embeddings=embeddings + ) dtm = self.vectorizer.transform(corpus) # type: ignore res: TopicData = { "corpus": corpus, diff --git a/turftopic/models/decomp.py b/turftopic/models/decomp.py index be7cfa2..d29d119 100644 --- a/turftopic/models/decomp.py +++ b/turftopic/models/decomp.py @@ -42,7 +42,9 @@ class SemanticSignalSeparation(ContextualModel): def __init__( self, n_components: int, - encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2", + encoder: Union[ + Encoder, str + ] = "sentence-transformers/all-MiniLM-L6-v2", vectorizer: Optional[CountVectorizer] = None, objective: Literal["orthogonality", "independence"] = "independence", ): @@ -123,3 +125,37 @@ def export_topics( format: str = "csv", ) -> str: return super().export_topics(top_k, show_scores, show_negative, format) + + def print_representative_documents( + self, + topic_id, + raw_documents, + document_topic_matrix=None, + top_k=5, + show_negative: bool = True, + ): + super().print_representative_documents( + topic_id, + raw_documents, + document_topic_matrix, + top_k, + show_negative, + ) + + def export_representative_documents( + self, + topic_id, + raw_documents, + document_topic_matrix=None, + top_k=5, + show_negative: bool = True, + format: str = "csv", + ): + return super().export_representative_documents( + topic_id, + raw_documents, + document_topic_matrix, + top_k, + show_negative, + format, + ) From d2e701a1998b372697cfb7bcb8f5bca9489938f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 22 Mar 2024 10:19:58 +0100 Subject: [PATCH 5/8] Added Ruff line length --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index be2c2ed..a136ee2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,9 @@ [tool.black] line-length=79 +[tool.ruff] +line-length=79 + [tool.poetry] name = "turftopic" version = "0.2.12" From 1a698600fc761e0bd3c7fd65051ee4c6a748b460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 22 Mar 2024 10:20:23 +0100 Subject: [PATCH 6/8] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a136ee2..56caacb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ line-length=79 [tool.poetry] name = "turftopic" -version = "0.2.12" +version = "0.2.13" description = "Topic modeling with contextual representations from sentence transformers." authors = ["Márton Kardos "] license = "MIT" From 164277959028cf1e8a75c0068a1e0b81684a1428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 22 Mar 2024 10:28:57 +0100 Subject: [PATCH 7/8] Replaced highest_ranking_documents with representative_documents in docs. 
---
 docs/index.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index 748af1f..7caf87d 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -179,7 +179,7 @@ model.print_topics()
 ```python
 # Print highest ranking documents for topic 0
-model.print_highest_ranking_documents(0, corpus, document_topic_matrix)
+model.print_representative_documents(0, corpus, document_topic_matrix)
 ```
@@ -217,7 +217,7 @@
 csv_table: str = model.export_topic_distribution("something something", format="csv")
 latex_table: str = model.export_topics(format="latex")
-md_table: str = model.export_highest_ranking_documents(0, corpus, document_topic_matrix, format="markdown")
+md_table: str = model.export_representative_documents(0, corpus, document_topic_matrix, format="markdown")
 ```

 ### Visualization

From dfc3f82b4a5ed0bedc5c04a9a00f061b2bcd03f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Fri, 22 Mar 2024 10:29:59 +0100
Subject: [PATCH 8/8] Replaces highest_ranking_documents with representative_documents in Readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 321ce6d..4c2ec63 100644
--- a/README.md
+++ b/README.md
@@ -94,7 +94,7 @@ model.print_topics()
 ```python
 # Print highest ranking documents for topic 0
-model.print_highest_ranking_documents(0, corpus, document_topic_matrix)
+model.print_representative_documents(0, corpus, document_topic_matrix)
 ```
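
Taken together, these patches change the public interpretation API roughly as sketched below. This is a minimal usage sketch, not part of the patch series itself: the 20 Newsgroups corpus, the component count, and the top-level `from turftopic import SemanticSignalSeparation` import are illustrative assumptions rather than anything introduced by these commits.

```python
from sklearn.datasets import fetch_20newsgroups
from turftopic import SemanticSignalSeparation

# Any list of strings works as a corpus; 20 Newsgroups is just an illustration.
corpus = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
).data

model = SemanticSignalSeparation(10)
document_topic_matrix = model.fit_transform(corpus)

# SemanticSignalSeparation (S3) now shows the lowest-ranking ("negative") terms
# by default; other models can opt in with show_negative=True.
model.print_topics()

# Highest-ranking (and, for S3, also lowest-ranking) documents for topic 0.
model.print_representative_documents(0, corpus, document_topic_matrix)

# The same tables can be exported as plain strings, e.g. for a README or a paper.
md_topics: str = model.export_topics(format="markdown")
```

Defaulting `show_negative` to True only for S3 appears to follow from the model family: its components are signed axes, so the lowest-ranking terms and documents carry as much meaning as the highest-ranking ones, whereas the base models keep the previous behaviour unless the flag is passed explicitly.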