x-tabdeveloping · rbroc · Mar 14, 2024 · Mar 13, 2024 · Mar 13, 2024 · Mar 13, 2024
diff --git a/tests/test_integration.py b/tests/test_integration.py
@@ -29,7 +29,8 @@
 models = [
     GMM(5, encoder=trf),
     SemanticSignalSeparation(5, encoder=trf),
-    KeyNMF(5, encoder=trf),
+    KeyNMF(5, encoder=trf, keyword_scope='document'),
+    KeyNMF(5, encoder=trf, keyword_scope='corpus'),
     ClusteringTopicModel(
         n_reduce_to=5,
         feature_importance="c-tf-idf",

diff --git a/turftopic/models/keynmf.py b/turftopic/models/keynmf.py
@@ -74,6 +74,11 @@ class KeyNMF(ContextualModel):
         Can be used to prune or filter the vocabulary.
     top_n: int, default 25
         Number of keywords to extract for each document.
+    keyword_scope: str, default 'document'
+        Specifies whether keyword extraction for each document
+        is performed on the whole vocabulary ('corpus') or only
+        using words that are included in the document ('document').
+        Setting this to 'corpus' allows for multilingual topics.
     """
 
     def __init__(
@@ -84,7 +89,10 @@ def __init__(
         ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         top_n: int = 25,
+        keyword_scope: str = 'document',
     ):
+        if keyword_scope not in ['document', 'corpus']:
+            raise ValueError("keyword_scope must be 'document' or 'corpus'")
         self.n_components = n_components
         self.top_n = top_n
         self.encoder = encoder
@@ -98,6 +106,7 @@ def __init__(
             self.vectorizer = vectorizer
         self.dict_vectorizer_ = DictVectorizer()
         self.nmf_ = NMF(n_components)
+        self.keyword_scope = keyword_scope
 
     def extract_keywords(
         self,
@@ -114,11 +123,15 @@ def extract_keywords(
         for i in range(total):
             terms = document_term_matrix[i, :].todense()
             embedding = embeddings[i].reshape(1, -1)
-            nonzero = terms > 0
-            if not np.any(nonzero):
+            if self.keyword_scope == 'document':
+                mask = terms > 0
+            else:
+                tot_freq = document_term_matrix.sum(axis=0)
+                mask = tot_freq != 0
+            if not np.any(mask):
                 keywords.append(dict())
                 continue
-            important_terms = np.squeeze(np.asarray(nonzero))
+            important_terms = np.squeeze(np.asarray(mask))
             word_embeddings = self.vocab_embeddings[important_terms]
             sim = cosine_similarity(embedding, word_embeddings)
             sim = np.ravel(sim)
@@ -272,7 +285,7 @@ def prepare_topic_data(
             except (NotFittedError, AttributeError):
                 doc_topic_matrix = self.nmf_.fit_transform(dtm)
                 self.components_ = self.nmf_.components_
-            console.log("Model fiting done.")
+            console.log("Model fitting done.")
         res: TopicData = {
             "corpus": corpus,
             "document_term_matrix": dtm,