release version 2.6

mesolitica · Jun 25, 2019 · d27ac1e · d27ac1e
1 parent 7cd44d1
commit d27ac1e
Show file tree

Hide file tree

Showing 42 changed files with 7,653 additions and 829 deletions.
diff --git a/README.rst b/README.rst
@@ -75,13 +75,18 @@ Features
 -  **Spell Correction**
 
    Using local Malaysia NLP researches to auto-correct any bahasa words.
--  Stemmer
+-  **Stemmer**
+
+   Use a state-of-the-art character-level LSTM Seq2Seq model with attention to do Bahasa stemming.
 -  **Subjectivity Analysis**
 
    From fine-tuning BERT, Attention-Recurrent model, Sparse Tensorflow and Self-Attention to build deep subjectivity analysis models.
+-  **Similarity**
+
+   Use deep LSTM siamese, deep Dilated CNN siamese, deep Self-Attention siamese, Doc2Vec and BERT to build deep semantic similarity models.
 -  **Summarization**
 
-   Using skip-thought with attention state-of-art to give precise unsupervised summarization.
+   Using state-of-the-art skip-thought and residual-network models with attention, LDA, LSA and Doc2Vec to give precise unsupervised summarization, with TextRank as the scoring algorithm.
 -  **Topic Modelling**
 
    Provide LDA2Vec, LDA, NMF and LSA interface for easy topic modelling with topics visualization.

diff --git a/accuracy/models-accuracy.ipynb b/accuracy/models-accuracy.ipynb
diff --git a/accuracy/models-accuracy.rst b/accuracy/models-accuracy.rst
@@ -824,6 +824,71 @@ BERT
 
    avg / total       0.88      0.87      0.86     84104
 
+Similarity
+----------
+
+Trained on 80% of the dataset, tested on the remaining 20%. All training
+sessions are stored in
+`session/similarity <https://github.com/huseinzol05/Malaya/tree/master/session/similarity>`__
+
+.. code:: ipython3
+
+    display(Image('similarity-accuracy.png', width=500))
+
+
+
+.. image:: models-accuracy_files/models-accuracy_58_0.png
+   :width: 500px
+
+
+bahdanau
+^^^^^^^^
+
+.. code:: text
+
+                precision    recall  f1-score   support
+
+   not similar       0.83      0.83      0.83     31524
+       similar       0.71      0.71      0.71     18476
+
+   avg / total       0.79      0.79      0.79     50000
+
+self-attention
+^^^^^^^^^^^^^^
+
+.. code:: text
+
+                precision    recall  f1-score   support
+
+   not similar       0.81      0.83      0.82     31524
+       similar       0.70      0.67      0.68     18476
+
+   avg / total       0.77      0.77      0.77     50000
+
+dilated-cnn
+^^^^^^^^^^^
+
+.. code:: text
+
+                precision    recall  f1-score   support
+
+   not similar       0.82      0.82      0.82     31524
+       similar       0.69      0.69      0.69     18476
+
+   avg / total       0.77      0.77      0.77     50000
+
+bert
+^^^^
+
+.. code:: text
+
+                precision    recall  f1-score   support
+
+   not similar       0.86      0.86      0.86     50757
+       similar       0.77      0.76      0.76     30010
+
+   avg / total       0.83      0.83      0.83     80767
+
 Dependency parsing
 ------------------
 
@@ -837,7 +902,7 @@ sessions stored in
 
 
 
-.. image:: models-accuracy_files/models-accuracy_58_0.png
+.. image:: models-accuracy_files/models-accuracy_64_0.png
    :width: 500px
 
 

diff --git a/accuracy/models-accuracy_files/models-accuracy_58_0.png b/accuracy/models-accuracy_files/models-accuracy_58_0.png
diff --git a/accuracy/models-accuracy_files/models-accuracy_64_0.png b/accuracy/models-accuracy_files/models-accuracy_64_0.png
diff --git a/accuracy/similarity-accuracy.png b/accuracy/similarity-accuracy.png
diff --git a/accuracy/similarity-template.js b/accuracy/similarity-template.js
@@ -0,0 +1,26 @@
+option = {
+    xAxis: {
+        type: 'category',
+        axisLabel: {
+            interval: 0,
+            rotate: 30
+        },
+        data: ['bahdanau','self-attention', 'dilated-cnn', 'BERT']
+    },
+    yAxis: {
+        type: 'value',
+        min:0.76,
+        max:0.83
+    },
+    backgroundColor:'rgb(252,252,252)',
+    series: [{
+        data: [0.79, 0.77, 0.77, 0.83],
+        type: 'bar',
+        label: {
+                normal: {
+                    show: true,
+                    position: 'top'
+                }
+            },
+    }]
+};
diff --git a/docs/Api.rst b/docs/Api.rst
@@ -198,3 +198,75 @@ malaya.word2vec
 
 .. autoclass:: malaya.word2vec.word2vec()
     :members:
+
+malaya._models._sklearn_model
+---------------------------------
+
+.. autoclass:: malaya._models._sklearn_model.CRF()
+    :members:
+
+.. autoclass:: malaya._models._sklearn_model.DEPENDENCY()
+    :members:
+
+.. autoclass:: malaya._models._sklearn_model.BINARY_XGB()
+    :members:
+
+.. autoclass:: malaya._models._sklearn_model.BINARY_BAYES()
+    :members:
+
+.. autoclass:: malaya._models._sklearn_model.MULTICLASS_XGB()
+    :members:
+
+.. autoclass:: malaya._models._sklearn_model.MULTICLASS_BAYES()
+    :members:
+
+.. autoclass:: malaya._models._sklearn_model.TOXIC()
+    :members:
+
+.. autoclass:: malaya._models._sklearn_model.LANGUAGE_DETECTION()
+    :members:
+
+malaya._models._tensorflow_model
+---------------------------------
+
+.. autoclass:: malaya._models._tensorflow_model.DEPENDENCY()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.TAGGING()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.BINARY_BERT()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.MULTICLASS_BERT()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.SIGMOID_BERT()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.SOFTMAX()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.BINARY_SOFTMAX()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.MULTICLASS_SOFTMAX()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.SIGMOID()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.DEEP_LANG()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.SPARSE_SOFTMAX()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.SPARSE_SIGMOID()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.SIAMESE()
+    :members:
+
+.. autoclass:: malaya._models._tensorflow_model.SIAMESE_BERT()
+    :members:
diff --git a/docs/README.rst b/docs/README.rst
@@ -75,13 +75,18 @@ Features
 -  **Spell Correction**
 
    Using local Malaysia NLP researches to auto-correct any bahasa words.
--  Stemmer
+-  **Stemmer**
+
+   Use a state-of-the-art character-level LSTM Seq2Seq model with attention to do Bahasa stemming.
 -  **Subjectivity Analysis**
 
    From fine-tuning BERT, Attention-Recurrent model, Sparse Tensorflow and Self-Attention to build deep subjectivity analysis models.
+-  **Similarity**
+
+   Use deep LSTM siamese, deep Dilated CNN siamese, deep Self-Attention siamese, Doc2Vec and BERT to build deep semantic similarity models.
 -  **Summarization**
 
-   Using skip-thought with attention state-of-art to give precise unsupervised summarization.
+   Using state-of-the-art skip-thought and residual-network models with attention, LDA, LSA and Doc2Vec to give precise unsupervised summarization, with TextRank as the scoring algorithm.
 -  **Topic Modelling**
 
    Provide LDA2Vec, LDA, NMF and LSA interface for easy topic modelling with topics visualization.

diff --git a/docs/conf.py b/docs/conf.py
@@ -76,6 +76,7 @@ def __getattr__(cls, name):
     'sklearn.neighbors',
     'pulp',
     'ftfy',
+    'networkx',
 ]
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
-Original file line number
+Diff line change
@@ Expand Up / @@ -76,6 +76,7 @@ def __getattr__(cls, name): @@
         'sklearn.neighbors',
         'pulp',
         'ftfy',
+        'networkx',
     ]
     sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
@@ Expand Down @@