From 2a7b85d07d0ec6981ab0f14971ec5aa93a556e58 Mon Sep 17 00:00:00 2001 From: husein zolkepli Date: Sun, 20 Jan 2019 20:50:56 +0800 Subject: [PATCH] release version 1.4, added dependency parsing --- README.rst | 3 + docs/Api.rst | 6 + docs/Cache.rst | 2 +- docs/Cluster.rst | 2 +- docs/Crawler.rst | 4 +- docs/Dataset.rst | 20 +- docs/Dependency.rst | 9 + docs/Emotion.rst | 2 +- docs/Entities.rst | 2 +- docs/Installation.rst | 4 +- docs/Language.rst | 2 +- docs/Normalizer.rst | 2 +- docs/Num2word.rst | 2 +- docs/Pos.rst | 2 +- docs/README.rst | 5 +- docs/Sentiment.rst | 2 +- docs/Spell.rst | 2 +- docs/Stack.rst | 2 +- docs/Stemmer.rst | 2 +- docs/Subjective.rst | 2 +- docs/Summarization.rst | 2 +- docs/Topic.rst | 2 +- docs/Topics.rst | 2 +- docs/Toxic.rst | 2 +- docs/Word2vec.rst | 2 +- docs/index.rst | 1 + docs/load-dependency.rst | 396 ++++++ .../load-dependency_10_0.svg | 186 +++ .../load-dependency_21_0.svg | 186 +++ .../load-dependency_27_0.svg | 186 +++ docs/load-entities.rst | 97 +- docs/load-pos.rst | 281 ++-- example/dependency/README.rst | 398 ++++++ example/dependency/load-dependency.ipynb | 1156 +++++++++++++++++ .../load-dependency_10_0.svg | 186 +++ .../load-dependency_21_0.svg | 186 +++ .../load-dependency_27_0.svg | 186 +++ example/entities/README.rst | 97 +- example/entities/load-entities.ipynb | 123 +- example/part-of-speech/README.rst | 281 ++-- example/part-of-speech/load-pos.ipynb | 311 +++-- malaya/__init__.py | 35 + malaya/_models/_sklearn_model.py | 98 +- malaya/_models/_tensorflow_model.py | 263 +++- malaya/_utils/_parse_dependency.py | 382 ++++++ malaya/_utils/_paths.py | 106 +- malaya/_utils/_tag_class.py | 12 +- malaya/dependency.py | 101 ++ malaya/pos.py | 6 +- malaya/stack.py | 29 +- malaya/summarize.py | 3 + malaya/texts/_text_functions.py | 21 +- malaya/texts/vectorizer.py | 43 + malaya/topic_model.py | 9 +- malaya/word2vec.py | 11 +- readme-pypi.rst | 4 +- setup-gpu.py | 5 +- setup.py | 5 +- 58 files changed, 5026 insertions(+), 451 
deletions(-) create mode 100644 docs/Dependency.rst create mode 100644 docs/load-dependency.rst create mode 100644 docs/load-dependency_files/load-dependency_10_0.svg create mode 100644 docs/load-dependency_files/load-dependency_21_0.svg create mode 100644 docs/load-dependency_files/load-dependency_27_0.svg create mode 100644 example/dependency/README.rst create mode 100644 example/dependency/load-dependency.ipynb create mode 100644 example/dependency/load-dependency_files/load-dependency_10_0.svg create mode 100644 example/dependency/load-dependency_files/load-dependency_21_0.svg create mode 100644 example/dependency/load-dependency_files/load-dependency_27_0.svg create mode 100644 malaya/_utils/_parse_dependency.py create mode 100644 malaya/dependency.py diff --git a/README.rst b/README.rst index 86b2bf70..d36bee75 100644 --- a/README.rst +++ b/README.rst @@ -63,6 +63,9 @@ Features - **Part-of-Speech Recognition** Latest state-of-art CRF deep learning models to do Naming Entity Recognition. +- **Dependency Parsing** + + Latest state-of-art CRF deep learning models to analyze the grammatical structure of a sentence, establishing relationships between words. - **Sentiment Analysis** From BERT, Fast-Text, Dynamic-Memory Network, Sparse Tensorflow, Attention Neural Network to build deep sentiment analysis models. diff --git a/docs/Api.rst b/docs/Api.rst index d9828c8a..1fc7be36 100644 --- a/docs/Api.rst +++ b/docs/Api.rst @@ -9,6 +9,12 @@ malaya .. automodule:: malaya :members: +malaya.dependency +------------------ + +.. automodule:: malaya.dependency + :members: + malaya.emotion ----------------- diff --git a/docs/Cache.rst b/docs/Cache.rst index b5a33900..ca0fd9f3 100644 --- a/docs/Cache.rst +++ b/docs/Cache.rst @@ -4,6 +4,6 @@ Malaya Cache .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. 
include:: load-cache.rst diff --git a/docs/Cluster.rst b/docs/Cluster.rst index 10fb628a..d145b726 100644 --- a/docs/Cluster.rst +++ b/docs/Cluster.rst @@ -4,6 +4,6 @@ Text Clustering .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: text-clustering.rst diff --git a/docs/Crawler.rst b/docs/Crawler.rst index f98ca425..88dffae6 100644 --- a/docs/Crawler.rst +++ b/docs/Crawler.rst @@ -7,13 +7,13 @@ From Source ----------- The crawler is actively developed on -`Github `__. +`Github `__. You need to clone the public repo: .. code:: bash - git clone https://github.com/devconx/malaya + git clone https://github.com/huseinzol05/malaya You need to install dependencies before able to use the crawler. diff --git a/docs/Dataset.rst b/docs/Dataset.rst index c1d80e95..4d2c6641 100644 --- a/docs/Dataset.rst +++ b/docs/Dataset.rst @@ -3,54 +3,54 @@ Dataset We want to make sure not just the code we open-sourced, but also goes to dataset, so everyone can validate. -You can check in `/dataset `__ for most of our open dataset. +You can check in `/dataset `__ for most of our open dataset. Sentiment Analysis ---------------------------------- -`/dataset/sentiment `__ +`/dataset/sentiment `__ Emotion Analysis ---------------------------------- -`/dataset/emotion `__ +`/dataset/emotion `__ Entities Recognition ---------------------------------- -`/dataset/entities `__ +`/dataset/entities `__ Part-Of-Speech Recognition ---------------------------------- -`/dataset/pos `__ +`/dataset/pos `__ Polarity ---------------------------------- -`/dataset/polarity `__ +`/dataset/polarity `__ We combined with sentiment analysis models. 
Subjectivity ---------------------------------- -`/dataset/subjectivity `__ +`/dataset/subjectivity `__ Stemmer ---------------------------------- -`/dataset/stemmer `__ +`/dataset/stemmer `__ Language Detection ---------------------------------- -`/dataset/language-detection `__ +`/dataset/language-detection `__ Dictionary ---------------------------------- -`/dataset/dictionary `__ +`/dataset/dictionary `__ Matbahasa ----------- diff --git a/docs/Dependency.rst b/docs/Dependency.rst new file mode 100644 index 00000000..03202f07 --- /dev/null +++ b/docs/Dependency.rst @@ -0,0 +1,9 @@ +Dependency Parsing +=================== + +.. note:: + + This tutorial is available as an IPython notebook + `here `_. + +.. include:: load-dependency.rst diff --git a/docs/Emotion.rst b/docs/Emotion.rst index fa4540f3..d721acd3 100644 --- a/docs/Emotion.rst +++ b/docs/Emotion.rst @@ -4,6 +4,6 @@ Emotion Analysis .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-emotion.rst diff --git a/docs/Entities.rst b/docs/Entities.rst index db1c3f53..fa096631 100644 --- a/docs/Entities.rst +++ b/docs/Entities.rst @@ -4,6 +4,6 @@ Entities Recognition .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-entities.rst diff --git a/docs/Installation.rst b/docs/Installation.rst index 38f298bc..77592169 100644 --- a/docs/Installation.rst +++ b/docs/Installation.rst @@ -25,13 +25,13 @@ From Source ----------- Malaya is actively developed on -`Github `__. +`Github `__. You can clone the public repo: .. code:: python - git clone https://github.com/devconx/malaya + git clone https://github.com/huseinzol05/malaya Once you have the source, you can install it into your site-packages with: diff --git a/docs/Language.rst b/docs/Language.rst index 98456f3f..29fddac8 100644 --- a/docs/Language.rst +++ b/docs/Language.rst @@ -4,6 +4,6 @@ Language Detection .. 
note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-language-detection.rst diff --git a/docs/Normalizer.rst b/docs/Normalizer.rst index a0e0f252..9442d4cb 100644 --- a/docs/Normalizer.rst +++ b/docs/Normalizer.rst @@ -4,6 +4,6 @@ Normalizer .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-normalizer.rst diff --git a/docs/Num2word.rst b/docs/Num2word.rst index 851fe401..d80a3ff7 100644 --- a/docs/Num2word.rst +++ b/docs/Num2word.rst @@ -4,6 +4,6 @@ Num2Word .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-num2word.rst diff --git a/docs/Pos.rst b/docs/Pos.rst index 29e34ee9..0c5bfd37 100644 --- a/docs/Pos.rst +++ b/docs/Pos.rst @@ -4,6 +4,6 @@ Part-of-Speech Recognition .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-pos.rst diff --git a/docs/README.rst b/docs/README.rst index 86b2bf70..e3aa1194 100644 --- a/docs/README.rst +++ b/docs/README.rst @@ -2,7 +2,7 @@

- logo + logo

@@ -63,6 +63,9 @@ Features - **Part-of-Speech Recognition** Latest state-of-art CRF deep learning models to do Naming Entity Recognition. +- **Dependency Parsing** + + Latest state-of-art CRF deep learning models to analyze the grammatical structure of a sentence, establishing relationships between words. - **Sentiment Analysis** From BERT, Fast-Text, Dynamic-Memory Network, Sparse Tensorflow, Attention Neural Network to build deep sentiment analysis models. diff --git a/docs/Sentiment.rst b/docs/Sentiment.rst index 0f2ead94..eba84dc7 100644 --- a/docs/Sentiment.rst +++ b/docs/Sentiment.rst @@ -4,6 +4,6 @@ Sentiment Analysis .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-sentiment.rst diff --git a/docs/Spell.rst b/docs/Spell.rst index 7ba8fe12..f61dd1e0 100644 --- a/docs/Spell.rst +++ b/docs/Spell.rst @@ -4,6 +4,6 @@ Speller .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-spell-correction.rst diff --git a/docs/Stack.rst b/docs/Stack.rst index 140aeabf..2966c639 100644 --- a/docs/Stack.rst +++ b/docs/Stack.rst @@ -4,6 +4,6 @@ Stacking .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-stack.rst diff --git a/docs/Stemmer.rst b/docs/Stemmer.rst index f8e4ccae..990ebbf4 100644 --- a/docs/Stemmer.rst +++ b/docs/Stemmer.rst @@ -4,6 +4,6 @@ Stemmer .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-stemmer.rst diff --git a/docs/Subjective.rst b/docs/Subjective.rst index 69cbdd47..be937c9f 100644 --- a/docs/Subjective.rst +++ b/docs/Subjective.rst @@ -4,6 +4,6 @@ Subjectivity Analysis .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. 
include:: load-subjectivity.rst diff --git a/docs/Summarization.rst b/docs/Summarization.rst index b60d85e3..6c3ead6e 100644 --- a/docs/Summarization.rst +++ b/docs/Summarization.rst @@ -4,6 +4,6 @@ Summarization .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-summarization.rst diff --git a/docs/Topic.rst b/docs/Topic.rst index e61da82a..cc9d5f13 100644 --- a/docs/Topic.rst +++ b/docs/Topic.rst @@ -4,6 +4,6 @@ Topic Modelling .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-topic-modelling.rst diff --git a/docs/Topics.rst b/docs/Topics.rst index f02c819b..e14ae6a7 100644 --- a/docs/Topics.rst +++ b/docs/Topics.rst @@ -4,6 +4,6 @@ Topics & Influencers Analysis .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-topics-influencers.rst diff --git a/docs/Toxic.rst b/docs/Toxic.rst index 2841cd83..1db0b3dd 100644 --- a/docs/Toxic.rst +++ b/docs/Toxic.rst @@ -4,6 +4,6 @@ Toxicity Analysis .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-toxic.rst diff --git a/docs/Word2vec.rst b/docs/Word2vec.rst index 58ae61ed..744152e4 100644 --- a/docs/Word2vec.rst +++ b/docs/Word2vec.rst @@ -4,6 +4,6 @@ Word2Vec .. note:: This tutorial is available as an IPython notebook - `here `_. + `here `_. .. include:: load-word2vec.rst diff --git a/docs/index.rst b/docs/index.rst index c1e9af83..313df988 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,7 @@ Contents: Translator-malaya Accuracy Cache + Dependency Emotion Entities Language diff --git a/docs/load-dependency.rst b/docs/load-dependency.rst new file mode 100644 index 00000000..7bbcf510 --- /dev/null +++ b/docs/load-dependency.rst @@ -0,0 +1,396 @@ + +.. code:: python + + %%time + import malaya + + +.. 
parsed-literal:: + + CPU times: user 13 s, sys: 1.5 s, total: 14.5 s + Wall time: 18.3 s + + +List available deep learning Dependency models +---------------------------------------------- + +.. code:: python + + malaya.dependency.available_deep_model() + + + + +.. parsed-literal:: + + ['concat', 'bahdanau', 'luong'] + + + +Describe supported dependencies +------------------------------- + +.. code:: python + + malaya.describe_dependency() + + +.. parsed-literal:: + + acl - clausal modifier of noun + advcl - adverbial clause modifier + advmod - adverbial modifier + amod - adjectival modifier + appos - appositional modifier + aux - auxiliary + case - case marking + ccomp - clausal complement + compound - compound + compound:plur - plural compound + conj - conjunct + cop - cop + csubj - clausal subject + dep - dependent + det - determiner + fixed - multi-word expression + flat - name + iobj - indirect object + mark - marker + nmod - nominal modifier + nsubj - nominal subject + obj - direct object + parataxis - parataxis + root - root + xcomp - open clausal complement + you can read more from https://universaldependencies.org/en/dep/xcomp.html + + +.. code:: python + + string = 'Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar sekiranya mengantuk ketika memandu.' + +Load CRF model +-------------- + +.. code:: python + + crf = malaya.dependency.crf() + tagging, indexing = crf.predict(string) + tagging, indexing + + + + +.. 
parsed-literal:: + + ([('Dr', 'det'), + ('Mahathir', 'nsubj'), + ('menasihati', 'root'), + ('mereka', 'obj'), + ('supaya', 'mark'), + ('berhenti', 'advcl'), + ('berehat', 'xcomp'), + ('dan', 'cc'), + ('tidur', 'conj'), + ('sebentar', 'case'), + ('sekiranya', 'nmod'), + ('mengantuk', 'acl'), + ('ketika', 'mark'), + ('memandu', 'advcl')], + [('Dr', 2), + ('Mahathir', 3), + ('menasihati', 0), + ('mereka', 4), + ('supaya', 9), + ('berhenti', 9), + ('berehat', 9), + ('dan', 9), + ('tidur', 7), + ('sebentar', 7), + ('sekiranya', 7), + ('mengantuk', 1), + ('ketika', 3), + ('memandu', 3)]) + + + +Visualize graph for dependency output +------------------------------------- + +**Make sure you have already installed graphviz.** + +.. code:: python + + graph = malaya.dependency.dependency_graph(tagging, indexing) + graph.to_graphvis() + + + + +.. image:: load-dependency_files/load-dependency_10_0.svg + + + +Print important features from CRF model +--------------------------------------- + +.. code:: python + + crf.print_features(10) + + +.. parsed-literal:: + + Top-10 tagging positive: + 8.072296 det word:berbagai + 7.858845 det word:para + 7.857109 det word:tersebut + 7.465632 advmod word:memang + 6.809172 nummod is_numeric + 6.232288 amod word:menakjubkan + 6.188577 advmod word:terutama + 6.067059 case word:selama + 5.723111 advmod word:lagi + 5.675961 case word:tentang + + Top-10 tagging negative: + -2.672044 nsubj next_word:memang + -2.690972 root prefix-3:sal + -2.708229 punct prev_word-prefix-1:9 + -2.710053 obl suffix-3:ena + -2.711398 conj suffix-3:aat + -2.758406 flat prefix-2:ya + -2.848409 nsubj next_word:berisi + -3.400050 compound:plur suffix-2:ya + -3.619957 case next_word:pernyataan + -5.017675 flat is_first + + +Print important tagging transitions from CRF Model +-------------------------------------------------- + +.. code:: python + + crf.print_transitions_tag(10) + + +.. 
parsed-literal:: + + Top-10 likely tagging transitions: + case -> obl 5.106777 + case -> nmod 4.338968 + cc -> conj 3.375610 + flat -> flat 3.347966 + case -> xcomp 2.899393 + appos -> flat 2.632795 + mark -> advcl 2.373561 + nmod -> flat 2.247949 + conj -> flat 2.239869 + nummod -> obl 2.214665 + + Top-10 unlikely tagging transitions: + root -> conj -2.243008 + xcomp -> parataxis -2.250619 + case -> appos -2.273873 + case -> obj -2.506688 + case -> flat -2.524687 + root -> parataxis -2.581892 + mark -> flat -2.664428 + cop -> obj -3.155705 + case -> fixed -3.301385 + root -> root -4.324076 + + +Print important indexing transitions from CRF Model +--------------------------------------------------- + +.. code:: python + + crf.print_transitions_index(10) + + +.. parsed-literal:: + + Top-10 likely indexing transitions: + 78 -> 78 5.050351 + 1 -> 1 5.044279 + 137 -> 137 5.014911 + 90 -> 90 4.912735 + 63 -> 63 4.724542 + 95 -> 95 4.692040 + 107 -> 108 4.620310 + 92 -> 93 4.605423 + 94 -> 98 4.568649 + 96 -> 99 4.556339 + + Top-10 unlikely indexing transitions: + 0 -> 43 -2.899807 + 0 -> 44 -2.904968 + 45 -> 3 -3.004463 + 33 -> 1 -3.115820 + 0 -> 33 -3.147339 + 3 -> 38 -3.170745 + 0 -> 40 -3.220509 + 0 -> 37 -3.272783 + 0 -> 38 -3.425021 + 0 -> 39 -3.439639 + + +Load deep learning models +------------------------- + +.. code:: python + + for i in malaya.dependency.available_deep_model(): + print('Testing %s model'%(i)) + model = malaya.dependency.deep_model(i) + print(model.predict(string)) + print() + + +.. 
parsed-literal:: + + Testing concat model + ([('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'root'), ('mereka', 'obj'), ('supaya', 'mark'), ('berhenti', 'advcl'), ('berehat', 'obj'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'obj'), ('mengantuk', 'advcl'), ('ketika', 'mark'), ('memandu', 'advcl')], [('Dr', 3), ('Mahathir', 1), ('menasihati', 0), ('mereka', 3), ('supaya', 4), ('berhenti', 4), ('berehat', 6), ('dan', 9), ('tidur', 7), ('sebentar', 11), ('sekiranya', 9), ('mengantuk', 9), ('ketika', 13), ('memandu', 12)]) + + Testing bahdanau model + ([('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'root'), ('mereka', 'det'), ('supaya', 'mark'), ('berhenti', 'advcl'), ('berehat', 'compound'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'nsubj'), ('mengantuk', 'advcl'), ('ketika', 'mark'), ('memandu', 'advcl')], [('Dr', 3), ('Mahathir', 1), ('menasihati', 0), ('mereka', 3), ('supaya', 6), ('berhenti', 9), ('berehat', 6), ('dan', 9), ('tidur', 6), ('sebentar', 9), ('sekiranya', 12), ('mengantuk', 9), ('ketika', 13), ('memandu', 3)]) + + Testing luong model + ([('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'amod'), ('mereka', 'det'), ('supaya', 'mark'), ('berhenti', 'advcl'), ('berehat', 'obj'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'nsubj'), ('mengantuk', 'compound'), ('ketika', 'mark'), ('memandu', 'advcl')], [('Dr', 3), ('Mahathir', 1), ('menasihati', 1), ('mereka', 3), ('supaya', 3), ('berhenti', 3), ('berehat', 6), ('dan', 10), ('tidur', 7), ('sebentar', 10), ('sekiranya', 10), ('mengantuk', 11), ('ketika', 11), ('memandu', 12)]) + + + +Print important features from deep learning model +------------------------------------------------- + +.. code:: python + + bahdanau = malaya.dependency.deep_model('bahdanau') + bahdanau.print_features(10) + + +.. 
parsed-literal:: + + Top-10 positive: + Balaikota: 6.001306 + jemaatnya: 5.659410 + esai: 5.420834 + menyulitkan: 5.298349 + Khairun: 5.271856 + Scandal: 5.135361 + penolakan: 5.070021 + gundiknya: 5.057362 + gagasan: 4.977351 + Banyuputih: 4.972396 + + Top-10 negative: + Carolina: -5.638381 + kontestan: -5.565759 + Dibalik: -5.185034 + Rotten: -5.032556 + 1982: -4.824227 + ditempatkan: -4.771740 + Player: -4.723217 + Nuh: -4.664867 + rating: -4.659817 + tello: -4.614172 + + +.. code:: python + + tagging, indexing = bahdanau.predict(string) + malaya.dependency.dependency_graph(tagging, indexing).to_graphvis() + + + + +.. image:: load-dependency_files/load-dependency_21_0.svg + + + +Print important tagging transitions from deep learning model +------------------------------------------------------------ + +.. code:: python + + bahdanau.print_transitions_tag(10) + + +.. parsed-literal:: + + Top-10 likely transitions: + cc -> conj: 1.361513 + mark -> advcl: 1.160463 + compound:plur -> amod: 1.159281 + mark -> fixed: 0.990238 + obj -> compound: 0.971839 + flat -> flat: 0.927018 + case -> obl: 0.926517 + cop -> det: 0.902245 + nsubj -> dep: 0.844691 + nsubj:pass -> dep: 0.837701 + + Top-10 unlikely transitions: + case -> obj: -2.866276 + root -> root: -2.830104 + case -> parataxis: -2.372282 + nsubj:pass -> nsubj:pass: -2.307715 + punct -> csubj: -2.298815 + compound:plur -> fixed: -2.215350 + parataxis -> advcl: -2.196172 + nsubj:pass -> compound:plur: -2.159937 + mark -> xcomp: -2.143510 + csubj -> advmod: -2.140114 + + +Print important indexing transitions from deep learning model +------------------------------------------------------------- + +.. code:: python + + bahdanau.print_transitions_index(10) + + +.. 
parsed-literal:: + + Top-10 likely transitions: + 107 -> 108: 1.033257 + 94 -> 95: 1.014054 + 126 -> 127: 1.012626 + 62 -> 63: 1.006339 + 108 -> 109: 0.991839 + 34 -> 32: 0.978045 + 93 -> 94: 0.942446 + 125 -> 126: 0.913999 + 52 -> 53: 0.873350 + 100 -> 103: 0.849339 + + Top-10 unlikely transitions: + 46 -> 45: -3.602909 + 50 -> 44: -3.443869 + 46 -> 39: -3.094924 + 63 -> 62: -3.004683 + 50 -> 58: -2.873691 + 44 -> 32: -2.860855 + 35 -> 13: -2.854243 + 50 -> 40: -2.849881 + 45 -> 32: -2.844934 + 64 -> 63: -2.841505 + + +Voting stack model +------------------ + +.. code:: python + + concat = malaya.dependency.deep_model('concat') + bahdanau = malaya.dependency.deep_model('bahdanau') + luong = malaya.dependency.deep_model('luong') + tagging, indexing = malaya.stack.voting_stack([concat, bahdanau, luong], string) + malaya.dependency.dependency_graph(tagging, indexing).to_graphvis() + + + + +.. image:: load-dependency_files/load-dependency_27_0.svg diff --git a/docs/load-dependency_files/load-dependency_10_0.svg b/docs/load-dependency_files/load-dependency_10_0.svg new file mode 100644 index 00000000..1e7cc10c --- /dev/null +++ b/docs/load-dependency_files/load-dependency_10_0.svg @@ -0,0 +1,186 @@ + + + + + + +G + + + +0 +0 (None) + + + +3 +3 (menasihati) + + + +0->3 + + +root + + + +2 +2 (Mahathir) + + + +3->2 + + +nsubj + + + +13 +13 (ketika) + + + +3->13 + + +mark + + + +14 +14 (memandu) + + + +3->14 + + +advcl + + + +1 +1 (Dr) + + + +12 +12 (mengantuk) + + + +1->12 + + +acl + + + +2->1 + + +det + + + +4 +4 (mereka) + + + +4->4 + + +obj + + + +5 +5 (supaya) + + + +6 +6 (berhenti) + + + +7 +7 (berehat) + + + +9 +9 (tidur) + + + +7->9 + + +conj + + + +10 +10 (sebentar) + + + +7->10 + + +case + + + +11 +11 (sekiranya) + + + +7->11 + + +nmod + + + +9->5 + + +mark + + + +9->6 + + +advcl + + + +9->7 + + +xcomp + + + +8 +8 (dan) + + + +9->8 + + +cc + + + diff --git a/docs/load-dependency_files/load-dependency_21_0.svg b/docs/load-dependency_files/load-dependency_21_0.svg 
new file mode 100644 index 00000000..7492e6ee --- /dev/null +++ b/docs/load-dependency_files/load-dependency_21_0.svg @@ -0,0 +1,186 @@ + + + + + + +G + + + +0 +0 (None) + + + +3 +3 (menasihati) + + + +0->3 + + +root + + + +1 +1 (Dr) + + + +3->1 + + +nsubj + + + +4 +4 (mereka) + + + +3->4 + + +det + + + +14 +14 (memandu) + + + +3->14 + + +advcl + + + +2 +2 (Mahathir) + + + +1->2 + + +flat + + + +5 +5 (supaya) + + + +6 +6 (berhenti) + + + +6->5 + + +mark + + + +7 +7 (berehat) + + + +6->7 + + +amod + + + +9 +9 (tidur) + + + +6->9 + + +conj + + + +8 +8 (dan) + + + +9->8 + + +cc + + + +10 +10 (sebentar) + + + +10->6 + + +advcl + + + +12 +12 (mengantuk) + + + +10->12 + + +advcl + + + +11 +11 (sekiranya) + + + +12->11 + + +nsubj + + + +11->10 + + +advmod + + + +13 +13 (ketika) + + + +13->13 + + +mark + + + diff --git a/docs/load-dependency_files/load-dependency_27_0.svg b/docs/load-dependency_files/load-dependency_27_0.svg new file mode 100644 index 00000000..79487421 --- /dev/null +++ b/docs/load-dependency_files/load-dependency_27_0.svg @@ -0,0 +1,186 @@ + + + + + + +G + + + +0 +0 (None) + + + +3 +3 (menasihati) + + + +0->3 + + +root + + + +1 +1 (Dr) + + + +3->1 + + +nsubj + + + +4 +4 (mereka) + + + +3->4 + + +det + + + +6 +6 (berhenti) + + + +3->6 + + +advcl + + + +2 +2 (Mahathir) + + + +1->2 + + +flat + + + +5 +5 (supaya) + + + +6->5 + + +mark + + + +7 +7 (berehat) + + + +6->7 + + +obj + + + +9 +9 (tidur) + + + +7->9 + + +conj + + + +8 +8 (dan) + + + +9->8 + + +cc + + + +11 +11 (sekiranya) + + + +9->11 + + +nsubj + + + +12 +12 (mengantuk) + + + +9->12 + + +ccomp + + + +10 +10 (sebentar) + + + +11->10 + + +advmod + + + +14 +14 (memandu) + + + +12->14 + + +advcl + + + +13 +13 (ketika) + + + +13->13 + + +mark + + + diff --git a/docs/load-entities.rst b/docs/load-entities.rst index dd86f531..2463cc04 100644 --- a/docs/load-entities.rst +++ b/docs/load-entities.rst @@ -7,8 +7,8 @@ .. 
parsed-literal:: - CPU times: user 12.3 s, sys: 837 ms, total: 13.1 s - Wall time: 14.1 s + CPU times: user 12.8 s, sys: 1.5 s, total: 14.3 s + Wall time: 18.8 s List available deep learning NER models @@ -125,7 +125,7 @@ Load CRF model Print important features from CRF model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +--------------------------------------- .. code:: python @@ -160,7 +160,7 @@ Print important features from CRF model Print important transitions from CRF Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------------------------ .. code:: python @@ -209,21 +209,92 @@ Load deep learning models .. parsed-literal:: Testing concat model - [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'organization'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'location'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), 
('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'location'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'OTHER'), ('keselamatan', 'OTHER'), ('jalan', 'location'), ('raya', 'organization'), ('jkjr', 'event'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] Testing bahdanau model - [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'person'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'organization'), ('kepada', 'OTHER'), ('orang', 'organization'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), 
('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'organization'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'organization'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] Testing luong model - [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 
'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'person'), ('pengangkutan', 'person'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'person'), ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'person'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'person'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'OTHER'), ('keselamatan', 'OTHER'), ('jalan', 'law'), ('raya', 'law'), ('jkjr', 'law'), ('itu', 'OTHER'), ('dr', 
'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] Testing entity-network model - [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'OTHER'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'OTHER'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'time'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'organization'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'OTHER'), ('depan', 'OTHER'), ('perdana', 'OTHER'), ('menteri', 'OTHER'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 
'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'organization'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] Testing attention model - [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'OTHER'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'person'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'location'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), 
('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'event'), ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 'OTHER'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'person'), ('ramai', 'person'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'location'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'OTHER'), ('keselamatan', 'OTHER'), ('jalan', 'location'), ('raya', 'location'), ('jkjr', 'location'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + + + +Print important features from deep learning model +------------------------------------------------- + +.. code:: python + + bahdanau = malaya.entity.deep_model('bahdanau') + bahdanau.print_features(10) + + +.. 
parsed-literal:: + + Top-10 positive: + ton: 4.945406 + dollar: 4.345774 + disertai: 4.257772 + menjejaskan: 4.252921 + sesaat: 4.082481 + mata: 4.060701 + abul: 4.024586 + ruang: 3.983563 + orator: 3.899390 + universal: 3.866645 + + Top-10 negative: + abah: -4.691194 + raju: -4.370757 + dipengerusikan: -4.142600 + rs: -4.013050 + bacaan: -4.001595 + indonesia-malaysia: -3.921156 + nacp: -3.749232 + memprediksi: -3.659459 + ruhut: -3.620089 + pengesahan: -3.618848 +Print important transitions from deep learning model +---------------------------------------------------- + +.. code:: python + + bahdanau.print_transitions(10) + + +.. parsed-literal:: + + Top-10 likely transitions: + event -> event: 0.810878 + OTHER -> OTHER: 0.626205 + PAD -> OTHER: 0.519626 + PAD -> event: 0.512354 + law -> law: 0.460971 + person -> law: 0.448240 + person -> event: 0.407665 + location -> event: 0.402317 + organization -> PAD: 0.402057 + time -> person: 0.342275 + + Top-10 unlikely transitions: + person -> organization: -0.914907 + law -> event: -0.843547 + event -> law: -0.829639 + organization -> person: -0.810431 + time -> quantity: -0.783691 + person -> location: -0.712586 + quantity -> law: -0.663559 + law -> time: -0.656724 + quantity -> time: -0.640747 + organization -> quantity: -0.615018 + Voting stack model ------------------ @@ -245,8 +316,8 @@ Voting stack model ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), - ('minggu', 'time'), - ('depan', 'time'), + ('minggu', 'OTHER'), + ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), @@ -271,7 +342,7 @@ Voting stack model ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), - ('halaman', 'location'), + ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), @@ -281,7 +352,7 @@ Voting stack model ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), - ('jkjr', 'organization'), + ('jkjr', 'person'), 
('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), diff --git a/docs/load-pos.rst b/docs/load-pos.rst index 1834e4ba..6012c109 100644 --- a/docs/load-pos.rst +++ b/docs/load-pos.rst @@ -1,8 +1,16 @@ .. code:: python + %%time import malaya + +.. parsed-literal:: + + CPU times: user 14.5 s, sys: 1.57 s, total: 16.1 s + Wall time: 21.3 s + + List available deep learning POS models --------------------------------------- @@ -66,26 +74,26 @@ Load CRF Model .. parsed-literal:: - [('kuala', 'PROPN'), - ('lumpur', 'PROPN'), - ('sempena', 'PROPN'), + [('Kuala', 'PROPN'), + ('Lumpur', 'PROPN'), + ('Sempena', 'SCONJ'), ('sambutan', 'NOUN'), - ('aidilfitri', 'NOUN'), + ('Aidilfitri', 'PROPN'), ('minggu', 'NOUN'), - ('depan', 'ADJ'), - ('perdana', 'PROPN'), - ('menteri', 'PROPN'), - ('tun', 'PROPN'), - ('dr', 'PROPN'), - ('mahathir', 'PROPN'), - ('mohamad', 'PROPN'), + ('depan', 'ADP'), + ('Perdana', 'PROPN'), + ('Menteri', 'PROPN'), + ('Tun', 'PROPN'), + ('Dr', 'PROPN'), + ('Mahathir', 'PROPN'), + ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), - ('menteri', 'VERB'), - ('pengangkutan', 'PROPN'), - ('anthony', 'PROPN'), - ('loke', 'PROPN'), - ('siew', 'PROPN'), - ('fook', 'PROPN'), + ('Menteri', 'PROPN'), + ('Pengangkutan', 'PROPN'), + ('Anthony', 'PROPN'), + ('Loke', 'PROPN'), + ('Siew', 'PROPN'), + ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), @@ -99,18 +107,18 @@ Load CRF Model ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), - ('dalam', 'ADP'), + ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), - ('jabatan', 'NOUN'), - ('keselamatan', 'PROPN'), - ('jalan', 'PROPN'), - ('raya', 'PROPN'), - ('jkjr', 'PROPN'), + ('Jabatan', 'PROPN'), + ('Keselamatan', 'PROPN'), + ('Jalan', 'PROPN'), + ('Raya', 'PROPN'), + ('Jkjr', 'PROPN'), ('itu', 'DET'), - ('dr', 'PROPN'), - ('mahathir', 'PROPN'), + ('Dr', 'PROPN'), + ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), @@ 
-121,7 +129,7 @@ Load CRF Model ('sebentar', 'ADP'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), - ('ketika', 'SCONJ'), + ('ketika', 'ADV'), ('memandu', 'VERB')] @@ -137,28 +145,28 @@ Print important features CRF model .. parsed-literal:: Top-10 positive: - 16.307872 DET word:tersebut - 15.868179 DET word:para - 15.590679 VERB word:percaya - 15.520492 ADP word:dari - 15.296975 DET word:berbagai - 14.691924 ADJ word:menakjubkan - 14.609917 ADJ word:menyejukkan - 14.503045 PRON word:kapan - 14.319357 DET word:ini - 14.267956 ADV word:pernah + 16.443463 DET word:para + 15.494273 DET word:berbagai + 14.856205 DET word:tersebut + 14.426293 ADJ word:menakjubkan + 14.319714 ADV word:memang + 14.158206 ADP word:tentang + 13.907366 VERB word:percaya + 13.635634 VERB word:integrasi + 13.630582 ADP word:dengan + 13.562358 ADV word:menurutnya Top-10 negative: - -7.217718 PROPN word:bunga - -7.258999 VERB word:memuaskan - -7.498110 ADP prev_word:pernah - -7.523901 ADV next_word-suffix-3:nai - -7.874955 NOUN prev_word-prefix-3:arw - -7.921689 NOUN suffix-2:ke - -8.049832 ADJ prev_word:sunda - -8.210202 PROPN prefix-3:ora - -8.524420 NUM prev_word:perang - -10.346546 CCONJ prev_word-suffix-3:rja + -6.663068 PROPN prefix-2:be + -6.714450 ADV next_word:menyatakan + -6.862083 PROPN next_word:Jepang + -7.183600 PROPN suffix-3:pun + -7.264241 ADV next_word-suffix-3:nai + -7.676069 VERB word:memuaskan + -7.961231 ADP prev_word:pernah + -8.006671 NOUN suffix-2:ke + -8.135974 ADP prev_word-prefix-3:pal + -8.173493 PROPN suffix-3:nya Print important transitions CRF model @@ -172,28 +180,28 @@ Print important transitions CRF model .. 
parsed-literal:: Top-10 likely transitions: - PROPN -> PROPN 5.529614 - DET -> DET 4.492123 - NOUN -> NOUN 2.600533 - ADJ -> ADJ 2.276762 - CCONJ -> CCONJ 1.888801 - CCONJ -> SCONJ 1.855106 - NOUN -> ADJ 1.729610 - SCONJ -> CCONJ 1.598273 - NUM -> NUM 1.475505 - ADV -> VERB 1.442607 + PROPN -> PROPN 5.767666 + NOUN -> NOUN 4.291842 + DET -> DET 3.723729 + NOUN -> PROPN 3.035784 + CCONJ -> CCONJ 2.545162 + X -> X 2.476296 + ADP -> NOUN 2.324735 + ADJ -> ADJ 2.285807 + NOUN -> ADJ 2.258407 + ADP -> PROPN 2.181474 Top-10 unlikely transitions: - SCONJ -> AUX -3.559017 - X -> SCONJ -3.566058 - SYM -> ADJ -3.720358 - PART -> ADP -3.744172 - X -> CCONJ -4.270577 - PART -> PART -4.543812 - ADV -> X -4.809254 - ADP -> SCONJ -5.157816 - ADP -> CCONJ -5.455725 - ADP -> SYM -6.841944 + SCONJ -> AUX -3.341014 + PART -> NUM -3.406289 + SCONJ -> ADJ -3.447362 + SYM -> ADV -3.468094 + SYM -> ADJ -3.597291 + AUX -> NUM -3.657861 + PART -> PART -4.059430 + X -> CCONJ -4.929272 + ADP -> SCONJ -4.960199 + ADP -> CCONJ -6.236844 Load deep learning models @@ -211,22 +219,93 @@ Load deep learning models .. 
parsed-literal:: Testing concat model - [('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'NOUN'), ('aidilfitri', 'PROPN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('perdana', 'ADJ'), ('menteri', 'NOUN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'NOUN'), ('pengangkutan', 'PROPN'), ('anthony', 'NOUN'), ('loke', 'NOUN'), ('siew', 'PROPN'), ('fook', 'PROPN'), ('menitipkan', 'PROPN'), ('pesanan', 'ADV'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'NOUN'), ('jalan', 'PROPN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('menasihati', 'NOUN'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'PROPN'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'ADJ'), ('sekiranya', 'NOUN'), ('mengantuk', 'PROPN'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] + [('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'NOUN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'NOUN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'NUM'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 
'NOUN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'NOUN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] Testing bahdanau model - [('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'NOUN'), ('aidilfitri', 'PROPN'), ('minggu', 'PROPN'), ('depan', 'ADJ'), ('perdana', 'ADJ'), ('menteri', 'NOUN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'PROPN'), ('pengangkutan', 'PROPN'), ('anthony', 'PROPN'), ('loke', 'PROPN'), ('siew', 'PROPN'), ('fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'ADV'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'PROPN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'NOUN'), ('jalan', 'PROPN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'PROPN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'PROPN'), ('mengantuk', 'PROPN'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] + [('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'PROPN'), ('minggu', 'VERB'), ('depan', 'CCONJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), 
('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 'NOUN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'NOUN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] Testing luong model - [('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'PROPN'), ('aidilfitri', 'PROPN'), ('minggu', 'PROPN'), ('depan', 'ADJ'), ('perdana', 'PROPN'), ('menteri', 'PROPN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'NOUN'), ('pengangkutan', 'NOUN'), ('anthony', 'PROPN'), ('loke', 'PROPN'), ('siew', 'PROPN'), ('fook', 'PROPN'), ('menitipkan', 'PROPN'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'NOUN'), ('jalan', 'NOUN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('menasihati', 'PROPN'), ('mereka', 'PRON'), ('supaya', 'ADV'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'NOUN'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] + [('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 
'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'PROPN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'VERB'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'PROPN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'PROPN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] Testing entity-network model - [('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'PROPN'), ('aidilfitri', 'PROPN'), ('minggu', 'PROPN'), ('depan', 'PROPN'), ('perdana', 'PROPN'), ('menteri', 'PROPN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'PROPN'), ('pengangkutan', 'NOUN'), ('anthony', 'NOUN'), ('loke', 'NOUN'), ('siew', 'VERB'), ('fook', 'NOUN'), ('menitipkan', 'NOUN'), ('pesanan', 'VERB'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'PROPN'), ('jalan', 'PROPN'), ('raya', 'PROPN'), ('jkjr', 
'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'VERB'), ('menasihati', 'PROPN'), ('mereka', 'PRON'), ('supaya', 'ADV'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'ADJ'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] + [('Kuala', 'NUM'), ('Lumpur', 'NUM'), ('Sempena', 'NUM'), ('sambutan', 'NUM'), ('Aidilfitri', 'SYM'), ('minggu', 'NOUN'), ('depan', 'NOUN'), ('Perdana', 'NUM'), ('Menteri', 'NUM'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'NUM'), ('dan', 'CCONJ'), ('Menteri', 'NUM'), ('Pengangkutan', 'NUM'), ('Anthony', 'NUM'), ('Loke', 'NUM'), ('Siew', 'NUM'), ('Fook', 'NUM'), ('menitipkan', 'NUM'), ('pesanan', 'SYM'), ('khas', 'PROPN'), ('kepada', 'PROPN'), ('orang', 'PROPN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'VERB'), ('pulang', 'PROPN'), ('ke', 'PROPN'), ('kampung', 'VERB'), ('halaman', 'NUM'), ('masing-masing', 'PROPN'), ('Dalam', 'PROPN'), ('video', 'PROPN'), ('pendek', 'PROPN'), ('terbitan', 'NUM'), ('Jabatan', 'NOUN'), ('Keselamatan', 'NUM'), ('Jalan', 'NUM'), ('Raya', 'NUM'), ('Jkjr', 'NUM'), ('itu', 'NUM'), ('Dr', 'NUM'), ('Mahathir', 'NUM'), ('menasihati', 'NUM'), ('mereka', 'NOUN'), ('supaya', 'NOUN'), ('berhenti', 'ADJ'), ('berehat', 'ADJ'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'NOUN'), ('sekiranya', 'PROPN'), ('mengantuk', 'PROPN'), ('ketika', 'PROPN'), ('memandu', 'PROPN')] Testing attention model - [('kuala', 'X'), ('lumpur', 'DET'), ('sempena', 'X'), ('sambutan', 'DET'), ('aidilfitri', 'X'), ('minggu', 'DET'), ('depan', 'X'), ('perdana', 'DET'), ('menteri', 'X'), ('tun', 'DET'), ('dr', 'X'), ('mahathir', 'DET'), ('mohamad', 'X'), ('dan', 'DET'), ('menteri', 'X'), ('pengangkutan', 'DET'), ('anthony', 'X'), ('loke', 'DET'), ('siew', 'X'), ('fook', 'DET'), ('menitipkan', 'X'), ('pesanan', 'DET'), ('khas', 'X'), ('kepada', 'DET'), ('orang', 'X'), ('ramai', 'DET'), ('yang', 'X'), ('mahu', 'DET'), 
('pulang', 'X'), ('ke', 'DET'), ('kampung', 'X'), ('halaman', 'DET'), ('masing-masing', 'X'), ('dalam', 'DET'), ('video', 'X'), ('pendek', 'DET'), ('terbitan', 'X'), ('jabatan', 'DET'), ('keselamatan', 'X'), ('jalan', 'DET'), ('raya', 'X'), ('jkjr', 'DET'), ('itu', 'X'), ('dr', 'DET'), ('mahathir', 'X'), ('menasihati', 'DET'), ('mereka', 'X'), ('supaya', 'DET'), ('berhenti', 'X'), ('berehat', 'DET'), ('dan', 'X'), ('tidur', 'DET'), ('sebentar', 'X'), ('sekiranya', 'DET'), ('mengantuk', 'X'), ('ketika', 'DET'), ('memandu', 'VERB')] + [('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'PROPN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'VERB'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'PROPN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'CCONJ'), ('memandu', 'VERB')] +Print important features from deep learning model +------------------------------------------------- + +.. code:: python + + bahdanau = malaya.pos.deep_model('bahdanau') + bahdanau.print_features(10) + + +.. 
parsed-literal:: + + Top-10 positive: + 1971: 4.942553 + Puisi: 4.754801 + 27: 4.659504 + buahan: 4.551769 + kaisarnya: 4.503439 + Kedua: 4.459490 + Times: 4.378673 + perlengkapan: 4.342615 + kelautan: 4.273527 + Persija: 4.260429 + + Top-10 negative: + Sakova: -5.102705 + engkau: -5.000618 + Cin: -4.962496 + bermesin: -4.823804 + Husm: -4.719638 + saatnya: -4.693280 + Vireta: -4.615777 + menjamu: -4.589007 + Aff: -4.437630 + dilahirkan: -4.422080 + + +Print important transitions from deep learning model +---------------------------------------------------- + +.. code:: python + + bahdanau.print_transitions(10) + + +.. parsed-literal:: + + Top-10 likely transitions: + SCONJ -> CCONJ: 0.688627 + SCONJ -> PRON: 0.539603 + ADV -> NUM: 0.517046 + PROPN -> PART: 0.479875 + ADP -> DET: 0.470052 + AUX -> ADV: 0.424240 + PRON -> NUM: 0.420834 + PAD -> AUX: 0.415958 + NUM -> ADV: 0.401860 + PART -> SYM: 0.395167 + + Top-10 unlikely transitions: + ADP -> CCONJ: -0.791846 + DET -> X: -0.675577 + SCONJ -> SCONJ: -0.665004 + VERB -> VERB: -0.646812 + PART -> NUM: -0.644018 + CCONJ -> CCONJ: -0.590792 + AUX -> NUM: -0.579523 + ADV -> SCONJ: -0.569171 + NUM -> VERB: -0.568291 + PRON -> SYM: -0.563159 + + Voting stack model ------------------ @@ -242,27 +321,27 @@ Voting stack model .. 
parsed-literal:: - [('kuala', 'PROPN'), - ('lumpur', 'PROPN'), - ('sempena', 'PROPN'), + [('Kuala', 'PROPN'), + ('Lumpur', 'PROPN'), + ('Sempena', 'NUM'), ('sambutan', 'NOUN'), - ('aidilfitri', 'PROPN'), - ('minggu', 'PROPN'), - ('depan', 'ADJ'), - ('perdana', 'PROPN'), - ('menteri', 'PROPN'), - ('tun', 'PROPN'), - ('dr', 'PROPN'), - ('mahathir', 'PROPN'), - ('mohamad', 'PROPN'), + ('Aidilfitri', 'PROPN'), + ('minggu', 'NOUN'), + ('depan', 'ADP'), + ('Perdana', 'PROPN'), + ('Menteri', 'PROPN'), + ('Tun', 'PROPN'), + ('Dr', 'PROPN'), + ('Mahathir', 'PROPN'), + ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), - ('menteri', 'NOUN'), - ('pengangkutan', 'PROPN'), - ('anthony', 'PROPN'), - ('loke', 'PROPN'), - ('siew', 'PROPN'), - ('fook', 'NOUN'), - ('menitipkan', 'PROPN'), + ('Menteri', 'PROPN'), + ('Pengangkutan', 'PROPN'), + ('Anthony', 'PROPN'), + ('Loke', 'PROPN'), + ('Siew', 'PROPN'), + ('Fook', 'PROPN'), + ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), @@ -275,18 +354,18 @@ Voting stack model ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), - ('dalam', 'ADP'), + ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), - ('jabatan', 'NOUN'), - ('keselamatan', 'PROPN'), - ('jalan', 'PROPN'), - ('raya', 'PROPN'), - ('jkjr', 'PROPN'), + ('Jabatan', 'NOUN'), + ('Keselamatan', 'NUM'), + ('Jalan', 'PROPN'), + ('Raya', 'PROPN'), + ('Jkjr', 'NUM'), ('itu', 'DET'), - ('dr', 'PROPN'), - ('mahathir', 'PROPN'), + ('Dr', 'PROPN'), + ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), @@ -294,8 +373,8 @@ Voting stack model ('berehat', 'VERB'), ('dan', 'CCONJ'), ('tidur', 'VERB'), - ('sebentar', 'ADV'), + ('sebentar', 'ADP'), ('sekiranya', 'NOUN'), - ('mengantuk', 'PROPN'), - ('ketika', 'SCONJ'), + ('mengantuk', 'NUM'), + ('ketika', 'NUM'), ('memandu', 'VERB')] diff --git a/example/dependency/README.rst b/example/dependency/README.rst new file mode 100644 index 
00000000..168d1c3e --- /dev/null +++ b/example/dependency/README.rst @@ -0,0 +1,398 @@ + +.. code:: ipython3 + + %%time + import malaya + + +.. parsed-literal:: + + CPU times: user 13 s, sys: 1.5 s, total: 14.5 s + Wall time: 18.3 s + + +List available deep learning Dependency models +---------------------------------------------- + +.. code:: ipython3 + + malaya.dependency.available_deep_model() + + + + +.. parsed-literal:: + + ['concat', 'bahdanau', 'luong'] + + + +Describe supported dependencies +------------------------------- + +.. code:: ipython3 + + malaya.describe_dependency() + + +.. parsed-literal:: + + acl - clausal modifier of noun + advcl - adverbial clause modifier + advmod - adverbial modifier + amod - adjectival modifier + appos - appositional modifier + aux - auxiliary + case - case marking + ccomp - clausal complement + compound - compound + compound:plur - plural compound + conj - conjunct + cop - cop + csubj - clausal subject + dep - dependent + det - determiner + fixed - multi-word expression + flat - name + iobj - indirect object + mark - marker + nmod - nominal modifier + nsubj - nominal subject + obj - direct object + parataxis - parataxis + root - root + xcomp - open clausal complement + you can read more from https://universaldependencies.org/en/dep/xcomp.html + + +.. code:: ipython3 + + string = 'Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar sekiranya mengantuk ketika memandu.' + +Load CRF model +-------------- + +.. code:: ipython3 + + crf = malaya.dependency.crf() + tagging, indexing = crf.predict(string) + tagging, indexing + + + + +.. 
parsed-literal:: + + ([('Dr', 'det'), + ('Mahathir', 'nsubj'), + ('menasihati', 'root'), + ('mereka', 'obj'), + ('supaya', 'mark'), + ('berhenti', 'advcl'), + ('berehat', 'xcomp'), + ('dan', 'cc'), + ('tidur', 'conj'), + ('sebentar', 'case'), + ('sekiranya', 'nmod'), + ('mengantuk', 'acl'), + ('ketika', 'mark'), + ('memandu', 'advcl')], + [('Dr', 2), + ('Mahathir', 3), + ('menasihati', 0), + ('mereka', 4), + ('supaya', 9), + ('berhenti', 9), + ('berehat', 9), + ('dan', 9), + ('tidur', 7), + ('sebentar', 7), + ('sekiranya', 7), + ('mengantuk', 1), + ('ketika', 3), + ('memandu', 3)]) + + + +Visualize graph for dependency output +------------------------------------- + +**Make sure you have already installed graphviz.** + +.. code:: ipython3 + + graph = malaya.dependency.dependency_graph(tagging, indexing) + graph.to_graphvis() + + + + +.. image:: load-dependency_files/load-dependency_10_0.svg + + + +Print important features from CRF model +--------------------------------------- + +.. code:: ipython3 + + crf.print_features(10) + + +.. parsed-literal:: + + Top-10 tagging positive: + 8.072296 det word:berbagai + 7.858845 det word:para + 7.857109 det word:tersebut + 7.465632 advmod word:memang + 6.809172 nummod is_numeric + 6.232288 amod word:menakjubkan + 6.188577 advmod word:terutama + 6.067059 case word:selama + 5.723111 advmod word:lagi + 5.675961 case word:tentang + + Top-10 tagging negative: + -2.672044 nsubj next_word:memang + -2.690972 root prefix-3:sal + -2.708229 punct prev_word-prefix-1:9 + -2.710053 obl suffix-3:ena + -2.711398 conj suffix-3:aat + -2.758406 flat prefix-2:ya + -2.848409 nsubj next_word:berisi + -3.400050 compound:plur suffix-2:ya + -3.619957 case next_word:pernyataan + -5.017675 flat is_first + + +Print important tagging transitions from CRF Model +-------------------------------------------------- + +.. code:: ipython3 + + crf.print_transitions_tag(10) + + +..
parsed-literal:: + + Top-10 likely tagging transitions: + case -> obl 5.106777 + case -> nmod 4.338968 + cc -> conj 3.375610 + flat -> flat 3.347966 + case -> xcomp 2.899393 + appos -> flat 2.632795 + mark -> advcl 2.373561 + nmod -> flat 2.247949 + conj -> flat 2.239869 + nummod -> obl 2.214665 + + Top-10 unlikely tagging transitions: + root -> conj -2.243008 + xcomp -> parataxis -2.250619 + case -> appos -2.273873 + case -> obj -2.506688 + case -> flat -2.524687 + root -> parataxis -2.581892 + mark -> flat -2.664428 + cop -> obj -3.155705 + case -> fixed -3.301385 + root -> root -4.324076 + + +Print important indexing transitions from CRF Model +--------------------------------------------------- + +.. code:: ipython3 + + crf.print_transitions_index(10) + + +.. parsed-literal:: + + Top-10 likely indexing transitions: + 78 -> 78 5.050351 + 1 -> 1 5.044279 + 137 -> 137 5.014911 + 90 -> 90 4.912735 + 63 -> 63 4.724542 + 95 -> 95 4.692040 + 107 -> 108 4.620310 + 92 -> 93 4.605423 + 94 -> 98 4.568649 + 96 -> 99 4.556339 + + Top-10 unlikely indexing transitions: + 0 -> 43 -2.899807 + 0 -> 44 -2.904968 + 45 -> 3 -3.004463 + 33 -> 1 -3.115820 + 0 -> 33 -3.147339 + 3 -> 38 -3.170745 + 0 -> 40 -3.220509 + 0 -> 37 -3.272783 + 0 -> 38 -3.425021 + 0 -> 39 -3.439639 + + +Load deep learning models +------------------------- + +.. code:: ipython3 + + for i in malaya.dependency.available_deep_model(): + print('Testing %s model'%(i)) + model = malaya.dependency.deep_model(i) + print(model.predict(string)) + print() + + +.. 
parsed-literal:: + + Testing concat model + ([('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'root'), ('mereka', 'obj'), ('supaya', 'mark'), ('berhenti', 'advcl'), ('berehat', 'obj'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'obj'), ('mengantuk', 'advcl'), ('ketika', 'mark'), ('memandu', 'advcl')], [('Dr', 3), ('Mahathir', 1), ('menasihati', 0), ('mereka', 3), ('supaya', 4), ('berhenti', 4), ('berehat', 6), ('dan', 9), ('tidur', 7), ('sebentar', 11), ('sekiranya', 9), ('mengantuk', 9), ('ketika', 13), ('memandu', 12)]) + + Testing bahdanau model + ([('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'root'), ('mereka', 'det'), ('supaya', 'mark'), ('berhenti', 'advcl'), ('berehat', 'compound'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'nsubj'), ('mengantuk', 'advcl'), ('ketika', 'mark'), ('memandu', 'advcl')], [('Dr', 3), ('Mahathir', 1), ('menasihati', 0), ('mereka', 3), ('supaya', 6), ('berhenti', 9), ('berehat', 6), ('dan', 9), ('tidur', 6), ('sebentar', 9), ('sekiranya', 12), ('mengantuk', 9), ('ketika', 13), ('memandu', 3)]) + + Testing luong model + ([('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'amod'), ('mereka', 'det'), ('supaya', 'mark'), ('berhenti', 'advcl'), ('berehat', 'obj'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'nsubj'), ('mengantuk', 'compound'), ('ketika', 'mark'), ('memandu', 'advcl')], [('Dr', 3), ('Mahathir', 1), ('menasihati', 1), ('mereka', 3), ('supaya', 3), ('berhenti', 3), ('berehat', 6), ('dan', 10), ('tidur', 7), ('sebentar', 10), ('sekiranya', 10), ('mengantuk', 11), ('ketika', 11), ('memandu', 12)]) + + + +Print important features from deep learning model +------------------------------------------------- + +.. code:: ipython3 + + bahdanau = malaya.dependency.deep_model('bahdanau') + bahdanau.print_features(10) + + +.. 
parsed-literal:: + + Top-10 positive: + Balaikota: 6.001306 + jemaatnya: 5.659410 + esai: 5.420834 + menyulitkan: 5.298349 + Khairun: 5.271856 + Scandal: 5.135361 + penolakan: 5.070021 + gundiknya: 5.057362 + gagasan: 4.977351 + Banyuputih: 4.972396 + + Top-10 negative: + Carolina: -5.638381 + kontestan: -5.565759 + Dibalik: -5.185034 + Rotten: -5.032556 + 1982: -4.824227 + ditempatkan: -4.771740 + Player: -4.723217 + Nuh: -4.664867 + rating: -4.659817 + tello: -4.614172 + + +.. code:: ipython3 + + tagging, indexing = bahdanau.predict(string) + malaya.dependency.dependency_graph(tagging, indexing).to_graphvis() + + + + +.. image:: load-dependency_files/load-dependency_21_0.svg + + + +Print important tagging transitions from deep learning model +------------------------------------------------------------ + +.. code:: ipython3 + + bahdanau.print_transitions_tag(10) + + +.. parsed-literal:: + + Top-10 likely transitions: + cc -> conj: 1.361513 + mark -> advcl: 1.160463 + compound:plur -> amod: 1.159281 + mark -> fixed: 0.990238 + obj -> compound: 0.971839 + flat -> flat: 0.927018 + case -> obl: 0.926517 + cop -> det: 0.902245 + nsubj -> dep: 0.844691 + nsubj:pass -> dep: 0.837701 + + Top-10 unlikely transitions: + case -> obj: -2.866276 + root -> root: -2.830104 + case -> parataxis: -2.372282 + nsubj:pass -> nsubj:pass: -2.307715 + punct -> csubj: -2.298815 + compound:plur -> fixed: -2.215350 + parataxis -> advcl: -2.196172 + nsubj:pass -> compound:plur: -2.159937 + mark -> xcomp: -2.143510 + csubj -> advmod: -2.140114 + + +Print important indexing transitions from deep learning model +------------------------------------------------------------- + +.. code:: ipython3 + + bahdanau.print_transitions_index(10) + + +.. 
parsed-literal:: + + Top-10 likely transitions: + 107 -> 108: 1.033257 + 94 -> 95: 1.014054 + 126 -> 127: 1.012626 + 62 -> 63: 1.006339 + 108 -> 109: 0.991839 + 34 -> 32: 0.978045 + 93 -> 94: 0.942446 + 125 -> 126: 0.913999 + 52 -> 53: 0.873350 + 100 -> 103: 0.849339 + + Top-10 unlikely transitions: + 46 -> 45: -3.602909 + 50 -> 44: -3.443869 + 46 -> 39: -3.094924 + 63 -> 62: -3.004683 + 50 -> 58: -2.873691 + 44 -> 32: -2.860855 + 35 -> 13: -2.854243 + 50 -> 40: -2.849881 + 45 -> 32: -2.844934 + 64 -> 63: -2.841505 + + +Voting stack model +------------------ + +.. code:: ipython3 + + concat = malaya.dependency.deep_model('concat') + bahdanau = malaya.dependency.deep_model('bahdanau') + luong = malaya.dependency.deep_model('luong') + tagging, indexing = malaya.stack.voting_stack([concat, bahdanau, luong], string) + malaya.dependency.dependency_graph(tagging, indexing).to_graphvis() + + + + +.. image:: load-dependency_files/load-dependency_27_0.svg + + diff --git a/example/dependency/load-dependency.ipynb b/example/dependency/load-dependency.ipynb new file mode 100644 index 00000000..e071b362 --- /dev/null +++ b/example/dependency/load-dependency.ipynb @@ -0,0 +1,1156 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 13 s, sys: 1.5 s, total: 14.5 s\n", + "Wall time: 18.3 s\n" + ] + } + ], + "source": [ + "%%time\n", + "import malaya" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List available deep learning Dependency models" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['concat', 'bahdanau', 'luong']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.dependency.available_deep_model()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## Describe supported dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "acl - clausal modifier of noun\n", + "advcl - adverbial clause modifier\n", + "advmod - adverbial modifier\n", + "amod - adjectival modifier\n", + "appos - appositional modifier\n", + "aux - auxiliary\n", + "case - case marking\n", + "ccomp - clausal complement\n", + "compound - compound\n", + "compound:plur - plural compound\n", + "conj - conjunct\n", + "cop - cop\n", + "csubj - clausal subject\n", + "dep - dependent\n", + "det - determiner\n", + "fixed - multi-word expression\n", + "flat - name\n", + "iobj - indirect object\n", + "mark - marker\n", + "nmod - nominal modifier\n", + "nsubj - nominal subject\n", + "obj - direct object\n", + "parataxis - parataxis\n", + "root - root\n", + "xcomp - open clausal complement\n", + "you can read more from https://universaldependencies.org/en/dep/xcomp.html\n" + ] + } + ], + "source": [ + "malaya.describe_dependency()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "string = 'Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar sekiranya mengantuk ketika memandu.'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load CRF model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "([('Dr', 'det'),\n", + " ('Mahathir', 'nsubj'),\n", + " ('menasihati', 'root'),\n", + " ('mereka', 'obj'),\n", + " ('supaya', 'mark'),\n", + " ('berhenti', 'advcl'),\n", + " ('berehat', 'xcomp'),\n", + " ('dan', 'cc'),\n", + " ('tidur', 'conj'),\n", + " ('sebentar', 'case'),\n", + " ('sekiranya', 'nmod'),\n", + " ('mengantuk', 'acl'),\n", + " ('ketika', 'mark'),\n", + " ('memandu', 'advcl')],\n", + " [('Dr', 2),\n", + " ('Mahathir', 3),\n", + " 
('menasihati', 0),\n", + " ('mereka', 4),\n", + " ('supaya', 9),\n", + " ('berhenti', 9),\n", + " ('berehat', 9),\n", + " ('dan', 9),\n", + " ('tidur', 7),\n", + " ('sebentar', 7),\n", + " ('sekiranya', 7),\n", + " ('mengantuk', 1),\n", + " ('ketika', 3),\n", + " ('memandu', 3)])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crf = malaya.dependency.crf()\n", + "tagging, indexing = crf.predict(string)\n", + "tagging, indexing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize graph for dependency output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Make sure you have already installed graphviz.**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "3\n", + "3 (menasihati)\n", + "\n", + "\n", + "\n", + "0->3\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Mahathir)\n", + "\n", + "\n", + "\n", + "3->2\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "13\n", + "13 (ketika)\n", + "\n", + "\n", + "\n", + "3->13\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "14\n", + "14 (memandu)\n", + "\n", + "\n", + "\n", + "3->14\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Dr)\n", + "\n", + "\n", + "\n", + "12\n", + "12 (mengantuk)\n", + "\n", + "\n", + "\n", + "1->12\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "2->1\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "4\n", + "4 (mereka)\n", + "\n", + "\n", + "\n", + "4->4\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "5\n", + "5 (supaya)\n", + "\n", + "\n", + "\n", + "6\n", + "6 (berhenti)\n", + "\n", + "\n", + "\n", + "7\n", + "7 (berehat)\n", + "\n", + "\n", + "\n", + "9\n", + "9 (tidur)\n", + 
"\n", + "\n", + "\n", + "7->9\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "10\n", + "10 (sebentar)\n", + "\n", + "\n", + "\n", + "7->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "11\n", + "11 (sekiranya)\n", + "\n", + "\n", + "\n", + "7->11\n", + "\n", + "\n", + "nmod\n", + "\n", + "\n", + "\n", + "9->5\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "9->6\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "9->7\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "8\n", + "8 (dan)\n", + "\n", + "\n", + "\n", + "9->8\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph = malaya.dependency.dependency_graph(tagging, indexing)\n", + "graph.to_graphvis()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print important features from CRF model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 tagging positive:\n", + "8.072296 det word:berbagai\n", + "7.858845 det word:para\n", + "7.857109 det word:tersebut\n", + "7.465632 advmod word:memang\n", + "6.809172 nummod is_numeric\n", + "6.232288 amod word:menakjubkan\n", + "6.188577 advmod word:terutama\n", + "6.067059 case word:selama\n", + "5.723111 advmod word:lagi\n", + "5.675961 case word:tentang\n", + "\n", + "Top-10 tagging negative:\n", + "-2.672044 nsubj next_word:memang\n", + "-2.690972 root prefix-3:sal\n", + "-2.708229 punct prev_word-prefix-1:9\n", + "-2.710053 obl suffix-3:ena\n", + "-2.711398 conj suffix-3:aat\n", + "-2.758406 flat prefix-2:ya\n", + "-2.848409 nsubj next_word:berisi\n", + "-3.400050 compound:plur suffix-2:ya\n", + "-3.619957 case next_word:pernyataan\n", + "-5.017675 flat is_first\n" + ] + } + ], + "source": [ + "crf.print_features(10)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print important tagging transitions from CRF Model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 likely tagging transitions:\n", + "case -> obl 5.106777\n", + "case -> nmod 4.338968\n", + "cc -> conj 3.375610\n", + "flat -> flat 3.347966\n", + "case -> xcomp 2.899393\n", + "appos -> flat 2.632795\n", + "mark -> advcl 2.373561\n", + "nmod -> flat 2.247949\n", + "conj -> flat 2.239869\n", + "nummod -> obl 2.214665\n", + "\n", + "Top-10 unlikely tagging transitions:\n", + "root -> conj -2.243008\n", + "xcomp -> parataxis -2.250619\n", + "case -> appos -2.273873\n", + "case -> obj -2.506688\n", + "case -> flat -2.524687\n", + "root -> parataxis -2.581892\n", + "mark -> flat -2.664428\n", + "cop -> obj -3.155705\n", + "case -> fixed -3.301385\n", + "root -> root -4.324076\n" + ] + } + ], + "source": [ + "crf.print_transitions_tag(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print important indexing transitions from CRF Model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 likely indexing transitions:\n", + "78 -> 78 5.050351\n", + "1 -> 1 5.044279\n", + "137 -> 137 5.014911\n", + "90 -> 90 4.912735\n", + "63 -> 63 4.724542\n", + "95 -> 95 4.692040\n", + "107 -> 108 4.620310\n", + "92 -> 93 4.605423\n", + "94 -> 98 4.568649\n", + "96 -> 99 4.556339\n", + "\n", + "Top-10 unlikely indexing transitions:\n", + "0 -> 43 -2.899807\n", + "0 -> 44 -2.904968\n", + "45 -> 3 -3.004463\n", + "33 -> 1 -3.115820\n", + "0 -> 33 -3.147339\n", + "3 -> 38 -3.170745\n", + "0 -> 40 -3.220509\n", + "0 -> 37 -3.272783\n", + "0 -> 38 -3.425021\n", + "0 -> 39 -3.439639\n" + ] + } + ], + "source": [ + "crf.print_transitions_index(10)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load deep learning models" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing concat model\n", + "([('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'root'), ('mereka', 'obj'), ('supaya', 'mark'), ('berhenti', 'advcl'), ('berehat', 'obj'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'obj'), ('mengantuk', 'advcl'), ('ketika', 'mark'), ('memandu', 'advcl')], [('Dr', 3), ('Mahathir', 1), ('menasihati', 0), ('mereka', 3), ('supaya', 4), ('berhenti', 4), ('berehat', 6), ('dan', 9), ('tidur', 7), ('sebentar', 11), ('sekiranya', 9), ('mengantuk', 9), ('ketika', 13), ('memandu', 12)])\n", + "\n", + "Testing bahdanau model\n", + "([('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'root'), ('mereka', 'det'), ('supaya', 'mark'), ('berhenti', 'advcl'), ('berehat', 'compound'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'nsubj'), ('mengantuk', 'advcl'), ('ketika', 'mark'), ('memandu', 'advcl')], [('Dr', 3), ('Mahathir', 1), ('menasihati', 0), ('mereka', 3), ('supaya', 6), ('berhenti', 9), ('berehat', 6), ('dan', 9), ('tidur', 6), ('sebentar', 9), ('sekiranya', 12), ('mengantuk', 9), ('ketika', 13), ('memandu', 3)])\n", + "\n", + "Testing luong model\n", + "([('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'amod'), ('mereka', 'det'), ('supaya', 'mark'), ('berhenti', 'advcl'), ('berehat', 'obj'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'nsubj'), ('mengantuk', 'compound'), ('ketika', 'mark'), ('memandu', 'advcl')], [('Dr', 3), ('Mahathir', 1), ('menasihati', 1), ('mereka', 3), ('supaya', 3), ('berhenti', 3), ('berehat', 6), ('dan', 10), ('tidur', 7), ('sebentar', 10), ('sekiranya', 10), ('mengantuk', 11), ('ketika', 11), ('memandu', 12)])\n", + "\n" + ] + } + ], + "source": [ + "for i in 
malaya.dependency.available_deep_model():\n", + " print('Testing %s model'%(i))\n", + " model = malaya.dependency.deep_model(i)\n", + " print(model.predict(string))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print important features from deep learning model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 positive:\n", + "Balaikota: 6.001306\n", + "jemaatnya: 5.659410\n", + "esai: 5.420834\n", + "menyulitkan: 5.298349\n", + "Khairun: 5.271856\n", + "Scandal: 5.135361\n", + "penolakan: 5.070021\n", + "gundiknya: 5.057362\n", + "gagasan: 4.977351\n", + "Banyuputih: 4.972396\n", + "\n", + "Top-10 negative:\n", + "Carolina: -5.638381\n", + "kontestan: -5.565759\n", + "Dibalik: -5.185034\n", + "Rotten: -5.032556\n", + "1982: -4.824227\n", + "ditempatkan: -4.771740\n", + "Player: -4.723217\n", + "Nuh: -4.664867\n", + "rating: -4.659817\n", + "tello: -4.614172\n" + ] + } + ], + "source": [ + "bahdanau = malaya.dependency.deep_model('bahdanau')\n", + "bahdanau.print_features(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "3\n", + "3 (menasihati)\n", + "\n", + "\n", + "\n", + "0->3\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Dr)\n", + "\n", + "\n", + "\n", + "3->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "4\n", + "4 (mereka)\n", + "\n", + "\n", + "\n", + "3->4\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "14\n", + "14 (memandu)\n", + "\n", + "\n", + "\n", + "3->14\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Mahathir)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + 
"\n", + "5\n", + "5 (supaya)\n", + "\n", + "\n", + "\n", + "6\n", + "6 (berhenti)\n", + "\n", + "\n", + "\n", + "6->5\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "7\n", + "7 (berehat)\n", + "\n", + "\n", + "\n", + "6->7\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "9\n", + "9 (tidur)\n", + "\n", + "\n", + "\n", + "6->9\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "8\n", + "8 (dan)\n", + "\n", + "\n", + "\n", + "9->8\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "10\n", + "10 (sebentar)\n", + "\n", + "\n", + "\n", + "10->6\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "12\n", + "12 (mengantuk)\n", + "\n", + "\n", + "\n", + "10->12\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "11\n", + "11 (sekiranya)\n", + "\n", + "\n", + "\n", + "12->11\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "11->10\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "13\n", + "13 (ketika)\n", + "\n", + "\n", + "\n", + "13->13\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tagging, indexing = bahdanau.predict(string)\n", + "malaya.dependency.dependency_graph(tagging, indexing).to_graphvis()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print important tagging transitions from deep learning model" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 likely transitions:\n", + "cc -> conj: 1.361513\n", + "mark -> advcl: 1.160463\n", + "compound:plur -> amod: 1.159281\n", + "mark -> fixed: 0.990238\n", + "obj -> compound: 0.971839\n", + "flat -> flat: 0.927018\n", + "case -> obl: 0.926517\n", + "cop -> det: 0.902245\n", + "nsubj -> dep: 0.844691\n", + "nsubj:pass -> dep: 0.837701\n", + "\n", + "Top-10 unlikely 
transitions:\n", + "case -> obj: -2.866276\n", + "root -> root: -2.830104\n", + "case -> parataxis: -2.372282\n", + "nsubj:pass -> nsubj:pass: -2.307715\n", + "punct -> csubj: -2.298815\n", + "compound:plur -> fixed: -2.215350\n", + "parataxis -> advcl: -2.196172\n", + "nsubj:pass -> compound:plur: -2.159937\n", + "mark -> xcomp: -2.143510\n", + "csubj -> advmod: -2.140114\n" + ] + } + ], + "source": [ + "bahdanau.print_transitions_tag(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print important indexing transitions from deep learning model" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 likely transitions:\n", + "107 -> 108: 1.033257\n", + "94 -> 95: 1.014054\n", + "126 -> 127: 1.012626\n", + "62 -> 63: 1.006339\n", + "108 -> 109: 0.991839\n", + "34 -> 32: 0.978045\n", + "93 -> 94: 0.942446\n", + "125 -> 126: 0.913999\n", + "52 -> 53: 0.873350\n", + "100 -> 103: 0.849339\n", + "\n", + "Top-10 unlikely transitions:\n", + "46 -> 45: -3.602909\n", + "50 -> 44: -3.443869\n", + "46 -> 39: -3.094924\n", + "63 -> 62: -3.004683\n", + "50 -> 58: -2.873691\n", + "44 -> 32: -2.860855\n", + "35 -> 13: -2.854243\n", + "50 -> 40: -2.849881\n", + "45 -> 32: -2.844934\n", + "64 -> 63: -2.841505\n" + ] + } + ], + "source": [ + "bahdanau.print_transitions_index(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Voting stack model" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "3\n", + "3 (menasihati)\n", + "\n", + "\n", + "\n", + "0->3\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Dr)\n", + "\n", + "\n", + "\n", + "3->1\n", + "\n", + "\n", + "nsubj\n", + 
"\n", + "\n", + "\n", + "4\n", + "4 (mereka)\n", + "\n", + "\n", + "\n", + "3->4\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "6\n", + "6 (berhenti)\n", + "\n", + "\n", + "\n", + "3->6\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Mahathir)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "5\n", + "5 (supaya)\n", + "\n", + "\n", + "\n", + "6->5\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "7\n", + "7 (berehat)\n", + "\n", + "\n", + "\n", + "6->7\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "9\n", + "9 (tidur)\n", + "\n", + "\n", + "\n", + "7->9\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "8\n", + "8 (dan)\n", + "\n", + "\n", + "\n", + "9->8\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "11\n", + "11 (sekiranya)\n", + "\n", + "\n", + "\n", + "9->11\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "12\n", + "12 (mengantuk)\n", + "\n", + "\n", + "\n", + "9->12\n", + "\n", + "\n", + "ccomp\n", + "\n", + "\n", + "\n", + "10\n", + "10 (sebentar)\n", + "\n", + "\n", + "\n", + "11->10\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "14\n", + "14 (memandu)\n", + "\n", + "\n", + "\n", + "12->14\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "13\n", + "13 (ketika)\n", + "\n", + "\n", + "\n", + "13->13\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "concat = malaya.dependency.deep_model('concat')\n", + "bahdanau = malaya.dependency.deep_model('bahdanau')\n", + "luong = malaya.dependency.deep_model('luong')\n", + "tagging, indexing = malaya.stack.voting_stack([concat, bahdanau, luong], string)\n", + "malaya.dependency.dependency_graph(tagging, indexing).to_graphvis()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example/dependency/load-dependency_files/load-dependency_10_0.svg b/example/dependency/load-dependency_files/load-dependency_10_0.svg new file mode 100644 index 00000000..1e7cc10c --- /dev/null +++ b/example/dependency/load-dependency_files/load-dependency_10_0.svg @@ -0,0 +1,186 @@ + + + + + + +G + + + +0 +0 (None) + + + +3 +3 (menasihati) + + + +0->3 + + +root + + + +2 +2 (Mahathir) + + + +3->2 + + +nsubj + + + +13 +13 (ketika) + + + +3->13 + + +mark + + + +14 +14 (memandu) + + + +3->14 + + +advcl + + + +1 +1 (Dr) + + + +12 +12 (mengantuk) + + + +1->12 + + +acl + + + +2->1 + + +det + + + +4 +4 (mereka) + + + +4->4 + + +obj + + + +5 +5 (supaya) + + + +6 +6 (berhenti) + + + +7 +7 (berehat) + + + +9 +9 (tidur) + + + +7->9 + + +conj + + + +10 +10 (sebentar) + + + +7->10 + + +case + + + +11 +11 (sekiranya) + + + +7->11 + + +nmod + + + +9->5 + + +mark + + + +9->6 + + +advcl + + + +9->7 + + +xcomp + + + +8 +8 (dan) + + + +9->8 + + +cc + + + diff --git a/example/dependency/load-dependency_files/load-dependency_21_0.svg b/example/dependency/load-dependency_files/load-dependency_21_0.svg new file mode 100644 index 00000000..7492e6ee --- /dev/null +++ b/example/dependency/load-dependency_files/load-dependency_21_0.svg @@ -0,0 +1,186 @@ + + + + + + +G + + + +0 +0 (None) + + + +3 +3 (menasihati) + + + +0->3 + + +root + + + +1 +1 (Dr) + + + +3->1 + + +nsubj + + + +4 +4 (mereka) + + + +3->4 + + +det + + + +14 +14 (memandu) + + + +3->14 + + +advcl + + + +2 +2 (Mahathir) + + + +1->2 + + +flat + + + +5 +5 (supaya) + + + +6 +6 (berhenti) + + 
+ +6->5 + + +mark + + + +7 +7 (berehat) + + + +6->7 + + +amod + + + +9 +9 (tidur) + + + +6->9 + + +conj + + + +8 +8 (dan) + + + +9->8 + + +cc + + + +10 +10 (sebentar) + + + +10->6 + + +advcl + + + +12 +12 (mengantuk) + + + +10->12 + + +advcl + + + +11 +11 (sekiranya) + + + +12->11 + + +nsubj + + + +11->10 + + +advmod + + + +13 +13 (ketika) + + + +13->13 + + +mark + + + diff --git a/example/dependency/load-dependency_files/load-dependency_27_0.svg b/example/dependency/load-dependency_files/load-dependency_27_0.svg new file mode 100644 index 00000000..79487421 --- /dev/null +++ b/example/dependency/load-dependency_files/load-dependency_27_0.svg @@ -0,0 +1,186 @@ + + + + + + +G + + + +0 +0 (None) + + + +3 +3 (menasihati) + + + +0->3 + + +root + + + +1 +1 (Dr) + + + +3->1 + + +nsubj + + + +4 +4 (mereka) + + + +3->4 + + +det + + + +6 +6 (berhenti) + + + +3->6 + + +advcl + + + +2 +2 (Mahathir) + + + +1->2 + + +flat + + + +5 +5 (supaya) + + + +6->5 + + +mark + + + +7 +7 (berehat) + + + +6->7 + + +obj + + + +9 +9 (tidur) + + + +7->9 + + +conj + + + +8 +8 (dan) + + + +9->8 + + +cc + + + +11 +11 (sekiranya) + + + +9->11 + + +nsubj + + + +12 +12 (mengantuk) + + + +9->12 + + +ccomp + + + +10 +10 (sebentar) + + + +11->10 + + +advmod + + + +14 +14 (memandu) + + + +12->14 + + +advcl + + + +13 +13 (ketika) + + + +13->13 + + +mark + + + diff --git a/example/entities/README.rst b/example/entities/README.rst index 7a6c5382..738b62e9 100644 --- a/example/entities/README.rst +++ b/example/entities/README.rst @@ -7,8 +7,8 @@ .. parsed-literal:: - CPU times: user 12.3 s, sys: 837 ms, total: 13.1 s - Wall time: 14.1 s + CPU times: user 12.8 s, sys: 1.5 s, total: 14.3 s + Wall time: 18.8 s List available deep learning NER models @@ -125,7 +125,7 @@ Load CRF model Print important features from CRF model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +--------------------------------------- .. 
code:: ipython3 @@ -160,7 +160,7 @@ Print important features from CRF model Print important transitions from CRF Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------------------------ .. code:: ipython3 @@ -209,20 +209,91 @@ Load deep learning models .. parsed-literal:: Testing concat model - [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'organization'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'location'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 
'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'location'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'OTHER'), ('keselamatan', 'OTHER'), ('jalan', 'location'), ('raya', 'organization'), ('jkjr', 'event'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] Testing bahdanau model - [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'person'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'organization'), ('kepada', 'OTHER'), ('orang', 'organization'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'organization'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 
'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'organization'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] Testing luong model - [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'person'), ('pengangkutan', 'person'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 
'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'person'), ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'person'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'person'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'OTHER'), ('keselamatan', 'OTHER'), ('jalan', 'law'), ('raya', 'law'), ('jkjr', 'law'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] Testing entity-network model - [('kuala', 'location'), ('lumpur', 
'location'), ('sempena', 'OTHER'), ('sambutan', 'OTHER'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'OTHER'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'time'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'organization'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'OTHER'), ('depan', 'OTHER'), ('perdana', 'OTHER'), ('menteri', 'OTHER'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), 
('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'organization'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] Testing attention model - [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'OTHER'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'person'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'location'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + [('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'event'), ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 'OTHER'), ('tun', 
'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'person'), ('ramai', 'person'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'location'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'OTHER'), ('keselamatan', 'OTHER'), ('jalan', 'location'), ('raya', 'location'), ('jkjr', 'location'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')] + + + +Print important features from deep learning model +------------------------------------------------- + +.. code:: ipython3 + + bahdanau = malaya.entity.deep_model('bahdanau') + bahdanau.print_features(10) + + +.. parsed-literal:: + + Top-10 positive: + ton: 4.945406 + dollar: 4.345774 + disertai: 4.257772 + menjejaskan: 4.252921 + sesaat: 4.082481 + mata: 4.060701 + abul: 4.024586 + ruang: 3.983563 + orator: 3.899390 + universal: 3.866645 + + Top-10 negative: + abah: -4.691194 + raju: -4.370757 + dipengerusikan: -4.142600 + rs: -4.013050 + bacaan: -4.001595 + indonesia-malaysia: -3.921156 + nacp: -3.749232 + memprediksi: -3.659459 + ruhut: -3.620089 + pengesahan: -3.618848 + + +Print important transitions from deep learning model +---------------------------------------------------- + +.. code:: ipython3 + + bahdanau.print_transitions(10) + + +.. 
parsed-literal:: + + Top-10 likely transitions: + event -> event: 0.810878 + OTHER -> OTHER: 0.626205 + PAD -> OTHER: 0.519626 + PAD -> event: 0.512354 + law -> law: 0.460971 + person -> law: 0.448240 + person -> event: 0.407665 + location -> event: 0.402317 + organization -> PAD: 0.402057 + time -> person: 0.342275 + Top-10 unlikely transitions: + person -> organization: -0.914907 + law -> event: -0.843547 + event -> law: -0.829639 + organization -> person: -0.810431 + time -> quantity: -0.783691 + person -> location: -0.712586 + quantity -> law: -0.663559 + law -> time: -0.656724 + quantity -> time: -0.640747 + organization -> quantity: -0.615018 Voting stack model @@ -245,8 +316,8 @@ Voting stack model ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), - ('minggu', 'time'), - ('depan', 'time'), + ('minggu', 'OTHER'), + ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), @@ -271,7 +342,7 @@ Voting stack model ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), - ('halaman', 'location'), + ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), @@ -281,7 +352,7 @@ Voting stack model ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), - ('jkjr', 'organization'), + ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), diff --git a/example/entities/load-entities.ipynb b/example/entities/load-entities.ipynb index b5951e3e..351cad27 100644 --- a/example/entities/load-entities.ipynb +++ b/example/entities/load-entities.ipynb @@ -9,8 +9,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 12.3 s, sys: 837 ms, total: 13.1 s\n", - "Wall time: 14.1 s\n" + "CPU times: user 12.8 s, sys: 1.5 s, total: 14.3 s\n", + "Wall time: 18.8 s\n" ] } ], @@ -174,7 +174,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Print important features from CRF model" + "## Print important features from CRF 
model" ] }, { @@ -220,7 +220,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Print important transitions from CRF Model" + "## Print important transitions from CRF Model" ] }, { @@ -279,19 +279,19 @@ "output_type": "stream", "text": [ "Testing concat model\n", - "[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'organization'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'location'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", + "[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 
'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'location'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'OTHER'), ('keselamatan', 'OTHER'), ('jalan', 'location'), ('raya', 'organization'), ('jkjr', 'event'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", "\n", "Testing bahdanau model\n", - "[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'person'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'organization'), ('kepada', 'OTHER'), ('orang', 'organization'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'organization'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 
'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", + "[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'organization'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", "\n", "Testing luong model\n", - "[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'OTHER'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'person'), ('pengangkutan', 'person'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 
'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'person'), ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'person'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", + "[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'person'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'OTHER'), ('keselamatan', 'OTHER'), ('jalan', 'law'), ('raya', 'law'), ('jkjr', 'law'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", "\n", "Testing entity-network model\n", - 
"[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'OTHER'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'OTHER'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'time'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'organization'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", + "[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'OTHER'), ('depan', 'OTHER'), ('perdana', 'OTHER'), ('menteri', 'OTHER'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'OTHER'), ('halaman', 'OTHER'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), 
('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'organization'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", "\n", "Testing attention model\n", - "[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'OTHER'), ('aidilfitri', 'event'), ('minggu', 'time'), ('depan', 'time'), ('perdana', 'person'), ('menteri', 'person'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'organization'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'person'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'OTHER'), ('ramai', 'OTHER'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'location'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'organization'), ('keselamatan', 'organization'), ('jalan', 'organization'), ('raya', 'organization'), ('jkjr', 'person'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", + "[('kuala', 'location'), ('lumpur', 'location'), ('sempena', 'OTHER'), ('sambutan', 'event'), ('aidilfitri', 'event'), ('minggu', 'event'), ('depan', 
'OTHER'), ('perdana', 'person'), ('menteri', 'OTHER'), ('tun', 'person'), ('dr', 'person'), ('mahathir', 'person'), ('mohamad', 'person'), ('dan', 'OTHER'), ('menteri', 'OTHER'), ('pengangkutan', 'OTHER'), ('anthony', 'person'), ('loke', 'person'), ('siew', 'person'), ('fook', 'person'), ('menitipkan', 'OTHER'), ('pesanan', 'OTHER'), ('khas', 'OTHER'), ('kepada', 'OTHER'), ('orang', 'person'), ('ramai', 'person'), ('yang', 'OTHER'), ('mahu', 'OTHER'), ('pulang', 'OTHER'), ('ke', 'OTHER'), ('kampung', 'location'), ('halaman', 'location'), ('masing-masing', 'OTHER'), ('dalam', 'OTHER'), ('video', 'OTHER'), ('pendek', 'OTHER'), ('terbitan', 'OTHER'), ('jabatan', 'OTHER'), ('keselamatan', 'OTHER'), ('jalan', 'location'), ('raya', 'location'), ('jkjr', 'location'), ('itu', 'OTHER'), ('dr', 'person'), ('mahathir', 'person'), ('menasihati', 'OTHER'), ('mereka', 'OTHER'), ('supaya', 'OTHER'), ('berhenti', 'OTHER'), ('berehat', 'OTHER'), ('dan', 'OTHER'), ('tidur', 'OTHER'), ('sebentar', 'OTHER'), ('sekiranya', 'OTHER'), ('mengantuk', 'OTHER'), ('ketika', 'OTHER'), ('memandu', 'OTHER')]\n", "\n" ] } @@ -308,13 +308,106 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Voting stack model" + "## Print important features from deep learning model" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 positive:\n", + "ton: 4.945406\n", + "dollar: 4.345774\n", + "disertai: 4.257772\n", + "menjejaskan: 4.252921\n", + "sesaat: 4.082481\n", + "mata: 4.060701\n", + "abul: 4.024586\n", + "ruang: 3.983563\n", + "orator: 3.899390\n", + "universal: 3.866645\n", + "\n", + "Top-10 negative:\n", + "abah: -4.691194\n", + "raju: -4.370757\n", + "dipengerusikan: -4.142600\n", + "rs: -4.013050\n", + "bacaan: -4.001595\n", + "indonesia-malaysia: -3.921156\n", + "nacp: -3.749232\n", + "memprediksi: -3.659459\n", + "ruhut: -3.620089\n", + "pengesahan: -3.618848\n" + ] + } + ], + 
"source": [ + "bahdanau = malaya.entity.deep_model('bahdanau')\n", + "bahdanau.print_features(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print important transitions from deep learning model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 likely transitions:\n", + "event -> event: 0.810878\n", + "OTHER -> OTHER: 0.626205\n", + "PAD -> OTHER: 0.519626\n", + "PAD -> event: 0.512354\n", + "law -> law: 0.460971\n", + "person -> law: 0.448240\n", + "person -> event: 0.407665\n", + "location -> event: 0.402317\n", + "organization -> PAD: 0.402057\n", + "time -> person: 0.342275\n", + "\n", + "Top-10 unlikely transitions:\n", + "person -> organization: -0.914907\n", + "law -> event: -0.843547\n", + "event -> law: -0.829639\n", + "organization -> person: -0.810431\n", + "time -> quantity: -0.783691\n", + "person -> location: -0.712586\n", + "quantity -> law: -0.663559\n", + "law -> time: -0.656724\n", + "quantity -> time: -0.640747\n", + "organization -> quantity: -0.615018\n" + ] + } + ], + "source": [ + "bahdanau.print_transitions(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Voting stack model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, "outputs": [ { "data": { @@ -324,8 +417,8 @@ " ('sempena', 'OTHER'),\n", " ('sambutan', 'event'),\n", " ('aidilfitri', 'event'),\n", - " ('minggu', 'time'),\n", - " ('depan', 'time'),\n", + " ('minggu', 'OTHER'),\n", + " ('depan', 'OTHER'),\n", " ('perdana', 'person'),\n", " ('menteri', 'person'),\n", " ('tun', 'person'),\n", @@ -350,7 +443,7 @@ " ('pulang', 'OTHER'),\n", " ('ke', 'OTHER'),\n", " ('kampung', 'OTHER'),\n", - " ('halaman', 'location'),\n", + " ('halaman', 'OTHER'),\n", " ('masing-masing', 'OTHER'),\n", " ('dalam', 'OTHER'),\n", " ('video', 'OTHER'),\n", @@ -360,7 +453,7 @@ " ('keselamatan', 
'organization'),\n", " ('jalan', 'organization'),\n", " ('raya', 'organization'),\n", - " ('jkjr', 'organization'),\n", + " ('jkjr', 'person'),\n", " ('itu', 'OTHER'),\n", " ('dr', 'person'),\n", " ('mahathir', 'person'),\n", @@ -378,7 +471,7 @@ " ('memandu', 'OTHER')]" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } diff --git a/example/part-of-speech/README.rst b/example/part-of-speech/README.rst index 67d0239e..6d9830db 100644 --- a/example/part-of-speech/README.rst +++ b/example/part-of-speech/README.rst @@ -1,8 +1,16 @@ .. code:: ipython3 + %%time import malaya + +.. parsed-literal:: + + CPU times: user 14.5 s, sys: 1.57 s, total: 16.1 s + Wall time: 21.3 s + + List available deep learning POS models --------------------------------------- @@ -66,26 +74,26 @@ Load CRF Model .. parsed-literal:: - [('kuala', 'PROPN'), - ('lumpur', 'PROPN'), - ('sempena', 'PROPN'), + [('Kuala', 'PROPN'), + ('Lumpur', 'PROPN'), + ('Sempena', 'SCONJ'), ('sambutan', 'NOUN'), - ('aidilfitri', 'NOUN'), + ('Aidilfitri', 'PROPN'), ('minggu', 'NOUN'), - ('depan', 'ADJ'), - ('perdana', 'PROPN'), - ('menteri', 'PROPN'), - ('tun', 'PROPN'), - ('dr', 'PROPN'), - ('mahathir', 'PROPN'), - ('mohamad', 'PROPN'), + ('depan', 'ADP'), + ('Perdana', 'PROPN'), + ('Menteri', 'PROPN'), + ('Tun', 'PROPN'), + ('Dr', 'PROPN'), + ('Mahathir', 'PROPN'), + ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), - ('menteri', 'VERB'), - ('pengangkutan', 'PROPN'), - ('anthony', 'PROPN'), - ('loke', 'PROPN'), - ('siew', 'PROPN'), - ('fook', 'PROPN'), + ('Menteri', 'PROPN'), + ('Pengangkutan', 'PROPN'), + ('Anthony', 'PROPN'), + ('Loke', 'PROPN'), + ('Siew', 'PROPN'), + ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), @@ -99,18 +107,18 @@ Load CRF Model ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), - ('dalam', 'ADP'), + ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), - ('jabatan', 'NOUN'), 
- ('keselamatan', 'PROPN'), - ('jalan', 'PROPN'), - ('raya', 'PROPN'), - ('jkjr', 'PROPN'), + ('Jabatan', 'PROPN'), + ('Keselamatan', 'PROPN'), + ('Jalan', 'PROPN'), + ('Raya', 'PROPN'), + ('Jkjr', 'PROPN'), ('itu', 'DET'), - ('dr', 'PROPN'), - ('mahathir', 'PROPN'), + ('Dr', 'PROPN'), + ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), @@ -121,7 +129,7 @@ Load CRF Model ('sebentar', 'ADP'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), - ('ketika', 'SCONJ'), + ('ketika', 'ADV'), ('memandu', 'VERB')] @@ -137,28 +145,28 @@ Print important features CRF model .. parsed-literal:: Top-10 positive: - 16.307872 DET word:tersebut - 15.868179 DET word:para - 15.590679 VERB word:percaya - 15.520492 ADP word:dari - 15.296975 DET word:berbagai - 14.691924 ADJ word:menakjubkan - 14.609917 ADJ word:menyejukkan - 14.503045 PRON word:kapan - 14.319357 DET word:ini - 14.267956 ADV word:pernah + 16.443463 DET word:para + 15.494273 DET word:berbagai + 14.856205 DET word:tersebut + 14.426293 ADJ word:menakjubkan + 14.319714 ADV word:memang + 14.158206 ADP word:tentang + 13.907366 VERB word:percaya + 13.635634 VERB word:integrasi + 13.630582 ADP word:dengan + 13.562358 ADV word:menurutnya Top-10 negative: - -7.217718 PROPN word:bunga - -7.258999 VERB word:memuaskan - -7.498110 ADP prev_word:pernah - -7.523901 ADV next_word-suffix-3:nai - -7.874955 NOUN prev_word-prefix-3:arw - -7.921689 NOUN suffix-2:ke - -8.049832 ADJ prev_word:sunda - -8.210202 PROPN prefix-3:ora - -8.524420 NUM prev_word:perang - -10.346546 CCONJ prev_word-suffix-3:rja + -6.663068 PROPN prefix-2:be + -6.714450 ADV next_word:menyatakan + -6.862083 PROPN next_word:Jepang + -7.183600 PROPN suffix-3:pun + -7.264241 ADV next_word-suffix-3:nai + -7.676069 VERB word:memuaskan + -7.961231 ADP prev_word:pernah + -8.006671 NOUN suffix-2:ke + -8.135974 ADP prev_word-prefix-3:pal + -8.173493 PROPN suffix-3:nya Print important transitions CRF model @@ -172,28 +180,28 @@ Print important 
transitions CRF model .. parsed-literal:: Top-10 likely transitions: - PROPN -> PROPN 5.529614 - DET -> DET 4.492123 - NOUN -> NOUN 2.600533 - ADJ -> ADJ 2.276762 - CCONJ -> CCONJ 1.888801 - CCONJ -> SCONJ 1.855106 - NOUN -> ADJ 1.729610 - SCONJ -> CCONJ 1.598273 - NUM -> NUM 1.475505 - ADV -> VERB 1.442607 + PROPN -> PROPN 5.767666 + NOUN -> NOUN 4.291842 + DET -> DET 3.723729 + NOUN -> PROPN 3.035784 + CCONJ -> CCONJ 2.545162 + X -> X 2.476296 + ADP -> NOUN 2.324735 + ADJ -> ADJ 2.285807 + NOUN -> ADJ 2.258407 + ADP -> PROPN 2.181474 Top-10 unlikely transitions: - SCONJ -> AUX -3.559017 - X -> SCONJ -3.566058 - SYM -> ADJ -3.720358 - PART -> ADP -3.744172 - X -> CCONJ -4.270577 - PART -> PART -4.543812 - ADV -> X -4.809254 - ADP -> SCONJ -5.157816 - ADP -> CCONJ -5.455725 - ADP -> SYM -6.841944 + SCONJ -> AUX -3.341014 + PART -> NUM -3.406289 + SCONJ -> ADJ -3.447362 + SYM -> ADV -3.468094 + SYM -> ADJ -3.597291 + AUX -> NUM -3.657861 + PART -> PART -4.059430 + X -> CCONJ -4.929272 + ADP -> SCONJ -4.960199 + ADP -> CCONJ -6.236844 Load deep learning models @@ -211,22 +219,93 @@ Load deep learning models .. 
parsed-literal:: Testing concat model - [('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'NOUN'), ('aidilfitri', 'PROPN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('perdana', 'ADJ'), ('menteri', 'NOUN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'NOUN'), ('pengangkutan', 'PROPN'), ('anthony', 'NOUN'), ('loke', 'NOUN'), ('siew', 'PROPN'), ('fook', 'PROPN'), ('menitipkan', 'PROPN'), ('pesanan', 'ADV'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'NOUN'), ('jalan', 'PROPN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('menasihati', 'NOUN'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'PROPN'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'ADJ'), ('sekiranya', 'NOUN'), ('mengantuk', 'PROPN'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] + [('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'NOUN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'NOUN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'NUM'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 
'NOUN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'NOUN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] Testing bahdanau model - [('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'NOUN'), ('aidilfitri', 'PROPN'), ('minggu', 'PROPN'), ('depan', 'ADJ'), ('perdana', 'ADJ'), ('menteri', 'NOUN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'PROPN'), ('pengangkutan', 'PROPN'), ('anthony', 'PROPN'), ('loke', 'PROPN'), ('siew', 'PROPN'), ('fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'ADV'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'PROPN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'NOUN'), ('jalan', 'PROPN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'PROPN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'PROPN'), ('mengantuk', 'PROPN'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] + [('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'PROPN'), ('minggu', 'VERB'), ('depan', 'CCONJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), 
('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 'NOUN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'NOUN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] Testing luong model - [('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'PROPN'), ('aidilfitri', 'PROPN'), ('minggu', 'PROPN'), ('depan', 'ADJ'), ('perdana', 'PROPN'), ('menteri', 'PROPN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'NOUN'), ('pengangkutan', 'NOUN'), ('anthony', 'PROPN'), ('loke', 'PROPN'), ('siew', 'PROPN'), ('fook', 'PROPN'), ('menitipkan', 'PROPN'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'NOUN'), ('jalan', 'NOUN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('menasihati', 'PROPN'), ('mereka', 'PRON'), ('supaya', 'ADV'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'NOUN'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] + [('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 
'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'PROPN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'VERB'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'PROPN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'PROPN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] Testing entity-network model - [('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'PROPN'), ('aidilfitri', 'PROPN'), ('minggu', 'PROPN'), ('depan', 'PROPN'), ('perdana', 'PROPN'), ('menteri', 'PROPN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'PROPN'), ('pengangkutan', 'NOUN'), ('anthony', 'NOUN'), ('loke', 'NOUN'), ('siew', 'VERB'), ('fook', 'NOUN'), ('menitipkan', 'NOUN'), ('pesanan', 'VERB'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'PROPN'), ('jalan', 'PROPN'), ('raya', 'PROPN'), ('jkjr', 
'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'VERB'), ('menasihati', 'PROPN'), ('mereka', 'PRON'), ('supaya', 'ADV'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'ADJ'), ('ketika', 'SCONJ'), ('memandu', 'VERB')] + [('Kuala', 'NUM'), ('Lumpur', 'NUM'), ('Sempena', 'NUM'), ('sambutan', 'NUM'), ('Aidilfitri', 'SYM'), ('minggu', 'NOUN'), ('depan', 'NOUN'), ('Perdana', 'NUM'), ('Menteri', 'NUM'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'NUM'), ('dan', 'CCONJ'), ('Menteri', 'NUM'), ('Pengangkutan', 'NUM'), ('Anthony', 'NUM'), ('Loke', 'NUM'), ('Siew', 'NUM'), ('Fook', 'NUM'), ('menitipkan', 'NUM'), ('pesanan', 'SYM'), ('khas', 'PROPN'), ('kepada', 'PROPN'), ('orang', 'PROPN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'VERB'), ('pulang', 'PROPN'), ('ke', 'PROPN'), ('kampung', 'VERB'), ('halaman', 'NUM'), ('masing-masing', 'PROPN'), ('Dalam', 'PROPN'), ('video', 'PROPN'), ('pendek', 'PROPN'), ('terbitan', 'NUM'), ('Jabatan', 'NOUN'), ('Keselamatan', 'NUM'), ('Jalan', 'NUM'), ('Raya', 'NUM'), ('Jkjr', 'NUM'), ('itu', 'NUM'), ('Dr', 'NUM'), ('Mahathir', 'NUM'), ('menasihati', 'NUM'), ('mereka', 'NOUN'), ('supaya', 'NOUN'), ('berhenti', 'ADJ'), ('berehat', 'ADJ'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'NOUN'), ('sekiranya', 'PROPN'), ('mengantuk', 'PROPN'), ('ketika', 'PROPN'), ('memandu', 'PROPN')] Testing attention model - [('kuala', 'X'), ('lumpur', 'DET'), ('sempena', 'X'), ('sambutan', 'DET'), ('aidilfitri', 'X'), ('minggu', 'DET'), ('depan', 'X'), ('perdana', 'DET'), ('menteri', 'X'), ('tun', 'DET'), ('dr', 'X'), ('mahathir', 'DET'), ('mohamad', 'X'), ('dan', 'DET'), ('menteri', 'X'), ('pengangkutan', 'DET'), ('anthony', 'X'), ('loke', 'DET'), ('siew', 'X'), ('fook', 'DET'), ('menitipkan', 'X'), ('pesanan', 'DET'), ('khas', 'X'), ('kepada', 'DET'), ('orang', 'X'), ('ramai', 'DET'), ('yang', 'X'), ('mahu', 'DET'), 
('pulang', 'X'), ('ke', 'DET'), ('kampung', 'X'), ('halaman', 'DET'), ('masing-masing', 'X'), ('dalam', 'DET'), ('video', 'X'), ('pendek', 'DET'), ('terbitan', 'X'), ('jabatan', 'DET'), ('keselamatan', 'X'), ('jalan', 'DET'), ('raya', 'X'), ('jkjr', 'DET'), ('itu', 'X'), ('dr', 'DET'), ('mahathir', 'X'), ('menasihati', 'DET'), ('mereka', 'X'), ('supaya', 'DET'), ('berhenti', 'X'), ('berehat', 'DET'), ('dan', 'X'), ('tidur', 'DET'), ('sebentar', 'X'), ('sekiranya', 'DET'), ('mengantuk', 'X'), ('ketika', 'DET'), ('memandu', 'VERB')] + [('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'PROPN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'VERB'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'PROPN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'CCONJ'), ('memandu', 'VERB')] +Print important features from deep learning model +------------------------------------------------- + +.. code:: ipython3 + + bahdanau = malaya.pos.deep_model('bahdanau') + bahdanau.print_features(10) + + +.. 
parsed-literal:: + + Top-10 positive: + 1971: 4.942553 + Puisi: 4.754801 + 27: 4.659504 + buahan: 4.551769 + kaisarnya: 4.503439 + Kedua: 4.459490 + Times: 4.378673 + perlengkapan: 4.342615 + kelautan: 4.273527 + Persija: 4.260429 + + Top-10 negative: + Sakova: -5.102705 + engkau: -5.000618 + Cin: -4.962496 + bermesin: -4.823804 + Husm: -4.719638 + saatnya: -4.693280 + Vireta: -4.615777 + menjamu: -4.589007 + Aff: -4.437630 + dilahirkan: -4.422080 + + +Print important transitions from deep learning model +---------------------------------------------------- + +.. code:: ipython3 + + bahdanau.print_transitions(10) + + +.. parsed-literal:: + + Top-10 likely transitions: + SCONJ -> CCONJ: 0.688627 + SCONJ -> PRON: 0.539603 + ADV -> NUM: 0.517046 + PROPN -> PART: 0.479875 + ADP -> DET: 0.470052 + AUX -> ADV: 0.424240 + PRON -> NUM: 0.420834 + PAD -> AUX: 0.415958 + NUM -> ADV: 0.401860 + PART -> SYM: 0.395167 + + Top-10 unlikely transitions: + ADP -> CCONJ: -0.791846 + DET -> X: -0.675577 + SCONJ -> SCONJ: -0.665004 + VERB -> VERB: -0.646812 + PART -> NUM: -0.644018 + CCONJ -> CCONJ: -0.590792 + AUX -> NUM: -0.579523 + ADV -> SCONJ: -0.569171 + NUM -> VERB: -0.568291 + PRON -> SYM: -0.563159 + + Voting stack model ------------------ @@ -242,27 +321,27 @@ Voting stack model .. 
parsed-literal:: - [('kuala', 'PROPN'), - ('lumpur', 'PROPN'), - ('sempena', 'PROPN'), + [('Kuala', 'PROPN'), + ('Lumpur', 'PROPN'), + ('Sempena', 'NUM'), ('sambutan', 'NOUN'), - ('aidilfitri', 'PROPN'), - ('minggu', 'PROPN'), - ('depan', 'ADJ'), - ('perdana', 'PROPN'), - ('menteri', 'PROPN'), - ('tun', 'PROPN'), - ('dr', 'PROPN'), - ('mahathir', 'PROPN'), - ('mohamad', 'PROPN'), + ('Aidilfitri', 'PROPN'), + ('minggu', 'NOUN'), + ('depan', 'ADP'), + ('Perdana', 'PROPN'), + ('Menteri', 'PROPN'), + ('Tun', 'PROPN'), + ('Dr', 'PROPN'), + ('Mahathir', 'PROPN'), + ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), - ('menteri', 'NOUN'), - ('pengangkutan', 'PROPN'), - ('anthony', 'PROPN'), - ('loke', 'PROPN'), - ('siew', 'PROPN'), - ('fook', 'NOUN'), - ('menitipkan', 'PROPN'), + ('Menteri', 'PROPN'), + ('Pengangkutan', 'PROPN'), + ('Anthony', 'PROPN'), + ('Loke', 'PROPN'), + ('Siew', 'PROPN'), + ('Fook', 'PROPN'), + ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), @@ -275,18 +354,18 @@ Voting stack model ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), - ('dalam', 'ADP'), + ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), - ('jabatan', 'NOUN'), - ('keselamatan', 'PROPN'), - ('jalan', 'PROPN'), - ('raya', 'PROPN'), - ('jkjr', 'PROPN'), + ('Jabatan', 'NOUN'), + ('Keselamatan', 'NUM'), + ('Jalan', 'PROPN'), + ('Raya', 'PROPN'), + ('Jkjr', 'NUM'), ('itu', 'DET'), - ('dr', 'PROPN'), - ('mahathir', 'PROPN'), + ('Dr', 'PROPN'), + ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), @@ -294,10 +373,10 @@ Voting stack model ('berehat', 'VERB'), ('dan', 'CCONJ'), ('tidur', 'VERB'), - ('sebentar', 'ADV'), + ('sebentar', 'ADP'), ('sekiranya', 'NOUN'), - ('mengantuk', 'PROPN'), - ('ketika', 'SCONJ'), + ('mengantuk', 'NUM'), + ('ketika', 'NUM'), ('memandu', 'VERB')] diff --git a/example/part-of-speech/load-pos.ipynb b/example/part-of-speech/load-pos.ipynb index de956c48..bae48516 
100644 --- a/example/part-of-speech/load-pos.ipynb +++ b/example/part-of-speech/load-pos.ipynb @@ -4,8 +4,18 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 14.5 s, sys: 1.57 s, total: 16.1 s\n", + "Wall time: 21.3 s\n" + ] + } + ], "source": [ + "%%time\n", "import malaya" ] }, @@ -107,26 +117,26 @@ { "data": { "text/plain": [ - "[('kuala', 'PROPN'),\n", - " ('lumpur', 'PROPN'),\n", - " ('sempena', 'PROPN'),\n", + "[('Kuala', 'PROPN'),\n", + " ('Lumpur', 'PROPN'),\n", + " ('Sempena', 'SCONJ'),\n", " ('sambutan', 'NOUN'),\n", - " ('aidilfitri', 'NOUN'),\n", + " ('Aidilfitri', 'PROPN'),\n", " ('minggu', 'NOUN'),\n", - " ('depan', 'ADJ'),\n", - " ('perdana', 'PROPN'),\n", - " ('menteri', 'PROPN'),\n", - " ('tun', 'PROPN'),\n", - " ('dr', 'PROPN'),\n", - " ('mahathir', 'PROPN'),\n", - " ('mohamad', 'PROPN'),\n", + " ('depan', 'ADP'),\n", + " ('Perdana', 'PROPN'),\n", + " ('Menteri', 'PROPN'),\n", + " ('Tun', 'PROPN'),\n", + " ('Dr', 'PROPN'),\n", + " ('Mahathir', 'PROPN'),\n", + " ('Mohamad', 'PROPN'),\n", " ('dan', 'CCONJ'),\n", - " ('menteri', 'VERB'),\n", - " ('pengangkutan', 'PROPN'),\n", - " ('anthony', 'PROPN'),\n", - " ('loke', 'PROPN'),\n", - " ('siew', 'PROPN'),\n", - " ('fook', 'PROPN'),\n", + " ('Menteri', 'PROPN'),\n", + " ('Pengangkutan', 'PROPN'),\n", + " ('Anthony', 'PROPN'),\n", + " ('Loke', 'PROPN'),\n", + " ('Siew', 'PROPN'),\n", + " ('Fook', 'PROPN'),\n", " ('menitipkan', 'VERB'),\n", " ('pesanan', 'NOUN'),\n", " ('khas', 'ADJ'),\n", @@ -140,18 +150,18 @@ " ('kampung', 'NOUN'),\n", " ('halaman', 'NOUN'),\n", " ('masing-masing', 'NOUN'),\n", - " ('dalam', 'ADP'),\n", + " ('Dalam', 'ADP'),\n", " ('video', 'NOUN'),\n", " ('pendek', 'ADJ'),\n", " ('terbitan', 'NOUN'),\n", - " ('jabatan', 'NOUN'),\n", - " ('keselamatan', 'PROPN'),\n", - " ('jalan', 'PROPN'),\n", - " ('raya', 'PROPN'),\n", - " ('jkjr', 'PROPN'),\n", + " 
('Jabatan', 'PROPN'),\n", + " ('Keselamatan', 'PROPN'),\n", + " ('Jalan', 'PROPN'),\n", + " ('Raya', 'PROPN'),\n", + " ('Jkjr', 'PROPN'),\n", " ('itu', 'DET'),\n", - " ('dr', 'PROPN'),\n", - " ('mahathir', 'PROPN'),\n", + " ('Dr', 'PROPN'),\n", + " ('Mahathir', 'PROPN'),\n", " ('menasihati', 'VERB'),\n", " ('mereka', 'PRON'),\n", " ('supaya', 'SCONJ'),\n", @@ -162,7 +172,7 @@ " ('sebentar', 'ADP'),\n", " ('sekiranya', 'NOUN'),\n", " ('mengantuk', 'VERB'),\n", - " ('ketika', 'SCONJ'),\n", + " ('ketika', 'ADV'),\n", " ('memandu', 'VERB')]" ] }, @@ -192,28 +202,28 @@ "output_type": "stream", "text": [ "Top-10 positive:\n", - "16.307872 DET word:tersebut\n", - "15.868179 DET word:para\n", - "15.590679 VERB word:percaya\n", - "15.520492 ADP word:dari\n", - "15.296975 DET word:berbagai\n", - "14.691924 ADJ word:menakjubkan\n", - "14.609917 ADJ word:menyejukkan\n", - "14.503045 PRON word:kapan\n", - "14.319357 DET word:ini\n", - "14.267956 ADV word:pernah\n", + "16.443463 DET word:para\n", + "15.494273 DET word:berbagai\n", + "14.856205 DET word:tersebut\n", + "14.426293 ADJ word:menakjubkan\n", + "14.319714 ADV word:memang\n", + "14.158206 ADP word:tentang\n", + "13.907366 VERB word:percaya\n", + "13.635634 VERB word:integrasi\n", + "13.630582 ADP word:dengan\n", + "13.562358 ADV word:menurutnya\n", "\n", "Top-10 negative:\n", - "-7.217718 PROPN word:bunga\n", - "-7.258999 VERB word:memuaskan\n", - "-7.498110 ADP prev_word:pernah\n", - "-7.523901 ADV next_word-suffix-3:nai\n", - "-7.874955 NOUN prev_word-prefix-3:arw\n", - "-7.921689 NOUN suffix-2:ke\n", - "-8.049832 ADJ prev_word:sunda\n", - "-8.210202 PROPN prefix-3:ora\n", - "-8.524420 NUM prev_word:perang\n", - "-10.346546 CCONJ prev_word-suffix-3:rja\n" + "-6.663068 PROPN prefix-2:be\n", + "-6.714450 ADV next_word:menyatakan\n", + "-6.862083 PROPN next_word:Jepang\n", + "-7.183600 PROPN suffix-3:pun\n", + "-7.264241 ADV next_word-suffix-3:nai\n", + "-7.676069 VERB word:memuaskan\n", + "-7.961231 ADP 
prev_word:pernah\n", + "-8.006671 NOUN suffix-2:ke\n", + "-8.135974 ADP prev_word-prefix-3:pal\n", + "-8.173493 PROPN suffix-3:nya\n" ] } ], @@ -238,28 +248,28 @@ "output_type": "stream", "text": [ "Top-10 likely transitions:\n", - "PROPN -> PROPN 5.529614\n", - "DET -> DET 4.492123\n", - "NOUN -> NOUN 2.600533\n", - "ADJ -> ADJ 2.276762\n", - "CCONJ -> CCONJ 1.888801\n", - "CCONJ -> SCONJ 1.855106\n", - "NOUN -> ADJ 1.729610\n", - "SCONJ -> CCONJ 1.598273\n", - "NUM -> NUM 1.475505\n", - "ADV -> VERB 1.442607\n", + "PROPN -> PROPN 5.767666\n", + "NOUN -> NOUN 4.291842\n", + "DET -> DET 3.723729\n", + "NOUN -> PROPN 3.035784\n", + "CCONJ -> CCONJ 2.545162\n", + "X -> X 2.476296\n", + "ADP -> NOUN 2.324735\n", + "ADJ -> ADJ 2.285807\n", + "NOUN -> ADJ 2.258407\n", + "ADP -> PROPN 2.181474\n", "\n", "Top-10 unlikely transitions:\n", - "SCONJ -> AUX -3.559017\n", - "X -> SCONJ -3.566058\n", - "SYM -> ADJ -3.720358\n", - "PART -> ADP -3.744172\n", - "X -> CCONJ -4.270577\n", - "PART -> PART -4.543812\n", - "ADV -> X -4.809254\n", - "ADP -> SCONJ -5.157816\n", - "ADP -> CCONJ -5.455725\n", - "ADP -> SYM -6.841944\n" + "SCONJ -> AUX -3.341014\n", + "PART -> NUM -3.406289\n", + "SCONJ -> ADJ -3.447362\n", + "SYM -> ADV -3.468094\n", + "SYM -> ADJ -3.597291\n", + "AUX -> NUM -3.657861\n", + "PART -> PART -4.059430\n", + "X -> CCONJ -4.929272\n", + "ADP -> SCONJ -4.960199\n", + "ADP -> CCONJ -6.236844\n" ] } ], @@ -284,19 +294,19 @@ "output_type": "stream", "text": [ "Testing concat model\n", - "[('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'NOUN'), ('aidilfitri', 'PROPN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('perdana', 'ADJ'), ('menteri', 'NOUN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'NOUN'), ('pengangkutan', 'PROPN'), ('anthony', 'NOUN'), ('loke', 'NOUN'), ('siew', 'PROPN'), ('fook', 'PROPN'), ('menitipkan', 'PROPN'), ('pesanan', 'ADV'), ('khas', 'ADJ'), ('kepada', 
'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'NOUN'), ('jalan', 'PROPN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('menasihati', 'NOUN'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'PROPN'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'ADJ'), ('sekiranya', 'NOUN'), ('mengantuk', 'PROPN'), ('ketika', 'SCONJ'), ('memandu', 'VERB')]\n", + "[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'NOUN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'NOUN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'NUM'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 'NOUN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'NOUN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'SCONJ'), ('memandu', 'VERB')]\n", "\n", "Testing bahdanau model\n", - "[('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'NOUN'), ('aidilfitri', 
'PROPN'), ('minggu', 'PROPN'), ('depan', 'ADJ'), ('perdana', 'ADJ'), ('menteri', 'NOUN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'PROPN'), ('pengangkutan', 'PROPN'), ('anthony', 'PROPN'), ('loke', 'PROPN'), ('siew', 'PROPN'), ('fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'ADV'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'PROPN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'NOUN'), ('jalan', 'PROPN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'PROPN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'PROPN'), ('mengantuk', 'PROPN'), ('ketika', 'SCONJ'), ('memandu', 'VERB')]\n", + "[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'PROPN'), ('minggu', 'VERB'), ('depan', 'CCONJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 'NOUN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'NOUN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), 
('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'SCONJ'), ('memandu', 'VERB')]\n", "\n", "Testing luong model\n", - "[('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'PROPN'), ('aidilfitri', 'PROPN'), ('minggu', 'PROPN'), ('depan', 'ADJ'), ('perdana', 'PROPN'), ('menteri', 'PROPN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'NOUN'), ('pengangkutan', 'NOUN'), ('anthony', 'PROPN'), ('loke', 'PROPN'), ('siew', 'PROPN'), ('fook', 'PROPN'), ('menitipkan', 'PROPN'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'NOUN'), ('jalan', 'NOUN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('menasihati', 'PROPN'), ('mereka', 'PRON'), ('supaya', 'ADV'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'NOUN'), ('ketika', 'SCONJ'), ('memandu', 'VERB')]\n", + "[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'PROPN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 
'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'VERB'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'PROPN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'PROPN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'SCONJ'), ('memandu', 'VERB')]\n", "\n", "Testing entity-network model\n", - "[('kuala', 'PROPN'), ('lumpur', 'PROPN'), ('sempena', 'PROPN'), ('sambutan', 'PROPN'), ('aidilfitri', 'PROPN'), ('minggu', 'PROPN'), ('depan', 'PROPN'), ('perdana', 'PROPN'), ('menteri', 'PROPN'), ('tun', 'PROPN'), ('dr', 'PROPN'), ('mahathir', 'PROPN'), ('mohamad', 'PROPN'), ('dan', 'CCONJ'), ('menteri', 'PROPN'), ('pengangkutan', 'NOUN'), ('anthony', 'NOUN'), ('loke', 'NOUN'), ('siew', 'VERB'), ('fook', 'NOUN'), ('menitipkan', 'NOUN'), ('pesanan', 'VERB'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'NOUN'), ('dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('jabatan', 'NOUN'), ('keselamatan', 'PROPN'), ('jalan', 'PROPN'), ('raya', 'PROPN'), ('jkjr', 'PROPN'), ('itu', 'DET'), ('dr', 'PROPN'), ('mahathir', 'VERB'), ('menasihati', 'PROPN'), ('mereka', 'PRON'), ('supaya', 'ADV'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'ADJ'), ('ketika', 'SCONJ'), ('memandu', 'VERB')]\n", + "[('Kuala', 'NUM'), ('Lumpur', 'NUM'), ('Sempena', 'NUM'), ('sambutan', 'NUM'), ('Aidilfitri', 'SYM'), ('minggu', 'NOUN'), ('depan', 
'NOUN'), ('Perdana', 'NUM'), ('Menteri', 'NUM'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'NUM'), ('dan', 'CCONJ'), ('Menteri', 'NUM'), ('Pengangkutan', 'NUM'), ('Anthony', 'NUM'), ('Loke', 'NUM'), ('Siew', 'NUM'), ('Fook', 'NUM'), ('menitipkan', 'NUM'), ('pesanan', 'SYM'), ('khas', 'PROPN'), ('kepada', 'PROPN'), ('orang', 'PROPN'), ('ramai', 'NOUN'), ('yang', 'PRON'), ('mahu', 'VERB'), ('pulang', 'PROPN'), ('ke', 'PROPN'), ('kampung', 'VERB'), ('halaman', 'NUM'), ('masing-masing', 'PROPN'), ('Dalam', 'PROPN'), ('video', 'PROPN'), ('pendek', 'PROPN'), ('terbitan', 'NUM'), ('Jabatan', 'NOUN'), ('Keselamatan', 'NUM'), ('Jalan', 'NUM'), ('Raya', 'NUM'), ('Jkjr', 'NUM'), ('itu', 'NUM'), ('Dr', 'NUM'), ('Mahathir', 'NUM'), ('menasihati', 'NUM'), ('mereka', 'NOUN'), ('supaya', 'NOUN'), ('berhenti', 'ADJ'), ('berehat', 'ADJ'), ('dan', 'CCONJ'), ('tidur', 'NOUN'), ('sebentar', 'NOUN'), ('sekiranya', 'PROPN'), ('mengantuk', 'PROPN'), ('ketika', 'PROPN'), ('memandu', 'PROPN')]\n", "\n", "Testing attention model\n", - "[('kuala', 'X'), ('lumpur', 'DET'), ('sempena', 'X'), ('sambutan', 'DET'), ('aidilfitri', 'X'), ('minggu', 'DET'), ('depan', 'X'), ('perdana', 'DET'), ('menteri', 'X'), ('tun', 'DET'), ('dr', 'X'), ('mahathir', 'DET'), ('mohamad', 'X'), ('dan', 'DET'), ('menteri', 'X'), ('pengangkutan', 'DET'), ('anthony', 'X'), ('loke', 'DET'), ('siew', 'X'), ('fook', 'DET'), ('menitipkan', 'X'), ('pesanan', 'DET'), ('khas', 'X'), ('kepada', 'DET'), ('orang', 'X'), ('ramai', 'DET'), ('yang', 'X'), ('mahu', 'DET'), ('pulang', 'X'), ('ke', 'DET'), ('kampung', 'X'), ('halaman', 'DET'), ('masing-masing', 'X'), ('dalam', 'DET'), ('video', 'X'), ('pendek', 'DET'), ('terbitan', 'X'), ('jabatan', 'DET'), ('keselamatan', 'X'), ('jalan', 'DET'), ('raya', 'X'), ('jkjr', 'DET'), ('itu', 'X'), ('dr', 'DET'), ('mahathir', 'X'), ('menasihati', 'DET'), ('mereka', 'X'), ('supaya', 'DET'), ('berhenti', 'X'), ('berehat', 'DET'), ('dan', 'X'), ('tidur', 'DET'), 
('sebentar', 'X'), ('sekiranya', 'DET'), ('mengantuk', 'X'), ('ketika', 'DET'), ('memandu', 'VERB')]\n", + "[('Kuala', 'PROPN'), ('Lumpur', 'PROPN'), ('Sempena', 'PROPN'), ('sambutan', 'NOUN'), ('Aidilfitri', 'PROPN'), ('minggu', 'NOUN'), ('depan', 'ADJ'), ('Perdana', 'PROPN'), ('Menteri', 'PROPN'), ('Tun', 'PROPN'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('Mohamad', 'PROPN'), ('dan', 'CCONJ'), ('Menteri', 'PROPN'), ('Pengangkutan', 'PROPN'), ('Anthony', 'PROPN'), ('Loke', 'PROPN'), ('Siew', 'PROPN'), ('Fook', 'PROPN'), ('menitipkan', 'VERB'), ('pesanan', 'NOUN'), ('khas', 'ADJ'), ('kepada', 'ADP'), ('orang', 'NOUN'), ('ramai', 'ADJ'), ('yang', 'PRON'), ('mahu', 'ADV'), ('pulang', 'VERB'), ('ke', 'ADP'), ('kampung', 'NOUN'), ('halaman', 'NOUN'), ('masing-masing', 'VERB'), ('Dalam', 'ADP'), ('video', 'NOUN'), ('pendek', 'ADJ'), ('terbitan', 'NOUN'), ('Jabatan', 'NOUN'), ('Keselamatan', 'PROPN'), ('Jalan', 'PROPN'), ('Raya', 'PROPN'), ('Jkjr', 'PROPN'), ('itu', 'DET'), ('Dr', 'PROPN'), ('Mahathir', 'PROPN'), ('menasihati', 'VERB'), ('mereka', 'PRON'), ('supaya', 'SCONJ'), ('berhenti', 'VERB'), ('berehat', 'NOUN'), ('dan', 'CCONJ'), ('tidur', 'VERB'), ('sebentar', 'ADV'), ('sekiranya', 'NOUN'), ('mengantuk', 'VERB'), ('ketika', 'CCONJ'), ('memandu', 'VERB')]\n", "\n" ] } @@ -313,38 +323,131 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Voting stack model" + "## Print important features from deep learning model" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 positive:\n", + "1971: 4.942553\n", + "Puisi: 4.754801\n", + "27: 4.659504\n", + "buahan: 4.551769\n", + "kaisarnya: 4.503439\n", + "Kedua: 4.459490\n", + "Times: 4.378673\n", + "perlengkapan: 4.342615\n", + "kelautan: 4.273527\n", + "Persija: 4.260429\n", + "\n", + "Top-10 negative:\n", + "Sakova: -5.102705\n", + "engkau: -5.000618\n", + "Cin: -4.962496\n", + "bermesin: -4.823804\n", + 
"Husm: -4.719638\n", + "saatnya: -4.693280\n", + "Vireta: -4.615777\n", + "menjamu: -4.589007\n", + "Aff: -4.437630\n", + "dilahirkan: -4.422080\n" + ] + } + ], + "source": [ + "bahdanau = malaya.pos.deep_model('bahdanau')\n", + "bahdanau.print_features(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print important transitions from deep learning model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top-10 likely transitions:\n", + "SCONJ -> CCONJ: 0.688627\n", + "SCONJ -> PRON: 0.539603\n", + "ADV -> NUM: 0.517046\n", + "PROPN -> PART: 0.479875\n", + "ADP -> DET: 0.470052\n", + "AUX -> ADV: 0.424240\n", + "PRON -> NUM: 0.420834\n", + "PAD -> AUX: 0.415958\n", + "NUM -> ADV: 0.401860\n", + "PART -> SYM: 0.395167\n", + "\n", + "Top-10 unlikely transitions:\n", + "ADP -> CCONJ: -0.791846\n", + "DET -> X: -0.675577\n", + "SCONJ -> SCONJ: -0.665004\n", + "VERB -> VERB: -0.646812\n", + "PART -> NUM: -0.644018\n", + "CCONJ -> CCONJ: -0.590792\n", + "AUX -> NUM: -0.579523\n", + "ADV -> SCONJ: -0.569171\n", + "NUM -> VERB: -0.568291\n", + "PRON -> SYM: -0.563159\n" + ] + } + ], + "source": [ + "bahdanau.print_transitions(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Voting stack model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[('kuala', 'PROPN'),\n", - " ('lumpur', 'PROPN'),\n", - " ('sempena', 'PROPN'),\n", + "[('Kuala', 'PROPN'),\n", + " ('Lumpur', 'PROPN'),\n", + " ('Sempena', 'NUM'),\n", " ('sambutan', 'NOUN'),\n", - " ('aidilfitri', 'PROPN'),\n", - " ('minggu', 'PROPN'),\n", - " ('depan', 'ADJ'),\n", - " ('perdana', 'PROPN'),\n", - " ('menteri', 'PROPN'),\n", - " ('tun', 'PROPN'),\n", - " ('dr', 'PROPN'),\n", - " ('mahathir', 'PROPN'),\n", - " ('mohamad', 'PROPN'),\n", + " ('Aidilfitri', 'PROPN'),\n", + " 
('minggu', 'NOUN'),\n", + " ('depan', 'ADP'),\n", + " ('Perdana', 'PROPN'),\n", + " ('Menteri', 'PROPN'),\n", + " ('Tun', 'PROPN'),\n", + " ('Dr', 'PROPN'),\n", + " ('Mahathir', 'PROPN'),\n", + " ('Mohamad', 'PROPN'),\n", " ('dan', 'CCONJ'),\n", - " ('menteri', 'NOUN'),\n", - " ('pengangkutan', 'PROPN'),\n", - " ('anthony', 'PROPN'),\n", - " ('loke', 'PROPN'),\n", - " ('siew', 'PROPN'),\n", - " ('fook', 'NOUN'),\n", - " ('menitipkan', 'PROPN'),\n", + " ('Menteri', 'PROPN'),\n", + " ('Pengangkutan', 'PROPN'),\n", + " ('Anthony', 'PROPN'),\n", + " ('Loke', 'PROPN'),\n", + " ('Siew', 'PROPN'),\n", + " ('Fook', 'PROPN'),\n", + " ('menitipkan', 'VERB'),\n", " ('pesanan', 'NOUN'),\n", " ('khas', 'ADJ'),\n", " ('kepada', 'ADP'),\n", @@ -357,18 +460,18 @@ " ('kampung', 'NOUN'),\n", " ('halaman', 'NOUN'),\n", " ('masing-masing', 'NOUN'),\n", - " ('dalam', 'ADP'),\n", + " ('Dalam', 'ADP'),\n", " ('video', 'NOUN'),\n", " ('pendek', 'ADJ'),\n", " ('terbitan', 'NOUN'),\n", - " ('jabatan', 'NOUN'),\n", - " ('keselamatan', 'PROPN'),\n", - " ('jalan', 'PROPN'),\n", - " ('raya', 'PROPN'),\n", - " ('jkjr', 'PROPN'),\n", + " ('Jabatan', 'NOUN'),\n", + " ('Keselamatan', 'NUM'),\n", + " ('Jalan', 'PROPN'),\n", + " ('Raya', 'PROPN'),\n", + " ('Jkjr', 'NUM'),\n", " ('itu', 'DET'),\n", - " ('dr', 'PROPN'),\n", - " ('mahathir', 'PROPN'),\n", + " ('Dr', 'PROPN'),\n", + " ('Mahathir', 'PROPN'),\n", " ('menasihati', 'VERB'),\n", " ('mereka', 'PRON'),\n", " ('supaya', 'SCONJ'),\n", @@ -376,14 +479,14 @@ " ('berehat', 'VERB'),\n", " ('dan', 'CCONJ'),\n", " ('tidur', 'VERB'),\n", - " ('sebentar', 'ADV'),\n", + " ('sebentar', 'ADP'),\n", " ('sekiranya', 'NOUN'),\n", - " ('mengantuk', 'PROPN'),\n", - " ('ketika', 'SCONJ'),\n", + " ('mengantuk', 'NUM'),\n", + " ('ketika', 'NUM'),\n", " ('memandu', 'VERB')]" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } diff --git a/malaya/__init__.py b/malaya/__init__.py index 688e7c9a..d89bd175 100644 --- 
a/malaya/__init__.py +++ b/malaya/__init__.py @@ -225,7 +225,42 @@ def describe_entities(): print('event - unique event happened, etc') +def describe_dependency(): + """ + Describe Dependency supported. + """ + print('acl - clausal modifier of noun') + print('advcl - adverbial clause modifier') + print('advmod - adverbial modifier') + print('amod - adjectival modifier') + print('appos - appositional modifier') + print('aux - auxiliary') + print('case - case marking') + print('ccomp - clausal complement') + print('compound - compound') + print('compound:plur - plural compound') + print('conj - conjunct') + print('cop - cop') + print('csubj - clausal subject') + print('dep - dependent') + print('det - determiner') + print('fixed - multi-word expression') + print('flat - name') + print('iobj - indirect object') + print('mark - marker') + print('nmod - nominal modifier') + print('nsubj - nominal subject') + print('obj - direct object') + print('parataxis - parataxis') + print('root - root') + print('xcomp - open clausal complement') + print( + 'you can read more from https://universaldependencies.org/en/dep/xcomp.html' + ) + + from . import cluster +from . import dependency from . import emotion from . import entity from . 
import language_detection diff --git a/malaya/_models/_sklearn_model.py b/malaya/_models/_sklearn_model.py index 0a1645e6..f5ca2fbf 100644 --- a/malaya/_models/_sklearn_model.py +++ b/malaya/_models/_sklearn_model.py @@ -8,7 +8,7 @@ entities_textcleaning, language_detection_textcleaning, ) -from ..texts.vectorizer import features_crf +from ..texts.vectorizer import features_crf, features_crf_dependency def transitions(trans_features): @@ -84,6 +84,102 @@ def print_features(self, top_k = 10): ) +class DEPENDENCY: + def __init__(self, tag, depend): + self._tag = tag + self._depend = depend + + def predict(self, string): + """ + Tag a string + + Parameters + ---------- + string : str + + Returns + ------- + string: tagged string + """ + assert isinstance(string, str), 'input must be a string' + string = entities_textcleaning(string) + if len(string) > 120: + raise Exception( + 'Dependency parsing only able to accept string less than 120 words' + ) + batch_x = [features_crf(string, index) for index in range(len(string))] + tagging = self._tag.predict_single(batch_x) + batch_x = [ + features_crf_dependency(string, tagging, index) + for index in range(len(string)) + ] + depend = [int(i) for i in self._depend.predict_single(batch_x)] + for i in range(len(depend)): + if depend[i] == 0 and tagging[i] != 'root': + tagging[i] = 'UNK' + elif depend[i] != 0 and tagging[i] == 'root': + tagging[i] = 'UNK' + elif depend[i] > len(tagging): + depend[i] = len(tagging) + return ( + [(string[i], tagging[i]) for i in range(len(depend))], + [(string[i], depend[i]) for i in range(len(depend))], + ) + + def print_features(self, top_k = 10): + """ + Print important top-k features for tagging dependency + + Parameters + ---------- + top_k : int + """ + assert isinstance(top_k, int), 'input must be an integer' + print('Top-%d tagging positive:' % (top_k)) + state_features(Counter(self._tag.state_features_).most_common(top_k)) + + print('\nTop-%d tagging negative:' % (top_k)) + state_features( 
+ Counter(self._tag.state_features_).most_common()[-top_k:] + ) + + def print_transitions_tag(self, top_k = 10): + """ + Print important top-k transitions for tagging dependency + + Parameters + ---------- + top_k : int + """ + assert isinstance(top_k, int), 'input must be an integer' + print('Top-%d likely tagging transitions:' % (top_k)) + transitions(Counter(self._tag.transition_features_).most_common(top_k)) + + print('\nTop-%d unlikely tagging transitions:' % (top_k)) + transitions( + Counter(self._tag.transition_features_).most_common()[-top_k:] + ) + + def print_transitions_index(self, top_k = 10): + """ + Print important top-k transitions for indexing dependency + + Parameters + ---------- + top_k : int + """ + assert isinstance(top_k, int), 'input must be an integer' + print('Top-%d likely indexing transitions:' % (top_k)) + transitions( + Counter(self._depend.transition_features_).most_common(top_k) + ) + + print('\nTop-%d unlikely indexing transitions:' % (top_k)) + transitions( + Counter(self._depend.transition_features_).most_common()[-top_k:] + ) + + class USER_XGB: def __init__(self, xgb, label, vectorize, cleaning = simple_textcleaning): self.xgb = xgb diff --git a/malaya/_models/_tensorflow_model.py b/malaya/_models/_tensorflow_model.py index 16c27c32..0f16a51d 100644 --- a/malaya/_models/_tensorflow_model.py +++ b/malaya/_models/_tensorflow_model.py @@ -15,6 +15,7 @@ generate_char_seq, language_detection_textcleaning, ) +from .._utils._parse_dependency import DependencyGraph from ..stem import _classification_textcleaning_stemmer_attention @@ -54,6 +55,186 @@ def __init__(self, output_size, embedded_size, vocab_size): self.logits = tf.layers.dense(embed, output_size) +class DEPENDENCY: + def __init__( + self, + X, + X_char, + logits, + logits_depends, + settings, + sess, + model, + transitions, + transitions_depends, + features, + ): + self._X = X + self._X_char = X_char + self._logits = logits + self._logits_depends = logits_depends + 
self._settings = settings + self._sess = sess + self._model = model + self._settings['idx2tag'] = { + int(k): v for k, v in self._settings['idx2tag'].items() + } + self.transitions, self.transitions_depends, self.features = self._sess.run( + [transitions, transitions_depends, features] + ) + + def print_transitions_tag(self, top_k = 10): + """ + Print important top-k transitions for tagging dependency + + Parameters + ---------- + top_k : int + """ + assert isinstance(top_k, int), 'input must be an integer' + print('Top-%d likely transitions:' % (top_k)) + indices = np.argsort(self.transitions.flatten())[::-1] + top_trans = [ + np.unravel_index(i, self.transitions.shape) for i in indices[:top_k] + ] + for i in range(top_k): + print( + '%s -> %s: %f' + % ( + self._settings['idx2tag'][top_trans[i][0]], + self._settings['idx2tag'][top_trans[i][1]], + self.transitions[top_trans[i]], + ) + ) + + bottom_trans = [ + np.unravel_index(i, self.transitions.shape) + for i in indices[::-1][:top_k] + ] + print('\nTop-%d unlikely transitions:' % (top_k)) + for i in range(top_k): + print( + '%s -> %s: %f' + % ( + self._settings['idx2tag'][bottom_trans[i][0]], + self._settings['idx2tag'][bottom_trans[i][1]], + self.transitions[bottom_trans[i]], + ) + ) + + def print_transitions_index(self, top_k = 10): + """ + Print important top-k transitions for indexing dependency + + Parameters + ---------- + top_k : int + """ + assert isinstance(top_k, int), 'input must be an integer' + print('Top-%d likely transitions:' % (top_k)) + indices = np.argsort(self.transitions_depends.flatten())[::-1] + top_trans = [ + np.unravel_index(i, self.transitions_depends.shape) + for i in indices[:top_k] + ] + for i in range(top_k): + print( + '%d -> %d: %f' + % ( + top_trans[i][0], + top_trans[i][1], + self.transitions_depends[top_trans[i]], + ) + ) + + bottom_trans = [ + np.unravel_index(i, self.transitions_depends.shape) + for i in indices[::-1][:top_k] + ] + print('\nTop-%d unlikely transitions:' % 
(top_k)) + for i in range(top_k): + print( + '%d -> %d: %f' + % ( + bottom_trans[i][0], + bottom_trans[i][1], + self.transitions_depends[bottom_trans[i]], + ) + ) + + def print_features(self, top_k = 10): + """ + Print important top-k features + + Parameters + ---------- + top_k : int + """ + assert isinstance(top_k, int), 'input must be an integer' + _features = self.features.sum(axis = 1) + indices = np.argsort(_features)[::-1] + rev_indices = indices[::-1] + print('Top-%d positive:' % (top_k)) + for i in range(top_k): + print( + '%s: %f' + % ( + self._settings['idx2word'][str(indices[i])], + _features[indices[i]], + ) + ) + + print('\nTop-%d negative:' % (top_k)) + for i in range(top_k): + print( + '%s: %f' + % ( + self._settings['idx2word'][str(rev_indices[i])], + _features[rev_indices[i]], + ) + ) + + def predict(self, string): + """ + Tagging a string. + + Parameters + ---------- + string : str + + Returns + ------- + string: tagged string + """ + assert isinstance(string, str), 'input must be a string' + string = entities_textcleaning(string) + if len(string) > 120: + raise Exception( + 'Dependency parsing only able to accept string less than 120 words' + ) + batch_x = char_str_idx([string], self._settings['word2idx'], 2) + batch_x_char = generate_char_seq( + [string], self._settings['char2idx'], 2 + ) + tagging, depend = self._sess.run( + [self._logits, self._logits_depends], + feed_dict = {self._X: batch_x, self._X_char: batch_x_char}, + ) + tagging = [self._settings['idx2tag'][i] for i in tagging[0]] + depend = depend[0] - 1 + for i in range(len(depend)): + if depend[i] == 0 and tagging[i] != 'root': + tagging[i] = 'UNK' + elif depend[i] != 0 and tagging[i] == 'root': + tagging[i] = 'UNK' + elif depend[i] > len(tagging): + depend[i] = len(tagging) + return ( + [(string[i], tagging[i]) for i in range(len(depend))], + [(string[i], depend[i]) for i in range(len(depend))], + ) + + class TAGGING: def __init__( self, @@ -63,6 +244,8 @@ def __init__( settings, 
sess, model, + transitions, + features, is_lower = True, story = None, ): @@ -74,13 +257,83 @@ def __init__( self._model = model self._is_lower = is_lower self._story = story - - self._settings['idx2word'] = { - int(k): v for k, v in self._settings['idx2word'].items() - } self._settings['idx2tag'] = { int(k): v for k, v in self._settings['idx2tag'].items() } + self.transitions, self.features = self._sess.run( + [transitions, features] + ) + + def print_transitions(self, top_k = 10): + """ + Print important top-k transitions + + Parameters + ---------- + top_k : int + """ + assert isinstance(top_k, int), 'input must be an integer' + print('Top-%d likely transitions:' % (top_k)) + indices = np.argsort(self.transitions.flatten())[::-1] + top_trans = [ + np.unravel_index(i, self.transitions.shape) for i in indices[:top_k] + ] + for i in range(top_k): + print( + '%s -> %s: %f' + % ( + self._settings['idx2tag'][top_trans[i][0]], + self._settings['idx2tag'][top_trans[i][1]], + self.transitions[top_trans[i]], + ) + ) + + bottom_trans = [ + np.unravel_index(i, self.transitions.shape) + for i in indices[::-1][:top_k] + ] + print('\nTop-%d unlikely transitions:' % (top_k)) + for i in range(top_k): + print( + '%s -> %s: %f' + % ( + self._settings['idx2tag'][bottom_trans[i][0]], + self._settings['idx2tag'][bottom_trans[i][1]], + self.transitions[bottom_trans[i]], + ) + ) + + def print_features(self, top_k = 10): + """ + Print important top-k features + + Parameters + ---------- + top_k : int + """ + assert isinstance(top_k, int), 'input must be an integer' + _features = self.features.sum(axis = 1) + indices = np.argsort(_features)[::-1] + rev_indices = indices[::-1] + print('Top-%d positive:' % (top_k)) + for i in range(top_k): + print( + '%s: %f' + % ( + self._settings['idx2word'][str(indices[i])], + _features[indices[i]], + ) + ) + + print('\nTop-%d negative:' % (top_k)) + for i in range(top_k): + print( + '%s: %f' + % ( + self._settings['idx2word'][str(rev_indices[i])], + 
_features[rev_indices[i]], + ) + ) def predict(self, string): """ @@ -99,7 +352,7 @@ def predict(self, string): string = entities_textcleaning(string) batch_x = char_str_idx([string], self._settings['word2idx'], 2) batch_x_char = generate_char_seq( - batch_x, self._settings['idx2word'], self._settings['char2idx'] + [string], self._settings['char2idx'], 2 ) if self._model == 'entity-network': batch_x_expand = np.expand_dims(batch_x, axis = 1) diff --git a/malaya/_utils/_parse_dependency.py b/malaya/_utils/_parse_dependency.py new file mode 100644 index 00000000..0ed96314 --- /dev/null +++ b/malaya/_utils/_parse_dependency.py @@ -0,0 +1,382 @@ +import sys +import warnings + +if not sys.warnoptions: + warnings.simplefilter('ignore') + +from collections import defaultdict +from itertools import chain +from six import string_types + + +class DependencyGraph(object): + def __init__( + self, + tree_str = None, + cell_extractor = None, + zero_based = False, + cell_separator = None, + top_relation_label = 'ROOT', + ): + self.nodes = defaultdict( + lambda: { + 'address': None, + 'word': None, + 'lemma': None, + 'ctag': None, + 'tag': None, + 'feats': None, + 'head': None, + 'deps': defaultdict(list), + 'rel': None, + } + ) + + self.nodes[0].update({'ctag': 'TOP', 'tag': 'TOP', 'address': 0}) + + self.root = None + + if tree_str: + self._parse( + tree_str, + cell_extractor = cell_extractor, + zero_based = zero_based, + cell_separator = cell_separator, + top_relation_label = top_relation_label, + ) + + def remove_by_address(self, address): + del self.nodes[address] + + def redirect_arcs(self, originals, redirect): + for node in self.nodes.values(): + new_deps = [] + for dep in node['deps']: + if dep in originals: + new_deps.append(redirect) + else: + new_deps.append(dep) + node['deps'] = new_deps + + def add_arc(self, head_address, mod_address): + relation = self.nodes[mod_address]['rel'] + self.nodes[head_address]['deps'].setdefault(relation, []) + 
self.nodes[head_address]['deps'][relation].append(mod_address) + + def connect_graph(self): + for node1 in self.nodes.values(): + for node2 in self.nodes.values(): + if ( + node1['address'] != node2['address'] + and node2['rel'] != 'TOP' + ): + relation = node2['rel'] + node1['deps'].setdefault(relation, []) + node1['deps'][relation].append(node2['address']) + + def get_by_address(self, node_address): + """Return the node with the given address.""" + return self.nodes[node_address] + + def contains_address(self, node_address): + """ + Returns true if the graph contains a node with the given node + address, false otherwise. + """ + return node_address in self.nodes + + def to_dot(self): + s = 'digraph G{\n' + s += 'edge [dir=forward]\n' + s += 'node [shape=plaintext]\n' + + for node in sorted(self.nodes.values(), key = lambda v: v['address']): + s += '\n%s [label="%s (%s)"]' % ( + node['address'], + node['address'], + node['word'], + ) + for rel, deps in node['deps'].items(): + for dep in deps: + if rel is not None: + s += '\n%s -> %s [label="%s"]' % ( + node['address'], + dep, + rel, + ) + else: + s += '\n%s -> %s ' % (node['address'], dep) + s += '\n}' + + return s + + def left_children(self, node_index): + children = chain.from_iterable(self.nodes[node_index]['deps'].values()) + index = self.nodes[node_index]['address'] + return sum(1 for c in children if c < index) + + def right_children(self, node_index): + children = chain.from_iterable(self.nodes[node_index]['deps'].values()) + index = self.nodes[node_index]['address'] + return sum(1 for c in children if c > index) + + def add_node(self, node): + if not self.contains_address(node['address']): + self.nodes[node['address']].update(node) + + def _parse( + self, + input_, + cell_extractor = None, + zero_based = False, + cell_separator = None, + top_relation_label = 'ROOT', + ): + def extract_3_cells(cells, index): + word, tag, head = cells + return index, word, word, tag, tag, '', head, '' + + def 
extract_4_cells(cells, index): + word, tag, head, rel = cells + return index, word, word, tag, tag, '', head, rel + + def extract_7_cells(cells, index): + line_index, word, lemma, tag, _, head, rel = cells + try: + index = int(line_index) + except ValueError: + # index can't be parsed as an integer, use default + pass + return index, word, lemma, tag, tag, '', head, rel + + def extract_10_cells(cells, index): + line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells + try: + index = int(line_index) + except ValueError: + # index can't be parsed as an integer, use default + pass + return index, word, lemma, ctag, tag, feats, head, rel + + extractors = { + 3: extract_3_cells, + 4: extract_4_cells, + 7: extract_7_cells, + 10: extract_10_cells, + } + + if isinstance(input_, string_types): + input_ = (line for line in input_.split('\n')) + + lines = (l.rstrip() for l in input_) + lines = (l for l in lines if l) + + cell_number = None + for index, line in enumerate(lines, start = 1): + cells = line.split(cell_separator) + if cell_number is None: + cell_number = len(cells) + else: + assert cell_number == len(cells) + + if cell_extractor is None: + try: + cell_extractor = extractors[cell_number] + except KeyError: + raise ValueError( + 'Number of tab-delimited fields ({0}) not supported by ' + 'CoNLL(10) or Malt-Tab(4) format'.format(cell_number) + ) + + try: + index, word, lemma, ctag, tag, feats, head, rel = cell_extractor( + cells, index + ) + except (TypeError, ValueError): + word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells) + + if head == '_': + continue + + head = int(head) + if zero_based: + head += 1 + + self.nodes[index].update( + { + 'address': index, + 'word': word, + 'lemma': lemma, + 'ctag': ctag, + 'tag': tag, + 'feats': feats, + 'head': head, + 'rel': rel, + } + ) + + # Make sure that the fake root node has labeled dependencies. 
+ if (cell_number == 3) and (head == 0): + rel = top_relation_label + self.nodes[head]['deps'][rel].append(index) + + if self.nodes[0]['deps'][top_relation_label]: + root_address = self.nodes[0]['deps'][top_relation_label][0] + self.root = self.nodes[root_address] + self.top_relation_label = top_relation_label + else: + warnings.warn( + "The graph doesn't contain a node " + 'that depends on the root element.' + ) + + def _word(self, node, filter = True): + w = node['word'] + if filter: + if w != ',': + return w + return w + + def triples(self, node = None): + """ + Extract dependency triples of the form: + ((head word, head tag), rel, (dep word, dep tag)) + """ + + if not node: + node = self.root + + head = (node['word'], node['ctag']) + for i in sorted(chain.from_iterable(node['deps'].values())): + dep = self.get_by_address(i) + yield (head, dep['rel'], (dep['word'], dep['ctag'])) + for triple in self.triples(node = dep): + yield triple + + def _hd(self, i): + try: + return self.nodes[i]['head'] + except IndexError: + return None + + def _rel(self, i): + try: + return self.nodes[i]['rel'] + except IndexError: + return None + + def contains_cycle(self): + """Check whether there are cycles. + + >>> dg = DependencyGraph(treebank_data) + >>> dg.contains_cycle() + False + + >>> cyclic_dg = DependencyGraph() + >>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0} + >>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1} + >>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2} + >>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3} + >>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4} + >>> cyclic_dg.nodes = { + ... 0: top, + ... 1: child1, + ... 2: child2, + ... 3: child3, + ... 4: child4, + ... 
} + >>> cyclic_dg.root = top + + >>> cyclic_dg.contains_cycle() + [3, 1, 2, 4] + + """ + distances = {} + + for node in self.nodes.values(): + for dep in node['deps']: + key = tuple([node['address'], dep]) + distances[key] = 1 + + for _ in self.nodes: + new_entries = {} + + for pair1 in distances: + for pair2 in distances: + if pair1[1] == pair2[0]: + key = tuple([pair1[0], pair2[1]]) + new_entries[key] = distances[pair1] + distances[pair2] + + for pair in new_entries: + distances[pair] = new_entries[pair] + if pair[0] == pair[1]: + path = self.get_cycle_path( + self.get_by_address(pair[0]), pair[0] + ) + return path + + return False + + def get_cycle_path(self, curr_node, goal_node_index): + for dep in curr_node['deps']: + if dep == goal_node_index: + return [curr_node['address']] + for dep in curr_node['deps']: + path = self.get_cycle_path( + self.get_by_address(dep), goal_node_index + ) + if len(path) > 0: + path.insert(0, curr_node['address']) + return path + return [] + + def to_conll(self, style): + + if style == 3: + template = '{word}\t{tag}\t{head}\n' + elif style == 4: + template = '{word}\t{tag}\t{head}\t{rel}\n' + elif style == 10: + template = '{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n' + else: + raise ValueError( + 'Number of tab-delimited fields ({0}) not supported by ' + 'CoNLL(10) or Malt-Tab(4) format'.format(style) + ) + + return ''.join( + template.format(i = i, **node) + for i, node in sorted(self.nodes.items()) + if node['tag'] != 'TOP' + ) + + def to_graphvis(self): + try: + from graphviz import Source + except: + raise Exception( + 'graphiz not installed. Please install it and try again.' + ) + return Source(self.to_dot()) + + def to_nx_graph(self): + try: + import networkx + except: + raise Exception( + 'networkx not installed. Please install it and try again.' 
+ ) + + nx_nodelist = list(range(1, len(self.nodes))) + nx_edgelist = [ + (n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n) + ] + self.nx_labels = {} + for n in nx_nodelist: + self.nx_labels[n] = self.nodes[n]['word'] + + g = networkx.MultiDiGraph() + g.add_nodes_from(nx_nodelist) + g.add_edges_from(nx_edgelist) + + return g diff --git a/malaya/_utils/_paths.py b/malaya/_utils/_paths.py index 5e3c0864..a1694a3d 100644 --- a/malaya/_utils/_paths.py +++ b/malaya/_utils/_paths.py @@ -109,55 +109,55 @@ } PATH_POS = { - 'crf': {'model': home + '/pos/crf/crf-pos.pkl', 'version': 'v8'}, + 'crf': {'model': home + '/pos/crf/crf-pos.pkl', 'version': 'v14'}, 'concat': { 'model': home + '/pos/concat/concat-pos.pb', 'setting': home + '/pos/concat/concat-pos.json', - 'version': 'v8', + 'version': 'v14', }, 'luong': { 'model': home + '/pos/luong/luong-pos.pb', 'setting': home + '/pos/luong/luong-pos.json', - 'version': 'v8', + 'version': 'v14', }, 'bahdanau': { 'model': home + '/pos/bahdanau/bahdanau-pos.pb', 'setting': home + '/pos/bahdanau/bahdanau-pos.json', - 'version': 'v8', + 'version': 'v14', }, 'entity-network': { 'model': home + '/pos/entity-network/entity-pos.pb', 'setting': home + 'pos/entity-network/entity-pos.json', - 'version': 'v8', + 'version': 'v14', }, 'attention': { 'model': home + '/pos/attention/attention-pos.pb', 'setting': home + '/pos/attention/attention-pos.json', - 'version': 'v8', + 'version': 'v14', }, } S3_PATH_POS = { - 'crf': {'model': 'v8/pos/crf-pos.pkl'}, + 'crf': {'model': 'v14/pos/crf-pos.pkl'}, 'concat': { - 'model': 'v8/pos/concat-pos.pb', - 'setting': 'v8/pos/concat-pos.json', + 'model': 'v14/pos/concat-pos.pb', + 'setting': 'v14/pos/concat-pos.json', }, 'luong': { - 'model': 'v8/pos/luong-pos.pb', - 'setting': 'v8/pos/luong-pos.json', + 'model': 'v14/pos/luong-pos.pb', + 'setting': 'v14/pos/luong-pos.json', }, 'bahdanau': { - 'model': 'v8/pos/bahdanau-pos.pb', - 'setting': 'v8/pos/bahdanau-pos.json', + 'model': 
'v14/pos/bahdanau-pos.pb', + 'setting': 'v14/pos/bahdanau-pos.json', }, 'entity-network': { - 'model': 'v8/pos/entity-pos.pb', - 'setting': 'v8/pos/entity-pos.json', + 'model': 'v14/pos/entity-pos.pb', + 'setting': 'v14/pos/entity-pos.json', }, 'attention': { - 'model': 'v8/pos/attention-pos.pb', - 'setting': 'v8/pos/attention-pos.json', + 'model': 'v14/pos/attention-pos.pb', + 'setting': 'v14/pos/attention-pos.json', }, } @@ -233,51 +233,51 @@ 'concat': { 'model': home + '/entity/concat/concat-entities.pb', 'setting': home + '/entity/concat/concat-entities.json', - 'version': 'v8', + 'version': 'v14', }, 'luong': { 'model': home + '/entity/luong/luong-entities.pb', 'setting': home + '/entity/luong/luong-entities.json', - 'version': 'v8', + 'version': 'v14', }, 'bahdanau': { 'model': home + '/entity/bahdanau/bahdanau-entities.pb', 'setting': home + '/entity/bahdanau/bahdanau-entities.json', - 'version': 'v8', + 'version': 'v14', }, 'entity-network': { 'model': home + '/entity/entity-network/entity-entities.pb', 'setting': home + '/entity/entity-network/entity-entities.json', - 'version': 'v8', + 'version': 'v14', }, 'attention': { 'model': home + '/entity/attention/attention-entities.pb', 'setting': home + '/entity/attention/attention-entities.json', - 'version': 'v8', + 'version': 'v14', }, } S3_PATH_ENTITIES = { 'crf': {'model': 'v8/entities/crf-entities.pkl'}, 'concat': { - 'model': 'v8/entities/concat-entities.pb', - 'setting': 'v8/entities/concat-entities.json', + 'model': 'v14/entities/concat-entities.pb', + 'setting': 'v14/entities/concat-entities.json', }, 'luong': { - 'model': 'v8/entities/luong-entities.pb', - 'setting': 'v8/entities/luong-entities.json', + 'model': 'v14/entities/luong-entities.pb', + 'setting': 'v14/entities/luong-entities.json', }, 'bahdanau': { - 'model': 'v8/entities/bahdanau-entities.pb', - 'setting': 'v8/entities/bahdanau-entities.json', + 'model': 'v14/entities/bahdanau-entities.pb', + 'setting': 
'v14/entities/bahdanau-entities.json', }, 'entity-network': { - 'model': 'v8/entities/entity-entities.pb', - 'setting': 'v8/entities/entity-entities.json', + 'model': 'v14/entities/entity-entities.pb', + 'setting': 'v14/entities/entity-entities.json', }, 'attention': { - 'model': 'v8/entities/attention-entities.pb', - 'setting': 'v8/entities/attention-entities.json', + 'model': 'v14/entities/attention-entities.pb', + 'setting': 'v14/entities/attention-entities.json', }, } @@ -597,3 +597,45 @@ 'vector': 'v12/emotion/vectorizer-sparse-emotion.pkl', }, } + +PATH_DEPEND = { + 'crf': { + 'model': home + '/dependency/crf/crf-label.pkl', + 'depend': home + '/dependency/crf/crf-depend.pkl', + 'version': 'v14', + }, + 'concat': { + 'model': home + '/dependency/concat/concat-dependency.pb', + 'setting': home + '/dependency/concat/concat-dependency.json', + 'version': 'v14', + }, + 'luong': { + 'model': home + '/dependency/luong/luong-dependency.pb', + 'setting': home + '/dependency/luong/luong-dependency.json', + 'version': 'v14', + }, + 'bahdanau': { + 'model': home + '/dependency/bahdanau/bahdanau-dependency.pb', + 'setting': home + '/dependency/bahdanau/bahdanau-dependency.json', + 'version': 'v14', + }, +} + +S3_PATH_DEPEND = { + 'crf': { + 'model': 'v14/dependency/crf-label.pkl', + 'depend': 'v14/dependency/crf-depend.pkl', + }, + 'concat': { + 'model': 'v14/dependency/concat-dependency.pb', + 'setting': 'v14/dependency/concat-dependency.json', + }, + 'luong': { + 'model': 'v14/dependency/luong-dependency.pb', + 'setting': 'v14/dependency/luong-dependency.json', + }, + 'bahdanau': { + 'model': 'v14/dependency/bahdanau-dependency.pb', + 'setting': 'v14/dependency/bahdanau-dependency.json', + }, +} diff --git a/malaya/_utils/_tag_class.py b/malaya/_utils/_tag_class.py index 91993dda..1adcf9f3 100644 --- a/malaya/_utils/_tag_class.py +++ b/malaya/_utils/_tag_class.py @@ -13,7 +13,7 @@ from .._models._sklearn_model import CRF -def crf(path, s3_path, class_name): +def 
crf(path, s3_path, class_name, is_lower = True): check_file(path['crf'], s3_path['crf']) try: with open(path['crf']['model'], 'rb') as fopen: @@ -23,10 +23,10 @@ def crf(path, s3_path, class_name): "model corrupted due to some reasons, please run malaya.clear_cache('%s/crf') and try again" % (class_name) ) - return CRF(model) + return CRF(model, is_lower = is_lower) -def deep_model(path, s3_path, class_name, model = 'bahdanau'): +def deep_model(path, s3_path, class_name, model = 'bahdanau', is_lower = True): """ Load deep learning NER model. @@ -66,6 +66,9 @@ def deep_model(path, s3_path, class_name, model = 'bahdanau'): nodes, tf.InteractiveSession(graph = g), model, + g.get_tensor_by_name('import/transitions:0'), + g.get_tensor_by_name('import/Variable:0'), + is_lower = is_lower, story = g.get_tensor_by_name('import/story:0'), ) else: @@ -76,6 +79,9 @@ def deep_model(path, s3_path, class_name, model = 'bahdanau'): nodes, tf.InteractiveSession(graph = g), model, + g.get_tensor_by_name('import/transitions:0'), + g.get_tensor_by_name('import/Variable:0'), + is_lower = is_lower, ) else: diff --git a/malaya/dependency.py b/malaya/dependency.py new file mode 100644 index 00000000..0a1dfafe --- /dev/null +++ b/malaya/dependency.py @@ -0,0 +1,101 @@ +import sys +import warnings + +if not sys.warnoptions: + warnings.simplefilter('ignore') + +import pickle +import json +import tensorflow as tf +from ._utils._utils import check_file, load_graph +from ._models._sklearn_model import DEPENDENCY +from ._models._tensorflow_model import DEPENDENCY as TF_DEPENDENCY +from ._utils._parse_dependency import DependencyGraph +from ._utils._paths import PATH_DEPEND, S3_PATH_DEPEND + + +def dependency_graph(tagging, indexing): + result = [] + for i in range(len(tagging)): + result.append( + '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_' + % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1]) + ) + return DependencyGraph('\n'.join(result), top_relation_label = 'root') + + +def 
available_deep_model(): + """ + List available deep learning dependency models, ['concat', 'bahdanau', 'luong'] + """ + return ['concat', 'bahdanau', 'luong'] + + +def crf(): + """ + Load CRF dependency model. + + Returns + ------- + DEPENDENCY : malaya._models._sklearn_model.DEPENDENCY class + """ + check_file(PATH_DEPEND['crf'], S3_PATH_DEPEND['crf']) + try: + with open(PATH_DEPEND['crf']['model'], 'rb') as fopen: + model = pickle.load(fopen) + with open(PATH_DEPEND['crf']['depend'], 'rb') as fopen: + depend = pickle.load(fopen) + except: + raise Exception( + "model corrupted due to some reasons, please run malaya.clear_cache('dependency/crf') and try again" + ) + return DEPENDENCY(model, depend) + + +def deep_model(model = 'bahdanau'): + """ + Load deep learning dependency model. + + Parameters + ---------- + model : str, optional (default='bahdanau') + Model architecture supported. Allowed values: + + * ``'concat'`` - Concating character and word embedded for BiLSTM + * ``'bahdanau'`` - Concating character and word embedded including Bahdanau Attention for BiLSTM + * ``'luong'`` - Concating character and word embedded including Luong Attention for BiLSTM + + Returns + ------- + DEPENDENCY: malaya._models._tensorflow_model.DEPENDENCY class + """ + assert isinstance(model, str), 'model must be a string' + model = model.lower() + if model in ['concat', 'bahdanau', 'luong']: + check_file(PATH_DEPEND[model], S3_PATH_DEPEND[model]) + try: + with open(PATH_DEPEND[model]['setting'], 'r') as fopen: + nodes = json.loads(fopen.read()) + g = load_graph(PATH_DEPEND[model]['model']) + except: + raise Exception( + "model corrupted due to some reasons, please run malaya.clear_cache('dependency/%s') and try again" + % (model) + ) + return TF_DEPENDENCY( + g.get_tensor_by_name('import/Placeholder:0'), + g.get_tensor_by_name('import/Placeholder_1:0'), + g.get_tensor_by_name('import/logits:0'), + g.get_tensor_by_name('import/logits_depends:0'), + nodes, + 
tf.InteractiveSession(graph = g), + model, + g.get_tensor_by_name('import/transitions:0'), + g.get_tensor_by_name('import/depends/transitions:0'), + g.get_tensor_by_name('import/Variable:0'), + ) + + else: + raise Exception( + 'model not supported, please check supported models from malaya.dependency.available_deep_model()' + ) diff --git a/malaya/pos.py b/malaya/pos.py index dd0a7ad2..1e6d4b29 100644 --- a/malaya/pos.py +++ b/malaya/pos.py @@ -68,7 +68,7 @@ def crf(): ------- CRF : malaya.sklearn_model.CRF class """ - return _tag_class.crf(PATH_POS, S3_PATH_POS, 'pos') + return _tag_class.crf(PATH_POS, S3_PATH_POS, 'pos', is_lower = False) def deep_model(model = 'concat'): @@ -90,4 +90,6 @@ def deep_model(model = 'concat'): ------- TAGGING: malaya.tensorflow_model.TAGGING class """ - return _tag_class.deep_model(PATH_POS, S3_PATH_POS, 'pos', model = model) + return _tag_class.deep_model( + PATH_POS, S3_PATH_POS, 'pos', model = model, is_lower = False + ) diff --git a/malaya/stack.py b/malaya/stack.py index 989ddea7..cb4009a3 100644 --- a/malaya/stack.py +++ b/malaya/stack.py @@ -8,7 +8,7 @@ def _most_common(l): def voting_stack(models, text): """ - Stacking for POS and Entities Recognition models. + Stacking for POS, Entities and Dependency models. 
Parameters ---------- @@ -23,16 +23,37 @@ def voting_stack(models, text): """ assert isinstance(models, list), 'models must be a list' assert isinstance(text, str), 'text must be a string' - results, texts, votes = [], [], [] + results, texts, votes, votes_indices, indices = [], [], [], [], [] + is_dependency = False for i in range(len(models)): assert 'predict' in dir(models[i]), 'all models must able to predict' - predicted = np.array(models[i].predict(text)) + predicted = models[i].predict(text) + if isinstance(predicted, tuple): + is_dependency = True + predicted, indexing = predicted + indexing = np.array(indexing) + indices.append(indexing[:, 1:2]) + + predicted = np.array(predicted) results.append(predicted[:, 1:2]) texts.append(predicted[:, 0]) concatenated = np.concatenate(results, axis = 1) for row in concatenated: votes.append(_most_common(row.tolist())) - return list(map(lambda X: (X[0], X[1]), list(zip(texts[-1], votes)))) + if is_dependency: + concatenated = np.concatenate(indices, axis = 1) + for row in concatenated: + votes_indices.append(_most_common(row.tolist())) + output = list(map(lambda X: (X[0], X[1]), list(zip(texts[-1], votes)))) + if is_dependency: + return ( + output, + list( + map(lambda X: (X[0], X[1]), list(zip(texts[-1], votes_indices))) + ), + ) + else: + return output def predict_stack(models, text, mode = 'gmean'): diff --git a/malaya/summarize.py b/malaya/summarize.py index 3db0a3cd..3252acbb 100644 --- a/malaya/summarize.py +++ b/malaya/summarize.py @@ -42,6 +42,9 @@ def deep_model_wiki(): ------- DEEP_SUMMARIZER: malaya.skip_thought.DEEP_SUMMARIZER class """ + print( + 'WARNING: this model is using convolutional based, Tensorflow-GPU above 1.10 may got a problem. Please downgrade to Tensorflow-GPU v1.8 if got any cuDNN error.' 
+ ) return _skip_thought.wiki_load_model() diff --git a/malaya/texts/_text_functions.py b/malaya/texts/_text_functions.py index 06a6698d..c03491df 100644 --- a/malaya/texts/_text_functions.py +++ b/malaya/texts/_text_functions.py @@ -386,23 +386,24 @@ def add_ngram(sequences, token_indice, ngram = (2, 3)): return new_sequences -def char_str_idx(corpus, dic, UNK = 0): +def char_str_idx(corpus, dic, UNK = 2): maxlen = max([len(i) for i in corpus]) X = np.zeros((len(corpus), maxlen)) for i in range(len(corpus)): - for no, k in enumerate(corpus[i][:maxlen][::-1]): - X[i, -1 - no] = dic.get(k, UNK) + for no, k in enumerate(corpus[i][:maxlen]): + X[i, no] = dic.get(k, UNK) return X -def generate_char_seq(batch, idx2word, char2idx): - x = [[len(idx2word[i]) for i in k] for k in batch] +def generate_char_seq(batch, dic, UNK = 2): + maxlen_c = max([len(k) for k in batch]) + x = [[len(i) for i in k] for k in batch] maxlen = max([j for i in x for j in i]) - temp = np.zeros((batch.shape[0], batch.shape[1], maxlen), dtype = np.int32) - for i in range(batch.shape[0]): - for k in range(batch.shape[1]): - for no, c in enumerate(idx2word[batch[i, k]].lower()): - temp[i, k, -1 - no] = char2idx[c] + temp = np.zeros((len(batch), maxlen_c, maxlen), dtype = np.int32) + for i in range(len(batch)): + for k in range(len(batch[i])): + for no, c in enumerate(batch[i][k][::-1]): + temp[i, k, -1 - no] = dic.get(c, UNK) return temp diff --git a/malaya/texts/vectorizer.py b/malaya/texts/vectorizer.py index 2aa2b905..c69c0a2f 100644 --- a/malaya/texts/vectorizer.py +++ b/malaya/texts/vectorizer.py @@ -105,6 +105,49 @@ def features_crf(sentence, index): } +def features_crf_dependency(sentence, tag, index): + return { + 'word': sentence[index], + 'tag': tag[index], + 'is_first': index == 0, + 'is_last': index == len(sentence) - 1, + 'prefix-1': sentence[index][0], + 'prefix-2': sentence[index][:2], + 'prefix-3': sentence[index][:3], + 'suffix-1': sentence[index][-1], + 'suffix-2': 
sentence[index][-2:], + 'suffix-3': sentence[index][-3:], + 'prev_word': '' if index == 0 else sentence[index - 1], + 'prev_word-prefix-1': '' if index == 0 else sentence[index - 1][0], + 'prev_word-prefix-2': '' if index == 0 else sentence[index - 1][:2], + 'prev_word-prefix-3': '' if index == 0 else sentence[index - 1][:3], + 'prev_word-suffix-1': '' if index == 0 else sentence[index - 1][-1], + 'prev_word-suffix-2': '' if index == 0 else sentence[index - 1][-2:], + 'prev_word-suffix-3': '' if index == 0 else sentence[index - 1][-3:], + 'next_word-prefix-1': '' + if index == len(sentence) - 1 + else sentence[index + 1][0], + 'next_word-prefix-2': '' + if index == len(sentence) - 1 + else sentence[index + 1][:2], + 'next_word-prefix-3': '' + if index == len(sentence) - 1 + else sentence[index + 1][:3], + 'next_word-suffix-1': '' + if index == len(sentence) - 1 + else sentence[index + 1][-1], + 'next_word-suffix-2': '' + if index == len(sentence) - 1 + else sentence[index + 1][-2:], + 'next_word-suffix-3': '' + if index == len(sentence) - 1 + else sentence[index + 1][-3:], + 'next_word': '' if index == len(sentence) - 1 else sentence[index + 1], + 'has_hyphen': '-' in sentence[index], + 'is_numeric': sentence[index].isdigit(), + } + + def skipgrams( sequence, vocabulary_size, diff --git a/malaya/topic_model.py b/malaya/topic_model.py index 10cfd475..cf91ddfd 100644 --- a/malaya/topic_model.py +++ b/malaya/topic_model.py @@ -121,8 +121,13 @@ def visualize_topics(self, notebook_mode = False, mds = 'pcoa'): assert isinstance( notebook_mode, bool ), 'notebook_mode must be a boolean' - import pyLDAvis - import pyLDAvis.sklearn + try: + import pyLDAvis + import pyLDAvis.sklearn + except: + raise Exception( + 'pyldavis not installed. Please install it and try again.' 
+ ) if notebook_mode: pyLDAvis.enable_notebook() diff --git a/malaya/word2vec.py b/malaya/word2vec.py index 42486981..d70483fb 100644 --- a/malaya/word2vec.py +++ b/malaya/word2vec.py @@ -287,10 +287,15 @@ def tree_plot( ------- list_dictionaries: list of results """ - import matplotlib.pyplot as plt - import seaborn as sns + try: + import matplotlib.pyplot as plt + import seaborn as sns - sns.set() + sns.set() + except: + raise Exception( + 'matplotlib and seaborn not installed. Please install it and try again.' + ) idx = [ self.words.index(e[0] if isinstance(e, list) else e) for e in labels ] diff --git a/readme-pypi.rst b/readme-pypi.rst index 77388aea..98936ef8 100644 --- a/readme-pypi.rst +++ b/readme-pypi.rst @@ -49,7 +49,6 @@ Features - **Entities Recognition** Latest state-of-art CRF deep learning models to do Naming Entity Recognition. - - **Language Detection** using Multinomial, SGD, XGB, Fast-text N-grams deep learning to distinguish Malay, English, and Indonesian. @@ -63,6 +62,9 @@ Features - **Part-of-Speech Recognition** Latest state-of-art CRF deep learning models to do Naming Entity Recognition. +- **Dependency Parsing** + + Latest state-of-art CRF deep learning models to do analyzes the grammatical structure of a sentence, establishing relationships between words. - **Sentiment Analysis** From BERT, Fast-Text, Dynamic-Memory Network, Sparse Tensorflow, Attention Neural Network to build deep sentiment analysis models. diff --git a/setup-gpu.py b/setup-gpu.py index 7d7b34e9..4a7704ec 100644 --- a/setup-gpu.py +++ b/setup-gpu.py @@ -6,7 +6,7 @@ setuptools.setup( name = __packagename__, packages = setuptools.find_packages(), - version = '1.3.0.1', + version = '1.4.0', python_requires = '==3.6.*', description = 'Natural-Language-Toolkit for bahasa Malaysia, powered by Deep Learning. 
GPU Version', author = 'huseinzol05', @@ -30,9 +30,6 @@ 'pandas', 'PySastrawi', 'toolz', - 'pyldavis', - 'matplotlib', - 'seaborn', ], license = 'MIT', classifiers = [ diff --git a/setup.py b/setup.py index 013a9058..cde4d7f7 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name = __packagename__, packages = setuptools.find_packages(), - version = '1.3.0.1', + version = '1.4.0', python_requires = '==3.6.*', description = 'Natural-Language-Toolkit for bahasa Malaysia, powered by Deep Learning.', author = 'huseinzol05', @@ -30,9 +30,6 @@ 'pandas', 'PySastrawi', 'toolz', - 'pyldavis', - 'matplotlib', - 'seaborn', ], license = 'MIT', classifiers = [