-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sentiment_analysis_naive.py
219 lines (169 loc) · 6.52 KB
/
Sentiment_analysis_naive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import pdb, pickle
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
import re
def process_tweet(tweet):
    """Clean, tokenize and stem a raw tweet.

    Input:
        tweet: a string with the raw tweet text
    Output:
        tweets_clean: a list of lowercased, stemmed tokens with stock
            tickers, "RT" markers, URLs, hash signs, English stopwords
            and punctuation removed
    """
    stemmer = nltk.PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old-style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove the hash sign but keep the tag text
    tweet = re.sub(r'#', '', tweet)
    # use the TweetTokenizer imported at the top of the file (the original
    # reached through the nltk namespace despite the direct import)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    for word in tweet_tokens:
        # NOTE: `word not in string.punctuation` is a substring test, so it
        # only drops single punctuation characters; multi-char tokens such
        # as "..." pass through (kept as-is to preserve behavior)
        if (word not in stopwords_english and
                word not in string.punctuation):
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)
    return tweets_clean
def lookup(freqs, word, label):
    """Return how often (word, label) occurs in the frequency table.

    Input:
        freqs: dictionary mapping (word, label) pairs to counts
        word: the word to look up
        label: the sentiment label (0 or 1)
    Output:
        the stored count, or 0 when the pair is absent
    """
    # dict.get with a default replaces the manual `p == None` check
    return freqs.get((word, label), 0)
# Load the labelled tweet corpora shipped with nltk.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
# First 4000 tweets of each polarity train the model; the remainder
# forms the held-out validation set.
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]
train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_x = train_pos + train_neg
test_x = test_pos + test_neg
# Labels: 1.0 for positive tweets, 0.0 for negative ones.
train_y = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
test_y = np.concatenate([np.ones(len(test_pos)), np.zeros(len(test_neg))])
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
# show what the preprocessing pipeline produces for a sample tweet
print(process_tweet(custom_tweet))
def count_tweets(result, tweets, ys):
    """
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: the same dictionary, updated in place, mapping each
            (word, label) pair to its frequency
    """
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            # dict.get collapses the exists/insert branches into one line
            result[pair] = result.get(pair, 0) + 1
    return result
# Quick sanity check of count_tweets on a tiny hand-labelled corpus.
result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)
# Build the real (word, label) -> count table from the training split.
freqs = count_tweets({}, train_x, train_y)
def train_naive_bayes(freqs, train_x, train_y):
    """Train a Naive Bayes sentiment model with Laplacian smoothing.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets (0,1)
    Output:
        logprior: the log prior, log(D_pos) - log(D_neg)
        loglikelihood: dictionary mapping each vocabulary word to
            log P(word|pos) - log P(word|neg)
    """
    loglikelihood = {}
    # V: the number of unique words in the vocabulary
    vocab = {word for word, _ in freqs}
    V = len(vocab)
    # N_pos / N_neg: total token counts over positive / negative tweets
    N_pos = N_neg = 0
    for (word, label), count in freqs.items():
        if label > 0:
            N_pos += count
        else:
            N_neg += count
    # D_pos / D_neg: number of positive / negative training documents
    D = len(train_x)
    D_pos = np.sum(train_y)
    D_neg = D - D_pos
    logprior = np.log(D_pos) - np.log(D_neg)
    for word in vocab:
        # per-label counts of the word (0 when unseen); a plain dict.get
        # replaces the thin lookup() helper so the loop is self-contained
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)
        # Laplacian (add-one) smoothed conditional probabilities
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
        # log likelihood ratio of the word
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)
    return logprior, loglikelihood
# Fit the classifier on the full training split.
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))
# Optionally persist logprior and loglikelihood as pickle files.
# NOTE(review): the block below is disabled (wrapped in a no-op string);
# re-enable to save the trained model. "Doc_logptiot.pickle" looks like a
# typo for "Doc_logprior.pickle" — confirm before enabling.
"""
save_documents = open(r"Doc_logptiot.pickle", "wb")
pickle.dump(logprior, save_documents)
save_documents.close()
save_documents1 = open(r"Doc_loglikelihood.pickle", "wb")
pickle.dump(loglikelihood, save_documents1)
save_documents1.close()
"""
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)
    """
    word_l = process_tweet(tweet)
    # unknown words contribute 0, so a defaulted dict.get avoids the
    # membership-test-then-lookup double access of the original
    return logprior + sum(loglikelihood.get(word, 0) for word in word_l)
# Smoke test: a positive sentence should score above 0.
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    # a positive score classifies the tweet as positive (1), else negative (0)
    y_hats = [
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ]
    # error is the fraction of predictions that disagree with the labels
    error = (y_hats != np.squeeze(test_y)).sum() / len(test_x)
    return float(1 - error)
# Report held-out accuracy of the trained classifier.
print("Naive Bayes accuracy = %0.4f" %
(test_naive_bayes(test_x, test_y, logprior, loglikelihood)))
# Feel free to check the sentiment of your own tweet below
# Note here that a positive value shows positive sentiment and its magnitude tells how much positive it is, similarly
# for the negative tweets.
my_tweet = 'you are bad :('
print(naive_bayes_predict(my_tweet, logprior, loglikelihood))