-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathLDA_model.R
211 lines (178 loc) · 4.92 KB
/
LDA_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#### load libraries
library(tm)
library(topicmodels)
library(ggplot2)
library(MASS)
#### load data
tab <- read.csv("8-21-2014_node_data.csv", header=FALSE)
# fix datetime
tab[,2] <- as.POSIXct(tab[,2])
# order by node and date
tab <- tab[order(tab[,1], tab[,2]),]
# save empty data
save.image("EmptyData.RData")
# make the corpus
vect <- VectorSource(tab[,3])
corpus <- Corpus(vect)
# clean the corpus
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
# generate document term matrix
dtm <- DocumentTermMatrix(corpus)
# save the document term matrix data
save.image("DTM.RData")
#### LDA model
model = LDA(dtm, 20)
# save the model image
save.image("20_Topic_LDA.RData")
# input values
K <- 20
xi2 <- 20
delta2 <- 20
eta <- 0
# make nodes, the length of nodes and the number of periods
nodes <- unique(tab[,1])
ln_nodes <- length(nodes)
stp <- length(tab[,1]) / ln_nodes
# convert topics into mat
top_mat <- matrix(NA, ln_nodes, stp)
topics <- topics(model)
for(i in 1:ln_nodes){
for(j in 1:stp){
top_mat[i, j] <- topics[(i - 1) * ln_nodes + j]
}
}
#### Conditional Prob
cond_prob_mat <- matrix(NA, ln_nodes, ln_nodes)
# iterate through nodes
for(i in 1:ln_nodes){
for(j in 1:ln_nodes){
i_d <- top_mat[i, 2:stp]
j_d <- top_mat[j, 1:(stp-1)]
ln_data <- length(i_d)
cond_prob_mat[i, j] <- sum(i_d == j_d) / ln_data
}
}
#### Generate Truth Value
# independent truth value
truth_ind <- matrix(NA, K, stp - 1)
for(i in 1:K){
for(j in 2:stp){
truth_ij <- 0
prev_ind <- which(top_mat[,j-1] == i)
curr_ind <- which(top_mat[,j] == i)
for(k in curr_ind){
belief <- 1
for(q in prev_ind){
belief <- belief * (1 - cond_prob_mat[k, q])
}
truth_ij <- truth_ij + belief
}
truth_ij <- truth_ij / ln_nodes
truth_ind[i, j - 1] <- truth_ij
}
}
# dependent truth value
truth_dep <- matrix(NA, K, stp - 1)
for(i in 1:K){
for(j in 2:stp){
truth_dep[i, j - 1] <- sum(top_mat[,j] == i) / ln_nodes
}
}
#### Risk for each Node Period
topic_risk <- list()
for(i in 1:K){
t_mat <- matrix(NA, ln_nodes, stp - 1)
for(j in 2:stp){
prev_ind <- which(top_mat[,j - 1] == i)
for(k in 1:ln_nodes){
node_risk <- sum(cond_prob_mat[k, prev_ind])
t_mat[k, j - 1] <- node_risk
}
}
topic_risk[[i]] <- t_mat
}
#### Expected Value for each Node Period
expected_val_mat <- matrix(NA, ln_nodes, stp - 1)
for(i in 1:ln_nodes){
for(j in 2:stp){
mx <- 0
mx_ind <- 1
for(k in 1:K){
if(topic_risk[[k]][i, j - 1] > mx){
mx <- topic_risk[[k]][i, j - 1]
mx_ind <- k
}
}
expected_val_mat[i, j - 1] <- mx_ind
}
}
#### Quality of Node
influence <- rep(NA, ln_nodes)
for(i in 1:ln_nodes){
influence[i] <- sum(cond_prob_mat[,i])
}
### OLS
# initalize storage
reg_l <- list()
p_l <- list()
# iterate through the nodes
for(i in 1:K){
reg_mat <- matrix(NA, ln_nodes, ln_nodes)
p_mat <- matrix(NA, ln_nodes, ln_nodes)
for(j in 1:ln_nodes){
D <- model@gamma[,i][((j-1)*stp + 2):(j*stp)]
reg_data <- data.frame(D)
for(k in 1:ln_nodes){
reg_data[paste("I", k, sep="")] <- model@gamma[,i][((k-1)*stp + 1):(k*stp -1)]
}
mod <- glm(D ~ ., data=reg_data, family=binomial(logit))
for(k in 1:ln_nodes){
reg_mat[j, k] <- coef(summary(mod))[,1][k]
p_mat[j, k] <- coef(summary(mod))[,4][k]
}
}
reg_l[[i]] <- reg_mat
p_l[[i]] <- p_mat
}
#### MCMC Approach
# build data
theta_n <- list()
for (i in 1:(stp-1)){
NK_mat <- matrix(NA, ln_nodes, K)
for(j in 1:ln_nodes){
NK_mat[j,] <- model@gamma[(i - 1) * ln_nodes + j,]
}
theta_n[[i]] <- NK_mat
}
# build theta
theta <- list()
for(i in 1:ln_nodes){
theta_i <- matrix(NA, stp - 1, K)
for(j in 2:stp){
theta_i[j-1,] <- model@gamma[(j - 1) * ln_nodes + i,]
}
theta[[i]] <- theta_i
}
I_N <- diag(ln_nodes)
# mcmc results
gamma <- matrix(NA, ln_nodes, ln_nodes)
pval1 <- matrix(NA, ln_nodes, ln_nodes)
pval2 <- matrix(NA, ln_nodes, ln_nodes)
lambda_gi = solve(1 / xi2 * I_N + Reduce('+', theta_n) %*% t(Reduce('+', theta_n)))
for (i in 1:ln_nodes){
mu_gi <- t(lambda_gi) %*% (eta / xi2 + (Reduce('+', theta_n) %*% colSums(theta[[i]])) / delta2)
samp <- mvrnorm(5000, mu_gi, lambda_gi)
mn <- apply(samp, 2, mean)
std <- apply(samp, 2, sd)
tv <- mn / std
pv1 <- apply(samp, 2, function(x) sum(x > 0) / dim(samp)[1])
pv2 <- apply(samp, 2, function(x) sum(x < 0) / dim(samp)[1])
gamma[i,] <- mn
pval1[i,] <- pv1
pval2[i,] <- pv2
}