-
Notifications
You must be signed in to change notification settings - Fork 0
/
WGCNA Script.R
308 lines (246 loc) · 12.2 KB
/
WGCNA Script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
#Code derived from Prof.Steve Hovarth's brilliant tutorials found here - https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html#
rm(list = ls(all = TRUE))
getwd()
R.Version()
.libPaths()
install.packages("dynamicTreeCut")
install.packages("fastcluster")
install.packages("WGCNA")
BiocManager::install("preprocessCore")
BiocManager::install("impute")
library(WGCNA)
#WGCNA package setting#
##1a##
#Removing auxiliary data and transposing the expression data for further analysis#
getwd()
rna_norm_counts <- read.csv("~/Bioinformatics MSc Resources/Module5_ComputationalBioforComplexSystems/rna_norm_counts.csv")
View(rna_norm_counts)
options(stringsAsFactors = FALSE);
datExpr0 <- as.data.frame(t(rna_norm_counts[, -c(1)]));
names(datExpr0) <- rna_norm_counts$X;
rownames(datExpr0) <- names(rna_norm_counts)[-c(1)];
##1b##
#Checking data for excessive missing values and ID of outlier microarray samples#
BiocManager::install("GO.db")
library(WGCNA)
#Check for genes and samples with too many missing values#
getAnywhere(goodSamplesGenes)
gsg <- goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK
#The values returned is TRUE, therefore all genes have passed the cut - none need to be removed#
#Clustering of samples to detect any obvious outliers#
sampleTree = hclust(dist(datExpr0), method = "average");
sizeGrWindow(24,20)
par(cex = 0.6);
par(mar = c(0,5,2.4,2))
plot(sampleTree, main = "Clustering dendrogram of samples based on their Euclidean distance", sub="", xlab="", cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)
abline(h = 0.5e+06, col = "red");
clust = cutreeStatic(sampleTree, cutHeight = 0.5e+06, minSize = 0)
table(clust)
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
head(datExpr)
##1c##
#Loading in our sample data sheet to try and relate that to gene expression#
sample_sheet <- read.csv("~/Bioinformatics MSc Resources/Module5_ComputationalBioforComplexSystems/sample_sheet.csv")
str(sample_sheet)
sample_sheet = sample_sheet[c(1:2)]
str(sample_sheet)
RiverSamples <- rownames(datExpr);
traitRows <- match(RiverSamples, sample_sheet$SampleID);
head(traitRows)
datsamples <- sample_sheet[traitRows,-1];
datsamples <- as.matrix(datsamples)
rownames(datsamples) <- sample_sheet[traitRows, 1];
collectGarbage();
#The expression data is in the 'datexpr' variable, and the corresponding sample traits in the variable 'datSamples'.#
#Before continuing to network construction and module detection, we visualise how the sample features relate to the sample dendrogram#
#Re-cluster samples#
#editing datsamples matrix to remove all characters so it's fully numeric#
write.csv(datsamples,"datsamples.csv")
datsamples <- read.csv("~/Bioinformatics MSc Resources/Module5_ComputationalBioforComplexSystems/datsamples.csv")
sampleTree2 = hclust(dist(datExpr), method = "average")
typeof(datsamples)
traitColors = numbers2colors(datsamples, signed = FALSE);
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datsamples),
main = "Sample dendrogram and trait heatmap")
save(datExpr, datsamples, file="Part1Jan.Rdata")
##2 - Automatic construction of the gene network and ID of modules#
#2.a.1 Choosing the soft-thresholding power: analysis of network topology#
options(stringsAsFactors = FALSE);
lnames = load(file = "Part1Jan.RData");
lnames
print(datsamples)
#Choose a set of soft-thresholding powers#
powers = c(c(1:10), seq(from = 12, to=20, by=2))
#Call the network topology analysis function#
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
#Plot the results#
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
abline(h=0.90,col="red")
#Mean connectivity as a function of the soft-thresholding power#
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
##2.a.2##
#One-step network construction and module detection#
library(WGCNA)
net = blockwiseModules(datExpr, power = 4,maxBlockSize = 10000,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "RNA_normcounts_Jan",
verbose = 3)
table(net$colors)
net$dendrograms
sizeGrWindow(12, 9)
mergedColors = labels2colors(net$colors)
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];
save(MEs, moduleLabels, moduleColors, geneTree,
file = "Part2Jan.RData")
##3 - Relating modules to external clinical traits##
#3.a Quantifying module-trait associations#
#Here the aim is to ID modules that have significant association with different traits, i.e. river location.#
load(file = "Part2Jan.RData")
# Defining the numbers of genes and samples#
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# Recalculating MEs with colour labels#
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datsamples
, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
sizeGrWindow(10,6)
#Constructing a figure to assist in interpreting the large number of modules and traits#
#Each association is colour coded with a correlation value#
# Display correlations and their p-values#
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values via a heatmap plot#
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datsamples),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
#3.b Gene relationship to important modules: Gene significance and Module membership#
#We use this step to see if there's correlation between different sample extraction sites and module membership#
#In the following code, associations of individual genes with location are define by Gene significance#
#Also, for each module, the quantitative measure of module membership MM is the correlation of the module eigengene and the gene expression profile.#
#All this allows us to quantify the similarity of all genes to every module.#
# Define variable site containing the site column of datsamples
site = as.data.frame(datsamples$Site);
names(site) = "site"
# names (colors) of the modules
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");
geneTraitSignificance = as.data.frame(cor(datExpr, site, use = "p"));
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
names(geneTraitSignificance) = paste("GS.", names(site), sep="");
names(GSPvalue) = paste("p.GS.", names(site), sep="");
#3.c Intramodular analysis: identifying genes with high GS and MM#
module = "pink"
column = match(module, modNames);
moduleGenes = moduleColors==module;
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for extraction location(site(",
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
#correlation is very high at 0.63!Genes are significantly associated with this trait#
##4. Visualising networks in R##
##4.a Visualising the gene network##
#One way to visualize a weighted network is to plot its heatmap.#
#Each row and column of the heatmap correspond to a single gene.#
#The heatmap shows adjacencies or topological overlaps, with light colors denoting low adjacency (overlap) and darker colors higher adjacency (overlap).#
# The gene dendrograms and module colors are plotted along the top and left side of the heatmap.#
# Calculate topological overlap anew: this could be done more efficiently by saving the TOM
# calculated during module detection, but let us do it again here.
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);
# Transform dissTOM with a power to make moderately strong connections more visible in the heatmap
plotTOM = dissTOM^7;
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA;
# Call the plot function
sizeGrWindow(9,9)
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
##4.b Visualising the eigengene network##
#Visualising the eigengene network representing relationships among modules and between site modules.#
#The following dendrogram and heatmap ID groups of correlated eigengenes - meta-modules.#
# Recalculate module eigengenes
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
# Isolate weight from the clinical traits
site = as.data.frame(datsamples$site);
names(site) = "site"
# Add the site to existing module eigengenes
MET = orderMEs(cbind(MEs, site))
#Plot the relationships among the eigengenes and the trait#
sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle = 90)
#We split the dendrogram and heatmap plots using the following code#
# Plot the dendrogram
sizeGrWindow(6,6);
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
plotHeatmaps = FALSE)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
plotDendrograms = FALSE, xLabelsAngle = 90)
##5 Export network data to Cytoscape##
#Cytoscape allows one to input an edge file and a node file#
# Recalculate topological overlap if needed
TOM = TOMsimilarityFromExpr(datExpr, power = 6);
# Select modules
modules = c("brown", "red");
# Select module probes
probes = names(datExpr)
inModule = is.finite(match(moduleColors, modules));
modProbes = probes[inModule];
modGenes = rna_norm_counts$X[match(modProbes, annot$substanceBXH)];
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
# Export the network into edge and node list files Cytoscape can read
cyt = exportNetworkToCytoscape(modTOM,
edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = modProbes,
altNodeNames = modGenes,
nodeAttr = moduleColors[inModule]);