-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathpcadapt_PCA_selective_sweep_script.R
104 lines (76 loc) · 2.72 KB
/
pcadapt_PCA_selective_sweep_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Author: Dr.Mohsin Ali
# install.packages("pcadapt", dependencies = T)
library(pcadapt)
library(qvalue)
# pcadapt has been developed to detect genetic markers involved in biological adaptation. pcadapt provides statistical tools for outlier detection based on Principal Component Analysis (PCA).
# pcadapt package can perform genome scans for selection based on individual genotype data
myVCF <- read.pcadapt(
"Repeat_Het0.2_494GBS_TaxaOrdered_Maf0.05_Miss0_QCout_WP.recode.vcf",
type = "vcf",
type.out = c("bed", "matrix"),
allele.sep = c("/", "|")
)
# why pcadapt? => pcadapt is robust to admixture and does not assume prior knowledge of population structure.
# pcadapt performs proceeds in two steps: 1) pca is performed on centered and scaled genotypic data;
# 2) second step involved computation of test statistics and p-vales on the correltions between SNPs and the first "k" PCs.
# note: an outlier test pcadapt (Luu et al., 2017) was based on allele frequency differentiation
# Luu, K., Bazin, E., & Blum, M. G. (2017). pcadapt: an R package to perform genome scans for selection based on principal component analysis. Molecular ecology resources, 17(1), 67-77.
#PCA only
myVCF_PCA_only <- pcadapt(
myVCF,
K = 10, # number of PCs
method = "mahalanobis", #mahalanobis (default): the robust Mahalanobis distance is computed for each genetic marker using a robust estimate of both mean and covariance matrix between the K vectors of z-scores.
min.maf = 0.05,
ploidy = 2,
LD.clumping = NULL,
pca.only = FALSE,
tol = 1e-04
)
PCA_Var_out<- myVCF_PCA_only$singular.values
print(PCA_Var_out)*100
barplot(PCA_Var_out)
# data visualization
plot(myVCF_PCA_only, option = "screeplot")
plot(myVCF_PCA_only , option = "manhattan")
plot(myVCF_PCA_only , option = "qqplot")
plot(myVCF_PCA_only , option = "stat.distribution")
Popinfo <- read.delim("POPINFO.txt")
#2D pca plot
plot(
myVCF_PCA_only,
option = "scores",
pch = 16,
pop = Popinfo$Group..Full.number.,
xlab = "PCA1",
ylab = "PCA2"
)
#pairwise PCs plot with grouping
pairs(
myVCF_PCA_only$scores,
col = c(Popinfo$Color),
pch=16,
labels = c(
"PCA1",
"PCA2",
"PCA3",
"PCA4",
"PCA5",
"PCA6",
"PCA7",
"PCA8",
"PCA9",
"PCA10"
)
)
plot(myVCF_PCA_only, option = "screeplot")
plot(myVCF_PCA_only, option = "scores")
plot(myVCF_PCA_only , option = "manhattan")
qval <- qvalue(myVCF_PCA_only$pvalues)$qvalues
alpha <- 0.1
outliers_pcadapt <- which(qval < alpha)
print(outliers_pcadapt)
length(outliers_pcadapt) # 1428 outliers
alpha <- 0.05 # use of a more stringent threshold to detect outliers
outliers <- which(qval < alpha)
print(outliers)
length(outliers_pcadapt) # 1428 outliers