-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathLDpred.py
executable file
·279 lines (248 loc) · 16.6 KB
/
LDpred.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
#!/usr/bin/env python
import argparse
from ldpred import coord_genotypes
from ldpred import LDpred_gibbs
from ldpred import LDpred_inf
from ldpred import LD_pruning_thres
from ldpred import validate
from ldpred import ld
import sys
import textwrap
__version__ = '1.0.0'
parser = argparse.ArgumentParser(prog='LDpred',
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent("""\
\033[96mLDpred v. 1.0\033[0m
--------------------------------------------------------------------------------------
Thank you for using\033[1m LDpred\033[0m!
Typicial workflow:
1. Use\033[1m LDpred coord\033[0m to parse summary statistics and genotypes and coordinate data.
See LDpred coord --help for further usage description and options.
2. Use\033[1m LDpred ldfile\033[0m to obtain cached local ld files and chr level summary information..
See LDpred ldfile --help for further usage description and options.
3. Use\033[1m LDpred gibbs|inf|p+t\033[0m to obtain SNP weights for polygenic scoring using one or
more methods. See LDpred gibbs|inf|p+t --help for further usage description and options.
4. Use\033[1m LDpred score\033[0m to calculate polygenic scores using SNP weights from previous
step. See LDpred score --help for further usage description and options.
2019 (c) Bjarni J Vilhjalmsson: bjarni.vilhjalmsson@gmail.com
"""))
subparsers = parser.add_subparsers(help='Select ldpred action among the following options: ', dest='ldpred_action')
parser_coord = subparsers.add_parser('coord', help='Parse and coordinate summary statistics and genotypes.')
parser_gibbs = subparsers.add_parser('gibbs', help='Estimate LDpred (Gibbs sampler) SNP weights. (Requires a coordinated dataset.)')
parser_inf = subparsers.add_parser('inf', help='Estimate LDpred-inf SNP weights. (Requires a coordinated dataset.)')
parser_pt = subparsers.add_parser('p+t', help='Obtain pruning+thresholding SNP weights. (Requires a coordinated dataset.)')
parser_score = subparsers.add_parser('score', help='Calculate polygenic scores using given SNP weights.')
# Wallace
parser_ldfile = subparsers.add_parser('ldfile', help='Get cached local ld files and chr level summary information.')
# -end wallace
#General arguments
parser.add_argument('--debug', default=False, action='store_true',
help="Activate debugging mode (more verbose)")
#coord arguments
parser_coord.add_argument('--gf', type=str, required=True,
help='LD Reference Genotype File. '
'Should be a (full path) filename prefix to a standard PLINK bed file (without .bed). '
'Make sure that the fam and bim files with same names are in the same directory. ')
parser_coord.add_argument('--ssf', type=str, required=True,
help='Summary Statistic File. '
'Filename for a text file with the GWAS summary statistics')
parser_coord.add_argument('--N', type=int, default=None,
help='Number of Individuals in Summary Statistic File. Required for most summary '
'statistics formats.')
parser_coord.add_argument('--out', type=str, required=True,
help='Output Prefix')
parser_coord.add_argument('--vbim', type=str, default=None,
help='Validation SNP file. '
'This is a PLINK BIM file which can be used to filter the set of SNPs down '
'to the set of validation SNPs. To maximize accuracy, we recommend calculating LDpred '
'weights for the subset of SNPs that are used to calculate the risk scores in the '
'target (validation) sample.')
parser_coord.add_argument('--vgf', type=str, default=None,
help='Validation genotype file. '
'This is a PLINK BIM file which can be used to filter the set of SNPs down to the '
'set of validation SNPs. To maximize accuracy, we recommend calculating LDpred '
'weights for the subset of SNPs that are used to calculate the risk scores in the '
'target (validation) sample.')
parser_coord.add_argument('--ilist', type=str,
help='List of individuals to include in the analysis. ', default=None)
parser_coord.add_argument('--skip-coordination', default=False, action='store_true',
help="Assumes that the alleles have already been coordinated between LD reference, "
"validation samples, and the summary statistics files")
parser_coord.add_argument('--beta', default=False, action='store_true',
help='Assumes the summary statistics are BETA (linear regression) instead of OR (logistic '
'regression)')
parser_coord.add_argument('--maf', type=float, default=0.01,
help='MAF filtering threshold. Set to 0 to disable MAF filtering.')
parser_coord.add_argument('--ssf-format', type=str, default="CUSTOM", choices={'CUSTOM','STANDARD','GIANT', 'PGC'},
help='This is the format type of the summary statistics file. '
'By default the CUSTOM format requires the user to specify the file format using additional '
'arguments.')
parser_coord.add_argument('--rs', type=str, default="SNP",
help="Column header of SNP ID")
parser_coord.add_argument('--A1', type=str, default="A1",
help="Column header containing the effective allele. "
"There isn't any standardized label for the effective allele, "
"therefore extra care must be taken to ensure the correct label is provided, "
"otherwise, the effect will be flipped.")
parser_coord.add_argument('--A2', type=str, default="A2",
help="Column header containing non-effective allele.")
parser_coord.add_argument('--pos', type=str, default="BP",
help="Column header containing the coordinate of SNPs.")
parser_coord.add_argument('--info', type=str, default="INFO",
help="Column header containing the INFO score.")
parser_coord.add_argument('--chr', type=str, default="CHR",
help="Column header containing the chromosome information.")
parser_coord.add_argument('--reffreq', type=str, default="MAF",
help="Column header containing the reference MAF")
parser_coord.add_argument('--pval', type=str, default="P",
help="Column header containing the P-value information")
parser_coord.add_argument('--eff', type=str, default="OR",
help="Column header containing effect size information")
parser_coord.add_argument('--ncol', type=str, default="N",
help="Column header containing sample size information")
# parser_coord.add_argument('--gmdir', type=str,
# help='The directory of genetic map.', default=None)
#gibbs arguments
parser_gibbs.add_argument('--cf', type=str, required=True,
help='Coordinated file (generated using ldpred coord). '
'Should be a (full path) filename. ')
parser_gibbs.add_argument('--ldr', type=int, required=True,
help='LD radius. An integer number which denotes the number of SNPs on each side '
'of the focal SNP for which LD should be adjusted. A value corresponding M/3000, '
'where M is the number of SNPs in the genome is recommended')
parser_gibbs.add_argument('--ldf', type=str, required=True,
help='LD file (prefix). A path and filename prefix for the LD file. If it does not '
'exist, it will be generated. This can take up to several hours, depending on '
'LD radius used.')
parser_gibbs.add_argument('--out', type=str, required=True,
help='Output Prefix for SNP weights')
parser_gibbs.add_argument('--f', default=[1,0.3,0.1,0.03,0.01,0.003,0.001], nargs='+', type=float,
help="Fraction of causal variants used in the Gibbs sampler")
parser_gibbs.add_argument('--N', type=int, default=100000, required=True,
help='Number of individuals on which the summary statistics are assumed to be based on.')
parser_gibbs.add_argument('--n-iter', type=int, default=60,
help='The number of iterations used by the Gibbs sampler. The default is 60.')
parser_gibbs.add_argument('--n-burn-in', type=int, default=5,
help='The number of burn-in iterations used by the Gibbs sampler. The default is 5.')
parser_gibbs.add_argument('--h2', type=float, default=None,
help='The heritability assumed by LDpred. By default it estimates the heritability from'
' the GWAS summary statistics using LD score regression (Bulik-Sullivan et al., Nat Genet 2015).')
parser_gibbs.add_argument('--scipy', default=False, action='store_true',
help='Using scipy to compute the (Moore-Penrose) pseudo-inverse matrix, default numpy.')
# parser_gibbs.add_argument('--gm-ldr', type=float, default=None,
# help='If this option is set, then a genetic map will be used to calculate LD-radius. '
# 'A value around 1 is arguably reasonable.')
#inf arguments
parser_inf.add_argument('--cf', type=str, required=True,
help='Coordinated file (generated using ldpred coord). '
'Should be a (full path) filename. ')
parser_inf.add_argument('--ldr', type=int, required=True,
help='LD radius. An integer number which denotes the number of SNPs on each side '
'of the focal SNP for which LD should be adjusted. A value corresponding M/3000, '
'where M is the number of SNPs in the genome is recommended')
parser_inf.add_argument('--ldf', type=str, required=True,
help='LD file (prefix). A path and filename prefix for the LD file. If it does not '
'exist, it will be generated. This can take up to several hours, depending on '
'LD radius used.')
parser_inf.add_argument('--out', type=str, required=True,
help='Output Prefix for SNP weights')
parser_inf.add_argument('--N', type=int, default=100000, required=True,
help='Number of individuals on which the summary statistics are assumed to be based on.')
parser_inf.add_argument('--h2', type=float, default=None,
help='The heritability assumed by LDpred. By default it estimates the heritability from'
' the GWAS summary statistics using LD score regression (Bulik-Sullivan et al., Nat Genet 2015).')
# parser_inf.add_argument('--gm-ldr', type=float, default=None,
# help='If this option is set, then a genetic map will be used to calculate LD-radius. '
# 'A value around 1 is arguably reasonable.')
parser_pt.add_argument('--cf', type=str, required=True,
help='Coordinated file (generated using ldpred coord). '
'Should be a (full path) filename. ')
parser_pt.add_argument('--ldr', type=int, required=True,
help='LD radius. An integer number which denotes the number of SNPs on each side '
'of the focal SNP for which LD should be adjusted. A value corresponding M/3000, '
'where M is the number of SNPs in the genome is recommended')
parser_pt.add_argument('--out', type=str, required=True,
help='Output Prefix for SNP weights')
parser_pt.add_argument('--p', default=[1,0.3,0.1,0.03,0.01,0.003,0.001,3*1E-4,
1E-4,3*1E-5,1E-5,1E-6,1E-7,1E-8], nargs='+', type=float,
help="P value thresholds")
parser_pt.add_argument('--r2', default=[0.2], nargs='+', type=float,
help='LD R2 thresholds. The maximum LD squared correlation allowed between any SNPs '
'within the given LD-radius. Default max R2 value is set to 0.2')
# parser_pt.add_argument('--gm-ldr', type=float, default=None,
# help='If this option is set, then a genetic map will be used to calculate LD-radius. '
# 'A value around 1 is arguably reasonable.')
# Wallace
parser_ldfile.add_argument('--cf', type=str, required=True,
help='Coordinated file (generated using ldpred coord). '
'Should be a (full path) filename. ')
parser_ldfile.add_argument('--ldr', type=int, required=True,
help='LD radius. An integer number which denotes the number of SNPs on each side '
'of the focal SNP for which LD should be adjusted. A value corresponding M/3000, '
'where M is the number of SNPs in the genome is recommended')
parser_ldfile.add_argument('--ldf', type=str, required=True,
help='LD file (prefix). A path and filename prefix for the LD file.')
# parser_ldfile.add_argument('--out', type=str, required=True,
# help='Output Prefix for SNP weights')
# - Wallace: it's not necessary for N in this step.
# parser_ldfile.add_argument('--N', type=int, default=100000, required=True,
# help='Number of individuals on which the summary statistics are assumed to be based on.')
parser_ldfile.add_argument('--h2', type=float, default=None,
help='The heritability assumed by LDpred. By default it estimates the heritability from'
' the GWAS summary statistics using LD score regression (Bulik-Sullivan et al., Nat Genet 2015).')
#- end Wallace.
parser_score.add_argument('--gf', type=str, default=None,
help='Validation genotype file. '
'PLINK formatted genotypes for which we want to calculate risk scores.')
parser_score.add_argument('--rf', type=str, required=True,
help='SNP weights file (prefix used for output in preceding step), e.g. LDpred SNP weights.')
parser_score.add_argument('--out', type=str, required=True,
help='The prefix of risk score output file.')
parser_score.add_argument('--pf', type=str, default=None,
help='A file with individual IDs and phenotypes.')
parser_score.add_argument('--pf-format', type=str, default='STANDARD', choices = {'STANDARD','FAM'},
help='Phenotype file format. Two formats are supported a (PLINK) FAM format, and '
'STANDARD format (default), which is a whitespace/tab delimited file with two '
'columns IID and PHEN.')
parser_score.add_argument('--rf-format', type=str, default='ANY', choices = {'ANY','LDPRED', 'P+T'},
help='The format to expect the results to be in.')
parser_score.add_argument('--cov-file', type=str, default=None,
help='Covariate file, format is whitespace-delimted file with columns IID, COV1, COV2, etc.')
parser_score.add_argument('--pcs-file', type=str, default=None,
help='PCs file, format is whitespace-delimted file with columns FID, IID, PC1, PC2, etc.')
parser_score.add_argument('--split-by-chrom', default=False, action='store_true',
help='')
parser_score.add_argument('--f', default=[1,0.3,0.1,0.03,0.01,0.003,0.001], nargs='+', type=float,
help="Fraction of causal variants used in the Gibbs sampler")
parser_score.add_argument('--p', default=[1,0.3,0.1,0.03,0.01,0.003,0.001,3*1E-4,
1E-4,3*1E-5,1E-5,1E-6,1E-7,1E-8], nargs='+', type=float,
help="P value thresholds (P+T)")
parser_score.add_argument('--r2', default=[1,0.2], nargs='+', type=float,
help="LD R2 thresholds (P+T)")
def main():
if len(sys.argv)==1:
print ('ERROR: No options provided.\n')
parser.print_help(sys.stderr)
sys.exit(1)
parameters = parser.parse_args()
p_dict= vars(parameters)
if p_dict['debug']:
print ('Parsed parameters:')
print(p_dict)
action = p_dict['ldpred_action']
if action=='coord':
coord_genotypes.main(p_dict)
elif action=='gibbs':
LDpred_gibbs.main(p_dict)
elif action=='inf':
LDpred_inf.main(p_dict)
elif action=='p+t':
LD_pruning_thres.main(p_dict)
elif action=='score':
validate.main(p_dict)
elif action=='ldfile':
ld.get_ld_dict(p_dict['cf'], p_dict['ldf'], p_dict['ldr'],wallaceld=True)
elif action=='all':
pass
if __name__ == '__main__':
main()