forked from trgaleev/AlleleSeq2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
PIPELINE_aggregate_over_genomic_regions.mk
executable file
·122 lines (92 loc) · 7.04 KB
/
PIPELINE_aggregate_over_genomic_regions.mk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#######################
### PIPELINE PARAMS ###
#######################
### system / executables ##
PL := ~/bin/AlleleSeq2
SAMTOOLS := ~/bin/samtools-1.3.1/samtools
BEDTOOLS_intersectBed := ~/bin/bedtools2/bin/intersectBed
### input files / paths ##
REGIONS_FILE :=
COUNTS_FILE :=
PREFIX :=
UNIQ_ALN_FILES :=
MMAP_ALN_FILES :=
# stats / counts params:
FDR_SIMS := 500
FDR_CUTOFF := 0.05
Cntthresh_tot := 6
Cntthresh_min := 0
#AMB_MODE := adjust # 'adjust' or allelic ratio diff threshold for filtering
# only 'adjust' mode only for now: add the 'weaker allele' base counts from multi-mapping reads, if any:
# all of them or until balanced with the stonger to make sure the imbalance is not caused by the multimapping reads
KEEP_CHR := # empty or 'X'
######################
### PIPELINE STEPS ###
######################
empty_string:=
$(info REGIONS_FILE: $(REGIONS_FILE))
$(info COUNTS_FILE: $(COUNTS_FILE))
$(info PREFIX: $(PREFIX))
$(info UNIQ_ALN_FILES: $(UNIQ_ALN_FILES))
$(info MMAP_ALN_FILES: $(MMAP_ALN_FILES))
$(info $(empty_string))
######################
### PIPELINE START ###
######################
all:$(PREFIX)_hap1_allele_ratios.region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.pdf $(PREFIX)_hap1_allele_ratios.region_filtered_counts.pdf $(PREFIX)_hap1_allele_ratios.region_raw_counts.pdf $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
# todo: this seems to work, but the way it deals with paths, filenames, etc needs to be cleaned up
# currently, keeping alleleDB betabinomial scripts with as little modifications as possible
# fix columns and all the workaround with the .tmp files
$(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
awk '{print $$1"\t"$$2"\t"$$3"\t"$$2"\t"$$3"\t0\t0\t"$$2+$$3"\t"$$4"\t1.0"}' $< | \
sed 's/hap1_count\thap2_count\t0\t0\t0/cA\tcC\tcG\tcT\tsum_ref_n_alt_cnts/' | \
sed 's/hap1_allele_ratio\t1.0/ref_allele_ratio\tcnv/' > $(PREFIX)_region_filtered_counts_min.$(Cntthresh_tot)-tot.$(Cntthresh_min)-min.tsv.tmp
Rscript $(PL)/alleledb_calcOverdispersion.R \
$(PREFIX)_region_filtered_counts_min.$(Cntthresh_tot)-tot.$(Cntthresh_min)-min.tsv.tmp \
$(PREFIX)_region_FDR-$(FDR_CUTOFF).betabinomial.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min
Rscript $(PL)/alleledb_alleleseqBetabinomial.R \
$(PREFIX)_region_filtered_counts_min.$(Cntthresh_tot)-tot.$(Cntthresh_min)-min.tsv.tmp \
$(PREFIX)_region_FDR-$(FDR_CUTOFF).betabinomial.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min \
$(PREFIX)_counts.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv \
$(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tmp \
$(PREFIX)_region_FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt \
$(FDR_CUTOFF)
awk '{print $$1"\t"$$2"\t"$$3"\t"$$8"\t"$$9"\t"$$11}' \
$(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tmp | \
sed 's/sum_ref_n_alt_cnts/hap1_allele_ratio/' > $@
#rm $(PREFIX)_region_filtered_counts_min.$(Cntthresh_tot)-tot.$(Cntthresh_min)-min.tsv.tmp \
# $(PREFIX)_region_FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tmp
$(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
python $(PL)/FalsePos_regions.py $< $(FDR_SIMS) $(FDR_CUTOFF) > $(PREFIX)_region_FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt
cat $< | python $(PL)/filter_regions_by_pval.py $(PREFIX)_region_FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt > $@
# allelic ratio distrs
$(PREFIX)_hap1_allele_ratios.region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.pdf: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
Rscript $(PL)/plot_AllelicRatio_distribution.R $< $(PREFIX) region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt hap1_allele_ratio
# filter based on total counts and min per hap count
# and in non-autosomal chr, optionally keeping X;
$(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_region_filtered_counts.tsv
cat $< | \
python $(PL)/filter_regions_non-autosomal_chr.py $(REGIONS_FILE) $(KEEP_CHR) | \
python $(PL)/filter_regions_by_counts.py $(Cntthresh_tot) $(Cntthresh_min) > $@
# allelic ratio distrs
$(PREFIX)_hap1_allele_ratios.region_filtered_counts.pdf: $(PREFIX)_region_filtered_counts.tsv
Rscript $(PL)/plot_AllelicRatio_distribution.R $< $(PREFIX) region_filtered_counts hap1_allele_ratio
$(PREFIX)_region_filtered_counts.tsv: $(PREFIX)_region_raw_counts_uniq.tsv $(PREFIX)_region_raw_counts_mmap.tsv
cat $< | \
# python $(PL)/filter_regions_w_mmaps.py $(AMB_MODE) \
python $(PL)/filter_regions_w_mmaps.py adjust \
$(PREFIX)_discarded_regions_warn-mmaps.log $(PREFIX)_mmap_reads_over_regions.log $(PREFIX)_region_raw_counts_mmap.tsv > $@
# this can be memory-intensive and fail without error message (empty $(PREFIX)_region_raw_counts_mmap.tsv file)
$(PREFIX)_region_raw_counts_mmap.tsv: hets_regions_hap2.bed
cat hets_regions_hap1.bed hets_regions_hap2.bed | $(BEDTOOLS_intersectBed) -a stdin -b $(MMAP_ALN_FILES) -split -wb -bed | \
python $(PL)/intersect2counts.py mmap hets_regions_hap1.bed > $@
# allelic ratio distrs
$(PREFIX)_hap1_allele_ratios.region_raw_counts.pdf: $(PREFIX)_region_raw_counts_uniq.tsv
Rscript $(PL)/plot_AllelicRatio_distribution.R $< $(PREFIX) region_raw_counts hap1_allele_ratio
# -split -- not counting reads that splice over a het
$(PREFIX)_region_raw_counts_uniq.tsv: hets_regions_hap2.bed
cat hets_regions_hap1.bed hets_regions_hap2.bed | $(BEDTOOLS_intersectBed) -a stdin -b $(UNIQ_ALN_FILES) -split -wb -bed | \
python $(PL)/intersect2counts.py uniq hets_regions_hap1.bed > $@
hets_regions_hap2.bed : $(REGIONS_FILE) $(COUNTS_FILE)
grep -v '^#chr' $(COUNTS_FILE) | awk '{print $$1"\t"$$2-1"\t"$$2}' | $(BEDTOOLS_intersectBed) -a $(REGIONS_FILE) -b stdin | \
python $(PL)/refbed2hapcoords.py $(COUNTS_FILE) hets_regions_hap1.bed hets_regions_hap2.bed