-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_ncbi_assembly_references.qsub
171 lines (154 loc) · 5.61 KB
/
make_ncbi_assembly_references.qsub
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/bin/bash -l
#$ -S /bin/bash
#$ -cwd
#$ -j y
#$ -m eas
#$ -N make_ncbi_assembly_references
# Build reference FASTA files and indexes for a given NCBI genome assembly
# Adam Gower, based on script by Josh Campbell, originally derived from:
# https://github.com/infphilo/hisat2/tree/master/scripts/make_hg19.sh
# Parse command-line arguments
eval set -- "$(
getopt --options=a:n:o: \
--longoptions=assembly:,ncbi:,output-path: \
--name "$0" -- "$@"
)"
while true
do
case "$1" in
-a|--assembly)
assembly="$2"
shift 2 ;;
-n|--ncbi)
ncbi_accession="$2"
shift 2 ;;
-o|--output-path)
output_path="$(readlink --canonicalize "$2")"
shift 2 ;;
--)
shift
break ;;
*)
echo "Internal error"
exit 1 ;;
esac
done
if [[ ${assembly} == "" || ${ncbi_accession} == "" ]]
then
echo "Usage:"
echo " qsub [qsub flags] make_ncbi_assembly_references.qsub [options]"
echo " -a|--assembly [assembly name]"
echo " -n|--ncbi [NCBI accession]"
echo "Options:"
echo -n " -a, --assembly "
echo "genome assembly name (e.g., 'A_J_v1', 'MesAur1.0')"
echo -n " -r, --ncbi "
echo "NCBI accession (e.g., 'GCA_001624215.1', 'GCF_000349665.1')"
echo -n " -o, --output-path "
echo -n "Path where output will be written "
echo "(will be created if it does not exist)"
echo " (Default: ./[assembly name]/)"
else
# If output path is not specified, a directory named for the assembly
# will be created within the current working directory, and output will
# be written there
[[ "${output_path}" == "" ]] && output_path="$(pwd)/${assembly}"
# Load and list modules
module load bwa
module load picard
module load samtools
module list
# Define URLs
ucsc_base_url="https://hgdownload.soe.ucsc.edu"
assembly_url="${ucsc_base_url}/hubs"
assembly_url="${assembly_url}/${ncbi_accession:0:3}"
assembly_url="${assembly_url}/${ncbi_accession:4:3}"
assembly_url="${assembly_url}/${ncbi_accession:7:3}"
assembly_url="${assembly_url}/${ncbi_accession:10:3}"
assembly_url="${assembly_url}/${ncbi_accession}"
app_url="${ucsc_base_url}/admin/exe/linux.x86_64"
# Define filenames
twobit_file="${ncbi_accession}.2bit"
chrom_sizes_file="${ncbi_accession}.chrom.sizes"
seqList_file="${ncbi_accession}.seqList"
# Define regular expressions
# Note: hg19 includes alternative haplotypes in files named chr*_hap*.fa
# hg38 includes alternative haplotypes in files named chr*_alt.fa
declare -A regexes=(
["random"]="^chr(.+_.+_random|Un_.+)$"
["althap"]="^chr.+_(alt|hap.+)$"
)
# Display the parsed arguments
echo "Genome assembly name: ${assembly}"
echo "NCBI accession: ${ncbi_accession}"
echo "FASTA file and indexes will be written to: ${output_path}/"
# Prepare for download
cd $TMPDIR/ || exit
echo "Retrieving files from ${assembly_url}/ to: $TMPDIR"
# Try retrieving .2bit file from UCSC to scratch space;
# if file could not be retrieved due to fatal error, terminate
wget ${assembly_url}/${twobit_file}
if [[ ! -e ${twobit_file} ]]
then
echo "File '${twobit_file}' could not be retrieved; terminating."
exit
fi
# Retrieve UCSC apps and make them executable
for app in twoBitInfo twoBitToFa
do
wget ${app_url}/${app}
chmod +x ${app}
done
# Create the output path if it does not already exist
[ ! -d "${output_path}" ] && mkdir --verbose -p "${output_path}"/
# Ensure directory is not world-readable yet (to keep others out until ready)
chmod 2700 "${output_path}"/
# Generate chrom.sizes file from .2bit file
./twoBitInfo "${twobit_file}" "${chrom_sizes_file}"
# Iterate over each subset of contigs,
# adding them sequentially (if they are included) to a new FASTA reference
for chrom_subset in base random althap
do
if [[ ${chrom_subset} == "base" ]]
then
# Initialize contig set name
chrom_set_name="${chrom_subset}"
# Extract list of "base" contigs to text file
# (i.e., those that do _not_ match any of the regexes)
readarray -t contigs < <(
cut -f1 "${chrom_sizes_file}" | \
grep -E -v -f <(printf "%s\n" "${regexes[@]}")
)
printf "%s\n" ${contigs[@]} > "${seqList_file}"
else
# Get array of contigs matching the regex for the current subset
readarray -t contigs < <(
cut -f1 "${chrom_sizes_file}" | grep -E "${regexes[${chrom_subset}]}"
)
# If any contigs match the regex, proceed
if [[ ${#contigs[@]} -gt 0 ]]
then
# Append subset name to contig set name
chrom_set_name="${chrom_set_name}_${chrom_subset}"
# Append any contigs that match the regex to the text file
printf "%s\n" ${contigs[@]} >> "${seqList_file}"
else
break
fi
fi
# If the for-loop was not exited, create a directory named after contig set
mkdir --verbose --mode=2700 "${output_path}"/${chrom_set_name}/
# Generate a FASTA file from the .2bit file
fa_file="${output_path}"/${chrom_set_name}/${assembly}.fa
./twoBitToFa -seqList="${seqList_file}" "${twobit_file}" "${fa_file}"
# Make indices and sequence dictionary from the FASTA file
bwa index -a bwtsw "${fa_file}"
samtools faidx "${fa_file}"
# Note: this step calls a wrapper script named 'picard' that calls Java
# with the 'picard.jar' file and 2GB maximum heap size
picard CreateSequenceDictionary \
REFERENCE="${fa_file}" OUTPUT="${fa_file%.fa}.dict"
done
# Make all output files and directories world-readable and read-only
chmod -Rc ugo-w,ugo+rX "${output_path}"/
fi