-
Notifications
You must be signed in to change notification settings - Fork 1
/
fastqcparser.py
235 lines (212 loc) · 13.6 KB
/
fastqcparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#!/usr/local/bin python
# Author: Lalitha Viswanathan
# Affiliation: Stanford HealthCare
import sys
import json
from argparse import ArgumentParser
from typing import TextIO, Dict, Any
import numpy as np
import basicstatistics as basestats
import utilities as util
import per_base_seq_quality as pbq
import per_base_GC_Content as pbgccontent
import pertilesequencequality as ptseqqual
import per_sequence_quality_scores as perseqqualscores
import per_base_sequence_content.extractperbaseSequenceContentFromFastQC as perbaseseqcontent
import kmer_content.extractKmerContentlinesFromFastQC as kmerc
import adaptercontent as adaptcont
import per_base_NContent as pbNContent
import per_base_seq_quality as perbaseseqqual
# from pprint import pprint
# import errno
def parse_fastqc_data(file: TextIO) -> object:
try:
with open(file) as filepointer:
while True:
line = filepointer.readline()
if not line: break
if line.startswith('##FastQC'):
resultsdata["fastqcversion"] = line.split()[1]
continue
basicstatistics: dict
results_perbase_sequencecontentA: dict
perbasegccontent: dict
tilingrows: dict
results_perbase_sequencecontentC: dict
Adaptercontentrows: dict
perbasesequencequality: list[str]
basicstatistics, perbasesequencequality, perbasegccontent, tilingrows, \
results_perbase_sequencecontentA, \
results_perbase_sequencecontentT, results_perbase_sequencecontentG, \
results_perbase_sequencecontentC, Ncontentrows, Adaptercontentrows, KMercontentrows, \
perbasesequencecontentA, perbasesequencecontentT, perbasesequencecontentG, \
perbasesequencecontentC, resultsdata = util.initialize()
filepointer: filepointer
####################################################################################
filepointer, resultsdata, basicstatistics = basestats.extractBasicStatisticsLinesFromFastQC(line,
resultsdata,
basicstatistics)
assert (len(basicstatistics) != 0), "Basic Statistics lines are empty"
####################################################################################
filepointer, resultsdata, perbasesequencequality = \
pbq.extractperbasesequencequalityFromFastQC(filepointer, line, resultsdata, perbasesequencequality)
assert (len(perbasesequencequality) != 0), "Per base sequence quality is empty"
####################################################################################
perbasegccontent: list[str]
filepointer, resultsdata, perbasegccontent = \
pbgccontent.extract_perbase_GCContentRowsFromFastQC(filepointer, line, resultsdata,
perbasegccontent)
assert (len(perbasegccontent) != 0), "Per base GC Content is empty"
####################################################################################
filepointer, resultsdata, tilingrows = ptseqqual.extract_pertile_sequence_quality_records_from_FastQC(
filepointer, line, resultsdata,
tilingrows)
assert (len(tilingrows) != 0), "Tiling row entries is empty"
####################################################################################
filepointer, resultsdata, perbasesequencequality = perseqqualscores.perseqqualityscores(filepointer,
line,
resultsdata,
perbasesequencequality)
assert (len(perbasesequencequality) != 0), "Per base sequence quality is empty"
####################################################################################
filepointer, resultsdata, perbasesequencecontentA, \
perbasesequencecontentT, perbasesequencecontentG, \
perbasesequencecontentC = perbaseseqcontent.extract_perbase_sequence_content(filepointer,
line, resultsdata,
perbasesequencecontentA,
perbasesequencecontentT,
perbasesequencecontentG,
perbasesequencecontentC)
assert (len(perbasesequencecontentA) != 0), "Per-base sequence content (A) is empty"
assert (len(perbasesequencecontentT) != 0), "Per-base sequence content (T) is empty"
assert (len(perbasesequencecontentG) != 0), "Per-base sequence content (G) is empty"
assert (len(perbasesequencecontentC) != 0), "Per-base sequence content (C) is empty"
####################################################################################
Ncontentrows: list[str]
assert isinstance(Ncontentrows, list)
filepointer, resultsdata, Ncontentrows = pbNContent.getperbaseNContent(filepointer,
line, resultsdata,
Ncontentrows)
assert (len(Ncontentrows) != 0), "Length of N Content rows is empty"
####################################################################################
AdapterContentrows: list[str]
assert isinstance(AdapterContentrows, list)
filepointer, resultsdata, AdapterContentrows = adaptcont.extractAdapterContentrows(filepointer, line,
resultsdata,
AdapterContentrows)
assert (len(AdapterContentrows) != 0), "Adapter Content rows is empty"
####################################################################################
# Skip line with "Measure Value" labels
nextrow = filepointer.readline()
if nextrow.startswith('#Measure'):
nextrow = filepointer.readline()
while not nextrow.startswith('>>END_MODULE'):
parts = nextrow.rstrip().split('\t')
key = parts[0]
value = parts[1]
basicstatistics[key] = value
nextrow = filepointer.readline()
continue
####################################################################################
KMercontentrows: list[str]
assert isinstance(KMercontentrows, list)
filepointer, resultsdata, KMercontentrows = kmerc.extractKMerContentLines(filepointer,
line,
resultsdata,
KMercontentrows)
assert len(KMercontentrows) != 0, "Length of KMer content rows is empty"
####################################################################################
# calculateaveragesequencequalityoverintervals
assert isinstance(perbasesequencecontentT, dict)
assert isinstance(perbasesequencecontentA, dict)
assert isinstance(perbasesequencecontentG, dict)
assert isinstance(perbasesequencecontentC, dict)
assert isinstance(results_perbase_sequencecontentA, dict)
assert isinstance(results_perbase_sequencecontentT, dict)
assert isinstance(results_perbase_sequencecontentG, dict)
assert isinstance(results_perbase_sequencecontentC, dict)
results_perbase_sequencecontentA, results_perbase_sequencecontentT, \
results_perbase_sequencecontentG, results_perbase_sequencecontentC, \
perbasesequencecontentA, perbasesequencecontentT,
perbasesequencecontentG, perbasesequencecontentC = \
perbaseseqqual.calculateaveragesequencequalityoverintervals(
perbasesequencequality, resultsdata,
perbasesequencecontentA,
perbasesequencecontentT,
perbasesequencecontentG,
perbasesequencecontentC)
####################################################################################
results_perbase_gc_content = pbgccontent.perbasegccontent(resultsdata)
if perbasesequencequality:
try:
del perbasesequencequality['rows']
results_perbase_sequencequality: dict[str, dict] = {
'fastqcperbasesequencequality': perbasesequencequality}
except Exception:
print("Unexpected error:", sys.exc_info()[0])
raise Exception(sys.exc_info()[0])
imagefile: np.uint8 = None # Can be deleted. To be confirmed
except (FileNotFoundError, IOError):
print(" File Not Found")
return (resultsdata, results_perbase_gc_content, results_perbase_sequencecontentC, results_perbase_sequencecontentT,
results_perbase_sequencecontentG, results_perbase_sequencecontentA, results_perbase_sequencequality,
imagefile)
########################################################################
def parse_fastqc_data_wrapper(fastqcfilename: TextIO):
"""
:type fastqcfilename: object
"""
fastqcparserresults: dict[str, dict] = {}
(results_data, results_perbase_gccontent, results_perbase_sequencecontentC, results_perbase_sequencecontentT,
results_perbase_sequencecontentG, results_perbase_sequencecontentA, results_perbase_sequencequality,
imagefiles) = parse_fastqc_data(fastqcfilename)
# fastqcparserresults['allresults'] = results_data
# We are not storing all results_data
try:
if results_perbase_gccontent:
fastqcparserresults['results_perbase_gccontent'] = results_perbase_gccontent
if results_perbase_sequencecontentC:
fastqcparserresults['results_perbase_sequencecontentC'] = results_perbase_sequencecontentC
if results_perbase_sequencecontentT:
fastqcparserresults['results_perbase_sequencecontentT'] = results_perbase_sequencecontentT
if results_perbase_sequencecontentG:
fastqcparserresults['results_perbase_sequencecontentG'] = results_perbase_sequencecontentG
if results_perbase_sequencecontentA:
fastqcparserresults['results_perbase_sequencecontentA'] = results_perbase_sequencecontentA
if results_perbase_sequencequality:
fastqcparserresults['results_perbase_sequencequality'] = results_perbase_sequencequality
except Exception as exceptn:
print("Unexpected error %s" % exceptn)
# No image files either
# fastqcparserresults['imagesfiles'] = None
if not util.is_json(json.dumps(fastqcparserresults)):
raise Exception(sys.exc_info()[0])
return json.dumps(fastqcparserresults)
########################################################################
if __name__ == '__main__':
try:
table_lookup = {
'perbasesqqualityrows': 'fastqc_per_base_sequencing_quality',
'pertileseqqualityrows': 'fastqc_per_tile_seq_quality',
'perseqqualityrows': 'fastqc_per_seq_quality',
'perbaseseqcontentrows': 'fastqc_per_base_seq_content',
'perbaseNcontentrows': 'fastqc_per_base_NContent',
'AdapterContentrows': 'fastqc_adapterContent',
'basicstatisticsdata': 'fastqc_basicstatistics',
'KmerContentrows': 'fastqc_kmercontent'
}
parser = ArgumentParser()
parser.add_argument('fastqcfilename')
parser.add_argument('sampleid')
parser.add_argument('runid')
# parser.add_argument('sqlfilename')
# parser.add_argument('tablename')
args = parser.parse_args()
(results, resultsperbasegccontent, resultsperbasesequencecontentC, resultsperbasesequencecontentT,
resultsperbasesequencecontentG, resultsperbasesequencecontentA, resultsperbasesequencequality,
imagefiles) = parse_fastqc_data(args.fastqcfilename)
# Removed all subsequent writes to file, etc.
except Exception as exception:
print("Error encountered parsing FastQC results %s " % exception)
finally:
print("FastQC parser completed successfully")