-
Notifications
You must be signed in to change notification settings - Fork 8
/
PEAlgnmt.h
516 lines (420 loc) · 17.2 KB
/
PEAlgnmt.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
/*
*
* PEAlgnmt.h
* Soap3(gpu)
*
* Copyright (C) 2011, HKU
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
*/
/////////////////////////////////////////////////////
/*
Each pair-end result is considered to have two legs
which each of them is a position on the reference sequence.
The left leg is the position of the left-most position of
the match (which must be a %strandLeftLeg% strand alignment) ;
the right leg is the position of its mate
(which must be a %strandRightLeg% strand alignment);
| left-leg | right-leg
v v
___|XXXXXXXXXXXXX|_____________|XXXXXXXXXXXXX|_
|<----------- insertion size ---------->|
*/
/////////////////////////////////////////////////////
#ifndef __PE_ALIGNMENT_H__
#define __PE_ALIGNMENT_H__
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include "2bwt-lib/BWT.h"
#include "2bwt-lib/HSP.h"
#include "2bwt-flex/SRACore.h"
#include "HSPAux.h"
#define PE_MAX_BUCKET_SIZE 1024
#define PE_REPORT_ALL 0
#define PE_REPORT_ONE 1
#define PE_ALIGNMENT_COMPLETED 0
#define PE_ALIGNMENT_INITIALISED 1
#define PE_ALIGNMENT_INPUT_ERROR 2
#define PE_ALIGNMENT_FAILED 3
#define PE_NON_INPUT_VALUE 99999999
#define INITIAL_SIZE_SA_LIST_FOR_DP 1048576 // 1M
#define INITIAL_SIZE_OCC_LIST_FOR_DP 1048576 // 1M
#define MAX_SEED_NUMBER_FOR_SINGLE_DP 10
#define INITIAL_SIZE_CHAR_ARRAY 40000 // 40K //1048576 // 1M
#define SIZE_1_M 1048576
// Uncomment the below two sentences to ensure the first read for forward direction only
// and the second read for reverse direction only if paired-end reads are inputted.
// this option is only valid for simple output format and all-best alignment
// #define BGS_FWD_FOR_FIRST_REV_FOR_SECOND
// #define BGS_DISABLE_NEGATIVE_STRAND
// NOTE: Remember to delete the file GPUfunctions.cpp before recompile the codes.
//----
/////////////////////////////////////////////////////
/*
Structure SRAOccurrence / PESRAAlignmentResult (Input)
They are the structures to hold the input to Pair-End alignment.
SRAOccurrence is supposed to hold a list of occurrences.
The function PEMappingOccurrences() takes two lists of SRAOccurrence as input.
PESRAAlignmentResult is supposed to hold a list of SA ranges.
The function PEMapping() takes two lists of PESRAAlignmentResult as input.
*/
/////////////////////////////////////////////////////
/*
// SRAOccurrence is part of SRA now
typedef struct SRAOccurrence
{
unsigned int readID; // read ID
unsigned int ambPosition;
char strand; // (i.e. 1: +ve; 2: -ve)
char mismatchCount;
} SRAOccurrence;
*/
typedef struct PESRAAlignmentResult
{
unsigned int readID; // read ID
unsigned int saIndexLeft;
unsigned int saIndexRight;
unsigned char strand; // (i.e. 1: +ve; 2: -ve)
unsigned char mismatchCount;
} PESRAAlignmentResult;
/////////////////////////////////////////////////////
/*
Structure PEInput (Parameters)
It is the structure to hold the query parameters to Pair-End alignment.
Some of the parameters are compulsory; some of them are optional.
PEInput SHOULD ALWAYS BE constructed by PEInputConstruct() function.
*/
/////////////////////////////////////////////////////
typedef struct PEInput
{
BWT * bwt; //Compulsory
HSP * hsp; //Compulsory
int OutputType; //Compulsory. Valid value being PE_REPORT_*.
int patternLength; //Compulsory
int strandLeftLeg; //Compulsory
int strandRightLeg; //Compulsory
// Compulsory
///////////////////////////////////////////////////
// Either option A or B should be defined and it is compulsory
// Option B takes precedence over option A.
//
// Option A: Insertion size defined by Mean and StdDev.
int insertSizeMean; //Optional.
int insertSizeStdDev; //Optional.
// Option B: Insertion size defined by Lower bound and Upper bound.
int insertLbound; //Optional.
int insertUbound; //Optional.
///////////////////////////////////////////////////
} PEInput;
/////////////////////////////////////////////////////
/*
Structure PEPairs (Output)
It is the structure to hold 1 Pair-End alignment result.
The result being held is w.r.t to the read set source.
The alignment/strand from the first read set is stored in
algnmt_1/strand_1 while those from the second read set is
stored in algnmt_2/strand_2.
*/
/////////////////////////////////////////////////////
typedef struct PEPairs
{
unsigned int algnmt_1;
char strand_1;
char mismatch_1;
unsigned int algnmt_2;
char strand_2;
char mismatch_2;
char totalMismatchCount;
int insertion;
} PEPairs;
/////////////////////////////////////////////////////
/*
Structure PEPairList (Output)
It is the structure to hold a bucket of alignment result(PEPairs).
*/
/////////////////////////////////////////////////////
typedef struct PEPairList
{
// The bucket of pair-end results
PEPairs pairs[PE_MAX_BUCKET_SIZE];
unsigned int pairsCount;
// Link to the next bucket
struct PEPairList * next;
} PEPairList;
/////////////////////////////////////////////////////
/*
Structure PEOutput (Output)
It is the structure to enclose a linked-list of bucket(PEPairList).
*/
/////////////////////////////////////////////////////
typedef struct PEOutput
{
int flag;
struct PEPairList * root;
struct PEPairList * tail;
} PEOutput;
/////////////////////////////////////////////////////
/*
Function SRAEnrichSARanges
This function fills the gap between SOAP3-GPU and the PE Aligner
by enriching each reported SA range with the information of the
strand and mismatchCount of the alignment.
*/
/////////////////////////////////////////////////////
/*
unsigned int SRAEnrichSARanges(BWT * bwt, HSP * hsp,
unsigned int saIndexLeft, unsigned int saIndexRight,
char * strand, int * mismatchCount);
*/
/////////////////////////////////////////////////////
/*
Function PEMapping
This function takes in 2 lists of SA ranges, which
are assumed to be enriched by SRAEnrichSARanges.
This function will perform the following,
1. Initialise output collector
2. Retrieve the occurrences of all SA ranges
3. Allocate enough memory for all occurrences
4. Sort the 2 lists of occurrences
5. Merge the occurrences to generate a list of PE
alignment
Parameters:
peInput - The input parameter for PE
peOutput - The output of the PE alignment, which is
expected to be constructed by the caller.
It is initialised again by PEMapping to allow
reusing of constructed PEOutput.
resultListA,
resultCountA - The first list of SA ranges being
passed into PEMapping.
resultListB,
resultCountB - The second list of SA ranges being
passed into PEMapping.
*/
/////////////////////////////////////////////////////
void PEMapping ( PEInput * peInput, PEOutput * peOutput,
PESRAAlignmentResult * resultListA, unsigned int resultCountA,
PESRAAlignmentResult * resultListB, unsigned int resultCountB );
/////////////////////////////////////////////////////
/*
Function PEMappingOccurrences
This function is basically behaves the same as the above function,
however it takes in 2 lists of Occurrences, instead of two lists of
SA Ranges.
This function will perform the following,
1. Allocate enough memory for sorting all occurrences
2. Sort the 2 lists of occurrences
3. Merge the occurrences to generate a list of PE
alignment
Parameters:
peInput - The input parameter for PE
peOutput - The output of the PE alignment, which is
expected to be constructed by the caller.
It is initialised again by PEMapping to allow
reusing of constructed PEOutput.
occList_1,
occCount_1 - The first list of occurrences
being passed into PEMapping.
occList_2,
occCount_2 - The second list of occurrences
being passed into PEMapping.
*/
/////////////////////////////////////////////////////
void PEMappingOccurrences ( PEInput * peInput, PEOutput * peOutput,
SRAOccurrence * occList_1, unsigned int occCount_1,
SRAOccurrence * occList_2, unsigned int occCount_2 );
// unsigned int PEMappingToDisk(PEInput * peInput, PEPairList * pePairList);
/////////////////////////////////////////////////////
// Constructor and Destructor
/////////////////////////////////////////////////////
PEInput * PEInputConstruct ( BWT * bwt, HSP * hsp );
void PEInputFree ( PEInput * peInput );
PEOutput * PEOutputConstruct ();
void PEOutputFree ( PEOutput * peOutput );
/////////////////////////////////////////////////////
// Utilities
/////////////////////////////////////////////////////
unsigned int PECountPEOutput ( PEOutput * peOutput );
unsigned int PEStatsPEOutput ( PEOutput * peOutput, PEPairs ** optimal, PEPairs ** suboptimal, unsigned int * mismatchStats );
///////////////////////////////////////////////////////////
// Structures for semi-global DP //
///////////////////////////////////////////////////////////
// To hold SOAP3 alignment results of the reads which can be mapped but their mates cannot be mapped
// these reads are required to be processed by semi-global DP
// this array is also used for the input of gap alignment
typedef struct ReadInputForDP
{
unsigned int readNum; // number of reads
PESRAAlignmentResult * sa_list; // the list of SA ranges for all reads
SRAOccurrence * occ_list; // the list of occurrences for all reads
unsigned int saRangeTotalNum; // the number of SA Ranges in sa_list
unsigned int occTotalNum; // the number of occurrences in occ_list
unsigned int saListSize; // the size of sa_list
unsigned int occListSize; // the size of occ_list
unsigned int lastReadID; // the ID (i.e. even id) of the latest pair inputted in the array
} ReadInputForDP;
// To hold a set of arrays of ReadInputForDP
typedef struct ReadInputForDPArrays
{
ReadInputForDP ** inputArrays;
unsigned int numArrays; // number of arrays of ReadInputForDP
} ReadInputForDPArrays;
// To contain the parameters for DP
typedef struct DPParam
{
int cutoffThreshold; // cutoff threshold when performing DP
int maxHitNum; // maximum number of hits allowed for soap3 seeding
int sampleDist; // sample distane of seeding (i.e. sample rate = 1/sampleDist)
int seedLength; // length of each seed for seeding
} DPParam;
typedef struct DPParameters
{
int matchScore; // score for match
int mismatchScore; // score for mismatch
int openGapScore; // score for gap opening
int extendGapScore; // score for gap extension
int numOfCPUThreads; // number of CPU threads used for matrix computation
int numOfCPUForSeeding; // number of CPU threads used for seeding
int softClipLeft; // left
int softClipRight; // right
int tailTrimLen; // the length of tail trimmed for seeding
int singleDPSeedNum; // the number of seeds for single-end DP
int singleDPSeedPos[MAX_SEED_NUMBER_FOR_SINGLE_DP]; // the seed positions for single-end DP
DPParam paramRead[2]; // the specific paramters for the first and the second reads.
// for single-end DP, use the first one.
} DPParameters;
// Construct the structure ReadInputForDP for each CPU thread
ReadInputForDP * constructReadInputForDP ( int cpuNum );
// Release the memory for the structure ReadInputForDP
void freeReadInputForDP ( ReadInputForDP * readInput );
// Reset the structure ReadInputForDP
void resetReadInputForDP ( ReadInputForDP * readInput );
// To add the alignment results of a read to ReadInputForDP
void addToReadInputForDP ( ReadInputForDP * readInput, unsigned int readid, PESRAAlignmentResult * saList, unsigned int saNum,
SRAOccurrence * occList, unsigned int occNum );
//////////////////////////////////////////////////////////
//// The followings structure is for DEFAULT DP ///////
//////////////////////////////////////////////////////////
// To hold one pair-end alignment result
typedef struct AlgnmtDPResult
{
unsigned int readID; // read ID of first read (i.e. even)
char whichFromDP; // 0: first read; 1: second read; 2: none
char * cigarString; // alignment pattern of the hit from DP
int editdist; // edit distance of the hit from DP
int insertSize; // insert size
int num_sameScore;
// alignment result of the first read
unsigned int algnmt_1; // 0xFFFFFFFF if unaligned
char strand_1; // 1: +ve; 2: -ve
int score_1; // # of mismatches or score from DP
// alignment result of the second read
unsigned int algnmt_2; // 0xFFFFFFFF if unaligned
char strand_2; // 1: +ve; 2: -ve
int score_2; // # of mismatches or score from DP
} AlgnmtDPResult;
///////////////////////////////////////////////////////
//// The followings structure is for DEEP DP ///////
///////////////////////////////////////////////////////
// To hold one pair-end alignment result of Deep DP for unaligned pairs
typedef struct DeepDPAlignResult
{
unsigned int readID; // read ID of first read (i.e. even)
int insertSize; // insert size
// alignment result of the first read
unsigned int algnmt_1; // 0xFFFFFFFF if unaligned
char strand_1; // 1: +ve; 2: -ve
int score_1; // score from DP
int editdist_1;
char * cigarString_1; // alignment pattern of the hit from DP
int num_sameScore_1;
// alignment result of the second read
unsigned int algnmt_2; // 0xFFFFFFFF if unaligned
char strand_2; // 1: +ve; 2: -ve
int score_2; // score from DP
int editdist_2;
char * cigarString_2; // alignment pattern of the hit from DP
int num_sameScore_2;
} DeepDPAlignResult;
// To hold one single-end alignment result
typedef struct SingleAlgnmtResult
{
unsigned int readID; // read ID
char * cigarString; // alignment pattern of the hit from DP
// alignment result of the read
unsigned int algnmt; // alignment pos; 0xFFFFFFFF if unaligned
char strand; // 1: +ve; 2: -ve
int score; // score from DP
int editdist;
int num_sameScore;
} SingleAlgnmtResult;
// dynamic-size character array
typedef struct DynamicUint8Array
{
uint8_t * charStr; // the character array
unsigned long long length; // length of the array
unsigned long long size; // available size of the char array
} DynamicUint8Array;
DynamicUint8Array * DynamicUint8ArrayConstruct ();
void DynamicUint8ArrayFree ( DynamicUint8Array * uint8Array );
void DynamicUint8ArrayReset ( DynamicUint8Array * uint8Array );
void appendStringToUint8Array ( DynamicUint8Array * uint8Array, char * charString, int len );
// ================================================
// To hold a set of single-end alignment results
// ================================================
// each algnment
typedef struct Algnmt
{
char * cigarString; // alignment pattern of the hit from DP
// an alignment of the read
unsigned int algnmt; // alignment pos; 0xFFFFFFFF if unaligned
char strand; // 1: +ve; 2: -ve
int score; // score from DP
int editdist;
int num_sameScore;
int isFromDP; // 1: from DP; 0: from SOAP3
} Algnmt;
// read pointer
typedef struct ReadPtr
{
unsigned int readID;
unsigned int startIndex; // start index of the array of algnmt
unsigned int numAlgnmt; // number of alignments
} ReadPtr;
// the structure to hold a set of alignment results
typedef struct AllHits
{
Algnmt * hitArray;
int hitNum;
int hitArrayAvailSize;
ReadPtr * readPtrArray;
int readNum;
int readArrayAvailSize;
} AllHits;
AllHits * constructAllHits ();
void inputAlgnmtsToArray ( AllHits * allHits, SingleAlgnmtResult * algnResults, int algnNum );
void sortReadPtrs ( AllHits * allHits );
void resetAllHits ( AllHits * allHits );
void releaseAllHits ( AllHits * allHits );
void printOutHits ( AllHits * allHits ); // for debugging
void inputSoap3AnsToArray ( AllHits * allHits, unsigned int readID, HSPAux * hspaux, BWT * bwt, int readLength );
// input soap3 answer to array
// soap3 answer should be stored in HSP->soap3AnsArray
// ================================================
#endif