From 12609a01acd8bf5fa3a4631afe12d9283434e772 Mon Sep 17 00:00:00 2001 From: Sowmya Iyer Date: Mon, 9 Apr 2018 22:19:39 -0400 Subject: [PATCH] code changes to (1) move key field to end of line because it was messing up the filter step (2) change identified reference data to match code changes and (3) change filter reference data to add additional key that tagged along from identify step --- guideseq/identifyOfftargetSites.py | 6 +++--- test/data/filtered/EMX1_backgroundFiltered.txt | 2 +- test/data/identified/EMX1_identifiedOfftargets.txt | 14 +++++++------- .../identified/control_identifiedOfftargets.txt | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/guideseq/identifyOfftargetSites.py b/guideseq/identifyOfftargetSites.py index 73ec7dd..24228d1 100644 --- a/guideseq/identifyOfftargetSites.py +++ b/guideseq/identifyOfftargetSites.py @@ -329,7 +329,7 @@ def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, with open(outfile, 'w') as f, open(outfile_unmerged, 'w') as f_unmerged: # Write header - print('Window.key','Chromosome', 'Min.Position', 'Max.Position', 'Name', 'Filename', 'Position', 'WindowSequence', # 0:6 + print('Chromosome', 'Min.Position', 'Max.Position', 'Name', 'Filename', 'Position', 'WindowSequence', # 0:6 '+.mi', '-.mi', 'bi.sum.mi', 'bi.geometric_mean.mi', '+.total', '-.total', 'total.sum', 'total.geometric_mean', # 7:14 'primer1.mi', 'primer2.mi', 'primer.geometric_mean', 'position.stdev', # 15:18 'Site_SubstitutionsOnly.Sequence', 'Site_SubstitutionsOnly.NumSubstitutions', # 19:20 @@ -337,7 +337,7 @@ def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, 'Site_GapsAllowed.Sequence', 'Site_GapsAllowed.Length', 'Site_GapsAllowed.Score', # 24:26 'Site_GapsAllowed.Substitutions', 'Site_GapsAllowed.Insertions', 'Site_GapsAllowed.Deletions', # 27:29 'Site_GapsAllowed.Strand', 'Site_GapsAllowed.Start', 'Site_GapsAllowed.End', # 30:32 - 'Cell', 'Targetsite', 'TargetSequence', 'RealignedTargetSequence', sep='\t', file=f) # 33:36 + 'Cell', 'Targetsite', 'TargetSequence', 'RealignedTargetSequence', 'Window.key',sep='\t', file=f) # 33:36 print('Window.key','Chromosome', 'Min.Position', 'Max.Position', 'Name', 'Filename', 'Position', 'WindowSequence', # 0:6 '+.mi', '-.mi', 'bi.sum.mi', 'bi.geometric_mean.mi', '+.total', '-.total', 'total.sum', 'total.geometric_mean', # 7:14 @@ -418,7 +418,7 @@ def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, print(*output_row_with_key, sep='\t', file=f_unmerged) for key in sorted(output_dict.keys()): - output_dict[key].insert(0, key) + output_dict[key].append(key) print(*output_dict[key], sep='\t', file=f) def assignPrimerstoReads(read_sequence, sam_flag): diff --git a/test/data/filtered/EMX1_backgroundFiltered.txt b/test/data/filtered/EMX1_backgroundFiltered.txt index 94bf1d9..4ab9cdc 100644 --- a/test/data/filtered/EMX1_backgroundFiltered.txt +++ b/test/data/filtered/EMX1_backgroundFiltered.txt @@ -1 +1 @@ -1:236259170-236261754 1473 1486 chr1:236259170-236261754_1486_7 EMX1.sam 1486 ATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGT 7 0 7 0.0 33 0 33 0.0 2 5 3.16227766017 7.116178749862878 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none +1:236259170-236261754 1473 1486 chr1:236259170-236261754_1486_7 EMX1.sam 1486 ATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGT 7 0 7 0.0 33 0 33 0.0 2 5 3.16227766017 7.116178749862878 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none chr1:236259170-236261754_1465_1486 diff --git a/test/data/identified/EMX1_identifiedOfftargets.txt b/test/data/identified/EMX1_identifiedOfftargets.txt index 130bfdc..e4e2350 100644 --- a/test/data/identified/EMX1_identifiedOfftargets.txt +++ b/test/data/identified/EMX1_identifiedOfftargets.txt @@ -1,7 +1,7 @@ -Window.key Chromosome Min.Position Max.Position Name Filename Position WindowSequence +.mi -.mi bi.sum.mi bi.geometric_mean.mi +.total -.total total.sum total.geometric_mean primer1.mi primer2.mi primer.geometric_mean position.stdev Site_SubstitutionsOnly.Sequence Site_SubstitutionsOnly.NumSubstitutions Site_SubstitutionsOnly.Strand Site_SubstitutionsOnly.Start Site_SubstitutionsOnly.End Site_GapsAllowed.Sequence Site_GapsAllowed.Length Site_GapsAllowed.Score Site_GapsAllowed.Substitutions Site_GapsAllowed.Insertions Site_GapsAllowed.Deletions Site_GapsAllowed.Strand Site_GapsAllowed.Start Site_GapsAllowed.End Cell Targetsite TargetSequence RealignedTargetSequence -chr15:44108746-44110769_1000_1023 15:44108746-44110769 1007 1025 chr15:44108746-44110769_1017_189 EMX1.sam 1017 GTAGACAAGAGTCTAAGCAGAAGAAGAAGAGAGCCACTACCCAACCATCT 116 73 189 92.0217365626 258 148 406 195.407267009 96 80 87.6356092008 4.931631338038255 GAGTCTAAGCAGAAGAAGAAGAG 3 + 1000 1023 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none -chr1:236259170-236261754_1465_1486 1:236259170-236261754 1465 1486 chr1:236259170-236261754_1486_7 EMX1.sam 1486 ATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGT 7 0 7 0.0 33 0 33 0.0 2 5 3.16227766017 7.116178749862878 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none -chr1:236259170-236261754_1531_1539 1:236259170-236261754 1531 1539 chr1:236259170-236261754_1531_5 EMX1.sam 1531 GGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGTTAACTACT 0 5 5 0.0 0 5 5 0.0 1 2 1.41421356237 2.947456530637899 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none -chr2:73159981-73162004_1000_1023 2:73159981-73162004 1008 1024 chr2:73159981-73162004_1017_489 EMX1.sam 1017 AAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGG 243 246 489 244.49539873 619 541 1160 578.68730762 236 231 233.486616319 4.710360920354193 GAGTCCGAGCAGAAGAAGAAGGG 0 + 1000 1023 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none -chr3:197899267-197901348_1075_1081 3:197899267-197901348 1075 1081 chr3:197899267-197901348_1080_10 EMX1.sam 1080 TTAGGGTTAGGGTTAGGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGG 0 10 10 0.0 0 32 32 0.0 9 1 3.0 2.5495097567963922 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none -chr6:9117792-9119815_1000_1023 6:9117792-9119815 1007 1007 chr6:9117792-9119815_1007_4 EMX1.sam 1007 ATGTCCTCAGAGTTCTGTCCATTCTTCTTCTGCTCAGACGTTTTGTCTGA 1 3 4 1.73205080757 1 9 10 3.0 2 2 2.0 0.0 ACGTCTGAGCAGAAGAAGAATGG 3 - 1000 1023 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none +Chromosome Min.Position Max.Position Name Filename Position WindowSequence +.mi -.mi bi.sum.mi bi.geometric_mean.mi +.total -.total total.sum total.geometric_mean primer1.mi primer2.mi primer.geometric_mean position.stdev Site_SubstitutionsOnly.Sequence Site_SubstitutionsOnly.NumSubstitutions Site_SubstitutionsOnly.Strand Site_SubstitutionsOnly.Start Site_SubstitutionsOnly.End Site_GapsAllowed.Sequence Site_GapsAllowed.Length Site_GapsAllowed.Score Site_GapsAllowed.Substitutions Site_GapsAllowed.Insertions Site_GapsAllowed.Deletions Site_GapsAllowed.Strand Site_GapsAllowed.Start Site_GapsAllowed.End Cell Targetsite TargetSequence RealignedTargetSequence Window.key +15:44108746-44110769 1007 1025 chr15:44108746-44110769_1017_189 EMX1.sam 1017 GTAGACAAGAGTCTAAGCAGAAGAAGAAGAGAGCCACTACCCAACCATCT 116 73 189 92.0217365626 258 148 406 195.407267009 96 80 87.6356092008 4.931631338038255 GAGTCTAAGCAGAAGAAGAAGAG 3 + 1000 1023 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none chr15:44108746-44110769_1000_1023 +1:236259170-236261754 1465 1486 chr1:236259170-236261754_1486_7 EMX1.sam 1486 ATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGT 7 0 7 0.0 33 0 33 0.0 2 5 3.16227766017 7.116178749862878 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none chr1:236259170-236261754_1465_1486 +1:236259170-236261754 1531 1539 chr1:236259170-236261754_1531_5 EMX1.sam 1531 GGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGTTAACTACT 0 5 5 0.0 0 5 5 0.0 1 2 1.41421356237 2.947456530637899 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none chr1:236259170-236261754_1531_1539 +2:73159981-73162004 1008 1024 chr2:73159981-73162004_1017_489 EMX1.sam 1017 AAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGG 243 246 489 244.49539873 619 541 1160 578.68730762 236 231 233.486616319 4.710360920354193 GAGTCCGAGCAGAAGAAGAAGGG 0 + 1000 1023 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none chr2:73159981-73162004_1000_1023 +3:197899267-197901348 1075 1081 chr3:197899267-197901348_1080_10 EMX1.sam 1080 TTAGGGTTAGGGTTAGGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGG 0 10 10 0.0 0 32 32 0.0 9 1 3.0 2.5495097567963922 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none chr3:197899267-197901348_1075_1081 +6:9117792-9119815 1007 1007 chr6:9117792-9119815_1007_4 EMX1.sam 1007 ATGTCCTCAGAGTTCTGTCCATTCTTCTTCTGCTCAGACGTTTTGTCTGA 1 3 4 1.73205080757 1 9 10 3.0 2 2 2.0 0.0 ACGTCTGAGCAGAAGAAGAATGG 3 - 1000 1023 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none chr6:9117792-9119815_1000_1023 diff --git a/test/data/identified/control_identifiedOfftargets.txt b/test/data/identified/control_identifiedOfftargets.txt index 933bca8..b12c6c5 100644 --- a/test/data/identified/control_identifiedOfftargets.txt +++ b/test/data/identified/control_identifiedOfftargets.txt @@ -1,4 +1,4 @@ -Window.key Chromosome Min.Position Max.Position Name Filename Position WindowSequence +.mi -.mi bi.sum.mi bi.geometric_mean.mi +.total -.total total.sum total.geometric_mean primer1.mi primer2.mi primer.geometric_mean position.stdev Site_SubstitutionsOnly.Sequence Site_SubstitutionsOnly.NumSubstitutions Site_SubstitutionsOnly.Strand Site_SubstitutionsOnly.Start Site_SubstitutionsOnly.End Site_GapsAllowed.Sequence Site_GapsAllowed.Length Site_GapsAllowed.Score Site_GapsAllowed.Substitutions Site_GapsAllowed.Insertions Site_GapsAllowed.Deletions Site_GapsAllowed.Strand Site_GapsAllowed.Start Site_GapsAllowed.End Cell Targetsite TargetSequence RealignedTargetSequence -chr1:236259170-236261754_1473_1490 1:236259170-236261754 1473 1490 chr1:236259170-236261754_1481_7 control.sam 1481 TCAGAATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCA 1 6 7 2.44948974278 1 9 10 3.0 2 5 3.16227766017 5.535341001239219 Control control None none -chr1:236259170-236261754_1521_1531 1:236259170-236261754 1521 1531 chr1:236259170-236261754_1523_14 control.sam 1523 GGTGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGT 0 14 14 0.0 0 18 18 0.0 7 7 7.0 3.7094473981982814 Control control None none -chr3:197899267-197901348_1035_1040 3:197899267-197901348 1035 1040 chr3:197899267-197901348_1040_3 control.sam 1040 TAGGGTTGGGTTAGGGTTAGGGTTCGGGTTAGGGTTAGGGTTAGGGTTAG 3 0 3 0.0 5 0 5 0.0 1 1 1.0 2.0548046676563256 Control control None none +Chromosome Min.Position Max.Position Name Filename Position WindowSequence +.mi -.mi bi.sum.mi bi.geometric_mean.mi +.total -.total total.sum total.geometric_mean primer1.mi primer2.mi primer.geometric_mean position.stdev Site_SubstitutionsOnly.Sequence Site_SubstitutionsOnly.NumSubstitutions Site_SubstitutionsOnly.Strand Site_SubstitutionsOnly.Start Site_SubstitutionsOnly.End Site_GapsAllowed.Sequence Site_GapsAllowed.Length Site_GapsAllowed.Score Site_GapsAllowed.Substitutions Site_GapsAllowed.Insertions Site_GapsAllowed.Deletions Site_GapsAllowed.Strand Site_GapsAllowed.Start Site_GapsAllowed.End Cell Targetsite TargetSequence RealignedTargetSequence Window.key +1:236259170-236261754 1473 1490 chr1:236259170-236261754_1481_7 control.sam 1481 TCAGAATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCA 1 6 7 2.44948974278 1 9 10 3.0 2 5 3.16227766017 5.535341001239219 Control control None none chr1:236259170-236261754_1473_1490 +1:236259170-236261754 1521 1531 chr1:236259170-236261754_1523_14 control.sam 1523 GGTGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGT 0 14 14 0.0 0 18 18 0.0 7 7 7.0 3.7094473981982814 Control control None none chr1:236259170-236261754_1521_1531 +3:197899267-197901348 1035 1040 chr3:197899267-197901348_1040_3 control.sam 1040 TAGGGTTGGGTTAGGGTTAGGGTTCGGGTTAGGGTTAGGGTTAGGGTTAG 3 0 3 0.0 5 0 5 0.0 1 1 1.0 2.0548046676563256 Control control None none chr3:197899267-197901348_1035_1040