diff --git a/guideseq/identifyOfftargetSites.py b/guideseq/identifyOfftargetSites.py index 4551872..60c25ba 100644 --- a/guideseq/identifyOfftargetSites.py +++ b/guideseq/identifyOfftargetSites.py @@ -238,6 +238,7 @@ def alignSequences(targetsite_sequence, window_sequence, max_score=7): lowest_distance_score, lowest_mismatch = 100, max_score + 1 chosen_alignment_b, chosen_alignment_m, chosen_alignment_strand_b, chosen_alignment_strand_m = None, None, '', '' + # Use regex to find the best match allowing only for mismatches for aln_m in alignments_mm: strand_m, alignment_type_m, match_m = aln_m if match_m != None: @@ -247,6 +248,8 @@ def alignSequences(targetsite_sequence, window_sequence, max_score=7): chosen_alignment_strand_m = strand_m lowest_mismatch = mismatches + # Use regex to find the best match allowing for gaps, so that its edit distance is strictly lower than the + # total number of mismatches of the sequence founded (if any) allowing only for mismatches. for aln_b in alignments_bulge: strand_b, alignment_type_b, match_b = aln_b if match_b != None: @@ -280,8 +283,9 @@ def alignSequences(targetsite_sequence, window_sequence, max_score=7): else: chosen_alignment_strand_b = '' - return [offtarget_sequence_no_bulge, mismatches, chosen_alignment_strand_m, start_no_bulge, end_no_bulge, - bulged_offtarget_sequence, length, score, substitutions, insertions, deletions, chosen_alignment_strand_b, bulged_start, bulged_end, realigned_target] + return [offtarget_sequence_no_bulge, mismatches, len(offtarget_sequence_no_bulge), chosen_alignment_strand_m, start_no_bulge, end_no_bulge, + realigned_target, + bulged_offtarget_sequence, length, score, substitutions, insertions, deletions, chosen_alignment_strand_b, bulged_start, bulged_end] """ @@ -320,17 +324,15 @@ def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, with open(outfile, 'w') as f: # Write header - print('BED_Chromosome', 'BED_Min.Position', 'BED_Max.Position', 'BED_Name', 'Filename', - 'WindowIndex', 'WindowChromosome', 'Position', 'WindowSequence', - '+.mi', '-.mi', 'bi.sum.mi', 'bi.geometric_mean.mi', '+.total', '-.total', 'total.sum', 'total.geometric_mean', - 'primer1.mi', 'primer2.mi', 'primer.geometric_mean', 'position.stdev', - 'BED_Site_Name', 'BED_Score', 'BED_Site_Chromosome', - 'Site_SubstitutionsOnly.Sequence', 'Site_SubstitutionsOnly.NumSubstitutions', # 24:25 - 'Site_SubstitutionsOnly.Strand', 'Site_SubstitutionsOnly.Start', 'Site_SubstitutionsOnly.End', # 26:28 - 'Site_GapsAllowed.Sequence', 'Site_GapsAllowed.Length', 'Site_GapsAllowed.Score', # 29:31 - 'Site_GapsAllowed.Substitutions', 'Site_GapsAllowed.Insertions', 'Site_GapsAllowed.Deletions', # 32:34 - 'Site_GapsAllowed.Strand', 'Site_GapsAllowed.Start', 'Site_GapsAllowed.End', # 35:37 - 'Cell', 'Targetsite', 'TargetSequence', 'RealignedTargetSequence', sep='\t', file=f) # 38:41 + print('Chromosome', 'Min.Position', 'Max.Position', 'Name', 'Filename', 'Position', 'WindowSequence', # 0:6 + '+.mi', '-.mi', 'bi.sum.mi', 'bi.geometric_mean.mi', '+.total', '-.total', 'total.sum', 'total.geometric_mean', # 7:14 + 'primer1.mi', 'primer2.mi', 'primer.geometric_mean', 'position.stdev', # 15:18 + 'Site_SubstitutionsOnly.Sequence', 'Site_SubstitutionsOnly.NumSubstitutions', # 19:20 + 'Site_SubstitutionsOnly.Strand', 'Site_SubstitutionsOnly.Start', 'Site_SubstitutionsOnly.End', # 21:23 + 'Site_GapsAllowed.Sequence', 'Site_GapsAllowed.Length', 'Site_GapsAllowed.Score', # 24:26 + 'Site_GapsAllowed.Substitutions', 'Site_GapsAllowed.Insertions', 'Site_GapsAllowed.Deletions', # 27:29 + 'Site_GapsAllowed.Strand', 'Site_GapsAllowed.Start', 'Site_GapsAllowed.End', # 30:32 + 'Cell', 'Targetsite', 'TargetSequence', 'RealignedTargetSequence', sep='\t', file=f) # 33:36 # Output summary of each window summary = chromosome_position.SummarizeBarcodeIndex(search_radius) @@ -341,16 +343,15 @@ def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, output_dict = {} for row in summary: - window_sequence, window_chromosome, window_start, window_end, BED_name = row[3:8] + window_sequence, window_chromosome, window_start, window_end = row[3:7] non_bulged_target_start_absolute, bulged_target_start_absolute = '', '' if target_sequence: - offtarget_sequence_no_bulge, mismatches, chosen_alignment_strand_m, start_no_bulge, end_no_bulge, \ - bulged_offtarget_sequence, length, distance, substitutions, insertions, deletions, chosen_alignment_strand_b, bulged_start, bulged_end, \ - realigned_target_sequence = alignSequences(target_sequence, window_sequence, max_score) + offtarget_sequence_no_bulge, mismatches, offtarget_sequence_length, chosen_alignment_strand_m, start_no_bulge, end_no_bulge, \ + realigned_target, \ + bulged_offtarget_sequence, length, score, substitutions, insertions, deletions, chosen_alignment_strand_b, bulged_start, bulged_end = \ + alignSequences(target_sequence, window_sequence, max_score=max_score) - BED_score = 1 - BED_chromosome = window_chromosome if chosen_alignment_strand_m == "+": non_bulged_target_start_absolute = start_no_bulge + int(row[2]) - search_radius non_bulged_target_end_absolute = end_no_bulge + int(row[2]) - search_radius @@ -369,27 +370,24 @@ def analyze(sam_filename, reference_genome, outfile, annotations, search_radius, else: bulged_target_start_absolute, bulged_target_end_absolute = [""] * 2 - if not (chosen_alignment_strand_m or chosen_alignment_strand_b): - BED_chromosome, BED_score, BED_name = [""] * 3 - - output_row = row[4:8] + [filename_tail] + row[0:4] + row[8:] + \ - [str(x) for x in BED_name, BED_score, BED_chromosome, - offtarget_sequence_no_bulge, mismatches, chosen_alignment_strand_m, + output_row = [row[1]] + row[5:8] + [filename_tail] + row[2:4] + row[8:] + \ + [str(x) for x in offtarget_sequence_no_bulge, mismatches, chosen_alignment_strand_m, non_bulged_target_start_absolute, non_bulged_target_end_absolute, - bulged_offtarget_sequence, length, distance, substitutions, insertions, deletions, + bulged_offtarget_sequence, length, score, substitutions, insertions, deletions, chosen_alignment_strand_b, bulged_target_start_absolute, bulged_target_end_absolute] + \ - [str(x) for x in annotation] + [realigned_target_sequence] + [str(x) for x in annotation] + [realigned_target] else: - output_row = [str(x) for x in row[4:8] + [filename_tail] + row[0:4] + row[8:] + [""] * 17 + annotation + ['none']] + output_row = [row[1]] + row[5:8] + [filename_tail] + row[2:4] + row[8:] + [""] * 14 + [str(x) for x in annotation] + ['none'] if non_bulged_target_start_absolute != '' or bulged_target_start_absolute != '': output_row_key = '{0}_{1}_{2}'.format(window_chromosome, min(non_bulged_target_start_absolute, bulged_target_start_absolute), max(non_bulged_target_end_absolute, bulged_target_end_absolute)) else: output_row_key = '{0}_{1}_{2}'.format(window_chromosome, window_start, window_end) + # update read count if output_row_key in output_dict.keys(): - read_count_total = int(output_row[11]) + int(output_dict[output_row_key][11]) - output_dict[output_row_key][11] = str(read_count_total) + read_count_total = int(output_row[9]) + int(output_dict[output_row_key][9]) + output_dict[output_row_key][9] = str(read_count_total) else: output_dict[output_row_key] = output_row diff --git a/guideseq/visualization.py b/guideseq/visualization.py index 2f8c236..92f438a 100644 --- a/guideseq/visualization.py +++ b/guideseq/visualization.py @@ -23,11 +23,11 @@ def parseSitesFile(infile): for line in f: line = line.rstrip('\n') line_items = line.split('\t') - offtarget_reads = line_items[11] - no_bulge_offtarget_sequence = line_items[24] - bulge_offtarget_sequence = line_items[29] - target_seq = line_items[40] - realigned_target_seq = line_items[41] + offtarget_reads = line_items[9] + no_bulge_offtarget_sequence = line_items[19] + bulge_offtarget_sequence = line_items[24] + target_seq = line_items[35] + realigned_target_seq = line_items[36] if no_bulge_offtarget_sequence != '' or bulge_offtarget_sequence != '': if no_bulge_offtarget_sequence: @@ -166,14 +166,14 @@ def visualizeOfftargets(infile, outfile, title=None): def main(): parser = argparse.ArgumentParser(description='Plot visualization plots for aligned reads.') - parser.add_argument("--identified_file", help="Full path to sample identified output", required=True) + parser.add_argument("--identified", help="Full path to sample identified output", required=True) parser.add_argument("--outfile", help="Full path to output file", required=True) parser.add_argument("--title", help="Plot title", required=True) args = parser.parse_args() print(args) - visualizeOfftargets(args.identified_file, args.outfile, title=args.title) + visualizeOfftargets(args.identified, args.outfile, title=args.title) if __name__ == "__main__":