diff --git a/tools/tapscan/tapscan.xml b/tools/tapscan/tapscan.xml index c86b46760e..3953f2b6c5 100644 --- a/tools/tapscan/tapscan.xml +++ b/tools/tapscan/tapscan.xml @@ -1,4 +1,4 @@ - + Detect Transcription Associated Proteins (TAPs) topic_0121 @@ -9,28 +9,28 @@ sed - - - - + + + + <\"filter\" (if desired)>\n\n"; exit; @@ -56,7 +64,7 @@ # Containes the actual result for a query sequence my $akt_entry = ""; # Used to define query entry to ignore similar domains -my $whole_entry = ""; +my $whole_entry = ""; # Includes the final entries after ignoring similar domains my @results_of_extraction = (); # Used to define query entry to ignore similar domains @@ -64,7 +72,7 @@ # Used to define query entry to ignore similar domains my $present = ""; # Used to define query entry to ignore similar domains -my $protein = ""; +my $protein = ""; my $lek = ""; @@ -84,7 +92,7 @@ exit; } -### 1.1 Trim input file (hmmsearch) to fit script. Erase lines starting with #. +### 1.1 Trim input file (hmmsearch) to fit script. Erase lines starting with #. @output = grep !/^#.*/, @output; # Read domain-specific coverage values file @@ -94,11 +102,11 @@ open(FILENAME,$domspec_cuts); my %cuts = map { chomp; split /\t/ } ; -### 1.2 Use first (prot name), third (domain name), eight (dom evalue), fourteenth (# overlapping) and fifteenth (# domain) column of data input(array element). +### 1.2 Use first (prot name), third (domain name), eight (dom evalue), fourteenth (# overlapping) and fifteenth (# domain) column of data input(array element). foreach my $output (@output) { $output =~ /^(\S+)\s+\S+\s+\S+\s+(\S+)\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+.*/; $lek = (($6-$5)+1)/$3; - $output = $1."\t".$2."\t".$4."\t".$lek; + $output = $1."\t".$2."\t".$4."\t".$lek; } # Stucture: @@ -113,12 +121,18 @@ ### 2.1 Run through length_cut_off loop (throws out entries below coverage values) my @cutarray; +open(FH, '>', "${basename}_filtered_sequences.txt") or die $!; +print FH "Below are the sequences that were filtered out, along with the reason"; foreach my $output (@output) { $output =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/; if ($4 > $cuts{$2}) { push (@cutarray, $output); } + else{ + print FH "$output --> did not meet coverage cutoff of $cuts{$2} (cov was $4)\n" + } } +close(FH); # End up with (prot name) (domain name) (dom evalue) (overlapping) @@ -139,7 +153,7 @@ my @newarray; foreach my $cutarray (@cutarray) { $cutarray =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/; - $akt_entry = $1.$2; + $akt_entry = $1.$2; # If the protein contains more than one of the same domain if ($old_prot eq $akt_entry) { if ($3>$scoreeval) { @@ -149,7 +163,7 @@ else { $y++; $newarray[$x] = $1."\t".$2."\t".$scoreeval."\t".$y; - } + } } # If the protein is new reset $j and push the entry in the next array else { @@ -204,7 +218,7 @@ $C1domain = 0; } - + # Get the Query entry $line =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/; $protein = "$1"; @@ -214,7 +228,7 @@ # Ignore FIE if it is not better scored than WD40 - if ($extracted_domain eq "WD40" and $FIEdomain == 0) {$WD40domain = 1;} + if ($extracted_domain eq "WD40" and $FIEdomain == 0) {$WD40domain = 1;} # Ignore similar domains part 1 @@ -240,10 +254,10 @@ if ($extracted_domain eq "GATA" and $Dofdomain == 1) {next;} if ($extracted_domain eq "zf-Dof" and $GATAdomain == 1) {next;} - + # Save the entry push @results_of_extraction, $whole_entry; - $entry_counter++; + $entry_counter++; } print "$entry_counter domain matches were found in $hmmsearch_output\n\n"; @@ -272,12 +286,12 @@ foreach my $domain_entry (@sorted_results_of_extraction) { $domain_entry =~ /^([^;]+);/; - $akt_entry = $1; + $akt_entry = $1; # If the protein has more than one domain push it behind the old entry if ($old_entry eq $akt_entry) { $j++; $array_of_arrays->[$i][$j] = $domain_entry; - } + } # If the protein is new reset $j and push the entry in the next array else { @@ -319,14 +333,14 @@ foreach my $classification (@classifications) { $classification =~ /^([^;]+);/; $akt_entry = $1; - + # If the family has more than one domain entry push it behind the old entry if ($old_entry eq $akt_entry) { - + $j++; $array_of_classifications->[$i][$j] = $classification; } - + # If the family is new push it in the next array and reset $j else { $j = 0; @@ -334,8 +348,8 @@ $array_of_classifications->[$i][$j] = $classification; # Add family to list of all families in the classification rules push(@liste_alle_familien,$akt_entry); - - # Add hardcoded "OR" defined families to list of families if subfamily is present + + # Add hardcoded "OR" defined families to list of families if subfamily is present # Do this just once for each family if (($akt_entry eq "GARP_ARR-B_Myb") or ($akt_entry eq "GARP_ARR-B_G2")) { if ($ARR_B_in_list == 0) { @@ -354,10 +368,10 @@ }} } $old_entry = $akt_entry; -} +} my $number_of_classifications = $i+1; -print "$number_of_classifications different families were found in $decision_table\n\n"; +print "$number_of_classifications different families were found in $decision_table\n\n"; $number_of_classifications--; print "*** begin classification ***\n\n"; @@ -400,7 +414,7 @@ # Flag to mark that any family was found for the current protein my $member_found = 0; - + # In this string all the domains of one protein will be stored # The ; signs are important to mark a single entry (for example to differ AP2 and TF_AP2) my $domains = ";"; @@ -456,13 +470,13 @@ # This loop will be made for every domain classification entry for the current family while ($array_of_classifications->[$i][$j]) { - + # For "should" entries if ($array_of_classifications->[$i][$j] =~ /;should$/) { # Get the domain from the classification entry $array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/; - + my $domain_classification_entry = $1; # Check if the protein might be a member of the current family. If yes check the next entry if ($domains =~ /;$domain_classification_entry;/) { @@ -482,7 +496,7 @@ # Get the domain from the classification entry $array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/; - + # Check if the protein has the domain. If yes go to the next family my $domain_check = $1; if ($domains =~ /;$domain_check;/) { @@ -495,11 +509,11 @@ next; } } - + # This happens when there is no "should" or "should not" entry at the right position print "error in classification entry $array_of_classifications->[$i][$j]\n" } - + # If the check was successful print the found classification if ($possible_member) { @@ -542,7 +556,7 @@ pop @family_classifications_file; pop @subfamily_classifications_file; - + # Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B" if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){ @@ -574,7 +588,7 @@ push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n"; push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n"; - } + } $possibilities--; last; @@ -587,12 +601,12 @@ # Normal enty: if the family is classified for one family write it in the array else { - # Store the matched domains of the actual family for respectively later doubly classification decision + # Store the matched domains of the actual family for respectively later doubly classification decision $shoulds_prev_family = $shoulds_act_family; # Store the entry in the output array # Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B" # Add GARP_ARR-B to @liste_alle_familien ONCE - + if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){ push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n"; push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n"; @@ -631,7 +645,7 @@ } } - + if ($possibilities == 0) { # If no family was found for the current protein give this out @@ -639,7 +653,7 @@ push @subfamily_classifications_file, "$akt_entry;0_no_family_found;-;0$domains\n"; $unclassified_families++; } - + # Increment the number of possible members in the array at the right position else { @@ -647,17 +661,17 @@ $temp++; $entry_counter[$possibilities] = $temp; } - + } # Define for which TAP families also subfamilies may be present: foreach my $subfamily_classifications_file (@subfamily_classifications_file) { - - # If "C2H2" was assigned to a sequence, write "C2H2;C2H2" to output.3 as TAP family and subfamily, respectively. + + # If "C2H2" was assigned to a sequence, write "C2H2;C2H2" to output.3 as TAP family and subfamily, respectively. if ($subfamily_classifications_file =~ /C2H2/ ) { $subfamily_classifications_file =~ s/C2H2;-/C2H2;C2H2/ } - # If "C2H2_IDD" was assigned to a sequence, write "C2H2;C2H2_IDD" to output.3 as TAP family and subfamily, respectively. + # If "C2H2_IDD" was assigned to a sequence, write "C2H2;C2H2_IDD" to output.3 as TAP family and subfamily, respectively. if ($subfamily_classifications_file =~ /C2H2_IDD/ ) { $subfamily_classifications_file =~ s/C2H2_IDD;-/C2H2;C2H2_IDD/ } @@ -745,7 +759,7 @@ @subfamily_classifications_file = sort @subfamily_classifications_file; if ($gene_model_filter and $gene_model_filter eq "filter") { print "*** filtering gene models: removing splice variants ***\n"; - + # temporary array for the filtered entries my @filtered_entries = []; my $models_removed_counter = 0; @@ -785,7 +799,7 @@ push @family_list, "$2"; } print SUBFAMILY_CLASSIFICATIONS "$fcf_line"; -} +} close (SUBFAMILY_CLASSIFICATIONS); @@ -842,8 +856,8 @@ delete $hash{'HRT'}; delete $hash{'GIY_YIG'}; -# Add all not found families from the classification rules to @output_family_statistics -# With zero as number of families found +# Add all not found families from the classification rules to @output_family_statistics +# With zero as number of families found foreach my $element (keys %hash) { if (($hash{$element} == 1) and ( $element eq "0_no_family_found")) { @@ -854,7 +868,7 @@ } } -# Sort @output_family_statistics caseinsensitive-alphabetically +# Sort @output_family_statistics caseinsensitive-alphabetically my @sortierte_statistik = sort {lc $a cmp lc $b} @output_family_statistics; # Print headline to @sortierte_statistik @@ -891,10 +905,11 @@ print "*** The results were written in $family_classifications and $subfamily_classifications ***\n"; print "*** done ***\n\n"; + exit; sub get_file_data { - + my ($filename) = @_; use strict; @@ -914,3 +929,4 @@ sub get_file_data { return @filedata; } + diff --git a/tools/tapscan/tapscan_coverage_values_v10.txt b/tools/tapscan/tapscan_coverage_values_v11.txt similarity index 99% rename from tools/tapscan/tapscan_coverage_values_v10.txt rename to tools/tapscan/tapscan_coverage_values_v11.txt index 4d1bea6df0..ded33fecae 100644 --- a/tools/tapscan/tapscan_coverage_values_v10.txt +++ b/tools/tapscan/tapscan_coverage_values_v11.txt @@ -152,3 +152,4 @@ NDX 0.75 SAWADEE 0.75 C1HDZ 0.75 C2HDZ 0.75 +Homeobox_KN 0.75 diff --git a/tools/tapscan/tapscan_domains_v12.txt.gz b/tools/tapscan/tapscan_domains_v12.txt.gz deleted file mode 100644 index e38012340c..0000000000 Binary files a/tools/tapscan/tapscan_domains_v12.txt.gz and /dev/null differ diff --git a/tools/tapscan/tapscan_domains_v13.txt.gz b/tools/tapscan/tapscan_domains_v13.txt.gz new file mode 100644 index 0000000000..d597bdb50b Binary files /dev/null and b/tools/tapscan/tapscan_domains_v13.txt.gz differ diff --git a/tools/tapscan/tapscan_rules_v81.txt b/tools/tapscan/tapscan_rules_v82.txt similarity index 94% rename from tools/tapscan/tapscan_rules_v81.txt rename to tools/tapscan/tapscan_rules_v82.txt index 0cbf8a593e..fdfa11d98e 100644 --- a/tools/tapscan/tapscan_rules_v81.txt +++ b/tools/tapscan/tapscan_rules_v82.txt @@ -119,16 +119,16 @@ HD_PHD;PHD;should HD_PHD;Homeobox;should HD_PINTOX;Homeobox;should HD_PINTOX;PINTOX;should -HD_BEL;Homeobox;should -HD_BEL;BEL;should -HD_KNOX1;Homeobox;should -HD_KNOX1;KNOX1;should -HD_KNOX1;KNOX2;should -HD_KNOX1;KNOXC;should not -HD_KNOX2;Homeobox;should -HD_KNOX2;KNOX1;should -HD_KNOX2;KNOX2;should -HD_KNOX2;KNOXC;should +HD_TALE_BEL;Homeobox_KN;should +HD_TALE_BEL;BEL;should +HD_TALE_KNOX1;Homeobox_KN;should +HD_TALE_KNOX1;KNOX1;should +HD_TALE_KNOX1;KNOX2;should +HD_TALE_KNOX1;KNOXC;should not +HD_TALE_KNOX2;Homeobox_KN;should +HD_TALE_KNOX2;KNOX1;should +HD_TALE_KNOX2;KNOX2;should +HD_TALE_KNOX2;KNOXC;should HD-other;EIN3;should not HD-other;Homeobox;should HD-other;bZIP_1;should not @@ -136,6 +136,10 @@ HD-other;WOX_HD;should not HD-other;PINTOX;should not HD-other;PHD;should not HD-other;BEL;should not +HD_TALE;Homeobox_KN;should +HD_TALE;BEL;should not +HD_TALE;KNOX1;should not +HD_TALE;KNOX2;should not HMG;ARID;should not HMG;HMG_box;should HMG;YABBY;should not diff --git a/tools/tapscan/test-data/output.2.tsv b/tools/tapscan/test-data/output.2.tsv index 6fe6bb4b79..bfbb2d92fc 100644 --- a/tools/tapscan/test-data/output.2.tsv +++ b/tools/tapscan/test-data/output.2.tsv @@ -57,13 +57,14 @@ HD-LD 0 HD-NDX 0 HD-other 0 HD-SAWADEE 0 -HD_BEL 0 HD_DDT 0 -HD_KNOX1 0 -HD_KNOX2 0 HD_PHD 0 HD_PINTOX 0 HD_PLINC 0 +HD_TALE 0 +HD_TALE_BEL 0 +HD_TALE_KNOX1 0 +HD_TALE_KNOX2 0 HD_WOX 0 HMG 0 HSF 0 diff --git a/tools/tapscan/test-data/output.domtbl.tsv b/tools/tapscan/test-data/output.domtbl.tsv index 6846bd1bf6..9e006bb592 100644 --- a/tools/tapscan/test-data/output.domtbl.tsv +++ b/tools/tapscan/test-data/output.domtbl.tsv @@ -20,9 +20,9 @@ Ec-00_001360.1 - 442 bZIP_AUREO - 52 2. # Program: hmmsearch # Version: 3.3.2 (Nov 2020) # Pipeline mode: SEARCH -# Query file: /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt -# Target file: /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat -# Option settings: hmmsearch --domtblout domtblout.txt --cut_ga /home/saskia/code/github/galaxyproject/tools-iuc/tools/tapscan/tapscan_domains_v12.txt /tmp/saskia/tmp4n93kzh1/files/b/7/b/dataset_b7b39fd6-41f1-440d-bfb7-da7a3a9f070e.dat -# Current dir: /tmp/saskia/tmp4n93kzh1/job_working_directory/000/4/working -# Date: Mon Nov 13 14:57:28 2023 +# Query file: /home/saskia/code/github/bgruening/galaxytools/tools/tapscan/tapscan_domains_v13.txt.gz +# Target file: /tmp/saskia/tmpwta2cyq_/files/f/5/c/dataset_f5c238a3-8cc8-42f9-86b5-ebc2431be570.dat +# Option settings: hmmsearch --domtblout domtblout.txt --cut_ga /home/saskia/code/github/bgruening/galaxytools/tools/tapscan/tapscan_domains_v13.txt.gz /tmp/saskia/tmpwta2cyq_/files/f/5/c/dataset_f5c238a3-8cc8-42f9-86b5-ebc2431be570.dat +# Current dir: /tmp/saskia/tmpwta2cyq_/job_working_directory/000/4/working +# Date: Thu Feb 22 10:10:40 2024 # [ok]