From d127597bb3359020e35380810202b094140e3d68 Mon Sep 17 00:00:00 2001 From: Herman DE BEUKELAER Date: Thu, 21 Sep 2023 11:34:59 +0200 Subject: [PATCH 01/14] Remove code duplication in main workflow --- mini_ac.nf | 117 ++++++++++++++--------------------------------------- 1 file changed, 31 insertions(+), 86 deletions(-) diff --git a/mini_ac.nf b/mini_ac.nf index 5774729..e4a2776 100755 --- a/mini_ac.nf +++ b/mini_ac.nf @@ -7,99 +7,44 @@ workflow MINIAC { params.OBO_file = "$projectDir/data/ontologies/go.obo" - if (params.mode == "genome_wide" && params.species == "maize_v4") { - - params.MotMapsFile_gw = "$projectDir/data/zma_v4/zma_v4_genome_wide_motif_mappings.bed" - params.Non_cod_genome = "$projectDir/data/zma_v4/zma_v4_noncod_merged.bed" - params.Faix_file = "$projectDir/data/zma_v4/zma_v4.fasta.fai" - params.Motif_tf_file = "$projectDir/data/zma_v4/zma_v4_motif_TF_file.txt" - params.Genes_coords = "$projectDir/data/zma_v4/zma_v4_genes_coords_sorted.bed" - params.Feature_file = "$projectDir/data/zma_v4/zma_v4_go_gene_file.txt" - params.TF_fam_file = "$projectDir/data/zma_v4/zma_v4_TF_family_file.txt" - params.Genes_metadata = "$projectDir/data/zma_v4/maize_v4_gene_metadata_file.txt" - params.P_val = 0.1 + // define species id used for data subfolder and data file prefix + def species = "ath" + if(params.species == "maize_v4") { + species = "zma_v4" + } else if(params.species == "maize_v5") { + species = "zma_v5" + } else { + exit 1, "MINI-AC can only be run for the species 'arabidopsis', 'maize_v4' and 'maize_v5'. Instead it got '${params.species}'." + } + + // set input data parameters shared between genome-wide and locus-based modes + params.Faix_file = "$projectDir/data/${species}/${species}.fasta.fai" + params.Motif_tf_file = "$projectDir/data/${species}/${species}_motif_TF_file.txt" + params.Genes_coords = "$projectDir/data/${species}/${species}_genes_coords_sorted.bed" + params.Feature_file = "$projectDir/data/${species}/${species}_go_gene_file.txt" + params.TF_fam_file = "$projectDir/data/${species}/${species}_TF_family_file.txt" + params.Genes_metadata = "$projectDir/data/${species}/${species}_gene_metadata_file.txt" + + // set defaut p-value + params.P_val = 0.1 + + if (params.mode == "genome_wide") { - genome_wide_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.Second_gene_annot, params.Second_gene_dist, params.MotMapsFile_gw, params.Non_cod_genome, params.Faix_file, params.Motif_tf_file, params.Genes_coords, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) - } - - else if (params.mode == "genome_wide" && params.species == "maize_v5") { + params.MotMapsFile_gw = "$projectDir/data/${species}/${species}_genome_wide_motif_mappings.bed" + params.Non_cod_genome = "$projectDir/data/${species}/${species}_noncod_merged.bed" - params.MotMapsFile_gw = "$projectDir/data/zma_v5/zma_v5_genome_wide_motif_mappings.bed" - params.Non_cod_genome = "$projectDir/data/zma_v5/zma_v5_noncod_merged.bed" - params.Faix_file = "$projectDir/data/zma_v5/zma_v5.fasta.fai" - params.Motif_tf_file = "$projectDir/data/zma_v5/zma_v5_motif_TF_file.txt" - params.Genes_coords = "$projectDir/data/zma_v5/zma_v5_genes_coords_sorted.bed" - params.Feature_file = "$projectDir/data/zma_v5/zma_v5_go_gene_file.txt" - params.TF_fam_file = "$projectDir/data/zma_v5/zma_v5_TF_family_file.txt" - params.Genes_metadata = "$projectDir/data/zma_v5/maize_v5_gene_metadata_file.txt" - params.P_val = 0.1 - genome_wide_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.Second_gene_annot, params.Second_gene_dist, params.MotMapsFile_gw, params.Non_cod_genome, params.Faix_file, params.Motif_tf_file, params.Genes_coords, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) - } - - else if (params.mode == "genome_wide" && params.species == "arabidopsis") { - - params.MotMapsFile_gw = "$projectDir/data/ath/ath_genome_wide_motif_mappings.bed" - params.Non_cod_genome = "$projectDir/data/ath/ath_noncod_merged.bed" - params.Faix_file = "$projectDir/data/ath/ath.fasta.fai" - params.Motif_tf_file = "$projectDir/data/ath/ath_motif_TF_file.txt" - params.Genes_coords = "$projectDir/data/ath/ath_genes_coords_sorted.bed" - params.Feature_file = "$projectDir/data/ath/ath_go_gene_file.txt" - params.TF_fam_file = "$projectDir/data/ath/ath_TF_family_file.txt" - params.Genes_metadata = "$projectDir/data/ath/arabidopsis_gene_metadata_file.txt" - params.P_val = 0.1 - - genome_wide_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.Second_gene_annot, params.Second_gene_dist, params.MotMapsFile_gw, params.Non_cod_genome, params.Faix_file, params.Motif_tf_file, params.Genes_coords, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) - - } - - else if (params.mode == "locus_based" && params.species == "maize_v4") { + + else if (params.mode == "locus_based") { - params.MotMapsFile_lb = "$projectDir/data/zma_v4/zma_v4_locus_based_motif_mappings_5kbup_1kbdown.bed" - params.Promoter_file = "$projectDir/data/zma_v4/zma_v4_promoter_5kbup_1kbdown_sorted.bed" - params.Faix_file = "$projectDir/data/zma_v4/zma_v4.fasta.fai" - params.Motif_tf_file = "$projectDir/data/zma_v4/zma_v4_motif_TF_file.txt" - params.Feature_file = "$projectDir/data/zma_v4/zma_v4_go_gene_file.txt" - params.TF_fam_file = "$projectDir/data/zma_v4/zma_v4_TF_family_file.txt" - params.Genes_metadata = "$projectDir/data/zma_v4/maize_v4_gene_metadata_file.txt" - params.P_val = 0.01 + params.MotMapsFile_lb = "$projectDir/data/${species}/${species}_locus_based_motif_mappings_5kbup_1kbdown.bed" + params.Promoter_file = "$projectDir/data/${species}/${species}_promoter_5kbup_1kbdown_sorted.bed" locus_based_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.MotMapsFile_lb, params.Promoter_file, params.Faix_file, params.Motif_tf_file, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) - - } - - else if (params.mode == "locus_based" && params.species == "maize_v5") { - - params.MotMapsFile_lb = "$projectDir/data/zma_v5/zma_v5_locus_based_motif_mappings_5kbup_1kbdown.bed" - params.Promoter_file = "$projectDir/data/zma_v5/zma_v5_promoter_5kbup_1kbdown_sorted.bed" - params.Faix_file = "$projectDir/data/zma_v5/zma_v5.fasta.fai" - params.Motif_tf_file = "$projectDir/data/zma_v5/zma_v5_motif_TF_file.txt" - params.Feature_file = "$projectDir/data/zma_v5/zma_v5_go_gene_file.txt" - params.TF_fam_file = "$projectDir/data/zma_v5/zma_v5_TF_family_file.txt" - params.Genes_metadata = "$projectDir/data/zma_v5/maize_v5_gene_metadata_file.txt" - params.P_val = 0.01 - - locus_based_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.MotMapsFile_lb, params.Promoter_file, params.Faix_file, params.Motif_tf_file, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) - + + } else { + exit 1, "MINI-AC can only be run using the modes 'genome_wide' or 'locus_based'. Instead it got '${params.mode}'." } - - else if (params.mode == "locus_based" && params.species == "arabidopsis") { - - params.MotMapsFile_lb = "$projectDir/data/ath/ath_locus_based_motif_mappings_5kbup_1kbdown.bed" - params.Promoter_file = "$projectDir/data/ath/ath_promoter_5kbup_1kbdown_sorted.bed" - params.Faix_file = "$projectDir/data/ath/ath.fasta.fai" - params.Motif_tf_file = "$projectDir/data/ath/ath_motif_TF_file.txt" - params.Feature_file = "$projectDir/data/ath/ath_go_gene_file.txt" - params.TF_fam_file = "$projectDir/data/ath/ath_TF_family_file.txt" - params.Genes_metadata = "$projectDir/data/ath/arabidopsis_gene_metadata_file.txt" - params.P_val = 0.01 - - locus_based_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.MotMapsFile_lb, params.Promoter_file, params.Faix_file, params.Motif_tf_file, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) - } - - else { - exit 1, "MINI-AC can only be run using the modes 'genome_wide' and 'locus_based', and with the species 'arabidopsis', 'maize_v4' and 'maize_v5'. Instead it got '${params.species}' and '${params.mode}' " - } } From a059ef2043b35dd2dbcd0712c26f769b6353ee5c Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 21 Sep 2023 12:06:33 +0200 Subject: [PATCH 02/14] Add missing curly brace --- mini_ac.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mini_ac.nf b/mini_ac.nf index e4a2776..d8e792e 100755 --- a/mini_ac.nf +++ b/mini_ac.nf @@ -35,7 +35,7 @@ workflow MINIAC { genome_wide_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.Second_gene_annot, params.Second_gene_dist, params.MotMapsFile_gw, params.Non_cod_genome, params.Faix_file, params.Motif_tf_file, params.Genes_coords, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) - else if (params.mode == "locus_based") { + } else if (params.mode == "locus_based") { params.MotMapsFile_lb = "$projectDir/data/${species}/${species}_locus_based_motif_mappings_5kbup_1kbdown.bed" params.Promoter_file = "$projectDir/data/${species}/${species}_promoter_5kbup_1kbdown_sorted.bed" From 2c92d71c7f26efe9fe334e828bc32f38c7c4327d Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 21 Sep 2023 12:12:42 +0200 Subject: [PATCH 03/14] Use switch for setting species id --- mini_ac.nf | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mini_ac.nf b/mini_ac.nf index d8e792e..190f116 100755 --- a/mini_ac.nf +++ b/mini_ac.nf @@ -8,14 +8,17 @@ workflow MINIAC { params.OBO_file = "$projectDir/data/ontologies/go.obo" // define species id used for data subfolder and data file prefix - def species = "ath" - if(params.species == "maize_v4") { - species = "zma_v4" - } else if(params.species == "maize_v5") { - species = "zma_v5" - } else { - exit 1, "MINI-AC can only be run for the species 'arabidopsis', 'maize_v4' and 'maize_v5'. Instead it got '${params.species}'." - } + def species + switch(params.species) { + case "arabidopsis": + species = "ath" + case "maize_v4": + species = "zma_v4" + case "maize_v5": + species = "zma_v5" + default: + exit 1, "MINI-AC can only be run for the species 'arabidopsis', 'maize_v4' and 'maize_v5'. Instead it got '${params.species}'." + } // set input data parameters shared between genome-wide and locus-based modes params.Faix_file = "$projectDir/data/${species}/${species}.fasta.fai" From b52ec4656fe7fa4dd36c84e4ec9e666bbc35a73b Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 21 Sep 2023 12:20:59 +0200 Subject: [PATCH 04/14] Add missing breaks to switch --- mini_ac.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mini_ac.nf b/mini_ac.nf index 190f116..1181637 100755 --- a/mini_ac.nf +++ b/mini_ac.nf @@ -12,10 +12,13 @@ workflow MINIAC { switch(params.species) { case "arabidopsis": species = "ath" + break case "maize_v4": species = "zma_v4" + break case "maize_v5": species = "zma_v5" + break default: exit 1, "MINI-AC can only be run for the species 'arabidopsis', 'maize_v4' and 'maize_v5'. Instead it got '${params.species}'." } From 3138b1c5d1f03704400d3395f12df8744f48f1c2 Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 21 Sep 2023 12:24:18 +0200 Subject: [PATCH 05/14] Exclude nextflow cache dir and logs from version control --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d2264c --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# ignore Nextflow cache and logs +.nextflow/ +.nextflow.log* From 9dfe14de2099ca80367f853694e092381dfafef1 Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 21 Sep 2023 14:16:33 +0200 Subject: [PATCH 06/14] Rename gene metadata files to match other file path structure --- ...bidopsis_gene_metadata_file.txt => ath_gene_metadata_file.txt} | 0 ...ze_v4_gene_metadata_file.txt => zma_v4_gene_metadata_file.txt} | 0 ...ze_v5_gene_metadata_file.txt => zma_v5_gene_metadata_file.txt} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename data/ath/{arabidopsis_gene_metadata_file.txt => ath_gene_metadata_file.txt} (100%) rename data/zma_v4/{maize_v4_gene_metadata_file.txt => zma_v4_gene_metadata_file.txt} (100%) rename data/zma_v5/{maize_v5_gene_metadata_file.txt => zma_v5_gene_metadata_file.txt} (100%) diff --git a/data/ath/arabidopsis_gene_metadata_file.txt b/data/ath/ath_gene_metadata_file.txt similarity index 100% rename from data/ath/arabidopsis_gene_metadata_file.txt rename to data/ath/ath_gene_metadata_file.txt diff --git a/data/zma_v4/maize_v4_gene_metadata_file.txt b/data/zma_v4/zma_v4_gene_metadata_file.txt similarity index 100% rename from data/zma_v4/maize_v4_gene_metadata_file.txt rename to data/zma_v4/zma_v4_gene_metadata_file.txt diff --git a/data/zma_v5/maize_v5_gene_metadata_file.txt b/data/zma_v5/zma_v5_gene_metadata_file.txt similarity index 100% rename from data/zma_v5/maize_v5_gene_metadata_file.txt rename to data/zma_v5/zma_v5_gene_metadata_file.txt From ddc800e282ed4f536a32c7b5ed10e4eb65426ed2 Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 21 Sep 2023 14:20:24 +0200 Subject: [PATCH 07/14] Rename MotMapsFile_gw and *_lb parameters to just MotifMapsFile --- docs/configuration_pipeline.md | 6 +++--- mini_ac.nf | 8 ++++---- workflows/miniac_gw.nf | 4 ++-- workflows/miniac_lb.nf | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/configuration_pipeline.md b/docs/configuration_pipeline.md index d8668f9..4effad5 100644 --- a/docs/configuration_pipeline.md +++ b/docs/configuration_pipeline.md @@ -162,7 +162,7 @@ There are mainly two cases in which the user might want to alter the internal MI ### Modification of the motif mapping file for the locus-based mode of maize -By default, the maize MINI-AC locus-based mode runs on the "medium" non-coding genomic space, which corresponds, for each locus in the genome, to the 5kb upstream of the translation start site, the 1kb downstream of the translation end site, and the introns. However, we generated two additional motif mapping files for the locus-based mode of maize, that cover "large" (15kb upstream of the translation start site, the 2.5kb downstream of the translation end site, and the introns), and "small" (1kb upstream of the translation start site, the 1kb downstream of the translation end site, and the introns) non-coding genomic spaces. For Arabidopsis only the "medium" non-coding genomic space motif mapping file was generated because it already covers 73.5% of the whole non-coding genomic psace (see publication). To use these files, first they need to be downloaded, and then, the corresponding parameters of the motif mapping file (```MotMapsFile_lb```) and the non-coding genomic space coordinates file (```Promoter_file```) should be modified either on the command line or in the configuration file. +By default, the maize MINI-AC locus-based mode runs on the "medium" non-coding genomic space, which corresponds, for each locus in the genome, to the 5kb upstream of the translation start site, the 1kb downstream of the translation end site, and the introns. However, we generated two additional motif mapping files for the locus-based mode of maize, that cover "large" (15kb upstream of the translation start site, the 2.5kb downstream of the translation end site, and the introns), and "small" (1kb upstream of the translation start site, the 1kb downstream of the translation end site, and the introns) non-coding genomic spaces. For Arabidopsis only the "medium" non-coding genomic space motif mapping file was generated because it already covers 73.5% of the whole non-coding genomic psace (see publication). To use these files, first they need to be downloaded, and then, the corresponding parameters of the motif mapping file (```MotMapsFile```) and the non-coding genomic space coordinates file (```Promoter_file```) should be modified either on the command line or in the configuration file. To download the maize "large" motif mapping file and coordinates of the "large" non-coding genomic space: @@ -180,14 +180,14 @@ wget https://zenodo.org/record/7974527/files/zma_promoter_1kbup_1kbdown_sorted.b Then (using the "small" definition as example), change the parameters on the command line: ``` -nextflow -C mini_ac.config run mini_ac.nf --mode locus_based --species maize --MotMapsFile_lb data/zma/zma_locus_based_motif_mappings_1kbup_1kbdown.bed --Promoter_file data/zma/zma_promoter_1kbup_1kbdown_sorted.bed +nextflow -C mini_ac.config run mini_ac.nf --mode locus_based --species maize --MotMapsFile data/zma/zma_locus_based_motif_mappings_1kbup_1kbdown.bed --Promoter_file data/zma/zma_promoter_1kbup_1kbdown_sorted.bed ``` or add them to the configuration file, along with the other parameters: ```nextflow params { /// [Other parameters...] - MotMapsFile_lb = "$projectDir/data/zma/zma_locus_based_motif_mappings_1kbup_1kbdown.bed" + MotMapsFile = "$projectDir/data/zma/zma_locus_based_motif_mappings_1kbup_1kbdown.bed" Promoter_file = "$projectDir/data/zma/zma_promoter_1kbup_1kbdown_sorted.bed" /// [Other parameters...] } diff --git a/mini_ac.nf b/mini_ac.nf index 1181637..fa26379 100755 --- a/mini_ac.nf +++ b/mini_ac.nf @@ -36,17 +36,17 @@ workflow MINIAC { if (params.mode == "genome_wide") { - params.MotMapsFile_gw = "$projectDir/data/${species}/${species}_genome_wide_motif_mappings.bed" + params.MotMapsFile = "$projectDir/data/${species}/${species}_genome_wide_motif_mappings.bed" params.Non_cod_genome = "$projectDir/data/${species}/${species}_noncod_merged.bed" - genome_wide_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.Second_gene_annot, params.Second_gene_dist, params.MotMapsFile_gw, params.Non_cod_genome, params.Faix_file, params.Motif_tf_file, params.Genes_coords, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) + genome_wide_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.Second_gene_annot, params.Second_gene_dist, params.MotMapsFile, params.Non_cod_genome, params.Faix_file, params.Motif_tf_file, params.Genes_coords, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) } else if (params.mode == "locus_based") { - params.MotMapsFile_lb = "$projectDir/data/${species}/${species}_locus_based_motif_mappings_5kbup_1kbdown.bed" + params.MotMapsFile = "$projectDir/data/${species}/${species}_locus_based_motif_mappings_5kbup_1kbdown.bed" params.Promoter_file = "$projectDir/data/${species}/${species}_promoter_5kbup_1kbdown_sorted.bed" - locus_based_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.MotMapsFile_lb, params.Promoter_file, params.Faix_file, params.Motif_tf_file, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) + locus_based_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.MotMapsFile, params.Promoter_file, params.Faix_file, params.Motif_tf_file, params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata) } else { exit 1, "MINI-AC can only be run using the modes 'genome_wide' or 'locus_based'. Instead it got '${params.mode}'." diff --git a/workflows/miniac_gw.nf b/workflows/miniac_gw.nf index 16c61bb..a45560d 100755 --- a/workflows/miniac_gw.nf +++ b/workflows/miniac_gw.nf @@ -274,7 +274,7 @@ workflow genome_wide_miniac { Bps_intersect Second_gene_annot Second_gene_dist - MotMapsFile_gw + MotMapsFile Non_cod_genome Faix_file Motif_tf_file @@ -294,7 +294,7 @@ workflow genome_wide_miniac { parsed_acr = get_ACR_shufflings.out.acr_input .map {n -> [n.baseName.split("_")[0..-2].join("_"), n]} - motmaps_ch = Channel.fromPath(MotMapsFile_gw).ifEmpty { error "There was an error downloading the motif mapping files ${MotMapsFile_gw}" } + motmaps_ch = Channel.fromPath(MotMapsFile).ifEmpty { error "There was an error downloading the motif mapping files ${MotMapsFile}" } input_stats = acr_shufflings_ch.combine(motmaps_ch) diff --git a/workflows/miniac_lb.nf b/workflows/miniac_lb.nf index b5e1bd7..63be59e 100755 --- a/workflows/miniac_lb.nf +++ b/workflows/miniac_lb.nf @@ -273,7 +273,7 @@ workflow locus_based_miniac { One_DE_set P_val Bps_intersect - MotMapsFile_lb + MotMapsFile Promoter_file Faix_file Motif_tf_file @@ -292,7 +292,7 @@ workflow locus_based_miniac { parsed_acr = get_ACR_shufflings.out.acr_input .map {n -> [n.baseName.split("_")[0..-2].join("_"), n]} - motmaps_ch = Channel.fromPath(MotMapsFile_lb).ifEmpty { error "There was an error downloading the motif mapping files ${MotMapsFile_lb}" } + motmaps_ch = Channel.fromPath(MotMapsFile).ifEmpty { error "There was an error downloading the motif mapping files ${MotMapsFile}" } input_stats = acr_shufflings_ch.combine(motmaps_ch) From f8aa8cf7a345193cc36ec603869a1c4ae56e3714 Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 21 Sep 2023 14:37:08 +0200 Subject: [PATCH 08/14] Refine gitignore --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 0d2264c..28f8d23 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ # ignore Nextflow cache and logs .nextflow/ .nextflow.log* + +# ignore Singularity cache +singularity_cache/ + +# ignore large motif mapping files +*motif_mappings*.bed From b4149907d7b33698bf2680c43a72a9a1c0c95ae0 Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 21 Sep 2023 22:50:13 +0200 Subject: [PATCH 09/14] Ignore bin folder --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 28f8d23..26ae0c2 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ singularity_cache/ # ignore large motif mapping files *motif_mappings*.bed + +# ignore bin folder +bin/ \ No newline at end of file From b81569c2fcbb34ef8de6b2825221449a29288250 Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 28 Sep 2023 13:59:51 +0200 Subject: [PATCH 10/14] Revert accidental change of p-value for locus-based --- mini_ac.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mini_ac.nf b/mini_ac.nf index 64cb1b4..935c2bb 100755 --- a/mini_ac.nf +++ b/mini_ac.nf @@ -34,14 +34,13 @@ workflow MINIAC { params.TF_fam_file = "$projectDir/data/${species}/${species}_TF_family_file.txt" params.Genes_metadata = "$projectDir/data/${species}/${species}_gene_metadata_file.txt" - // set defaut p-value - params.P_val = 0.1 - if (params.mode == "genome_wide") { params.MotMapsFile = "$projectDir/data/${species}/${species}_genome_wide_motif_mappings.bed" params.Non_cod_genome = "$projectDir/data/${species}/${species}_noncod_merged.bed" + params.P_val = 0.1 + genome_wide_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, params.Bps_intersect, params.Second_gene_annot, params.Second_gene_dist, params.MotMapsFile, @@ -53,6 +52,8 @@ workflow MINIAC { params.MotMapsFile = "$projectDir/data/${species}/${species}_locus_based_motif_mappings_5kbup_1kbdown.bed" params.Promoter_file = "$projectDir/data/${species}/${species}_promoter_5kbup_1kbdown_sorted.bed" + + params.P_val = 0.01 locus_based_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, From 99ea93bac3ce2364fd2e875f61fa556deb96ebfd Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 28 Sep 2023 14:34:01 +0200 Subject: [PATCH 11/14] Provide gene coords for gw mode only --- mini_ac.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mini_ac.nf b/mini_ac.nf index 935c2bb..05dcfb3 100755 --- a/mini_ac.nf +++ b/mini_ac.nf @@ -29,7 +29,6 @@ workflow MINIAC { // set input data parameters shared between genome-wide and locus-based modes params.Faix_file = "$projectDir/data/${species}/${species}.fasta.fai" params.Motif_tf_file = "$projectDir/data/${species}/${species}_motif_TF_file.txt" - params.Genes_coords = "$projectDir/data/${species}/${species}_genes_coords_sorted.bed" params.Feature_file = "$projectDir/data/${species}/${species}_go_gene_file.txt" params.TF_fam_file = "$projectDir/data/${species}/${species}_TF_family_file.txt" params.Genes_metadata = "$projectDir/data/${species}/${species}_gene_metadata_file.txt" @@ -38,6 +37,7 @@ workflow MINIAC { params.MotMapsFile = "$projectDir/data/${species}/${species}_genome_wide_motif_mappings.bed" params.Non_cod_genome = "$projectDir/data/${species}/${species}_noncod_merged.bed" + params.Genes_coords = "$projectDir/data/${species}/${species}_genes_coords_sorted.bed" params.P_val = 0.1 From b5637eea393ef7038557525f237993879460a39c Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 28 Sep 2023 14:34:19 +0200 Subject: [PATCH 12/14] Correct test input file paths due to rename --- tests/mini_ac.nf.test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/mini_ac.nf.test b/tests/mini_ac.nf.test index 5eb172e..2fd20b2 100644 --- a/tests/mini_ac.nf.test +++ b/tests/mini_ac.nf.test @@ -29,7 +29,7 @@ nextflow_workflow { Genes_coords = "${baseDir}/data/zma_v4/zma_v4_genes_coords_sorted.bed" Feature_file = "${baseDir}/data/zma_v4/zma_v4_go_gene_file.txt" TF_fam_file = "${baseDir}/data/zma_v4/zma_v4_TF_family_file.txt" - Genes_metadata = "${baseDir}/data/zma_v4/maize_v4_gene_metadata_file.txt" + Genes_metadata = "${baseDir}/data/zma_v4/zma_v4_gene_metadata_file.txt" OBO_file = "${baseDir}/data/ontologies/go.obo" //// Output folder @@ -103,7 +103,7 @@ nextflow_workflow { Motif_tf_file = "${baseDir}/data/zma_v4/zma_v4_motif_TF_file.txt" Feature_file = "${baseDir}/data/zma_v4/zma_v4_go_gene_file.txt" TF_fam_file = "${baseDir}/data/zma_v4/zma_v4_TF_family_file.txt" - Genes_metadata = "${baseDir}/data/zma_v4/maize_v4_gene_metadata_file.txt" + Genes_metadata = "${baseDir}/data/zma_v4/zma_v4_gene_metadata_file.txt" OBO_file = "${baseDir}/data/ontologies/go.obo" //// Output folder @@ -172,7 +172,7 @@ nextflow_workflow { Genes_coords = "${baseDir}/data/ath/ath_genes_coords_sorted.bed" Feature_file = "${baseDir}/data/ath/ath_go_gene_file.txt" TF_fam_file = "${baseDir}/data/ath/ath_TF_family_file.txt" - Genes_metadata = "${baseDir}/data/ath/arabidopsis_gene_metadata_file.txt" + Genes_metadata = "${baseDir}/data/ath/ath_gene_metadata_file.txt" OBO_file = "${baseDir}/data/ontologies/go.obo" //// Output folder @@ -244,7 +244,7 @@ nextflow_workflow { Motif_tf_file = "${baseDir}/data/ath/ath_motif_TF_file.txt" Feature_file = "${baseDir}/data/ath/ath_go_gene_file.txt" TF_fam_file = "${baseDir}/data/ath/ath_TF_family_file.txt" - Genes_metadata = "${baseDir}/data/ath/arabidopsis_gene_metadata_file.txt" + Genes_metadata = "${baseDir}/data/ath/ath_gene_metadata_file.txt" OBO_file = "${baseDir}/data/ontologies/go.obo" //// Output folder From d70ee8dabf081f3ad08b6a1ece414472655a1999 Mon Sep 17 00:00:00 2001 From: Herman De Beukelaer Date: Thu, 28 Sep 2023 16:18:30 +0200 Subject: [PATCH 13/14] Pass full params object from main workflow to gw and lb workflows --- mini_ac.nf | 13 ++------ workflows/miniac_gw.nf | 70 +++++++++++++++--------------------------- workflows/miniac_lb.nf | 67 ++++++++++++++-------------------------- 3 files changed, 49 insertions(+), 101 deletions(-) diff --git a/mini_ac.nf b/mini_ac.nf index 05dcfb3..4f81ca6 100755 --- a/mini_ac.nf +++ b/mini_ac.nf @@ -41,12 +41,7 @@ workflow MINIAC { params.P_val = 0.1 - genome_wide_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, - params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, - params.Bps_intersect, params.Second_gene_annot, params.Second_gene_dist, params.MotMapsFile, - params.Non_cod_genome, params.Faix_file, params.Motif_tf_file, params.Genes_coords, params.Feature_file, - params.OBO_file, params.TF_fam_file, params.Genes_metadata, params.Shuffle_count, params.Shuffle_seed, - params.Csv_output) + genome_wide_miniac(params) } else if (params.mode == "locus_based") { @@ -55,11 +50,7 @@ workflow MINIAC { params.P_val = 0.01 - locus_based_miniac(params.OutDir, params.ACR_dir, params.Filter_set_genes, params.Set_genes_dir, - params.One_filtering_set, params.DE_genes, params.DE_genes_dir, params.One_DE_set, params.P_val, - params.Bps_intersect, params.MotMapsFile, params.Promoter_file, params.Faix_file, params.Motif_tf_file, - params.Feature_file, params.OBO_file, params.TF_fam_file, params.Genes_metadata, params.Shuffle_count, - params.Shuffle_seed, params.Csv_output) + locus_based_miniac(params) } else { exit 1, "MINI-AC can only be run using the modes 'genome_wide' or 'locus_based'. Instead it got '${params.mode}'." diff --git a/workflows/miniac_gw.nf b/workflows/miniac_gw.nf index c079b7f..fe5e551 100755 --- a/workflows/miniac_gw.nf +++ b/workflows/miniac_gw.nf @@ -268,53 +268,30 @@ process getIntegrativeOutputs { workflow genome_wide_miniac { take: - OutDir - ACR_dir - Filter_set_genes - Set_genes_dir - One_filtering_set - DE_genes - DE_genes_dir - One_DE_set - P_val - Bps_intersect - Second_gene_annot - Second_gene_dist - MotMapsFile - Non_cod_genome - Faix_file - Motif_tf_file - Genes_coords - Feature_file - OBO_file - TF_fam_file - Genes_metadata - Shuffle_count - Shuffle_seed - Csv_output + params main: - if (!file(MotMapsFile).exists()) { error "Please make sure that you downloaded the motif mapping files as described in the documentation." } + if (!file(params.MotMapsFile).exists()) { error "Please make sure that you downloaded the motif mapping files as described in the documentation." } - ACR_files = Channel.fromPath("${ACR_dir}/*.bed").ifEmpty { error "No *.bed files could be found in the specified ACR directory ${ACR_dir}" } + ACR_files = Channel.fromPath("${params.ACR_dir}/*.bed").ifEmpty { error "No *.bed files could be found in the specified ACR directory ${params.ACR_dir}" } - get_ACR_shufflings(ACR_files, Faix_file, Non_cod_genome, Shuffle_count, Shuffle_seed) + get_ACR_shufflings(ACR_files, params.Faix_file, params.Non_cod_genome, params.Shuffle_count, params.Shuffle_seed) acr_shufflings_ch = get_ACR_shufflings.out.shufflings parsed_acr = get_ACR_shufflings.out.acr_input .map {n -> [n.baseName.split("_")[0..-2].join("_"), n]} - motmaps_ch = Channel.fromPath(MotMapsFile) + motmaps_ch = Channel.fromPath(params.MotMapsFile) input_stats = acr_shufflings_ch.combine(motmaps_ch) - if (Bps_intersect == false) { + if (params.Bps_intersect == false) { script_proc_stats = "${projectDir}/bin/processStats_gw.py" - getStats(input_stats, script_proc_stats, Motif_tf_file, OutDir, Shuffle_count) + getStats(input_stats, script_proc_stats, params.Motif_tf_file, params.OutDir, params.Shuffle_count) stats_ch = getStats.out.proc_stats .map { n -> [n.BaseName.split("_")[0..-5].join("_"), n]} @@ -325,7 +302,7 @@ workflow genome_wide_miniac { script_proc_stats_bps = "${projectDir}/bin/processStats_bps_gw.py" - getStats_bps(input_stats, script_proc_stats_bps, Motif_tf_file, OutDir, Shuffle_count) + getStats_bps(input_stats, script_proc_stats_bps, params.Motif_tf_file, params.OutDir, params.Shuffle_count) stats_ch = getStats_bps.out.proc_stats .map { n -> [n.BaseName.split("_")[0..-5].join("_"), n]} @@ -337,18 +314,19 @@ workflow genome_wide_miniac { script_getnetwork = "${projectDir}/bin/getNetwork_gw.py" - getNetwork(stats_acr_motmaps_ch, Genes_coords, script_getnetwork, Motif_tf_file, Second_gene_dist, Second_gene_annot, P_val, OutDir) + getNetwork(stats_acr_motmaps_ch, params.Genes_coords, script_getnetwork, params.Motif_tf_file, + params.Second_gene_dist, params.Second_gene_annot, params.P_val, params.OutDir) networks = getNetwork.out - if (Filter_set_genes == true) { + if (params.Filter_set_genes == true) { filteringScript = "${projectDir}/bin/filterNetwork_gw.py" - filt_set_files = Channel.fromPath("${Set_genes_dir}/*.txt") - .ifEmpty { error "Cannot find any directory: ${Set_genes_dir}" } + filt_set_files = Channel.fromPath("${params.Set_genes_dir}/*.txt") + .ifEmpty { error "Cannot find any directory: ${params.Set_genes_dir}" } - if (One_filtering_set == true) { + if (params.One_filtering_set == true) { networks_gene_set = networks.combine(filt_set_files) @@ -373,7 +351,7 @@ workflow genome_wide_miniac { } - filterSetOfGenes(filteringScript, networks_gene_set, OutDir) + filterSetOfGenes(filteringScript, networks_gene_set, params.OutDir) net_tuple = filterSetOfGenes.out .map { n -> [n.BaseName.split("_")[0..-3].join("_"), n]} @@ -395,21 +373,20 @@ workflow genome_wide_miniac { script_add_go_names = "${projectDir}/bin/add_go_names.py" script_filter_reduced = "${projectDir}/bin/FilterReduceGO.py" - GOenrichment(net_tuple, Feature_file, - script_enricher, script_reduce_go, - script_add_go_names, OBO_file, script_filter_reduced, OutDir) + GOenrichment(net_tuple, params.Feature_file, script_enricher, script_reduce_go, + script_add_go_names, params.OBO_file, script_filter_reduced, params.OutDir) go_tuple = GOenrichment.out .map { n -> [n.BaseName.split("_")[0..-3].join("_"), n]} int_input = int_input.join(go_tuple) - if (DE_genes == true) { + if (params.DE_genes == true) { - de_files = Channel.fromPath("${DE_genes_dir}/*.txt") - .ifEmpty { error "Cannot find any directory: ${Set_genes_dir}" } + de_files = Channel.fromPath("${params.DE_genes_dir}/*.txt") + .ifEmpty { error "Cannot find any directory: ${params.Set_genes_dir}" } - if (One_DE_set == true) { + if (params.One_DE_set == true) { int_input = int_input.combine(de_files) @@ -435,7 +412,8 @@ workflow genome_wide_miniac { script_go_file = "${projectDir}/bin/getGO_xlsx_gw.py" script_net_files = "${projectDir}/bin/getNetVisualizationOutput_gw.py" - getIntegrativeOutputs(int_input, Motif_tf_file, TF_fam_file, Genes_metadata, P_val, Filter_set_genes, DE_genes, - script_tf_file, script_motif_file, script_go_file, script_net_files, OutDir, Csv_output) + getIntegrativeOutputs(int_input, params.Motif_tf_file, params.TF_fam_file, params.Genes_metadata, params.P_val, + params.Filter_set_genes, params.DE_genes, script_tf_file, script_motif_file, script_go_file, + script_net_files, params.OutDir, params.Csv_output) } diff --git a/workflows/miniac_lb.nf b/workflows/miniac_lb.nf index 15c3848..e84fd43 100755 --- a/workflows/miniac_lb.nf +++ b/workflows/miniac_lb.nf @@ -267,52 +267,31 @@ process getIntegrativeOutputs { } workflow locus_based_miniac { - take: - OutDir - ACR_dir - Filter_set_genes - Set_genes_dir - One_filtering_set - DE_genes - DE_genes_dir - One_DE_set - P_val - Bps_intersect - MotMapsFile - Promoter_file - Faix_file - Motif_tf_file - Feature_file - OBO_file - TF_fam_file - Genes_metadata - Shuffle_count - Shuffle_seed - Csv_output + params main: - if (!file(MotMapsFile).exists()) { error "Please make sure that you downloaded the motif mapping files as described in the documentation." } + if (!file(params.MotMapsFile).exists()) { error "Please make sure that you downloaded the motif mapping files as described in the documentation." } - ACR_files = Channel.fromPath("${ACR_dir}/*.bed").ifEmpty { error "No *.bed files could be found in the specified ACR directory ${ACR_dir}" } + ACR_files = Channel.fromPath("${params.ACR_dir}/*.bed").ifEmpty { error "No *.bed files could be found in the specified ACR directory ${params.ACR_dir}" } - get_ACR_shufflings(ACR_files, Faix_file, Promoter_file, Shuffle_count, Shuffle_seed) + get_ACR_shufflings(ACR_files, params.Faix_file, params.Promoter_file, params.Shuffle_count, params.Shuffle_seed) acr_shufflings_ch = get_ACR_shufflings.out.shufflings parsed_acr = get_ACR_shufflings.out.acr_input .map {n -> [n.baseName.split("_")[0..-2].join("_"), n]} - motmaps_ch = Channel.fromPath(MotMapsFile) + motmaps_ch = Channel.fromPath(params.MotMapsFile) input_stats = acr_shufflings_ch.combine(motmaps_ch) - if (Bps_intersect == false) { + if (params.Bps_intersect == false) { script_proc_stats = "${projectDir}/bin/processStats_lb.py" - getStats(input_stats, script_proc_stats, Motif_tf_file, Promoter_file, OutDir, Shuffle_count) + getStats(input_stats, script_proc_stats, params.Motif_tf_file, params.Promoter_file, params.OutDir, params.Shuffle_count) stats_ch = getStats.out.proc_stats .map { n -> [n.BaseName.split("_")[0..-5].join("_"), n]} @@ -322,7 +301,7 @@ workflow locus_based_miniac { script_proc_stats_bps = "${projectDir}/bin/processStats_bps_lb.py" - getStats_bps(input_stats, script_proc_stats_bps, Motif_tf_file, Promoter_file, OutDir, Shuffle_count) + getStats_bps(input_stats, script_proc_stats_bps, params.Motif_tf_file, params.Promoter_file, params.OutDir, params.Shuffle_count) stats_ch = getStats_bps.out.proc_stats .map { n -> [n.BaseName.split("_")[0..-5].join("_"), n]} @@ -334,18 +313,18 @@ workflow locus_based_miniac { script_getnetwork = "${projectDir}/bin/getNetwork_lb.py" - getNetwork(stats_acr_motmaps_ch, Promoter_file, script_getnetwork, Motif_tf_file, P_val, OutDir) + getNetwork(stats_acr_motmaps_ch, params.Promoter_file, script_getnetwork, params.Motif_tf_file, params.P_val, params.OutDir) networks = getNetwork.out - if (Filter_set_genes == true) { + if (params.Filter_set_genes == true) { filteringScript = "${projectDir}/bin/filterNetwork_lb.py" - filt_set_files = Channel.fromPath("${Set_genes_dir}/*.txt") - .ifEmpty { error "Cannot find any directory: ${Set_genes_dir}" } + filt_set_files = Channel.fromPath("${params.Set_genes_dir}/*.txt") + .ifEmpty { error "Cannot find any directory: ${params.Set_genes_dir}" } - if (One_filtering_set == true) { + if (params.One_filtering_set == true) { networks_gene_set = networks.combine(filt_set_files) @@ -370,7 +349,7 @@ workflow locus_based_miniac { } - filterSetOfGenes(filteringScript, networks_gene_set, OutDir) + filterSetOfGenes(filteringScript, networks_gene_set, params.OutDir) net_tuple = filterSetOfGenes.out .map { n -> [n.BaseName.split("_")[0..-3].join("_"), n]} @@ -393,21 +372,20 @@ workflow locus_based_miniac { script_filter_reduced = "${projectDir}/bin/FilterReduceGO.py" - GOenrichment(net_tuple, Feature_file, - script_enricher, script_reduce_go, - script_add_go_names, OBO_file, script_filter_reduced, OutDir) + GOenrichment(net_tuple, params.Feature_file, script_enricher, script_reduce_go, + script_add_go_names, params.OBO_file, script_filter_reduced, params.OutDir) go_tuple = GOenrichment.out .map { n -> [n.BaseName.split("_")[0..-3].join("_"), n]} int_input = int_input.join(go_tuple) - if (DE_genes == true) { + if (params.DE_genes == true) { - de_files = Channel.fromPath("${DE_genes_dir}/*.txt") - .ifEmpty { error "Cannot find any directory: ${Set_genes_dir}" } + de_files = Channel.fromPath("${params.DE_genes_dir}/*.txt") + .ifEmpty { error "Cannot find any directory: ${params.Set_genes_dir}" } - if (One_DE_set == true) { + if (params.One_DE_set == true) { int_input = int_input.combine(de_files) @@ -433,7 +411,8 @@ workflow locus_based_miniac { script_go_file = "${projectDir}/bin/getGO_xlsx_lb.py" script_net_files = "${projectDir}/bin/getNetVisualizationOutput_lb.py" - getIntegrativeOutputs(int_input, Motif_tf_file, TF_fam_file, Genes_metadata, P_val, Filter_set_genes, DE_genes, - script_tf_file, script_motif_file, script_go_file, script_net_files, OutDir, Csv_output) + getIntegrativeOutputs(int_input, params.Motif_tf_file, params.TF_fam_file, params.Genes_metadata, params.P_val, + params.Filter_set_genes, params.DE_genes, script_tf_file, script_motif_file, script_go_file, + script_net_files, params.OutDir, params.Csv_output) } From bef0d0769411f5c2bac047a43b52f3ee3a55f66d Mon Sep 17 00:00:00 2001 From: Nicolas MANOSALVA PEREZ Date: Fri, 29 Sep 2023 09:53:34 +0200 Subject: [PATCH 14/14] update on .gitignore --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index c612040..573806b 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,10 @@ tests/outputs/ # ignore SLURM output and error files slurm.*.out slurm.*.err + +# ignore jupyter notebook checkpoints +.ipynb_checkpoints/ + +# python cache and compiled files +__pycache__/ +*.pyc \ No newline at end of file