Add basic description to README, fix remote YAML in test
Jim Downie committed Dec 20, 2024
1 parent 3970d41 commit 9d3f933
Showing 7 changed files with 41 additions and 58 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ null/
co2footprint*
.nf-test/
.nf-test.log
.vscode
60 changes: 31 additions & 29 deletions README.md
@@ -2,49 +2,53 @@

## Introduction

**sanger-tol/longreadmag** is a bioinformatics pipeline that ...

<!-- TODO nf-core:
Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the
major pipeline sections and the types of output it produces. You're giving an overview to someone new
to nf-core here, in 15-20 seconds. For an example, see https://github.com/nf-core/rnaseq/blob/master/README.md#introduction
-->
**sanger-tol/longreadmag** is a bioinformatics pipeline for the assembly and binning of metagenomes
using PacBio HiFi data and (optionally) Hi-C Illumina data.

<!-- TODO nf-core: Include a figure that guides the user through the major workflow steps. Many nf-core
workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. -->
<!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline -->

1. Assembles raw reads using [`metaMDBG`](https://github.com/GaetanBenoitDev/metaMDBG)
2. Maps HiFi and (optionally) Hi-C reads to the assembly
3. Bins the assembly using MetaBat2, MaxBin2, Bin3C, and Metator
4. (Optionally) refines the bins using DAS_Tool or MagScoT
5. Assesses the completeness and contamination of bins using CheckM2 and their ncRNA content using Prokka
6. Assigns taxonomy to bins of medium quality and above using GTDB-Tk
7. Summarises information at the bin level

## Usage

> [!NOTE]
> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.
<!-- TODO nf-core: Describe the minimum required steps to execute the pipeline, e.g. how to prepare samplesheets.
Explain what rows and columns represent. For instance (please edit as appropriate):
First, prepare a samplesheet with your input data that looks as follows:
`samplesheet.csv`:
```csv
sample,fastq_1,fastq_2
CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
First, prepare a YAML with your input data that looks as follows:

`input.yaml`:

```yaml
id: SampleName
pacbio:
  fasta:
    - /path/to/pacbio/file1.fasta.gz
    - /path/to/pacbio/file2.fasta.gz
    - ...
hic:
  cram:
    - /path/to/hic/hic1.cram
    - /path/to/hic/hic2.cram
    - ...
  enzymes:
    - enzyme_name_1 (e.g. DpnII)
    - enzyme_name_2 (e.g. HinfI)
    - ...
```
Each row represents a fastq file (single-end) or a pair of fastq files (paired end).
-->
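Before launching, the structure above can be checked programmatically. A minimal sketch that validates the parsed mapping — a plain Python dict stands in for the parsed YAML, and the checks are assumptions inferred from the schema shown, not pipeline code:

```python
def check_input(data: dict) -> list[str]:
    """Return a list of problems with a parsed input mapping, based on the
    schema shown above: id and pacbio.fasta are required; hic is optional
    but needs both cram and enzymes lists when present."""
    problems = []
    if "id" not in data:
        problems.append("missing 'id'")
    if not data.get("pacbio", {}).get("fasta"):
        problems.append("'pacbio.fasta' must list at least one file")
    hic = data.get("hic")
    if hic is not None and not (hic.get("cram") and hic.get("enzymes")):
        problems.append("'hic' needs both 'cram' and 'enzymes' lists")
    return problems

good = {"id": "SampleName", "pacbio": {"fasta": ["file1.fasta.gz"]}}
bad = {"id": "SampleName", "pacbio": {}, "hic": {"cram": ["hic1.cram"]}}
print(check_input(good))  # []
print(check_input(bad))   # two problems reported
```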

Now, you can run the pipeline using:
<!-- TODO nf-core: update the following command to include all required parameters for a minimal example -->

```bash
nextflow run sanger-tol/longreadmag \
-profile <docker/singularity/.../institute> \
--input samplesheet.csv \
--input input.yaml \
--outdir <OUTDIR>
```

@@ -55,9 +59,7 @@ nextflow run sanger-tol/longreadmag \

sanger-tol/longreadmag was originally written by Jim Downie, Will Eagles, and Noah Gettle.

We thank the following people for their extensive assistance in the development of this pipeline:

<!-- TODO nf-core: If applicable, make list of people who have also contributed -->
<!-- We thank the following people for their extensive assistance in the development of this pipeline: -->

## Contributions and Support

3 changes: 1 addition & 2 deletions conf/test_full.config
@@ -15,6 +15,5 @@ params {
config_profile_description = 'Full test dataset to check pipeline function'

// Input data for full size test
// Input data
input = "${baseDir}/assets/test_full_input.yaml"
input = params.pipelines_testdata_base_path + "test_input_full.yaml"
}
11 changes: 5 additions & 6 deletions subworkflows/local/yaml_input.nf
@@ -1,7 +1,7 @@
#!/usr/bin/env nextflow

def readYAML( yamlfile ) {
return new org.yaml.snakeyaml.Yaml().load( new FileReader( yamlfile.toString() ) )
return new org.yaml.snakeyaml.Yaml().load(yamlfile.text)
}
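The fix above swaps `new FileReader(yamlfile.toString())` for `yamlfile.text`, so SnakeYAML parses the file's contents directly rather than a reader built from a stringified path, which only works for plain local files. A rough Python analogue of the two behaviours (illustrative only, not pipeline code):

```python
from pathlib import Path
import tempfile

# Write a small YAML-like file to read back.
tmp = Path(tempfile.mkdtemp()) / "input.yaml"
tmp.write_text("id: SampleName\n")

# Old approach: stringify the path and build a reader from it.
# This only works when the path is a plain local file path.
with open(str(tmp)) as handle:
    old_style = handle.read()

# New approach: ask the file object for its text directly, letting the
# file abstraction (Nextflow's `file.text`) handle remote locations too.
new_style = tmp.read_text()

assert old_style == new_style  # identical for local files
print(new_style.strip())
```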

workflow YAML_INPUT {
@@ -10,9 +10,8 @@ workflow YAML_INPUT {

main:
// ch_versions = Channel.empty()

yamlfile = Channel.from(input_file)
.map { file -> readYAML(file) }
yamlfile = Channel.fromPath(input_file)
| map { file -> readYAML(file) }

//
// LOGIC: PARSES THE TOP LEVEL OF YAML VALUES
@@ -21,11 +20,11 @@
| flatten()
| multiMap { data ->
pacbio_fasta: ( data.pacbio ? [
[ id: data.tolid ],
[ id: data.id ],
data.pacbio.fasta.collect { file(it, checkIfExists: true) }
] : error("ERROR: Pacbio reads not provided! Pipeline will not run as there is nothing to do.") )
hic_cram: ( data.hic ? [
[ id: data.tolid ],
[ id: data.id ],
data.hic.cram.collect { file(it, checkIfExists: true) }
] : [] )
hic_enzymes: ( data.hic ?
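The `multiMap` hunk above routes the parsed YAML into per-datatype channels keyed by `id` (the commit's rename from `tolid`), erroring out when PacBio reads are absent. A hedged Python sketch of the same routing logic, assuming the YAML has already been parsed into a dict:

```python
def route_input(data: dict) -> dict:
    """Split a parsed input mapping into per-datatype records, mirroring
    the multiMap logic: pacbio is required, hic is optional."""
    if not data.get("pacbio"):
        raise SystemExit(
            "ERROR: Pacbio reads not provided! "
            "Pipeline will not run as there is nothing to do."
        )
    meta = {"id": data["id"]}  # metadata map keyed by id, as in the diff
    return {
        "pacbio_fasta": (meta, data["pacbio"]["fasta"]),
        "hic_cram": (meta, data["hic"]["cram"]) if data.get("hic") else (),
        "hic_enzymes": data["hic"]["enzymes"] if data.get("hic") else (),
    }

channels = route_input({
    "id": "SampleName",
    "pacbio": {"fasta": ["reads1.fasta.gz"]},
    "hic": {"cram": ["hic1.cram"], "enzymes": ["DpnII"]},
})
print(channels["pacbio_fasta"])  # ({'id': 'SampleName'}, ['reads1.fasta.gz'])
```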
3 changes: 2 additions & 1 deletion tests/main.nf.test
@@ -8,6 +8,7 @@ nextflow_pipeline {
when {
params {
outdir = "${outputDir}"
enable_gtdbtk = false
}
}

@@ -22,7 +23,7 @@ nextflow_pipeline {
// Number of successful tasks
workflow.trace.succeeded().size(),
// pipeline versions.yml file for multiqc from which the Nextflow version is removed because we test pipelines on multiple Nextflow versions
removeNextflowVersion("$outputDir/pipeline_info/nf_core_rnaseq_software_mqc_versions.yml"),
removeNextflowVersion("$outputDir/pipeline_info/sangertol_longreadmag_pipeline_software_mqc_versions.yml"),
// All stable path name, with a relative path
stable_name,
// All files with stable contents
19 changes: 0 additions & 19 deletions tests/main.nf.test.snap

This file was deleted.

2 changes: 1 addition & 1 deletion workflows/longreadmag.nf
@@ -150,7 +150,7 @@ workflow LONGREADMAG {
softwareVersionsToYAML(ch_versions)
.collectFile(
storeDir: "${params.outdir}/pipeline_info",
name: 'longreadmag_' + 'pipeline_software_' + 'mqc_' + 'versions.yml',
name: 'sangertol_longreadmag_' + 'pipeline_software_' + 'mqc_' + 'versions.yml',
sort: true,
newLine: true
).set { ch_collated_versions }
