nextstrain · joverlee521 · Jun 19, 2024 · Jun 21, 2024 · Jun 24, 2024 · Jun 24, 2024
diff --git a/config/auspice_config_h5n1-cattle-outbreak.json b/config/auspice_config_h5n1-cattle-outbreak.json
@@ -80,6 +80,11 @@
       "title": "Subtype",
       "type": "categorical"
     },
+    {
+      "key": "clade",
+      "title": "Nextclade Clade",
+      "type": "categorical"
+    },
     {
       "key": "furin_cleavage_motif",
       "title": "Furin Cleavage Motif",
@@ -127,6 +132,7 @@
     "country",
     "division",
     "subtype",
+    "clade",
     "author",
     "originating_lab",
     "submitting_lab",

diff --git a/config/auspice_config_h5n1.json b/config/auspice_config_h5n1.json
@@ -39,7 +39,7 @@
       "key": "division",
       "title": "Admin Division",
       "type": "categorical"
-    },    
+    },
     {
       "key": "host",
       "title": "Host",
@@ -65,6 +65,11 @@
       "title": "GISAID Clade",
       "type": "categorical"
     },
+    {
+      "key": "clade",
+      "title": "Nextclade Clade",
+      "type": "categorical"
+    },
     {
       "key": "furin_cleavage_motif",
       "title": "Furin Cleavage Motif",
@@ -109,6 +114,7 @@
     "subtype",
     "h5_label_clade",
     "gisaid_clade",
+    "clade",
     "authors",
     "originating_lab",
     "submitting_lab"

diff --git a/config/auspice_config_h5nx.json b/config/auspice_config_h5nx.json
@@ -65,6 +65,11 @@
       "title": "GISAID Clade",
       "type": "categorical"
     },
+    {
+      "key": "clade",
+      "title": "Nextclade Clade",
+      "type": "categorical"
+    },
     {
       "key": "furin_cleavage_motif",
       "title": "Furin Cleavage Motif",
@@ -109,6 +114,7 @@
     "subtype",
     "h5_label_clade",
     "gisaid_clade",
+    "clade",
     "authors",
     "originating_lab",
     "submitting_lab"

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -20,9 +20,11 @@ rule upload_all:
     input:
         sequences=expand("fauna/s3/sequences_{segment}.done", segment=config["segments"]),
         metadata="fauna/s3/metadata.done",
+        nextclade="fauna/s3/nextclade.done",
 
 include: "rules/ingest_fauna.smk"
 include: "rules/merge_segment_metadata.smk"
+include: "rules/nextclade.smk"
 include: "rules/upload_to_s3.smk"
 
 # Allow users to import custom rules provided via the config.

diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile
@@ -40,6 +40,7 @@ rule upload_all_ncbi:
         expand([
             "{data_source}/s3/sequences_{segment}.done",
             "{data_source}/s3/metadata.done",
+            "{data_source}/s3/nextclade.done",
         ], data_source=NCBI_DATA_SOURCES, segment=config["segments"]),
 
 

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -10,3 +10,8 @@ segments:
 
 s3_dst:
   fauna: s3://nextstrain-data-private/files/workflows/avian-flu
+
+# Settings for the Nextclade rules (rules/nextclade.smk)
+nextclade:
+  # Dataset name passed to `nextclade3 dataset get --name`
+  dataset_name: community/moncla-lab/iav-h5/ha/all-clades
+  # TSV mapping Nextclade output column names to metadata column names
+  field_map: defaults/nextclade_field_map.tsv
+  # Nextclade output column used as the key when joining with the metadata
+  id_field: seqName
diff --git a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv
@@ -0,0 +1,28 @@
+# TSV file that maps column names in the Nextclade output TSV to new column names.
+# The first column is the original column name in the Nextclade TSV.
+# The second column is the new column name to use in the final metadata TSV.
+# Nextclade can have pathogen-specific output columns, so make sure to check which
+# columns would be useful for your downstream phylogenetic analysis.
+seqName	seqName
+clade	clade
+coverage	coverage
+totalMissing	missing_data
+totalSubstitutions	divergence
+totalNonACGTNs	nonACGTN
+qc.overallStatus	QC_overall
+qc.missingData.status	QC_missing_data
+qc.mixedSites.status	QC_mixed_sites
+qc.privateMutations.status	QC_rare_mutations
+qc.snpClusters.status	QC_snp_clusters
+qc.frameShifts.status	QC_frame_shifts
+qc.stopCodons.status	QC_stop_codons
+frameShifts	frame_shifts
+privateNucMutations.reversionSubstitutions	private_reversion_substitutions
+privateNucMutations.labeledSubstitutions	private_labeled_substitutions
+privateNucMutations.unlabeledSubstitutions	private_unlabeled_substitutions
+privateNucMutations.totalReversionSubstitutions	private_total_reversion_substitutions
+privateNucMutations.totalLabeledSubstitutions	private_total_labeled_substitutions
+privateNucMutations.totalUnlabeledSubstitutions	private_total_unlabeled_substitutions
+privateNucMutations.totalPrivateSubstitutions	private_total_private_substitutions
+qc.snpClusters.clusteredSNPs	private_snp_clusters
+qc.snpClusters.totalSNPs	private_total_snp_clusters
diff --git a/ingest/rules/merge_segment_metadata.smk b/ingest/rules/merge_segment_metadata.smk
@@ -16,7 +16,7 @@ rule merge_segment_metadata:
         segments = expand("{{data_source}}/data/metadata_{segment}.tsv", segment=config["segments"]),
         metadata = "{data_source}/data/metadata_ha.tsv",
     output:
-        metadata = "{data_source}/results/metadata.tsv",
+        metadata = "{data_source}/data/merged_segment_metadata.tsv",
     shell:
         """
         python scripts/add_segment_counts.py \

diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -0,0 +1,78 @@
+"""
+This part of the workflow handles running Nextclade on the curated metadata
+and sequences.
+"""
+
+
+# Nextclade dataset name from the config; also used to build the local zip path
+DATASET_NAME = config["nextclade"]["dataset_name"]
+
+
+rule get_nextclade_dataset:
+    """
+    Download the Nextclade dataset named by DATASET_NAME as a zip archive
+    under data/nextclade/.
+    """
+    output:
+        dataset=f"data/nextclade/{DATASET_NAME}.zip",
+    benchmark:
+        "benchmarks/get_nextclade_dataset.txt"
+    params:
+        dataset_name=DATASET_NAME
+    # The output path is built from the configured dataset name, so it is
+    # shell-quoted (:q) just like the name itself.
+    shell:
+        """
+        nextclade3 dataset get \
+            --name={params.dataset_name:q} \
+            --output-zip={output.dataset:q} \
+            --verbose
+        """
+
+
+rule run_nextclade:
+    """
+    Run Nextclade on the HA segment sequences of a data source using the
+    downloaded dataset, writing the results as a TSV.
+    """
+    input:
+        dataset=f"data/nextclade/{DATASET_NAME}.zip",
+        # The H5NX datasets should only be for the HA segment
+        sequences="{data_source}/results/sequences_ha.fasta",
+    output:
+        nextclade="{data_source}/results/nextclade.tsv",
+    benchmark:
+        "{data_source}/benchmarks/run_nextclade.txt"
+    # Paths depend on the configured dataset name and the data_source
+    # wildcard, so they are shell-quoted (:q) before interpolation.
+    shell:
+        """
+        nextclade3 run \
+            {input.sequences:q} \
+            --input-dataset {input.dataset:q} \
+            --output-tsv {output.nextclade:q}
+        """
+
+
+rule join_metadata_and_nextclade:
+    """
+    Append selected Nextclade columns to the merged segment metadata.
+
+    The field map TSV lists the Nextclade columns to keep (first column) and
+    the names they are renamed to (second column). The Nextclade rows are
+    joined onto the metadata by ID, and the Nextclade ID column is dropped
+    from the final output.
+    """
+    input:
+        nextclade="{data_source}/results/nextclade.tsv",
+        metadata="{data_source}/data/merged_segment_metadata.tsv",
+        nextclade_field_map=config["nextclade"]["field_map"],
+    output:
+        metadata="{data_source}/results/metadata.tsv",
+    params:
+        # Making this param optional because we don't have curate pipeline for fauna data
+        metadata_id_field=config.get("curate", {}).get("output_id_field", "strain"),
+        nextclade_id_field=config["nextclade"]["id_field"],
+    # SUBSET_FIELDS is the comma-separated first column of the field map,
+    # ignoring comment lines. The fill value passed to --write-all is quoted
+    # as '?' so the shell cannot glob-expand it to a single-character filename
+    # in the working directory; "$SUBSET_FIELDS" is quoted to avoid word
+    # splitting and globbing of the column list.
+    shell:
+        """
+        export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
+
+        csvtk fix-quotes -t {input.nextclade} \
+        | csvtk -t cut -f "$SUBSET_FIELDS" \
+        | csvtk -t rename2 \
+            -F \
+            -f '*' \
+            -p '(.+)' \
+            -r '{{kv}}' \
+            -k {input.nextclade_field_map} \
+        | csvtk del-quotes -t \
+        | tsv-join -H \
+            --filter-file - \
+            --key-fields {params.nextclade_id_field} \
+            --data-fields {params.metadata_id_field} \
+            --append-fields '*' \
+            --write-all '?' \
+            {input.metadata} \
+        | tsv-select -H --exclude {params.nextclade_id_field} \
+            > {output.metadata}
+        """
diff --git a/ingest/rules/upload_to_s3.smk b/ingest/rules/upload_to_s3.smk
@@ -37,3 +37,21 @@ rule upload_metadata:
             {params.s3_dst:q}/metadata.tsv.zst \
             {params.cloudfront_domain} 2>&1 | tee {output.flag}
         """
+
+
+rule upload_nextclade_tsv:
+    """
+    Upload the Nextclade results TSV for a data source to its S3 destination
+    as nextclade.tsv.zst. The upload script's combined stdout/stderr is
+    written to the flag file that marks the upload as done.
+    """
+    input:
+        nextclade="{data_source}/results/nextclade.tsv",
+    output:
+        flag="{data_source}/s3/nextclade.done",
+    params:
+        # S3 destination prefix is looked up per data source in config["s3_dst"]
+        s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source],
+        # Empty string when no cloudfront_domain is configured
+        cloudfront_domain=config.get("cloudfront_domain", ""),
+    shell:
+        """
+        ./vendored/upload-to-s3 \
+            --quiet \
+            {input.nextclade:q} \
+            {params.s3_dst:q}/nextclade.tsv.zst \
+            {params.cloudfront_domain} 2>&1 | tee {output.flag}
+        """