WIP - allow running workflow from outside of repo #103

Draft
wants to merge 3 commits into base: master
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -11,4 +11,4 @@ jobs:
# conform since this is a collaborative repo with an external group.
uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@v0
with:
build-args: --configfile config/gisaid.yaml -pf test_target
build-args: --snakefile gisaid/Snakefile -pf test_target
2 changes: 1 addition & 1 deletion .github/workflows/phylogenetic-fauna.yaml
@@ -88,7 +88,7 @@ jobs:
--memory 28800mib \
. \
deploy_all \
--configfile config/gisaid.yaml \
--snakefile gisaid/Snakefile \
--config "${config[@]}"

env: |
2 changes: 1 addition & 1 deletion .github/workflows/phylogenetic-ncbi.yaml
@@ -95,7 +95,7 @@ jobs:
--memory 28800mib \
. \
deploy_all \
--configfile config/h5n1-cattle-outbreak.yaml \
--snakefile h5n1-cattle-outbreak/Snakefile \
--config "${config[@]}"

env: |
115 changes: 101 additions & 14 deletions Snakefile
@@ -8,6 +8,75 @@ wildcard_constraints:
SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]
#SUBTYPES = ["h5n1", "h5nx", "h7n9", "h9n2"]

CURRENT_BASEDIR = workflow.current_basedir # TODO XXX store this value here - can't access within functions because workflow.included_stack is empty

# Load the base config.yaml relative to the entry snakefile (i.e. not this snakefile)
if os.path.exists(os.path.join(workflow.basedir, 'config.yaml')):
configfile: os.path.join(workflow.basedir, 'config.yaml')

# load a config.yaml file if it exists in the current working directory
if os.path.exists("config.yaml"):
configfile: "config.yaml"

from pprint import pp; pp(config, stream=sys.stderr) # TODO XXX remove

class InvalidConfigError(Exception):
pass

def resolve_config_path(path):
"""
Resolve a relative *path* given in a configuration value. Returns a
function which takes a single argument *wildcards* and returns the resolved
path, with any '{x}' substrings replaced by their corresponding wildcard values.
Comment on lines +28 to +31
Contributor
Couple miscellaneous/nitpicky things here:

  1. Might be worth noting in the docstring that the wildcard expansion is via Snakemake's expand()
  2. The docstring and commentary within the method are very specific about paths being relative — might be a good idea to add a guard for that? Right now if you pass in an absolute path, as long as there's a file at that path, this function will pass (the expanded version of) that path right back to you — and maybe that's okay, or even desired! - but then that deserves some sort of acknowledgement in the docs
  3. Might be nice to let path to be a pathlib object in addition to a string; especially because right now the error message you get back is going to be very confusing, given how pathlib objects stringify


Search order (first match returned):
1. Relative to the analysis directory
2. Relative to the directory the entry snakefile was in. Typically this is
not the Snakefile you are looking at now but (e.g.) the one in
avian-flu/gisaid
3. Relative to where this Snakefile is (i.e. `avian-flu/`)
Comment on lines +33 to +38
Member
I've casually skimmed thru parts of this PR but haven't actually reviewed it. I wanted to start discussion early though on this search order. I think it would be better to have a single fallback, e.g. a search order with only two locations: the working analysis directory and a fixed directory based on the workflow source. That is, we'd pick one of workflow.basedir or workflow.current_basedir and use that one consistently, not allow either. The more search paths, the harder the behaviour is to explain, and the easier it is to get tripped up in a tangle you didn't expect as paths in different parts of the repo collide.

"""
if not isinstance(path, str):
raise InvalidConfigError(f"Config path provided to resolve_config_path must be a string. Provided value: {str(path)}")

def resolve(wildcards):
try:
path_expanded = expand(path, **wildcards)[0]
except snakemake.exceptions.WildcardError as e:
# str(e) looks like "No values given for wildcard 'subtypes'."
raise InvalidConfigError(f"resolve_config_path called with path {path!r} however {str(e)}")

# check if the path exists relative to the working analysis directory
if os.path.exists(path_expanded):
return path_expanded

# Check if the path exists relative to the subdir where the entry snakefile is
# (e.g. avian-flu/gisaid). If you want to use further subdirectories (e.g. avian-flu/gisaid/config/x.tsv)
# you're expected to supply the 'config/x.tsv' as the value in the config YAML
# NOTE: this means analysis directory overrides have to use that same 'config/x.tsv' structure, but
# given the different directories avian-flu uses that's acceptable. In other words, if we standardised
# avian-flu then we could add subdirectories to the search order here
basepath = os.path.join(workflow.basedir, path_expanded)
if os.path.exists(basepath):
return basepath

# Check if the path exists relative to where _this snakefile is_
# NOTE: We may move this Snakefile to `rules/base.smk`, or perhaps go further and split
# these functions out into `rules/common.smk` etc. If we do so then I think the correct behaviour for
# step (3) is to search for paths relative to `avian-flu` (not `avian-flu/rules`)
if workflow.basedir != CURRENT_BASEDIR:
basepath = os.path.join(CURRENT_BASEDIR, path_expanded)
if os.path.exists(basepath):
return basepath

raise InvalidConfigError(f"Unable to resolve the config-provided path {path!r}, expanded to {path_expanded!r} after filling in wildcards. "
f"The following directories were searched:\n"
f"\t1. {os.path.abspath(os.curdir)} (current working directory)\n"
f"\t2. {workflow.basedir} (where the entry snakefile is)\n"
f"\t3. {CURRENT_BASEDIR} (where the main avian-flu snakefile is)\n")
return resolve
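
A minimal sketch of the guard floated in the review comment above (points 2 and 3): accept pathlib objects and make absolute paths an explicit case rather than a silent pass-through. This is illustrative only and not part of the PR; check_config_path is an invented name, and InvalidConfigError refers to the exception class defined earlier in this Snakefile.

import os
from pathlib import PurePath

def check_config_path(path):
    # Hypothetical helper: normalise pathlib objects to strings, reject non-strings,
    # and treat absolute paths explicitly (here: reject; returning them unchanged
    # would also be fine, provided the docstring acknowledges that behaviour).
    if isinstance(path, PurePath):
        path = str(path)
    if not isinstance(path, str):
        raise InvalidConfigError(f"Config path must be a string or pathlib path, got {path!r}")
    if os.path.isabs(path):
        raise InvalidConfigError(f"Config path must be relative, got absolute path {path!r}")
    return path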

# The config option `same_strains_per_segment=True` (e.g. supplied to snakemake via the --config command line argument)
# will change the behaviour of the workflow to use the same strains for each segment. This is achieved via these steps:
# (1) Filter the HA segment as normal plus filter to those strains with 8 segments
@@ -19,6 +88,14 @@ S3_SRC = config.get('s3_src', {})
LOCAL_INGEST = config.get('local_ingest', None)

def sanity_check_config():
if not len(config.keys()):
print("-"*80 + "\nNo config loaded!", file=sys.stderr)
print("Avian-flu is intended to be run from the snakefile inside a subdir "
"(e.g. gisaid/Snakefile) which will pick up the default configfile for that workflow. "
"Alternatively you can pass in the config via `--configfile`", file=sys.stderr)
print("-"*80, file=sys.stderr)
raise InvalidConfigError("No config")

assert LOCAL_INGEST or S3_SRC, "The config must define either 's3_src' or 'local_ingest'"
# NOTE: we could relax the following exclusivity of S3_SRC and LOCAL_INGEST
# if we want to use `--config local_ingest=gisaid` overrides.
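(As a usage note, and purely illustrative: these overrides are passed on the command line in the usual Snakemake way, e.g. `snakemake -s gisaid/Snakefile --config local_ingest=gisaid same_strains_per_segment=True -pf test_target`.)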
@@ -52,6 +129,7 @@ rule all:


# This must be after the `all` rule above since it depends on its inputs
# Note this is relative to the `workflow.current_basedir`
include: "rules/deploy.smk"

rule test_target:
@@ -60,6 +138,8 @@ rule test_target:
"""
input: "auspice/avian-flu_h5n1_ha_all-time.json"

# TODO - I find this indirection more confusing than helpful and I'd rather just
# specify `config['colors']` in the rules which use it (e.g.)
rule files:
params:
dropped_strains = config['dropped_strains'],
@@ -213,12 +293,14 @@ rule add_h5_clade:
message: "Adding in a column for h5 clade numbering"
input:
metadata = "results/{subtype}/metadata.tsv",
clades_file = files.clades_file
clades_file = resolve_config_path(files.clades_file)
output:
metadata= "results/{subtype}/metadata-with-clade.tsv"
params:
script = os.path.join(workflow.current_basedir, "clade-labeling/add-clades.py")
genehack marked this conversation as resolved.
shell:
"""
python clade-labeling/add-clades.py \
r"""
python {params.script} \
--metadata {input.metadata} \
--output {output.metadata} \
--clades {input.clades_file}
@@ -271,8 +353,8 @@ rule filter:
input:
sequences = "results/{subtype}/{segment}/sequences.fasta",
metadata = metadata_by_wildcards,
exclude = files.dropped_strains,
include = files.include_strains,
exclude = resolve_config_path(files.dropped_strains),
include = resolve_config_path(files.include_strains),
strains = lambda w: f"results/{w.subtype}/ha/{w.time}/filtered.txt" if (SAME_STRAINS and w.segment!='ha') else [],
output:
sequences = "results/{subtype}/{segment}/{time}/filtered.fasta",
@@ -303,7 +385,7 @@ rule align:
"""
input:
sequences = rules.filter.output.sequences,
reference = files.reference
reference = resolve_config_path(files.reference)
output:
alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
wildcard_constraints:
@@ -402,7 +484,9 @@ def ancestral_root_seq(wildcards):
root_seq = get_config('ancestral', 'genome_root_seq', wildcards) \
if wildcards.segment=='genome' \
else get_config('ancestral', 'root_seq', wildcards)
return f"--root-sequence {root_seq}" if root_seq else ""
if not root_seq:
return ""
return f"--root-sequence {resolve_config_path(root_seq)(wildcards)}"

rule ancestral:
message: "Reconstructing ancestral sequences and mutations"
@@ -430,7 +514,7 @@ rule translate:
input:
tree = refined_tree,
node_data = rules.ancestral.output.node_data,
reference = lambda w: config['genome_reference'] if w.segment=='genome' else files.reference
reference = lambda w: resolve_config_path(config['genome_reference'] if w.segment=='genome' else files.reference)(w)
output:
node_data = "results/{subtype}/{segment}/{time}/aa-muts.json"
shell:
@@ -489,9 +573,11 @@ rule cleavage_site:
output:
cleavage_site_annotations = "results/{subtype}/ha/{time}/cleavage-site.json",
cleavage_site_sequences = "results/{subtype}/ha/{time}/cleavage-site-sequences.json"
params:
script = os.path.join(workflow.current_basedir, "scripts/annotate-ha-cleavage-site.py")
shell:
"""
python scripts/annotate-ha-cleavage-site.py \
python {params.script} \
--alignment {input.alignment} \
--furin_site_motif {output.cleavage_site_annotations} \
--cleavage_site_sequence {output.cleavage_site_sequences}
@@ -531,7 +617,7 @@ rule auspice_config:
If we implement config overlays in augur this rule will become unnecessary.
"""
input:
auspice_config = files.auspice_config,
auspice_config = resolve_config_path(files.auspice_config),
output:
auspice_config = "results/{subtype}/{segment}/{time}/auspice-config.json",
run:
@@ -556,7 +642,7 @@ rule auspice_config:

rule colors:
input:
colors = files.colors,
colors = resolve_config_path(files.colors),
output:
colors = "results/{subtype}/{segment}/{time}/colors.tsv",
shell:
@@ -574,9 +660,9 @@ rule export:
metadata = rules.filter.output.metadata,
node_data = export_node_data_files,
colors = "results/{subtype}/{segment}/{time}/colors.tsv",
lat_longs = files.lat_longs,
lat_longs = resolve_config_path(files.lat_longs),
auspice_config = rules.auspice_config.output.auspice_config,
description = files.description
description = resolve_config_path(files.description),
output:
auspice_json = "results/{subtype}/{segment}/{time}/auspice-dataset.json"
params:
@@ -646,4 +732,5 @@ rule clean:


for rule_file in config.get('custom_rules', []):
include: rule_file
# Relative custom rule paths in the config are expected to be relative to the analysis (working) directory
include: os.path.join(os.getcwd(), rule_file)
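For example, a hypothetical overlay config in the analysis directory declaring `custom_rules: [my-extra-rules.smk]` would now have `my-extra-rules.smk` loaded from that analysis directory rather than from the avian-flu checkout.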
4 changes: 4 additions & 0 deletions gisaid/Snakefile
@@ -0,0 +1,4 @@
include: "../Snakefile"

rule _all:
input: rules.all.input
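This thin entry point is what allows the GISAID build to be driven from an analysis directory outside the repo, e.g. (paths hypothetical) `cd ~/analyses/h5n1 && snakemake -s ~/src/avian-flu/gisaid/Snakefile -pf test_target`, matching the `--snakefile gisaid/Snakefile` invocation that the CI and phylogenetic workflows switch to above.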
File renamed without changes.
6 changes: 6 additions & 0 deletions h5n1-cattle-outbreak/Snakefile
@@ -0,0 +1,6 @@
include: "../Snakefile"

include: "cattle-flu.smk"

rule _all:
input: rules.all.input
25 changes: 17 additions & 8 deletions rules/cattle-flu.smk → h5n1-cattle-outbreak/cattle-flu.smk
@@ -10,8 +10,8 @@ rule filter_segments_for_genome:
input:
sequences = "results/{subtype}/{genome_seg}/sequences.fasta",
metadata = "results/{subtype}/metadata-with-clade.tsv", # TODO: use a function here instead of hardcoding
include = config['include_strains'],
exclude = config['dropped_strains'],
include = resolve_config_path(config['include_strains']),
exclude = resolve_config_path(config['dropped_strains']),
output:
sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta"
params:
@@ -39,7 +39,11 @@ rule align_segments_for_genome:
input:
sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta",
# Use the H5N1 reference sequences for alignment
reference = lambda w: expand(config['reference'], subtype='h5n1', segment=w.genome_seg)
reference = lambda w: [
resolve_config_path(expanded)(w)
for expanded in
expand(config['reference'], subtype='h5n1', segment=w.genome_seg)
]
output:
alignment = "results/{subtype}/{segment}/{time}/aligned_{genome_seg}.fasta"
wildcard_constraints:
@@ -71,9 +75,11 @@ rule join_segments:
subtype = 'h5n1-cattle-outbreak',
segment = 'genome',
time = 'default',
params:
script = os.path.join(workflow.current_basedir, "../scripts/join-segments.py")
shell:
"""
python scripts/join-segments.py \
python {params.script} \
--segments {input.alignment} \
--output {output.alignment}
"""
@@ -140,9 +146,11 @@ rule prune_tree:
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
time="default",
params:
script = os.path.join(workflow.current_basedir, "../scripts/restrict-via-common-ancestor.py")
shell:
"""
python3 scripts/restrict-via-common-ancestor.py \
r"""
python3 {params.script} \
--tree {input.tree} \
--strains {input.strains} \
--output-tree {output.tree} \
@@ -162,13 +170,14 @@ rule colors_genome:
colors = "results/{subtype}/{segment}/{time}/colors.tsv",
params:
duplications = "division=division_metadata",
script = os.path.join(workflow.current_basedir, "../scripts/assign-colors.py")
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
time="default",
shell:
"""
r"""
cp {input.colors} {output.colors} && \
python3 scripts/assign-colors.py \
python3 {params.script} \
--metadata {input.metadata} \
--ordering {input.ordering} \
--color-schemes {input.schemes} \
@@ -1,9 +1,3 @@
# NOTE: The h5n1-cattle-outbreak builds use a specific rule-file (specified within this config)
# and that rule file may have some config-like parameters within it.
custom_rules:
- "rules/cattle-flu.smk"


#### Parameters which define which builds to produce via this config ###
builds:
h5n1-cattle-outbreak: ''
@@ -36,6 +30,7 @@ target_sequences_per_tree: 10_000


#### Config files ####

reference: config/h5n1/reference_h5n1_{segment}.gb # use H5N1 references
genome_reference: config/{subtype}/h5_cattle_genome_root.gb
auspice_config: config/{subtype}/auspice_config_{subtype}.json