# sweep-script-gen.py

import os
import json
from dataclasses import dataclass
from rich import print
import sys
import itertools

# The script takes exactly one argument: either the path of a JSON config
# file or the JSON document itself.
if len(sys.argv) != 2:
    print("Usage: script_name '<path_to_json_config_or_json_string>'")
    sys.exit(1)

input_str = sys.argv[1]

# Check if input is a file path or JSON string.
# isfile() (rather than exists()) keeps directories out of the open() call.
if os.path.isfile(input_str):
    try:
        with open(input_str, "r", encoding="utf-8") as f:
            config = json.load(f)
    except json.JSONDecodeError as err:
        # The file is there but its content is not JSON: report it cleanly
        # instead of letting the traceback escape.
        print(f"Error: The config file does not contain valid JSON: {err}")
        sys.exit(1)
else:
    try:
        config = json.loads(input_str)
    except json.JSONDecodeError:
        print(
            "Error: The provided input is neither a valid file path nor a valid JSON string."
        )
        sys.exit(1)

# Valid JSON can also be a list, number, string, ...; the key lookups below
# only make sense on an object.
if not isinstance(config, dict):
    print("Error: The JSON config must be an object.")
    sys.exit(1)

# Check necessary keys in the JSON
required_keys = ["name", "GPUS", "model_indexes", "dataset_indexes"]
for key in required_keys:
    if key not in config:
        print(f"Missing key in config: {key}")
        sys.exit(1)

name = config["name"]  # used in the name of the generated script file
GPUS = config["GPUS"]  # GPU count for SLURM and for --num_gpus
model_indexes = config["model_indexes"]  # positions in models_list
dataset_indexes = config["dataset_indexes"]  # positions in BURNS_DATASETS


@dataclass
class Variant:
    """One dimension of the sweep: a CLI flag and the values it is swept over."""

    name: str  # Name like 'prompt invariance'
    flag: str  # CLI flag like '--promptinv'
    # Values it can take, e.g. ["True", "False"]. Not only strings: the sweep
    # also uses numbers, booleans and None (None means "omit the flag").
    values: list[object]


### SELECTING MODELS AND DATASETS ###

# Models available to the sweep; the trailing number is the index that the
# config's "model_indexes" refers to.
models_list = [
    "meta-llama/Llama-2-7b-hf",  # 0
    "meta-llama/Llama-2-13b-hf",  # 1
    "EleutherAI/pythia-12b",  # 2
    "bigscience/bloom-7b1",  # 3
    "EleutherAI/pythia-6.9b",  # 4
    "gpt2",  # 5
]

# Datasets available to the sweep, addressed by "dataset_indexes".
BURNS_DATASETS = [
    "ag_news",  # 0
    "amazon_polarity",  # 1
    "dbpedia_14",  # 2
    "glue:qnli",  # 3
    "imdb",  # 4
    "piqa",  # 5
    "super_glue:boolq",  # 6
    "super_glue:copa",  # 7
    "super_glue:rte",  # 8
]

# Indexes outside the list are dropped without complaint; only an entirely
# empty selection aborts the script.
selected_models = [
    models_list[position]
    for position in model_indexes
    if len(models_list) > position >= 0
]
if not selected_models:
    print("No valid model indexes provided.")
    sys.exit(1)

selected_datasets = [
    BURNS_DATASETS[position]
    for position in dataset_indexes
    if len(BURNS_DATASETS) > position >= 0
]
if not selected_datasets:
    print("No valid dataset indexes provided.")
    sys.exit(1)

### COMPOSING THE CLI COMMAND ###

# Fixed fragments shared by every generated command; each model/dataset name
# is wrapped in single quotes.
models = "--models " + " ".join(map("'{}'".format, selected_models))
datasets = "--datasets " + " ".join(map("'{}'".format, selected_datasets))
binarize = "--binarize"
num_gpus = f"--num_gpus {GPUS}"

# Index of the first command the generated script will actually execute.
START_NUM = 0


def make_script(variants: list[Variant]) -> str:
    """Return a bash/SLURM script that runs one ``elk sweep`` per variant combination.

    The Cartesian product of every variant's ``values`` is taken, combinations
    that should not be run are dropped, and each survivor becomes one
    ``elk sweep`` command line. The generated script stores the commands in a
    bash array, tracks their state in a ``commands_status-N.csv`` file and
    runs them one after another, posting a message to ntfy.sh around each run.

    Besides ``variants`` this function reads the module-level ``GPUS``,
    ``models``, ``datasets``, ``binarize``, ``num_gpus`` and ``START_NUM``.
    As a side effect it prints the number of combinations and every command.

    Args:
        variants: Sweep dimensions. Each combination is read by position:
            0 (net), 1 (norm), 3 (prompt indices), 4 (neg_cov_weight),
            5 (loss) and 6 (erase_prompt), so the list must follow that
            order. A value of ``None`` means the flag is left out.

    Returns:
        The complete text of the shell script.
    """
    # Positions of the variants inside each combination tuple.
    NET = 0
    NORM = 1
    PROMPT_INDICES = 3
    NEG_COV_WEIGHT = 4
    LOSS = 5
    ERASE_PROMPT = 6

    def is_excluded(combo: tuple) -> bool:
        """Return True for combinations that must not be turned into a command."""
        net = combo[NET]
        return (
            # eigen is never combined with the burns normalisation
            (net == "eigen" and combo[NORM] == "burns")
            # the prompt-variance loss does not apply for vinc (eigen)
            or (net == "eigen" and combo[LOSS] == "ccs_prompt_var")
            # ccs should not have a neg_cov_weight
            or (net == "ccs" and combo[NEG_COV_WEIGHT] is not None)
            # vinc should not have None, only 0, 0.5, 1
            or (net == "eigen" and combo[NEG_COV_WEIGHT] is None)
            # doing this throws a warning: "Only one variant provided.
            # Prompt variance loss will cause errors."
            or (combo[LOSS] == "ccs_prompt_var" and combo[PROMPT_INDICES] == "1")
            # NOTE(review): this drops ccs runs whose erase_prompt is falsy,
            # i.e. ccs is only swept with erase_prompt enabled - confirm that
            # this is the intended direction.
            or (net == "ccs" and not combo[ERASE_PROMPT])
        )

    script = f"""#!/bin/bash
#SBATCH --nodes=1
#SBATCH --gpus-per-node={GPUS}
#SBATCH --time=2-0
#SBATCH --partition=single
#SBATCH --job-name=elk_sweep_alpha
"""
    script += "# This script was generated by sweep-script-gen.py\n\n"
    script += f"# {variants}\n\n"
    script += f"# {models}\n"
    script += f"# {datasets}\n"
    # Redirect all output to the first unused not-133-sweep-out-N.txt and
    # start a fresh commands_status-N.csv for progress tracking.
    script += """
i=0
while [[ -e not-133-sweep-out-$i.txt ]] ; do
    let i++
done
filename="not-133-sweep-out-$i.txt"
exec > $filename 2>&1

j=0
while [[ -e commands_status-$j.csv ]] ; do
    let j++
done
csv_file="commands_status-$j.csv"
echo \"idx,status,command\" > $csv_file
"""
    script += "# cd ../elk\n"

    combinations = [
        combo
        for combo in itertools.product(*[variant.values for variant in variants])
        if not is_excluded(combo)
    ]

    print(f"Number of combinations: {len(combinations)}")

    # Parts of the command line that are identical for every combination.
    prefix = "elk sweep " + models + " " + datasets + " " + binarize + " "
    suffix = num_gpus + " --visualize True"

    commands = []
    for combo in combinations:
        # The net lives at position NET; it decides whether --norm is emitted.
        net = combo[NET]
        flags = [
            f"{variant.flag}={value} "
            for variant, value in zip(variants, combo)
            if value is not None
            and not (net == "eigen" and variant.flag == "--norm")
        ]
        commands.append(prefix + "".join(flags) + suffix)

    script += "commands=( \\\n"
    for command in commands:
        print(command + "\n")
        script += f'"{command}" \\\n'
    # Drop the trailing line continuation (backslash + newline) before
    # closing the bash array.
    script = script[:-2] + "\n)\n\n"

    # Register every command as NOT STARTED in the status CSV.
    script += """
idx=0
for command in "${commands[@]}"; do
    echo "$idx,NOT STARTED,$command" >> $csv_file
    ((idx=idx+1))
done
"""
    ntfy_name = "elk-sweeps"
    # Run the commands in order from START_NUM, updating the CSV row and
    # sending a notification before and after each one.
    script += f"""
len=${{#commands[@]}}
for ((idx={START_NUM};idx<len;idx++)); do
    command=${{commands[$idx]}}
    sed -i "s|^$idx,NOT STARTED|$idx,RUNNING|g" $csv_file
    echo "Running command: $command"
    curl -d "Sweep [$idx]: $command" ntfy.sh/{ntfy_name}
    if ! eval "$command"; then
        sed -i "s|^$idx,RUNNING|$idx,ERROR|g" $csv_file
        echo "Error occurred: Failed to execute command: $command"
        curl -d "[$idx] Error occurred" ntfy.sh/{ntfy_name}
    else
        sed -i "s|^$idx,RUNNING|$idx,DONE|g" $csv_file
        echo "Command completed successfully: $command"
        curl -d "[$idx] Success" ntfy.sh/{ntfy_name}
    fi
done
"""

    script += "echo 'All combinations completed.'\n"

    return script


if __name__ == "__main__":
    # Sweep dimensions. The order matters: make_script() reads each
    # combination by position, matching the numbers in the trailing comments.
    variants = [
        Variant("net", "--net", ["ccs", "eigen"]),  # 0
        Variant("norm", "--norm", ["burns", None]),  # 1
        Variant("per probe prompt", "--probe_per_prompt", ["True", "False"]),  # 2
        Variant("prompt indices", "--prompt_indices", [None]),  # 3
        Variant("neg_cov_weight", "--neg_cov_weight", [None, 0, 0.5, 1]),  # 4
        Variant("loss", "--loss", ["ccs_prompt_var", None]),  # 5
        Variant("erase_prompt", "--erase_prompt", [False, True]),  # 6
    ]

    OUT_FILE = f"sweep-not-291-{name}.sh"
    script = make_script(variants)
    with open(OUT_FILE, "w") as f:
        f.write(script)

    # Make the script executable without going through a shell: `name` comes
    # from the user's config and may contain spaces or shell metacharacters.
    # Execute permission is granted to whoever already has read permission,
    # which is what `chmod +x` yields under the usual umasks.
    mode = os.stat(OUT_FILE).st_mode
    os.chmod(OUT_FILE, mode | ((mode & 0o444) >> 2))