From cb949fc94b72fc70a9822db94d7ba92b38d2dfd3 Mon Sep 17 00:00:00 2001
From: jerryzhuang
Date: Fri, 20 Dec 2024 18:00:05 +1100
Subject: [PATCH] refactor: add inference manifest template and generation script

---
 .github/e2e-preset-configs.json | 180 +++++++++++++++---
 .github/workflows/e2e-preset-test.yml | 102 ++++------
 .../falcon-40b-instruct-service.yaml | 13 --
 .../falcon-40b-instruct_hf.yaml | 56 ------
 .../falcon-40b-instruct_vllm.yaml | 56 ------
 .../falcon-40b/falcon-40b-service.yaml | 13 --
 .../manifests/falcon-40b/falcon-40b_hf.yaml | 56 ------
 .../manifests/falcon-40b/falcon-40b_vllm.yaml | 56 ------
 .../falcon-7b-instruct-service.yaml | 13 --
 .../falcon-7b-instruct_hf.yaml | 56 ------
 .../falcon-7b-instruct_vllm.yaml | 56 ------
 .../falcon-7b/falcon-7b-service.yaml | 13 --
 .../manifests/falcon-7b/falcon-7b_hf.yaml | 56 ------
 .../manifests/falcon-7b/falcon-7b_vllm.yaml | 56 ------
 .../manifests/inference-tmpl/manifest.yaml | 100 ++++++++++
 .../llama-2-13b-chat-service.yaml | 19 --
 .../llama-2-13b-chat/llama-2-13b-chat.yaml | 28 ++-
 .../llama-2-13b/llama-2-13b-service.yaml | 19 --
 .../manifests/llama-2-13b/llama-2-13b.yaml | 26 ++-
 .../llama-2-7b-chat-service.yaml | 14 --
 .../llama-2-7b-chat/llama-2-7b-chat.yaml | 17 +-
 .../llama-2-7b/llama-2-7b-service.yaml | 14 --
 .../test/manifests/llama-2-7b/llama-2-7b.yaml | 17 +-
 .../mistral-7b-instruct-service.yaml | 13 --
 .../mistral-7b-instruct_hf.yaml | 55 ------
 .../mistral-7b-instruct_vllm.yaml | 55 ------
 .../mistral-7b/mistral-7b-service.yaml | 13 --
 .../manifests/mistral-7b/mistral-7b_hf.yaml | 55 ------
 .../manifests/mistral-7b/mistral-7b_vllm.yaml | 55 ------
 .../test/manifests/phi-2/phi-2-service.yaml | 13 --
 .../test/manifests/phi-2/phi-2_hf.yaml | 55 ------
 .../test/manifests/phi-2/phi-2_vllm.yaml | 55 ------
 .../phi-3-medium-128k-instruct-service.yaml | 13 --
 .../phi-3-medium-128k-instruct_hf.yaml | 55 ------
 .../phi-3-medium-128k-instruct_vllm.yaml | 55 ------
 .../phi-3-medium-4k-instruct-service.yaml | 13 --
 .../phi-3-medium-4k-instruct_hf.yaml | 55 ------
 .../phi-3-medium-4k-instruct_vllm.yaml | 55 ------
 .../phi-3-mini-128k-instruct-service.yaml | 13 --
 .../phi-3-mini-128k-instruct_hf.yaml | 55 ------
 .../phi-3-mini-128k-instruct_vllm.yaml | 55 ------
 .../phi-3-mini-4k-instruct-service.yaml | 13 --
 .../phi-3-mini-4k-instruct_hf.yaml | 55 ------
 .../phi-3-mini-4k-instruct_vllm.yaml | 55 ------
 .../phi-3-small-128k-instruct-service.yaml | 13 --
 .../phi-3-small-128k-instruct.yaml | 55 ------
 .../phi-3-small-8k-instruct-service.yaml | 13 --
 .../phi-3-small-8k-instruct.yaml | 55 ------
 .../qwen2-5-coder-7b-instruct-service.yaml | 13 --
 .../qwen2-5-coder-7b-instruct_hf.yaml | 55 ------
 .../qwen2-5-coder-7b-instruct_vllm.yaml | 83 --------
 .../manifests/tuning/falcon/falcon-7b.yaml | 103 ----------
 .../tuning/tuning.yaml} | 0
 presets/workspace/test/scripts/README.md | 82 ++++++++
 .../test/scripts/generate_manifests.py | 112 +++++++++++
 55 files changed, 564 insertions(+), 1917 deletions(-)
 delete mode 100644 presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
 delete mode 100644 presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml
 delete mode 100644
presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml delete mode 100644 presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml create mode 100644 presets/workspace/test/manifests/inference-tmpl/manifest.yaml delete mode 100644 presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml delete mode 100644 presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml delete mode 100644 presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml delete mode 100644 presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml delete mode 100644 presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-2/phi-2-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-2/phi-2_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml delete mode 100644 presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml delete mode 100644 
presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml delete mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml delete mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml delete mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml delete mode 100644 presets/workspace/test/manifests/tuning/falcon/falcon-7b.yaml rename presets/workspace/test/{tuning/tuning-job.yaml => manifests/tuning/tuning.yaml} (100%) create mode 100644 presets/workspace/test/scripts/README.md create mode 100644 presets/workspace/test/scripts/generate_manifests.py diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json index 9d20b0e38..37f49667a 100644 --- a/.github/e2e-preset-configs.json +++ b/.github/e2e-preset-configs.json @@ -7,16 +7,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false - }, - { - "name": "falcon-7b-adapter", - "node-count": 1, - "node-vm-size": "Standard_NC12s_v3", - "node-osdisk-size": 100, - "OSS": true, - "loads_adapter": true, - "expected_adapter": "amod-mental-health" + "loads_adapter": false, + "node_pool": "falcon7b", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja", + "gpu_count": 1 + } + } }, { "name": "falcon-7b-instruct", @@ -24,7 +26,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "falcon7binst", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "falcon-40b", @@ -32,7 +45,18 @@ "node-vm-size": "Standard_NC48ads_A100_v4", "node-osdisk-size": 400, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "falcon40b", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 2 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "falcon-40b-instruct", @@ -40,7 +64,18 @@ "node-vm-size": "Standard_NC48ads_A100_v4", "node-osdisk-size": 400, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "falcon40bins", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 2 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype bfloat16 --chat-template 
/workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "mistral-7b", @@ -48,7 +83,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "mistral7b", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja", + "gpu_count": 1 + } + } }, { "name": "mistral-7b-instruct", @@ -56,7 +102,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "mistral7bins", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16", + "gpu_count": 1 + } + } }, { "name": "phi-2", @@ -64,7 +121,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 50, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi2", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16", + "gpu_count": 1 + } + } }, { "name": "phi-3-mini-4k-instruct", @@ -72,7 +140,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 50, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi3mini4kin", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16", + "gpu_count": 1 + } + } }, { "name": "phi-3-mini-128k-instruct", @@ -80,7 +159,18 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 50, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi3mini128k", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16", + "gpu_count": 1 + } + } }, { "name": "phi-3-medium-4k-instruct", @@ -88,7 +178,18 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi3medium4k", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": 
"phi-3-medium-128k-instruct", @@ -96,7 +197,18 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "phi3medium12", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --dtype float16 --max-model-len 1024 --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "qwen2.5-coder-7b-instruct", @@ -105,7 +217,18 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "qwen25coder7", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml --tensor-parallel-size 2", + "gpu_count": 2 + } + } }, { "name": "llama-2-7b", @@ -113,7 +236,8 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": false, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "llama27b" }, { "name": "llama-2-7b-chat", @@ -121,7 +245,8 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 100, "OSS": false, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "llama27bchat" }, { "name": "llama-2-13b", @@ -129,7 +254,8 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 150, "OSS": false, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "llama213b" }, { "name": "llama-2-13b-chat", @@ -137,7 +263,8 @@ "node-vm-size": "Standard_NC12s_v3", "node-osdisk-size": 150, "OSS": false, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "llama213bchat" }, { "name": "tuning", @@ -145,7 +272,8 @@ "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, - "loads_adapter": false + "loads_adapter": false, + "node_pool": "tuning" } ] } diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 123bc1436..d859308d8 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -107,6 +107,7 @@ jobs: environment: preset-env strategy: fail-fast: false + max-parallel: 10 matrix: # Ex matrix element: # {"name":"falcon-40b","type":"text-generation","version":"#", @@ -164,44 +165,49 @@ jobs: - name: Set up kubectl context run: | az aks get-credentials --resource-group llm-test --name GitRunner - - - name: Get Nodepool Name - id: get_nodepool_name + + - name: Get testing workload + id: workload run: | - NAME_SUFFIX=${{ matrix.model.name }} - NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/} # Removing all '-' symbols - NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX_WITHOUT_DASHES//./} # Removing all '.' 
symbols + NODEPOOL_NAME=${{ matrix.model.node_pool }} + echo "NODEPOOL_NAME=$NODEPOOL_NAME" >> $GITHUB_OUTPUT - if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then - TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES:0:12} - else - TRUNCATED_NAME_SUFFIX=$NAME_SUFFIX_WITHOUT_DASHES - fi - echo "Nodepool Name: $TRUNCATED_NAME_SUFFIX" - echo "NODEPOOL_NAME=$TRUNCATED_NAME_SUFFIX" >> $GITHUB_OUTPUT + WORKLOAD_NAME=${{ matrix.model.workload || matrix.model.name }} + echo "WORKLOAD_NAME=$WORKLOAD_NAME" >> $GITHUB_OUTPUT + WORKLOAD_FILE=$WORKLOAD_NAME.yaml + echo "WORKLOAD_FILE=$WORKLOAD_FILE" >> $GITHUB_OUTPUT + RESOURCE_TYPE=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "statefulset" || echo "deployment") + echo "RESOURCE_TYPE=$RESOURCE_TYPE" >> $GITHUB_OUTPUT + + pip install pyyaml + python3 presets/workspace/test/scripts/generate_manifests.py ${{ matrix.model.name }} ${{ env.RUNTIME }} \ + --tag ${{ matrix.model.tag }} \ + --repo ${{ secrets.ACR_AMRT_USERNAME }}.azurecr.io > $WORKLOAD_FILE + + cat $WORKLOAD_FILE - name: Create Nodepool run: | NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'name' -o tsv || echo "") echo "NODEPOOL_EXIST: $NODEPOOL_EXIST" if [ -z "$NODEPOOL_EXIST" ]; then az aks nodepool add \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test \ --node-count ${{ matrix.model.node-count }} \ --node-vm-size ${{ matrix.model.node-vm-size }} \ --node-osdisk-size ${{ matrix.model.node-osdisk-size }} \ - --labels pool=${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --labels pool=${{ steps.workload.outputs.NODEPOOL_NAME }} \ --node-taints sku=gpu:NoSchedule \ --aks-custom-headers UseGPUDedicatedVHD=true else NODEPOOL_STATE=$(az aks nodepool show \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'provisioningState' -o tsv) @@ -214,44 +220,14 @@ jobs: fi fi - - name: Get testing workload - id: workload + - name: Create workload run: | - WORKLOAD_NAME=${{ matrix.model.workload || matrix.model.name }} - echo "WORKLOAD_NAME=$WORKLOAD_NAME" >> $GITHUB_OUTPUT - echo "WORKLOAD_FILE_PREFIX=presets/workspace/test/manifests/$WORKLOAD_NAME/$WORKLOAD_NAME" >> $GITHUB_OUTPUT - - - name: Create Service - run: | - kubectl apply -f ${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}-service.yaml - - - name: Retrieve External Service IP - id: get_ip - run: | - SERVICE_IP=$(kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} -o=jsonpath='{.spec.clusterIP}') - echo "Service IP is $SERVICE_IP" - echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT - - - name: Get Resource Type - id: resource - run: | - RESOURCE_TYPE=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "statefulset" || echo "deployment") - echo "RESOURCE_TYPE=$RESOURCE_TYPE" >> $GITHUB_OUTPUT - - - name: Replace IP and Deploy Resource to K8s - run: | - POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}") - WORKLOAD_FILE=${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}$POSTFIX.yaml - - sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" $WORKLOAD_FILE - sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" $WORKLOAD_FILE - sed -i 
"s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" $WORKLOAD_FILE - kubectl apply -f $WORKLOAD_FILE + kubectl apply -f ${{steps.workload.outputs.WORKLOAD_FILE}} - name: Wait for Resource to be ready run: | - kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} --timeout=1800s - + kubectl rollout status ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} --timeout=1800s + - name: Check Adapter Loading from Logs if: matrix.model.loads_adapter == true run: | @@ -260,19 +236,19 @@ jobs: - name: Install testing commands run: | - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get update - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get install -y curl + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get update + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get install -y curl - name: Test healthz endpoint run: | - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s http://localhost:5000/health - name: Test inference endpoint run: | echo "Testing inference for ${{ matrix.model.name }}" if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s -X POST \ -H "Content-Type: application/json" \ -d '{ @@ -293,7 +269,7 @@ jobs: }' \ http://localhost:5000/chat elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s -X POST \ -H "Content-Type: application/json" \ -d '{ @@ -309,7 +285,7 @@ jobs: }' \ http://localhost:5000/generate elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s -X POST \ -H "accept: application/json" \ -H "Content-Type: application/json" \ @@ -328,7 +304,7 @@ jobs: }' \ http://localhost:5000/v1/chat/completions else - kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ + kubectl exec ${{steps.workload.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \ curl -s -X POST \ -H "accept: application/json" \ -H "Content-Type: application/json" \ @@ -370,9 +346,9 @@ jobs: if: always() run: | # Only proceed if RESOURCE_TYPE is set (else resource wasn't created) - if [ -n "${{ steps.resource.outputs.RESOURCE_TYPE }}" ]; then + if [ -n "${{ steps.workload.outputs.RESOURCE_TYPE }}" ]; then # Use RESOURCE_TYPE from the previous step - RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }} + RESOURCE_TYPE=${{ steps.workload.outputs.RESOURCE_TYPE }} # Check and Delete K8s Resource (Deployment or StatefulSet) if kubectl get $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}} > 
/dev/null 2>&1; then @@ -387,16 +363,16 @@ jobs: fi # Check and Delete AKS Nodepool if it exists - if [ -n "${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }}" ]; then + if [ -n "${{ steps.workload.outputs.NODEPOOL_NAME }}" ]; then NODEPOOL_EXIST=$(az aks nodepool show \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test \ --query 'name' -o tsv || echo "") if [ -n "$NODEPOOL_EXIST" ]; then az aks nodepool delete \ - --name ${{ steps.get_nodepool_name.outputs.NODEPOOL_NAME }} \ + --name ${{ steps.workload.outputs.NODEPOOL_NAME }} \ --cluster-name GitRunner \ --resource-group llm-test fi diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml deleted file mode 100644 index fc357931a..000000000 --- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: falcon-40b-instruct -spec: - selector: - app: falcon - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml deleted file mode 100644 index a44043894..000000000 --- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b-instruct -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40bins diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml deleted file mode 100644 index 7b40cbac4..000000000 --- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b-instruct -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 
--chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40bins diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml deleted file mode 100644 index 80ab4b539..000000000 --- a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: falcon-40b -spec: - selector: - app: falcon - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml deleted file mode 100644 index 514d12e60..000000000 --- a/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40b diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml deleted file mode 100644 index 7e74ac7a7..000000000 --- a/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-40b -spec: - progressDeadlineSeconds: 1800 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - 
periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon40b diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml deleted file mode 100644 index 2f27d46cb..000000000 --- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: falcon-7b-instruct -spec: - selector: - app: falcon - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml deleted file mode 100644 index 1b2092b36..000000000 --- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b-instruct -spec: - progressDeadlineSeconds: 1200 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7binst diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml deleted file mode 100644 index 4019d64f5..000000000 --- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b-instruct -spec: - progressDeadlineSeconds: 1200 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - 
periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7binst diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml deleted file mode 100644 index 595e83942..000000000 --- a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: falcon-7b -spec: - selector: - app: falcon - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml deleted file mode 100644 index 56a775fff..000000000 --- a/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b -spec: - progressDeadlineSeconds: 1200 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7b diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml deleted file mode 100644 index bceb14560..000000000 --- a/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b -spec: - progressDeadlineSeconds: 1200 - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: falcon7b diff --git 
a/presets/workspace/test/manifests/inference-tmpl/manifest.yaml b/presets/workspace/test/manifests/inference-tmpl/manifest.yaml new file mode 100644 index 000000000..3609065da --- /dev/null +++ b/presets/workspace/test/manifests/inference-tmpl/manifest.yaml @@ -0,0 +1,100 @@ +deployment: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: WORKLOAD_NAME + spec: + progressDeadlineSeconds: 1800 + replicas: 1 + selector: + matchLabels: + app: WORKLOAD_NAME + template: + metadata: + labels: + app: WORKLOAD_NAME + spec: + containers: + - name: WORKLOAD_NAME-container + image: REPO/MODEL_NAME:TAG + command: + - /bin/sh + - -c + - RUNTIME_COMMAND + resources: + requests: + nvidia.com/gpu: GPU_COUNT + limits: + nvidia.com/gpu: GPU_COUNT + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + - name: config-volume + mountPath: /mnt/config + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: config-volume + configMap: + defaultMode: 420 + name: testing-inference-params + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: NODE_POOL + +config: + apiVersion: v1 + kind: ConfigMap + metadata: + name: testing-inference-params + data: + inference_config.yaml: | + # Maximum number of steps to find the max available seq len fitting in the GPU memory. + max_probe_steps: 6 + + vllm: + cpu-offload-gb: 0 + gpu-memory-utilization: 0.95 + swap-space: 4 + served-model-name: test + dtype: float16 + + # max-seq-len-to-capture: 8192 + # num-scheduler-steps: 1 + # enable-chunked-prefill: false + # see https://docs.vllm.ai/en/stable/models/engine_args.html for more options. 
+ +service: + apiVersion: v1 + kind: Service + metadata: + name: WORKLOAD_NAME + spec: + ports: + - port: 5000 + targetPort: 5000 + protocol: TCP + name: http + selector: + app: WORKLOAD_NAME + type: ClusterIP \ No newline at end of file diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml deleted file mode 100644 index 58720a91d..000000000 --- a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: llama-2-13b-chat -spec: - selector: - app: llama - statefulset.kubernetes.io/pod-name: llama-2-13b-chat-0 - ports: - - name: http - protocol: TCP - port: 80 - targetPort: 5000 - - name: torchrun - protocol: TCP - port: 29500 - targetPort: 29500 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml index 2520ddb9c..1bbef236e 100644 --- a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml +++ b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml @@ -25,14 +25,12 @@ spec: topologyKey: "kubernetes.io/hostname" containers: - name: llama-container - image: REPO_HERE.azurecr.io/llama-2-13b-chat:TAG_HERE # Placeholder that will be replaced - env: - - name: MASTER_ADDR - value: "MASTER_ADDR_HERE" # Placeholder that will be replaced + image: REPO/llama-2-13b-chat:TAG # Placeholder that will be replaced command: - /bin/sh - -c - | + MASTER_ADDR=$(dig +short llama-2-13b-chat.default.svc.cluster.local) echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$') cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py @@ -69,4 +67,24 @@ spec: key: nvidia.com/gpu operator: Exists nodeSelector: - pool: llama213bcha + pool: llama213bchat +--- +apiVersion: v1 +kind: Service +metadata: + name: llama-2-13b-chat +spec: + selector: + app: llama + statefulset.kubernetes.io/pod-name: llama-2-13b-chat-0 + ports: + - name: http + protocol: TCP + port: 80 + targetPort: 5000 + - name: torchrun + protocol: TCP + port: 29500 + targetPort: 29500 + type: ClusterIP + publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml deleted file mode 100644 index f43826a48..000000000 --- a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: llama-2-13b -spec: - selector: - app: llama - statefulset.kubernetes.io/pod-name: llama-2-13b-0 - ports: - - name: http - protocol: TCP - port: 80 - targetPort: 5000 - - name: torchrun - protocol: TCP - port: 29500 - targetPort: 29500 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml index 3bc5f72d2..314d81095 100644 --- a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml +++ b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml @@ -25,14 +25,12 @@ spec: topologyKey: "kubernetes.io/hostname" containers: - name: llama-container - image: REPO_HERE.azurecr.io/llama-2-13b:TAG_HERE # 
Placeholder that will be replaced - env: - - name: MASTER_ADDR - value: "MASTER_ADDR_HERE" # Placeholder that will be replaced + image: REPO/llama-2-13b:TAG # Placeholder that will be replaced command: - /bin/sh - -c - | + MASTER_ADDR=$(dig +short llama-2-13b.default.svc.cluster.local) echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$') cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference_api.py @@ -70,3 +68,23 @@ spec: operator: Exists nodeSelector: pool: llama213b +--- +apiVersion: v1 +kind: Service +metadata: + name: llama-2-13b +spec: + selector: + app: llama + statefulset.kubernetes.io/pod-name: llama-2-13b-0 + ports: + - name: http + protocol: TCP + port: 80 + targetPort: 5000 + - name: torchrun + protocol: TCP + port: 29500 + targetPort: 29500 + type: ClusterIP + publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml deleted file mode 100644 index 99fc7895d..000000000 --- a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: llama-2-7b-chat -spec: - selector: - app: llama - statefulset.kubernetes.io/pod-name: llama-2-7b-chat-0 - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml index af895fb3c..fb1babc05 100644 --- a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml +++ b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml @@ -15,7 +15,7 @@ spec: spec: containers: - name: llama-container - image: REPO_HERE.azurecr.io/llama-2-7b-chat:TAG_HERE + image: REPO/llama-2-7b-chat:TAG command: - /bin/sh - -c @@ -54,3 +54,18 @@ spec: operator: Exists nodeSelector: pool: llama27bchat +--- +apiVersion: v1 +kind: Service +metadata: + name: llama-2-7b-chat +spec: + selector: + app: llama + statefulset.kubernetes.io/pod-name: llama-2-7b-chat-0 + ports: + - protocol: TCP + port: 80 + targetPort: 5000 + type: ClusterIP + publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml deleted file mode 100644 index d8dfb84c7..000000000 --- a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: llama-2-7b -spec: - selector: - app: llama - statefulset.kubernetes.io/pod-name: llama-2-7b-0 - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml index 265b3c1a1..1179d0347 100644 --- a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml +++ b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml @@ -15,7 +15,7 @@ spec: spec: containers: - name: llama-container - image: REPO_HERE.azurecr.io/llama-2-7b:TAG_HERE + image: REPO/llama-2-7b:TAG command: - /bin/sh - -c @@ -54,3 +54,18 @@ spec: operator: Exists nodeSelector: pool: llama27b +--- +apiVersion: v1 +kind: Service +metadata: + name: 
llama-2-7b +spec: + selector: + app: llama + statefulset.kubernetes.io/pod-name: llama-2-7b-0 + ports: + - protocol: TCP + port: 80 + targetPort: 5000 + type: ClusterIP + publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml deleted file mode 100644 index 94627746d..000000000 --- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: mistral-7b-instruct -spec: - selector: - app: mistral - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml deleted file mode 100644 index 75179683f..000000000 --- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-instruct-container - image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7bins diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml deleted file mode 100644 index 939d6c75b..000000000 --- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-instruct-container - image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: 
NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7bins diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml deleted file mode 100644 index 90ba3ec8f..000000000 --- a/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: mistral-7b -spec: - selector: - app: mistral - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml deleted file mode 100644 index 3eff5594f..000000000 --- a/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-container - image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7b diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml deleted file mode 100644 index 2bd945319..000000000 --- a/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral - template: - metadata: - labels: - app: mistral - spec: - containers: - - name: mistral-container - image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: mistral7b diff --git a/presets/workspace/test/manifests/phi-2/phi-2-service.yaml b/presets/workspace/test/manifests/phi-2/phi-2-service.yaml deleted file mode 100644 index d0f99f9ad..000000000 --- a/presets/workspace/test/manifests/phi-2/phi-2-service.yaml +++ /dev/null @@ 
-1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-2 -spec: - selector: - app: phi-2 - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml b/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml deleted file mode 100644 index cbc6f94e7..000000000 --- a/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-2 -spec: - replicas: 1 - selector: - matchLabels: - app: phi-2 - template: - metadata: - labels: - app: phi-2 - spec: - containers: - - name: phi-2-container - image: REPO_HERE.azurecr.io/phi-2:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi2 diff --git a/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml b/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml deleted file mode 100644 index e77f21268..000000000 --- a/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-2 -spec: - replicas: 1 - selector: - matchLabels: - app: phi-2 - template: - metadata: - labels: - app: phi-2 - spec: - containers: - - name: phi-2-container - image: REPO_HERE.azurecr.io/phi-2:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi2 diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml deleted file mode 100644 index bab354ee9..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-medium-128k-instruct -spec: - selector: - app: phi-3-medium-128k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml 
b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml deleted file mode 100644 index 0adb122e4..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-128k-instruct - template: - metadata: - labels: - app: phi-3-medium-128k-instruct - spec: - containers: - - name: phi-3-medium-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium12 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml deleted file mode 100644 index 5b93bde50..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-128k-instruct - template: - metadata: - labels: - app: phi-3-medium-128k-instruct - spec: - containers: - - name: phi-3-medium-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --max-model-len 1024 --tensor-parallel-size 2 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium12 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml deleted file mode 100644 index 60710504f..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-medium-4k-instruct -spec: - selector: - app: phi-3-medium-4k-instruct - 
ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml deleted file mode 100644 index 1d0d64e47..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-4k-instruct - template: - metadata: - labels: - app: phi-3-medium-4k-instruct - spec: - containers: - - name: phi-3-medium-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium4k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml deleted file mode 100644 index 3bdce8072..000000000 --- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-medium-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-medium-4k-instruct - template: - metadata: - labels: - app: phi-3-medium-4k-instruct - spec: - containers: - - name: phi-3-medium-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --tensor-parallel-size 2 - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3medium4k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml deleted file mode 100644 index ef86aefb2..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml +++ /dev/null 
@@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-mini-128k-instruct -spec: - selector: - app: phi-3-mini-128k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml deleted file mode 100644 index cf8898015..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-128k-instruct - template: - metadata: - labels: - app: phi-3-mini-128k-instruct - spec: - containers: - - name: phi-3-mini-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini128k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml deleted file mode 100644 index f719bf96b..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-128k-instruct - template: - metadata: - labels: - app: phi-3-mini-128k-instruct - spec: - containers: - - name: phi-3-mini-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini128k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml deleted file mode 100644 index 
0063f24aa..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-mini-4k-instruct -spec: - selector: - app: phi-3-mini-4k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml deleted file mode 100644 index 1d7069a38..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-4k-instruct - template: - metadata: - labels: - app: phi-3-mini-4k-instruct - spec: - containers: - - name: phi-3-mini-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini4kin \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml deleted file mode 100644 index 8d1275678..000000000 --- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-mini-4k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-mini-4k-instruct - template: - metadata: - labels: - app: phi-3-mini-4k-instruct - spec: - containers: - - name: phi-3-mini-4k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3mini4kin \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml 
b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml deleted file mode 100644 index a28bac071..000000000 --- a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-small-128k-instruct -spec: - selector: - app: phi-3-small-128k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml deleted file mode 100644 index 1827155f4..000000000 --- a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-small-128k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-small-128k-instruct - template: - metadata: - labels: - app: phi-3-small-128k-instruct - spec: - containers: - - name: phi-3-small-128k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-small-128k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3small128 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml deleted file mode 100644 index 17e031f87..000000000 --- a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: phi-3-small-8k-instruct -spec: - selector: - app: phi-3-small-8k-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml deleted file mode 100644 index 1f515cc6a..000000000 --- a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: phi-3-small-8k-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: phi-3-small-8k-instruct - template: - metadata: - labels: - app: phi-3-small-8k-instruct - spec: - containers: - - name: phi-3-small-8k-instruct-container - image: REPO_HERE.azurecr.io/phi-3-small-8k-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids 
all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 1 - limits: - nvidia.com/gpu: 1 # Requesting 1 GPU - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: phi3small8ki \ No newline at end of file diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml deleted file mode 100644 index 73637c99a..000000000 --- a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: qwen2-5-coder-7b-instruct -spec: - selector: - app: qwen2-5-coder-7b-instruct - ports: - - protocol: TCP - port: 80 - targetPort: 5000 - type: ClusterIP - publishNotReadyAddresses: true diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml deleted file mode 100644 index e92d906d7..000000000 --- a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: qwen2-5-coder-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: qwen2-5-coder-7b-instruct - template: - metadata: - labels: - app: qwen2-5-coder-7b-instruct - spec: - containers: - - name: qwen2-5-coder-7b-instruct-container - image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - volumes: - - name: dshm - emptyDir: - medium: Memory - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: qwen25coder7 \ No newline at end of file diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml deleted file mode 100644 index 4c1e72510..000000000 --- a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml +++ /dev/null @@ -1,83 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: qwen2-5-coder-7b-instruct -spec: - replicas: 1 - selector: - matchLabels: - app: qwen2-5-coder-7b-instruct - template: - metadata: - labels: - app: 
qwen2-5-coder-7b-instruct - spec: - containers: - - name: qwen2-5-coder-7b-instruct-container - image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE - command: - - /bin/sh - - -c - - python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 - livenessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 600 # 10 Min - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 5000 - initialDelaySeconds: 30 - periodSeconds: 10 - volumeMounts: - - name: dshm - mountPath: /dev/shm - - mountPath: /mnt/config - name: config-volume - volumes: - - name: dshm - emptyDir: - medium: Memory - - configMap: - defaultMode: 420 - name: qwen2-5-coder-7b-inference-params - name: config-volume - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists - nodeSelector: - pool: qwen25coder7 ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: qwen2-5-coder-7b-inference-params -data: - inference_config.yaml: | - # Maximum number of steps to find the max available seq len fitting in the GPU memory. - max_probe_steps: 6 - - vllm: - cpu-offload-gb: 0 - gpu-memory-utilization: 0.95 - swap-space: 4 - served-model-name: test - dtype: float16 - tensor-parallel-size: 2 - - # max-seq-len-to-capture: 8192 - # num-scheduler-steps: 1 - # enable-chunked-prefill: false - # see https://docs.vllm.ai/en/stable/models/engine_args.html for more options. diff --git a/presets/workspace/test/manifests/tuning/falcon/falcon-7b.yaml b/presets/workspace/test/manifests/tuning/falcon/falcon-7b.yaml deleted file mode 100644 index 7852bf01a..000000000 --- a/presets/workspace/test/manifests/tuning/falcon/falcon-7b.yaml +++ /dev/null @@ -1,103 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: falcon-7b-tuning -spec: - replicas: 1 - selector: - matchLabels: - app: falcon - template: - metadata: - labels: - app: falcon - spec: - containers: - - name: falcon-container - image: aimodelsregistrytest.azurecr.io/tuning-falcon-7b:0.0.1 - command: ["/bin/sh", "-c", "sleep infinity"] - resources: - requests: - nvidia.com/gpu: 2 - limits: - nvidia.com/gpu: 2 # Requesting 2 GPUs - volumeMounts: - - name: dshm - mountPath: /dev/shm - - name: workspace - mountPath: /workspace - - - name: docker-sidecar - image: docker:dind - securityContext: - privileged: true # Allows container to manage its own containers - volumeMounts: - - name: workspace - mountPath: /workspace - env: - - name: ACR_USERNAME - value: "{{ACR_USERNAME}}" - - name: ACR_PASSWORD - value: "{{ACR_PASSWORD}}" - - name: TAG - value: "{{TAG}}" - command: ["/bin/sh"] - args: - - -c - - | - # Start the Docker daemon in the background with specific options for DinD - dockerd & - # Wait for the Docker daemon to be ready - while ! docker info > /dev/null 2>&1; do - echo "Waiting for Docker daemon to start..." - sleep 1 - done - echo 'Docker daemon started' - - while true; do - FILE_PATH=$(find /workspace/tfs -name 'fine_tuning_completed.txt') - if [ ! 
-z "$FILE_PATH" ]; then - echo "FOUND TRAINING COMPLETED FILE at $FILE_PATH" - - PARENT_DIR=$(dirname "$FILE_PATH") - echo "Parent directory is $PARENT_DIR" - - TEMP_CONTEXT=$(mktemp -d) - cp "$PARENT_DIR/adapter_config.json" "$TEMP_CONTEXT/adapter_config.json" - cp -r "$PARENT_DIR/adapter_model.safetensors" "$TEMP_CONTEXT/adapter_model.safetensors" - - # Create a minimal Dockerfile - echo 'FROM scratch - ADD adapter_config.json / - ADD adapter_model.safetensors /' > "$TEMP_CONTEXT/Dockerfile" - - # Login to Docker registry - echo $ACR_PASSWORD | docker login $ACR_USERNAME.azurecr.io -u $ACR_USERNAME --password-stdin - - docker build -t $ACR_USERNAME.azurecr.io/adapter-falcon-7b:$TAG "$TEMP_CONTEXT" - docker push $ACR_USERNAME.azurecr.io/adapter-falcon-7b:$TAG - - # Cleanup: Remove the temporary directory - rm -rf "$TEMP_CONTEXT" - - # Remove the file to prevent repeated builds, or handle as needed - # rm "$FILE_PATH" - fi - sleep 10 # Check every 10 seconds - done - - volumes: - - name: dshm - emptyDir: - medium: Memory - - name: workspace - emptyDir: {} - - tolerations: - - effect: NoSchedule - key: sku - operator: Equal - value: gpu - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists diff --git a/presets/workspace/test/tuning/tuning-job.yaml b/presets/workspace/test/manifests/tuning/tuning.yaml similarity index 100% rename from presets/workspace/test/tuning/tuning-job.yaml rename to presets/workspace/test/manifests/tuning/tuning.yaml diff --git a/presets/workspace/test/scripts/README.md b/presets/workspace/test/scripts/README.md new file mode 100644 index 000000000..aaa49ca69 --- /dev/null +++ b/presets/workspace/test/scripts/README.md @@ -0,0 +1,82 @@ +# Manifest Generation Script + +This script generates Kubernetes manifests for different model deployments based on templates and configurations. + +## Overview + +The script (`generate_manifests.py`) combines: +- Base templates from `presets/workspace/test/manifests/inference-tmpl/manifest.yaml` +- Model configurations from `.github/e2e-preset-configs.json` +- Predefined manifests from `presets/workspace/test/manifests//.yaml` +to generate deployment/statefulset and service manifests for each model. + +## Usage + +```bash +# Print manifests to stdout +python generate_manifests.py [--repo REPO] [--tag TAG] +``` + +### Parameters: +- `model_name`: Name of the model (e.g., "mistral-7b", "falcon-40b") +- `runtime`: Runtime to use ("hf" for Hugging Face or "vllm" for vLLM) +- `--repo`: (Optional) Repository name to replace REPO in the template +- `--tag`: (Optional) Tag to replace TAG in the template + +### Example: +```bash +# Generate manifest for Mistral 7B with vLLM runtime +python generate_manifests.py mistral-7b vllm + +# Generate manifest for Falcon 40B with Hugging Face runtime and custom repo/tag +python generate_manifests.py falcon-40b hf --repo myregistry.azurecr.io --tag v1.0.0 +``` + +## Configuration Files + +### Template (presets/workspace/test/manifests/inference-tmpl/manifest.yaml) +Contains base templates for: +- Deployment +- Service +- ConfigMap + +Placeholders: +- MODEL_NAME +- RUNTIME_COMMAND +- GPU_COUNT +- NODE_POOL +- NODE_COUNT +- REPO (can be overridden with --repo flag) +- TAG (can be overridden with --tag flag) + +### Model Configurations +Located at `.github/e2e-preset-configs.json` +- Contains configurations for all supported models +- Each model configuration includes: + - Basic info (name, node count, VM size, etc.) 
+ - Node pool specification + - Runtime-specific configurations + +## Example Configuration + +```json +{ + "name": "mistral-7b", + "node-count": 1, + "node-vm-size": "Standard_NC6s_v3", + "node-osdisk-size": 100, + "OSS": true, + "loads_adapter": false, + "node_pool": "mistral7b", + "runtimes": { + "hf": { + "command": "accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16", + "gpu_count": 1 + }, + "vllm": { + "command": "python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja", + "gpu_count": 1 + } + } +} +``` diff --git a/presets/workspace/test/scripts/generate_manifests.py b/presets/workspace/test/scripts/generate_manifests.py new file mode 100644 index 000000000..defc2d80a --- /dev/null +++ b/presets/workspace/test/scripts/generate_manifests.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +import json +import yaml +import sys +import os +import argparse +from pathlib import Path + +def get_project_root(): + # Get the directory containing the current script + current_dir = Path(__file__).resolve().parent + # Go up 4 levels to reach project root (from presets/workspace/test/scripts) + return current_dir.parents[3] + +def load_json_config(): + project_root = get_project_root() + config_path = project_root / '.github' / 'e2e-preset-configs.json' + with open(config_path, 'r') as f: + data = json.load(f) + return data['matrix']['image'] # Return the array of model configs + +def load_template(): + project_root = get_project_root() + template_path = project_root / 'presets' / 'workspace' / 'test' / 'manifests' / 'inference-tmpl' / 'manifest.yaml' + with open(template_path, 'r') as f: + return yaml.safe_load(f) + +def check_predefined_manifest(model_name): + project_root = get_project_root() + manifest_path = project_root / 'presets' / 'workspace' / 'test' / 'manifests' / f'{model_name}' / f'{model_name}.yaml' + if not os.path.exists(manifest_path): + return (None, False) + with open(manifest_path, 'r') as f: + return (f.read(), True) + +def process_model(model_name, runtime, repo=None, tag=None): + configs = load_json_config() + model_config = next((m for m in configs if m['name'] == model_name), None) + + if not model_config: + print(f"Model {model_name} not found in configs", file=sys.stderr) + sys.exit(1) + + predefined_manifest, predefined_manifest_exists = check_predefined_manifest(model_name) + if predefined_manifest_exists: + return process_predefined_manifest(model_name, runtime, predefined_manifest, repo, tag) + + if runtime not in model_config.get('runtimes', {}): + print(f"Runtime {runtime} not configured for model {model_name}", file=sys.stderr) + sys.exit(1) + + runtime_config = model_config['runtimes'][runtime] + workload_name = model_config.get('workload', model_name) + + templates = load_template() + manifest_str = yaml.dump(templates['deployment']) + + # Replace placeholders in template + manifest_str = ( + manifest_str + .replace('WORKLOAD_NAME', workload_name) + .replace('MODEL_NAME', model_name) + .replace('RUNTIME_COMMAND', runtime_config['command']) + .replace('GPU_COUNT', str(runtime_config['gpu_count'])) + .replace('NODE_POOL', model_config['node_pool']) + .replace('NODE_COUNT', str(model_config['node-count'])) + ) + + # Replace repo and tag if provided + if repo: + manifest_str = manifest_str.replace('REPO', repo) + if tag: + manifest_str = manifest_str.replace('TAG', 
tag) + + # Parse the template string back into YAML + manifest = yaml.safe_load(manifest_str) + + # Generate service manifest + service_template = templates['service'] + service_str = yaml.dump(service_template) + service_str = service_str.replace('WORKLOAD_NAME', workload_name) + service_manifest = yaml.safe_load(service_str) + + # Generate config manifest + config_template = templates['config'] + config_str = yaml.dump(config_template) + config_manifest = yaml.safe_load(config_str) + + # Print manifests to stdout + yaml.dump(manifest, sys.stdout, default_flow_style=False) + print('---') # Document separator + yaml.dump(service_manifest, sys.stdout, default_flow_style=False) + print('---') # Document separator + yaml.dump(config_manifest, sys.stdout, default_flow_style=False) + +def process_predefined_manifest(model_name, runtime, predefined_manifest, repo=None, tag=None): + if repo: + predefined_manifest = predefined_manifest.replace('REPO', repo) + if tag: + predefined_manifest = predefined_manifest.replace('TAG', tag) + + print(predefined_manifest) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Process model template with optional repo and tag.') + parser.add_argument('model_name', help='Name of the model') + parser.add_argument('runtime', help='Runtime to use') + parser.add_argument('--repo', help='Repository name to use instead of REPO') + parser.add_argument('--tag', help='Tag to use instead of TAG') + + args = parser.parse_args() + process_model(args.model_name, args.runtime, args.repo, args.tag)
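For orientation, the following is a minimal, hypothetical sketch of the structure that `generate_manifests.py` expects from `inference-tmpl/manifest.yaml`: top-level `deployment`, `service`, and `config` keys plus the plain-string placeholders listed in the README (`WORKLOAD_NAME`, `MODEL_NAME`, `RUNTIME_COMMAND`, `GPU_COUNT`, `NODE_POOL`, `NODE_COUNT`, `REPO`, `TAG`). It is not the template shipped in this patch; every concrete field value below is illustrative only.

```yaml
# Hypothetical sketch of inference-tmpl/manifest.yaml; the real template in this
# patch may differ. Placeholders are plain strings that generate_manifests.py
# substitutes with str.replace() on the dumped YAML before re-parsing it.
deployment:
  apiVersion: apps/v1
  kind: Deployment
  metadata:
    name: WORKLOAD_NAME
  spec:
    replicas: NODE_COUNT
    selector:
      matchLabels:
        app: WORKLOAD_NAME
    template:
      metadata:
        labels:
          app: WORKLOAD_NAME
      spec:
        containers:
          - name: WORKLOAD_NAME-container
            image: REPO.azurecr.io/MODEL_NAME:TAG
            command: ["/bin/sh", "-c", "RUNTIME_COMMAND"]
            resources:
              requests:
                nvidia.com/gpu: GPU_COUNT
              limits:
                nvidia.com/gpu: GPU_COUNT
        nodeSelector:
          pool: NODE_POOL
service:
  apiVersion: v1
  kind: Service
  metadata:
    name: WORKLOAD_NAME
  spec:
    selector:
      app: WORKLOAD_NAME
    ports:
      - protocol: TCP
        port: 80
        targetPort: 5000
    type: ClusterIP
config:
  apiVersion: v1
  kind: ConfigMap
  metadata:
    name: WORKLOAD_NAME-inference-params
  data:
    inference_config.yaml: |
      # illustrative runtime parameters only
      max_probe_steps: 6
```

Because the placeholders are substituted as raw text on the dumped template, the result can be parsed back with `yaml.safe_load` and emitted as three `---`-separated documents, which matches what `process_model` prints to stdout.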